mineru 2.2.2__py3-none-any.whl → 2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
  2. mineru/backend/vlm/model_output_to_middle_json.py +123 -0
  3. mineru/backend/vlm/vlm_analyze.py +105 -16
  4. mineru/backend/vlm/vlm_magic_model.py +201 -135
  5. mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
  6. mineru/cli/client.py +6 -5
  7. mineru/cli/common.py +17 -16
  8. mineru/cli/fast_api.py +9 -7
  9. mineru/cli/gradio_app.py +15 -16
  10. mineru/cli/vlm_vllm_server.py +4 -0
  11. mineru/model/table/rec/unet_table/main.py +8 -0
  12. mineru/model/vlm_vllm_model/__init__.py +0 -0
  13. mineru/model/vlm_vllm_model/server.py +59 -0
  14. mineru/resources/header.html +10 -2
  15. mineru/utils/draw_bbox.py +32 -10
  16. mineru/utils/enum_class.py +16 -2
  17. mineru/utils/guess_suffix_or_lang.py +20 -0
  18. mineru/utils/span_block_fix.py +4 -2
  19. mineru/version.py +1 -1
  20. {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/METADATA +70 -25
  21. {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/RECORD +25 -38
  22. {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/entry_points.txt +1 -1
  23. mineru/backend/vlm/base_predictor.py +0 -186
  24. mineru/backend/vlm/hf_predictor.py +0 -217
  25. mineru/backend/vlm/predictor.py +0 -111
  26. mineru/backend/vlm/sglang_client_predictor.py +0 -443
  27. mineru/backend/vlm/sglang_engine_predictor.py +0 -246
  28. mineru/backend/vlm/token_to_middle_json.py +0 -122
  29. mineru/backend/vlm/utils.py +0 -40
  30. mineru/cli/vlm_sglang_server.py +0 -4
  31. mineru/model/vlm_hf_model/__init__.py +0 -9
  32. mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
  33. mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
  34. mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
  35. mineru/model/vlm_sglang_model/__init__.py +0 -14
  36. mineru/model/vlm_sglang_model/engine.py +0 -264
  37. mineru/model/vlm_sglang_model/image_processor.py +0 -213
  38. mineru/model/vlm_sglang_model/logit_processor.py +0 -90
  39. mineru/model/vlm_sglang_model/model.py +0 -453
  40. mineru/model/vlm_sglang_model/server.py +0 -75
  41. {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/WHEEL +0 -0
  42. {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/licenses/LICENSE.md +0 -0
  43. {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/top_level.txt +0 -0
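
The headline change in this release is the VLM serving backend: the Hugging Face and sglang predictors (`hf_predictor.py`, `sglang_client_predictor.py`, `sglang_engine_predictor.py`) and the `vlm_hf_model`/`vlm_sglang_model` packages are deleted, while a new vllm-based entry point is added (`mineru/model/vlm_vllm_model/server.py` and `mineru/cli/vlm_vllm_server.py`). Three of the deleted sglang files are reproduced in full below.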
mineru/model/vlm_sglang_model/engine.py
@@ -1,264 +0,0 @@
- import asyncio
- import time
- from types import MethodType
- from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
-
- import fastapi
- from sglang.srt.entrypoints.engine import Engine as _Engine
- from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
- from sglang.srt.managers.tokenizer_manager import (
-     TokenizerManager,
-     dataclass_to_string_truncated,
-     logger,
- )
- from sglang.srt.sampling.sampling_params import SamplingParams
- from sglang.srt.server_args import ServerArgs
-
- from ...utils.run_async import run_async
- from .logit_processor import Mineru2LogitProcessor
-
-
- class BatchEngine(_Engine):
-     """
-     The engine is patched to support batch multi-modal generation and early image preprocessing.
-     """
-
-     def __init__(self, server_args: ServerArgs, **kwargs):
-         server_args.enable_custom_logit_processor = True
-         super().__init__(server_args=server_args, **kwargs)
-         _patch_tokenizer_manager(self.tokenizer_manager)
-
-     def generate(
-         self,
-         # The input prompt. It can be a single prompt or a batch of prompts.
-         prompt: Optional[Union[List[str], str]] = None,
-         sampling_params: Optional[Union[List[Dict], Dict]] = None,
-         # The token ids for text; one can either specify text or input_ids.
-         input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-         # The image input. It can be a file name, a url, or base64 encoded string.
-         # See also python/sglang/srt/utils.py:load_image.
-         image_data: Optional[Union[List[str], str]] = None,
-         return_logprob: Optional[Union[List[bool], bool]] = False,
-         logprob_start_len: Optional[Union[List[int], int]] = None,
-         top_logprobs_num: Optional[Union[List[int], int]] = None,
-         token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
-         lora_path: Optional[List[Optional[str]]] = None,
-         custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None,
-         return_hidden_states: bool = False,
-         stream: bool = False,
-     ) -> Union[Dict, Iterator[Dict]]:
-         """
-         The arguments of this function are the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
-         Please refer to `GenerateReqInput` for the documentation.
-         """
-         modalities_list = []
-
-         # EDIT
-         if isinstance(image_data, list):
-             for _ in range(len(image_data)):
-                 modalities_list.append(["image"])
-         elif image_data is not None:
-             modalities_list.append("image")
-
-         # ADD
-         if custom_logit_processor is None:
-             custom_logit_processor = Mineru2LogitProcessor().to_str()
-
-         obj = GenerateReqInput(
-             text=prompt,
-             input_ids=input_ids,
-             sampling_params=sampling_params,
-             image_data=image_data,
-             return_logprob=return_logprob,
-             logprob_start_len=logprob_start_len,
-             top_logprobs_num=top_logprobs_num,
-             token_ids_logprob=token_ids_logprob,
-             lora_path=lora_path,
-             modalities=modalities_list,
-             custom_logit_processor=custom_logit_processor,
-             return_hidden_states=return_hidden_states,
-             stream=stream,
-         )
-         generator = _generate_request(self.tokenizer_manager, obj, None)
-
-         if stream:
-
-             def generator_wrapper():
-                 while True:
-                     try:
-                         chunk = run_async(generator.__anext__())
-                         yield chunk
-                     except StopAsyncIteration:
-                         break
-
-             return generator_wrapper()
-         else:
-             ret = run_async(generator.__anext__())
-             return ret
-
-     async def async_generate(
-         self,
-         # The input prompt. It can be a single prompt or a batch of prompts.
-         prompt: Optional[Union[List[str], str]] = None,
-         sampling_params: Optional[Union[List[Dict], Dict]] = None,
-         # The token ids for text; one can either specify text or input_ids.
-         input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-         # The image input. It can be a file name, a url, or base64 encoded string.
-         # See also python/sglang/srt/utils.py:load_image.
-         image_data: Optional[Union[List[str], str]] = None,
-         return_logprob: Optional[Union[List[bool], bool]] = False,
-         logprob_start_len: Optional[Union[List[int], int]] = None,
-         top_logprobs_num: Optional[Union[List[int], int]] = None,
-         token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
-         lora_path: Optional[List[Optional[str]]] = None,
-         custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None,
-         return_hidden_states: bool = False,
-         stream: bool = False,
-     ) -> Union[Dict, AsyncIterator[Dict], Iterator[Dict]]:
-         """
-         The arguments of this function are the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
-         Please refer to `GenerateReqInput` for the documentation.
-         """
-         modalities_list = []
-
-         # EDIT
-         if isinstance(image_data, list):
-             for _ in range(len(image_data)):
-                 modalities_list.append(["image"])
-         elif image_data is not None:
-             modalities_list.append("image")
-
-         # ADD
-         if custom_logit_processor is None:
-             custom_logit_processor = Mineru2LogitProcessor().to_str()
-
-         obj = GenerateReqInput(
-             text=prompt,
-             input_ids=input_ids,
-             sampling_params=sampling_params,
-             image_data=image_data,
-             return_logprob=return_logprob,
-             logprob_start_len=logprob_start_len,
-             top_logprobs_num=top_logprobs_num,
-             token_ids_logprob=token_ids_logprob,
-             lora_path=lora_path,
-             modalities=modalities_list,
-             custom_logit_processor=custom_logit_processor,
-             return_hidden_states=return_hidden_states,
-             stream=stream,
-         )
-         generator = _generate_request(self.tokenizer_manager, obj, None)
-
-         if stream is True:
-             return generator
-         else:
-             return await generator.__anext__()
-
-
- def _auto_create_handle_loop(self: TokenizerManager):
-     """
-     Patch the original `auto_create_handle_loop()` method to reset `no_create_loop`
-     when the event loop changes.
-     """
-     try:
-         curr_handle_loop = asyncio.get_running_loop()
-     except RuntimeError:
-         curr_handle_loop = None
-
-     last_handle_loop = getattr(self, "_last_handle_loop", None)
-     if last_handle_loop != curr_handle_loop:
-         self.no_create_loop = False
-         setattr(self, "_last_handle_loop", curr_handle_loop)
-     return TokenizerManager.auto_create_handle_loop(self)
-
-
- def _patch_tokenizer_manager(self: TokenizerManager):
-     self.auto_create_handle_loop = MethodType(_auto_create_handle_loop, self)
-
-
- async def _one_request(
-     self: TokenizerManager,
-     obj: Union[GenerateReqInput, EmbeddingReqInput],
-     request: Optional[fastapi.Request],
-     created_time: Optional[float],
- ):
-     tokenized_obj = await self._tokenize_one_request(obj)
-     state = self._send_one_request(obj, tokenized_obj, created_time)
-     async for out in self._wait_one_response(obj, state, request):
-         yield out
-
-
- async def _handle_batch_request(
-     self: TokenizerManager,
-     obj: Union[GenerateReqInput, EmbeddingReqInput],
-     request: Optional[fastapi.Request] = None,
-     created_time: Optional[float] = None,
- ):
-     batch_size = obj.batch_size
-
-     generators = []
-     rids = []
-
-     if getattr(obj, "parallel_sample_num", 1) != 1:
-         raise Exception("parallel_sample_num != 1 is not supported in this patched code.")
-
-     # Send all requests
-     for i in range(batch_size):
-         tmp_obj = obj[i]
-         generators.append(_one_request(self, tmp_obj, request, created_time))
-         rids.append(tmp_obj.rid)
-
-     # Wait for all requests
-     is_stream = hasattr(obj, "stream") and obj.stream
-     if not is_stream:
-         outputs = await asyncio.gather(*(gen.__anext__() for gen in generators))
-         yield outputs
-     else:
-         rid_to_index = {rid: i for i, rid in enumerate(rids)}
-         task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
-         while task_map:
-             done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
-
-             for task in done:
-                 gen = task_map.pop(task)
-                 try:
-                     result = task.result()
-                     result["index"] = rid_to_index[result["meta_info"]["id"]]
-                     yield result
-                     new_task = asyncio.create_task(gen.__anext__())
-                     task_map[new_task] = gen
-                 except StopAsyncIteration:
-                     pass
-
-
- async def _generate_request(
-     self: TokenizerManager,
-     obj: Union[GenerateReqInput, EmbeddingReqInput],
-     request: Optional[fastapi.Request] = None,
- ):
-     created_time = time.time()
-
-     self.auto_create_handle_loop()
-
-     if isinstance(obj, EmbeddingReqInput) and self.is_generation:
-         raise ValueError(
-             "This model does not appear to be an embedding model by default. "
-             "Please add `--is-embedding` when launching the server or try another model."
-         )
-
-     obj.normalize_batch_and_arguments()
-
-     if self.log_requests:
-         max_length, skip_names, _ = self.log_request_metadata
-         logger.info(f"Receive: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}")
-
-     async with self.model_update_lock.reader_lock:
-         is_single = obj.is_single
-         if is_single:
-             tokenized_obj = await self._tokenize_one_request(obj)
-             state = self._send_one_request(obj, tokenized_obj, created_time)
-             async for response in self._wait_one_response(obj, state, request):
-                 yield response
-         else:
-             async for response in _handle_batch_request(self, obj, request, created_time):
-                 yield response
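
The streaming branch of `_handle_batch_request` above merges several per-request async generators into one stream by mapping each in-flight `__anext__()` task back to its source generator and re-arming a new task whenever one completes. Below is a minimal, self-contained sketch of that pattern; `fake_request` and `merge_streams` are hypothetical names for illustration, not MinerU or sglang APIs.

```python
import asyncio
from typing import AsyncIterator


async def fake_request(rid: str, delay: float, chunks: int) -> AsyncIterator[dict]:
    # Stand-in for one request's response stream.
    for i in range(chunks):
        await asyncio.sleep(delay)
        yield {"rid": rid, "chunk": i}


async def merge_streams(generators: list) -> AsyncIterator[dict]:
    # Map each pending "next chunk" task back to its source generator,
    # re-arming a fresh task whenever one completes.
    task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
    while task_map:
        done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            gen = task_map.pop(task)
            try:
                yield task.result()
                task_map[asyncio.create_task(gen.__anext__())] = gen
            except StopAsyncIteration:
                pass  # This generator is exhausted; stop polling it.


async def main() -> None:
    gens = [fake_request("a", 0.03, 3), fake_request("b", 0.01, 3)]
    async for chunk in merge_streams(gens):
        print(chunk)  # Chunks from "a" and "b" arrive interleaved.


if __name__ == "__main__":
    asyncio.run(main())
```

Chunks are yielded strictly in arrival order, which is why the deleted code stamps each result with its original batch `index` before yielding it.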
mineru/model/vlm_sglang_model/image_processor.py
@@ -1,213 +0,0 @@
- import ast
- import asyncio
- import re
- from typing import List, Optional, Union
-
- import numpy as np
-
- from sglang.version import __version__ as sglang_version
- from packaging import version
- if version.parse(sglang_version) >= version.parse("0.4.9"):
-     # sglang >= 0.4.9
-     from sglang.srt.multimodal.processors.base_processor import (
-         BaseMultimodalProcessor as BaseProcessor,
-     )
-     from sglang.srt.multimodal.mm_utils import divide_to_patches, expand2square, select_best_resolution
- else:
-     # 0.4.7 <= sglang < 0.4.9
-     from sglang.srt.managers.multimodal_processors.base_processor import (
-         BaseMultimodalProcessor as BaseProcessor,
-     )
-     from sglang.srt.mm_utils import divide_to_patches, expand2square, select_best_resolution
-
- get_global_processor = None
- from sglang.srt.utils import load_image, logger
- from sglang.utils import get_exception_traceback
-
- from .model import Mineru2QwenForCausalLM
-
-
- # image_best_res is only resized (not padded).
- def process_anyres_image(image, processor, grid_pinpoints):
-     if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-         patch_size = processor.crop_size["height"]
-         assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-         matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-         range_start = tuple(map(int, matches[0]))
-         range_end = tuple(map(int, matches[-1]))
-         grid_pinpoints = [
-             (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-         ]
-         grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-
-     if type(grid_pinpoints) is list:
-         possible_resolutions = grid_pinpoints
-     else:
-         possible_resolutions = ast.literal_eval(grid_pinpoints)
-     best_resolution = select_best_resolution(image.size, possible_resolutions)
-
-     image_best_res = image.resize(best_resolution)  # <<<<<<< Here changed
-     patches = divide_to_patches(image_best_res, processor.crop_size["height"])
-     image_original_resize = image.resize((processor.crop_size["height"], processor.crop_size["height"]))
-
-     image_patches = [image_original_resize] + patches
-     image_patches = [processor.preprocess(image_patch)["pixel_values"][0] for image_patch in image_patches]
-     return np.stack(image_patches, axis=0)
-
-
- class Mineru2ImageProcessor(BaseProcessor):
-     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
-         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
-
-     @staticmethod
-     def _process_single_image_task(
-         image_data: Union[str, bytes],
-         image_aspect_ratio: Optional[str] = None,
-         image_grid_pinpoints: Optional[str] = None,
-         image_processor=None,
-     ):
-         if image_processor is None:
-             assert get_global_processor is not None
-             image_processor = get_global_processor().image_processor
-
-         try:
-             image, image_size = load_image(image_data)
-             if image_size is not None:
-                 # It is a video with multiple images
-                 image_hash = hash(image_data)
-                 pixel_values = image_processor(image)["pixel_values"]
-                 pixel_values = np.stack(pixel_values, axis=0)
-                 return pixel_values, image_hash, image_size
-             else:
-                 # It is an image
-                 image_hash = hash(image_data)
-                 if image_aspect_ratio == "pad":
-                     image = expand2square(
-                         image,
-                         tuple(int(x * 255) for x in image_processor.image_mean),
-                     )
-                     pixel_values = image_processor(image.convert("RGB"))["pixel_values"][0]
-                 elif image_aspect_ratio == "anyres" or (image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio):
-                     pixel_values = process_anyres_image(image, image_processor, image_grid_pinpoints)
-                 else:
-                     pixel_values = image_processor(image)["pixel_values"][0]
-                 return pixel_values, image_hash, image.size
-         except Exception:
-             logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
-
-     async def _process_single_image(self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str):
-         if hasattr(self, "cpu_executor"):
-             executor = self.cpu_executor
-         else:
-             executor = self.executor
-
-         if get_global_processor is not None:
-             image_processor = None  # save ipc cost
-         else:
-             image_processor = self._processor.image_processor
-
-         if executor is not None:
-             loop = asyncio.get_running_loop()
-             return await loop.run_in_executor(
-                 executor,
-                 Mineru2ImageProcessor._process_single_image_task,
-                 image_data,
-                 aspect_ratio,
-                 grid_pinpoints,
-                 image_processor,
-             )
-         else:
-             return self._process_single_image_task(
-                 image_data,
-                 aspect_ratio,
-                 grid_pinpoints,
-                 image_processor,
-             )
-
-     async def process_mm_data_async(
-         self,
-         image_data: List[Union[str, bytes]],
-         input_text,
-         request_obj,
-         *args,
-         **kwargs,
-     ):
-         from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-
-         if not image_data:
-             return None
-
-         modalities = request_obj.modalities or ["image"]
-         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-         grid_pinpoints = (
-             self.hf_config.image_grid_pinpoints
-             if hasattr(self.hf_config, "image_grid_pinpoints")
-             and "anyres" in aspect_ratio
-             else None
-         )
-
-         if isinstance(image_data, str):
-             image_data = [image_data]
-
-         if isinstance(image_data, list) and len(image_data) > 0:
-             if "multi-images" in modalities or "video" in modalities:
-                 # Multiple images
-                 aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-                 pixel_values, data_hashes, image_sizes = [], [], []
-                 res = []
-                 for img_data in image_data:
-                     res.append(
-                         self._process_single_image(
-                             img_data, aspect_ratio, grid_pinpoints
-                         )
-                     )
-
-                 res = await asyncio.gather(*res)
-                 for pixel_v, image_h, image_s in res:
-                     pixel_values.append(pixel_v)
-                     data_hashes.append(image_h)
-                     image_sizes.append(image_s)
-
-                 if isinstance(pixel_values[0], np.ndarray):
-                     pixel_values = np.stack(pixel_values, axis=0)
-             else:
-                 # A single image
-                 pixel_values, image_hash, image_size = await self._process_single_image(
-                     image_data[0], aspect_ratio, grid_pinpoints
-                 )
-                 image_sizes = [image_size]
-         else:
-             raise ValueError(f"Invalid image data: {image_data}")
-         modality = Modality.IMAGE
-         if isinstance(request_obj.modalities, list):
-             if request_obj.modalities[0] == "multi-images":
-                 modality = Modality.MULTI_IMAGES
-             elif request_obj.modalities[0] == "video":
-                 modality = Modality.VIDEO
-
-         if version.parse(sglang_version) >= version.parse("0.4.9.post3"):
-             # sglang >= 0.4.9.post3
-             return {
-                 "mm_items": [
-                     MultimodalDataItem(
-                         feature=pixel_values,
-                         model_specific_data={
-                             "image_sizes": image_sizes,
-                         },
-                         modality=modality,
-                     )
-                 ],
-             }
-         else:
-             # 0.4.7 <= sglang <= 0.4.9.post2
-             return {
-                 "mm_items": [
-                     MultimodalDataItem(
-                         pixel_values=pixel_values,
-                         image_sizes=image_sizes,
-                         modality=modality,
-                     )
-                 ],
-             }
-
- ImageProcessorMapping = {Mineru2QwenForCausalLM: Mineru2ImageProcessor}
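
The "anyres" path above feeds the model one global thumbnail plus a grid of high-resolution tiles: `select_best_resolution` picks a canvas from the grid pinpoints, the image is resized to it (the `<<<<<<< Here changed` line, which resizes instead of padding), and `divide_to_patches` cuts it up. Here is a rough, self-contained sketch of that tiling idea with simplified stand-ins for sglang's helpers; the 448-pixel patch size and 3x3 grid are illustrative, not MinerU's actual configuration.

```python
from PIL import Image


def select_best_resolution(image_size, candidates):
    # Simplified LLaVA-style heuristic: maximize usable (non-upscaled) area,
    # then minimize wasted canvas.
    ow, oh = image_size
    best, best_fit, best_waste = None, -1, float("inf")
    for cw, ch in candidates:
        scale = min(cw / ow, ch / oh)
        fit = min(int(ow * scale) * int(oh * scale), ow * oh)
        waste = cw * ch - fit
        if fit > best_fit or (fit == best_fit and waste < best_waste):
            best, best_fit, best_waste = (cw, ch), fit, waste
    return best


def divide_to_patches(image, patch_size):
    # Cut the resized image into non-overlapping patch_size x patch_size tiles.
    patches = []
    for top in range(0, image.height, patch_size):
        for left in range(0, image.width, patch_size):
            patches.append(image.crop((left, top, left + patch_size, top + patch_size)))
    return patches


patch = 448  # hypothetical patch size
grid = [(w * patch, h * patch) for w in (1, 2, 3) for h in (1, 2, 3)]

image = Image.new("RGB", (1000, 700))  # stand-in for a real page image
best = select_best_resolution(image.size, grid)
# Global thumbnail first, then the high-res tiles, as in the deleted code.
tiles = [image.resize((patch, patch))] + divide_to_patches(image.resize(best), patch)
print(best, len(tiles))
```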
mineru/model/vlm_sglang_model/logit_processor.py
@@ -1,90 +0,0 @@
- from typing import List
-
- from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
-
-
- class Mineru2LogitProcessor(CustomLogitProcessor):
-     """
-     Stateless logit processor for Mineru2.
-
-     (base-class: sglang.srt.sampling.custom_logit_processor.CustomLogitProcessor)
-
-     This processor applies token-level constraints to prevent repetition during generation.
-     It supports two main constraints:
-
-     - no_repeat_ngram_size (int):
-         Prevents repeating the same n-gram of specified size in the output.
-         Inspired by Hugging Face's NoRepeatNGramLogitsProcessor.
-         This implementation is slower due to its lack of specialized optimization.
-
-     - no_repeat_token_count (int):
-         (Placeholder for future logic)
-         Intended to prevent repeating the same token multiple times.
-         Not yet implemented in this version.
-     """
-
-     def __init__(self) -> None:
-         super().__init__()
-         self._generated_ngrams = {}  # Cache of generated n-grams by request ID
-         self._time = {}  # Timestamp of the last update for each request
-         self._gen_step = 0  # Global generation step counter
-
-     def __call__(self, logits, batch_info: List[dict]):
-         """
-         Applies repetition constraints to the logits before sampling tokens.
-
-         Args:
-             logits (FloatTensor): A tensor of shape (batch_size, vocab_size) containing raw token logits.
-             batch_info (List[dict]): A list of metadata dicts for each sample in the batch. Each dict must include:
-                 - "__req__": Request object containing request ID and output_ids.
-                 - "no_repeat_ngram_size": Size of n-gram to avoid repeating.
-
-         Returns:
-             FloatTensor: The modified logits tensor with banned token logits set to -inf.
-         """
-         from sglang.srt.managers.schedule_batch import Req
-
-         self._gen_step += 1  # Update global generation step
-
-         for idx, info in enumerate(batch_info):
-             if not isinstance(info, dict) or "__req__" not in info:
-                 continue
-
-             req: Req = info["__req__"]
-             rid = req.rid
-             output_ids = req.output_ids
-             ngram_size = info.get("no_repeat_ngram_size", 0)
-
-             # Skip if there are not enough tokens to form an n-gram
-             if ngram_size <= 0 or len(output_ids) < ngram_size:
-                 continue
-
-             # Record the current step for cache cleanup tracking
-             self._time[rid] = self._gen_step
-
-             # Initialize n-gram cache for this request if it doesn't exist
-             if rid not in self._generated_ngrams:
-                 self._generated_ngrams[rid] = {}
-
-             # Get the n-gram prefix (all but the last token)
-             prev_ngram = tuple(output_ids[-ngram_size:-1])
-             last_token = output_ids[-1]
-
-             # Store this n-gram occurrence
-             self._generated_ngrams[rid][prev_ngram] = self._generated_ngrams[rid].get(prev_ngram, []) + [last_token]
-
-             # Get the next-token candidates to ban based on current prefix
-             current_prefix = tuple(output_ids[-ngram_size + 1 :])
-             banned_tokens = self._generated_ngrams[rid].get(current_prefix, [])
-
-             # Set the logits of banned tokens to negative infinity
-             for token in banned_tokens:
-                 logits[idx][token] = -float("inf")
-
-         # Clean up cache for expired requests
-         expired_rids = [rid for rid, last_used in self._time.items() if last_used < self._gen_step]
-         for rid in expired_rids:
-             self._generated_ngrams.pop(rid, None)
-             self._time.pop(rid, None)
-
-         return logits
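
Conceptually, the rule above is the standard no-repeat-ngram constraint: a token is banned whenever emitting it would recreate an n-gram that has already appeared in the output. A minimal, framework-free sketch follows; the helper names are hypothetical, and unlike the deleted processor (which updates its cache one n-gram per decoding step), this version rescans the whole sequence for clarity.

```python
def banned_next_tokens(output_ids: list, ngram_size: int) -> set:
    # Collect every n-gram seen so far, keyed by its (n-1)-token prefix.
    seen = {}
    for i in range(len(output_ids) - ngram_size + 1):
        prefix = tuple(output_ids[i : i + ngram_size - 1])
        seen.setdefault(prefix, set()).add(output_ids[i + ngram_size - 1])
    # Ban any token that would complete a previously generated n-gram.
    current_prefix = tuple(output_ids[len(output_ids) - ngram_size + 1 :])
    return seen.get(current_prefix, set())


def apply_ban(logits: list, output_ids: list, ngram_size: int) -> list:
    # Set banned token logits to -inf so they can never be sampled.
    for token in banned_next_tokens(output_ids, ngram_size):
        logits[token] = float("-inf")
    return logits


# With ngram_size=3, the sequence already contains the 3-gram (5, 6, 7),
# so after the trailing prefix (5, 6) the token 7 must be banned.
logits = [0.0] * 10
out = apply_ban(logits, [5, 6, 7, 1, 5, 6], ngram_size=3)
assert out[7] == float("-inf")
print(out)
```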