mineru-2.2.2-py3-none-any.whl → mineru-2.5.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (43)
  1. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
  2. mineru/backend/vlm/model_output_to_middle_json.py +123 -0
  3. mineru/backend/vlm/vlm_analyze.py +97 -16
  4. mineru/backend/vlm/vlm_magic_model.py +201 -135
  5. mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
  6. mineru/cli/client.py +6 -5
  7. mineru/cli/common.py +17 -16
  8. mineru/cli/fast_api.py +9 -7
  9. mineru/cli/gradio_app.py +15 -16
  10. mineru/cli/vlm_vllm_server.py +4 -0
  11. mineru/model/table/rec/unet_table/main.py +8 -0
  12. mineru/model/vlm_vllm_model/__init__.py +0 -0
  13. mineru/model/vlm_vllm_model/server.py +51 -0
  14. mineru/resources/header.html +10 -2
  15. mineru/utils/draw_bbox.py +32 -10
  16. mineru/utils/enum_class.py +16 -2
  17. mineru/utils/guess_suffix_or_lang.py +20 -0
  18. mineru/utils/span_block_fix.py +4 -2
  19. mineru/version.py +1 -1
  20. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/METADATA +70 -25
  21. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/RECORD +25 -38
  22. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/entry_points.txt +1 -1
  23. mineru/backend/vlm/base_predictor.py +0 -186
  24. mineru/backend/vlm/hf_predictor.py +0 -217
  25. mineru/backend/vlm/predictor.py +0 -111
  26. mineru/backend/vlm/sglang_client_predictor.py +0 -443
  27. mineru/backend/vlm/sglang_engine_predictor.py +0 -246
  28. mineru/backend/vlm/token_to_middle_json.py +0 -122
  29. mineru/backend/vlm/utils.py +0 -40
  30. mineru/cli/vlm_sglang_server.py +0 -4
  31. mineru/model/vlm_hf_model/__init__.py +0 -9
  32. mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
  33. mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
  34. mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
  35. mineru/model/vlm_sglang_model/__init__.py +0 -14
  36. mineru/model/vlm_sglang_model/engine.py +0 -264
  37. mineru/model/vlm_sglang_model/image_processor.py +0 -213
  38. mineru/model/vlm_sglang_model/logit_processor.py +0 -90
  39. mineru/model/vlm_sglang_model/model.py +0 -453
  40. mineru/model/vlm_sglang_model/server.py +0 -75
  41. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/WHEEL +0 -0
  42. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/licenses/LICENSE.md +0 -0
  43. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/top_level.txt +0 -0
mineru/backend/vlm/hf_predictor.py (deleted)
@@ -1,217 +0,0 @@
- from io import BytesIO
- from typing import Iterable, List, Optional, Union
-
- import torch
- from PIL import Image
- from tqdm import tqdm
- from transformers import AutoTokenizer, BitsAndBytesConfig, __version__
-
- from ...model.vlm_hf_model import Mineru2QwenForCausalLM
- from ...model.vlm_hf_model.image_processing_mineru2 import process_images
- from .base_predictor import (
-     DEFAULT_MAX_NEW_TOKENS,
-     DEFAULT_NO_REPEAT_NGRAM_SIZE,
-     DEFAULT_PRESENCE_PENALTY,
-     DEFAULT_REPETITION_PENALTY,
-     DEFAULT_TEMPERATURE,
-     DEFAULT_TOP_K,
-     DEFAULT_TOP_P,
-     BasePredictor,
- )
- from .utils import load_resource
-
-
- class HuggingfacePredictor(BasePredictor):
-     def __init__(
-         self,
-         model_path: str,
-         device_map="auto",
-         device="cuda",
-         torch_dtype="auto",
-         load_in_8bit=False,
-         load_in_4bit=False,
-         use_flash_attn=False,
-         temperature: float = DEFAULT_TEMPERATURE,
-         top_p: float = DEFAULT_TOP_P,
-         top_k: int = DEFAULT_TOP_K,
-         repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-         presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-         no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-         max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-         **kwargs,
-     ):
-         super().__init__(
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             no_repeat_ngram_size=no_repeat_ngram_size,
-             max_new_tokens=max_new_tokens,
-         )
-
-         kwargs = {"device_map": device_map, **kwargs}
-
-         if device != "cuda":
-             kwargs["device_map"] = {"": device}
-
-         if load_in_8bit:
-             kwargs["load_in_8bit"] = True
-         elif load_in_4bit:
-             kwargs["load_in_4bit"] = True
-             kwargs["quantization_config"] = BitsAndBytesConfig(
-                 load_in_4bit=True,
-                 bnb_4bit_compute_dtype=torch.float16,
-                 bnb_4bit_use_double_quant=True,
-                 bnb_4bit_quant_type="nf4",
-             )
-         else:
-             from packaging import version
-             if version.parse(__version__) >= version.parse("4.56.0"):
-                 kwargs["dtype"] = torch_dtype
-             else:
-                 kwargs["torch_dtype"] = torch_dtype
-
-         if use_flash_attn:
-             kwargs["attn_implementation"] = "flash_attention_2"
-
-         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-         self.model = Mineru2QwenForCausalLM.from_pretrained(
-             model_path,
-             low_cpu_mem_usage=True,
-             **kwargs,
-         )
-         setattr(self.model.config, "_name_or_path", model_path)
-         self.model.eval()
-
-         vision_tower = self.model.get_model().vision_tower
-         if device_map != "auto":
-             vision_tower.to(device=device_map, dtype=self.model.dtype)
-
-         self.image_processor = vision_tower.image_processor
-         self.eos_token_id = self.model.config.eos_token_id
-
-     def predict(
-         self,
-         image: str | bytes,
-         prompt: str = "",
-         temperature: Optional[float] = None,
-         top_p: Optional[float] = None,
-         top_k: Optional[int] = None,
-         repetition_penalty: Optional[float] = None,
-         presence_penalty: Optional[float] = None,
-         no_repeat_ngram_size: Optional[int] = None,
-         max_new_tokens: Optional[int] = None,
-         **kwargs,
-     ) -> str:
-         prompt = self.build_prompt(prompt)
-
-         if temperature is None:
-             temperature = self.temperature
-         if top_p is None:
-             top_p = self.top_p
-         if top_k is None:
-             top_k = self.top_k
-         if repetition_penalty is None:
-             repetition_penalty = self.repetition_penalty
-         if no_repeat_ngram_size is None:
-             no_repeat_ngram_size = self.no_repeat_ngram_size
-         if max_new_tokens is None:
-             max_new_tokens = self.max_new_tokens
-
-         do_sample = (temperature > 0.0) and (top_k > 1)
-
-         generate_kwargs = {
-             "repetition_penalty": repetition_penalty,
-             "no_repeat_ngram_size": no_repeat_ngram_size,
-             "max_new_tokens": max_new_tokens,
-             "do_sample": do_sample,
-         }
-         if do_sample:
-             generate_kwargs["temperature"] = temperature
-             generate_kwargs["top_p"] = top_p
-             generate_kwargs["top_k"] = top_k
-
-         if isinstance(image, str):
-             image = load_resource(image)
-
-         image_obj = Image.open(BytesIO(image))
-         image_tensor = process_images([image_obj], self.image_processor, self.model.config)
-         image_tensor = image_tensor[0].unsqueeze(0)
-         image_tensor = image_tensor.to(device=self.model.device, dtype=self.model.dtype)
-         image_sizes = [[*image_obj.size]]
-
-         encoded_inputs = self.tokenizer(prompt, return_tensors="pt")
-         input_ids = encoded_inputs.input_ids.to(device=self.model.device)
-         attention_mask = encoded_inputs.attention_mask.to(device=self.model.device)
-
-         with torch.inference_mode():
-             output_ids = self.model.generate(
-                 input_ids,
-                 attention_mask=attention_mask,
-                 images=image_tensor,
-                 image_sizes=image_sizes,
-                 use_cache=True,
-                 **generate_kwargs,
-                 **kwargs,
-             )
-
-         # Remove the last token if it is the eos_token_id
-         if len(output_ids[0]) > 0 and output_ids[0, -1] == self.eos_token_id:
-             output_ids = output_ids[:, :-1]
-
-         output = self.tokenizer.batch_decode(
-             output_ids,
-             skip_special_tokens=False,
-         )[0].strip()
-
-         return output
-
-     def batch_predict(
-         self,
-         images: List[str] | List[bytes],
-         prompts: Union[List[str], str] = "",
-         temperature: Optional[float] = None,
-         top_p: Optional[float] = None,
-         top_k: Optional[int] = None,
-         repetition_penalty: Optional[float] = None,
-         presence_penalty: Optional[float] = None,  # not supported by hf
-         no_repeat_ngram_size: Optional[int] = None,
-         max_new_tokens: Optional[int] = None,
-         **kwargs,
-     ) -> List[str]:
-         if not isinstance(prompts, list):
-             prompts = [prompts] * len(images)
-
-         assert len(prompts) == len(images), "Length of prompts and images must match."
-
-         outputs = []
-         for prompt, image in tqdm(zip(prompts, images), total=len(images), desc="Predict"):
-             output = self.predict(
-                 image,
-                 prompt,
-                 temperature=temperature,
-                 top_p=top_p,
-                 top_k=top_k,
-                 repetition_penalty=repetition_penalty,
-                 presence_penalty=presence_penalty,
-                 no_repeat_ngram_size=no_repeat_ngram_size,
-                 max_new_tokens=max_new_tokens,
-                 **kwargs,
-             )
-             outputs.append(output)
-         return outputs
-
-     def stream_predict(
-         self,
-         image: str | bytes,
-         prompt: str = "",
-         temperature: Optional[float] = None,
-         top_p: Optional[float] = None,
-         top_k: Optional[int] = None,
-         repetition_penalty: Optional[float] = None,
-         presence_penalty: Optional[float] = None,
-         no_repeat_ngram_size: Optional[int] = None,
-         max_new_tokens: Optional[int] = None,
-     ) -> Iterable[str]:
-         raise NotImplementedError("Streaming is not supported yet.")
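Note on the `else` branch in the constructor above: it guards a transformers kwarg rename, since 4.56.0 `from_pretrained` takes `dtype`, while older releases expect `torch_dtype`. A minimal standalone sketch of that gate (the helper name is illustrative, not part of mineru):

from packaging import version
from transformers import __version__

def dtype_kwarg(torch_dtype="auto") -> dict:
    # transformers >= 4.56.0 accepts `dtype`; earlier releases expect `torch_dtype`.
    if version.parse(__version__) >= version.parse("4.56.0"):
        return {"dtype": torch_dtype}
    return {"torch_dtype": torch_dtype}

# e.g. AutoModel.from_pretrained(model_path, **dtype_kwarg("auto"))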
mineru/backend/vlm/predictor.py (deleted)
@@ -1,111 +0,0 @@
- # Copyright (c) Opendatalab. All rights reserved.
-
- import time
-
- from loguru import logger
-
- from .base_predictor import (
-     DEFAULT_MAX_NEW_TOKENS,
-     DEFAULT_NO_REPEAT_NGRAM_SIZE,
-     DEFAULT_PRESENCE_PENALTY,
-     DEFAULT_REPETITION_PENALTY,
-     DEFAULT_TEMPERATURE,
-     DEFAULT_TOP_K,
-     DEFAULT_TOP_P,
-     BasePredictor,
- )
- from .sglang_client_predictor import SglangClientPredictor
-
- hf_loaded = False
- try:
-     from .hf_predictor import HuggingfacePredictor
-
-     hf_loaded = True
- except ImportError as e:
-     logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.")
-
- engine_loaded = False
- try:
-     from sglang.srt.server_args import ServerArgs
-
-     from .sglang_engine_predictor import SglangEnginePredictor
-
-     engine_loaded = True
- except Exception as e:
-     logger.warning("sglang is not installed. If you are not using sglang, you can ignore this warning.")
-
-
- def get_predictor(
-     backend: str = "sglang-client",
-     model_path: str | None = None,
-     server_url: str | None = None,
-     temperature: float = DEFAULT_TEMPERATURE,
-     top_p: float = DEFAULT_TOP_P,
-     top_k: int = DEFAULT_TOP_K,
-     repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-     presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-     no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-     max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-     http_timeout: int = 600,
-     **kwargs,
- ) -> BasePredictor:
-     start_time = time.time()
-
-     if backend == "transformers":
-         if not model_path:
-             raise ValueError("model_path must be provided for transformers backend.")
-         if not hf_loaded:
-             raise ImportError(
-                 "transformers is not installed, so huggingface backend cannot be used. "
-                 "If you need to use huggingface backend, please install transformers first."
-             )
-         predictor = HuggingfacePredictor(
-             model_path=model_path,
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             no_repeat_ngram_size=no_repeat_ngram_size,
-             max_new_tokens=max_new_tokens,
-             **kwargs,
-         )
-     elif backend == "sglang-engine":
-         if not model_path:
-             raise ValueError("model_path must be provided for sglang-engine backend.")
-         if not engine_loaded:
-             raise ImportError(
-                 "sglang is not installed, so sglang-engine backend cannot be used. "
-                 "If you need to use sglang-engine backend for inference, "
-                 "please install sglang[all]==0.4.8 or a newer version."
-             )
-         predictor = SglangEnginePredictor(
-             server_args=ServerArgs(model_path, **kwargs),
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             no_repeat_ngram_size=no_repeat_ngram_size,
-             max_new_tokens=max_new_tokens,
-         )
-     elif backend == "sglang-client":
-         if not server_url:
-             raise ValueError("server_url must be provided for sglang-client backend.")
-         predictor = SglangClientPredictor(
-             server_url=server_url,
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-             presence_penalty=presence_penalty,
-             no_repeat_ngram_size=no_repeat_ngram_size,
-             max_new_tokens=max_new_tokens,
-             http_timeout=http_timeout,
-         )
-     else:
-         raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.")
-
-     elapsed = round(time.time() - start_time, 2)
-     logger.info(f"get_predictor cost: {elapsed}s")
-     return predictor
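For reference, a hedged usage sketch of the entry point this removed file exposed in 2.2.x, based only on the signatures shown above; the image path and server endpoint are illustrative. In 2.5.0 this module is gone, apparently superseded by the vllm-based backend added in the file list:

from mineru.backend.vlm.predictor import get_predictor  # module path as of 2.2.x

with open("page.png", "rb") as f:  # illustrative input image
    image_bytes = f.read()

# "sglang-client" was the default backend; it only needs a running server URL.
predictor = get_predictor(
    backend="sglang-client",
    server_url="http://127.0.0.1:30000",  # illustrative endpoint
)
print(predictor.predict(image_bytes, prompt=""))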