mineru-2.2.2-py3-none-any.whl → mineru-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
  2. mineru/backend/vlm/model_output_to_middle_json.py +123 -0
  3. mineru/backend/vlm/vlm_analyze.py +97 -16
  4. mineru/backend/vlm/vlm_magic_model.py +201 -135
  5. mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
  6. mineru/cli/client.py +6 -5
  7. mineru/cli/common.py +17 -16
  8. mineru/cli/fast_api.py +9 -7
  9. mineru/cli/gradio_app.py +15 -16
  10. mineru/cli/vlm_vllm_server.py +4 -0
  11. mineru/model/table/rec/unet_table/main.py +8 -0
  12. mineru/model/vlm_vllm_model/__init__.py +0 -0
  13. mineru/model/vlm_vllm_model/server.py +51 -0
  14. mineru/resources/header.html +10 -2
  15. mineru/utils/draw_bbox.py +32 -10
  16. mineru/utils/enum_class.py +16 -2
  17. mineru/utils/guess_suffix_or_lang.py +20 -0
  18. mineru/utils/span_block_fix.py +4 -2
  19. mineru/version.py +1 -1
  20. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/METADATA +70 -25
  21. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/RECORD +25 -38
  22. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/entry_points.txt +1 -1
  23. mineru/backend/vlm/base_predictor.py +0 -186
  24. mineru/backend/vlm/hf_predictor.py +0 -217
  25. mineru/backend/vlm/predictor.py +0 -111
  26. mineru/backend/vlm/sglang_client_predictor.py +0 -443
  27. mineru/backend/vlm/sglang_engine_predictor.py +0 -246
  28. mineru/backend/vlm/token_to_middle_json.py +0 -122
  29. mineru/backend/vlm/utils.py +0 -40
  30. mineru/cli/vlm_sglang_server.py +0 -4
  31. mineru/model/vlm_hf_model/__init__.py +0 -9
  32. mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
  33. mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
  34. mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
  35. mineru/model/vlm_sglang_model/__init__.py +0 -14
  36. mineru/model/vlm_sglang_model/engine.py +0 -264
  37. mineru/model/vlm_sglang_model/image_processor.py +0 -213
  38. mineru/model/vlm_sglang_model/logit_processor.py +0 -90
  39. mineru/model/vlm_sglang_model/model.py +0 -453
  40. mineru/model/vlm_sglang_model/server.py +0 -75
  41. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/WHEEL +0 -0
  42. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/licenses/LICENSE.md +0 -0
  43. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/top_level.txt +0 -0
mineru/cli/common.py CHANGED
@@ -11,13 +11,14 @@ from loguru import logger
11
11
  from mineru.data.data_reader_writer import FileBasedDataWriter
12
12
  from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
13
13
  from mineru.utils.enum_class import MakeMode
14
+ from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
14
15
  from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
15
16
  from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
16
17
  from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
17
18
  from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
18
19
 
19
- pdf_suffixes = [".pdf"]
20
- image_suffixes = [".png", ".jpeg", ".jpg", ".webp", ".gif"]
20
+ pdf_suffixes = ["pdf"]
21
+ image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
21
22
 
22
23
 
23
24
  def read_fn(path):
@@ -25,12 +26,13 @@ def read_fn(path):
25
26
  path = Path(path)
26
27
  with open(str(path), "rb") as input_file:
27
28
  file_bytes = input_file.read()
28
- if path.suffix in image_suffixes:
29
+ file_suffix = guess_suffix_by_bytes(file_bytes)
30
+ if file_suffix in image_suffixes:
29
31
  return images_bytes_to_pdf_bytes(file_bytes)
30
- elif path.suffix in pdf_suffixes:
32
+ elif file_suffix in pdf_suffixes:
31
33
  return file_bytes
32
34
  else:
33
- raise Exception(f"Unknown file suffix: {path.suffix}")
35
+ raise Exception(f"Unknown file suffix: {file_suffix}")
34
36
 
35
37
 
36
38
  def prepare_env(output_dir, pdf_file_name, parse_method):
@@ -145,17 +147,10 @@ def _process_output(
145
147
  )
146
148
 
147
149
  if f_dump_model_output:
148
- if is_pipeline:
149
- md_writer.write_string(
150
- f"{pdf_file_name}_model.json",
151
- json.dumps(model_output, ensure_ascii=False, indent=4),
152
- )
153
- else:
154
- output_text = ("\n" + "-" * 50 + "\n").join(model_output)
155
- md_writer.write_string(
156
- f"{pdf_file_name}_model_output.txt",
157
- output_text,
158
- )
150
+ md_writer.write_string(
151
+ f"{pdf_file_name}_model.json",
152
+ json.dumps(model_output, ensure_ascii=False, indent=4),
153
+ )
159
154
 
160
155
  logger.info(f"local output dir is {local_md_dir}")
161
156
 
@@ -333,6 +328,9 @@ def do_parse(
333
328
  if backend.startswith("vlm-"):
334
329
  backend = backend[4:]
335
330
 
331
+ if backend == "vllm-async-engine":
332
+ raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
333
+
336
334
  os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
337
335
  os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
338
336
 
@@ -381,6 +379,9 @@ async def aio_do_parse(
381
379
  if backend.startswith("vlm-"):
382
380
  backend = backend[4:]
383
381
 
382
+ if backend == "vllm-engine":
383
+ raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
384
+
384
385
  os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
385
386
  os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
386
387
 
mineru/cli/fast_api.py CHANGED
@@ -18,6 +18,7 @@ from base64 import b64encode
18
18
 
19
19
  from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
20
20
  from mineru.utils.cli_parser import arg_parse
21
+ from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
21
22
  from mineru.version import __version__
22
23
 
23
24
  app = FastAPI()
@@ -95,13 +96,14 @@ async def parse_pdf(
95
96
  content = await file.read()
96
97
  file_path = Path(file.filename)
97
98
 
98
- # 如果是图像文件或PDF,使用read_fn处理
99
- if file_path.suffix.lower() in pdf_suffixes + image_suffixes:
100
- # 创建临时文件以便使用read_fn
101
- temp_path = Path(unique_dir) / file_path.name
102
- with open(temp_path, "wb") as f:
103
- f.write(content)
99
+ # 创建临时文件
100
+ temp_path = Path(unique_dir) / file_path.name
101
+ with open(temp_path, "wb") as f:
102
+ f.write(content)
104
103
 
104
+ # 如果是图像文件或PDF,使用read_fn处理
105
+ file_suffix = guess_suffix_by_path(temp_path)
106
+ if file_suffix in pdf_suffixes + image_suffixes:
105
107
  try:
106
108
  pdf_bytes = read_fn(temp_path)
107
109
  pdf_bytes_list.append(pdf_bytes)
@@ -115,7 +117,7 @@ async def parse_pdf(
115
117
  else:
116
118
  return JSONResponse(
117
119
  status_code=400,
118
- content={"error": f"Unsupported file type: {file_path.suffix}"}
120
+ content={"error": f"Unsupported file type: {file_suffix}"}
119
121
  )
120
122
 
121
123
 
mineru/cli/gradio_app.py CHANGED
@@ -182,9 +182,9 @@ def to_pdf(file_path):
182
182
 
183
183
  # 更新界面函数
184
184
  def update_interface(backend_choice):
185
- if backend_choice in ["vlm-transformers", "vlm-sglang-engine"]:
185
+ if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
186
186
  return gr.update(visible=False), gr.update(visible=False)
187
- elif backend_choice in ["vlm-sglang-client"]:
187
+ elif backend_choice in ["vlm-http-client"]:
188
188
  return gr.update(visible=True), gr.update(visible=False)
189
189
  elif backend_choice in ["pipeline"]:
190
190
  return gr.update(visible=False), gr.update(visible=True)
@@ -203,10 +203,10 @@ def update_interface(backend_choice):
203
203
  default=True,
204
204
  )
205
205
  @click.option(
206
- '--enable-sglang-engine',
207
- 'sglang_engine_enable',
206
+ '--enable-vllm-engine',
207
+ 'vllm_engine_enable',
208
208
  type=bool,
209
- help="Enable SgLang engine backend for faster processing.",
209
+ help="Enable vLLM engine backend for faster processing.",
210
210
  default=False,
211
211
  )
212
212
  @click.option(
@@ -246,7 +246,7 @@ def update_interface(backend_choice):
246
246
  default='all',
247
247
  )
248
248
  def main(ctx,
249
- example_enable, sglang_engine_enable, api_enable, max_convert_pages,
249
+ example_enable, vllm_engine_enable, api_enable, max_convert_pages,
250
250
  server_name, server_port, latex_delimiters_type, **kwargs
251
251
  ):
252
252
 
@@ -261,22 +261,21 @@ def main(ctx,
261
261
  else:
262
262
  raise ValueError(f"Invalid latex delimiters type: {latex_delimiters_type}.")
263
263
 
264
- if sglang_engine_enable:
264
+ if vllm_engine_enable:
265
265
  try:
266
- print("Start init SgLang engine...")
266
+ print("Start init vLLM engine...")
267
267
  from mineru.backend.vlm.vlm_analyze import ModelSingleton
268
268
  model_singleton = ModelSingleton()
269
269
  predictor = model_singleton.get_model(
270
- "sglang-engine",
270
+ "vllm-async-engine",
271
271
  None,
272
272
  None,
273
273
  **kwargs
274
274
  )
275
- print("SgLang engine init successfully.")
275
+ print("vLLM engine init successfully.")
276
276
  except Exception as e:
277
277
  logger.exception(e)
278
-
279
- suffixes = pdf_suffixes + image_suffixes
278
+ suffixes = [f".{suffix}" for suffix in pdf_suffixes + image_suffixes]
280
279
  with gr.Blocks() as demo:
281
280
  gr.HTML(header)
282
281
  with gr.Row():
@@ -286,11 +285,11 @@ def main(ctx,
286
285
  with gr.Row():
287
286
  max_pages = gr.Slider(1, max_convert_pages, int(max_convert_pages/2), step=1, label='Max convert pages')
288
287
  with gr.Row():
289
- if sglang_engine_enable:
290
- drop_list = ["pipeline", "vlm-sglang-engine"]
291
- preferred_option = "vlm-sglang-engine"
288
+ if vllm_engine_enable:
289
+ drop_list = ["pipeline", "vlm-vllm-async-engine"]
290
+ preferred_option = "vlm-vllm-async-engine"
292
291
  else:
293
- drop_list = ["pipeline", "vlm-transformers", "vlm-sglang-client"]
292
+ drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
294
293
  preferred_option = "pipeline"
295
294
  backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
296
295
  with gr.Row(visible=False) as client_options:
mineru/cli/vlm_vllm_server.py ADDED
@@ -0,0 +1,4 @@
1
+ from mineru.model.vlm_vllm_model.server import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
mineru/model/table/rec/unet_table/main.py CHANGED
@@ -12,6 +12,7 @@ from PIL import Image
12
12
  from loguru import logger
13
13
  from bs4 import BeautifulSoup
14
14
 
15
+ from mineru.utils.span_pre_proc import calculate_contrast
15
16
  from .table_structure_unet import TSRUnet
16
17
 
17
18
  from mineru.utils.enum_class import ModelPath
@@ -191,6 +192,13 @@ class WiredTableRecognition:
191
192
  # logger.warning(f"Box {i} has invalid aspect ratio: {x1, y1, x2, y2}")
192
193
  continue
193
194
  img_crop = bgr_img[int(y1):int(y2), int(x1):int(x2)]
195
+
196
+ # 计算span的对比度,低于0.20的span不进行ocr
197
+ if calculate_contrast(img_crop, img_mode='bgr') <= 0.17:
198
+ cell_box_map[i] = [[box, "", 0.1]]
199
+ # logger.debug(f"Box {i} skipped due to low contrast.")
200
+ continue
201
+
194
202
  img_crop_list.append(img_crop)
195
203
  img_crop_info_list.append([i, box])
196
204
 
mineru/model/vlm_vllm_model/__init__.py ADDED
File without changes
mineru/model/vlm_vllm_model/server.py ADDED
@@ -0,0 +1,51 @@
1
+ import sys
2
+
3
+ from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
4
+ from vllm.entrypoints.cli.main import main as vllm_main
5
+
6
+
7
+ def main():
8
+ args = sys.argv[1:]
9
+
10
+ has_port_arg = False
11
+ has_gpu_memory_utilization_arg = False
12
+ model_path = None
13
+ model_arg_indices = []
14
+
15
+ # 检查现有参数
16
+ for i, arg in enumerate(args):
17
+ if arg == "--port" or arg.startswith("--port="):
18
+ has_port_arg = True
19
+ if arg == "--gpu-memory-utilization" or arg.startswith("--gpu-memory-utilization="):
20
+ has_gpu_memory_utilization_arg = True
21
+ if arg == "--model":
22
+ if i + 1 < len(args):
23
+ model_path = args[i + 1]
24
+ model_arg_indices.extend([i, i + 1])
25
+ elif arg.startswith("--model="):
26
+ model_path = arg.split("=", 1)[1]
27
+ model_arg_indices.append(i)
28
+
29
+ # 从参数列表中移除 --model 参数
30
+ if model_arg_indices:
31
+ for index in sorted(model_arg_indices, reverse=True):
32
+ args.pop(index)
33
+
34
+ # 添加默认参数
35
+ if not has_port_arg:
36
+ args.extend(["--port", "30000"])
37
+ if not has_gpu_memory_utilization_arg:
38
+ args.extend(["--gpu-memory-utilization", "0.5"])
39
+ if not model_path:
40
+ model_path = auto_download_and_get_model_root_path("/", "vlm")
41
+
42
+ # 重构参数,将模型路径作为位置参数
43
+ sys.argv = [sys.argv[0]] + ["serve", model_path] + args
44
+
45
+ # 启动vllm服务器
46
+ print(f"start vllm server: {sys.argv}")
47
+ vllm_main()
48
+
49
+
50
+ if __name__ == "__main__":
51
+ main()
mineru/resources/header.html CHANGED
@@ -54,7 +54,7 @@
54
54
  font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
55
55
  'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
56
56
  ">
57
- MinerU 2: PDF Extraction Demo
57
+ MinerU 2.5: PDF Extraction Demo
58
58
  </h1>
59
59
  </div>
60
60
  </div>
@@ -88,7 +88,15 @@
88
88
  <span style="color: white">Code</span>
89
89
  </a>
90
90
  </span>
91
-
91
+ <!-- Code Link. -->
92
+ <span class="link-block">
93
+ <a href="https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
94
+ <span class="icon" style="margin-right: 4px">
95
+ <i class="fas fa-archive" style="color: white; margin-right: 4px"></i>
96
+ </span>
97
+ <span style="color: white">Model</span>
98
+ </a>
99
+ </span>
92
100
  <!-- arXiv Link. -->
93
101
  <span class="link-block">
94
102
  <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
mineru/utils/draw_bbox.py CHANGED
@@ -119,22 +119,26 @@ def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_b
119
119
 
120
120
  def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
121
121
  dropped_bbox_list = []
122
- tables_list, tables_body_list = [], []
123
- tables_caption_list, tables_footnote_list = [], []
124
- imgs_list, imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], [], []
122
+ tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
123
+ imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
124
+ codes_body_list, codes_caption_list = [], []
125
125
  titles_list = []
126
126
  texts_list = []
127
127
  interequations_list = []
128
128
  lists_list = []
129
+ list_items_list = []
129
130
  indexs_list = []
131
+
130
132
  for page in pdf_info:
131
133
  page_dropped_list = []
132
- tables, tables_body, tables_caption, tables_footnote = [], [], [], []
133
- imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
134
+ tables_body, tables_caption, tables_footnote = [], [], []
135
+ imgs_body, imgs_caption, imgs_footnote = [], [], []
136
+ codes_body, codes_caption = [], []
134
137
  titles = []
135
138
  texts = []
136
139
  interequations = []
137
140
  lists = []
141
+ list_items = []
138
142
  indices = []
139
143
 
140
144
  for dropped_bbox in page['discarded_blocks']:
@@ -143,7 +147,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
143
147
  for block in page["para_blocks"]:
144
148
  bbox = block["bbox"]
145
149
  if block["type"] == BlockType.TABLE:
146
- tables.append(bbox)
147
150
  for nested_block in block["blocks"]:
148
151
  bbox = nested_block["bbox"]
149
152
  if nested_block["type"] == BlockType.TABLE_BODY:
@@ -155,7 +158,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
155
158
  continue
156
159
  tables_footnote.append(bbox)
157
160
  elif block["type"] == BlockType.IMAGE:
158
- imgs.append(bbox)
159
161
  for nested_block in block["blocks"]:
160
162
  bbox = nested_block["bbox"]
161
163
  if nested_block["type"] == BlockType.IMAGE_BODY:
@@ -164,22 +166,31 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
164
166
  imgs_caption.append(bbox)
165
167
  elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
166
168
  imgs_footnote.append(bbox)
169
+ elif block["type"] == BlockType.CODE:
170
+ for nested_block in block["blocks"]:
171
+ if nested_block["type"] == BlockType.CODE_BODY:
172
+ bbox = nested_block["bbox"]
173
+ codes_body.append(bbox)
174
+ elif nested_block["type"] == BlockType.CODE_CAPTION:
175
+ bbox = nested_block["bbox"]
176
+ codes_caption.append(bbox)
167
177
  elif block["type"] == BlockType.TITLE:
168
178
  titles.append(bbox)
169
- elif block["type"] == BlockType.TEXT:
179
+ elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT]:
170
180
  texts.append(bbox)
171
181
  elif block["type"] == BlockType.INTERLINE_EQUATION:
172
182
  interequations.append(bbox)
173
183
  elif block["type"] == BlockType.LIST:
174
184
  lists.append(bbox)
185
+ if "blocks" in block:
186
+ for sub_block in block["blocks"]:
187
+ list_items.append(sub_block["bbox"])
175
188
  elif block["type"] == BlockType.INDEX:
176
189
  indices.append(bbox)
177
190
 
178
- tables_list.append(tables)
179
191
  tables_body_list.append(tables_body)
180
192
  tables_caption_list.append(tables_caption)
181
193
  tables_footnote_list.append(tables_footnote)
182
- imgs_list.append(imgs)
183
194
  imgs_body_list.append(imgs_body)
184
195
  imgs_caption_list.append(imgs_caption)
185
196
  imgs_footnote_list.append(imgs_footnote)
@@ -187,7 +198,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
187
198
  texts_list.append(texts)
188
199
  interequations_list.append(interequations)
189
200
  lists_list.append(lists)
201
+ list_items_list.append(list_items)
190
202
  indexs_list.append(indices)
203
+ codes_body_list.append(codes_body)
204
+ codes_caption_list.append(codes_caption)
191
205
 
192
206
  layout_bbox_list = []
193
207
 
@@ -197,6 +211,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
197
211
  for block in page["para_blocks"]:
198
212
  if block["type"] in [
199
213
  BlockType.TEXT,
214
+ BlockType.REF_TEXT,
200
215
  BlockType.TITLE,
201
216
  BlockType.INTERLINE_EQUATION,
202
217
  BlockType.LIST,
@@ -215,6 +230,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
215
230
  continue
216
231
  bbox = sub_block["bbox"]
217
232
  page_block_list.append(bbox)
233
+ elif block["type"] in [BlockType.CODE]:
234
+ for sub_block in block["blocks"]:
235
+ bbox = sub_block["bbox"]
236
+ page_block_list.append(bbox)
218
237
 
219
238
  layout_bbox_list.append(page_block_list)
220
239
 
@@ -231,6 +250,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
231
250
  # 使用原始PDF的尺寸创建canvas
232
251
  c = canvas.Canvas(packet, pagesize=custom_page_size)
233
252
 
253
+ c = draw_bbox_without_number(i, codes_body_list, page, c, [102, 0, 204], True)
254
+ c = draw_bbox_without_number(i, codes_caption_list, page, c, [204, 153, 255], True)
234
255
  c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
235
256
  c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
236
257
  c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
@@ -242,6 +263,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
242
263
  c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
243
264
  c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
244
265
  c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
266
+ c = draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)
245
267
  c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
246
268
  c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
247
269
 
mineru/utils/enum_class.py CHANGED
@@ -14,6 +14,19 @@ class BlockType:
14
14
  INDEX = 'index'
15
15
  DISCARDED = 'discarded'
16
16
 
17
+ # Added in vlm 2.5
18
+ CODE = "code"
19
+ CODE_BODY = "code_body"
20
+ CODE_CAPTION = "code_caption"
21
+ ALGORITHM = "algorithm"
22
+ REF_TEXT = "ref_text"
23
+ PHONETIC = "phonetic"
24
+ HEADER = "header"
25
+ FOOTER = "footer"
26
+ PAGE_NUMBER = "page_number"
27
+ ASIDE_TEXT = "aside_text"
28
+ PAGE_FOOTNOTE = "page_footnote"
29
+
17
30
 
18
31
  class ContentType:
19
32
  IMAGE = 'image'
@@ -22,6 +35,7 @@ class ContentType:
22
35
  INTERLINE_EQUATION = 'interline_equation'
23
36
  INLINE_EQUATION = 'inline_equation'
24
37
  EQUATION = 'equation'
38
+ CODE = 'code'
25
39
 
26
40
 
27
41
  class CategoryId:
@@ -49,8 +63,8 @@ class MakeMode:
49
63
 
50
64
 
51
65
  class ModelPath:
52
- vlm_root_hf = "opendatalab/MinerU2.0-2505-0.9B"
53
- vlm_root_modelscope = "OpenDataLab/MinerU2.0-2505-0.9B"
66
+ vlm_root_hf = "opendatalab/MinerU2.5-2509-1.2B"
67
+ vlm_root_modelscope = "OpenDataLab/MinerU2.5-2509-1.2B"
54
68
  pipeline_root_modelscope = "OpenDataLab/PDF-Extract-Kit-1.0"
55
69
  pipeline_root_hf = "opendatalab/PDF-Extract-Kit-1.0"
56
70
  doclayout_yolo = "models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt"
mineru/utils/guess_suffix_or_lang.py ADDED
@@ -0,0 +1,20 @@
1
+ from magika import Magika
2
+
3
+
4
+ DEFAULT_LANG = "txt"
5
+ magika = Magika()
6
+
7
+ def guess_language_by_text(code):
8
+ codebytes = code.encode(encoding="utf-8")
9
+ lang = magika.identify_bytes(codebytes).prediction.output.label
10
+ return lang if lang != "unknown" else DEFAULT_LANG
11
+
12
+
13
+ def guess_suffix_by_bytes(file_bytes) -> str:
14
+ suffix = magika.identify_bytes(file_bytes).prediction.output.label
15
+ return suffix
16
+
17
+
18
+ def guess_suffix_by_path(file_path) -> str:
19
+ suffix = magika.identify_path(file_path).prediction.output.label
20
+ return suffix
mineru/utils/span_block_fix.py CHANGED
@@ -23,9 +23,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
23
23
  block_dict['group_id'] = block[-1]
24
24
  block_spans = []
25
25
  for span in spans:
26
+ temp_radio = radio
26
27
  span_bbox = span['bbox']
27
- if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(
28
- span['type'], block_type):
28
+ if span['type'] in [ContentType.IMAGE, ContentType.TABLE]:
29
+ temp_radio = 0.9
30
+ if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > temp_radio and span_block_type_compatible(span['type'], block_type):
29
31
  block_spans.append(span)
30
32
 
31
33
  block_dict['spans'] = block_spans
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.2.2"
1
+ __version__ = "2.5.0"