mineru 2.5.3__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. mineru/backend/pipeline/model_init.py +25 -3
  2. mineru/backend/pipeline/model_json_to_middle_json.py +2 -2
  3. mineru/backend/pipeline/model_list.py +0 -1
  4. mineru/backend/utils.py +24 -0
  5. mineru/backend/vlm/model_output_to_middle_json.py +2 -2
  6. mineru/backend/vlm/{custom_logits_processors.py → utils.py} +36 -2
  7. mineru/backend/vlm/vlm_analyze.py +43 -50
  8. mineru/backend/vlm/vlm_magic_model.py +155 -1
  9. mineru/cli/common.py +26 -23
  10. mineru/cli/fast_api.py +2 -8
  11. mineru/cli/gradio_app.py +104 -13
  12. mineru/cli/models_download.py +1 -0
  13. mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py +152 -0
  14. mineru/model/mfr/pp_formulanet_plus_m/processors.py +657 -0
  15. mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py +1 -326
  16. mineru/model/mfr/utils.py +338 -0
  17. mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py +103 -16
  18. mineru/model/table/rec/unet_table/main.py +1 -1
  19. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/imaug/operators.py +5 -5
  20. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/__init__.py +2 -1
  21. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_lcnetv3.py +7 -7
  22. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_pphgnetv2.py +2 -2
  23. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/__init__.py +2 -0
  24. mineru/model/utils/pytorchocr/modeling/heads/rec_ppformulanet_head.py +1383 -0
  25. mineru/model/utils/pytorchocr/modeling/heads/rec_unimernet_head.py +2631 -0
  26. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/rec_postprocess.py +25 -28
  27. mineru/model/utils/pytorchocr/utils/__init__.py +0 -0
  28. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/arch_config.yaml +130 -0
  29. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_arabic_dict.txt +747 -0
  30. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_cyrillic_dict.txt +850 -0
  31. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_devanagari_dict.txt +568 -0
  32. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_ta_dict.txt +513 -0
  33. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_te_dict.txt +540 -0
  34. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/models_config.yml +15 -15
  35. mineru/model/utils/pytorchocr/utils/resources/pp_formulanet_arch_config.yaml +24 -0
  36. mineru/model/utils/tools/infer/__init__.py +1 -0
  37. mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_det.py +6 -3
  38. mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_rec.py +16 -25
  39. mineru/model/vlm_vllm_model/server.py +4 -1
  40. mineru/resources/header.html +2 -2
  41. mineru/utils/enum_class.py +1 -0
  42. mineru/utils/guess_suffix_or_lang.py +9 -1
  43. mineru/utils/llm_aided.py +4 -2
  44. mineru/utils/ocr_utils.py +16 -0
  45. mineru/utils/table_merge.py +102 -13
  46. mineru/version.py +1 -1
  47. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/METADATA +33 -6
  48. mineru-2.6.0.dist-info/RECORD +195 -0
  49. mineru-2.5.3.dist-info/RECORD +0 -181
  50. /mineru/model/{ocr/paddleocr2pytorch/pytorchocr → mfr/pp_formulanet_plus_m}/__init__.py +0 -0
  51. /mineru/model/{ocr/paddleocr2pytorch/tools/infer → utils}/__init__.py +0 -0
  52. /mineru/model/{ocr/paddleocr2pytorch/pytorchocr/modeling → utils/pytorchocr}/__init__.py +0 -0
  53. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/base_ocr_v20.py +0 -0
  54. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/__init__.py +0 -0
  55. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/imaug/__init__.py +0 -0
  56. /mineru/model/{ocr/paddleocr2pytorch/pytorchocr/utils → utils/pytorchocr/modeling}/__init__.py +0 -0
  57. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/architectures/__init__.py +0 -0
  58. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/architectures/base_model.py +0 -0
  59. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/det_mobilenet_v3.py +0 -0
  60. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_donut_swin.py +0 -0
  61. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_hgnet.py +0 -0
  62. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +0 -0
  63. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_mv1_enhance.py +0 -0
  64. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_svtrnet.py +0 -0
  65. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/common.py +0 -0
  66. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/cls_head.py +0 -0
  67. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/det_db_head.py +0 -0
  68. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/rec_ctc_head.py +0 -0
  69. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/rec_multi_head.py +0 -0
  70. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/__init__.py +0 -0
  71. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/db_fpn.py +0 -0
  72. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/intracl.py +0 -0
  73. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/rnn.py +0 -0
  74. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/__init__.py +0 -0
  75. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/cls_postprocess.py +0 -0
  76. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/db_postprocess.py +0 -0
  77. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/arabic_dict.txt +0 -0
  78. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +0 -0
  79. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/cyrillic_dict.txt +0 -0
  80. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/devanagari_dict.txt +0 -0
  81. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/en_dict.txt +0 -0
  82. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/japan_dict.txt +0 -0
  83. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ka_dict.txt +0 -0
  84. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/korean_dict.txt +0 -0
  85. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/latin_dict.txt +0 -0
  86. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +0 -0
  87. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt +0 -0
  88. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_dict.txt +0 -0
  89. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_el_dict.txt +0 -0
  90. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_en_dict.txt +0 -0
  91. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt +0 -0
  92. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt +0 -0
  93. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt +0 -0
  94. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_th_dict.txt +0 -0
  95. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ta_dict.txt +0 -0
  96. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/te_dict.txt +0 -0
  97. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/__init__.py +0 -0
  98. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_cls.py +0 -0
  99. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_system.py +0 -0
  100. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/pytorchocr_utility.py +0 -0
  101. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/WHEEL +0 -0
  102. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/entry_points.txt +0 -0
  103. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/licenses/LICENSE.md +0 -0
  104. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import sys
2
3
 
3
4
  from mineru.backend.vlm.custom_logits_processors import enable_custom_logits_processors
@@ -42,7 +43,7 @@ def main():
42
43
  if not has_port_arg:
43
44
  args.extend(["--port", "30000"])
44
45
  if not has_gpu_memory_utilization_arg:
45
- args.extend(["--gpu-memory-utilization", "0.5"])
46
+ args.extend(["--gpu-memory-utilization", "0.7"])
46
47
  if not model_path:
47
48
  model_path = auto_download_and_get_model_root_path("/", "vlm")
48
49
  if (not has_logits_processors_arg) and custom_logits_processors:
@@ -51,6 +52,8 @@ def main():
51
52
  # 重构参数,将模型路径作为位置参数
52
53
  sys.argv = [sys.argv[0]] + ["serve", model_path] + args
53
54
 
55
+ os.environ["OMP_NUM_THREADS"] = "1"
56
+
54
57
  # 启动vllm服务器
55
58
  print(f"start vllm server: {sys.argv}")
56
59
  vllm_main()
@@ -99,7 +99,7 @@
99
99
  </span>
100
100
  <!-- arXiv Link. -->
101
101
  <span class="link-block">
102
- <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
102
+ <a href="https://arxiv.org/abs/2509.22186" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
103
103
  <span class="icon" style="margin-right: 8px">
104
104
  <i class="fas fa-file" style="color: white"></i>
105
105
  </span>
@@ -134,4 +134,4 @@
134
134
  </div>
135
135
 
136
136
 
137
- </body></html>
137
+ </body></html>
@@ -70,6 +70,7 @@ class ModelPath:
70
70
  doclayout_yolo = "models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt"
71
71
  yolo_v8_mfd = "models/MFD/YOLO/yolo_v8_ft.pt"
72
72
  unimernet_small = "models/MFR/unimernet_hf_small_2503"
73
+ pp_formulanet_plus_m = "models/MFR/pp_formulanet_plus_m"
73
74
  pytorch_paddle = "models/OCR/paddleocr_torch"
74
75
  layout_reader = "models/ReadingOrder/layout_reader"
75
76
  slanet_plus = "models/TabRec/SlanetPlus/slanet-plus.onnx"
@@ -1,3 +1,5 @@
1
+ from pathlib import Path
2
+
1
3
  from magika import Magika
2
4
 
3
5
 
@@ -10,11 +12,17 @@ def guess_language_by_text(code):
10
12
  return lang if lang != "unknown" else DEFAULT_LANG
11
13
 
12
14
 
13
- def guess_suffix_by_bytes(file_bytes) -> str:
15
+ def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
14
16
  suffix = magika.identify_bytes(file_bytes).prediction.output.label
17
+ if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
18
+ suffix = "pdf"
15
19
  return suffix
16
20
 
17
21
 
18
22
  def guess_suffix_by_path(file_path) -> str:
23
+ if not isinstance(file_path, Path):
24
+ file_path = Path(file_path)
19
25
  suffix = magika.identify_path(file_path).prediction.output.label
26
+ if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
27
+ suffix = "pdf"
20
28
  return suffix
mineru/utils/llm_aided.py CHANGED
@@ -51,7 +51,7 @@ def llm_aided_title(page_info_list, title_aided_config):
51
51
  3. 保持字典内key-value的对应关系不变
52
52
 
53
53
  4. 优化层次结构:
54
- - 为每个标题元素添加适当的层次结构
54
+ - 根据标题内容的语义为每个标题元素添加适当的层次结构
55
55
  - 行高较大的标题一般是更高级别的标题
56
56
  - 标题从前至后的层级必须是连续的,不能跳过层级
57
57
  - 标题层级最多为4级,不要添加过多的层级
@@ -61,7 +61,6 @@ def llm_aided_title(page_info_list, title_aided_config):
61
61
  - 在完成初步分级后,仔细检查分级结果的合理性
62
62
  - 根据上下文关系和逻辑顺序,对不合理的分级进行微调
63
63
  - 确保最终的分级结果符合文档的实际结构和逻辑
64
- - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
65
64
 
66
65
  IMPORTANT:
67
66
  请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
@@ -78,6 +77,8 @@ Input title list:
78
77
 
79
78
  Corrected title list:
80
79
  """
80
+ #5.
81
+ #- 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
81
82
 
82
83
  retry_count = 0
83
84
  max_retries = 3
@@ -89,6 +90,7 @@ Corrected title list:
89
90
  model=title_aided_config["model"],
90
91
  messages=[
91
92
  {'role': 'user', 'content': title_optimize_prompt}],
93
+ extra_body={"enable_thinking": False},
92
94
  temperature=0.7,
93
95
  stream=True,
94
96
  )
mineru/utils/ocr_utils.py CHANGED
@@ -406,6 +406,12 @@ def calculate_is_angle(poly):
406
406
  # logger.info((p3[1] - p1[1])/height)
407
407
  return True
408
408
 
409
+ def is_bbox_aligned_rect(points):
410
+ x_coords = points[:, 0]
411
+ y_coords = points[:, 1]
412
+ unique_x = np.unique(x_coords)
413
+ unique_y = np.unique(y_coords)
414
+ return len(unique_x) == 2 and len(unique_y) == 2
409
415
 
410
416
  def get_rotate_crop_image(img, points):
411
417
  '''
@@ -419,6 +425,16 @@ def get_rotate_crop_image(img, points):
419
425
  points[:, 1] = points[:, 1] - top
420
426
  '''
421
427
  assert len(points) == 4, "shape of points must be 4*2"
428
+
429
+ if is_bbox_aligned_rect(points):
430
+ xmin = int(np.min(points[:, 0]))
431
+ xmax = int(np.max(points[:, 0]))
432
+ ymin = int(np.min(points[:, 1]))
433
+ ymax = int(np.max(points[:, 1]))
434
+ new_img = img[ymin:ymax, xmin:xmax].copy()
435
+ if new_img.shape[0] > 0 and new_img.shape[1] > 0:
436
+ return new_img
437
+
422
438
  img_crop_width = int(
423
439
  max(
424
440
  np.linalg.norm(points[0] - points[1]),
@@ -3,6 +3,7 @@
3
3
  from loguru import logger
4
4
  from bs4 import BeautifulSoup
5
5
 
6
+ from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
6
7
  from mineru.utils.enum_class import BlockType, SplitFlag
7
8
 
8
9
 
@@ -144,8 +145,9 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
144
145
  colspan2 = int(cell2.get("colspan", 1))
145
146
  rowspan2 = int(cell2.get("rowspan", 1))
146
147
 
147
- text1 = full_to_half(cell1.get_text().strip())
148
- text2 = full_to_half(cell2.get_text().strip())
148
+ # 去除所有空白字符(包括空格、换行、制表符等)
149
+ text1 = ''.join(full_to_half(cell1.get_text()).split())
150
+ text2 = ''.join(full_to_half(cell2.get_text()).split())
149
151
 
150
152
  if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
151
153
  structure_match = False
@@ -169,8 +171,12 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
169
171
  def can_merge_tables(current_table_block, previous_table_block):
170
172
  """判断两个表格是否可以合并"""
171
173
  # 检查表格是否有caption和footnote
172
- if any(block["type"] == BlockType.TABLE_CAPTION for block in current_table_block["blocks"]):
173
- return False, None, None, None, None
174
+ # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
175
+ caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
176
+ if caption_blocks:
177
+ # 如果所有caption都不以"(续)"结尾,则不合并
178
+ if not any(full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in caption_blocks):
179
+ return False, None, None, None, None
174
180
 
175
181
  if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
176
182
  return False, None, None, None, None
@@ -253,6 +259,59 @@ def check_rows_match(soup1, soup2):
253
259
  return last_row_cols == first_row_cols or last_row_visual_cols == first_row_visual_cols
254
260
 
255
261
 
262
+ def check_row_columns_match(row1, row2):
263
+ # 逐个cell检测colspan属性是否一致
264
+ cells1 = row1.find_all(["td", "th"])
265
+ cells2 = row2.find_all(["td", "th"])
266
+ if len(cells1) != len(cells2):
267
+ return False
268
+ for cell1, cell2 in zip(cells1, cells2):
269
+ colspan1 = int(cell1.get("colspan", 1))
270
+ colspan2 = int(cell2.get("colspan", 1))
271
+ if colspan1 != colspan2:
272
+ return False
273
+ return True
274
+
275
+
276
+ def adjust_table_rows_colspan(rows, start_idx, end_idx,
277
+ reference_structure, reference_visual_cols,
278
+ target_cols, current_cols, reference_row):
279
+ """调整表格行的colspan属性以匹配目标列数
280
+
281
+ Args:
282
+ rows: 表格行列表
283
+ start_idx: 起始行索引
284
+ end_idx: 结束行索引(不包含)
285
+ reference_structure: 参考行的colspan结构列表
286
+ reference_visual_cols: 参考行的视觉列数
287
+ target_cols: 目标总列数
288
+ current_cols: 当前总列数
289
+ reference_row: 参考行对象
290
+ """
291
+ for i in range(start_idx, end_idx):
292
+ row = rows[i]
293
+ cells = row.find_all(["td", "th"])
294
+ if not cells:
295
+ continue
296
+
297
+ current_row_cols = calculate_row_columns(row)
298
+ if current_row_cols >= target_cols:
299
+ continue
300
+
301
+ # 检查是否与参考行结构匹配
302
+ if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row):
303
+ # 尝试应用参考结构
304
+ if len(cells) <= len(reference_structure):
305
+ for j, cell in enumerate(cells):
306
+ if j < len(reference_structure) and reference_structure[j] > 1:
307
+ cell["colspan"] = str(reference_structure[j])
308
+ else:
309
+ # 扩展最后一个单元格以填补列数差异
310
+ last_cell = cells[-1]
311
+ current_last_span = int(last_cell.get("colspan", 1))
312
+ last_cell["colspan"] = str(current_last_span + (target_cols - current_cols))
313
+
314
+
256
315
  def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
257
316
  """执行表格合并操作"""
258
317
  # 检测表头有几行,并确认表头内容是否一致
@@ -263,17 +322,47 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
263
322
  # 找到第一个表格的tbody,如果没有则查找table元素
264
323
  tbody1 = soup1.find("tbody") or soup1.find("table")
265
324
 
266
- # 找到第二个表格的tbody,如果没有则查找table元素
267
- tbody2 = soup2.find("tbody") or soup2.find("table")
325
+ # 获取表1和表2的所有行
326
+ rows1 = soup1.find_all("tr")
327
+ rows2 = soup2.find_all("tr")
328
+
329
+
330
+ if rows1 and rows2 and header_count < len(rows2):
331
+ # 获取表1最后一行和表2第一个非表头行
332
+ last_row1 = rows1[-1]
333
+ first_data_row2 = rows2[header_count]
334
+
335
+ # 计算表格总列数
336
+ table_cols1 = calculate_table_total_columns(soup1)
337
+ table_cols2 = calculate_table_total_columns(soup2)
338
+ if table_cols1 >= table_cols2:
339
+ reference_structure = [int(cell.get("colspan", 1)) for cell in last_row1.find_all(["td", "th"])]
340
+ reference_visual_cols = calculate_visual_columns(last_row1)
341
+ # 以表1的最后一行为参考,调整表2的行
342
+ adjust_table_rows_colspan(
343
+ rows2, header_count, len(rows2),
344
+ reference_structure, reference_visual_cols,
345
+ table_cols1, table_cols2, first_data_row2
346
+ )
347
+
348
+ else: # table_cols2 > table_cols1
349
+ reference_structure = [int(cell.get("colspan", 1)) for cell in first_data_row2.find_all(["td", "th"])]
350
+ reference_visual_cols = calculate_visual_columns(first_data_row2)
351
+ # 以表2的第一个数据行为参考,调整表1的行
352
+ adjust_table_rows_colspan(
353
+ rows1, 0, len(rows1),
354
+ reference_structure, reference_visual_cols,
355
+ table_cols2, table_cols1, last_row1
356
+ )
268
357
 
269
358
  # 将第二个表格的行添加到第一个表格中
270
- if tbody1 and tbody2:
271
- rows2 = soup2.find_all("tr")
272
- # 将第二个表格的行添加到第一个表格中(跳过表头行)
273
- for row in rows2[header_count:]:
274
- # 从原来的位置移除行,并添加到第一个表格中
275
- row.extract()
276
- tbody1.append(row)
359
+ if tbody1:
360
+ tbody2 = soup2.find("tbody") or soup2.find("table")
361
+ if tbody2:
362
+ # 将第二个表格的行添加到第一个表格中(跳过表头行)
363
+ for row in rows2[header_count:]:
364
+ row.extract()
365
+ tbody1.append(row)
277
366
 
278
367
  # 添加待合并表格的footnote到前一个表格中
279
368
  for table_footnote in wait_merge_table_footnotes:
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.5.3"
1
+ __version__ = "2.6.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.5.3
3
+ Version: 2.6.0
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -34,10 +34,10 @@ Requires-Dist: json-repair>=0.46.2
34
34
  Requires-Dist: opencv-python>=4.11.0.86
35
35
  Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
36
36
  Requires-Dist: scikit-image<1.0.0,>=0.25.0
37
- Requires-Dist: openai<2,>=1.70.0
37
+ Requires-Dist: openai<3,>=1.70.0
38
38
  Requires-Dist: beautifulsoup4<5,>=4.13.5
39
39
  Requires-Dist: magika<0.7.0,>=0.6.2
40
- Requires-Dist: mineru-vl-utils<1,>=0.1.8
40
+ Requires-Dist: mineru-vl-utils<1,>=0.1.14
41
41
  Provides-Extra: test
42
42
  Requires-Dist: mineru[core]; extra == "test"
43
43
  Requires-Dist: pytest; extra == "test"
@@ -49,7 +49,7 @@ Requires-Dist: torch<3,>=2.6.0; extra == "vlm"
49
49
  Requires-Dist: transformers<5.0.0,>=4.51.1; extra == "vlm"
50
50
  Requires-Dist: accelerate>=1.5.1; extra == "vlm"
51
51
  Provides-Extra: vllm
52
- Requires-Dist: vllm<0.11,>=0.10.1.1; extra == "vllm"
52
+ Requires-Dist: vllm<0.12,>=0.10.1.1; extra == "vllm"
53
53
  Provides-Extra: pipeline
54
54
  Requires-Dist: matplotlib<4,>=3.10; extra == "pipeline"
55
55
  Requires-Dist: ultralytics<9,>=8.3.48; extra == "pipeline"
@@ -84,7 +84,7 @@ Dynamic: license-file
84
84
  <div align="center" xmlns="http://www.w3.org/1999/html">
85
85
  <!-- logo -->
86
86
  <p align="center">
87
- <img src="docs/images/MinerU-logo.png" width="300px" style="vertical-align:middle;">
87
+ <img src="https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docs/images/MinerU-logo.png" width="300px" style="vertical-align:middle;">
88
88
  </p>
89
89
 
90
90
  <!-- icon -->
@@ -101,7 +101,8 @@ Dynamic: license-file
101
101
  [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
102
102
  [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
103
103
  [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/a3cb16570ab3cfeadf9d8f0ac91b4fca/mineru_demo.ipynb)
104
- [![arXiv](https://img.shields.io/badge/arXiv-2409.18839-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2409.18839)
104
+ [![arXiv](https://img.shields.io/badge/MinerU-Technical%20Report-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2409.18839)
105
+ [![arXiv](https://img.shields.io/badge/MinerU2.5-Technical%20Report-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2509.22186)
105
106
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/opendatalab/MinerU)
106
107
 
107
108
 
@@ -126,6 +127,22 @@ Dynamic: license-file
126
127
  </div>
127
128
 
128
129
  # Changelog
130
+ - 2025/10/24 2.6.0 Release
131
+ - `pipeline` backend optimizations
132
+ - Added experimental support for Chinese formulas, which can be enabled by setting the environment variable `export MINERU_FORMULA_CH_SUPPORT=1`. This feature may cause a slight decrease in MFR speed and failures in recognizing some long formulas. It is recommended to enable it only when parsing Chinese formulas is needed. To disable this feature, set the environment variable to `0`.
133
+ - `OCR` speed significantly improved by 200%~300%, thanks to the optimization solution provided by [@cjsdurj](https://github.com/cjsdurj)
134
+ - `OCR` models updated to `ppocr-v5` version for Cyrillic, Arabic, Devanagari, Telugu (te), and Tamil (ta) languages, with accuracy improved by over 40% compared to previous models
135
+ - `vlm` backend optimizations
136
+ - `table_caption` and `table_footnote` matching logic optimized to improve the accuracy of table caption and footnote matching and reading order rationality in scenarios with multiple consecutive tables on a page
137
+ - Optimized CPU resource usage during high concurrency when using `vllm` backend, reducing server pressure
138
+ - Adapted to `vllm` version 0.11.0
139
+ - General optimizations
140
+ - Cross-page table merging effect optimized, added support for cross-page continuation table merging, improving table merging effectiveness in multi-column merge scenarios
141
+ - Added environment variable configuration option `MINERU_TABLE_MERGE_ENABLE` for table merging feature. Table merging is enabled by default and can be disabled by setting this variable to `0`
142
+
143
+ - 2025/09/26 2.5.4 released
144
+ - 🎉🎉 The MinerU2.5 [Technical Report](https://arxiv.org/abs/2509.22186) is now available! We welcome you to read it for a comprehensive overview of its model architecture, training strategy, data engineering and evaluation results.
145
+ - Fixed an issue where some `PDF` files were mistakenly identified as `AI` files, causing parsing failures
129
146
 
130
147
  - 2025/09/20 2.5.3 Released
131
148
  - Dependency version range adjustment to enable Turing and earlier architecture GPUs to use vLLM acceleration for MinerU2.5 model inference.
@@ -822,6 +839,16 @@ Currently, some models in this project are trained based on YOLO. However, since
822
839
  # Citation
823
840
 
824
841
  ```bibtex
842
+ @misc{niu2025mineru25decoupledvisionlanguagemodel,
843
+ title={MinerU2.5: A Decoupled Vision-Language Model for Efficient High-Resolution Document Parsing},
844
+ author={Junbo Niu and Zheng Liu and Zhuangcheng Gu and Bin Wang and Linke Ouyang and Zhiyuan Zhao and Tao Chu and Tianyao He and Fan Wu and Qintong Zhang and Zhenjiang Jin and Guang Liang and Rui Zhang and Wenzheng Zhang and Yuan Qu and Zhifei Ren and Yuefeng Sun and Yuanhong Zheng and Dongsheng Ma and Zirui Tang and Boyu Niu and Ziyang Miao and Hejun Dong and Siyi Qian and Junyuan Zhang and Jingzhou Chen and Fangdong Wang and Xiaomeng Zhao and Liqun Wei and Wei Li and Shasha Wang and Ruiliang Xu and Yuanyuan Cao and Lu Chen and Qianqian Wu and Huaiyu Gu and Lindong Lu and Keming Wang and Dechen Lin and Guanlin Shen and Xuanhe Zhou and Linfeng Zhang and Yuhang Zang and Xiaoyi Dong and Jiaqi Wang and Bo Zhang and Lei Bai and Pei Chu and Weijia Li and Jiang Wu and Lijun Wu and Zhenxiang Li and Guangyu Wang and Zhongying Tu and Chao Xu and Kai Chen and Yu Qiao and Bowen Zhou and Dahua Lin and Wentao Zhang and Conghui He},
845
+ year={2025},
846
+ eprint={2509.22186},
847
+ archivePrefix={arXiv},
848
+ primaryClass={cs.CV},
849
+ url={https://arxiv.org/abs/2509.22186},
850
+ }
851
+
825
852
  @misc{wang2024mineruopensourcesolutionprecise,
826
853
  title={MinerU: An Open-Source Solution for Precise Document Content Extraction},
827
854
  author={Bin Wang and Chao Xu and Xiaomeng Zhao and Linke Ouyang and Fan Wu and Zhiyuan Zhao and Rui Xu and Kaiwen Liu and Yuan Qu and Fukai Shang and Bo Zhang and Liqun Wei and Zhihao Sui and Wei Li and Botian Shi and Yu Qiao and Dahua Lin and Conghui He},
@@ -0,0 +1,195 @@
1
+ mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
+ mineru/version.py,sha256=OEib63e0yPEGlhEXyrWE1OwRnleR0cHI7KSX7oZEQLs,22
3
+ mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
+ mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
+ mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
6
+ mineru/backend/pipeline/batch_analyze.py,sha256=dOnktvOMjfg84w1H34YlJg6N9_x6Yfvf14NIpOQcZqQ,22221
7
+ mineru/backend/pipeline/model_init.py,sha256=OfB2MMjNmZcHl4fkqS1fT5R8I3LVoSKAHGtl8PcBfBs,9372
8
+ mineru/backend/pipeline/model_json_to_middle_json.py,sha256=DtB7kE_7CtxwOMcb6QYeKzY6vMwUJNpavc5fn9z9oiI,10916
9
+ mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
10
+ mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
11
+ mineru/backend/pipeline/pipeline_analyze.py,sha256=rbO5AetOdnxR5ctkoDzFCFoElkz7Jgb7gi2Ct596NK8,6655
12
+ mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
13
+ mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=xWWOFmYL6hB8PLrxQFyRJ72dAmTIDHtqiWV-WFUfR44,14081
14
+ mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
15
+ mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
16
+ mineru/backend/vlm/utils.py,sha256=6NmVmr6-7idurCmT-1gE2SdmGaorSGgIaHmAg0fMABI,2792
17
+ mineru/backend/vlm/vlm_analyze.py,sha256=aepYsICM2LXhm4pkAa0Abyki1d8M-OdbgeL4KWt91BQ,8083
18
+ mineru/backend/vlm/vlm_magic_model.py,sha256=Pd0sOr7G1crAJIVeq6h_03gNSuxmV5U8dvGTGT_rrjs,23452
19
+ mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=Ie95XpwTgi7EmidcwE_scvXMRQjE2xASU_Rm_F8EP-I,13377
20
+ mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
21
+ mineru/cli/client.py,sha256=uo7db9Wqj1Mc11MYuaM-bi54BfKKU3SFB9Urc8md5X4,6641
22
+ mineru/cli/common.py,sha256=jxFJMdc-02UMO3SXAtcZ6aIdPrakAE6DCccZ9kDlPKc,14276
23
+ mineru/cli/fast_api.py,sha256=t5bda769VbM5iokAboiJfPIOnm-r5GTFReE-KQy8L3g,10941
24
+ mineru/cli/gradio_app.py,sha256=8rMdW7grwBUn0MdXyG4eOTQUzKWq6nErtMWl-vGdWbU,14525
25
+ mineru/cli/models_download.py,sha256=7KA-Boe-eIt3WW6eyaxM1HfubTXLsQ8sMmT1H1X7vAc,4815
26
+ mineru/cli/vlm_vllm_server.py,sha256=fQJyD-gIPQ41hR_6aIaDJczl66N310t0CiZEBAfX5mc,90
27
+ mineru/data/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
28
+ mineru/data/data_reader_writer/__init__.py,sha256=9qnGNrsuGBMwwfsQy6oChdkz--a_LPdYWE0VZZr0yr4,490
29
+ mineru/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
30
+ mineru/data/data_reader_writer/dummy.py,sha256=MSxQaZOK8i-llkPXDn08kvvuIte5oJB_4lRjr1mnXtA,315
31
+ mineru/data/data_reader_writer/filebase.py,sha256=glXVSJJ-uA__qD0J_rdhHU2VYEOGaiC2gk9SJwynPj4,2113
32
+ mineru/data/data_reader_writer/multi_bucket_s3.py,sha256=bwVIimVVaj0MNeVJpNAArW41dGyqqvCzq9JZ6Ohdspg,5828
33
+ mineru/data/data_reader_writer/s3.py,sha256=Nwf8icHVQqm8RI2n4AzzVuwK75d1q5JZVuDpe74ChHg,2361
34
+ mineru/data/io/__init__.py,sha256=mjQ_LASaX-4_pg-1uzaMJysIElglUs3o-akqrAP8MCQ,201
35
+ mineru/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
36
+ mineru/data/io/http.py,sha256=d2-CZBGjMPOvdAkkC9zxUEKYYiZPgL76ZqcG4hIDeT0,941
37
+ mineru/data/io/s3.py,sha256=3fDitN7rEGn1DKDkjKtf2yC68mDrJ-tVyyi8VYkNYeA,3593
38
+ mineru/data/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
39
+ mineru/data/utils/exceptions.py,sha256=oxFpUQVum8LRFAgg1cZvMoN4xgSUe95rgNDU2mzTlwc,834
40
+ mineru/data/utils/path_utils.py,sha256=ykeo-WW163I2GKAWo0vIpP1MrtwI99PPqtCC05uhvVM,1093
41
+ mineru/data/utils/schemas.py,sha256=MK_pnWkK69MRnVaykni2tCRy6sx7cdCePry_W7UUghc,714
42
+ mineru/model/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
43
+ mineru/model/layout/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
44
+ mineru/model/layout/doclayoutyolo.py,sha256=DttINdulzTiYcVDl_70oDtUdfVmGc9qkKWmbPOGAeV0,3867
45
+ mineru/model/mfd/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
46
+ mineru/model/mfd/yolo_v8.py,sha256=3zrxPQWgrSdq13CqcL9dNtZ8oJPLjQzH10hptNA1iLA,3492
47
+ mineru/model/mfr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
48
+ mineru/model/mfr/utils.py,sha256=pAi1HnkTuO0R6251Hdl-o50m0wH0Ce89PAf74WCsXPU,11499
49
+ mineru/model/mfr/pp_formulanet_plus_m/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
+ mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py,sha256=alGX_sPJxZh_7v1sOK3DJ8akfkWO-2c5I_JR7aXMTLU,5588
51
+ mineru/model/mfr/pp_formulanet_plus_m/processors.py,sha256=MSKyanxiDDjgDQHBov-GjKtPnMx9tSmxBC9GIkM3ft8,23832
52
+ mineru/model/mfr/unimernet/Unimernet.py,sha256=MrW6F084EHBmD-IbbtKbllrZ6MSH65otMJfrhBuRweg,5589
53
+ mineru/model/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
+ mineru/model/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
55
+ mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=_lN3zDKxeqsW-h9tXx79DYiT5uT4P9ixG49WrSYKFxE,7551
56
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
57
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
58
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=XknL6UD2shfcErAD8kLk51Ty3Ltbv7uDi_Y3kxG1je8,114098
59
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py,sha256=8_1DKwDCDUBkeHYiJJ6MZnodZBsatHbqhygh11s9eEA,267
61
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py,sha256=OX3eRUKBnKCXtxJOG3sdNoB1IV-Z7efgWU-gaclYOGA,5780
62
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py,sha256=a9kCvwzJJSRrKQNtW2oOpTwrapzep8BjGFWLhLF1T0k,6036
63
+ mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py,sha256=Q_fdmFHUBtEoAfWp9aowdwTCE2MIFMOPbYjoSyXK2iU,48929
64
+ mineru/model/ocr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
65
+ mineru/model/ocr/paddleocr2pytorch/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
66
+ mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=wZOw82q1NARNHBW2Lk5zumjdAqzPZqnhV6rvMULvLs8,9207
67
+ mineru/model/ori_cls/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
68
+ mineru/model/ori_cls/paddle_ori_cls.py,sha256=VIS22IerHST7g60AC9r2PEQIG6NQWeQaH1OrXIxNTsg,11943
69
+ mineru/model/reading_order/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
70
+ mineru/model/reading_order/layout_reader.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
71
+ mineru/model/reading_order/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
72
+ mineru/model/table/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
73
+ mineru/model/table/cls/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
74
+ mineru/model/table/cls/paddle_table_cls.py,sha256=5PtieKQnAzgMNRTZFgnqQsGWKTEQ3yyFWQnBRIjfQ4A,5781
75
+ mineru/model/table/rec/RapidTable.py,sha256=FxO3dLNKfQrgcQU7gRI0kLAxllnoHWZptCtyyHNuMpM,5973
76
+ mineru/model/table/rec/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
77
+ mineru/model/table/rec/slanet_plus/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
+ mineru/model/table/rec/slanet_plus/main.py,sha256=vfrcvQ9JBf32YZU9eNoetoqdpcrFNsA1WNqQBsG8i2o,7646
79
+ mineru/model/table/rec/slanet_plus/matcher.py,sha256=uwF-wCLaYlaQ3JQ_-YywGVl1XQYnx7G_RTuWLW8JlBk,7321
80
+ mineru/model/table/rec/slanet_plus/matcher_utils.py,sha256=9wt_ydeeViLd57bU6g3lnXXni49qLSra2C6wSFQZkiw,9597
81
+ mineru/model/table/rec/slanet_plus/table_structure.py,sha256=Ve9eUdA0ivHf5bf9gwvHHfb7-E7drJLP3S3MPlh3uZ0,3844
82
+ mineru/model/table/rec/slanet_plus/table_structure_utils.py,sha256=YYSkwN2WdLx7qkWMSGkPY7yXOH5ENVhg5CsRGhtZ5Wk,19281
83
+ mineru/model/table/rec/unet_table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
+ mineru/model/table/rec/unet_table/main.py,sha256=J13Q7_6stYyedmVedf9CZD7R0tuguGfTg3Z3ob4GDuM,15565
85
+ mineru/model/table/rec/unet_table/table_recover.py,sha256=rSyeWyuP10M8dLKA5e0n4P2DXMYbVbmgLxEcdZA8_0E,9059
86
+ mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=beBMmBHAOR2lAuf2rcOKRSbFaJqwuIgMJWxWQsFmIRI,7908
87
+ mineru/model/table/rec/unet_table/utils.py,sha256=CYAqJW0wePJk4NAemb8W203N7E32v0ujiWbxanDhd8I,16083
88
+ mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=zrCdPwI4M8nu0FEfd7lRJAe0z8kYq3KFbzwElM82USE,11174
89
+ mineru/model/table/rec/unet_table/utils_table_recover.py,sha256=XksJsY82ZS0kqUnNT-jvaYzxJ3V3svMSzj0puwIau1k,10651
90
+ mineru/model/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
91
+ mineru/model/utils/pytorchocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
+ mineru/model/utils/pytorchocr/base_ocr_v20.py,sha256=5bI7MAu65r-vn28krwdJ6pjZMkEvWjspE7EQaTsRERw,1319
93
+ mineru/model/utils/pytorchocr/data/__init__.py,sha256=YYu3c-W4fgEErxxDM98uQ3oWwPEh-6w75LY4zcj4VtM,199
94
+ mineru/model/utils/pytorchocr/data/imaug/__init__.py,sha256=c4H0gXPRweQ0wMFnkrCLTR6MrtG-e4kUinxwq2G1V9U,1480
95
+ mineru/model/utils/pytorchocr/data/imaug/operators.py,sha256=edBaDeezmRAkGkduPF6IWcUpE2WXRh7mARqSnwyynEA,14146
96
+ mineru/model/utils/pytorchocr/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
+ mineru/model/utils/pytorchocr/modeling/common.py,sha256=3r2jTvPYQS4IgTvIqR4l6bBVwR7jn-87rSmpv3tlqxI,2294
98
+ mineru/model/utils/pytorchocr/modeling/architectures/__init__.py,sha256=DCA9FS4mE5oCHDlBhUrkYLdxFeQIbhPj4P8oJ_gRZD8,832
99
+ mineru/model/utils/pytorchocr/modeling/architectures/base_model.py,sha256=RhV2Dm-os08kCFylT57zRu72Hq_RJdFy3xQe1MPaCuU,3588
100
+ mineru/model/utils/pytorchocr/modeling/backbones/__init__.py,sha256=dOmqhvLrBM-2imcwt73kS6APmMbhoVYTozlmqOkRfqA,2168
101
+ mineru/model/utils/pytorchocr/modeling/backbones/det_mobilenet_v3.py,sha256=r0gWnA1Xmt0Zw4FQLx7kf-WWwZd_26PfNzhM05drcuE,8334
102
+ mineru/model/utils/pytorchocr/modeling/backbones/rec_donut_swin.py,sha256=lTCje7mPuE-fHe05ATJe5C77wxk0CRTolAtD_gfQTmg,46257
103
+ mineru/model/utils/pytorchocr/modeling/backbones/rec_hgnet.py,sha256=UsIbzqN_koyGoSh1TA9r27SggpHbeKS3HmmS-A2Aw04,8341
104
+ mineru/model/utils/pytorchocr/modeling/backbones/rec_lcnetv3.py,sha256=-VRzTPtr7LTmwJ4dCtsI--gD81YAdh3llVau9K4Vnc8,16032
105
+ mineru/model/utils/pytorchocr/modeling/backbones/rec_mobilenet_v3.py,sha256=mJmE6xGpjHZH2Vaw16LlIlqRFFm9R9yRsSJEa3Yn3nw,4822
106
+ mineru/model/utils/pytorchocr/modeling/backbones/rec_mv1_enhance.py,sha256=K4p9KFYNmltV3y3QsxHIASNxoqlGtxgAoCxeFofyCmw,6726
107
+ mineru/model/utils/pytorchocr/modeling/backbones/rec_pphgnetv2.py,sha256=Yfp1xR5Shs5b1uxGjhFSrYgPq9Bl3NogYSo-KOJu08k,57119
108
+ mineru/model/utils/pytorchocr/modeling/backbones/rec_svtrnet.py,sha256=AIaUZ3IWBkRz2pWmanBjS0QdJcYnimMSV4MWofNpQcg,20222
109
+ mineru/model/utils/pytorchocr/modeling/heads/__init__.py,sha256=dlDWAICD_3PrYihipCHDP5GCJVH_-fwSj7WfojfICMo,1368
110
+ mineru/model/utils/pytorchocr/modeling/heads/cls_head.py,sha256=puIy5GlUtAKer6eS4HWKu07PzRd-HlDAqIz5WqjBHaA,596
111
+ mineru/model/utils/pytorchocr/modeling/heads/det_db_head.py,sha256=-k8bpuGQw_xIVDsumrfimOxg0O-oP2MOAyDJTjU70Ro,3633
112
+ mineru/model/utils/pytorchocr/modeling/heads/rec_ctc_head.py,sha256=ywyk5RJgUITdXvrUZk2yBSWKsaZIqnTofdFbuQUtwjU,1311
113
+ mineru/model/utils/pytorchocr/modeling/heads/rec_multi_head.py,sha256=K40SMA8tAVWu-3fwgfh3jGWeVFAdVnMyHjeZeI9OO7Q,2016
114
+ mineru/model/utils/pytorchocr/modeling/heads/rec_ppformulanet_head.py,sha256=5cLJUasDKrYCC47zTx2D5Osl6CnPh6JAmdwb6saeDWg,53991
115
+ mineru/model/utils/pytorchocr/modeling/heads/rec_unimernet_head.py,sha256=wsHxZEX2VO6kNELR43eUMleWJXrDUgcP5nLWfNIrM-E,95763
116
+ mineru/model/utils/pytorchocr/modeling/necks/__init__.py,sha256=634L1y-QWv5P8opNiSmKvQEx3Uskc20RG8DYiCdbl8U,1030
117
+ mineru/model/utils/pytorchocr/modeling/necks/db_fpn.py,sha256=TLF2pSyvRC0oPzL0eVyNlg3W6Zvfr4J8fD1nziVB7uI,14146
118
+ mineru/model/utils/pytorchocr/modeling/necks/intracl.py,sha256=w2QdwdI9BpiW92VS4mqL31sVERIbY53TfbD5Q6okiaY,3410
119
+ mineru/model/utils/pytorchocr/modeling/necks/rnn.py,sha256=TAUq4me4g_yXxat5wFOgGTqnqC2UUK1FZ2Le-2EWKqA,7503
120
+ mineru/model/utils/pytorchocr/postprocess/__init__.py,sha256=iC1Ol6CTxRWZBUyQ_5IVMR6kIurv9WJPOWWo7NAuZBA,1183
121
+ mineru/model/utils/pytorchocr/postprocess/cls_postprocess.py,sha256=1VVWXT_b1vhGb7PGvqyfUQ3Ip7LupH62vPva98GtjTA,685
122
+ mineru/model/utils/pytorchocr/postprocess/db_postprocess.py,sha256=AdZPF7frhQ27VVdp0GFmMcXtivwDZZfXYhzJOlP4zUs,6483
123
+ mineru/model/utils/pytorchocr/postprocess/rec_postprocess.py,sha256=qGB3onFEFhHjqksIR1IKOx2EY98ewfsmjADjrRXg30Y,30552
124
+ mineru/model/utils/pytorchocr/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
125
+ mineru/model/utils/pytorchocr/utils/resources/arch_config.yaml,sha256=yl4qTf-q0Du0MEOuYDffOt776_6qXBU5b2K3N-IOjd8,14964
126
+ mineru/model/utils/pytorchocr/utils/resources/models_config.yml,sha256=70B392J5XloC7mnK1eVi8GsWKSu7UE7qGffkEmBI9Us,2278
127
+ mineru/model/utils/pytorchocr/utils/resources/pp_formulanet_arch_config.yaml,sha256=a7yueOTUrfpZo8CsK6vQokbLNB2J-P77ihaCh_LozvQ,507
128
+ mineru/model/utils/pytorchocr/utils/resources/dict/arabic_dict.txt,sha256=xbaXD14RWk0Vpc7fAHpephuszp1j-Qi3IWC4VrFKu70,407
129
+ mineru/model/utils/pytorchocr/utils/resources/dict/chinese_cht_dict.txt,sha256=gyVR_uHy-8l1CHctgevcjboSwA3pejXHHJ3fQ92sGoM,33443
130
+ mineru/model/utils/pytorchocr/utils/resources/dict/cyrillic_dict.txt,sha256=NpqCxsjEeXhKXXJkSLg7Hq-1_vCkEppeqjkpYl3c0TI,410
131
+ mineru/model/utils/pytorchocr/utils/resources/dict/devanagari_dict.txt,sha256=tfG-bYu_8aGfuWxdTKlqQjOAI0u30s4OB7WDittNGOo,508
132
+ mineru/model/utils/pytorchocr/utils/resources/dict/en_dict.txt,sha256=VmLfnS0D8OjKDTsGSdasurkEtqFLPTUhRjxxw3xmjOM,190
133
+ mineru/model/utils/pytorchocr/utils/resources/dict/japan_dict.txt,sha256=Hc_LQe7JBXapRbMITyKt4RztUG4k8Uh5JFsHFpjzCOg,17332
134
+ mineru/model/utils/pytorchocr/utils/resources/dict/ka_dict.txt,sha256=-tP3ZZQyde7CE0pvvJtSeFQmZBEE1OfbOhWdxz80Hd4,452
135
+ mineru/model/utils/pytorchocr/utils/resources/dict/korean_dict.txt,sha256=qh_ciuj3zUCg7E7bRy6wQh4RQn5sz-6ZFUQHQsGLCiA,14480
136
+ mineru/model/utils/pytorchocr/utils/resources/dict/latin_dict.txt,sha256=jm1ONil4jDXDH35TAofWFHtUm7eiZb1nCLsoETRCniw,468
137
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt,sha256=KLI2KtSrLcOHaapy_rU146nds_0qdYWgWSDmOTsdx_c,26249
138
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt,sha256=pbw4h8Q8kB5aP5exP_rfHFdU7efMjJ9aviLodafEg3I,62346
139
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_arabic_dict.txt,sha256=f5L327m3WkeHqDv7T20UqKtRVSUTDJ1AqQNvYc9pmek,2369
140
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_cyrillic_dict.txt,sha256=20CqUs6xEgVb6AxpSv32VdXSxPeHNwRSTMFqRHypE7o,2781
141
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_devanagari_dict.txt,sha256=CcdEC_xUd-XEEFIwS2sYWv-MSl6LK0wjwccG9v4e6fw,1943
142
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_dict.txt,sha256=0Zeen3lMRkwNLgtwp_4U3ZeOncZEwOcfFBWM34NCrxs,74012
143
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_el_dict.txt,sha256=Md78YsDDrTZ0qC2mGSImorqY70_wFKcEXLiNWfnD3jE,1103
144
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_en_dict.txt,sha256=4CWmbTHzJ7oMIy4D9AeujRBeHnCefMs_QIqneMJOcNY,1416
145
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt,sha256=PpXxWBVXFihwys26WvkaTGviiQcQ05Www8dXjn7l5us,1663
146
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt,sha256=qIBxxowBcHSJuqeevgQFt761zKIp9PyUzD75kjKIAtc,47451
147
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt,sha256=PAqKebYSZTwl92UnFxT3EoHk6VWWLBU-Jyt7jB0rE_8,1634
148
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_ta_dict.txt,sha256=hbVBNSrhjca6bUcVLYv4rf9rAmbmBdLu8pkMG_RmEXs,1723
149
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_te_dict.txt,sha256=Qvg_XT_bUHeOT6W2bFjZmlmrd5IVHF5080uP_XthydY,1831
150
+ mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_th_dict.txt,sha256=V_VAb5S7Zoj7cHf3vmXwi71xzs9IwB6ibFIstcSDa3o,1767
151
+ mineru/model/utils/pytorchocr/utils/resources/dict/ta_dict.txt,sha256=6T5pSBSv2f8ekYtvS7Qmf7TGWpNE7l10ZPkTW5DAonA,352
152
+ mineru/model/utils/pytorchocr/utils/resources/dict/te_dict.txt,sha256=7plGpg13AZd0dOiYg2lKTKIOqjhoojM0v3lA3NAI8Pk,429
153
+ mineru/model/utils/tools/__init__.py,sha256=xEqR65Z8YOzOLorLjK0LCHos2zX-tCuxSrxndjU00hE,49
154
+ mineru/model/utils/tools/infer/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
155
+ mineru/model/utils/tools/infer/predict_cls.py,sha256=8RmKl1vejnZl65caHZNV2ta6hMsg5B_LE-FuqCO8T8A,4225
156
+ mineru/model/utils/tools/infer/predict_det.py,sha256=vYQREn7vELXxBsr72CCCVvm1gwV82ONaCwGfxUIjne8,13621
157
+ mineru/model/utils/tools/infer/predict_rec.py,sha256=-BH93JDisu0kT6CyHA4plUOKcb2L-UvDk7Tein5uwt4,19209
158
+ mineru/model/utils/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-fkTS8swRYSbZeoqmSI8,3822
159
+ mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
160
+ mineru/model/vlm_vllm_model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
+ mineru/model/vlm_vllm_model/server.py,sha256=nv51j9yAa-u4iFGy4Idh4-viM4sqLHvzs3Lk5w-Cfxg,2105
162
+ mineru/resources/header.html,sha256=PUselBXLBn8gfeP3zwEtj6zIxfhcCN4vN_B796nQFNQ,4410
163
+ mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
164
+ mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
165
+ mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
166
+ mineru/utils/block_sort.py,sha256=mViceDw3O2ksBDFxt-wmX67bCZOwKyp68yZnEjS3Ijc,12934
167
+ mineru/utils/boxbase.py,sha256=moP660AmZq_udHEsfvFkTQdJ4gjrrBwN7t0Enx7CIL8,6903
168
+ mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
169
+ mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
170
+ mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
171
+ mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
172
+ mineru/utils/enum_class.py,sha256=-_Ey03vGNEQHkl6x7pZ43GgrakwhSCOa1RXdr1m-I3A,2503
173
+ mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,10233
174
+ mineru/utils/guess_suffix_or_lang.py,sha256=nznyQpUn1BSA8JNw9HuG3pVV-xtVAtrtcGuHZ-VXt9M,856
175
+ mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
176
+ mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
177
+ mineru/utils/llm_aided.py,sha256=eBGKCD7cJBjkyn38yqCdh0S-fgRG9fLuQCByLDQuyWs,4983
178
+ mineru/utils/magic_model_utils.py,sha256=2xOvi4oqg3MSw1FUrJTnYDtWeFrrm6qbmlEorLZSaYs,5650
179
+ mineru/utils/model_utils.py,sha256=6OsgFLsABX5JuShSzCMSNHWV-yi-1cjwHweafyxIgRo,18448
180
+ mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
181
+ mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
182
+ mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
183
+ mineru/utils/pdf_image_tools.py,sha256=mioLEHOdDtM1YbspNaa0wWhnLw_4-H7rdHlIM40vrT4,4077
184
+ mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
185
+ mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
186
+ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
187
+ mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
188
+ mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
189
+ mineru/utils/table_merge.py,sha256=d98zNbM1ZQ8V1kUt6RugParNUNPv7DGL-XKIzR3iJVQ,15360
190
+ mineru-2.6.0.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
191
+ mineru-2.6.0.dist-info/METADATA,sha256=dbt-b5mAS6fgkv06-dMemfgqARV02Ji_eCDqZ6SlRD4,68358
192
+ mineru-2.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
193
+ mineru-2.6.0.dist-info/entry_points.txt,sha256=luXmbhPiZK_tKlRgWuYOaW_V6EFpG-yJcAevVv9MEqE,252
194
+ mineru-2.6.0.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
195
+ mineru-2.6.0.dist-info/RECORD,,