mineru 2.2.1__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. mineru/backend/pipeline/batch_analyze.py +1 -1
  2. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
  3. mineru/backend/vlm/model_output_to_middle_json.py +123 -0
  4. mineru/backend/vlm/vlm_analyze.py +97 -16
  5. mineru/backend/vlm/vlm_magic_model.py +201 -135
  6. mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
  7. mineru/cli/client.py +6 -5
  8. mineru/cli/common.py +17 -16
  9. mineru/cli/fast_api.py +9 -7
  10. mineru/cli/gradio_app.py +15 -16
  11. mineru/cli/vlm_vllm_server.py +4 -0
  12. mineru/model/table/rec/unet_table/main.py +10 -2
  13. mineru/model/vlm_vllm_model/__init__.py +0 -0
  14. mineru/model/vlm_vllm_model/server.py +51 -0
  15. mineru/resources/header.html +10 -2
  16. mineru/utils/draw_bbox.py +32 -10
  17. mineru/utils/enum_class.py +16 -2
  18. mineru/utils/guess_suffix_or_lang.py +20 -0
  19. mineru/utils/span_block_fix.py +4 -2
  20. mineru/version.py +1 -1
  21. {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/METADATA +71 -23
  22. {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/RECORD +26 -39
  23. {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/entry_points.txt +1 -1
  24. mineru/backend/vlm/base_predictor.py +0 -186
  25. mineru/backend/vlm/hf_predictor.py +0 -217
  26. mineru/backend/vlm/predictor.py +0 -111
  27. mineru/backend/vlm/sglang_client_predictor.py +0 -443
  28. mineru/backend/vlm/sglang_engine_predictor.py +0 -246
  29. mineru/backend/vlm/token_to_middle_json.py +0 -122
  30. mineru/backend/vlm/utils.py +0 -40
  31. mineru/cli/vlm_sglang_server.py +0 -4
  32. mineru/model/vlm_hf_model/__init__.py +0 -9
  33. mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
  34. mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
  35. mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
  36. mineru/model/vlm_sglang_model/__init__.py +0 -14
  37. mineru/model/vlm_sglang_model/engine.py +0 -264
  38. mineru/model/vlm_sglang_model/image_processor.py +0 -213
  39. mineru/model/vlm_sglang_model/logit_processor.py +0 -90
  40. mineru/model/vlm_sglang_model/model.py +0 -453
  41. mineru/model/vlm_sglang_model/server.py +0 -75
  42. {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/WHEEL +0 -0
  43. {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/licenses/LICENSE.md +0 -0
  44. {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/top_level.txt +0 -0
@@ -3,46 +3,37 @@ from typing import Literal
3
3
 
4
4
  from loguru import logger
5
5
 
6
- from mineru.utils.enum_class import ContentType, BlockType, SplitFlag
7
- from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
8
- from mineru.utils.format_utils import block_content_to_html
6
+ from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
7
+ from mineru.utils.enum_class import ContentType, BlockType
8
+ from mineru.utils.guess_suffix_or_lang import guess_language_by_text
9
9
  from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
10
10
 
11
11
 
12
12
  class MagicModel:
13
- def __init__(self, token: str, width, height):
14
- self.token = token
15
-
16
- # 使用正则表达式查找所有块
17
- pattern = (
18
- r"<\|box_start\|>(.*?)<\|box_end\|><\|ref_start\|>(.*?)<\|ref_end\|><\|md_start\|>(.*?)(?:<\|md_end\|>|<\|im_end\|>)"
19
- )
20
- block_infos = re.findall(pattern, token, re.DOTALL)
13
+ def __init__(self, page_blocks: list, width, height):
14
+ self.page_blocks = page_blocks
21
15
 
22
16
  blocks = []
23
17
  self.all_spans = []
24
18
  # 解析每个块
25
- for index, block_info in enumerate(block_infos):
26
- block_bbox = block_info[0].strip()
19
+ for index, block_info in enumerate(page_blocks):
20
+ block_bbox = block_info["bbox"]
27
21
  try:
28
- x1, y1, x2, y2 = map(int, block_bbox.split())
22
+ x1, y1, x2, y2 = block_bbox
29
23
  x_1, y_1, x_2, y_2 = (
30
- int(x1 * width / 1000),
31
- int(y1 * height / 1000),
32
- int(x2 * width / 1000),
33
- int(y2 * height / 1000),
24
+ int(x1 * width),
25
+ int(y1 * height),
26
+ int(x2 * width),
27
+ int(y2 * height),
34
28
  )
35
29
  if x_2 < x_1:
36
30
  x_1, x_2 = x_2, x_1
37
31
  if y_2 < y_1:
38
32
  y_1, y_2 = y_2, y_1
39
33
  block_bbox = (x_1, y_1, x_2, y_2)
40
- block_type = block_info[1].strip()
41
- block_content = block_info[2].strip()
42
-
43
- # 如果bbox是0,0,999,999,且type为text,按notes增加表格处理
44
- if x1 == 0 and y1 == 0 and x2 == 999 and y2 == 999 and block_type == "text":
45
- block_content = block_content_to_html(block_content)
34
+ block_type = block_info["type"]
35
+ block_content = block_info["content"]
36
+ block_angle = block_info["angle"]
46
37
 
47
38
  # print(f"坐标: {block_bbox}")
48
39
  # print(f"类型: {block_type}")
@@ -54,6 +45,9 @@ class MagicModel:
54
45
  continue
55
46
 
56
47
  span_type = "unknown"
48
+ line_type = None
49
+ guess_lang = None
50
+
57
51
  if block_type in [
58
52
  "text",
59
53
  "title",
@@ -61,8 +55,15 @@ class MagicModel:
61
55
  "image_footnote",
62
56
  "table_caption",
63
57
  "table_footnote",
64
- "list",
65
- "index",
58
+ "code_caption",
59
+ "ref_text",
60
+ "phonetic",
61
+ "header",
62
+ "footer",
63
+ "page_number",
64
+ "aside_text",
65
+ "page_footnote",
66
+ "list"
66
67
  ]:
67
68
  span_type = ContentType.TEXT
68
69
  elif block_type in ["image"]:
@@ -71,6 +72,12 @@ class MagicModel:
71
72
  elif block_type in ["table"]:
72
73
  block_type = BlockType.TABLE_BODY
73
74
  span_type = ContentType.TABLE
75
+ elif block_type in ["code", "algorithm"]:
76
+ block_content = code_content_clean(block_content)
77
+ line_type = block_type
78
+ block_type = BlockType.CODE_BODY
79
+ span_type = ContentType.TEXT
80
+ guess_lang = guess_language_by_text(block_content)
74
81
  elif block_type in ["equation"]:
75
82
  block_type = BlockType.INTERLINE_EQUATION
76
83
  span_type = ContentType.INTERLINE_EQUATION
@@ -81,7 +88,7 @@ class MagicModel:
81
88
  "type": span_type,
82
89
  }
83
90
  if span_type == ContentType.TABLE:
84
- span["html"] = block_content_to_html(block_content)
91
+ span["html"] = block_content
85
92
  elif span_type in [ContentType.INTERLINE_EQUATION]:
86
93
  span = {
87
94
  "bbox": block_bbox,
@@ -89,7 +96,12 @@ class MagicModel:
89
96
  "content": isolated_formula_clean(block_content),
90
97
  }
91
98
  else:
92
- if block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
99
+
100
+ if block_content:
101
+ block_content = clean_content(block_content)
102
+
103
+ if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
104
+
93
105
  # 生成包含文本和公式的span列表
94
106
  spans = []
95
107
  last_end = 0
@@ -136,25 +148,27 @@ class MagicModel:
136
148
  "content": block_content,
137
149
  }
138
150
 
151
+ # 处理span类型并添加到all_spans
139
152
  if isinstance(span, dict) and "bbox" in span:
140
153
  self.all_spans.append(span)
141
- line = {
142
- "bbox": block_bbox,
143
- "spans": [span],
144
- }
154
+ spans = [span]
145
155
  elif isinstance(span, list):
146
156
  self.all_spans.extend(span)
147
- line = {
148
- "bbox": block_bbox,
149
- "spans": span,
150
- }
157
+ spans = span
151
158
  else:
152
159
  raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
153
160
 
161
+ # 构造line对象
162
+ if block_type in [BlockType.CODE_BODY]:
163
+ line = {"bbox": block_bbox, "spans": spans, "extra": {"type": line_type, "guess_lang": guess_lang}}
164
+ else:
165
+ line = {"bbox": block_bbox, "spans": spans}
166
+
154
167
  blocks.append(
155
168
  {
156
169
  "bbox": block_bbox,
157
170
  "type": block_type,
171
+ "angle": block_angle,
158
172
  "lines": [line],
159
173
  "index": index,
160
174
  }
@@ -165,35 +179,87 @@ class MagicModel:
165
179
  self.interline_equation_blocks = []
166
180
  self.text_blocks = []
167
181
  self.title_blocks = []
182
+ self.code_blocks = []
183
+ self.discarded_blocks = []
184
+ self.ref_text_blocks = []
185
+ self.phonetic_blocks = []
186
+ self.list_blocks = []
168
187
  for block in blocks:
169
188
  if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
170
189
  self.image_blocks.append(block)
171
190
  elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
172
191
  self.table_blocks.append(block)
192
+ elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
193
+ self.code_blocks.append(block)
173
194
  elif block["type"] == BlockType.INTERLINE_EQUATION:
174
195
  self.interline_equation_blocks.append(block)
175
196
  elif block["type"] == BlockType.TEXT:
176
197
  self.text_blocks.append(block)
177
198
  elif block["type"] == BlockType.TITLE:
178
199
  self.title_blocks.append(block)
200
+ elif block["type"] in [BlockType.REF_TEXT]:
201
+ self.ref_text_blocks.append(block)
202
+ elif block["type"] in [BlockType.PHONETIC]:
203
+ self.phonetic_blocks.append(block)
204
+ elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
205
+ self.discarded_blocks.append(block)
206
+ elif block["type"] == BlockType.LIST:
207
+ self.list_blocks.append(block)
179
208
  else:
180
209
  continue
181
210
 
211
+ self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
212
+ self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
213
+ self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
214
+ self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
215
+ for code_block in self.code_blocks:
216
+ for block in code_block['blocks']:
217
+ if block['type'] == BlockType.CODE_BODY:
218
+ if len(block["lines"]) > 0:
219
+ line = block["lines"][0]
220
+ code_block["sub_type"] = line["extra"]["type"]
221
+ if code_block["sub_type"] in ["code"]:
222
+ code_block["guess_lang"] = line["extra"]["guess_lang"]
223
+ del line["extra"]
224
+ else:
225
+ code_block["sub_type"] = "code"
226
+ code_block["guess_lang"] = "txt"
227
+
228
+ for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
229
+ block["type"] = BlockType.TEXT
230
+ self.text_blocks.append(block)
231
+
232
+
233
+ def get_list_blocks(self):
234
+ return self.list_blocks
235
+
182
236
  def get_image_blocks(self):
183
- return fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
237
+ return self.image_blocks
184
238
 
185
239
  def get_table_blocks(self):
186
- return fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
240
+ return self.table_blocks
241
+
242
+ def get_code_blocks(self):
243
+ return self.code_blocks
244
+
245
+ def get_ref_text_blocks(self):
246
+ return self.ref_text_blocks
247
+
248
+ def get_phonetic_blocks(self):
249
+ return self.phonetic_blocks
187
250
 
188
251
  def get_title_blocks(self):
189
- return fix_title_blocks(self.title_blocks)
252
+ return self.title_blocks
190
253
 
191
254
  def get_text_blocks(self):
192
- return fix_text_blocks(self.text_blocks)
255
+ return self.text_blocks
193
256
 
194
257
  def get_interline_equation_blocks(self):
195
258
  return self.interline_equation_blocks
196
259
 
260
+ def get_discarded_blocks(self):
261
+ return self.discarded_blocks
262
+
197
263
  def get_all_spans(self):
198
264
  return self.all_spans
199
265
 
@@ -202,48 +268,46 @@ def isolated_formula_clean(txt):
202
268
  latex = txt[:]
203
269
  if latex.startswith("\\["): latex = latex[2:]
204
270
  if latex.endswith("\\]"): latex = latex[:-2]
205
- latex = latex_fix(latex.strip())
271
+ latex = latex.strip()
206
272
  return latex
207
273
 
208
274
 
209
- def latex_fix(latex):
210
- # valid pairs:
211
- # \left\{ ... \right\}
212
- # \left( ... \right)
213
- # \left| ... \right|
214
- # \left\| ... \right\|
215
- # \left[ ... \right]
216
-
217
- LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
218
- RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
219
- left_count = len(LEFT_COUNT_PATTERN.findall(latex)) # 不匹配\lefteqn等
220
- right_count = len(RIGHT_COUNT_PATTERN.findall(latex)) # 不匹配\rightarrow
221
-
222
- if left_count != right_count:
223
- for _ in range(2):
224
- # replace valid pairs
225
- latex = re.sub(r'\\left\\\{', "{", latex) # \left\{
226
- latex = re.sub(r"\\left\|", "|", latex) # \left|
227
- latex = re.sub(r"\\left\\\|", "|", latex) # \left\|
228
- latex = re.sub(r"\\left\(", "(", latex) # \left(
229
- latex = re.sub(r"\\left\[", "[", latex) # \left[
230
-
231
- latex = re.sub(r"\\right\\\}", "}", latex) # \right\}
232
- latex = re.sub(r"\\right\|", "|", latex) # \right|
233
- latex = re.sub(r"\\right\\\|", "|", latex) # \right\|
234
- latex = re.sub(r"\\right\)", ")", latex) # \right)
235
- latex = re.sub(r"\\right\]", "]", latex) # \right]
236
- latex = re.sub(r"\\right\.", "", latex) # \right.
237
-
238
- # replace invalid pairs first
239
- latex = re.sub(r'\\left\{', "{", latex)
240
- latex = re.sub(r'\\right\}', "}", latex) # \left{ ... \right}
241
- latex = re.sub(r'\\left\\\(', "(", latex)
242
- latex = re.sub(r'\\right\\\)', ")", latex) # \left\( ... \right\)
243
- latex = re.sub(r'\\left\\\[', "[", latex)
244
- latex = re.sub(r'\\right\\\]', "]", latex) # \left\[ ... \right\]
275
+ def code_content_clean(content):
276
+ """清理代码内容,移除Markdown代码块的开始和结束标记"""
277
+ if not content:
278
+ return ""
279
+
280
+ lines = content.splitlines()
281
+ start_idx = 0
282
+ end_idx = len(lines)
283
+
284
+ # 处理开头的三个反引号
285
+ if lines and lines[0].startswith("```"):
286
+ start_idx = 1
287
+
288
+ # 处理结尾的三个反引号
289
+ if lines and end_idx > start_idx and lines[end_idx - 1].strip() == "```":
290
+ end_idx -= 1
291
+
292
+ # 只有在有内容时才进行join操作
293
+ if start_idx < end_idx:
294
+ return "\n".join(lines[start_idx:end_idx]).strip()
295
+ return ""
245
296
 
246
- return latex
297
+
298
+ def clean_content(content):
299
+ if content and content.count("\\[") == content.count("\\]") and content.count("\\[") > 0:
300
+ # Function to handle each match
301
+ def replace_pattern(match):
302
+ # Extract content between \[ and \]
303
+ inner_content = match.group(1)
304
+ return f"[{inner_content}]"
305
+
306
+ # Find all patterns of \[x\] and apply replacement
307
+ pattern = r'\\\[(.*?)\\\]'
308
+ content = re.sub(pattern, replace_pattern, content)
309
+
310
+ return content
247
311
 
248
312
 
249
313
  def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_type):
@@ -252,7 +316,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
252
316
  return reduct_overlap(
253
317
  list(
254
318
  map(
255
- lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
319
+ lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
256
320
  filter(
257
321
  lambda x: x["type"] == subject_block_type,
258
322
  blocks,
@@ -265,7 +329,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
265
329
  return reduct_overlap(
266
330
  list(
267
331
  map(
268
- lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
332
+ lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
269
333
  filter(
270
334
  lambda x: x["type"] == object_block_type,
271
335
  blocks,
@@ -281,7 +345,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
281
345
  )
282
346
 
283
347
 
284
- def get_type_blocks(blocks, block_type: Literal["image", "table"]):
348
+ def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
285
349
  with_captions = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_caption")
286
350
  with_footnotes = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_footnote")
287
351
  ret = []
@@ -297,9 +361,13 @@ def get_type_blocks(blocks, block_type: Literal["image", "table"]):
297
361
  return ret
298
362
 
299
363
 
300
- def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
364
+ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
301
365
  need_fix_blocks = get_type_blocks(blocks, fix_type)
302
366
  fixed_blocks = []
367
+ not_include_blocks = []
368
+ processed_indices = set()
369
+
370
+ # 处理需要组织成two_layer结构的blocks
303
371
  for block in need_fix_blocks:
304
372
  body = block[f"{fix_type}_body"]
305
373
  caption_list = block[f"{fix_type}_caption_list"]
@@ -308,8 +376,12 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
308
376
  body["type"] = f"{fix_type}_body"
309
377
  for caption in caption_list:
310
378
  caption["type"] = f"{fix_type}_caption"
379
+ processed_indices.add(caption["index"])
311
380
  for footnote in footnote_list:
312
381
  footnote["type"] = f"{fix_type}_footnote"
382
+ processed_indices.add(footnote["index"])
383
+
384
+ processed_indices.add(body["index"])
313
385
 
314
386
  two_layer_block = {
315
387
  "type": fix_type,
@@ -323,58 +395,52 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
323
395
 
324
396
  fixed_blocks.append(two_layer_block)
325
397
 
326
- return fixed_blocks
327
-
328
-
329
- def fix_title_blocks(blocks):
398
+ # 添加未处理的blocks
330
399
  for block in blocks:
331
- if block["type"] == BlockType.TITLE:
332
- title_content = merge_para_with_text(block)
333
- title_level = count_leading_hashes(title_content)
334
- block['level'] = title_level
335
- for line in block['lines']:
336
- for span in line['spans']:
337
- span['content'] = strip_leading_hashes(span['content'])
338
- break
400
+ if block["index"] not in processed_indices:
401
+ # 直接添加未处理的block
402
+ not_include_blocks.append(block)
403
+
404
+ return fixed_blocks, not_include_blocks
405
+
406
+
407
+ def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
408
+ for list_block in list_blocks:
409
+ list_block["blocks"] = []
410
+ if "lines" in list_block:
411
+ del list_block["lines"]
412
+
413
+ temp_text_blocks = text_blocks + ref_text_blocks
414
+ need_remove_blocks = []
415
+ for block in temp_text_blocks:
416
+ for list_block in list_blocks:
417
+ if calculate_overlap_area_in_bbox1_area_ratio(block["bbox"], list_block["bbox"]) >= 0.8:
418
+ list_block["blocks"].append(block)
419
+ need_remove_blocks.append(block)
339
420
  break
340
- return blocks
341
-
342
-
343
- def count_leading_hashes(text):
344
- match = re.match(r'^(#+)', text)
345
- return len(match.group(1)) if match else 0
346
-
347
-
348
- def strip_leading_hashes(text):
349
- # 去除开头的#和紧随其后的空格
350
- return re.sub(r'^#+\s*', '', text)
351
-
352
-
353
- def fix_text_blocks(blocks):
354
- i = 0
355
- while i < len(blocks):
356
- block = blocks[i]
357
- last_line = block["lines"][-1]if block["lines"] else None
358
- if last_line:
359
- last_span = last_line["spans"][-1] if last_line["spans"] else None
360
- if last_span and last_span['content'].endswith('<|txt_contd|>'):
361
- last_span['content'] = last_span['content'][:-len('<|txt_contd|>')]
362
-
363
- # 查找下一个未被清空的块
364
- next_idx = i + 1
365
- while next_idx < len(blocks) and blocks[next_idx].get(SplitFlag.LINES_DELETED, False):
366
- next_idx += 1
367
-
368
- # 如果找到下一个有效块,则合并
369
- if next_idx < len(blocks):
370
- next_block = blocks[next_idx]
371
- # 将下一个块的lines扩展到当前块的lines中
372
- block["lines"].extend(next_block["lines"])
373
- # 清空下一个块的lines
374
- next_block["lines"] = []
375
- # 在下一个块中添加标志
376
- next_block[SplitFlag.LINES_DELETED] = True
377
- # 不增加i,继续检查当前块(现在已包含下一个块的内容)
378
- continue
379
- i += 1
380
- return blocks
421
+
422
+ for block in need_remove_blocks:
423
+ if block in text_blocks:
424
+ text_blocks.remove(block)
425
+ elif block in ref_text_blocks:
426
+ ref_text_blocks.remove(block)
427
+
428
+ # 移除blocks为空的list_block
429
+ list_blocks = [lb for lb in list_blocks if lb["blocks"]]
430
+
431
+ for list_block in list_blocks:
432
+ # 统计list_block["blocks"]中所有block的type,用众数作为list_block的sub_type
433
+ type_count = {}
434
+ line_content = []
435
+ for sub_block in list_block["blocks"]:
436
+ sub_block_type = sub_block["type"]
437
+ if sub_block_type not in type_count:
438
+ type_count[sub_block_type] = 0
439
+ type_count[sub_block_type] += 1
440
+
441
+ if type_count:
442
+ list_block["sub_type"] = max(type_count, key=type_count.get)
443
+ else:
444
+ list_block["sub_type"] = "unknown"
445
+
446
+ return list_blocks, text_blocks, ref_text_blocks
@@ -3,7 +3,6 @@ import os
3
3
  from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
4
4
  from mineru.utils.enum_class import MakeMode, BlockType, ContentType
5
5
 
6
-
7
6
  latex_delimiters_config = get_latex_delimiter_config()
8
7
 
9
8
  default_delimiters = {
@@ -50,8 +49,12 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
50
49
  for para_block in para_blocks:
51
50
  para_text = ''
52
51
  para_type = para_block['type']
53
- if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]:
52
+ if para_type in [BlockType.TEXT, BlockType.INTERLINE_EQUATION, BlockType.PHONETIC, BlockType.REF_TEXT]:
54
53
  para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
54
+ elif para_type == BlockType.LIST:
55
+ for block in para_block['blocks']:
56
+ item_text = merge_para_with_text(block, formula_enable=formula_enable, img_buket_path=img_buket_path)
57
+ para_text += f"{item_text}\n"
55
58
  elif para_type == BlockType.TITLE:
56
59
  title_level = get_title_level(para_block)
57
60
  para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
@@ -112,6 +115,18 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
112
115
  for block in para_block['blocks']: # 3rd.拼table_footnote
113
116
  if block['type'] == BlockType.TABLE_FOOTNOTE:
114
117
  para_text += '\n' + merge_para_with_text(block) + ' '
118
+ elif para_type == BlockType.CODE:
119
+ sub_type = para_block["sub_type"]
120
+ for block in para_block['blocks']: # 1st.拼code_caption
121
+ if block['type'] == BlockType.CODE_CAPTION:
122
+ para_text += merge_para_with_text(block) + ' \n'
123
+ for block in para_block['blocks']: # 2nd.拼code_body
124
+ if block['type'] == BlockType.CODE_BODY:
125
+ if sub_type == BlockType.CODE:
126
+ guess_lang = para_block["guess_lang"]
127
+ para_text += f"```{guess_lang}\n{merge_para_with_text(block)}\n```"
128
+ elif sub_type == BlockType.ALGORITHM:
129
+ para_text += merge_para_with_text(block)
115
130
 
116
131
  if para_text.strip() == '':
117
132
  continue
@@ -122,17 +137,33 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
122
137
  return page_markdown
123
138
 
124
139
 
125
-
126
-
127
-
128
140
  def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
129
141
  para_type = para_block['type']
130
142
  para_content = {}
131
- if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
143
+ if para_type in [
144
+ BlockType.TEXT,
145
+ BlockType.REF_TEXT,
146
+ BlockType.PHONETIC,
147
+ BlockType.HEADER,
148
+ BlockType.FOOTER,
149
+ BlockType.PAGE_NUMBER,
150
+ BlockType.ASIDE_TEXT,
151
+ BlockType.PAGE_FOOTNOTE,
152
+ ]:
132
153
  para_content = {
133
- 'type': ContentType.TEXT,
154
+ 'type': para_type,
134
155
  'text': merge_para_with_text(para_block),
135
156
  }
157
+ elif para_type == BlockType.LIST:
158
+ para_content = {
159
+ 'type': para_type,
160
+ 'sub_type': para_block.get('sub_type', ''),
161
+ 'list_items':[],
162
+ }
163
+ for block in para_block['blocks']:
164
+ item_text = merge_para_with_text(block)
165
+ if item_text.strip():
166
+ para_content['list_items'].append(item_text)
136
167
  elif para_type == BlockType.TITLE:
137
168
  title_level = get_title_level(para_block)
138
169
  para_content = {
@@ -178,15 +209,24 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
178
209
  para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
179
210
  if block['type'] == BlockType.TABLE_FOOTNOTE:
180
211
  para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
212
+ elif para_type == BlockType.CODE:
213
+ para_content = {'type': BlockType.CODE, 'sub_type': para_block["sub_type"], BlockType.CODE_CAPTION: []}
214
+ for block in para_block['blocks']:
215
+ if block['type'] == BlockType.CODE_BODY:
216
+ para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
217
+ if para_block["sub_type"] == BlockType.CODE:
218
+ para_content["guess_lang"] = para_block["guess_lang"]
219
+ if block['type'] == BlockType.CODE_CAPTION:
220
+ para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))
181
221
 
182
- page_weight, page_height = page_size
222
+ page_width, page_height = page_size
183
223
  para_bbox = para_block.get('bbox')
184
224
  if para_bbox:
185
225
  x0, y0, x1, y1 = para_bbox
186
226
  para_content['bbox'] = [
187
- int(x0 * 1000 / page_weight),
227
+ int(x0 * 1000 / page_width),
188
228
  int(y0 * 1000 / page_height),
189
- int(x1 * 1000 / page_weight),
229
+ int(x1 * 1000 / page_width),
190
230
  int(y1 * 1000 / page_height),
191
231
  ]
192
232
 
@@ -205,6 +245,7 @@ def union_make(pdf_info_dict: list,
205
245
  output_content = []
206
246
  for page_info in pdf_info_dict:
207
247
  paras_of_layout = page_info.get('para_blocks')
248
+ paras_of_discarded = page_info.get('discarded_blocks')
208
249
  page_idx = page_info.get('page_idx')
209
250
  page_size = page_info.get('page_size')
210
251
  if not paras_of_layout:
@@ -213,7 +254,7 @@ def union_make(pdf_info_dict: list,
213
254
  page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
214
255
  output_content.extend(page_markdown)
215
256
  elif make_mode == MakeMode.CONTENT_LIST:
216
- for para_block in paras_of_layout:
257
+ for para_block in paras_of_layout+paras_of_discarded:
217
258
  para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
218
259
  output_content.append(para_content)
219
260
 
mineru/cli/client.py CHANGED
@@ -6,6 +6,7 @@ from loguru import logger
6
6
 
7
7
  from mineru.utils.cli_parser import arg_parse
8
8
  from mineru.utils.config_reader import get_device
9
+ from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
9
10
  from mineru.utils.model_utils import get_vram
10
11
  from ..version import __version__
11
12
  from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
@@ -49,12 +50,12 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
49
50
  '-b',
50
51
  '--backend',
51
52
  'backend',
52
- type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client']),
53
+ type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']),
53
54
  help="""the backend for parsing pdf:
54
55
  pipeline: More general.
55
56
  vlm-transformers: More general.
56
- vlm-sglang-engine: Faster(engine).
57
- vlm-sglang-client: Faster(client).
57
+ vlm-vllm-engine: Faster(engine).
58
+ vlm-http-client: Faster(client).
58
59
  without method specified, pipeline will be used by default.""",
59
60
  default='pipeline',
60
61
  )
@@ -77,7 +78,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
77
78
  'server_url',
78
79
  type=str,
79
80
  help="""
80
- When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
81
+ When the backend is `vlm-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
81
82
  """,
82
83
  default=None,
83
84
  )
@@ -202,7 +203,7 @@ def main(
202
203
  if os.path.isdir(input_path):
203
204
  doc_path_list = []
204
205
  for doc_path in Path(input_path).glob('*'):
205
- if doc_path.suffix in pdf_suffixes + image_suffixes:
206
+ if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
206
207
  doc_path_list.append(doc_path)
207
208
  parse_doc(doc_path_list)
208
209
  else: