mineru 2.2.2__py3-none-any.whl → 2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
- mineru/backend/vlm/model_output_to_middle_json.py +123 -0
- mineru/backend/vlm/vlm_analyze.py +105 -16
- mineru/backend/vlm/vlm_magic_model.py +201 -135
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
- mineru/cli/client.py +6 -5
- mineru/cli/common.py +17 -16
- mineru/cli/fast_api.py +9 -7
- mineru/cli/gradio_app.py +15 -16
- mineru/cli/vlm_vllm_server.py +4 -0
- mineru/model/table/rec/unet_table/main.py +8 -0
- mineru/model/vlm_vllm_model/__init__.py +0 -0
- mineru/model/vlm_vllm_model/server.py +59 -0
- mineru/resources/header.html +10 -2
- mineru/utils/draw_bbox.py +32 -10
- mineru/utils/enum_class.py +16 -2
- mineru/utils/guess_suffix_or_lang.py +20 -0
- mineru/utils/span_block_fix.py +4 -2
- mineru/version.py +1 -1
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/METADATA +70 -25
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/RECORD +25 -38
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/entry_points.txt +1 -1
- mineru/backend/vlm/base_predictor.py +0 -186
- mineru/backend/vlm/hf_predictor.py +0 -217
- mineru/backend/vlm/predictor.py +0 -111
- mineru/backend/vlm/sglang_client_predictor.py +0 -443
- mineru/backend/vlm/sglang_engine_predictor.py +0 -246
- mineru/backend/vlm/token_to_middle_json.py +0 -122
- mineru/backend/vlm/utils.py +0 -40
- mineru/cli/vlm_sglang_server.py +0 -4
- mineru/model/vlm_hf_model/__init__.py +0 -9
- mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
- mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
- mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
- mineru/model/vlm_sglang_model/__init__.py +0 -14
- mineru/model/vlm_sglang_model/engine.py +0 -264
- mineru/model/vlm_sglang_model/image_processor.py +0 -213
- mineru/model/vlm_sglang_model/logit_processor.py +0 -90
- mineru/model/vlm_sglang_model/model.py +0 -453
- mineru/model/vlm_sglang_model/server.py +0 -75
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/WHEEL +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/top_level.txt +0 -0
|
@@ -3,46 +3,37 @@ from typing import Literal
|
|
|
3
3
|
|
|
4
4
|
from loguru import logger
|
|
5
5
|
|
|
6
|
-
from mineru.utils.
|
|
7
|
-
from mineru.
|
|
8
|
-
from mineru.utils.
|
|
6
|
+
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
|
7
|
+
from mineru.utils.enum_class import ContentType, BlockType
|
|
8
|
+
from mineru.utils.guess_suffix_or_lang import guess_language_by_text
|
|
9
9
|
from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class MagicModel:
|
|
13
|
-
def __init__(self,
|
|
14
|
-
self.
|
|
15
|
-
|
|
16
|
-
# 使用正则表达式查找所有块
|
|
17
|
-
pattern = (
|
|
18
|
-
r"<\|box_start\|>(.*?)<\|box_end\|><\|ref_start\|>(.*?)<\|ref_end\|><\|md_start\|>(.*?)(?:<\|md_end\|>|<\|im_end\|>)"
|
|
19
|
-
)
|
|
20
|
-
block_infos = re.findall(pattern, token, re.DOTALL)
|
|
13
|
+
def __init__(self, page_blocks: list, width, height):
|
|
14
|
+
self.page_blocks = page_blocks
|
|
21
15
|
|
|
22
16
|
blocks = []
|
|
23
17
|
self.all_spans = []
|
|
24
18
|
# 解析每个块
|
|
25
|
-
for index, block_info in enumerate(
|
|
26
|
-
block_bbox = block_info[
|
|
19
|
+
for index, block_info in enumerate(page_blocks):
|
|
20
|
+
block_bbox = block_info["bbox"]
|
|
27
21
|
try:
|
|
28
|
-
x1, y1, x2, y2 =
|
|
22
|
+
x1, y1, x2, y2 = block_bbox
|
|
29
23
|
x_1, y_1, x_2, y_2 = (
|
|
30
|
-
int(x1 * width
|
|
31
|
-
int(y1 * height
|
|
32
|
-
int(x2 * width
|
|
33
|
-
int(y2 * height
|
|
24
|
+
int(x1 * width),
|
|
25
|
+
int(y1 * height),
|
|
26
|
+
int(x2 * width),
|
|
27
|
+
int(y2 * height),
|
|
34
28
|
)
|
|
35
29
|
if x_2 < x_1:
|
|
36
30
|
x_1, x_2 = x_2, x_1
|
|
37
31
|
if y_2 < y_1:
|
|
38
32
|
y_1, y_2 = y_2, y_1
|
|
39
33
|
block_bbox = (x_1, y_1, x_2, y_2)
|
|
40
|
-
block_type = block_info[
|
|
41
|
-
block_content = block_info[
|
|
42
|
-
|
|
43
|
-
# 如果bbox是0,0,999,999,且type为text,按notes增加表格处理
|
|
44
|
-
if x1 == 0 and y1 == 0 and x2 == 999 and y2 == 999 and block_type == "text":
|
|
45
|
-
block_content = block_content_to_html(block_content)
|
|
34
|
+
block_type = block_info["type"]
|
|
35
|
+
block_content = block_info["content"]
|
|
36
|
+
block_angle = block_info["angle"]
|
|
46
37
|
|
|
47
38
|
# print(f"坐标: {block_bbox}")
|
|
48
39
|
# print(f"类型: {block_type}")
|
|
@@ -54,6 +45,9 @@ class MagicModel:
|
|
|
54
45
|
continue
|
|
55
46
|
|
|
56
47
|
span_type = "unknown"
|
|
48
|
+
line_type = None
|
|
49
|
+
guess_lang = None
|
|
50
|
+
|
|
57
51
|
if block_type in [
|
|
58
52
|
"text",
|
|
59
53
|
"title",
|
|
@@ -61,8 +55,15 @@ class MagicModel:
|
|
|
61
55
|
"image_footnote",
|
|
62
56
|
"table_caption",
|
|
63
57
|
"table_footnote",
|
|
64
|
-
"
|
|
65
|
-
"
|
|
58
|
+
"code_caption",
|
|
59
|
+
"ref_text",
|
|
60
|
+
"phonetic",
|
|
61
|
+
"header",
|
|
62
|
+
"footer",
|
|
63
|
+
"page_number",
|
|
64
|
+
"aside_text",
|
|
65
|
+
"page_footnote",
|
|
66
|
+
"list"
|
|
66
67
|
]:
|
|
67
68
|
span_type = ContentType.TEXT
|
|
68
69
|
elif block_type in ["image"]:
|
|
@@ -71,6 +72,12 @@ class MagicModel:
|
|
|
71
72
|
elif block_type in ["table"]:
|
|
72
73
|
block_type = BlockType.TABLE_BODY
|
|
73
74
|
span_type = ContentType.TABLE
|
|
75
|
+
elif block_type in ["code", "algorithm"]:
|
|
76
|
+
block_content = code_content_clean(block_content)
|
|
77
|
+
line_type = block_type
|
|
78
|
+
block_type = BlockType.CODE_BODY
|
|
79
|
+
span_type = ContentType.TEXT
|
|
80
|
+
guess_lang = guess_language_by_text(block_content)
|
|
74
81
|
elif block_type in ["equation"]:
|
|
75
82
|
block_type = BlockType.INTERLINE_EQUATION
|
|
76
83
|
span_type = ContentType.INTERLINE_EQUATION
|
|
@@ -81,7 +88,7 @@ class MagicModel:
|
|
|
81
88
|
"type": span_type,
|
|
82
89
|
}
|
|
83
90
|
if span_type == ContentType.TABLE:
|
|
84
|
-
span["html"] =
|
|
91
|
+
span["html"] = block_content
|
|
85
92
|
elif span_type in [ContentType.INTERLINE_EQUATION]:
|
|
86
93
|
span = {
|
|
87
94
|
"bbox": block_bbox,
|
|
@@ -89,7 +96,12 @@ class MagicModel:
|
|
|
89
96
|
"content": isolated_formula_clean(block_content),
|
|
90
97
|
}
|
|
91
98
|
else:
|
|
92
|
-
|
|
99
|
+
|
|
100
|
+
if block_content:
|
|
101
|
+
block_content = clean_content(block_content)
|
|
102
|
+
|
|
103
|
+
if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
|
|
104
|
+
|
|
93
105
|
# 生成包含文本和公式的span列表
|
|
94
106
|
spans = []
|
|
95
107
|
last_end = 0
|
|
@@ -136,25 +148,27 @@ class MagicModel:
|
|
|
136
148
|
"content": block_content,
|
|
137
149
|
}
|
|
138
150
|
|
|
151
|
+
# 处理span类型并添加到all_spans
|
|
139
152
|
if isinstance(span, dict) and "bbox" in span:
|
|
140
153
|
self.all_spans.append(span)
|
|
141
|
-
|
|
142
|
-
"bbox": block_bbox,
|
|
143
|
-
"spans": [span],
|
|
144
|
-
}
|
|
154
|
+
spans = [span]
|
|
145
155
|
elif isinstance(span, list):
|
|
146
156
|
self.all_spans.extend(span)
|
|
147
|
-
|
|
148
|
-
"bbox": block_bbox,
|
|
149
|
-
"spans": span,
|
|
150
|
-
}
|
|
157
|
+
spans = span
|
|
151
158
|
else:
|
|
152
159
|
raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
|
|
153
160
|
|
|
161
|
+
# 构造line对象
|
|
162
|
+
if block_type in [BlockType.CODE_BODY]:
|
|
163
|
+
line = {"bbox": block_bbox, "spans": spans, "extra": {"type": line_type, "guess_lang": guess_lang}}
|
|
164
|
+
else:
|
|
165
|
+
line = {"bbox": block_bbox, "spans": spans}
|
|
166
|
+
|
|
154
167
|
blocks.append(
|
|
155
168
|
{
|
|
156
169
|
"bbox": block_bbox,
|
|
157
170
|
"type": block_type,
|
|
171
|
+
"angle": block_angle,
|
|
158
172
|
"lines": [line],
|
|
159
173
|
"index": index,
|
|
160
174
|
}
|
|
@@ -165,35 +179,87 @@ class MagicModel:
|
|
|
165
179
|
self.interline_equation_blocks = []
|
|
166
180
|
self.text_blocks = []
|
|
167
181
|
self.title_blocks = []
|
|
182
|
+
self.code_blocks = []
|
|
183
|
+
self.discarded_blocks = []
|
|
184
|
+
self.ref_text_blocks = []
|
|
185
|
+
self.phonetic_blocks = []
|
|
186
|
+
self.list_blocks = []
|
|
168
187
|
for block in blocks:
|
|
169
188
|
if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
|
|
170
189
|
self.image_blocks.append(block)
|
|
171
190
|
elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
|
|
172
191
|
self.table_blocks.append(block)
|
|
192
|
+
elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
|
|
193
|
+
self.code_blocks.append(block)
|
|
173
194
|
elif block["type"] == BlockType.INTERLINE_EQUATION:
|
|
174
195
|
self.interline_equation_blocks.append(block)
|
|
175
196
|
elif block["type"] == BlockType.TEXT:
|
|
176
197
|
self.text_blocks.append(block)
|
|
177
198
|
elif block["type"] == BlockType.TITLE:
|
|
178
199
|
self.title_blocks.append(block)
|
|
200
|
+
elif block["type"] in [BlockType.REF_TEXT]:
|
|
201
|
+
self.ref_text_blocks.append(block)
|
|
202
|
+
elif block["type"] in [BlockType.PHONETIC]:
|
|
203
|
+
self.phonetic_blocks.append(block)
|
|
204
|
+
elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
|
|
205
|
+
self.discarded_blocks.append(block)
|
|
206
|
+
elif block["type"] == BlockType.LIST:
|
|
207
|
+
self.list_blocks.append(block)
|
|
179
208
|
else:
|
|
180
209
|
continue
|
|
181
210
|
|
|
211
|
+
self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
|
|
212
|
+
self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
|
|
213
|
+
self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
|
|
214
|
+
self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
|
|
215
|
+
for code_block in self.code_blocks:
|
|
216
|
+
for block in code_block['blocks']:
|
|
217
|
+
if block['type'] == BlockType.CODE_BODY:
|
|
218
|
+
if len(block["lines"]) > 0:
|
|
219
|
+
line = block["lines"][0]
|
|
220
|
+
code_block["sub_type"] = line["extra"]["type"]
|
|
221
|
+
if code_block["sub_type"] in ["code"]:
|
|
222
|
+
code_block["guess_lang"] = line["extra"]["guess_lang"]
|
|
223
|
+
del line["extra"]
|
|
224
|
+
else:
|
|
225
|
+
code_block["sub_type"] = "code"
|
|
226
|
+
code_block["guess_lang"] = "txt"
|
|
227
|
+
|
|
228
|
+
for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
|
|
229
|
+
block["type"] = BlockType.TEXT
|
|
230
|
+
self.text_blocks.append(block)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def get_list_blocks(self):
|
|
234
|
+
return self.list_blocks
|
|
235
|
+
|
|
182
236
|
def get_image_blocks(self):
|
|
183
|
-
return
|
|
237
|
+
return self.image_blocks
|
|
184
238
|
|
|
185
239
|
def get_table_blocks(self):
|
|
186
|
-
return
|
|
240
|
+
return self.table_blocks
|
|
241
|
+
|
|
242
|
+
def get_code_blocks(self):
|
|
243
|
+
return self.code_blocks
|
|
244
|
+
|
|
245
|
+
def get_ref_text_blocks(self):
|
|
246
|
+
return self.ref_text_blocks
|
|
247
|
+
|
|
248
|
+
def get_phonetic_blocks(self):
|
|
249
|
+
return self.phonetic_blocks
|
|
187
250
|
|
|
188
251
|
def get_title_blocks(self):
|
|
189
|
-
return
|
|
252
|
+
return self.title_blocks
|
|
190
253
|
|
|
191
254
|
def get_text_blocks(self):
|
|
192
|
-
return
|
|
255
|
+
return self.text_blocks
|
|
193
256
|
|
|
194
257
|
def get_interline_equation_blocks(self):
|
|
195
258
|
return self.interline_equation_blocks
|
|
196
259
|
|
|
260
|
+
def get_discarded_blocks(self):
|
|
261
|
+
return self.discarded_blocks
|
|
262
|
+
|
|
197
263
|
def get_all_spans(self):
|
|
198
264
|
return self.all_spans
|
|
199
265
|
|
|
@@ -202,48 +268,46 @@ def isolated_formula_clean(txt):
|
|
|
202
268
|
latex = txt[:]
|
|
203
269
|
if latex.startswith("\\["): latex = latex[2:]
|
|
204
270
|
if latex.endswith("\\]"): latex = latex[:-2]
|
|
205
|
-
latex =
|
|
271
|
+
latex = latex.strip()
|
|
206
272
|
return latex
|
|
207
273
|
|
|
208
274
|
|
|
209
|
-
def
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
latex = re.sub(r"\\right\\\}", "}", latex) # \right\}
|
|
232
|
-
latex = re.sub(r"\\right\|", "|", latex) # \right|
|
|
233
|
-
latex = re.sub(r"\\right\\\|", "|", latex) # \right\|
|
|
234
|
-
latex = re.sub(r"\\right\)", ")", latex) # \right)
|
|
235
|
-
latex = re.sub(r"\\right\]", "]", latex) # \right]
|
|
236
|
-
latex = re.sub(r"\\right\.", "", latex) # \right.
|
|
237
|
-
|
|
238
|
-
# replace invalid pairs first
|
|
239
|
-
latex = re.sub(r'\\left\{', "{", latex)
|
|
240
|
-
latex = re.sub(r'\\right\}', "}", latex) # \left{ ... \right}
|
|
241
|
-
latex = re.sub(r'\\left\\\(', "(", latex)
|
|
242
|
-
latex = re.sub(r'\\right\\\)', ")", latex) # \left\( ... \right\)
|
|
243
|
-
latex = re.sub(r'\\left\\\[', "[", latex)
|
|
244
|
-
latex = re.sub(r'\\right\\\]', "]", latex) # \left\[ ... \right\]
|
|
275
|
+
def code_content_clean(content):
|
|
276
|
+
"""清理代码内容,移除Markdown代码块的开始和结束标记"""
|
|
277
|
+
if not content:
|
|
278
|
+
return ""
|
|
279
|
+
|
|
280
|
+
lines = content.splitlines()
|
|
281
|
+
start_idx = 0
|
|
282
|
+
end_idx = len(lines)
|
|
283
|
+
|
|
284
|
+
# 处理开头的三个反引号
|
|
285
|
+
if lines and lines[0].startswith("```"):
|
|
286
|
+
start_idx = 1
|
|
287
|
+
|
|
288
|
+
# 处理结尾的三个反引号
|
|
289
|
+
if lines and end_idx > start_idx and lines[end_idx - 1].strip() == "```":
|
|
290
|
+
end_idx -= 1
|
|
291
|
+
|
|
292
|
+
# 只有在有内容时才进行join操作
|
|
293
|
+
if start_idx < end_idx:
|
|
294
|
+
return "\n".join(lines[start_idx:end_idx]).strip()
|
|
295
|
+
return ""
|
|
245
296
|
|
|
246
|
-
|
|
297
|
+
|
|
298
|
+
def clean_content(content):
|
|
299
|
+
if content and content.count("\\[") == content.count("\\]") and content.count("\\[") > 0:
|
|
300
|
+
# Function to handle each match
|
|
301
|
+
def replace_pattern(match):
|
|
302
|
+
# Extract content between \[ and \]
|
|
303
|
+
inner_content = match.group(1)
|
|
304
|
+
return f"[{inner_content}]"
|
|
305
|
+
|
|
306
|
+
# Find all patterns of \[x\] and apply replacement
|
|
307
|
+
pattern = r'\\\[(.*?)\\\]'
|
|
308
|
+
content = re.sub(pattern, replace_pattern, content)
|
|
309
|
+
|
|
310
|
+
return content
|
|
247
311
|
|
|
248
312
|
|
|
249
313
|
def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_type):
|
|
@@ -252,7 +316,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
|
|
|
252
316
|
return reduct_overlap(
|
|
253
317
|
list(
|
|
254
318
|
map(
|
|
255
|
-
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
|
|
319
|
+
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
|
|
256
320
|
filter(
|
|
257
321
|
lambda x: x["type"] == subject_block_type,
|
|
258
322
|
blocks,
|
|
@@ -265,7 +329,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
|
|
|
265
329
|
return reduct_overlap(
|
|
266
330
|
list(
|
|
267
331
|
map(
|
|
268
|
-
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
|
|
332
|
+
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
|
|
269
333
|
filter(
|
|
270
334
|
lambda x: x["type"] == object_block_type,
|
|
271
335
|
blocks,
|
|
@@ -281,7 +345,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
|
|
|
281
345
|
)
|
|
282
346
|
|
|
283
347
|
|
|
284
|
-
def get_type_blocks(blocks, block_type: Literal["image", "table"]):
|
|
348
|
+
def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
|
|
285
349
|
with_captions = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_caption")
|
|
286
350
|
with_footnotes = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_footnote")
|
|
287
351
|
ret = []
|
|
@@ -297,9 +361,13 @@ def get_type_blocks(blocks, block_type: Literal["image", "table"]):
|
|
|
297
361
|
return ret
|
|
298
362
|
|
|
299
363
|
|
|
300
|
-
def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
|
|
364
|
+
def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
|
|
301
365
|
need_fix_blocks = get_type_blocks(blocks, fix_type)
|
|
302
366
|
fixed_blocks = []
|
|
367
|
+
not_include_blocks = []
|
|
368
|
+
processed_indices = set()
|
|
369
|
+
|
|
370
|
+
# 处理需要组织成two_layer结构的blocks
|
|
303
371
|
for block in need_fix_blocks:
|
|
304
372
|
body = block[f"{fix_type}_body"]
|
|
305
373
|
caption_list = block[f"{fix_type}_caption_list"]
|
|
@@ -308,8 +376,12 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
|
|
|
308
376
|
body["type"] = f"{fix_type}_body"
|
|
309
377
|
for caption in caption_list:
|
|
310
378
|
caption["type"] = f"{fix_type}_caption"
|
|
379
|
+
processed_indices.add(caption["index"])
|
|
311
380
|
for footnote in footnote_list:
|
|
312
381
|
footnote["type"] = f"{fix_type}_footnote"
|
|
382
|
+
processed_indices.add(footnote["index"])
|
|
383
|
+
|
|
384
|
+
processed_indices.add(body["index"])
|
|
313
385
|
|
|
314
386
|
two_layer_block = {
|
|
315
387
|
"type": fix_type,
|
|
@@ -323,58 +395,52 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
|
|
|
323
395
|
|
|
324
396
|
fixed_blocks.append(two_layer_block)
|
|
325
397
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
def fix_title_blocks(blocks):
|
|
398
|
+
# 添加未处理的blocks
|
|
330
399
|
for block in blocks:
|
|
331
|
-
if block["
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
400
|
+
if block["index"] not in processed_indices:
|
|
401
|
+
# 直接添加未处理的block
|
|
402
|
+
not_include_blocks.append(block)
|
|
403
|
+
|
|
404
|
+
return fixed_blocks, not_include_blocks
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
|
|
408
|
+
for list_block in list_blocks:
|
|
409
|
+
list_block["blocks"] = []
|
|
410
|
+
if "lines" in list_block:
|
|
411
|
+
del list_block["lines"]
|
|
412
|
+
|
|
413
|
+
temp_text_blocks = text_blocks + ref_text_blocks
|
|
414
|
+
need_remove_blocks = []
|
|
415
|
+
for block in temp_text_blocks:
|
|
416
|
+
for list_block in list_blocks:
|
|
417
|
+
if calculate_overlap_area_in_bbox1_area_ratio(block["bbox"], list_block["bbox"]) >= 0.8:
|
|
418
|
+
list_block["blocks"].append(block)
|
|
419
|
+
need_remove_blocks.append(block)
|
|
339
420
|
break
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
next_idx += 1
|
|
367
|
-
|
|
368
|
-
# 如果找到下一个有效块,则合并
|
|
369
|
-
if next_idx < len(blocks):
|
|
370
|
-
next_block = blocks[next_idx]
|
|
371
|
-
# 将下一个块的lines扩展到当前块的lines中
|
|
372
|
-
block["lines"].extend(next_block["lines"])
|
|
373
|
-
# 清空下一个块的lines
|
|
374
|
-
next_block["lines"] = []
|
|
375
|
-
# 在下一个块中添加标志
|
|
376
|
-
next_block[SplitFlag.LINES_DELETED] = True
|
|
377
|
-
# 不增加i,继续检查当前块(现在已包含下一个块的内容)
|
|
378
|
-
continue
|
|
379
|
-
i += 1
|
|
380
|
-
return blocks
|
|
421
|
+
|
|
422
|
+
for block in need_remove_blocks:
|
|
423
|
+
if block in text_blocks:
|
|
424
|
+
text_blocks.remove(block)
|
|
425
|
+
elif block in ref_text_blocks:
|
|
426
|
+
ref_text_blocks.remove(block)
|
|
427
|
+
|
|
428
|
+
# 移除blocks为空的list_block
|
|
429
|
+
list_blocks = [lb for lb in list_blocks if lb["blocks"]]
|
|
430
|
+
|
|
431
|
+
for list_block in list_blocks:
|
|
432
|
+
# 统计list_block["blocks"]中所有block的type,用众数作为list_block的sub_type
|
|
433
|
+
type_count = {}
|
|
434
|
+
line_content = []
|
|
435
|
+
for sub_block in list_block["blocks"]:
|
|
436
|
+
sub_block_type = sub_block["type"]
|
|
437
|
+
if sub_block_type not in type_count:
|
|
438
|
+
type_count[sub_block_type] = 0
|
|
439
|
+
type_count[sub_block_type] += 1
|
|
440
|
+
|
|
441
|
+
if type_count:
|
|
442
|
+
list_block["sub_type"] = max(type_count, key=type_count.get)
|
|
443
|
+
else:
|
|
444
|
+
list_block["sub_type"] = "unknown"
|
|
445
|
+
|
|
446
|
+
return list_blocks, text_blocks, ref_text_blocks
|
|
@@ -3,7 +3,6 @@ import os
|
|
|
3
3
|
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
|
|
4
4
|
from mineru.utils.enum_class import MakeMode, BlockType, ContentType
|
|
5
5
|
|
|
6
|
-
|
|
7
6
|
latex_delimiters_config = get_latex_delimiter_config()
|
|
8
7
|
|
|
9
8
|
default_delimiters = {
|
|
@@ -50,8 +49,12 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
|
|
|
50
49
|
for para_block in para_blocks:
|
|
51
50
|
para_text = ''
|
|
52
51
|
para_type = para_block['type']
|
|
53
|
-
if para_type in [BlockType.TEXT, BlockType.
|
|
52
|
+
if para_type in [BlockType.TEXT, BlockType.INTERLINE_EQUATION, BlockType.PHONETIC, BlockType.REF_TEXT]:
|
|
54
53
|
para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
|
|
54
|
+
elif para_type == BlockType.LIST:
|
|
55
|
+
for block in para_block['blocks']:
|
|
56
|
+
item_text = merge_para_with_text(block, formula_enable=formula_enable, img_buket_path=img_buket_path)
|
|
57
|
+
para_text += f"{item_text}\n"
|
|
55
58
|
elif para_type == BlockType.TITLE:
|
|
56
59
|
title_level = get_title_level(para_block)
|
|
57
60
|
para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
|
|
@@ -112,6 +115,18 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
|
|
|
112
115
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
|
113
116
|
if block['type'] == BlockType.TABLE_FOOTNOTE:
|
|
114
117
|
para_text += '\n' + merge_para_with_text(block) + ' '
|
|
118
|
+
elif para_type == BlockType.CODE:
|
|
119
|
+
sub_type = para_block["sub_type"]
|
|
120
|
+
for block in para_block['blocks']: # 1st.拼code_caption
|
|
121
|
+
if block['type'] == BlockType.CODE_CAPTION:
|
|
122
|
+
para_text += merge_para_with_text(block) + ' \n'
|
|
123
|
+
for block in para_block['blocks']: # 2nd.拼code_body
|
|
124
|
+
if block['type'] == BlockType.CODE_BODY:
|
|
125
|
+
if sub_type == BlockType.CODE:
|
|
126
|
+
guess_lang = para_block["guess_lang"]
|
|
127
|
+
para_text += f"```{guess_lang}\n{merge_para_with_text(block)}\n```"
|
|
128
|
+
elif sub_type == BlockType.ALGORITHM:
|
|
129
|
+
para_text += merge_para_with_text(block)
|
|
115
130
|
|
|
116
131
|
if para_text.strip() == '':
|
|
117
132
|
continue
|
|
@@ -122,17 +137,33 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
|
|
|
122
137
|
return page_markdown
|
|
123
138
|
|
|
124
139
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
140
|
def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
|
|
129
141
|
para_type = para_block['type']
|
|
130
142
|
para_content = {}
|
|
131
|
-
if para_type in [
|
|
143
|
+
if para_type in [
|
|
144
|
+
BlockType.TEXT,
|
|
145
|
+
BlockType.REF_TEXT,
|
|
146
|
+
BlockType.PHONETIC,
|
|
147
|
+
BlockType.HEADER,
|
|
148
|
+
BlockType.FOOTER,
|
|
149
|
+
BlockType.PAGE_NUMBER,
|
|
150
|
+
BlockType.ASIDE_TEXT,
|
|
151
|
+
BlockType.PAGE_FOOTNOTE,
|
|
152
|
+
]:
|
|
132
153
|
para_content = {
|
|
133
|
-
'type':
|
|
154
|
+
'type': para_type,
|
|
134
155
|
'text': merge_para_with_text(para_block),
|
|
135
156
|
}
|
|
157
|
+
elif para_type == BlockType.LIST:
|
|
158
|
+
para_content = {
|
|
159
|
+
'type': para_type,
|
|
160
|
+
'sub_type': para_block.get('sub_type', ''),
|
|
161
|
+
'list_items':[],
|
|
162
|
+
}
|
|
163
|
+
for block in para_block['blocks']:
|
|
164
|
+
item_text = merge_para_with_text(block)
|
|
165
|
+
if item_text.strip():
|
|
166
|
+
para_content['list_items'].append(item_text)
|
|
136
167
|
elif para_type == BlockType.TITLE:
|
|
137
168
|
title_level = get_title_level(para_block)
|
|
138
169
|
para_content = {
|
|
@@ -178,15 +209,24 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
|
178
209
|
para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
|
|
179
210
|
if block['type'] == BlockType.TABLE_FOOTNOTE:
|
|
180
211
|
para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
|
|
212
|
+
elif para_type == BlockType.CODE:
|
|
213
|
+
para_content = {'type': BlockType.CODE, 'sub_type': para_block["sub_type"], BlockType.CODE_CAPTION: []}
|
|
214
|
+
for block in para_block['blocks']:
|
|
215
|
+
if block['type'] == BlockType.CODE_BODY:
|
|
216
|
+
para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
|
|
217
|
+
if para_block["sub_type"] == BlockType.CODE:
|
|
218
|
+
para_content["guess_lang"] = para_block["guess_lang"]
|
|
219
|
+
if block['type'] == BlockType.CODE_CAPTION:
|
|
220
|
+
para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))
|
|
181
221
|
|
|
182
|
-
|
|
222
|
+
page_width, page_height = page_size
|
|
183
223
|
para_bbox = para_block.get('bbox')
|
|
184
224
|
if para_bbox:
|
|
185
225
|
x0, y0, x1, y1 = para_bbox
|
|
186
226
|
para_content['bbox'] = [
|
|
187
|
-
int(x0 * 1000 /
|
|
227
|
+
int(x0 * 1000 / page_width),
|
|
188
228
|
int(y0 * 1000 / page_height),
|
|
189
|
-
int(x1 * 1000 /
|
|
229
|
+
int(x1 * 1000 / page_width),
|
|
190
230
|
int(y1 * 1000 / page_height),
|
|
191
231
|
]
|
|
192
232
|
|
|
@@ -205,6 +245,7 @@ def union_make(pdf_info_dict: list,
|
|
|
205
245
|
output_content = []
|
|
206
246
|
for page_info in pdf_info_dict:
|
|
207
247
|
paras_of_layout = page_info.get('para_blocks')
|
|
248
|
+
paras_of_discarded = page_info.get('discarded_blocks')
|
|
208
249
|
page_idx = page_info.get('page_idx')
|
|
209
250
|
page_size = page_info.get('page_size')
|
|
210
251
|
if not paras_of_layout:
|
|
@@ -213,7 +254,7 @@ def union_make(pdf_info_dict: list,
|
|
|
213
254
|
page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
|
|
214
255
|
output_content.extend(page_markdown)
|
|
215
256
|
elif make_mode == MakeMode.CONTENT_LIST:
|
|
216
|
-
for para_block in paras_of_layout:
|
|
257
|
+
for para_block in paras_of_layout+paras_of_discarded:
|
|
217
258
|
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
218
259
|
output_content.append(para_content)
|
|
219
260
|
|
mineru/cli/client.py
CHANGED
|
@@ -6,6 +6,7 @@ from loguru import logger
|
|
|
6
6
|
|
|
7
7
|
from mineru.utils.cli_parser import arg_parse
|
|
8
8
|
from mineru.utils.config_reader import get_device
|
|
9
|
+
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
|
9
10
|
from mineru.utils.model_utils import get_vram
|
|
10
11
|
from ..version import __version__
|
|
11
12
|
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
@@ -49,12 +50,12 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
|
49
50
|
'-b',
|
|
50
51
|
'--backend',
|
|
51
52
|
'backend',
|
|
52
|
-
type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-
|
|
53
|
+
type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']),
|
|
53
54
|
help="""the backend for parsing pdf:
|
|
54
55
|
pipeline: More general.
|
|
55
56
|
vlm-transformers: More general.
|
|
56
|
-
vlm-
|
|
57
|
-
vlm-
|
|
57
|
+
vlm-vllm-engine: Faster(engine).
|
|
58
|
+
vlm-http-client: Faster(client).
|
|
58
59
|
without method specified, pipeline will be used by default.""",
|
|
59
60
|
default='pipeline',
|
|
60
61
|
)
|
|
@@ -77,7 +78,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
|
77
78
|
'server_url',
|
|
78
79
|
type=str,
|
|
79
80
|
help="""
|
|
80
|
-
When the backend is `
|
|
81
|
+
When the backend is `vlm-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
|
|
81
82
|
""",
|
|
82
83
|
default=None,
|
|
83
84
|
)
|
|
@@ -202,7 +203,7 @@ def main(
|
|
|
202
203
|
if os.path.isdir(input_path):
|
|
203
204
|
doc_path_list = []
|
|
204
205
|
for doc_path in Path(input_path).glob('*'):
|
|
205
|
-
if doc_path
|
|
206
|
+
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
|
|
206
207
|
doc_path_list.append(doc_path)
|
|
207
208
|
parse_doc(doc_path_list)
|
|
208
209
|
else:
|