mineru 2.6.5__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/pipeline_analyze.py +15 -20
- mineru/backend/vlm/utils.py +8 -12
- mineru/backend/vlm/vlm_magic_model.py +10 -3
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +284 -2
- mineru/cli/client.py +4 -5
- mineru/cli/common.py +7 -0
- mineru/cli/fast_api.py +95 -27
- mineru/utils/enum_class.py +26 -0
- mineru/utils/guess_suffix_or_lang.py +10 -3
- mineru/utils/model_utils.py +22 -9
- mineru/utils/table_merge.py +10 -2
- mineru/version.py +1 -1
- {mineru-2.6.5.dist-info → mineru-2.6.7.dist-info}/METADATA +15 -3
- {mineru-2.6.5.dist-info → mineru-2.6.7.dist-info}/RECORD +18 -18
- {mineru-2.6.5.dist-info → mineru-2.6.7.dist-info}/WHEEL +0 -0
- {mineru-2.6.5.dist-info → mineru-2.6.7.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.5.dist-info → mineru-2.6.7.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.5.dist-info → mineru-2.6.7.dist-info}/top_level.txt +0 -0
|
@@ -159,7 +159,6 @@ def batch_image_analyze(
|
|
|
159
159
|
|
|
160
160
|
model_manager = ModelSingleton()
|
|
161
161
|
|
|
162
|
-
batch_ratio = 1
|
|
163
162
|
device = get_device()
|
|
164
163
|
|
|
165
164
|
if str(device).startswith('npu'):
|
|
@@ -173,25 +172,21 @@ def batch_image_analyze(
|
|
|
173
172
|
"Please ensure that the torch_npu package is installed correctly."
|
|
174
173
|
) from e
|
|
175
174
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
else:
|
|
192
|
-
# Default batch_ratio when VRAM can't be determined
|
|
193
|
-
batch_ratio = 1
|
|
194
|
-
logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}')
|
|
175
|
+
gpu_memory = get_vram(device)
|
|
176
|
+
if gpu_memory >= 16:
|
|
177
|
+
batch_ratio = 16
|
|
178
|
+
elif gpu_memory >= 12:
|
|
179
|
+
batch_ratio = 8
|
|
180
|
+
elif gpu_memory >= 8:
|
|
181
|
+
batch_ratio = 4
|
|
182
|
+
elif gpu_memory >= 6:
|
|
183
|
+
batch_ratio = 2
|
|
184
|
+
else:
|
|
185
|
+
batch_ratio = 1
|
|
186
|
+
logger.info(
|
|
187
|
+
f'GPU Memory: {gpu_memory} GB, Batch Ratio: {batch_ratio}. '
|
|
188
|
+
f'You can set MINERU_VIRTUAL_VRAM_SIZE environment variable to adjust GPU memory allocation.'
|
|
189
|
+
)
|
|
195
190
|
|
|
196
191
|
# 检测torch的版本号
|
|
197
192
|
import torch
|
mineru/backend/vlm/utils.py
CHANGED
|
@@ -81,20 +81,16 @@ def set_default_gpu_memory_utilization() -> float:
|
|
|
81
81
|
def set_default_batch_size() -> int:
|
|
82
82
|
try:
|
|
83
83
|
device = get_device()
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
batch_size = 4
|
|
91
|
-
else:
|
|
92
|
-
batch_size = 1
|
|
93
|
-
logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
|
|
84
|
+
gpu_memory = get_vram(device)
|
|
85
|
+
|
|
86
|
+
if gpu_memory >= 16:
|
|
87
|
+
batch_size = 8
|
|
88
|
+
elif gpu_memory >= 8:
|
|
89
|
+
batch_size = 4
|
|
94
90
|
else:
|
|
95
|
-
# Default batch_ratio when VRAM can't be determined
|
|
96
91
|
batch_size = 1
|
|
97
|
-
|
|
92
|
+
logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
|
|
93
|
+
|
|
98
94
|
except Exception as e:
|
|
99
95
|
logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
|
|
100
96
|
batch_size = 1
|
|
@@ -45,7 +45,7 @@ class MagicModel:
|
|
|
45
45
|
continue
|
|
46
46
|
|
|
47
47
|
span_type = "unknown"
|
|
48
|
-
|
|
48
|
+
code_block_sub_type = None
|
|
49
49
|
guess_lang = None
|
|
50
50
|
|
|
51
51
|
if block_type in [
|
|
@@ -74,7 +74,7 @@ class MagicModel:
|
|
|
74
74
|
span_type = ContentType.TABLE
|
|
75
75
|
elif block_type in ["code", "algorithm"]:
|
|
76
76
|
block_content = code_content_clean(block_content)
|
|
77
|
-
|
|
77
|
+
code_block_sub_type = block_type
|
|
78
78
|
block_type = BlockType.CODE_BODY
|
|
79
79
|
span_type = ContentType.TEXT
|
|
80
80
|
guess_lang = guess_language_by_text(block_content)
|
|
@@ -82,6 +82,9 @@ class MagicModel:
|
|
|
82
82
|
block_type = BlockType.INTERLINE_EQUATION
|
|
83
83
|
span_type = ContentType.INTERLINE_EQUATION
|
|
84
84
|
|
|
85
|
+
# code 和 algorithm 类型的块,如果内容中包含行内公式,则需要将块类型切换为algorithm
|
|
86
|
+
switch_code_to_algorithm = False
|
|
87
|
+
|
|
85
88
|
if span_type in ["image", "table"]:
|
|
86
89
|
span = {
|
|
87
90
|
"bbox": block_bbox,
|
|
@@ -102,6 +105,8 @@ class MagicModel:
|
|
|
102
105
|
|
|
103
106
|
if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
|
|
104
107
|
|
|
108
|
+
switch_code_to_algorithm = True
|
|
109
|
+
|
|
105
110
|
# 生成包含文本和公式的span列表
|
|
106
111
|
spans = []
|
|
107
112
|
last_end = 0
|
|
@@ -160,7 +165,9 @@ class MagicModel:
|
|
|
160
165
|
|
|
161
166
|
# 构造line对象
|
|
162
167
|
if block_type in [BlockType.CODE_BODY]:
|
|
163
|
-
|
|
168
|
+
if switch_code_to_algorithm and code_block_sub_type == "code":
|
|
169
|
+
code_block_sub_type = "algorithm"
|
|
170
|
+
line = {"bbox": block_bbox, "spans": spans, "extra": {"type": code_block_sub_type, "guess_lang": guess_lang}}
|
|
164
171
|
else:
|
|
165
172
|
line = {"bbox": block_bbox, "spans": spans}
|
|
166
173
|
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
+
from loguru import logger
|
|
3
4
|
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
|
|
4
|
-
from mineru.utils.enum_class import MakeMode, BlockType, ContentType
|
|
5
|
+
from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
|
|
5
6
|
|
|
6
7
|
latex_delimiters_config = get_latex_delimiter_config()
|
|
7
8
|
|
|
@@ -234,6 +235,277 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
|
234
235
|
|
|
235
236
|
return para_content
|
|
236
237
|
|
|
238
|
+
|
|
239
|
+
def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
    """Build the content-list-v2 entry for a single paragraph block.

    The entry is a dict with a ``type`` (a ``ContentTypeV2`` value) and a
    ``content`` payload whose keys depend on the block type. When the block
    carries a truthy ``bbox``, it is added to the entry rescaled to a
    0-1000 grid relative to the page size (truncated with ``int``).

    Args:
        para_block: Block dict; ``type`` is required. Depending on the type,
            ``blocks``, ``sub_type``, ``guess_lang`` and ``bbox`` are read too.
        img_buket_path: Prefix placed, followed by ``/``, in front of every
            image path written into the entry.
        page_size: ``(page_width, page_height)`` pair used to rescale ``bbox``.

    Returns:
        The entry dict. A block type handled by none of the branches yields a
        dict without ``type``/``content`` (empty, or holding only ``bbox``).

    Raises:
        ValueError: For a code or list block whose ``sub_type`` is not one of
            the handled values.
    """
    block_kind = para_block['type']

    def paragraph_entry():
        # Plain paragraph; also used for titles whose level is 0.
        return {
            'type': ContentTypeV2.PARAGRAPH,
            'content': {
                'paragraph_content': merge_para_with_text_v2(para_block),
            },
        }

    def image_source_of(relative_path):
        return {'path': f"{img_buket_path}/{relative_path}"}

    # Page furniture maps one-to-one onto a v2 type; the payload key is
    # derived from that type's string value.
    page_level_kinds = (
        (BlockType.HEADER, ContentTypeV2.PAGE_HEADER),
        (BlockType.FOOTER, ContentTypeV2.PAGE_FOOTER),
        (BlockType.ASIDE_TEXT, ContentTypeV2.PAGE_ASIDE_TEXT),
        (BlockType.PAGE_NUMBER, ContentTypeV2.PAGE_NUMBER),
        (BlockType.PAGE_FOOTNOTE, ContentTypeV2.PAGE_FOOTNOTE),
    )
    page_kind = next(
        (v2_kind for kind, v2_kind in page_level_kinds if block_kind == kind),
        None,
    )

    entry = {}
    if page_kind is not None:
        entry = {
            'type': page_kind,
            'content': {
                f"{page_kind}_content": merge_para_with_text_v2(para_block),
            },
        }
    elif block_kind == BlockType.TITLE:
        level = get_title_level(para_block)
        if level == 0:
            entry = paragraph_entry()
        else:
            entry = {
                'type': ContentTypeV2.TITLE,
                'content': {
                    'title_content': merge_para_with_text_v2(para_block),
                    'level': level,
                },
            }
    elif block_kind in (BlockType.TEXT, BlockType.PHONETIC):
        entry = paragraph_entry()
    elif block_kind == BlockType.INTERLINE_EQUATION:
        picture_path, latex = get_body_data(para_block)
        entry = {
            'type': ContentTypeV2.EQUATION_INTERLINE,
            'content': {
                'math_content': latex,
                'math_type': 'latex',
                'image_source': image_source_of(picture_path),
            },
        }
    elif block_kind == BlockType.IMAGE:
        captions, footnotes = [], []
        picture_path, _ = get_body_data(para_block)
        source = image_source_of(picture_path)
        for child in para_block['blocks']:
            child_kind = child['type']
            if child_kind == BlockType.IMAGE_CAPTION:
                captions.extend(merge_para_with_text_v2(child))
            if child_kind == BlockType.IMAGE_FOOTNOTE:
                footnotes.extend(merge_para_with_text_v2(child))
        entry = {
            'type': ContentTypeV2.IMAGE,
            'content': {
                'image_source': source,
                'image_caption': captions,
                'image_footnote': footnotes,
            },
        }
    elif block_kind == BlockType.TABLE:
        captions, footnotes = [], []
        picture_path, html = get_body_data(para_block)
        source = image_source_of(picture_path)
        # More than one "<table" occurrence is reported as nesting level 2.
        nest_level = 2 if html.count("<table") > 1 else 1
        has_spanning_cells = "colspan" in html or "rowspan" in html
        if has_spanning_cells or nest_level > 1:
            table_kind = ContentTypeV2.TABLE_COMPLEX
        else:
            table_kind = ContentTypeV2.TABLE_SIMPLE

        for child in para_block['blocks']:
            child_kind = child['type']
            if child_kind == BlockType.TABLE_CAPTION:
                captions.extend(merge_para_with_text_v2(child))
            if child_kind == BlockType.TABLE_FOOTNOTE:
                footnotes.extend(merge_para_with_text_v2(child))
        entry = {
            'type': ContentTypeV2.TABLE,
            'content': {
                'image_source': source,
                'table_caption': captions,
                'table_footnote': footnotes,
                'html': html,
                'table_type': table_kind,
                'table_nest_level': nest_level,
            },
        }
    elif block_kind == BlockType.CODE:
        captions = []
        body = []
        for child in para_block['blocks']:
            child_kind = child['type']
            if child_kind == BlockType.CODE_CAPTION:
                captions.extend(merge_para_with_text_v2(child))
            if child_kind == BlockType.CODE_BODY:
                # Assigned, not extended: the last code body seen wins.
                body = merge_para_with_text_v2(child)
        declared = para_block["sub_type"]
        if declared == BlockType.CODE:
            entry = {
                'type': ContentTypeV2.CODE,
                'content': {
                    'code_caption': captions,
                    'code_content': body,
                    'code_language': para_block.get('guess_lang', 'txt'),
                },
            }
        elif declared == BlockType.ALGORITHM:
            entry = {
                'type': ContentTypeV2.ALGORITHM,
                'content': {
                    'algorithm_caption': captions,
                    'algorithm_content': body,
                },
            }
        else:
            raise ValueError(f"Unknown code sub_type: {declared}")
    elif block_kind == BlockType.REF_TEXT:
        # A standalone reference becomes a one-item reference list.
        only_item = {
            'item_type': 'text',
            'item_content': merge_para_with_text_v2(para_block),
        }
        entry = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': ContentTypeV2.LIST_REF,
                'list_items': [only_item],
            },
        }
    elif block_kind == BlockType.LIST:
        if 'sub_type' not in para_block:
            list_kind = ContentTypeV2.LIST_TEXT
        elif para_block['sub_type'] == BlockType.REF_TEXT:
            list_kind = ContentTypeV2.LIST_REF
        elif para_block['sub_type'] == BlockType.TEXT:
            list_kind = ContentTypeV2.LIST_TEXT
        else:
            raise ValueError(f"Unknown list sub_type: {para_block['sub_type']}")
        items = []
        for child in para_block['blocks']:
            merged = merge_para_with_text_v2(child)
            # Children that merge to nothing are left out of the list.
            if merged:
                items.append({
                    'item_type': 'text',
                    'item_content': merged,
                })
        entry = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': list_kind,
                'list_items': items,
            },
        }

    page_width, page_height = page_size
    bbox = para_block.get('bbox')
    if bbox:
        left, top, right, bottom = bbox
        scaled_against = (
            (left, page_width),
            (top, page_height),
            (right, page_width),
            (bottom, page_height),
        )
        entry['bbox'] = [
            int(coord * 1000 / extent) for coord, extent in scaled_against
        ]

    return entry
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def get_body_data(para_block):
    """Return ``(image_path, text)`` taken from the first usable span.

    Spans are scanned in order and the first one of a recognised type decides
    the result:

    - TABLE span: ``(image_path, html)``
    - IMAGE span: ``(image_path, '')``
    - INTERLINE_EQUATION span: ``(image_path, content)``
    - TEXT span: ``('', content)``

    Missing keys default to ``''``. When ``para_block`` has a ``blocks`` key,
    only its IMAGE_BODY / TABLE_BODY / CODE_BODY children are searched, and
    the first child that yields something other than ``('', '')`` wins.
    Otherwise the block's own ``lines`` are searched. ``('', '')`` is returned
    when nothing matches.
    """
    nothing = ('', '')
    body_kinds = (BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY)

    def first_payload(lines):
        for line in lines:
            for span in line.get('spans', []):
                kind = span.get('type')
                if kind == ContentType.TABLE:
                    return span.get('image_path', ''), span.get('html', '')
                if kind == ContentType.IMAGE:
                    return span.get('image_path', ''), ''
                if kind == ContentType.INTERLINE_EQUATION:
                    return span.get('image_path', ''), span.get('content', '')
                if kind == ContentType.TEXT:
                    return '', span.get('content', '')
        return nothing

    # Flat structure: the block holds its lines directly.
    if 'blocks' not in para_block:
        return first_payload(para_block.get('lines', []))

    # Nested structure: look only inside the body children.
    for child in para_block['blocks']:
        if child.get('type') not in body_kinds:
            continue
        payload = first_payload(child.get('lines', []))
        if payload != nothing:
            return payload
    return nothing
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def merge_para_with_text_v2(para_block):
    """Flatten a block's spans into a list of ``{'type', 'content'}`` dicts.

    Spans with falsy ``content`` are skipped. Before a span is emitted its
    type is remapped: text spans inside a PHONETIC block become
    ``ContentTypeV2.SPAN_PHONETIC``, and ``ContentType.INLINE_EQUATION``
    becomes ``ContentTypeV2.SPAN_EQUATION_INLINE``. A span whose resulting
    type is not one of the accepted v2 span types is dropped with a warning.

    Args:
        para_block: Block dict providing ``type`` and ``lines``; every line
            provides ``spans`` and every span ``type`` and ``content``.

    Returns:
        The emitted span dicts, in reading order.
    """
    accepted_kinds = (
        ContentTypeV2.SPAN_TEXT,
        ContentTypeV2.SPAN_PHONETIC,
        ContentTypeV2.SPAN_EQUATION_INLINE,
        ContentTypeV2.SPAN_MD,
        ContentTypeV2.SPAN_CODE_INLINE,
    )
    merged = []
    in_phonetic_block = para_block['type'] == BlockType.PHONETIC
    for line in para_block['lines']:
        for span in line['spans']:
            kind = span['type']
            text = span['content']
            if not text:
                continue
            if in_phonetic_block and kind == ContentTypeV2.SPAN_TEXT:
                kind = ContentTypeV2.SPAN_PHONETIC
            if kind == ContentType.INLINE_EQUATION:
                kind = ContentTypeV2.SPAN_EQUATION_INLINE
            if kind not in accepted_kinds:
                logger.warning(f"Unknown span type in merge_para_with_text_v2: {kind}")
                continue
            merged.append({'type': kind, 'content': text})
    return merged
|
|
507
|
+
|
|
508
|
+
|
|
237
509
|
def union_make(pdf_info_dict: list,
|
|
238
510
|
make_mode: str,
|
|
239
511
|
img_buket_path: str = '',
|
|
@@ -260,10 +532,20 @@ def union_make(pdf_info_dict: list,
|
|
|
260
532
|
for para_block in para_blocks:
|
|
261
533
|
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
262
534
|
output_content.append(para_content)
|
|
535
|
+
elif make_mode == MakeMode.CONTENT_LIST_V2:
|
|
536
|
+
# https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
|
|
537
|
+
page_contents = []
|
|
538
|
+
para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
|
|
539
|
+
if not para_blocks:
|
|
540
|
+
continue
|
|
541
|
+
for para_block in para_blocks:
|
|
542
|
+
para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
|
|
543
|
+
page_contents.append(para_content)
|
|
544
|
+
output_content.append(page_contents)
|
|
263
545
|
|
|
264
546
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
|
265
547
|
return '\n\n'.join(output_content)
|
|
266
|
-
elif make_mode
|
|
548
|
+
elif make_mode in [MakeMode.CONTENT_LIST, MakeMode.CONTENT_LIST_V2]:
|
|
267
549
|
return output_content
|
|
268
550
|
return None
|
|
269
551
|
|
mineru/cli/client.py
CHANGED
|
@@ -113,7 +113,7 @@ if is_mac_os_version_supported():
|
|
|
113
113
|
'--formula',
|
|
114
114
|
'formula_enable',
|
|
115
115
|
type=bool,
|
|
116
|
-
help='Enable formula parsing. Default is True.
|
|
116
|
+
help='Enable formula parsing. Default is True. ',
|
|
117
117
|
default=True,
|
|
118
118
|
)
|
|
119
119
|
@click.option(
|
|
@@ -121,7 +121,7 @@ if is_mac_os_version_supported():
|
|
|
121
121
|
'--table',
|
|
122
122
|
'table_enable',
|
|
123
123
|
type=bool,
|
|
124
|
-
help='Enable table parsing. Default is True.
|
|
124
|
+
help='Enable table parsing. Default is True. ',
|
|
125
125
|
default=True,
|
|
126
126
|
)
|
|
127
127
|
@click.option(
|
|
@@ -172,9 +172,8 @@ def main(
|
|
|
172
172
|
def get_virtual_vram_size() -> int:
|
|
173
173
|
if virtual_vram is not None:
|
|
174
174
|
return virtual_vram
|
|
175
|
-
|
|
176
|
-
return
|
|
177
|
-
return 1
|
|
175
|
+
else:
|
|
176
|
+
return get_vram(get_device_mode())
|
|
178
177
|
if os.getenv('MINERU_VIRTUAL_VRAM_SIZE', None) is None:
|
|
179
178
|
os.environ['MINERU_VIRTUAL_VRAM_SIZE']= str(get_virtual_vram_size())
|
|
180
179
|
|
mineru/cli/common.py
CHANGED
|
@@ -144,6 +144,13 @@ def _process_output(
|
|
|
144
144
|
f"{pdf_file_name}_content_list.json",
|
|
145
145
|
json.dumps(content_list, ensure_ascii=False, indent=4),
|
|
146
146
|
)
|
|
147
|
+
if not is_pipeline:
|
|
148
|
+
content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
|
|
149
|
+
md_writer.write_string(
|
|
150
|
+
f"{pdf_file_name}_content_list_v2.json",
|
|
151
|
+
json.dumps(content_list_v2, ensure_ascii=False, indent=4),
|
|
152
|
+
)
|
|
153
|
+
|
|
147
154
|
|
|
148
155
|
if f_dump_middle_json:
|
|
149
156
|
md_writer.write_string(
|
mineru/cli/fast_api.py
CHANGED
|
@@ -8,7 +8,7 @@ import click
|
|
|
8
8
|
import zipfile
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
import glob
|
|
11
|
-
from fastapi import FastAPI, UploadFile, File, Form
|
|
11
|
+
from fastapi import Depends, FastAPI, HTTPException, UploadFile, File, Form
|
|
12
12
|
from fastapi.middleware.gzip import GZipMiddleware
|
|
13
13
|
from fastapi.responses import JSONResponse, FileResponse
|
|
14
14
|
from starlette.background import BackgroundTask
|
|
@@ -21,14 +21,53 @@ from mineru.utils.cli_parser import arg_parse
|
|
|
21
21
|
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
|
22
22
|
from mineru.version import __version__
|
|
23
23
|
|
|
24
|
-
|
|
25
|
-
|
|
24
|
+
# 并发控制器
|
|
25
|
+
_request_semaphore: Optional[asyncio.Semaphore] = None
|
|
26
|
+
|
|
27
|
+
# 并发控制依赖函数
|
|
28
|
+
async def limit_concurrency():
    """Yield-style dependency that caps the number of in-flight requests.

    With no semaphore configured it yields immediately. Otherwise, if the
    semaphore is already locked the request is rejected with HTTP 503 instead
    of being queued; if not, the semaphore is held for as long as the
    generator stays suspended at its ``yield``.

    Raises:
        HTTPException: 503 when the semaphore is locked.
    """
    gate = _request_semaphore
    if gate is None:
        yield
        return

    if gate.locked():
        configured_limit = os.getenv('MINERU_API_MAX_CONCURRENT_REQUESTS', 'unset')
        raise HTTPException(
            status_code=503,
            detail=f"Server is at maximum capacity: {configured_limit}. Please try again later."
        )

    async with gate:
        yield
|
|
39
|
+
|
|
40
|
+
def create_app():
    """Create the FastAPI application and set up request limiting.

    Environment variables read:

    - ``MINERU_API_ENABLE_FASTAPI_DOCS``: the OpenAPI schema, Swagger UI and
      ReDoc routes are enabled when the value (case-insensitive) is ``1``,
      ``true`` or ``yes``; the default is ``1``. Any other value disables all
      three routes.
    - ``MINERU_API_MAX_CONCURRENT_REQUESTS``: a positive integer installs a
      module-level semaphore of that size; a missing, non-numeric or
      non-positive value leaves the semaphore unchanged.

    Returns:
        The FastAPI app with gzip compression middleware added.
    """
    global _request_semaphore

    docs_flag = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower()
    if docs_flag in ("1", "true", "yes"):
        doc_routes = {
            "openapi_url": "/openapi.json",
            "docs_url": "/docs",
            "redoc_url": "/redoc",
        }
    else:
        doc_routes = {"openapi_url": None, "docs_url": None, "redoc_url": None}
    api = FastAPI(**doc_routes)

    # Concurrency limit comes from MINERU_API_MAX_CONCURRENT_REQUESTS;
    # an unparsable value is treated as 0.
    raw_limit = os.getenv("MINERU_API_MAX_CONCURRENT_REQUESTS", "0")
    try:
        max_concurrent_requests = int(raw_limit)
    except ValueError:
        max_concurrent_requests = 0

    if max_concurrent_requests > 0:
        _request_semaphore = asyncio.Semaphore(max_concurrent_requests)
        logger.info(f"Request concurrency limited to {max_concurrent_requests}")

    api.add_middleware(GZipMiddleware, minimum_size=1000)
    return api
|
|
63
|
+
|
|
64
|
+
app = create_app()
|
|
26
65
|
|
|
27
66
|
|
|
28
67
|
def sanitize_filename(filename: str) -> str:
|
|
29
68
|
"""
|
|
30
69
|
格式化压缩文件的文件名
|
|
31
|
-
移除路径遍历字符, 保留 Unicode 字母、数字、._-
|
|
70
|
+
移除路径遍历字符, 保留 Unicode 字母、数字、._-
|
|
32
71
|
禁止隐藏文件
|
|
33
72
|
"""
|
|
34
73
|
sanitized = re.sub(r'[/\\\.]{2,}|[/\\]', '', filename)
|
|
@@ -60,24 +99,48 @@ def get_infer_result(file_suffix_identifier: str, pdf_name: str, parse_dir: str)
|
|
|
60
99
|
return None
|
|
61
100
|
|
|
62
101
|
|
|
63
|
-
@app.post(path="/file_parse",)
|
|
102
|
+
@app.post(path="/file_parse", dependencies=[Depends(limit_concurrency)])
|
|
64
103
|
async def parse_pdf(
|
|
65
|
-
files: List[UploadFile] = File(
|
|
66
|
-
output_dir: str = Form("./output"),
|
|
67
|
-
lang_list: List[str] = Form(
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
104
|
+
files: List[UploadFile] = File(..., description="Upload pdf or image files for parsing"),
|
|
105
|
+
output_dir: str = Form("./output", description="Output local directory"),
|
|
106
|
+
lang_list: List[str] = Form(
|
|
107
|
+
["ch"],
|
|
108
|
+
description="""(Adapted only for pipeline backend)Input the languages in the pdf to improve OCR accuracy.
|
|
109
|
+
Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th, el, latin, arabic, east_slavic, cyrillic, devanagari.
|
|
110
|
+
"""
|
|
111
|
+
),
|
|
112
|
+
backend: str = Form(
|
|
113
|
+
"pipeline",
|
|
114
|
+
description="""The backend for parsing:
|
|
115
|
+
- pipeline: More general
|
|
116
|
+
- vlm-transformers: More general, but slower
|
|
117
|
+
- vlm-mlx-engine: Faster than transformers (need apple silicon and macOS 13.5+)
|
|
118
|
+
- vlm-vllm-async-engine: Faster (vllm-engine, need vllm installed)
|
|
119
|
+
- vlm-lmdeploy-engine: Faster (lmdeploy-engine, need lmdeploy installed)
|
|
120
|
+
- vlm-http-client: Faster (client suitable for openai-compatible servers)"""
|
|
121
|
+
),
|
|
122
|
+
parse_method: str = Form(
|
|
123
|
+
"auto",
|
|
124
|
+
description="""(Adapted only for pipeline backend)The method for parsing PDF:
|
|
125
|
+
- auto: Automatically determine the method based on the file type
|
|
126
|
+
- txt: Use text extraction method
|
|
127
|
+
- ocr: Use OCR method for image-based PDFs
|
|
128
|
+
"""
|
|
129
|
+
),
|
|
130
|
+
formula_enable: bool = Form(True, description="Enable formula parsing."),
|
|
131
|
+
table_enable: bool = Form(True, description="Enable table parsing."),
|
|
132
|
+
server_url: Optional[str] = Form(
|
|
133
|
+
None,
|
|
134
|
+
description="(Adapted only for vlm-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000"
|
|
135
|
+
),
|
|
136
|
+
return_md: bool = Form(True, description="Return markdown content in response"),
|
|
137
|
+
return_middle_json: bool = Form(False, description="Return middle JSON in response"),
|
|
138
|
+
return_model_output: bool = Form(False, description="Return model output JSON in response"),
|
|
139
|
+
return_content_list: bool = Form(False, description="Return content list JSON in response"),
|
|
140
|
+
return_images: bool = Form(False, description="Return extracted images in response"),
|
|
141
|
+
response_format_zip: bool = Form(False, description="Return results as a ZIP file instead of JSON"),
|
|
142
|
+
start_page_id: int = Form(0, description="The starting page for PDF parsing, beginning from 0"),
|
|
143
|
+
end_page_id: int = Form(99999, description="The ending page for PDF parsing, beginning from 0"),
|
|
81
144
|
):
|
|
82
145
|
|
|
83
146
|
# 获取命令行配置参数
|
|
@@ -153,7 +216,7 @@ async def parse_pdf(
|
|
|
153
216
|
# 根据 response_format_zip 决定返回类型
|
|
154
217
|
if response_format_zip:
|
|
155
218
|
zip_fd, zip_path = tempfile.mkstemp(suffix=".zip", prefix="mineru_results_")
|
|
156
|
-
os.close(zip_fd)
|
|
219
|
+
os.close(zip_fd)
|
|
157
220
|
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
|
158
221
|
for pdf_name in pdf_file_names:
|
|
159
222
|
safe_pdf_name = sanitize_filename(pdf_name)
|
|
@@ -178,7 +241,7 @@ async def parse_pdf(
|
|
|
178
241
|
|
|
179
242
|
if return_model_output:
|
|
180
243
|
path = os.path.join(parse_dir, f"{pdf_name}_model.json")
|
|
181
|
-
if os.path.exists(path):
|
|
244
|
+
if os.path.exists(path):
|
|
182
245
|
zf.write(path, arcname=os.path.join(safe_pdf_name, os.path.basename(path)))
|
|
183
246
|
|
|
184
247
|
if return_content_list:
|
|
@@ -259,11 +322,16 @@ def main(ctx, host, port, reload, **kwargs):
|
|
|
259
322
|
# 将配置参数存储到应用状态中
|
|
260
323
|
app.state.config = kwargs
|
|
261
324
|
|
|
325
|
+
# 将 CLI 的并发参数同步到环境变量,确保 uvicorn 重载子进程可见
|
|
326
|
+
try:
|
|
327
|
+
mcr = int(kwargs.get("mineru_api_max_concurrent_requests", 0) or 0)
|
|
328
|
+
except ValueError:
|
|
329
|
+
mcr = 0
|
|
330
|
+
os.environ["MINERU_API_MAX_CONCURRENT_REQUESTS"] = str(mcr)
|
|
331
|
+
|
|
262
332
|
"""启动MinerU FastAPI服务器的命令行入口"""
|
|
263
333
|
print(f"Start MinerU FastAPI Service: http://{host}:{port}")
|
|
264
|
-
print("
|
|
265
|
-
print(f"- Swagger UI: http://{host}:{port}/docs")
|
|
266
|
-
print(f"- ReDoc: http://{host}:{port}/redoc")
|
|
334
|
+
print(f"API documentation: http://{host}:{port}/docs")
|
|
267
335
|
|
|
268
336
|
uvicorn.run(
|
|
269
337
|
"mineru.cli.fast_api:app",
|
|
@@ -274,4 +342,4 @@ def main(ctx, host, port, reload, **kwargs):
|
|
|
274
342
|
|
|
275
343
|
|
|
276
344
|
if __name__ == "__main__":
|
|
277
|
-
main()
|
|
345
|
+
main()
|
mineru/utils/enum_class.py
CHANGED
|
@@ -38,6 +38,31 @@ class ContentType:
|
|
|
38
38
|
CODE = 'code'
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
class ContentTypeV2:
    """String constants used in the content-list-v2 output."""

    # Block-level entry types, including the table and list sub-kinds.
    CODE = "code"
    ALGORITHM = "algorithm"
    EQUATION_INTERLINE = "equation_interline"
    IMAGE = "image"
    TABLE = "table"
    TABLE_SIMPLE = "simple_table"
    TABLE_COMPLEX = "complex_table"
    LIST = "list"
    LIST_TEXT = "text_list"
    LIST_REF = "reference_list"
    TITLE = "title"
    PARAGRAPH = "paragraph"

    # Span-level types found inside merged text content.
    SPAN_TEXT = "text"
    SPAN_EQUATION_INLINE = "equation_inline"
    SPAN_PHONETIC = "phonetic"
    SPAN_MD = "md"
    SPAN_CODE_INLINE = "code_inline"

    # Page-level types (headers, footers, page numbers, ...).
    PAGE_HEADER = "page_header"
    PAGE_FOOTER = "page_footer"
    PAGE_NUMBER = "page_number"
    PAGE_ASIDE_TEXT = "page_aside_text"
    PAGE_FOOTNOTE = "page_footnote"
|
|
64
|
+
|
|
65
|
+
|
|
41
66
|
class CategoryId:
|
|
42
67
|
Title = 0
|
|
43
68
|
Text = 1
|
|
@@ -60,6 +85,7 @@ class MakeMode:
|
|
|
60
85
|
MM_MD = 'mm_markdown'
|
|
61
86
|
NLP_MD = 'nlp_markdown'
|
|
62
87
|
CONTENT_LIST = 'content_list'
|
|
88
|
+
CONTENT_LIST_V2 = 'content_list_v2'
|
|
63
89
|
|
|
64
90
|
|
|
65
91
|
class ModelPath:
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
|
|
3
|
+
from loguru import logger
|
|
3
4
|
from magika import Magika
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
DEFAULT_LANG = "txt"
|
|
8
|
+
PDF_SIG_BYTES = b'%PDF'
|
|
7
9
|
magika = Magika()
|
|
8
10
|
|
|
9
11
|
def guess_language_by_text(code):
|
|
@@ -14,7 +16,7 @@ def guess_language_by_text(code):
|
|
|
14
16
|
|
|
15
17
|
def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
|
|
16
18
|
suffix = magika.identify_bytes(file_bytes).prediction.output.label
|
|
17
|
-
if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
|
|
19
|
+
if file_path and suffix in ["ai", "html"] and Path(file_path).suffix.lower() in [".pdf"] and file_bytes[:4] == PDF_SIG_BYTES:
|
|
18
20
|
suffix = "pdf"
|
|
19
21
|
return suffix
|
|
20
22
|
|
|
@@ -23,6 +25,11 @@ def guess_suffix_by_path(file_path) -> str:
|
|
|
23
25
|
if not isinstance(file_path, Path):
|
|
24
26
|
file_path = Path(file_path)
|
|
25
27
|
suffix = magika.identify_path(file_path).prediction.output.label
|
|
26
|
-
if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
|
|
27
|
-
|
|
28
|
+
if suffix in ["ai", "html"] and file_path.suffix.lower() in [".pdf"]:
|
|
29
|
+
try:
|
|
30
|
+
with open(file_path, 'rb') as f:
|
|
31
|
+
if f.read(4) == PDF_SIG_BYTES:
|
|
32
|
+
suffix = "pdf"
|
|
33
|
+
except Exception as e:
|
|
34
|
+
logger.warning(f"Failed to read file {file_path} for PDF signature check: {e}")
|
|
28
35
|
return suffix
|
mineru/utils/model_utils.py
CHANGED
|
@@ -428,8 +428,6 @@ def clean_memory(device='cuda'):
|
|
|
428
428
|
|
|
429
429
|
def clean_vram(device, vram_threshold=8):
|
|
430
430
|
total_memory = get_vram(device)
|
|
431
|
-
if total_memory is not None:
|
|
432
|
-
total_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(total_memory)))
|
|
433
431
|
if total_memory and total_memory <= vram_threshold:
|
|
434
432
|
gc_start = time.time()
|
|
435
433
|
clean_memory(device)
|
|
@@ -437,13 +435,28 @@ def clean_vram(device, vram_threshold=8):
|
|
|
437
435
|
# logger.info(f"gc time: {gc_time}")
|
|
438
436
|
|
|
439
437
|
|
|
440
|
-
def get_vram(device):
|
|
438
|
+
def get_vram(device) -> int:
|
|
439
|
+
env_vram = os.getenv("MINERU_VIRTUAL_VRAM_SIZE")
|
|
440
|
+
|
|
441
|
+
# 如果环境变量已配置,尝试解析并返回
|
|
442
|
+
if env_vram is not None:
|
|
443
|
+
try:
|
|
444
|
+
total_memory = int(env_vram)
|
|
445
|
+
if total_memory > 0:
|
|
446
|
+
return total_memory
|
|
447
|
+
else:
|
|
448
|
+
logger.warning(
|
|
449
|
+
f"MINERU_VIRTUAL_VRAM_SIZE value '{env_vram}' is not positive, falling back to auto-detection")
|
|
450
|
+
except ValueError:
|
|
451
|
+
logger.warning(
|
|
452
|
+
f"MINERU_VIRTUAL_VRAM_SIZE value '{env_vram}' is not a valid integer, falling back to auto-detection")
|
|
453
|
+
|
|
454
|
+
# 环境变量未配置或配置错误,根据device自动获取
|
|
455
|
+
total_memory = 1
|
|
441
456
|
if torch.cuda.is_available() and str(device).startswith("cuda"):
|
|
442
|
-
total_memory = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3) # 将字节转换为 GB
|
|
443
|
-
return total_memory
|
|
457
|
+
total_memory = round(torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)) # 将字节转换为 GB
|
|
444
458
|
elif str(device).startswith("npu"):
|
|
445
459
|
if torch_npu.npu.is_available():
|
|
446
|
-
total_memory = torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3) # 转为 GB
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
return None
|
|
460
|
+
total_memory = round(torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
461
|
+
|
|
462
|
+
return total_memory
|
mineru/utils/table_merge.py
CHANGED
|
@@ -7,6 +7,9 @@ from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
|
|
|
7
7
|
from mineru.utils.enum_class import BlockType, SplitFlag
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
CONTINUATION_MARKERS = ["(续)", "(续表)", "(continued)", "(cont.)"]
|
|
11
|
+
|
|
12
|
+
|
|
10
13
|
def full_to_half(text: str) -> str:
|
|
11
14
|
"""Convert full-width characters to half-width characters using code point manipulation.
|
|
12
15
|
|
|
@@ -174,8 +177,13 @@ def can_merge_tables(current_table_block, previous_table_block):
|
|
|
174
177
|
# 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
|
|
175
178
|
caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
|
|
176
179
|
if caption_blocks:
|
|
177
|
-
# 如果所有caption都不以"(续)"结尾,则不合并
|
|
178
|
-
|
|
180
|
+
# 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并
|
|
181
|
+
|
|
182
|
+
if not any(
|
|
183
|
+
any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower())
|
|
184
|
+
for marker in CONTINUATION_MARKERS)
|
|
185
|
+
for block in caption_blocks
|
|
186
|
+
):
|
|
179
187
|
return False, None, None, None, None
|
|
180
188
|
|
|
181
189
|
if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.6.5"
|
|
1
|
+
__version__ = "2.6.7"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.6.5
|
|
3
|
+
Version: 2.6.7
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -74,8 +74,8 @@ Requires-Dist: fastapi; extra == "api"
|
|
|
74
74
|
Requires-Dist: python-multipart; extra == "api"
|
|
75
75
|
Requires-Dist: uvicorn; extra == "api"
|
|
76
76
|
Provides-Extra: gradio
|
|
77
|
-
Requires-Dist: gradio
|
|
78
|
-
Requires-Dist: gradio-pdf
|
|
77
|
+
Requires-Dist: gradio==5.49.1; extra == "gradio"
|
|
78
|
+
Requires-Dist: gradio-pdf==0.0.22; extra == "gradio"
|
|
79
79
|
Provides-Extra: core
|
|
80
80
|
Requires-Dist: mineru[vlm]; extra == "core"
|
|
81
81
|
Requires-Dist: mineru[pipeline]; extra == "core"
|
|
@@ -134,6 +134,16 @@ Dynamic: license-file
|
|
|
134
134
|
</div>
|
|
135
135
|
|
|
136
136
|
# Changelog
|
|
137
|
+
|
|
138
|
+
- 2025/12/12 2.6.7 Release
|
|
139
|
+
- Bug fix: #4168
|
|
140
|
+
|
|
141
|
+
- 2025/12/02 2.6.6 Release
|
|
142
|
+
- `mineru-api` tool optimizations
|
|
143
|
+
- Added descriptive text to `mineru-api` interface parameters to improve API documentation readability.
|
|
144
|
+
- You can use the environment variable `MINERU_API_ENABLE_FASTAPI_DOCS` to control whether the auto-generated interface documentation page is enabled (enabled by default).
|
|
145
|
+
- Added concurrency configuration options for the `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`, and `vlm-http-client` backends. Users can use the environment variable `MINERU_API_MAX_CONCURRENT_REQUESTS` to set the maximum number of concurrent API requests (unlimited by default).
|
|
146
|
+
|
|
137
147
|
- 2025/11/26 2.6.5 Release
|
|
138
148
|
- Added support for a new backend vlm-lmdeploy-engine. Its usage is similar to vlm-vllm-(async)engine, but it uses lmdeploy as the inference engine and additionally supports native inference acceleration on Windows platforms compared to vllm.
|
|
139
149
|
|
|
@@ -887,6 +897,8 @@ Currently, some models in this project are trained based on YOLO. However, since
|
|
|
887
897
|
- [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
|
|
888
898
|
- [pypdf](https://github.com/py-pdf/pypdf)
|
|
889
899
|
- [magika](https://github.com/google/magika)
|
|
900
|
+
- [vLLM](https://github.com/vllm-project/vllm)
|
|
901
|
+
- [LMDeploy](https://github.com/InternLM/lmdeploy)
|
|
890
902
|
|
|
891
903
|
# Citation
|
|
892
904
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=Kc2cyTvfIdfaMxraNhj61Hw9Mr3C-eTCdQPLK5KH1AQ,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
@@ -8,19 +8,19 @@ mineru/backend/pipeline/model_init.py,sha256=OAylOcQD9gu5TBcX7nMt7X5NpJMtQICI5Iv
|
|
|
8
8
|
mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
|
|
9
9
|
mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
|
|
10
10
|
mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
|
|
11
|
-
mineru/backend/pipeline/pipeline_analyze.py,sha256=
|
|
11
|
+
mineru/backend/pipeline/pipeline_analyze.py,sha256=GkGOrWGnBSswUik3nt_m76bCwDISC9sxXZ6xRX3L154,6528
|
|
12
12
|
mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
|
|
13
13
|
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=YlnEbbUnkniZXS13aLo5mjfFQvQM5SrIVvTAGBZsLmw,14478
|
|
14
14
|
mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
15
15
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
16
|
-
mineru/backend/vlm/utils.py,sha256=
|
|
16
|
+
mineru/backend/vlm/utils.py,sha256=JMgS3SMFcHJYH2jIx-Xhs-P2a1bmT8U6Kn60IL0OmQA,3570
|
|
17
17
|
mineru/backend/vlm/vlm_analyze.py,sha256=wP3vuYGVec0hRsDAuzfSm2HD4Muu7wSWL767qxd_yqw,11690
|
|
18
|
-
mineru/backend/vlm/vlm_magic_model.py,sha256=
|
|
19
|
-
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=
|
|
18
|
+
mineru/backend/vlm/vlm_magic_model.py,sha256=e8BWkfeRkZNJjFdm9oPmvIs9ATjdzCy_5OZw2qTziZA,23839
|
|
19
|
+
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=9ig7kfbKDrG5QOwvYoXfcsK5EY9_VyAFmqz733Il-P0,24071
|
|
20
20
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
21
|
-
mineru/cli/client.py,sha256=
|
|
22
|
-
mineru/cli/common.py,sha256=
|
|
23
|
-
mineru/cli/fast_api.py,sha256=
|
|
21
|
+
mineru/cli/client.py,sha256=XSEIr4klUuufMAWn5IioZdXpg1xAxqRZF0HkaVIhxh0,6815
|
|
22
|
+
mineru/cli/common.py,sha256=Rld3P4FzbTYngy3BP-Irto9aMS7EPq2lIWgeIF1xPMc,14667
|
|
23
|
+
mineru/cli/fast_api.py,sha256=lLxQKKHmD8ruoZGcE6LrXzr3pQIxvw8OdJrQq_FNLSM,14447
|
|
24
24
|
mineru/cli/gradio_app.py,sha256=EUPuRHHCOECrE3E3VNEeuMDYeC3nicurOYfk8YJSOMw,15646
|
|
25
25
|
mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
|
|
26
26
|
mineru/cli/vlm_server.py,sha256=27HaqO3wpMXSA_nA3CC6JOBTHK3q66SP00cD6m9HuQE,1974
|
|
@@ -159,14 +159,14 @@ mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,14
|
|
|
159
159
|
mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
|
|
160
160
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
161
161
|
mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
|
|
162
|
-
mineru/utils/enum_class.py,sha256
|
|
162
|
+
mineru/utils/enum_class.py,sha256=NO9FiM2Ni6-SI1rzMm5X98ssNDxgJTyykZddJSscgJs,3221
|
|
163
163
|
mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,10233
|
|
164
|
-
mineru/utils/guess_suffix_or_lang.py,sha256=
|
|
164
|
+
mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6of8eDVkc,1198
|
|
165
165
|
mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
|
|
166
166
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
167
167
|
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
168
168
|
mineru/utils/magic_model_utils.py,sha256=2xOvi4oqg3MSw1FUrJTnYDtWeFrrm6qbmlEorLZSaYs,5650
|
|
169
|
-
mineru/utils/model_utils.py,sha256=
|
|
169
|
+
mineru/utils/model_utils.py,sha256=6moOQqE5ShHaJKkENXP8BXJA7RCWtOGlYHZ3nidwmZs,18977
|
|
170
170
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
171
171
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
172
172
|
mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
|
|
@@ -178,10 +178,10 @@ mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ
|
|
|
178
178
|
mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
|
|
179
179
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
180
180
|
mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
|
|
181
|
-
mineru/utils/table_merge.py,sha256=
|
|
182
|
-
mineru-2.6.
|
|
183
|
-
mineru-2.6.
|
|
184
|
-
mineru-2.6.
|
|
185
|
-
mineru-2.6.
|
|
186
|
-
mineru-2.6.
|
|
187
|
-
mineru-2.6.
|
|
181
|
+
mineru/utils/table_merge.py,sha256=pZHP0mIUcWPcdu0HecZDEFZ_ms_rT7C43m0Eg2iEEF4,15592
|
|
182
|
+
mineru-2.6.7.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
183
|
+
mineru-2.6.7.dist-info/METADATA,sha256=2SEMOR8-0ZzJL85pLQR4Gkm61YroeJ6BSDjB44YCUhs,73142
|
|
184
|
+
mineru-2.6.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
185
|
+
mineru-2.6.7.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
|
|
186
|
+
mineru-2.6.7.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
187
|
+
mineru-2.6.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|