mineru 2.6.6__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/vlm/vlm_magic_model.py +10 -3
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +283 -2
- mineru/cli/common.py +7 -0
- mineru/utils/enum_class.py +26 -0
- mineru/utils/guess_suffix_or_lang.py +10 -3
- mineru/utils/table_merge.py +10 -2
- mineru/version.py +1 -1
- {mineru-2.6.6.dist-info → mineru-2.6.8.dist-info}/METADATA +4 -1
- {mineru-2.6.6.dist-info → mineru-2.6.8.dist-info}/RECORD +13 -13
- {mineru-2.6.6.dist-info → mineru-2.6.8.dist-info}/WHEEL +0 -0
- {mineru-2.6.6.dist-info → mineru-2.6.8.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.6.dist-info → mineru-2.6.8.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.6.dist-info → mineru-2.6.8.dist-info}/top_level.txt +0 -0
|
@@ -45,7 +45,7 @@ class MagicModel:
|
|
|
45
45
|
continue
|
|
46
46
|
|
|
47
47
|
span_type = "unknown"
|
|
48
|
-
|
|
48
|
+
code_block_sub_type = None
|
|
49
49
|
guess_lang = None
|
|
50
50
|
|
|
51
51
|
if block_type in [
|
|
@@ -74,7 +74,7 @@ class MagicModel:
|
|
|
74
74
|
span_type = ContentType.TABLE
|
|
75
75
|
elif block_type in ["code", "algorithm"]:
|
|
76
76
|
block_content = code_content_clean(block_content)
|
|
77
|
-
|
|
77
|
+
code_block_sub_type = block_type
|
|
78
78
|
block_type = BlockType.CODE_BODY
|
|
79
79
|
span_type = ContentType.TEXT
|
|
80
80
|
guess_lang = guess_language_by_text(block_content)
|
|
@@ -82,6 +82,9 @@ class MagicModel:
|
|
|
82
82
|
block_type = BlockType.INTERLINE_EQUATION
|
|
83
83
|
span_type = ContentType.INTERLINE_EQUATION
|
|
84
84
|
|
|
85
|
+
# code 和 algorithm 类型的块,如果内容中包含行内公式,则需要将块类型切换为algorithm
|
|
86
|
+
switch_code_to_algorithm = False
|
|
87
|
+
|
|
85
88
|
if span_type in ["image", "table"]:
|
|
86
89
|
span = {
|
|
87
90
|
"bbox": block_bbox,
|
|
@@ -102,6 +105,8 @@ class MagicModel:
|
|
|
102
105
|
|
|
103
106
|
if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
|
|
104
107
|
|
|
108
|
+
switch_code_to_algorithm = True
|
|
109
|
+
|
|
105
110
|
# 生成包含文本和公式的span列表
|
|
106
111
|
spans = []
|
|
107
112
|
last_end = 0
|
|
@@ -160,7 +165,9 @@ class MagicModel:
|
|
|
160
165
|
|
|
161
166
|
# 构造line对象
|
|
162
167
|
if block_type in [BlockType.CODE_BODY]:
|
|
163
|
-
|
|
168
|
+
if switch_code_to_algorithm and code_block_sub_type == "code":
|
|
169
|
+
code_block_sub_type = "algorithm"
|
|
170
|
+
line = {"bbox": block_bbox, "spans": spans, "extra": {"type": code_block_sub_type, "guess_lang": guess_lang}}
|
|
164
171
|
else:
|
|
165
172
|
line = {"bbox": block_bbox, "spans": spans}
|
|
166
173
|
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
+
from loguru import logger
|
|
3
4
|
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
|
|
4
|
-
from mineru.utils.enum_class import MakeMode, BlockType, ContentType
|
|
5
|
+
from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
|
|
5
6
|
|
|
6
7
|
latex_delimiters_config = get_latex_delimiter_config()
|
|
7
8
|
|
|
@@ -234,6 +235,277 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
|
234
235
|
|
|
235
236
|
return para_content
|
|
236
237
|
|
|
238
|
+
|
|
239
|
+
def _extend_buckets_by_type(blocks, buckets):
    """Route the merged spans of each sub-block into the list registered for its type.

    Args:
        blocks: Iterable of sub-block dicts, each carrying a ``'type'`` key.
        buckets: Sequence of ``(block_type, target_list)`` pairs. Every pair whose
            ``block_type`` equals the sub-block's type gets that sub-block's spans
            (as produced by ``merge_para_with_text_v2``) appended to ``target_list``.
    """
    for block in blocks:
        for wanted_type, bucket in buckets:
            if block['type'] == wanted_type:
                bucket.extend(merge_para_with_text_v2(block))


def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
    """Convert one paragraph block into a ``content_list_v2`` entry.

    Args:
        para_block: Block dict. ``'type'`` is always read; depending on the type,
            ``'blocks'``, ``'lines'``, ``'sub_type'``, ``'guess_lang'`` and ``'bbox'``
            are read as well.
        img_buket_path: Prefix joined with ``/`` in front of every image path.
        page_size: ``(page_width, page_height)`` pair used to rescale the bbox.

    Returns:
        A dict with ``'type'`` and ``'content'`` keys for the handled block types.
        Block types that match no branch yield an empty dict. When the block has a
        truthy ``'bbox'``, a ``'bbox'`` key is added whose four coordinates are
        scaled to ``coord * 1000 / page extent`` and truncated with ``int``.

    Raises:
        ValueError: A CODE block has a ``sub_type`` other than CODE/ALGORITHM, or
            a LIST block has a ``sub_type`` other than REF_TEXT/TEXT.
        KeyError: A CODE block has no ``'sub_type'`` key.
    """
    para_type = para_block['type']
    entry = {}

    # Page-level furniture: block type -> v2 content type.
    page_level_types = (
        (BlockType.HEADER, ContentTypeV2.PAGE_HEADER),
        (BlockType.FOOTER, ContentTypeV2.PAGE_FOOTER),
        (BlockType.ASIDE_TEXT, ContentTypeV2.PAGE_ASIDE_TEXT),
        (BlockType.PAGE_NUMBER, ContentTypeV2.PAGE_NUMBER),
        (BlockType.PAGE_FOOTNOTE, ContentTypeV2.PAGE_FOOTNOTE),
    )
    page_level_type = None
    for block_type, v2_type in page_level_types:
        if para_type == block_type:
            page_level_type = v2_type
            break

    if page_level_type is not None:
        entry = {
            'type': page_level_type,
            'content': {
                f"{page_level_type}_content": merge_para_with_text_v2(para_block),
            },
        }

    elif para_type == BlockType.TITLE:
        level = get_title_level(para_block)
        spans = merge_para_with_text_v2(para_block)
        if level == 0:
            # A title with level 0 is emitted as an ordinary paragraph.
            entry = {
                'type': ContentTypeV2.PARAGRAPH,
                'content': {
                    "paragraph_content": spans,
                },
            }
        else:
            entry = {
                'type': ContentTypeV2.TITLE,
                'content': {
                    "title_content": spans,
                    "level": level,
                },
            }

    elif para_type in [BlockType.TEXT, BlockType.PHONETIC]:
        entry = {
            'type': ContentTypeV2.PARAGRAPH,
            'content': {
                'paragraph_content': merge_para_with_text_v2(para_block),
            },
        }

    elif para_type == BlockType.INTERLINE_EQUATION:
        image_path, math_content = get_body_data(para_block)
        entry = {
            'type': ContentTypeV2.EQUATION_INTERLINE,
            'content': {
                'math_content': math_content,
                'math_type': 'latex',
                'image_source': {'path': f"{img_buket_path}/{image_path}"},
            },
        }

    elif para_type == BlockType.IMAGE:
        captions, footnotes = [], []
        image_path, _ = get_body_data(para_block)
        image_source = {'path': f"{img_buket_path}/{image_path}"}
        _extend_buckets_by_type(
            para_block['blocks'],
            (
                (BlockType.IMAGE_CAPTION, captions),
                (BlockType.IMAGE_FOOTNOTE, footnotes),
            ),
        )
        entry = {
            'type': ContentTypeV2.IMAGE,
            'content': {
                'image_source': image_source,
                'image_caption': captions,
                'image_footnote': footnotes,
            },
        }

    elif para_type == BlockType.TABLE:
        captions, footnotes = [], []
        image_path, html = get_body_data(para_block)
        image_source = {'path': f"{img_buket_path}/{image_path}"}

        # More than one "<table" occurrence is reported as nest level 2.
        table_nest_level = 2 if html.count("<table") > 1 else 1
        # Merged cells or nesting make the table "complex".
        is_complex = "colspan" in html or "rowspan" in html or table_nest_level > 1
        table_type = ContentTypeV2.TABLE_COMPLEX if is_complex else ContentTypeV2.TABLE_SIMPLE

        _extend_buckets_by_type(
            para_block['blocks'],
            (
                (BlockType.TABLE_CAPTION, captions),
                (BlockType.TABLE_FOOTNOTE, footnotes),
            ),
        )
        entry = {
            'type': ContentTypeV2.TABLE,
            'content': {
                'image_source': image_source,
                'table_caption': captions,
                'table_footnote': footnotes,
                'html': html,
                'table_type': table_type,
                'table_nest_level': table_nest_level,
            },
        }

    elif para_type == BlockType.CODE:
        captions = []
        body = []
        for block in para_block['blocks']:
            if block['type'] == BlockType.CODE_CAPTION:
                captions.extend(merge_para_with_text_v2(block))
            if block['type'] == BlockType.CODE_BODY:
                # Assigned, not extended: the last CODE_BODY sub-block wins.
                body = merge_para_with_text_v2(block)

        sub_type = para_block["sub_type"]
        if sub_type == BlockType.CODE:
            entry = {
                'type': ContentTypeV2.CODE,
                'content': {
                    'code_caption': captions,
                    'code_content': body,
                    'code_language': para_block.get('guess_lang', 'txt'),
                },
            }
        elif sub_type == BlockType.ALGORITHM:
            entry = {
                'type': ContentTypeV2.ALGORITHM,
                'content': {
                    'algorithm_caption': captions,
                    'algorithm_content': body,
                },
            }
        else:
            raise ValueError(f"Unknown code sub_type: {sub_type}")

    elif para_type == BlockType.REF_TEXT:
        # A standalone reference becomes a one-item reference list.
        only_item = {
            'item_type': 'text',
            'item_content': merge_para_with_text_v2(para_block),
        }
        entry = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': ContentTypeV2.LIST_REF,
                'list_items': [only_item],
            },
        }

    elif para_type == BlockType.LIST:
        if 'sub_type' not in para_block:
            list_type = ContentTypeV2.LIST_TEXT
        elif para_block['sub_type'] == BlockType.REF_TEXT:
            list_type = ContentTypeV2.LIST_REF
        elif para_block['sub_type'] == BlockType.TEXT:
            list_type = ContentTypeV2.LIST_TEXT
        else:
            raise ValueError(f"Unknown list sub_type: {para_block['sub_type']}")

        # Sub-blocks whose merged content is empty are left out of the list.
        merged_items = (merge_para_with_text_v2(block) for block in para_block['blocks'])
        list_items = [
            {'item_type': 'text', 'item_content': item_content}
            for item_content in merged_items
            if item_content
        ]
        entry = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': list_type,
                'list_items': list_items,
            },
        }

    page_width, page_height = page_size
    bbox = para_block.get('bbox')
    if bbox:
        x0, y0, x1, y1 = bbox
        extents = (page_width, page_height, page_width, page_height)
        entry['bbox'] = [
            int(coord * 1000 / extent)
            for coord, extent in zip((x0, y0, x1, y1), extents)
        ]

    return entry
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def get_body_data(para_block):
    """Return a ``(image_path, content)`` pair taken from the block's first usable span.

    Spans are scanned in order and the first one with a recognised type decides
    the result:
        - TABLE span: ``(image_path, html)``
        - IMAGE span: ``(image_path, '')``
        - INTERLINE_EQUATION span: ``(image_path, content)``
        - TEXT span: ``('', content)``
    Missing span keys default to ``''``.

    If ``para_block`` has a ``'blocks'`` key, only sub-blocks of type IMAGE_BODY,
    TABLE_BODY or CODE_BODY are inspected and the first result other than
    ``('', '')`` is returned. Otherwise the block's own ``'lines'`` are scanned.
    ``('', '')`` is returned when nothing matches.
    """

    def first_span_payload(lines):
        # Walk spans in document order; unrecognised span types are skipped.
        for line in lines:
            for span in line.get('spans', []):
                kind = span.get('type')
                if kind == ContentType.TABLE:
                    return span.get('image_path', ''), span.get('html', '')
                if kind == ContentType.IMAGE:
                    return span.get('image_path', ''), ''
                if kind == ContentType.INTERLINE_EQUATION:
                    return span.get('image_path', ''), span.get('content', '')
                if kind == ContentType.TEXT:
                    return '', span.get('content', '')
        return '', ''

    # Flat structure: the block carries its lines directly.
    if 'blocks' not in para_block:
        return first_span_payload(para_block.get('lines', []))

    # Nested structure: look only inside the body sub-blocks.
    body_types = [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY]
    for block in para_block['blocks']:
        if block.get('type') not in body_types:
            continue
        payload = first_span_payload(block.get('lines', []))
        if payload != ('', ''):
            return payload
    return '', ''
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def merge_para_with_text_v2(para_block):
    """Flatten a block's spans into a list of ``{'type', 'content'}`` dicts.

    Spans are visited line by line. A span whose ``'content'`` is missing or
    whitespace-only is skipped. For the remaining spans the type is remapped:
    a text span inside a PHONETIC block becomes a phonetic span, and an inline
    equation span becomes the v2 inline-equation type. Spans whose resulting type
    is not one of the accepted v2 span types are dropped with a warning.

    Args:
        para_block: Block dict providing ``'type'`` and ``'lines'``; every line
            provides ``'spans'`` and every span provides ``'type'``.

    Returns:
        List of span dicts in document order; the content is passed through
        unmodified (not stripped).
    """
    accepted_types = [
        ContentTypeV2.SPAN_TEXT,
        ContentTypeV2.SPAN_PHONETIC,
        ContentTypeV2.SPAN_EQUATION_INLINE,
        ContentTypeV2.SPAN_MD,
        ContentTypeV2.SPAN_CODE_INLINE,
    ]
    merged = []
    block_type = para_block['type']

    for line in para_block['lines']:
        for span in line['spans']:
            kind = span['type']
            if not span.get("content", '').strip():
                continue  # nothing printable in this span

            if block_type == BlockType.PHONETIC and kind == ContentTypeV2.SPAN_TEXT:
                kind = ContentTypeV2.SPAN_PHONETIC
            if kind == ContentType.INLINE_EQUATION:
                kind = ContentTypeV2.SPAN_EQUATION_INLINE

            if kind not in accepted_types:
                logger.warning(f"Unknown span type in merge_para_with_text_v2: {kind}")
                continue

            merged.append({
                'type': kind,
                'content': span['content'],
            })

    return merged
|
|
507
|
+
|
|
508
|
+
|
|
237
509
|
def union_make(pdf_info_dict: list,
|
|
238
510
|
make_mode: str,
|
|
239
511
|
img_buket_path: str = '',
|
|
@@ -260,10 +532,19 @@ def union_make(pdf_info_dict: list,
|
|
|
260
532
|
for para_block in para_blocks:
|
|
261
533
|
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
262
534
|
output_content.append(para_content)
|
|
535
|
+
elif make_mode == MakeMode.CONTENT_LIST_V2:
|
|
536
|
+
# https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
|
|
537
|
+
para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
|
|
538
|
+
page_contents = []
|
|
539
|
+
if para_blocks:
|
|
540
|
+
for para_block in para_blocks:
|
|
541
|
+
para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
|
|
542
|
+
page_contents.append(para_content)
|
|
543
|
+
output_content.append(page_contents)
|
|
263
544
|
|
|
264
545
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
|
265
546
|
return '\n\n'.join(output_content)
|
|
266
|
-
elif make_mode == MakeMode.CONTENT_LIST:
|
|
547
|
+
elif make_mode in [MakeMode.CONTENT_LIST, MakeMode.CONTENT_LIST_V2]:
|
|
267
548
|
return output_content
|
|
268
549
|
return None
|
|
269
550
|
|
mineru/cli/common.py
CHANGED
|
@@ -144,6 +144,13 @@ def _process_output(
|
|
|
144
144
|
f"{pdf_file_name}_content_list.json",
|
|
145
145
|
json.dumps(content_list, ensure_ascii=False, indent=4),
|
|
146
146
|
)
|
|
147
|
+
if not is_pipeline:
|
|
148
|
+
content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
|
|
149
|
+
md_writer.write_string(
|
|
150
|
+
f"{pdf_file_name}_content_list_v2.json",
|
|
151
|
+
json.dumps(content_list_v2, ensure_ascii=False, indent=4),
|
|
152
|
+
)
|
|
153
|
+
|
|
147
154
|
|
|
148
155
|
if f_dump_middle_json:
|
|
149
156
|
md_writer.write_string(
|
mineru/utils/enum_class.py
CHANGED
|
@@ -38,6 +38,31 @@ class ContentType:
|
|
|
38
38
|
CODE = 'code'
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
class ContentTypeV2:
    """String constants used as type tags in the ``content_list_v2`` output."""

    # Block-level content types
    CODE = "code"
    ALGORITHM = "algorithm"
    EQUATION_INTERLINE = "equation_interline"
    IMAGE = "image"
    TABLE = "table"
    # Table flavours (reported as the table's ``table_type``)
    TABLE_SIMPLE = "simple_table"
    TABLE_COMPLEX = "complex_table"
    LIST = "list"
    # List flavours (reported as the list's ``list_type``)
    LIST_TEXT = "text_list"
    LIST_REF = "reference_list"
    TITLE = "title"
    PARAGRAPH = "paragraph"

    # Span-level types
    SPAN_TEXT = "text"
    SPAN_EQUATION_INLINE = "equation_inline"
    SPAN_PHONETIC = "phonetic"
    SPAN_MD = "md"
    SPAN_CODE_INLINE = "code_inline"

    # Page-level furniture
    PAGE_HEADER = "page_header"
    PAGE_FOOTER = "page_footer"
    PAGE_NUMBER = "page_number"
    PAGE_ASIDE_TEXT = "page_aside_text"
    PAGE_FOOTNOTE = "page_footnote"
|
|
64
|
+
|
|
65
|
+
|
|
41
66
|
class CategoryId:
|
|
42
67
|
Title = 0
|
|
43
68
|
Text = 1
|
|
@@ -60,6 +85,7 @@ class MakeMode:
|
|
|
60
85
|
MM_MD = 'mm_markdown'
|
|
61
86
|
NLP_MD = 'nlp_markdown'
|
|
62
87
|
CONTENT_LIST = 'content_list'
|
|
88
|
+
CONTENT_LIST_V2 = 'content_list_v2'
|
|
63
89
|
|
|
64
90
|
|
|
65
91
|
class ModelPath:
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
|
|
3
|
+
from loguru import logger
|
|
3
4
|
from magika import Magika
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
DEFAULT_LANG = "txt"
|
|
8
|
+
PDF_SIG_BYTES = b'%PDF'
|
|
7
9
|
magika = Magika()
|
|
8
10
|
|
|
9
11
|
def guess_language_by_text(code):
|
|
@@ -14,7 +16,7 @@ def guess_language_by_text(code):
|
|
|
14
16
|
|
|
15
17
|
def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
|
|
16
18
|
suffix = magika.identify_bytes(file_bytes).prediction.output.label
|
|
17
|
-
if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
|
|
19
|
+
if file_path and suffix in ["ai", "html"] and Path(file_path).suffix.lower() in [".pdf"] and file_bytes[:4] == PDF_SIG_BYTES:
|
|
18
20
|
suffix = "pdf"
|
|
19
21
|
return suffix
|
|
20
22
|
|
|
@@ -23,6 +25,11 @@ def guess_suffix_by_path(file_path) -> str:
|
|
|
23
25
|
if not isinstance(file_path, Path):
|
|
24
26
|
file_path = Path(file_path)
|
|
25
27
|
suffix = magika.identify_path(file_path).prediction.output.label
|
|
26
|
-
if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
|
|
27
|
-
|
|
28
|
+
if suffix in ["ai", "html"] and file_path.suffix.lower() in [".pdf"]:
|
|
29
|
+
try:
|
|
30
|
+
with open(file_path, 'rb') as f:
|
|
31
|
+
if f.read(4) == PDF_SIG_BYTES:
|
|
32
|
+
suffix = "pdf"
|
|
33
|
+
except Exception as e:
|
|
34
|
+
logger.warning(f"Failed to read file {file_path} for PDF signature check: {e}")
|
|
28
35
|
return suffix
|
mineru/utils/table_merge.py
CHANGED
|
@@ -7,6 +7,9 @@ from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
|
|
|
7
7
|
from mineru.utils.enum_class import BlockType, SplitFlag
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
CONTINUATION_MARKERS = ["(续)", "(续表)", "(continued)", "(cont.)"]
|
|
11
|
+
|
|
12
|
+
|
|
10
13
|
def full_to_half(text: str) -> str:
|
|
11
14
|
"""Convert full-width characters to half-width characters using code point manipulation.
|
|
12
15
|
|
|
@@ -174,8 +177,13 @@ def can_merge_tables(current_table_block, previous_table_block):
|
|
|
174
177
|
# 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
|
|
175
178
|
caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
|
|
176
179
|
if caption_blocks:
|
|
177
|
-
# 如果所有caption都不以"(续)"结尾,则不合并
|
|
178
|
-
|
|
180
|
+
# 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并
|
|
181
|
+
|
|
182
|
+
if not any(
|
|
183
|
+
any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower())
|
|
184
|
+
for marker in CONTINUATION_MARKERS)
|
|
185
|
+
for block in caption_blocks
|
|
186
|
+
):
|
|
179
187
|
return False, None, None, None, None
|
|
180
188
|
|
|
181
189
|
if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.6.6"
|
|
1
|
+
__version__ = "2.6.8"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.6.6
|
|
3
|
+
Version: 2.6.8
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -135,6 +135,9 @@ Dynamic: license-file
|
|
|
135
135
|
|
|
136
136
|
# Changelog
|
|
137
137
|
|
|
138
|
+
- 2025/12/12 2.6.7 Release
|
|
139
|
+
- Bug fix: #4168
|
|
140
|
+
|
|
138
141
|
- 2025/12/02 2.6.6 Release
|
|
139
142
|
- `mineru-api` tool optimizations
|
|
140
143
|
- Added descriptive text to `mineru-api` interface parameters to improve API documentation readability.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=3WRCYKawwH5XZgRMQ8fXEoGyCz5YorbyTfOWuY9SV3g,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
@@ -15,11 +15,11 @@ mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5
|
|
|
15
15
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
16
16
|
mineru/backend/vlm/utils.py,sha256=JMgS3SMFcHJYH2jIx-Xhs-P2a1bmT8U6Kn60IL0OmQA,3570
|
|
17
17
|
mineru/backend/vlm/vlm_analyze.py,sha256=wP3vuYGVec0hRsDAuzfSm2HD4Muu7wSWL767qxd_yqw,11690
|
|
18
|
-
mineru/backend/vlm/vlm_magic_model.py,sha256=
|
|
19
|
-
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=
|
|
18
|
+
mineru/backend/vlm/vlm_magic_model.py,sha256=e8BWkfeRkZNJjFdm9oPmvIs9ATjdzCy_5OZw2qTziZA,23839
|
|
19
|
+
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=Oc2w2BSRXeJPKVcM1Bkb2LjLNTQAG5yUYRtF7YNBTss,24070
|
|
20
20
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
21
21
|
mineru/cli/client.py,sha256=XSEIr4klUuufMAWn5IioZdXpg1xAxqRZF0HkaVIhxh0,6815
|
|
22
|
-
mineru/cli/common.py,sha256=
|
|
22
|
+
mineru/cli/common.py,sha256=Rld3P4FzbTYngy3BP-Irto9aMS7EPq2lIWgeIF1xPMc,14667
|
|
23
23
|
mineru/cli/fast_api.py,sha256=lLxQKKHmD8ruoZGcE6LrXzr3pQIxvw8OdJrQq_FNLSM,14447
|
|
24
24
|
mineru/cli/gradio_app.py,sha256=EUPuRHHCOECrE3E3VNEeuMDYeC3nicurOYfk8YJSOMw,15646
|
|
25
25
|
mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
|
|
@@ -159,9 +159,9 @@ mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,14
|
|
|
159
159
|
mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
|
|
160
160
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
161
161
|
mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
|
|
162
|
-
mineru/utils/enum_class.py,sha256
|
|
162
|
+
mineru/utils/enum_class.py,sha256=NO9FiM2Ni6-SI1rzMm5X98ssNDxgJTyykZddJSscgJs,3221
|
|
163
163
|
mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,10233
|
|
164
|
-
mineru/utils/guess_suffix_or_lang.py,sha256=
|
|
164
|
+
mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6of8eDVkc,1198
|
|
165
165
|
mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
|
|
166
166
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
167
167
|
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
@@ -178,10 +178,10 @@ mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ
|
|
|
178
178
|
mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
|
|
179
179
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
180
180
|
mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
|
|
181
|
-
mineru/utils/table_merge.py,sha256=
|
|
182
|
-
mineru-2.6.
|
|
183
|
-
mineru-2.6.
|
|
184
|
-
mineru-2.6.
|
|
185
|
-
mineru-2.6.
|
|
186
|
-
mineru-2.6.
|
|
187
|
-
mineru-2.6.
|
|
181
|
+
mineru/utils/table_merge.py,sha256=pZHP0mIUcWPcdu0HecZDEFZ_ms_rT7C43m0Eg2iEEF4,15592
|
|
182
|
+
mineru-2.6.8.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
183
|
+
mineru-2.6.8.dist-info/METADATA,sha256=nbMEGfGHC_Z7OUDegE18y0lrzSXaQCTtpWDYi_KZVSE,73142
|
|
184
|
+
mineru-2.6.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
185
|
+
mineru-2.6.8.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
|
|
186
|
+
mineru-2.6.8.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
187
|
+
mineru-2.6.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|