mineru 2.6.6__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,7 +45,7 @@ class MagicModel:
45
45
  continue
46
46
 
47
47
  span_type = "unknown"
48
- line_type = None
48
+ code_block_sub_type = None
49
49
  guess_lang = None
50
50
 
51
51
  if block_type in [
@@ -74,7 +74,7 @@ class MagicModel:
74
74
  span_type = ContentType.TABLE
75
75
  elif block_type in ["code", "algorithm"]:
76
76
  block_content = code_content_clean(block_content)
77
- line_type = block_type
77
+ code_block_sub_type = block_type
78
78
  block_type = BlockType.CODE_BODY
79
79
  span_type = ContentType.TEXT
80
80
  guess_lang = guess_language_by_text(block_content)
@@ -82,6 +82,9 @@ class MagicModel:
82
82
  block_type = BlockType.INTERLINE_EQUATION
83
83
  span_type = ContentType.INTERLINE_EQUATION
84
84
 
85
+ # code 和 algorithm 类型的块,如果内容中包含行内公式,则需要将块类型切换为algorithm
86
+ switch_code_to_algorithm = False
87
+
85
88
  if span_type in ["image", "table"]:
86
89
  span = {
87
90
  "bbox": block_bbox,
@@ -102,6 +105,8 @@ class MagicModel:
102
105
 
103
106
  if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
104
107
 
108
+ switch_code_to_algorithm = True
109
+
105
110
  # 生成包含文本和公式的span列表
106
111
  spans = []
107
112
  last_end = 0
@@ -160,7 +165,9 @@ class MagicModel:
160
165
 
161
166
  # 构造line对象
162
167
  if block_type in [BlockType.CODE_BODY]:
163
- line = {"bbox": block_bbox, "spans": spans, "extra": {"type": line_type, "guess_lang": guess_lang}}
168
+ if switch_code_to_algorithm and code_block_sub_type == "code":
169
+ code_block_sub_type = "algorithm"
170
+ line = {"bbox": block_bbox, "spans": spans, "extra": {"type": code_block_sub_type, "guess_lang": guess_lang}}
164
171
  else:
165
172
  line = {"bbox": block_bbox, "spans": spans}
166
173
 
@@ -1,7 +1,8 @@
1
1
  import os
2
2
 
3
+ from loguru import logger
3
4
  from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
4
- from mineru.utils.enum_class import MakeMode, BlockType, ContentType
5
+ from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
5
6
 
6
7
  latex_delimiters_config = get_latex_delimiter_config()
7
8
 
@@ -234,6 +235,277 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
234
235
 
235
236
  return para_content
236
237
 
238
+
239
+ def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
240
+ para_type = para_block['type']
241
+ para_content = {}
242
+ if para_type in [
243
+ BlockType.HEADER,
244
+ BlockType.FOOTER,
245
+ BlockType.ASIDE_TEXT,
246
+ BlockType.PAGE_NUMBER,
247
+ BlockType.PAGE_FOOTNOTE,
248
+ ]:
249
+ if para_type == BlockType.HEADER:
250
+ content_type = ContentTypeV2.PAGE_HEADER
251
+ elif para_type == BlockType.FOOTER:
252
+ content_type = ContentTypeV2.PAGE_FOOTER
253
+ elif para_type == BlockType.ASIDE_TEXT:
254
+ content_type = ContentTypeV2.PAGE_ASIDE_TEXT
255
+ elif para_type == BlockType.PAGE_NUMBER:
256
+ content_type = ContentTypeV2.PAGE_NUMBER
257
+ elif para_type == BlockType.PAGE_FOOTNOTE:
258
+ content_type = ContentTypeV2.PAGE_FOOTNOTE
259
+ else:
260
+ raise ValueError(f"Unknown para_type: {para_type}")
261
+ para_content = {
262
+ 'type': content_type,
263
+ 'content': {
264
+ f"{content_type}_content": merge_para_with_text_v2(para_block),
265
+ }
266
+ }
267
+ elif para_type == BlockType.TITLE:
268
+ title_level = get_title_level(para_block)
269
+ if title_level != 0:
270
+ para_content = {
271
+ 'type': ContentTypeV2.TITLE,
272
+ 'content': {
273
+ "title_content": merge_para_with_text_v2(para_block),
274
+ "level": title_level
275
+ }
276
+ }
277
+ else:
278
+ para_content = {
279
+ 'type': ContentTypeV2.PARAGRAPH,
280
+ 'content': {
281
+ "paragraph_content": merge_para_with_text_v2(para_block),
282
+ }
283
+ }
284
+ elif para_type in [
285
+ BlockType.TEXT,
286
+ BlockType.PHONETIC
287
+ ]:
288
+ para_content = {
289
+ 'type': ContentTypeV2.PARAGRAPH,
290
+ 'content': {
291
+ 'paragraph_content': merge_para_with_text_v2(para_block),
292
+ }
293
+ }
294
+ elif para_type == BlockType.INTERLINE_EQUATION:
295
+ image_path, math_content = get_body_data(para_block)
296
+ para_content = {
297
+ 'type': ContentTypeV2.EQUATION_INTERLINE,
298
+ 'content': {
299
+ 'math_content': math_content,
300
+ 'math_type': 'latex',
301
+ 'image_source': {'path': f"{img_buket_path}/{image_path}"},
302
+ }
303
+ }
304
+ elif para_type == BlockType.IMAGE:
305
+ image_caption = []
306
+ image_footnote = []
307
+ image_path, _ = get_body_data(para_block)
308
+ image_source = {
309
+ 'path': f"{img_buket_path}/{image_path}",
310
+ }
311
+ for block in para_block['blocks']:
312
+ if block['type'] == BlockType.IMAGE_CAPTION:
313
+ image_caption.extend(merge_para_with_text_v2(block))
314
+ if block['type'] == BlockType.IMAGE_FOOTNOTE:
315
+ image_footnote.extend(merge_para_with_text_v2(block))
316
+ para_content = {
317
+ 'type': ContentTypeV2.IMAGE,
318
+ 'content': {
319
+ 'image_source': image_source,
320
+ 'image_caption': image_caption,
321
+ 'image_footnote': image_footnote,
322
+ }
323
+ }
324
+ elif para_type == BlockType.TABLE:
325
+ table_caption = []
326
+ table_footnote = []
327
+ image_path, html = get_body_data(para_block)
328
+ image_source = {
329
+ 'path': f"{img_buket_path}/{image_path}",
330
+ }
331
+ if html.count("<table") > 1:
332
+ table_nest_level = 2
333
+ else:
334
+ table_nest_level = 1
335
+ if (
336
+ "colspan" in html or
337
+ "rowspan" in html or
338
+ table_nest_level > 1
339
+ ):
340
+ table_type = ContentTypeV2.TABLE_COMPLEX
341
+ else:
342
+ table_type = ContentTypeV2.TABLE_SIMPLE
343
+
344
+ for block in para_block['blocks']:
345
+ if block['type'] == BlockType.TABLE_CAPTION:
346
+ table_caption.extend(merge_para_with_text_v2(block))
347
+ if block['type'] == BlockType.TABLE_FOOTNOTE:
348
+ table_footnote.extend(merge_para_with_text_v2(block))
349
+ para_content = {
350
+ 'type': ContentTypeV2.TABLE,
351
+ 'content': {
352
+ 'image_source': image_source,
353
+ 'table_caption': table_caption,
354
+ 'table_footnote': table_footnote,
355
+ 'html': html,
356
+ 'table_type': table_type,
357
+ 'table_nest_level': table_nest_level,
358
+ }
359
+ }
360
+ elif para_type == BlockType.CODE:
361
+ code_caption = []
362
+ code_content = []
363
+ for block in para_block['blocks']:
364
+ if block['type'] == BlockType.CODE_CAPTION:
365
+ code_caption.extend(merge_para_with_text_v2(block))
366
+ if block['type'] == BlockType.CODE_BODY:
367
+ code_content = merge_para_with_text_v2(block)
368
+ sub_type = para_block["sub_type"]
369
+ if sub_type == BlockType.CODE:
370
+ para_content = {
371
+ 'type': ContentTypeV2.CODE,
372
+ 'content': {
373
+ 'code_caption': code_caption,
374
+ 'code_content': code_content,
375
+ 'code_language': para_block.get('guess_lang', 'txt'),
376
+ }
377
+ }
378
+ elif sub_type == BlockType.ALGORITHM:
379
+ para_content = {
380
+ 'type': ContentTypeV2.ALGORITHM,
381
+ 'content': {
382
+ 'algorithm_caption': code_caption,
383
+ 'algorithm_content': code_content,
384
+ }
385
+ }
386
+ else:
387
+ raise ValueError(f"Unknown code sub_type: {sub_type}")
388
+ elif para_type == BlockType.REF_TEXT:
389
+ para_content = {
390
+ 'type': ContentTypeV2.LIST,
391
+ 'content': {
392
+ 'list_type': ContentTypeV2.LIST_REF,
393
+ 'list_items': [
394
+ {
395
+ 'item_type': 'text',
396
+ 'item_content': merge_para_with_text_v2(para_block),
397
+ }
398
+ ],
399
+ }
400
+ }
401
+ elif para_type == BlockType.LIST:
402
+ if 'sub_type' in para_block:
403
+ if para_block['sub_type'] == BlockType.REF_TEXT:
404
+ list_type = ContentTypeV2.LIST_REF
405
+ elif para_block['sub_type'] == BlockType.TEXT:
406
+ list_type = ContentTypeV2.LIST_TEXT
407
+ else:
408
+ raise ValueError(f"Unknown list sub_type: {para_block['sub_type']}")
409
+ else:
410
+ list_type = ContentTypeV2.LIST_TEXT
411
+ list_items = []
412
+ for block in para_block['blocks']:
413
+ item_content = merge_para_with_text_v2(block)
414
+ if item_content:
415
+ list_items.append({
416
+ 'item_type': 'text',
417
+ 'item_content': item_content,
418
+ })
419
+ para_content = {
420
+ 'type': ContentTypeV2.LIST,
421
+ 'content': {
422
+ 'list_type': list_type,
423
+ 'list_items': list_items,
424
+ }
425
+ }
426
+
427
+ page_width, page_height = page_size
428
+ para_bbox = para_block.get('bbox')
429
+ if para_bbox:
430
+ x0, y0, x1, y1 = para_bbox
431
+ para_content['bbox'] = [
432
+ int(x0 * 1000 / page_width),
433
+ int(y0 * 1000 / page_height),
434
+ int(x1 * 1000 / page_width),
435
+ int(y1 * 1000 / page_height),
436
+ ]
437
+
438
+ return para_content
439
+
440
+
441
+
442
+
443
+
444
+ def get_body_data(para_block):
445
+ """
446
+ Extract image_path and html from para_block
447
+ Returns:
448
+ - For IMAGE/INTERLINE_EQUATION: (image_path, '')
449
+ - For TABLE: (image_path, html)
450
+ - Default: ('', '')
451
+ """
452
+
453
+ def get_data_from_spans(lines):
454
+ for line in lines:
455
+ for span in line.get('spans', []):
456
+ span_type = span.get('type')
457
+ if span_type == ContentType.TABLE:
458
+ return span.get('image_path', ''), span.get('html', '')
459
+ elif span_type == ContentType.IMAGE:
460
+ return span.get('image_path', ''), ''
461
+ elif span_type == ContentType.INTERLINE_EQUATION:
462
+ return span.get('image_path', ''), span.get('content', '')
463
+ elif span_type == ContentType.TEXT:
464
+ return '', span.get('content', '')
465
+ return '', ''
466
+
467
+ # 处理嵌套的 blocks 结构
468
+ if 'blocks' in para_block:
469
+ for block in para_block['blocks']:
470
+ block_type = block.get('type')
471
+ if block_type in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY]:
472
+ result = get_data_from_spans(block.get('lines', []))
473
+ if result != ('', ''):
474
+ return result
475
+ return '', ''
476
+
477
+ # 处理直接包含 lines 的结构
478
+ return get_data_from_spans(para_block.get('lines', []))
479
+
480
+
481
+ def merge_para_with_text_v2(para_block):
482
+ para_content = []
483
+ para_type = para_block['type']
484
+ for line in para_block['lines']:
485
+ for span in line['spans']:
486
+ span_type = span['type']
487
+ if span['content']:
488
+ if para_type == BlockType.PHONETIC and span_type == ContentTypeV2.SPAN_TEXT:
489
+ span_type = ContentTypeV2.SPAN_PHONETIC
490
+ if span_type == ContentType.INLINE_EQUATION:
491
+ span_type = ContentTypeV2.SPAN_EQUATION_INLINE
492
+ if span_type in [
493
+ ContentTypeV2.SPAN_TEXT,
494
+ ContentTypeV2.SPAN_PHONETIC,
495
+ ContentTypeV2.SPAN_EQUATION_INLINE,
496
+ ContentTypeV2.SPAN_MD,
497
+ ContentTypeV2.SPAN_CODE_INLINE,
498
+ ]:
499
+ span_content = {
500
+ 'type': span_type,
501
+ 'content': span['content'],
502
+ }
503
+ para_content.append(span_content)
504
+ else:
505
+ logger.warning(f"Unknown span type in merge_para_with_text_v2: {span_type}")
506
+ return para_content
507
+
508
+
237
509
  def union_make(pdf_info_dict: list,
238
510
  make_mode: str,
239
511
  img_buket_path: str = '',
@@ -260,10 +532,20 @@ def union_make(pdf_info_dict: list,
260
532
  for para_block in para_blocks:
261
533
  para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
262
534
  output_content.append(para_content)
535
+ elif make_mode == MakeMode.CONTENT_LIST_V2:
536
+ # https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
537
+ page_contents = []
538
+ para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
539
+ if not para_blocks:
540
+ continue
541
+ for para_block in para_blocks:
542
+ para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
543
+ page_contents.append(para_content)
544
+ output_content.append(page_contents)
263
545
 
264
546
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
265
547
  return '\n\n'.join(output_content)
266
- elif make_mode == MakeMode.CONTENT_LIST:
548
+ elif make_mode in [MakeMode.CONTENT_LIST, MakeMode.CONTENT_LIST_V2]:
267
549
  return output_content
268
550
  return None
269
551
 
mineru/cli/common.py CHANGED
@@ -144,6 +144,13 @@ def _process_output(
144
144
  f"{pdf_file_name}_content_list.json",
145
145
  json.dumps(content_list, ensure_ascii=False, indent=4),
146
146
  )
147
+ if not is_pipeline:
148
+ content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
149
+ md_writer.write_string(
150
+ f"{pdf_file_name}_content_list_v2.json",
151
+ json.dumps(content_list_v2, ensure_ascii=False, indent=4),
152
+ )
153
+
147
154
 
148
155
  if f_dump_middle_json:
149
156
  md_writer.write_string(
@@ -38,6 +38,31 @@ class ContentType:
38
38
  CODE = 'code'
39
39
 
40
40
 
41
+ class ContentTypeV2:
42
+ CODE = 'code'
43
+ ALGORITHM = "algorithm"
44
+ EQUATION_INTERLINE = 'equation_interline'
45
+ IMAGE = 'image'
46
+ TABLE = 'table'
47
+ TABLE_SIMPLE = 'simple_table'
48
+ TABLE_COMPLEX = 'complex_table'
49
+ LIST = 'list'
50
+ LIST_TEXT = 'text_list'
51
+ LIST_REF = 'reference_list'
52
+ TITLE = 'title'
53
+ PARAGRAPH = 'paragraph'
54
+ SPAN_TEXT = 'text'
55
+ SPAN_EQUATION_INLINE = 'equation_inline'
56
+ SPAN_PHONETIC = 'phonetic'
57
+ SPAN_MD = 'md'
58
+ SPAN_CODE_INLINE = 'code_inline'
59
+ PAGE_HEADER = "page_header"
60
+ PAGE_FOOTER = "page_footer"
61
+ PAGE_NUMBER = "page_number"
62
+ PAGE_ASIDE_TEXT = "page_aside_text"
63
+ PAGE_FOOTNOTE = "page_footnote"
64
+
65
+
41
66
  class CategoryId:
42
67
  Title = 0
43
68
  Text = 1
@@ -60,6 +85,7 @@ class MakeMode:
60
85
  MM_MD = 'mm_markdown'
61
86
  NLP_MD = 'nlp_markdown'
62
87
  CONTENT_LIST = 'content_list'
88
+ CONTENT_LIST_V2 = 'content_list_v2'
63
89
 
64
90
 
65
91
  class ModelPath:
@@ -1,9 +1,11 @@
1
1
  from pathlib import Path
2
2
 
3
+ from loguru import logger
3
4
  from magika import Magika
4
5
 
5
6
 
6
7
  DEFAULT_LANG = "txt"
8
+ PDF_SIG_BYTES = b'%PDF'
7
9
  magika = Magika()
8
10
 
9
11
  def guess_language_by_text(code):
@@ -14,7 +16,7 @@ def guess_language_by_text(code):
14
16
 
15
17
  def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
16
18
  suffix = magika.identify_bytes(file_bytes).prediction.output.label
17
- if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
19
+ if file_path and suffix in ["ai", "html"] and Path(file_path).suffix.lower() in [".pdf"] and file_bytes[:4] == PDF_SIG_BYTES:
18
20
  suffix = "pdf"
19
21
  return suffix
20
22
 
@@ -23,6 +25,11 @@ def guess_suffix_by_path(file_path) -> str:
23
25
  if not isinstance(file_path, Path):
24
26
  file_path = Path(file_path)
25
27
  suffix = magika.identify_path(file_path).prediction.output.label
26
- if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
27
- suffix = "pdf"
28
+ if suffix in ["ai", "html"] and file_path.suffix.lower() in [".pdf"]:
29
+ try:
30
+ with open(file_path, 'rb') as f:
31
+ if f.read(4) == PDF_SIG_BYTES:
32
+ suffix = "pdf"
33
+ except Exception as e:
34
+ logger.warning(f"Failed to read file {file_path} for PDF signature check: {e}")
28
35
  return suffix
@@ -7,6 +7,9 @@ from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
7
7
  from mineru.utils.enum_class import BlockType, SplitFlag
8
8
 
9
9
 
10
+ CONTINUATION_MARKERS = ["(续)", "(续表)", "(continued)", "(cont.)"]
11
+
12
+
10
13
  def full_to_half(text: str) -> str:
11
14
  """Convert full-width characters to half-width characters using code point manipulation.
12
15
 
@@ -174,8 +177,13 @@ def can_merge_tables(current_table_block, previous_table_block):
174
177
  # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
175
178
  caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
176
179
  if caption_blocks:
177
- # 如果所有caption都不以"(续)"结尾,则不合并
178
- if not any(full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in caption_blocks):
180
+ # 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并
181
+
182
+ if not any(
183
+ any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower())
184
+ for marker in CONTINUATION_MARKERS)
185
+ for block in caption_blocks
186
+ ):
179
187
  return False, None, None, None, None
180
188
 
181
189
  if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.6.6"
1
+ __version__ = "2.6.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.6.6
3
+ Version: 2.6.7
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -135,6 +135,9 @@ Dynamic: license-file
135
135
 
136
136
  # Changelog
137
137
 
138
+ - 2025/12/12 2.6.7 Release
139
+ - Bug fix: #4168
140
+
138
141
  - 2025/12/02 2.6.6 Release
139
142
  - `mineru-api` tool optimizations
140
143
  - Added descriptive text to `mineru-api` interface parameters to improve API documentation readability.
@@ -1,5 +1,5 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=MJHGx-Qo0nycI7WHSavnK8Mok6HS_De_qLfGWXih6Og,22
2
+ mineru/version.py,sha256=Kc2cyTvfIdfaMxraNhj61Hw9Mr3C-eTCdQPLK5KH1AQ,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
5
  mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
@@ -15,11 +15,11 @@ mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5
15
15
  mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
16
16
  mineru/backend/vlm/utils.py,sha256=JMgS3SMFcHJYH2jIx-Xhs-P2a1bmT8U6Kn60IL0OmQA,3570
17
17
  mineru/backend/vlm/vlm_analyze.py,sha256=wP3vuYGVec0hRsDAuzfSm2HD4Muu7wSWL767qxd_yqw,11690
18
- mineru/backend/vlm/vlm_magic_model.py,sha256=Pd0sOr7G1crAJIVeq6h_03gNSuxmV5U8dvGTGT_rrjs,23452
19
- mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=5V-AU9KkxxMn0DDSQBrb15I4GVpEyiQy8uNI_tQhS6M,13498
18
+ mineru/backend/vlm/vlm_magic_model.py,sha256=e8BWkfeRkZNJjFdm9oPmvIs9ATjdzCy_5OZw2qTziZA,23839
19
+ mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=9ig7kfbKDrG5QOwvYoXfcsK5EY9_VyAFmqz733Il-P0,24071
20
20
  mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
21
21
  mineru/cli/client.py,sha256=XSEIr4klUuufMAWn5IioZdXpg1xAxqRZF0HkaVIhxh0,6815
22
- mineru/cli/common.py,sha256=zhNOJCOnTSMbWdUWSZG-nf0odv5vBRtdZYZ1UbUPH3g,14369
22
+ mineru/cli/common.py,sha256=Rld3P4FzbTYngy3BP-Irto9aMS7EPq2lIWgeIF1xPMc,14667
23
23
  mineru/cli/fast_api.py,sha256=lLxQKKHmD8ruoZGcE6LrXzr3pQIxvw8OdJrQq_FNLSM,14447
24
24
  mineru/cli/gradio_app.py,sha256=EUPuRHHCOECrE3E3VNEeuMDYeC3nicurOYfk8YJSOMw,15646
25
25
  mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
@@ -159,9 +159,9 @@ mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,14
159
159
  mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
160
160
  mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
161
161
  mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
162
- mineru/utils/enum_class.py,sha256=-_Ey03vGNEQHkl6x7pZ43GgrakwhSCOa1RXdr1m-I3A,2503
162
+ mineru/utils/enum_class.py,sha256=NO9FiM2Ni6-SI1rzMm5X98ssNDxgJTyykZddJSscgJs,3221
163
163
  mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,10233
164
- mineru/utils/guess_suffix_or_lang.py,sha256=nznyQpUn1BSA8JNw9HuG3pVV-xtVAtrtcGuHZ-VXt9M,856
164
+ mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6of8eDVkc,1198
165
165
  mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
166
166
  mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
167
167
  mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
@@ -178,10 +178,10 @@ mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ
178
178
  mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
179
179
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
180
180
  mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
181
- mineru/utils/table_merge.py,sha256=d98zNbM1ZQ8V1kUt6RugParNUNPv7DGL-XKIzR3iJVQ,15360
182
- mineru-2.6.6.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
183
- mineru-2.6.6.dist-info/METADATA,sha256=9f-9lcSQXdLCxbYmHItJbLgDc-TZG7u7dVUWMS0SzXA,73095
184
- mineru-2.6.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
185
- mineru-2.6.6.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
186
- mineru-2.6.6.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
187
- mineru-2.6.6.dist-info/RECORD,,
181
+ mineru/utils/table_merge.py,sha256=pZHP0mIUcWPcdu0HecZDEFZ_ms_rT7C43m0Eg2iEEF4,15592
182
+ mineru-2.6.7.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
183
+ mineru-2.6.7.dist-info/METADATA,sha256=2SEMOR8-0ZzJL85pLQR4Gkm61YroeJ6BSDjB44YCUhs,73142
184
+ mineru-2.6.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
185
+ mineru-2.6.7.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
186
+ mineru-2.6.7.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
187
+ mineru-2.6.7.dist-info/RECORD,,
File without changes