mineru 2.6.5__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -159,7 +159,6 @@ def batch_image_analyze(
159
159
 
160
160
  model_manager = ModelSingleton()
161
161
 
162
- batch_ratio = 1
163
162
  device = get_device()
164
163
 
165
164
  if str(device).startswith('npu'):
@@ -173,25 +172,21 @@ def batch_image_analyze(
173
172
  "Please ensure that the torch_npu package is installed correctly."
174
173
  ) from e
175
174
 
176
- if str(device).startswith('npu') or str(device).startswith('cuda'):
177
- vram = get_vram(device)
178
- if vram is not None:
179
- gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
180
- if gpu_memory >= 16:
181
- batch_ratio = 16
182
- elif gpu_memory >= 12:
183
- batch_ratio = 8
184
- elif gpu_memory >= 8:
185
- batch_ratio = 4
186
- elif gpu_memory >= 6:
187
- batch_ratio = 2
188
- else:
189
- batch_ratio = 1
190
- logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
191
- else:
192
- # Default batch_ratio when VRAM can't be determined
193
- batch_ratio = 1
194
- logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}')
175
+ gpu_memory = get_vram(device)
176
+ if gpu_memory >= 16:
177
+ batch_ratio = 16
178
+ elif gpu_memory >= 12:
179
+ batch_ratio = 8
180
+ elif gpu_memory >= 8:
181
+ batch_ratio = 4
182
+ elif gpu_memory >= 6:
183
+ batch_ratio = 2
184
+ else:
185
+ batch_ratio = 1
186
+ logger.info(
187
+ f'GPU Memory: {gpu_memory} GB, Batch Ratio: {batch_ratio}. '
188
+ f'You can set MINERU_VIRTUAL_VRAM_SIZE environment variable to adjust GPU memory allocation.'
189
+ )
195
190
 
196
191
  # 检测torch的版本号
197
192
  import torch
@@ -81,20 +81,16 @@ def set_default_gpu_memory_utilization() -> float:
81
81
  def set_default_batch_size() -> int:
82
82
  try:
83
83
  device = get_device()
84
- vram = get_vram(device)
85
- if vram is not None:
86
- gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
87
- if gpu_memory >= 16:
88
- batch_size = 8
89
- elif gpu_memory >= 8:
90
- batch_size = 4
91
- else:
92
- batch_size = 1
93
- logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
84
+ gpu_memory = get_vram(device)
85
+
86
+ if gpu_memory >= 16:
87
+ batch_size = 8
88
+ elif gpu_memory >= 8:
89
+ batch_size = 4
94
90
  else:
95
- # Default batch_ratio when VRAM can't be determined
96
91
  batch_size = 1
97
- logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_size}')
92
+ logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
93
+
98
94
  except Exception as e:
99
95
  logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
100
96
  batch_size = 1
@@ -45,7 +45,7 @@ class MagicModel:
45
45
  continue
46
46
 
47
47
  span_type = "unknown"
48
- line_type = None
48
+ code_block_sub_type = None
49
49
  guess_lang = None
50
50
 
51
51
  if block_type in [
@@ -74,7 +74,7 @@ class MagicModel:
74
74
  span_type = ContentType.TABLE
75
75
  elif block_type in ["code", "algorithm"]:
76
76
  block_content = code_content_clean(block_content)
77
- line_type = block_type
77
+ code_block_sub_type = block_type
78
78
  block_type = BlockType.CODE_BODY
79
79
  span_type = ContentType.TEXT
80
80
  guess_lang = guess_language_by_text(block_content)
@@ -82,6 +82,9 @@ class MagicModel:
82
82
  block_type = BlockType.INTERLINE_EQUATION
83
83
  span_type = ContentType.INTERLINE_EQUATION
84
84
 
85
+ # code 和 algorithm 类型的块,如果内容中包含行内公式,则需要将块类型切换为algorithm
86
+ switch_code_to_algorithm = False
87
+
85
88
  if span_type in ["image", "table"]:
86
89
  span = {
87
90
  "bbox": block_bbox,
@@ -102,6 +105,8 @@ class MagicModel:
102
105
 
103
106
  if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
104
107
 
108
+ switch_code_to_algorithm = True
109
+
105
110
  # 生成包含文本和公式的span列表
106
111
  spans = []
107
112
  last_end = 0
@@ -160,7 +165,9 @@ class MagicModel:
160
165
 
161
166
  # 构造line对象
162
167
  if block_type in [BlockType.CODE_BODY]:
163
- line = {"bbox": block_bbox, "spans": spans, "extra": {"type": line_type, "guess_lang": guess_lang}}
168
+ if switch_code_to_algorithm and code_block_sub_type == "code":
169
+ code_block_sub_type = "algorithm"
170
+ line = {"bbox": block_bbox, "spans": spans, "extra": {"type": code_block_sub_type, "guess_lang": guess_lang}}
164
171
  else:
165
172
  line = {"bbox": block_bbox, "spans": spans}
166
173
 
@@ -1,7 +1,8 @@
1
1
  import os
2
2
 
3
+ from loguru import logger
3
4
  from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
4
- from mineru.utils.enum_class import MakeMode, BlockType, ContentType
5
+ from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
5
6
 
6
7
  latex_delimiters_config = get_latex_delimiter_config()
7
8
 
@@ -234,6 +235,277 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
234
235
 
235
236
  return para_content
236
237
 
238
+
239
def _collect_captions_and_footnotes(blocks, caption_type, footnote_type):
    """Split the children of an image/table block into caption and footnote spans.

    Children are visited once, in order; each child whose ``type`` equals
    ``caption_type`` or ``footnote_type`` contributes its merged spans to the
    corresponding list. Returns ``(captions, footnotes)``.
    """
    captions, footnotes = [], []
    for child in blocks:
        if child['type'] == caption_type:
            captions.extend(merge_para_with_text_v2(child))
        if child['type'] == footnote_type:
            footnotes.extend(merge_para_with_text_v2(child))
    return captions, footnotes


def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
    """Convert one paragraph block into a content-list-v2 entry.

    Args:
        para_block: Block dict; ``para_block['type']`` selects the conversion.
            Image, table, code and list blocks are read through
            ``para_block['blocks']``; code blocks also need ``'sub_type'``.
        img_buket_path: Prefix joined with ``/`` in front of image paths.
        page_size: ``(page_width, page_height)`` used to scale the bbox.

    Returns:
        A dict with ``'type'`` and ``'content'`` for recognised block types,
        plus a ``'bbox'`` scaled to a 0-1000 range when the block has a
        truthy ``'bbox'``. Unrecognised block types yield a dict without
        ``'type'``/``'content'``.

    Raises:
        ValueError: For an unknown code or list ``sub_type``.
    """
    block_kind = para_block['type']
    entry = {}

    # Blocks that live in the page margins map one-to-one onto a v2 type.
    page_level_kinds = {
        BlockType.HEADER: ContentTypeV2.PAGE_HEADER,
        BlockType.FOOTER: ContentTypeV2.PAGE_FOOTER,
        BlockType.ASIDE_TEXT: ContentTypeV2.PAGE_ASIDE_TEXT,
        BlockType.PAGE_NUMBER: ContentTypeV2.PAGE_NUMBER,
        BlockType.PAGE_FOOTNOTE: ContentTypeV2.PAGE_FOOTNOTE,
    }

    if block_kind in page_level_kinds:
        v2_kind = page_level_kinds[block_kind]
        entry = {
            'type': v2_kind,
            'content': {
                f"{v2_kind}_content": merge_para_with_text_v2(para_block),
            },
        }

    elif block_kind == BlockType.TITLE:
        level = get_title_level(para_block)
        merged = merge_para_with_text_v2(para_block)
        if level != 0:
            entry = {
                'type': ContentTypeV2.TITLE,
                'content': {"title_content": merged, "level": level},
            }
        else:
            # A level of 0 is emitted as an ordinary paragraph.
            entry = {
                'type': ContentTypeV2.PARAGRAPH,
                'content': {"paragraph_content": merged},
            }

    elif block_kind in [BlockType.TEXT, BlockType.PHONETIC]:
        entry = {
            'type': ContentTypeV2.PARAGRAPH,
            'content': {
                'paragraph_content': merge_para_with_text_v2(para_block),
            },
        }

    elif block_kind == BlockType.INTERLINE_EQUATION:
        equation_image, latex = get_body_data(para_block)
        entry = {
            'type': ContentTypeV2.EQUATION_INTERLINE,
            'content': {
                'math_content': latex,
                'math_type': 'latex',
                'image_source': {'path': f"{img_buket_path}/{equation_image}"},
            },
        }

    elif block_kind == BlockType.IMAGE:
        picture_path, _ = get_body_data(para_block)
        picture_source = {'path': f"{img_buket_path}/{picture_path}"}
        captions, footnotes = _collect_captions_and_footnotes(
            para_block['blocks'], BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE
        )
        entry = {
            'type': ContentTypeV2.IMAGE,
            'content': {
                'image_source': picture_source,
                'image_caption': captions,
                'image_footnote': footnotes,
            },
        }

    elif block_kind == BlockType.TABLE:
        snapshot_path, html = get_body_data(para_block)
        snapshot_source = {'path': f"{img_buket_path}/{snapshot_path}"}

        # More than one "<table" in the markup is reported as nest level 2.
        nest_level = 2 if html.count("<table") > 1 else 1
        has_merged_cells = "colspan" in html or "rowspan" in html
        if has_merged_cells or nest_level > 1:
            table_kind = ContentTypeV2.TABLE_COMPLEX
        else:
            table_kind = ContentTypeV2.TABLE_SIMPLE

        captions, footnotes = _collect_captions_and_footnotes(
            para_block['blocks'], BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
        )
        entry = {
            'type': ContentTypeV2.TABLE,
            'content': {
                'image_source': snapshot_source,
                'table_caption': captions,
                'table_footnote': footnotes,
                'html': html,
                'table_type': table_kind,
                'table_nest_level': nest_level,
            },
        }

    elif block_kind == BlockType.CODE:
        captions = []
        body = []
        for child in para_block['blocks']:
            if child['type'] == BlockType.CODE_CAPTION:
                captions.extend(merge_para_with_text_v2(child))
            if child['type'] == BlockType.CODE_BODY:
                # A later body block replaces an earlier one.
                body = merge_para_with_text_v2(child)

        code_kind = para_block["sub_type"]
        if code_kind == BlockType.CODE:
            entry = {
                'type': ContentTypeV2.CODE,
                'content': {
                    'code_caption': captions,
                    'code_content': body,
                    'code_language': para_block.get('guess_lang', 'txt'),
                },
            }
        elif code_kind == BlockType.ALGORITHM:
            entry = {
                'type': ContentTypeV2.ALGORITHM,
                'content': {
                    'algorithm_caption': captions,
                    'algorithm_content': body,
                },
            }
        else:
            raise ValueError(f"Unknown code sub_type: {code_kind}")

    elif block_kind == BlockType.REF_TEXT:
        # A standalone reference is wrapped as a one-item reference list.
        single_item = {
            'item_type': 'text',
            'item_content': merge_para_with_text_v2(para_block),
        }
        entry = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': ContentTypeV2.LIST_REF,
                'list_items': [single_item],
            },
        }

    elif block_kind == BlockType.LIST:
        if 'sub_type' not in para_block:
            list_kind = ContentTypeV2.LIST_TEXT
        elif para_block['sub_type'] == BlockType.REF_TEXT:
            list_kind = ContentTypeV2.LIST_REF
        elif para_block['sub_type'] == BlockType.TEXT:
            list_kind = ContentTypeV2.LIST_TEXT
        else:
            raise ValueError(f"Unknown list sub_type: {para_block['sub_type']}")

        # Items whose merged span list is empty are dropped.
        merged_items = (merge_para_with_text_v2(child) for child in para_block['blocks'])
        items = [
            {'item_type': 'text', 'item_content': merged}
            for merged in merged_items
            if merged
        ]
        entry = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': list_kind,
                'list_items': items,
            },
        }

    # Scale the bbox to a 0-1000 range relative to the page size.
    page_width, page_height = page_size
    bbox = para_block.get('bbox')
    if bbox:
        left, top, right, bottom = bbox
        scaled_pairs = (
            (left, page_width),
            (top, page_height),
            (right, page_width),
            (bottom, page_height),
        )
        entry['bbox'] = [int(coord * 1000 / extent) for coord, extent in scaled_pairs]

    return entry
439
+
440
+
441
+
442
+
443
+
444
def get_body_data(para_block):
    """
    Extract the primary payload of a block as an ``(image_path, text)`` pair.

    The first span with a recognised type decides the result:
    - TABLE span: ``(image_path, html)``
    - IMAGE span: ``(image_path, '')``
    - INTERLINE_EQUATION span: ``(image_path, content)``
    - TEXT span: ``('', content)``
    - no recognised span: ``('', '')``

    When ``para_block`` has a ``'blocks'`` key, only its IMAGE_BODY,
    TABLE_BODY and CODE_BODY children are searched, and the first non-empty
    result wins; the block's own ``'lines'`` are not consulted in that case.
    Otherwise the block's own ``'lines'`` are searched.

    Missing ``'lines'``, ``'spans'``, ``'type'`` or payload keys are treated
    as empty rather than raising.
    """

    def get_data_from_spans(lines):
        # Return the payload of the first span whose type is recognised;
        # spans of any other type are skipped.
        for line in lines:
            for span in line.get('spans', []):
                span_type = span.get('type')
                if span_type == ContentType.TABLE:
                    return span.get('image_path', ''), span.get('html', '')
                elif span_type == ContentType.IMAGE:
                    return span.get('image_path', ''), ''
                elif span_type == ContentType.INTERLINE_EQUATION:
                    return span.get('image_path', ''), span.get('content', '')
                elif span_type == ContentType.TEXT:
                    return '', span.get('content', '')
        return '', ''

    # Nested structure: the payload lives in one of the body child blocks.
    if 'blocks' in para_block:
        for block in para_block['blocks']:
            block_type = block.get('type')
            if block_type in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY]:
                result = get_data_from_spans(block.get('lines', []))
                if result != ('', ''):
                    return result
        return '', ''

    # Flat structure: the block carries its lines directly.
    return get_data_from_spans(para_block.get('lines', []))
479
+
480
+
481
def merge_para_with_text_v2(para_block):
    """Flatten a block's lines into a list of ``{'type', 'content'}`` spans.

    Spans with missing or empty ``content`` are skipped. A text span inside a
    PHONETIC block is re-tagged as a phonetic span, and an inline-equation
    span is re-tagged with its v2 name. Spans whose resulting type is not one
    of the supported v2 span types are dropped with a warning.

    A block without ``'lines'``, a line without ``'spans'`` and a span without
    ``'content'``/``'type'`` are tolerated instead of raising ``KeyError``,
    matching how ``get_body_data`` reads the same structures.

    Args:
        para_block: Block dict; ``para_block['type']`` is required.

    Returns:
        List of span dicts in reading order (possibly empty).
    """
    para_content = []
    para_type = para_block['type']
    for line in para_block.get('lines', []):
        for span in line.get('spans', []):
            content = span.get('content')
            if not content:
                # Nothing to emit, e.g. an empty text span or a span that
                # carries no textual payload at all.
                continue

            span_type = span.get('type')
            if para_type == BlockType.PHONETIC and span_type == ContentTypeV2.SPAN_TEXT:
                span_type = ContentTypeV2.SPAN_PHONETIC
            if span_type == ContentType.INLINE_EQUATION:
                span_type = ContentTypeV2.SPAN_EQUATION_INLINE

            if span_type in [
                ContentTypeV2.SPAN_TEXT,
                ContentTypeV2.SPAN_PHONETIC,
                ContentTypeV2.SPAN_EQUATION_INLINE,
                ContentTypeV2.SPAN_MD,
                ContentTypeV2.SPAN_CODE_INLINE,
            ]:
                para_content.append({
                    'type': span_type,
                    'content': content,
                })
            else:
                logger.warning(f"Unknown span type in merge_para_with_text_v2: {span_type}")
    return para_content
507
+
508
+
237
509
  def union_make(pdf_info_dict: list,
238
510
  make_mode: str,
239
511
  img_buket_path: str = '',
@@ -260,10 +532,20 @@ def union_make(pdf_info_dict: list,
260
532
  for para_block in para_blocks:
261
533
  para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
262
534
  output_content.append(para_content)
535
+ elif make_mode == MakeMode.CONTENT_LIST_V2:
536
+ # https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
537
+ page_contents = []
538
+ para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
539
+ if not para_blocks:
540
+ continue
541
+ for para_block in para_blocks:
542
+ para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
543
+ page_contents.append(para_content)
544
+ output_content.append(page_contents)
263
545
 
264
546
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
265
547
  return '\n\n'.join(output_content)
266
- elif make_mode == MakeMode.CONTENT_LIST:
548
+ elif make_mode in [MakeMode.CONTENT_LIST, MakeMode.CONTENT_LIST_V2]:
267
549
  return output_content
268
550
  return None
269
551
 
mineru/cli/client.py CHANGED
@@ -113,7 +113,7 @@ if is_mac_os_version_supported():
113
113
  '--formula',
114
114
  'formula_enable',
115
115
  type=bool,
116
- help='Enable formula parsing. Default is True. Adapted only for the case where the backend is set to "pipeline".',
116
+ help='Enable formula parsing. Default is True. ',
117
117
  default=True,
118
118
  )
119
119
  @click.option(
@@ -121,7 +121,7 @@ if is_mac_os_version_supported():
121
121
  '--table',
122
122
  'table_enable',
123
123
  type=bool,
124
- help='Enable table parsing. Default is True. Adapted only for the case where the backend is set to "pipeline".',
124
+ help='Enable table parsing. Default is True. ',
125
125
  default=True,
126
126
  )
127
127
  @click.option(
@@ -172,9 +172,8 @@ def main(
172
172
  def get_virtual_vram_size() -> int:
173
173
  if virtual_vram is not None:
174
174
  return virtual_vram
175
- if get_device_mode().startswith("cuda") or get_device_mode().startswith("npu"):
176
- return round(get_vram(get_device_mode()))
177
- return 1
175
+ else:
176
+ return get_vram(get_device_mode())
178
177
  if os.getenv('MINERU_VIRTUAL_VRAM_SIZE', None) is None:
179
178
  os.environ['MINERU_VIRTUAL_VRAM_SIZE']= str(get_virtual_vram_size())
180
179
 
mineru/cli/common.py CHANGED
@@ -144,6 +144,13 @@ def _process_output(
144
144
  f"{pdf_file_name}_content_list.json",
145
145
  json.dumps(content_list, ensure_ascii=False, indent=4),
146
146
  )
147
+ if not is_pipeline:
148
+ content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
149
+ md_writer.write_string(
150
+ f"{pdf_file_name}_content_list_v2.json",
151
+ json.dumps(content_list_v2, ensure_ascii=False, indent=4),
152
+ )
153
+
147
154
 
148
155
  if f_dump_middle_json:
149
156
  md_writer.write_string(
mineru/cli/fast_api.py CHANGED
@@ -8,7 +8,7 @@ import click
8
8
  import zipfile
9
9
  from pathlib import Path
10
10
  import glob
11
- from fastapi import FastAPI, UploadFile, File, Form
11
+ from fastapi import Depends, FastAPI, HTTPException, UploadFile, File, Form
12
12
  from fastapi.middleware.gzip import GZipMiddleware
13
13
  from fastapi.responses import JSONResponse, FileResponse
14
14
  from starlette.background import BackgroundTask
@@ -21,14 +21,53 @@ from mineru.utils.cli_parser import arg_parse
21
21
  from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
22
22
  from mineru.version import __version__
23
23
 
24
- app = FastAPI()
25
- app.add_middleware(GZipMiddleware, minimum_size=1000)
24
+ # 并发控制器
25
+ _request_semaphore: Optional[asyncio.Semaphore] = None
26
+
27
+ # 并发控制依赖函数
28
async def limit_concurrency():
    """Dependency that caps the number of requests handled at the same time.

    With no semaphore configured the request simply proceeds. Otherwise a
    request arriving while the semaphore is fully taken is rejected with
    HTTP 503 instead of being queued, and an admitted request holds one
    slot until the dependency is torn down.
    """
    gate = _request_semaphore

    # No limit configured: let the request through untouched.
    if gate is None:
        yield
        return

    # Fail fast rather than wait for a free slot.
    if gate.locked():
        raise HTTPException(
            status_code=503,
            detail=f"Server is at maximum capacity: {os.getenv('MINERU_API_MAX_CONCURRENT_REQUESTS', 'unset')}. Please try again later."
        )

    async with gate:
        yield
39
+
40
def create_app():
    """Build the FastAPI application and initialise the concurrency limiter.

    Environment variables read:
        MINERU_API_ENABLE_FASTAPI_DOCS: "1", "true" or "yes" (case-insensitive,
            default "1") keeps the OpenAPI schema, Swagger UI and ReDoc
            endpoints; any other value disables all three.
        MINERU_API_MAX_CONCURRENT_REQUESTS: a positive integer installs a
            module-level semaphore of that size; a missing, non-integer or
            non-positive value leaves the semaphore unchanged.

    Returns:
        The configured FastAPI instance with gzip compression enabled.
    """
    global _request_semaphore

    docs_setting = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower()
    if docs_setting in ("1", "true", "yes"):
        doc_routes = {
            "openapi_url": "/openapi.json",
            "docs_url": "/docs",
            "redoc_url": "/redoc",
        }
    else:
        doc_routes = {"openapi_url": None, "docs_url": None, "redoc_url": None}
    api = FastAPI(**doc_routes)

    # Read the concurrency cap; anything unparsable counts as "no limit".
    raw_limit = os.getenv("MINERU_API_MAX_CONCURRENT_REQUESTS", "0")
    try:
        limit = int(raw_limit)
    except ValueError:
        limit = 0

    if limit > 0:
        _request_semaphore = asyncio.Semaphore(limit)
        logger.info(f"Request concurrency limited to {limit}")

    api.add_middleware(GZipMiddleware, minimum_size=1000)
    return api
63
+
64
+ app = create_app()
26
65
 
27
66
 
28
67
  def sanitize_filename(filename: str) -> str:
29
68
  """
30
69
  格式化压缩文件的文件名
31
- 移除路径遍历字符, 保留 Unicode 字母、数字、._-
70
+ 移除路径遍历字符, 保留 Unicode 字母、数字、._-
32
71
  禁止隐藏文件
33
72
  """
34
73
  sanitized = re.sub(r'[/\\\.]{2,}|[/\\]', '', filename)
@@ -60,24 +99,48 @@ def get_infer_result(file_suffix_identifier: str, pdf_name: str, parse_dir: str)
60
99
  return None
61
100
 
62
101
 
63
- @app.post(path="/file_parse",)
102
+ @app.post(path="/file_parse", dependencies=[Depends(limit_concurrency)])
64
103
  async def parse_pdf(
65
- files: List[UploadFile] = File(...),
66
- output_dir: str = Form("./output"),
67
- lang_list: List[str] = Form(["ch"]),
68
- backend: str = Form("pipeline"),
69
- parse_method: str = Form("auto"),
70
- formula_enable: bool = Form(True),
71
- table_enable: bool = Form(True),
72
- server_url: Optional[str] = Form(None),
73
- return_md: bool = Form(True),
74
- return_middle_json: bool = Form(False),
75
- return_model_output: bool = Form(False),
76
- return_content_list: bool = Form(False),
77
- return_images: bool = Form(False),
78
- response_format_zip: bool = Form(False),
79
- start_page_id: int = Form(0),
80
- end_page_id: int = Form(99999),
104
+ files: List[UploadFile] = File(..., description="Upload pdf or image files for parsing"),
105
+ output_dir: str = Form("./output", description="Output local directory"),
106
+ lang_list: List[str] = Form(
107
+ ["ch"],
108
+ description="""(Adapted only for pipeline backend)Input the languages in the pdf to improve OCR accuracy.
109
+ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th, el, latin, arabic, east_slavic, cyrillic, devanagari.
110
+ """
111
+ ),
112
+ backend: str = Form(
113
+ "pipeline",
114
+ description="""The backend for parsing:
115
+ - pipeline: More general
116
+ - vlm-transformers: More general, but slower
117
+ - vlm-mlx-engine: Faster than transformers (need apple silicon and macOS 13.5+)
118
+ - vlm-vllm-async-engine: Faster (vllm-engine, need vllm installed)
119
+ - vlm-lmdeploy-engine: Faster (lmdeploy-engine, need lmdeploy installed)
120
+ - vlm-http-client: Faster (client suitable for openai-compatible servers)"""
121
+ ),
122
+ parse_method: str = Form(
123
+ "auto",
124
+ description="""(Adapted only for pipeline backend)The method for parsing PDF:
125
+ - auto: Automatically determine the method based on the file type
126
+ - txt: Use text extraction method
127
+ - ocr: Use OCR method for image-based PDFs
128
+ """
129
+ ),
130
+ formula_enable: bool = Form(True, description="Enable formula parsing."),
131
+ table_enable: bool = Form(True, description="Enable table parsing."),
132
+ server_url: Optional[str] = Form(
133
+ None,
134
+ description="(Adapted only for vlm-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000"
135
+ ),
136
+ return_md: bool = Form(True, description="Return markdown content in response"),
137
+ return_middle_json: bool = Form(False, description="Return middle JSON in response"),
138
+ return_model_output: bool = Form(False, description="Return model output JSON in response"),
139
+ return_content_list: bool = Form(False, description="Return content list JSON in response"),
140
+ return_images: bool = Form(False, description="Return extracted images in response"),
141
+ response_format_zip: bool = Form(False, description="Return results as a ZIP file instead of JSON"),
142
+ start_page_id: int = Form(0, description="The starting page for PDF parsing, beginning from 0"),
143
+ end_page_id: int = Form(99999, description="The ending page for PDF parsing, beginning from 0"),
81
144
  ):
82
145
 
83
146
  # 获取命令行配置参数
@@ -153,7 +216,7 @@ async def parse_pdf(
153
216
  # 根据 response_format_zip 决定返回类型
154
217
  if response_format_zip:
155
218
  zip_fd, zip_path = tempfile.mkstemp(suffix=".zip", prefix="mineru_results_")
156
- os.close(zip_fd)
219
+ os.close(zip_fd)
157
220
  with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
158
221
  for pdf_name in pdf_file_names:
159
222
  safe_pdf_name = sanitize_filename(pdf_name)
@@ -178,7 +241,7 @@ async def parse_pdf(
178
241
 
179
242
  if return_model_output:
180
243
  path = os.path.join(parse_dir, f"{pdf_name}_model.json")
181
- if os.path.exists(path):
244
+ if os.path.exists(path):
182
245
  zf.write(path, arcname=os.path.join(safe_pdf_name, os.path.basename(path)))
183
246
 
184
247
  if return_content_list:
@@ -259,11 +322,16 @@ def main(ctx, host, port, reload, **kwargs):
259
322
  # 将配置参数存储到应用状态中
260
323
  app.state.config = kwargs
261
324
 
325
+ # 将 CLI 的并发参数同步到环境变量,确保 uvicorn 重载子进程可见
326
+ try:
327
+ mcr = int(kwargs.get("mineru_api_max_concurrent_requests", 0) or 0)
328
+ except ValueError:
329
+ mcr = 0
330
+ os.environ["MINERU_API_MAX_CONCURRENT_REQUESTS"] = str(mcr)
331
+
262
332
  """启动MinerU FastAPI服务器的命令行入口"""
263
333
  print(f"Start MinerU FastAPI Service: http://{host}:{port}")
264
- print("The API documentation can be accessed at the following address:")
265
- print(f"- Swagger UI: http://{host}:{port}/docs")
266
- print(f"- ReDoc: http://{host}:{port}/redoc")
334
+ print(f"API documentation: http://{host}:{port}/docs")
267
335
 
268
336
  uvicorn.run(
269
337
  "mineru.cli.fast_api:app",
@@ -274,4 +342,4 @@ def main(ctx, host, port, reload, **kwargs):
274
342
 
275
343
 
276
344
  if __name__ == "__main__":
277
- main()
345
+ main()
@@ -38,6 +38,31 @@ class ContentType:
38
38
  CODE = 'code'
39
39
 
40
40
 
41
class ContentTypeV2:
    """String constants naming the entry and span types of content list v2."""

    # Block-level entry types.
    CODE = 'code'
    ALGORITHM = 'algorithm'
    EQUATION_INTERLINE = 'equation_interline'
    IMAGE = 'image'
    TABLE = 'table'
    # Table sub-classification.
    TABLE_SIMPLE = 'simple_table'
    TABLE_COMPLEX = 'complex_table'
    LIST = 'list'
    # List sub-classification.
    LIST_TEXT = 'text_list'
    LIST_REF = 'reference_list'
    TITLE = 'title'
    PARAGRAPH = 'paragraph'

    # Span types used inside merged text content.
    SPAN_TEXT = 'text'
    SPAN_EQUATION_INLINE = 'equation_inline'
    SPAN_PHONETIC = 'phonetic'
    SPAN_MD = 'md'
    SPAN_CODE_INLINE = 'code_inline'

    # Page-level entry types (headers, footers and similar).
    PAGE_HEADER = 'page_header'
    PAGE_FOOTER = 'page_footer'
    PAGE_NUMBER = 'page_number'
    PAGE_ASIDE_TEXT = 'page_aside_text'
    PAGE_FOOTNOTE = 'page_footnote'
64
+
65
+
41
66
  class CategoryId:
42
67
  Title = 0
43
68
  Text = 1
@@ -60,6 +85,7 @@ class MakeMode:
60
85
  MM_MD = 'mm_markdown'
61
86
  NLP_MD = 'nlp_markdown'
62
87
  CONTENT_LIST = 'content_list'
88
+ CONTENT_LIST_V2 = 'content_list_v2'
63
89
 
64
90
 
65
91
  class ModelPath:
@@ -1,9 +1,11 @@
1
1
  from pathlib import Path
2
2
 
3
+ from loguru import logger
3
4
  from magika import Magika
4
5
 
5
6
 
6
7
  DEFAULT_LANG = "txt"
8
+ PDF_SIG_BYTES = b'%PDF'
7
9
  magika = Magika()
8
10
 
9
11
  def guess_language_by_text(code):
@@ -14,7 +16,7 @@ def guess_language_by_text(code):
14
16
 
15
17
def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
    """Identify the content type label of ``file_bytes``.

    The label comes from magika. It is overridden to ``"pdf"`` only when all
    of the following hold: a ``file_path`` was given, magika reported "ai" or
    "html", the path ends in ``.pdf`` (case-insensitive), and the data starts
    with the PDF signature bytes.
    """
    detected = magika.identify_bytes(file_bytes).prediction.output.label

    # Only the labels "ai" and "html" are candidates for the PDF override.
    if not file_path or detected not in ("ai", "html"):
        return detected

    named_as_pdf = Path(file_path).suffix.lower() == ".pdf"
    if named_as_pdf and file_bytes[:4] == PDF_SIG_BYTES:
        return "pdf"
    return detected
20
22
 
@@ -23,6 +25,11 @@ def guess_suffix_by_path(file_path) -> str:
23
25
  if not isinstance(file_path, Path):
24
26
  file_path = Path(file_path)
25
27
  suffix = magika.identify_path(file_path).prediction.output.label
26
- if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
27
- suffix = "pdf"
28
+ if suffix in ["ai", "html"] and file_path.suffix.lower() in [".pdf"]:
29
+ try:
30
+ with open(file_path, 'rb') as f:
31
+ if f.read(4) == PDF_SIG_BYTES:
32
+ suffix = "pdf"
33
+ except Exception as e:
34
+ logger.warning(f"Failed to read file {file_path} for PDF signature check: {e}")
28
35
  return suffix
@@ -428,8 +428,6 @@ def clean_memory(device='cuda'):
428
428
 
429
429
  def clean_vram(device, vram_threshold=8):
430
430
  total_memory = get_vram(device)
431
- if total_memory is not None:
432
- total_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(total_memory)))
433
431
  if total_memory and total_memory <= vram_threshold:
434
432
  gc_start = time.time()
435
433
  clean_memory(device)
@@ -437,13 +435,28 @@ def clean_vram(device, vram_threshold=8):
437
435
  # logger.info(f"gc time: {gc_time}")
438
436
 
439
437
 
440
def get_vram(device) -> int:
    """Return the device memory size in whole GB.

    A positive integer in the ``MINERU_VIRTUAL_VRAM_SIZE`` environment
    variable takes precedence. A non-positive or non-integer value is
    reported with a warning and ignored. Otherwise the size is read from the
    CUDA or NPU device properties and rounded; every other case yields 1.
    """
    configured = os.getenv("MINERU_VIRTUAL_VRAM_SIZE")

    # Honour the environment override when it is a usable number.
    if configured is not None:
        try:
            override = int(configured)
        except ValueError:
            logger.warning(
                f"MINERU_VIRTUAL_VRAM_SIZE value '{configured}' is not a valid integer, falling back to auto-detection")
        else:
            if override > 0:
                return override
            logger.warning(
                f"MINERU_VIRTUAL_VRAM_SIZE value '{configured}' is not positive, falling back to auto-detection")

    # No usable override: query the device itself.
    device_name = str(device)
    bytes_per_gb = 1024 ** 3

    if torch.cuda.is_available() and device_name.startswith("cuda"):
        return round(torch.cuda.get_device_properties(device).total_memory / bytes_per_gb)

    if device_name.startswith("npu") and torch_npu.npu.is_available():
        return round(torch_npu.npu.get_device_properties(device).total_memory / bytes_per_gb)

    return 1
@@ -7,6 +7,9 @@ from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
7
7
  from mineru.utils.enum_class import BlockType, SplitFlag
8
8
 
9
9
 
10
+ CONTINUATION_MARKERS = ["(续)", "(续表)", "(continued)", "(cont.)"]
11
+
12
+
10
13
  def full_to_half(text: str) -> str:
11
14
  """Convert full-width characters to half-width characters using code point manipulation.
12
15
 
@@ -174,8 +177,13 @@ def can_merge_tables(current_table_block, previous_table_block):
174
177
  # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
175
178
  caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
176
179
  if caption_blocks:
177
- # 如果所有caption都不以"(续)"结尾,则不合并
178
- if not any(full_to_half(merge_para_with_text(block).strip()).endswith("(续)") for block in caption_blocks):
180
+ # 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并
181
+
182
+ if not any(
183
+ any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower())
184
+ for marker in CONTINUATION_MARKERS)
185
+ for block in caption_blocks
186
+ ):
179
187
  return False, None, None, None, None
180
188
 
181
189
  if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.6.5"
1
+ __version__ = "2.6.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.6.5
3
+ Version: 2.6.7
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -74,8 +74,8 @@ Requires-Dist: fastapi; extra == "api"
74
74
  Requires-Dist: python-multipart; extra == "api"
75
75
  Requires-Dist: uvicorn; extra == "api"
76
76
  Provides-Extra: gradio
77
- Requires-Dist: gradio<6,>=5.34; extra == "gradio"
78
- Requires-Dist: gradio-pdf>=0.0.22; extra == "gradio"
77
+ Requires-Dist: gradio==5.49.1; extra == "gradio"
78
+ Requires-Dist: gradio-pdf==0.0.22; extra == "gradio"
79
79
  Provides-Extra: core
80
80
  Requires-Dist: mineru[vlm]; extra == "core"
81
81
  Requires-Dist: mineru[pipeline]; extra == "core"
@@ -134,6 +134,16 @@ Dynamic: license-file
134
134
  </div>
135
135
 
136
136
  # Changelog
137
+
138
+ - 2025/12/12 2.6.7 Release
139
+ - Bug fix: #4168
140
+
141
+ - 2025/12/02 2.6.6 Release
142
+ - `mineru-api` tool optimizations
143
+ - Added descriptive text to `mineru-api` interface parameters to improve API documentation readability.
144
+ - You can use the environment variable `MINERU_API_ENABLE_FASTAPI_DOCS` to control whether the auto-generated interface documentation page is enabled (enabled by default).
145
+ - Added concurrency configuration options for the `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`, and `vlm-http-client` backends. Users can use the environment variable `MINERU_API_MAX_CONCURRENT_REQUESTS` to set the maximum number of concurrent API requests (unlimited by default).
146
+
137
147
  - 2025/11/26 2.6.5 Release
138
148
  - Added support for a new backend vlm-lmdeploy-engine. Its usage is similar to vlm-vllm-(async)engine, but it uses lmdeploy as the inference engine and additionally supports native inference acceleration on Windows platforms compared to vllm.
139
149
 
@@ -887,6 +897,8 @@ Currently, some models in this project are trained based on YOLO. However, since
887
897
  - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
888
898
  - [pypdf](https://github.com/py-pdf/pypdf)
889
899
  - [magika](https://github.com/google/magika)
900
+ - [vLLM](https://github.com/vllm-project/vllm)
901
+ - [LMDeploy](https://github.com/InternLM/lmdeploy)
890
902
 
891
903
  # Citation
892
904
 
@@ -1,5 +1,5 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=b8L3dijps7oaMPmOpJzOuXwvOcbIuro9wWmuPwiL87o,22
2
+ mineru/version.py,sha256=Kc2cyTvfIdfaMxraNhj61Hw9Mr3C-eTCdQPLK5KH1AQ,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
5
  mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
@@ -8,19 +8,19 @@ mineru/backend/pipeline/model_init.py,sha256=OAylOcQD9gu5TBcX7nMt7X5NpJMtQICI5Iv
8
8
  mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
9
9
  mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
10
10
  mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
11
- mineru/backend/pipeline/pipeline_analyze.py,sha256=O_HGifodg03VZbmTve-U6Cmo0T03AmuK86t1v1J9X-Q,6897
11
+ mineru/backend/pipeline/pipeline_analyze.py,sha256=GkGOrWGnBSswUik3nt_m76bCwDISC9sxXZ6xRX3L154,6528
12
12
  mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
13
13
  mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=YlnEbbUnkniZXS13aLo5mjfFQvQM5SrIVvTAGBZsLmw,14478
14
14
  mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
15
15
  mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
16
- mineru/backend/vlm/utils.py,sha256=taiPNKtsykImUYkkosk1CjxFIJEutygK8iZTLly-ZqU,3905
16
+ mineru/backend/vlm/utils.py,sha256=JMgS3SMFcHJYH2jIx-Xhs-P2a1bmT8U6Kn60IL0OmQA,3570
17
17
  mineru/backend/vlm/vlm_analyze.py,sha256=wP3vuYGVec0hRsDAuzfSm2HD4Muu7wSWL767qxd_yqw,11690
18
- mineru/backend/vlm/vlm_magic_model.py,sha256=Pd0sOr7G1crAJIVeq6h_03gNSuxmV5U8dvGTGT_rrjs,23452
19
- mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=5V-AU9KkxxMn0DDSQBrb15I4GVpEyiQy8uNI_tQhS6M,13498
18
+ mineru/backend/vlm/vlm_magic_model.py,sha256=e8BWkfeRkZNJjFdm9oPmvIs9ATjdzCy_5OZw2qTziZA,23839
19
+ mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=9ig7kfbKDrG5QOwvYoXfcsK5EY9_VyAFmqz733Il-P0,24071
20
20
  mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
21
- mineru/cli/client.py,sha256=__CQknekVeq6s72JzHQRPSpR7mfNaO-ob9wq6oiEj6s,7047
22
- mineru/cli/common.py,sha256=zhNOJCOnTSMbWdUWSZG-nf0odv5vBRtdZYZ1UbUPH3g,14369
23
- mineru/cli/fast_api.py,sha256=t5bda769VbM5iokAboiJfPIOnm-r5GTFReE-KQy8L3g,10941
21
+ mineru/cli/client.py,sha256=XSEIr4klUuufMAWn5IioZdXpg1xAxqRZF0HkaVIhxh0,6815
22
+ mineru/cli/common.py,sha256=Rld3P4FzbTYngy3BP-Irto9aMS7EPq2lIWgeIF1xPMc,14667
23
+ mineru/cli/fast_api.py,sha256=lLxQKKHmD8ruoZGcE6LrXzr3pQIxvw8OdJrQq_FNLSM,14447
24
24
  mineru/cli/gradio_app.py,sha256=EUPuRHHCOECrE3E3VNEeuMDYeC3nicurOYfk8YJSOMw,15646
25
25
  mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
26
26
  mineru/cli/vlm_server.py,sha256=27HaqO3wpMXSA_nA3CC6JOBTHK3q66SP00cD6m9HuQE,1974
@@ -159,14 +159,14 @@ mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,14
159
159
  mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
160
160
  mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
161
161
  mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
162
- mineru/utils/enum_class.py,sha256=-_Ey03vGNEQHkl6x7pZ43GgrakwhSCOa1RXdr1m-I3A,2503
162
+ mineru/utils/enum_class.py,sha256=NO9FiM2Ni6-SI1rzMm5X98ssNDxgJTyykZddJSscgJs,3221
163
163
  mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,10233
164
- mineru/utils/guess_suffix_or_lang.py,sha256=nznyQpUn1BSA8JNw9HuG3pVV-xtVAtrtcGuHZ-VXt9M,856
164
+ mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6of8eDVkc,1198
165
165
  mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
166
166
  mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
167
167
  mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
168
168
  mineru/utils/magic_model_utils.py,sha256=2xOvi4oqg3MSw1FUrJTnYDtWeFrrm6qbmlEorLZSaYs,5650
169
- mineru/utils/model_utils.py,sha256=6OsgFLsABX5JuShSzCMSNHWV-yi-1cjwHweafyxIgRo,18448
169
+ mineru/utils/model_utils.py,sha256=6moOQqE5ShHaJKkENXP8BXJA7RCWtOGlYHZ3nidwmZs,18977
170
170
  mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
171
171
  mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
172
172
  mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
@@ -178,10 +178,10 @@ mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ
178
178
  mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
179
179
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
180
180
  mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
181
- mineru/utils/table_merge.py,sha256=d98zNbM1ZQ8V1kUt6RugParNUNPv7DGL-XKIzR3iJVQ,15360
182
- mineru-2.6.5.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
183
- mineru-2.6.5.dist-info/METADATA,sha256=BUj9fYR_NiRpYGqXWd3J_fOTE8IN0bdl0PgY6FUGVcg,72362
184
- mineru-2.6.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
185
- mineru-2.6.5.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
186
- mineru-2.6.5.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
187
- mineru-2.6.5.dist-info/RECORD,,
181
+ mineru/utils/table_merge.py,sha256=pZHP0mIUcWPcdu0HecZDEFZ_ms_rT7C43m0Eg2iEEF4,15592
182
+ mineru-2.6.7.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
183
+ mineru-2.6.7.dist-info/METADATA,sha256=2SEMOR8-0ZzJL85pLQR4Gkm61YroeJ6BSDjB44YCUhs,73142
184
+ mineru-2.6.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
185
+ mineru-2.6.7.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
186
+ mineru-2.6.7.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
187
+ mineru-2.6.7.dist-info/RECORD,,
File without changes