magic-pdf 1.3.10__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.3.10"
1
+ __version__ = "1.3.11"
magic_pdf/model/doc_analyze_by_custom_model.py CHANGED
@@ -156,7 +156,10 @@ def doc_analyze(
156
156
  batch_images = [images_with_extra_info]
157
157
 
158
158
  results = []
159
- for batch_image in batch_images:
159
+ processed_images_count = 0
160
+ for index, batch_image in enumerate(batch_images):
161
+ processed_images_count += len(batch_image)
162
+ logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
160
163
  result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
161
164
  results.extend(result)
162
165
 
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py CHANGED
@@ -66,9 +66,9 @@ LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
66
66
 
67
67
  def fix_latex_left_right(s):
68
68
  """
69
- 修复LaTeX中的\left和\right命令
69
+ 修复LaTeX中的\\left和\\right命令
70
70
  1. 确保它们后面跟有效分隔符
71
- 2. 平衡\left和\right的数量
71
+ 2. 平衡\\left和\\right的数量
72
72
  """
73
73
  # 白名单分隔符
74
74
  valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
@@ -106,7 +106,7 @@ def fix_latex_left_right(s):
106
106
 
107
107
  def fix_left_right_pairs(latex_formula):
108
108
  """
109
- 检测并修复LaTeX公式中\left和\right不在同一组的情况
109
+ 检测并修复LaTeX公式中\\left和\\right不在同一组的情况
110
110
 
111
111
  Args:
112
112
  latex_formula (str): 输入的LaTeX公式
@@ -308,9 +308,9 @@ ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') fo
308
308
 
309
309
  def fix_latex_environments(s):
310
310
  """
311
- 检测LaTeX中环境(如array)的\begin和\end是否匹配
312
- 1. 如果缺少\begin标签则在开头添加
313
- 2. 如果缺少\end标签则在末尾添加
311
+ 检测LaTeX中环境(如array)的\\begin和\\end是否匹配
312
+ 1. 如果缺少\\begin标签则在开头添加
313
+ 2. 如果缺少\\end标签则在末尾添加
314
314
  """
315
315
  for env in ENV_TYPES:
316
316
  begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
@@ -334,7 +334,7 @@ def fix_latex_environments(s):
334
334
 
335
335
  UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
336
336
  COMMANDS_TO_REMOVE_PATTERN = re.compile(
337
- r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph)')
337
+ r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
338
338
  REPLACEMENTS_PATTERNS = {
339
339
  re.compile(r'\\underbar'): r'\\underline',
340
340
  re.compile(r'\\Bar'): r'\\hat',
@@ -346,6 +346,9 @@ REPLACEMENTS_PATTERNS = {
346
346
  re.compile(r'\\textunderscore'): r'\\_',
347
347
  re.compile(r'\\fint'): r'⨏',
348
348
  re.compile(r'\\up '): r'\\ ',
349
+ re.compile(r'\\vline = '): r'\\models ',
350
+ re.compile(r'\\vDash '): r'\\models ',
351
+ re.compile(r'\\sq \\sqcup '): r'\\square ',
349
352
  }
350
353
  QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
351
354
 
magic_pdf-1.3.11.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.3.10
3
+ Version: 1.3.11
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: Home, https://mineru.net/
@@ -10,7 +10,7 @@ Classifier: Programming Language :: Python :: 3.10
10
10
  Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Classifier: Programming Language :: Python :: 3.13
13
- Requires-Python: >=3.10,<4
13
+ Requires-Python: >=3.10,<3.14
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE.md
16
16
  Requires-Dist: Brotli >=1.1.0
@@ -20,10 +20,10 @@ Requires-Dist: click >=8.1.7
20
20
  Requires-Dist: fast-langdetect <0.3.0,>=0.2.3
21
21
  Requires-Dist: loguru >=0.6.0
22
22
  Requires-Dist: numpy >=1.21.6
23
- Requires-Dist: pdfminer.six ==20250324
23
+ Requires-Dist: pdfminer.six ==20250506
24
24
  Requires-Dist: pydantic <2.11,>=2.7.2
25
25
  Requires-Dist: scikit-learn >=1.0.2
26
- Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
26
+ Requires-Dist: torch !=2.5.0,!=2.5.1,<3,>=2.2.2
27
27
  Requires-Dist: torchvision
28
28
  Requires-Dist: tqdm >=4.67.1
29
29
  Requires-Dist: transformers !=4.51.0,<5.0.0,>=4.49.0
@@ -109,10 +109,8 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
109
109
  # Changelog
110
110
  - 2025/04/29 1.3.10 Released
111
111
  - Support for custom formula delimiters can be achieved by modifying the `latex-delimiter-config` item in the `magic-pdf.json` file under the user directory.
112
- - Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by new versions.
113
112
  - 2025/04/27 1.3.9 Released
114
- - Optimized the formula parsing function to improve the success rate of formula rendering
115
- - Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
113
+ - Optimized the formula parsing function to improve the success rate of formula rendering
116
114
  - 2025/04/23 1.3.8 Released
117
115
  - The default `ocr` model (`ch`) has been updated to `PP-OCRv4_server_rec_doc` (model update required)
118
116
  - `PP-OCRv4_server_rec_doc` is trained on a mix of more Chinese document data and PP-OCR training data, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters. It supports over 15,000 recognizable characters, improving text recognition in documents while also boosting general text recognition.
@@ -414,7 +412,7 @@ There are three different ways to experience MinerU:
414
412
  </tr>
415
413
  <tr>
416
414
  <td colspan="3">Python Version</td>
417
- <td colspan="3">>=3.10</td>
415
+ <td colspan="3">3.10~3.13</td>
418
416
  </tr>
419
417
  <tr>
420
418
  <td colspan="3">Nvidia Driver Version</td>
@@ -424,8 +422,7 @@ There are three different ways to experience MinerU:
424
422
  </tr>
425
423
  <tr>
426
424
  <td colspan="3">CUDA Environment</td>
427
- <td>11.8/12.4/12.6/12.8</td>
428
- <td>11.8/12.4/12.6/12.8</td>
425
+ <td colspan="2"><a href="https://pytorch.org/get-started/locally/">Refer to the PyTorch official website</a></td>
429
426
  <td>None</td>
430
427
  </tr>
431
428
  <tr>
@@ -456,7 +453,7 @@ Synced with dev branch updates:
456
453
  #### 1. Install magic-pdf
457
454
 
458
455
  ```bash
459
- conda create -n mineru 'python>=3.10' -y
456
+ conda create -n mineru 'python=3.12' -y
460
457
  conda activate mineru
461
458
  pip install -U "magic-pdf[full]"
462
459
  ```
magic_pdf-1.3.11.dist-info/RECORD CHANGED
@@ -52,10 +52,10 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
52
52
  magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
53
53
  magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
54
54
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
55
- magic_pdf/libs/version.py,sha256=4o4BxiWDvKULo_NByGymiLj9KXGht1PsOBGUMmasvxM,23
55
+ magic_pdf/libs/version.py,sha256=cYqar-M7VsRkfFRByjCNAq9dYm1GavgcND7ugDzle7Q,23
56
56
  magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
57
57
  magic_pdf/model/batch_analyze.py,sha256=F0WsjbQ6z9txdiUiVy6n6zhyJWJ-4moljNx8fe8HFws,10977
58
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
58
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=R-d3_ZLZtlAztA23yaDvC0YC2-heXxrBBEyGlX7or7c,10547
59
59
  magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
60
60
  magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
61
61
  magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd05_fGU,10845
@@ -97,7 +97,7 @@ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
97
97
  magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=hq48TP1Ac8Y4FlK7GamnU-WZTQfdZotxBKuFhOIjrcM,5349
98
98
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
99
99
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
100
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=BEZhsW6TqVXDNOgbvcW_0XMtYYiR3hFjpABRhKZgZC8,18101
100
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=v5vf3a4sBLZ11FKbhb9bE_GEXcGbmdLxgTtl2pRmW34,18260
101
101
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
102
102
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
103
103
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=Y5nwqDRzSb8zAbY1U0HGJGVtFggJW1zATP8RdnasNcA,113605
@@ -196,9 +196,9 @@ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,126
196
196
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
197
197
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
198
198
  magic_pdf/utils/office_to_pdf.py,sha256=bFRYe6v3-pfx5R8-bV8cmf12jPnOrYZsleKoECTXzbM,3958
199
- magic_pdf-1.3.10.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
200
- magic_pdf-1.3.10.dist-info/METADATA,sha256=_Z35A_31Utec2rGaKDxxpZLjtG2uZywgnJrjNRUF__w,47901
201
- magic_pdf-1.3.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
202
- magic_pdf-1.3.10.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
203
- magic_pdf-1.3.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
204
- magic_pdf-1.3.10.dist-info/RECORD,,
199
+ magic_pdf-1.3.11.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
200
+ magic_pdf-1.3.11.dist-info/METADATA,sha256=MFffMcv9-ZlVG8jBKDtTPofJC_F-x_OPU4aLwxd-aUs,47765
201
+ magic_pdf-1.3.11.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
202
+ magic_pdf-1.3.11.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
203
+ magic_pdf-1.3.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
204
+ magic_pdf-1.3.11.dist-info/RECORD,,