magic-pdf 1.3.10__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +4 -1
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +10 -7
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/METADATA +8 -11
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/RECORD +9 -9
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/top_level.txt +0 -0
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.3.10"
|
1
|
+
__version__ = "1.3.11"
|
magic_pdf/model/doc_analyze_by_custom_model.py
CHANGED
@@ -156,7 +156,10 @@ def doc_analyze(
|
|
156
156
|
batch_images = [images_with_extra_info]
|
157
157
|
|
158
158
|
results = []
|
159
|
-
|
159
|
+
processed_images_count = 0
|
160
|
+
for index, batch_image in enumerate(batch_images):
|
161
|
+
processed_images_count += len(batch_image)
|
162
|
+
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
|
160
163
|
result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
|
161
164
|
results.extend(result)
|
162
165
|
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py
CHANGED
@@ -66,9 +66,9 @@ LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
|
|
66
66
|
|
67
67
|
def fix_latex_left_right(s):
|
68
68
|
"""
|
69
|
-
修复LaTeX
|
69
|
+
修复LaTeX中的\\left和\\right命令
|
70
70
|
1. 确保它们后面跟有效分隔符
|
71
|
-
2.
|
71
|
+
2. 平衡\\left和\\right的数量
|
72
72
|
"""
|
73
73
|
# 白名单分隔符
|
74
74
|
valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
|
@@ -106,7 +106,7 @@ def fix_latex_left_right(s):
|
|
106
106
|
|
107
107
|
def fix_left_right_pairs(latex_formula):
|
108
108
|
"""
|
109
|
-
检测并修复LaTeX
|
109
|
+
检测并修复LaTeX公式中\\left和\\right不在同一组的情况
|
110
110
|
|
111
111
|
Args:
|
112
112
|
latex_formula (str): 输入的LaTeX公式
|
@@ -308,9 +308,9 @@ ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') fo
|
|
308
308
|
|
309
309
|
def fix_latex_environments(s):
|
310
310
|
"""
|
311
|
-
检测LaTeX中环境(如array
|
312
|
-
1.
|
313
|
-
2.
|
311
|
+
检测LaTeX中环境(如array)的\\begin和\\end是否匹配
|
312
|
+
1. 如果缺少\\begin标签则在开头添加
|
313
|
+
2. 如果缺少\\end标签则在末尾添加
|
314
314
|
"""
|
315
315
|
for env in ENV_TYPES:
|
316
316
|
begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
|
@@ -334,7 +334,7 @@ def fix_latex_environments(s):
|
|
334
334
|
|
335
335
|
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
|
336
336
|
COMMANDS_TO_REMOVE_PATTERN = re.compile(
|
337
|
-
r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph)')
|
337
|
+
r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
|
338
338
|
REPLACEMENTS_PATTERNS = {
|
339
339
|
re.compile(r'\\underbar'): r'\\underline',
|
340
340
|
re.compile(r'\\Bar'): r'\\hat',
|
@@ -346,6 +346,9 @@ REPLACEMENTS_PATTERNS = {
|
|
346
346
|
re.compile(r'\\textunderscore'): r'\\_',
|
347
347
|
re.compile(r'\\fint'): r'⨏',
|
348
348
|
re.compile(r'\\up '): r'\\ ',
|
349
|
+
re.compile(r'\\vline = '): r'\\models ',
|
350
|
+
re.compile(r'\\vDash '): r'\\models ',
|
351
|
+
re.compile(r'\\sq \\sqcup '): r'\\square ',
|
349
352
|
}
|
350
353
|
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
|
351
354
|
|
{magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.3.10
|
3
|
+
Version: 1.3.11
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
License: AGPL-3.0
|
6
6
|
Project-URL: Home, https://mineru.net/
|
@@ -10,7 +10,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.11
|
11
11
|
Classifier: Programming Language :: Python :: 3.12
|
12
12
|
Classifier: Programming Language :: Python :: 3.13
|
13
|
-
Requires-Python: >=3.10,<
|
13
|
+
Requires-Python: >=3.10,<3.14
|
14
14
|
Description-Content-Type: text/markdown
|
15
15
|
License-File: LICENSE.md
|
16
16
|
Requires-Dist: Brotli >=1.1.0
|
@@ -20,10 +20,10 @@ Requires-Dist: click >=8.1.7
|
|
20
20
|
Requires-Dist: fast-langdetect <0.3.0,>=0.2.3
|
21
21
|
Requires-Dist: loguru >=0.6.0
|
22
22
|
Requires-Dist: numpy >=1.21.6
|
23
|
-
Requires-Dist: pdfminer.six ==20250324
|
23
|
+
Requires-Dist: pdfminer.six ==20250506
|
24
24
|
Requires-Dist: pydantic <2.11,>=2.7.2
|
25
25
|
Requires-Dist: scikit-learn >=1.0.2
|
26
|
-
Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
|
26
|
+
Requires-Dist: torch !=2.5.0,!=2.5.1,<3,>=2.2.2
|
27
27
|
Requires-Dist: torchvision
|
28
28
|
Requires-Dist: tqdm >=4.67.1
|
29
29
|
Requires-Dist: transformers !=4.51.0,<5.0.0,>=4.49.0
|
@@ -109,10 +109,8 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
109
109
|
# Changelog
|
110
110
|
- 2025/04/29 1.3.10 Released
|
111
111
|
- Support for custom formula delimiters can be achieved by modifying the `latex-delimiter-config` item in the `magic-pdf.json` file under the user directory.
|
112
|
-
- Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by new versions.
|
113
112
|
- 2025/04/27 1.3.9 Released
|
114
|
-
- Optimized the formula parsing function to improve the success rate of formula rendering
|
115
|
-
- Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
|
113
|
+
- Optimized the formula parsing function to improve the success rate of formula rendering
|
116
114
|
- 2025/04/23 1.3.8 Released
|
117
115
|
- The default `ocr` model (`ch`) has been updated to `PP-OCRv4_server_rec_doc` (model update required)
|
118
116
|
- `PP-OCRv4_server_rec_doc` is trained on a mix of more Chinese document data and PP-OCR training data, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters. It supports over 15,000 recognizable characters, improving text recognition in documents while also boosting general text recognition.
|
@@ -414,7 +412,7 @@ There are three different ways to experience MinerU:
|
|
414
412
|
</tr>
|
415
413
|
<tr>
|
416
414
|
<td colspan="3">Python Version</td>
|
417
|
-
<td colspan="3"
|
415
|
+
<td colspan="3">3.10~3.13</td>
|
418
416
|
</tr>
|
419
417
|
<tr>
|
420
418
|
<td colspan="3">Nvidia Driver Version</td>
|
@@ -424,8 +422,7 @@ There are three different ways to experience MinerU:
|
|
424
422
|
</tr>
|
425
423
|
<tr>
|
426
424
|
<td colspan="3">CUDA Environment</td>
|
427
|
-
<td
|
428
|
-
<td>11.8/12.4/12.6/12.8</td>
|
425
|
+
<td colspan="2"><a href="https://pytorch.org/get-started/locally/">Refer to the PyTorch official website</a></td>
|
429
426
|
<td>None</td>
|
430
427
|
</tr>
|
431
428
|
<tr>
|
@@ -456,7 +453,7 @@ Synced with dev branch updates:
|
|
456
453
|
#### 1. Install magic-pdf
|
457
454
|
|
458
455
|
```bash
|
459
|
-
conda create -n mineru 'python
|
456
|
+
conda create -n mineru 'python=3.12' -y
|
460
457
|
conda activate mineru
|
461
458
|
pip install -U "magic-pdf[full]"
|
462
459
|
```
|
{magic_pdf-1.3.10.dist-info → magic_pdf-1.3.11.dist-info}/RECORD
CHANGED
@@ -52,10 +52,10 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
|
|
52
52
|
magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
|
53
53
|
magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
|
54
54
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
55
|
-
magic_pdf/libs/version.py,sha256=
|
55
|
+
magic_pdf/libs/version.py,sha256=cYqar-M7VsRkfFRByjCNAq9dYm1GavgcND7ugDzle7Q,23
|
56
56
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
57
57
|
magic_pdf/model/batch_analyze.py,sha256=F0WsjbQ6z9txdiUiVy6n6zhyJWJ-4moljNx8fe8HFws,10977
|
58
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256
|
58
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=R-d3_ZLZtlAztA23yaDvC0YC2-heXxrBBEyGlX7or7c,10547
|
59
59
|
magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
|
60
60
|
magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
|
61
61
|
magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd05_fGU,10845
|
@@ -97,7 +97,7 @@ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
97
97
|
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=hq48TP1Ac8Y4FlK7GamnU-WZTQfdZotxBKuFhOIjrcM,5349
|
98
98
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
99
99
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
|
100
|
-
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=
|
100
|
+
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=v5vf3a4sBLZ11FKbhb9bE_GEXcGbmdLxgTtl2pRmW34,18260
|
101
101
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
|
102
102
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
|
103
103
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=Y5nwqDRzSb8zAbY1U0HGJGVtFggJW1zATP8RdnasNcA,113605
|
@@ -196,9 +196,9 @@ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,126
|
|
196
196
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
197
197
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
198
198
|
magic_pdf/utils/office_to_pdf.py,sha256=bFRYe6v3-pfx5R8-bV8cmf12jPnOrYZsleKoECTXzbM,3958
|
199
|
-
magic_pdf-1.3.10.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
200
|
-
magic_pdf-1.3.
|
201
|
-
magic_pdf-1.3.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
202
|
-
magic_pdf-1.3.10.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
203
|
-
magic_pdf-1.3.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
204
|
-
magic_pdf-1.3.10.dist-info/RECORD,,
|
199
|
+
magic_pdf-1.3.11.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
200
|
+
magic_pdf-1.3.11.dist-info/METADATA,sha256=MFffMcv9-ZlVG8jBKDtTPofJC_F-x_OPU4aLwxd-aUs,47765
|
201
|
+
magic_pdf-1.3.11.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
202
|
+
magic_pdf-1.3.11.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
203
|
+
magic_pdf-1.3.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
204
|
+
magic_pdf-1.3.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|