magic-pdf 1.3.9__py3-none-any.whl → 1.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +16 -2
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +4 -1
- magic_pdf/model/sub_modules/model_utils.py +2 -2
- {magic_pdf-1.3.9.dist-info → magic_pdf-1.3.10.dist-info}/METADATA +5 -2
- {magic_pdf-1.3.9.dist-info → magic_pdf-1.3.10.dist-info}/RECORD +11 -11
- {magic_pdf-1.3.9.dist-info → magic_pdf-1.3.10.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.9.dist-info → magic_pdf-1.3.10.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.9.dist-info → magic_pdf-1.3.10.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.9.dist-info → magic_pdf-1.3.10.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ from loguru import logger
|
|
5
5
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
6
6
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
7
7
|
from magic_pdf.libs.commons import join_path
|
8
|
+
from magic_pdf.libs.config_reader import get_latex_delimiter_config
|
8
9
|
from magic_pdf.libs.language import detect_lang
|
9
10
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
10
11
|
from magic_pdf.post_proc.para_split_v3 import ListLineTag
|
@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
|
|
145
146
|
result.append(char)
|
146
147
|
return ''.join(result)
|
147
148
|
|
149
|
+
latex_delimiters_config = get_latex_delimiter_config()
|
150
|
+
|
151
|
+
default_delimiters = {
|
152
|
+
'display': {'left': '$$', 'right': '$$'},
|
153
|
+
'inline': {'left': '$', 'right': '$'}
|
154
|
+
}
|
155
|
+
|
156
|
+
delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
|
157
|
+
|
158
|
+
display_left_delimiter = delimiters['display']['left']
|
159
|
+
display_right_delimiter = delimiters['display']['right']
|
160
|
+
inline_left_delimiter = delimiters['inline']['left']
|
161
|
+
inline_right_delimiter = delimiters['inline']['right']
|
148
162
|
|
149
163
|
def merge_para_with_text(para_block):
|
150
164
|
block_text = ''
|
@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
|
|
168
182
|
if span_type == ContentType.Text:
|
169
183
|
content = ocr_escape_special_markdown_char(span['content'])
|
170
184
|
elif span_type == ContentType.InlineEquation:
|
171
|
-
content = f"${span['content']}$"
|
185
|
+
content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
|
172
186
|
elif span_type == ContentType.InterlineEquation:
|
173
|
-
content = f"\n$$\n{span['content']}\n$$\n"
|
187
|
+
content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
|
174
188
|
|
175
189
|
content = content.strip()
|
176
190
|
|
magic_pdf/libs/config_reader.py
CHANGED
@@ -125,6 +125,15 @@ def get_llm_aided_config():
|
|
125
125
|
else:
|
126
126
|
return llm_aided_config
|
127
127
|
|
128
|
+
def get_latex_delimiter_config():
|
129
|
+
config = read_config()
|
130
|
+
latex_delimiter_config = config.get('latex-delimiter-config')
|
131
|
+
if latex_delimiter_config is None:
|
132
|
+
logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
|
133
|
+
return None
|
134
|
+
else:
|
135
|
+
return latex_delimiter_config
|
136
|
+
|
128
137
|
|
129
138
|
if __name__ == '__main__':
|
130
139
|
ak, sk, endpoint = get_s3_config('llm-raw')
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.3.9"
|
1
|
+
__version__ = "1.3.10"
|
@@ -342,7 +342,10 @@ REPLACEMENTS_PATTERNS = {
|
|
342
342
|
re.compile(r'\\Tilde'): r'\\tilde',
|
343
343
|
re.compile(r'\\slash'): r'/',
|
344
344
|
re.compile(r'\\textperthousand'): r'‰',
|
345
|
-
re.compile(r'\\sun'): r'☉'
|
345
|
+
re.compile(r'\\sun'): r'☉',
|
346
|
+
re.compile(r'\\textunderscore'): r'\\_',
|
347
|
+
re.compile(r'\\fint'): r'⨏',
|
348
|
+
re.compile(r'\\up '): r'\\ ',
|
346
349
|
}
|
347
350
|
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
|
348
351
|
|
@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
|
|
172
172
|
tables_inside = [j for j in range(len(table_res_list))
|
173
173
|
if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
|
174
174
|
|
175
|
-
# Continue if there are at least
|
176
|
-
if len(tables_inside) >=
|
175
|
+
# Continue if there are at least 3 tables inside
|
176
|
+
if len(tables_inside) >= 3:
|
177
177
|
# Check if inside tables overlap with each other
|
178
178
|
tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
|
179
179
|
for idx1 in range(len(tables_inside))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.3.9
|
3
|
+
Version: 1.3.10
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
License: AGPL-3.0
|
6
6
|
Project-URL: Home, https://mineru.net/
|
@@ -20,7 +20,7 @@ Requires-Dist: click >=8.1.7
|
|
20
20
|
Requires-Dist: fast-langdetect <0.3.0,>=0.2.3
|
21
21
|
Requires-Dist: loguru >=0.6.0
|
22
22
|
Requires-Dist: numpy >=1.21.6
|
23
|
-
Requires-Dist: pdfminer.six
|
23
|
+
Requires-Dist: pdfminer.six ==20250324
|
24
24
|
Requires-Dist: pydantic <2.11,>=2.7.2
|
25
25
|
Requires-Dist: scikit-learn >=1.0.2
|
26
26
|
Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
|
@@ -107,6 +107,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
107
107
|
</div>
|
108
108
|
|
109
109
|
# Changelog
|
110
|
+
- 2025/04/29 1.3.10 Released
|
111
|
+
- Support for custom formula delimiters can be achieved by modifying the `latex-delimiter-config` item in the `magic-pdf.json` file under the user directory.
|
112
|
+
- Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by new versions.
|
110
113
|
- 2025/04/27 1.3.9 Released
|
111
114
|
- Optimized the formula parsing function to improve the success rate of formula rendering
|
112
115
|
- Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
|
@@ -25,7 +25,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
25
25
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
26
26
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
27
27
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
28
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=jRGoSNeR3XBgzGhKdQ25CmsdW0pi7NA-5NY3TB2pja0,14421
|
29
29
|
magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
|
30
30
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
|
31
31
|
magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
|
@@ -38,7 +38,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
38
38
|
magic_pdf/libs/boxbase.py,sha256=DKZXhwpJd-HE2_Du7NmkeeYW5gG-iwX3GeSWL7rYGv0,16956
|
39
39
|
magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
|
40
40
|
magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
|
41
|
-
magic_pdf/libs/config_reader.py,sha256=
|
41
|
+
magic_pdf/libs/config_reader.py,sha256=Z8C5o2uYfByB0Sj-jpgzu6VRobNp0y2gTheVXWkZV_0,4716
|
42
42
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
43
43
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
44
44
|
magic_pdf/libs/draw_bbox.py,sha256=hpUmpPiQVu7UgWQa3M49dS22G6A9gcG2jpq4dQjTjzA,18331
|
@@ -52,7 +52,7 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
|
|
52
52
|
magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
|
53
53
|
magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
|
54
54
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
55
|
-
magic_pdf/libs/version.py,sha256=
|
55
|
+
magic_pdf/libs/version.py,sha256=4o4BxiWDvKULo_NByGymiLj9KXGht1PsOBGUMmasvxM,23
|
56
56
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
57
57
|
magic_pdf/model/batch_analyze.py,sha256=F0WsjbQ6z9txdiUiVy6n6zhyJWJ-4moljNx8fe8HFws,10977
|
58
58
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
|
@@ -62,7 +62,7 @@ magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd0
|
|
62
62
|
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
63
63
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
64
64
|
magic_pdf/model/sub_modules/model_init.py,sha256=e2t95kxiuU47luOHByDokbQ2uob6oQhEA4b6UGVfMjY,8303
|
65
|
-
magic_pdf/model/sub_modules/model_utils.py,sha256=
|
65
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=Md5yOki9uqW31sWIi7AKRwAJNKnCJBVSfQx6LXRKngs,12166
|
66
66
|
magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
67
67
|
magic_pdf/model/sub_modules/language_detection/utils.py,sha256=Q__v6DdNJztt8GhVSuSB0txahVq-aj8RLhWn2VScx4w,3047
|
68
68
|
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=7T8eFl8zlZe6F0j0jSB3jrwOapDft320JQ1fuWxpvAY,5230
|
@@ -97,7 +97,7 @@ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
97
97
|
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=hq48TP1Ac8Y4FlK7GamnU-WZTQfdZotxBKuFhOIjrcM,5349
|
98
98
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
99
99
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
|
100
|
-
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=
|
100
|
+
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=BEZhsW6TqVXDNOgbvcW_0XMtYYiR3hFjpABRhKZgZC8,18101
|
101
101
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
|
102
102
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
|
103
103
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=Y5nwqDRzSb8zAbY1U0HGJGVtFggJW1zATP8RdnasNcA,113605
|
@@ -196,9 +196,9 @@ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,126
|
|
196
196
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
197
197
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
198
198
|
magic_pdf/utils/office_to_pdf.py,sha256=bFRYe6v3-pfx5R8-bV8cmf12jPnOrYZsleKoECTXzbM,3958
|
199
|
-
magic_pdf-1.3.
|
200
|
-
magic_pdf-1.3.
|
201
|
-
magic_pdf-1.3.
|
202
|
-
magic_pdf-1.3.
|
203
|
-
magic_pdf-1.3.
|
204
|
-
magic_pdf-1.3.
|
199
|
+
magic_pdf-1.3.10.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
200
|
+
magic_pdf-1.3.10.dist-info/METADATA,sha256=_Z35A_31Utec2rGaKDxxpZLjtG2uZywgnJrjNRUF__w,47901
|
201
|
+
magic_pdf-1.3.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
202
|
+
magic_pdf-1.3.10.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
203
|
+
magic_pdf-1.3.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
204
|
+
magic_pdf-1.3.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|