magic-pdf 1.3.9__py3-none-any.whl → 1.3.10__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ from loguru import logger
5
5
  from magic_pdf.config.make_content_config import DropMode, MakeMode
6
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
7
  from magic_pdf.libs.commons import join_path
8
+ from magic_pdf.libs.config_reader import get_latex_delimiter_config
8
9
  from magic_pdf.libs.language import detect_lang
9
10
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
10
11
  from magic_pdf.post_proc.para_split_v3 import ListLineTag
@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
145
146
  result.append(char)
146
147
  return ''.join(result)
147
148
 
149
+ latex_delimiters_config = get_latex_delimiter_config()
150
+
151
+ default_delimiters = {
152
+ 'display': {'left': '$$', 'right': '$$'},
153
+ 'inline': {'left': '$', 'right': '$'}
154
+ }
155
+
156
+ delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
157
+
158
+ display_left_delimiter = delimiters['display']['left']
159
+ display_right_delimiter = delimiters['display']['right']
160
+ inline_left_delimiter = delimiters['inline']['left']
161
+ inline_right_delimiter = delimiters['inline']['right']
148
162
 
149
163
  def merge_para_with_text(para_block):
150
164
  block_text = ''
@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
168
182
  if span_type == ContentType.Text:
169
183
  content = ocr_escape_special_markdown_char(span['content'])
170
184
  elif span_type == ContentType.InlineEquation:
171
- content = f"${span['content']}$"
185
+ content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
172
186
  elif span_type == ContentType.InterlineEquation:
173
- content = f"\n$$\n{span['content']}\n$$\n"
187
+ content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
174
188
 
175
189
  content = content.strip()
176
190
 
@@ -125,6 +125,15 @@ def get_llm_aided_config():
125
125
  else:
126
126
  return llm_aided_config
127
127
 
128
+ def get_latex_delimiter_config():
129
+ config = read_config()
130
+ latex_delimiter_config = config.get('latex-delimiter-config')
131
+ if latex_delimiter_config is None:
132
+ logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
133
+ return None
134
+ else:
135
+ return latex_delimiter_config
136
+
128
137
 
129
138
  if __name__ == '__main__':
130
139
  ak, sk, endpoint = get_s3_config('llm-raw')
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.3.9"
1
+ __version__ = "1.3.10"
@@ -342,7 +342,10 @@ REPLACEMENTS_PATTERNS = {
342
342
  re.compile(r'\\Tilde'): r'\\tilde',
343
343
  re.compile(r'\\slash'): r'/',
344
344
  re.compile(r'\\textperthousand'): r'‰',
345
- re.compile(r'\\sun'): r'☉'
345
+ re.compile(r'\\sun'): r'☉',
346
+ re.compile(r'\\textunderscore'): r'\\_',
347
+ re.compile(r'\\fint'): r'⨏',
348
+ re.compile(r'\\up '): r'\\ ',
346
349
  }
347
350
  QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
348
351
 
@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
172
172
  tables_inside = [j for j in range(len(table_res_list))
173
173
  if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
174
174
 
175
- # Continue if there are at least 2 tables inside
176
- if len(tables_inside) >= 2:
175
+ # Continue if there are at least 3 tables inside
176
+ if len(tables_inside) >= 3:
177
177
  # Check if inside tables overlap with each other
178
178
  tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
179
179
  for idx1 in range(len(tables_inside))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.3.9
3
+ Version: 1.3.10
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: Home, https://mineru.net/
@@ -20,7 +20,7 @@ Requires-Dist: click >=8.1.7
20
20
  Requires-Dist: fast-langdetect <0.3.0,>=0.2.3
21
21
  Requires-Dist: loguru >=0.6.0
22
22
  Requires-Dist: numpy >=1.21.6
23
- Requires-Dist: pdfminer.six >=20250416
23
+ Requires-Dist: pdfminer.six ==20250324
24
24
  Requires-Dist: pydantic <2.11,>=2.7.2
25
25
  Requires-Dist: scikit-learn >=1.0.2
26
26
  Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
@@ -107,6 +107,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
107
107
  </div>
108
108
 
109
109
  # Changelog
110
+ - 2025/04/29 1.3.10 Released
111
+ - Added support for custom formula delimiters: set the `latex-delimiter-config` item in the `magic-pdf.json` file in the user directory.
112
+ - Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by new versions.
110
113
  - 2025/04/27 1.3.9 Released
111
114
  - Optimized the formula parsing function to improve the success rate of formula rendering
112
115
  - Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
@@ -25,7 +25,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
25
25
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
26
26
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
27
27
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=U4DqKfD4dJ2S5Z8NEAGhuLYkEOIeC-BWuArMbwi7BJs,13784
28
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=jRGoSNeR3XBgzGhKdQ25CmsdW0pi7NA-5NY3TB2pja0,14421
29
29
  magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
30
30
  magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
31
31
  magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
@@ -38,7 +38,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  magic_pdf/libs/boxbase.py,sha256=DKZXhwpJd-HE2_Du7NmkeeYW5gG-iwX3GeSWL7rYGv0,16956
39
39
  magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
40
40
  magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
41
- magic_pdf/libs/config_reader.py,sha256=9GXK7jtDyA_jxXKWZAQ69rB02v5UW4mOmo1IaOYOkW0,4374
41
+ magic_pdf/libs/config_reader.py,sha256=Z8C5o2uYfByB0Sj-jpgzu6VRobNp0y2gTheVXWkZV_0,4716
42
42
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
43
43
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
44
44
  magic_pdf/libs/draw_bbox.py,sha256=hpUmpPiQVu7UgWQa3M49dS22G6A9gcG2jpq4dQjTjzA,18331
@@ -52,7 +52,7 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
52
52
  magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
53
53
  magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
54
54
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
55
- magic_pdf/libs/version.py,sha256=SaWgUI6v92kVfF_Qdoxbfc38bwA34RuDGZmXMqa5g3c,22
55
+ magic_pdf/libs/version.py,sha256=4o4BxiWDvKULo_NByGymiLj9KXGht1PsOBGUMmasvxM,23
56
56
  magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
57
57
  magic_pdf/model/batch_analyze.py,sha256=F0WsjbQ6z9txdiUiVy6n6zhyJWJ-4moljNx8fe8HFws,10977
58
58
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
@@ -62,7 +62,7 @@ magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd0
62
62
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
63
63
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
64
  magic_pdf/model/sub_modules/model_init.py,sha256=e2t95kxiuU47luOHByDokbQ2uob6oQhEA4b6UGVfMjY,8303
65
- magic_pdf/model/sub_modules/model_utils.py,sha256=HKRC9ubCs6O0nNqaztrZO0YKuFpRhs0LKWOaeZfDrTw,12166
65
+ magic_pdf/model/sub_modules/model_utils.py,sha256=Md5yOki9uqW31sWIi7AKRwAJNKnCJBVSfQx6LXRKngs,12166
66
66
  magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
67
67
  magic_pdf/model/sub_modules/language_detection/utils.py,sha256=Q__v6DdNJztt8GhVSuSB0txahVq-aj8RLhWn2VScx4w,3047
68
68
  magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=7T8eFl8zlZe6F0j0jSB3jrwOapDft320JQ1fuWxpvAY,5230
@@ -97,7 +97,7 @@ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
97
97
  magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=hq48TP1Ac8Y4FlK7GamnU-WZTQfdZotxBKuFhOIjrcM,5349
98
98
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
99
99
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
100
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=uRqyXsr3S0bw6CnkkJzAgne_MT3Q9Wz-npIXIMlRnlo,17986
100
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=BEZhsW6TqVXDNOgbvcW_0XMtYYiR3hFjpABRhKZgZC8,18101
101
101
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
102
102
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
103
103
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=Y5nwqDRzSb8zAbY1U0HGJGVtFggJW1zATP8RdnasNcA,113605
@@ -196,9 +196,9 @@ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,126
196
196
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
197
197
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
198
198
  magic_pdf/utils/office_to_pdf.py,sha256=bFRYe6v3-pfx5R8-bV8cmf12jPnOrYZsleKoECTXzbM,3958
199
- magic_pdf-1.3.9.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
200
- magic_pdf-1.3.9.dist-info/METADATA,sha256=L98_kmfvo1RrBO3LR3Np2ySVd9nkMmXDf3TA5LmTVcQ,47611
201
- magic_pdf-1.3.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
202
- magic_pdf-1.3.9.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
203
- magic_pdf-1.3.9.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
204
- magic_pdf-1.3.9.dist-info/RECORD,,
199
+ magic_pdf-1.3.10.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
200
+ magic_pdf-1.3.10.dist-info/METADATA,sha256=_Z35A_31Utec2rGaKDxxpZLjtG2uZywgnJrjNRUF__w,47901
201
+ magic_pdf-1.3.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
202
+ magic_pdf-1.3.10.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
203
+ magic_pdf-1.3.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
204
+ magic_pdf-1.3.10.dist-info/RECORD,,