magic-pdf 1.3.9__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ from loguru import logger
5
5
  from magic_pdf.config.make_content_config import DropMode, MakeMode
6
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
7
  from magic_pdf.libs.commons import join_path
8
+ from magic_pdf.libs.config_reader import get_latex_delimiter_config
8
9
  from magic_pdf.libs.language import detect_lang
9
10
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
10
11
  from magic_pdf.post_proc.para_split_v3 import ListLineTag
@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
145
146
  result.append(char)
146
147
  return ''.join(result)
147
148
 
149
+ latex_delimiters_config = get_latex_delimiter_config()
150
+
151
+ default_delimiters = {
152
+ 'display': {'left': '$$', 'right': '$$'},
153
+ 'inline': {'left': '$', 'right': '$'}
154
+ }
155
+
156
+ delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
157
+
158
+ display_left_delimiter = delimiters['display']['left']
159
+ display_right_delimiter = delimiters['display']['right']
160
+ inline_left_delimiter = delimiters['inline']['left']
161
+ inline_right_delimiter = delimiters['inline']['right']
148
162
 
149
163
  def merge_para_with_text(para_block):
150
164
  block_text = ''
@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
168
182
  if span_type == ContentType.Text:
169
183
  content = ocr_escape_special_markdown_char(span['content'])
170
184
  elif span_type == ContentType.InlineEquation:
171
- content = f"${span['content']}$"
185
+ content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
172
186
  elif span_type == ContentType.InterlineEquation:
173
- content = f"\n$$\n{span['content']}\n$$\n"
187
+ content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
174
188
 
175
189
  content = content.strip()
176
190
 
@@ -125,6 +125,15 @@ def get_llm_aided_config():
125
125
  else:
126
126
  return llm_aided_config
127
127
 
128
+ def get_latex_delimiter_config():
129
+ config = read_config()
130
+ latex_delimiter_config = config.get('latex-delimiter-config')
131
+ if latex_delimiter_config is None:
132
+ logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
133
+ return None
134
+ else:
135
+ return latex_delimiter_config
136
+
128
137
 
129
138
  if __name__ == '__main__':
130
139
  ak, sk, endpoint = get_s3_config('llm-raw')
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.3.9"
1
+ __version__ = "1.3.11"
@@ -156,7 +156,10 @@ def doc_analyze(
156
156
  batch_images = [images_with_extra_info]
157
157
 
158
158
  results = []
159
- for batch_image in batch_images:
159
+ processed_images_count = 0
160
+ for index, batch_image in enumerate(batch_images):
161
+ processed_images_count += len(batch_image)
162
+ logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
160
163
  result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
161
164
  results.extend(result)
162
165
 
@@ -66,9 +66,9 @@ LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
66
66
 
67
67
  def fix_latex_left_right(s):
68
68
  """
69
- 修复LaTeX中的\left和\right命令
69
+ 修复LaTeX中的\\left和\\right命令
70
70
  1. 确保它们后面跟有效分隔符
71
- 2. 平衡\left和\right的数量
71
+ 2. 平衡\\left和\\right的数量
72
72
  """
73
73
  # 白名单分隔符
74
74
  valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
@@ -106,7 +106,7 @@ def fix_latex_left_right(s):
106
106
 
107
107
  def fix_left_right_pairs(latex_formula):
108
108
  """
109
- 检测并修复LaTeX公式中\left和\right不在同一组的情况
109
+ 检测并修复LaTeX公式中\\left和\\right不在同一组的情况
110
110
 
111
111
  Args:
112
112
  latex_formula (str): 输入的LaTeX公式
@@ -308,9 +308,9 @@ ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') fo
308
308
 
309
309
  def fix_latex_environments(s):
310
310
  """
311
- 检测LaTeX中环境(如array)的\begin和\end是否匹配
312
- 1. 如果缺少\begin标签则在开头添加
313
- 2. 如果缺少\end标签则在末尾添加
311
+ 检测LaTeX中环境(如array)的\\begin和\\end是否匹配
312
+ 1. 如果缺少\\begin标签则在开头添加
313
+ 2. 如果缺少\\end标签则在末尾添加
314
314
  """
315
315
  for env in ENV_TYPES:
316
316
  begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
@@ -334,7 +334,7 @@ def fix_latex_environments(s):
334
334
 
335
335
  UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
336
336
  COMMANDS_TO_REMOVE_PATTERN = re.compile(
337
- r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph)')
337
+ r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
338
338
  REPLACEMENTS_PATTERNS = {
339
339
  re.compile(r'\\underbar'): r'\\underline',
340
340
  re.compile(r'\\Bar'): r'\\hat',
@@ -342,7 +342,13 @@ REPLACEMENTS_PATTERNS = {
342
342
  re.compile(r'\\Tilde'): r'\\tilde',
343
343
  re.compile(r'\\slash'): r'/',
344
344
  re.compile(r'\\textperthousand'): r'‰',
345
- re.compile(r'\\sun'): r'☉'
345
+ re.compile(r'\\sun'): r'☉',
346
+ re.compile(r'\\textunderscore'): r'\\_',
347
+ re.compile(r'\\fint'): r'⨏',
348
+ re.compile(r'\\up '): r'\\ ',
349
+ re.compile(r'\\vline = '): r'\\models ',
350
+ re.compile(r'\\vDash '): r'\\models ',
351
+ re.compile(r'\\sq \\sqcup '): r'\\square ',
346
352
  }
347
353
  QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
348
354
 
@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
172
172
  tables_inside = [j for j in range(len(table_res_list))
173
173
  if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
174
174
 
175
- # Continue if there are at least 2 tables inside
176
- if len(tables_inside) >= 2:
175
+ # Continue if there are at least 3 tables inside
176
+ if len(tables_inside) >= 3:
177
177
  # Check if inside tables overlap with each other
178
178
  tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
179
179
  for idx1 in range(len(tables_inside))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.3.9
3
+ Version: 1.3.11
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: Home, https://mineru.net/
@@ -10,7 +10,7 @@ Classifier: Programming Language :: Python :: 3.10
10
10
  Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Classifier: Programming Language :: Python :: 3.13
13
- Requires-Python: >=3.10,<4
13
+ Requires-Python: >=3.10,<3.14
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE.md
16
16
  Requires-Dist: Brotli >=1.1.0
@@ -20,10 +20,10 @@ Requires-Dist: click >=8.1.7
20
20
  Requires-Dist: fast-langdetect <0.3.0,>=0.2.3
21
21
  Requires-Dist: loguru >=0.6.0
22
22
  Requires-Dist: numpy >=1.21.6
23
- Requires-Dist: pdfminer.six >=20250416
23
+ Requires-Dist: pdfminer.six ==20250506
24
24
  Requires-Dist: pydantic <2.11,>=2.7.2
25
25
  Requires-Dist: scikit-learn >=1.0.2
26
- Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
26
+ Requires-Dist: torch !=2.5.0,!=2.5.1,<3,>=2.2.2
27
27
  Requires-Dist: torchvision
28
28
  Requires-Dist: tqdm >=4.67.1
29
29
  Requires-Dist: transformers !=4.51.0,<5.0.0,>=4.49.0
@@ -107,9 +107,10 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
107
107
  </div>
108
108
 
109
109
  # Changelog
110
+ - 2025/04/29 1.3.10 Released
111
+ - Added support for custom formula delimiters, configured through the `latex-delimiter-config` item in the `magic-pdf.json` file in the user directory.
110
112
  - 2025/04/27 1.3.9 Released
111
- - Optimized the formula parsing function to improve the success rate of formula rendering
112
- - Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
113
+ - Optimized the formula parsing function to improve the success rate of formula rendering
113
114
  - 2025/04/23 1.3.8 Released
114
115
  - The default `ocr` model (`ch`) has been updated to `PP-OCRv4_server_rec_doc` (model update required)
115
116
  - `PP-OCRv4_server_rec_doc` is trained on a mix of more Chinese document data and PP-OCR training data, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters. It supports over 15,000 recognizable characters, improving text recognition in documents while also boosting general text recognition.
@@ -411,7 +412,7 @@ There are three different ways to experience MinerU:
411
412
  </tr>
412
413
  <tr>
413
414
  <td colspan="3">Python Version</td>
414
- <td colspan="3">>=3.10</td>
415
+ <td colspan="3">3.10~3.13</td>
415
416
  </tr>
416
417
  <tr>
417
418
  <td colspan="3">Nvidia Driver Version</td>
@@ -421,8 +422,7 @@ There are three different ways to experience MinerU:
421
422
  </tr>
422
423
  <tr>
423
424
  <td colspan="3">CUDA Environment</td>
424
- <td>11.8/12.4/12.6/12.8</td>
425
- <td>11.8/12.4/12.6/12.8</td>
425
+ <td colspan="2"><a href="https://pytorch.org/get-started/locally/">Refer to the PyTorch official website</a></td>
426
426
  <td>None</td>
427
427
  </tr>
428
428
  <tr>
@@ -453,7 +453,7 @@ Synced with dev branch updates:
453
453
  #### 1. Install magic-pdf
454
454
 
455
455
  ```bash
456
- conda create -n mineru 'python>=3.10' -y
456
+ conda create -n mineru 'python=3.12' -y
457
457
  conda activate mineru
458
458
  pip install -U "magic-pdf[full]"
459
459
  ```
@@ -25,7 +25,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
25
25
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
26
26
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
27
27
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=U4DqKfD4dJ2S5Z8NEAGhuLYkEOIeC-BWuArMbwi7BJs,13784
28
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=jRGoSNeR3XBgzGhKdQ25CmsdW0pi7NA-5NY3TB2pja0,14421
29
29
  magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
30
30
  magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
31
31
  magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
@@ -38,7 +38,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  magic_pdf/libs/boxbase.py,sha256=DKZXhwpJd-HE2_Du7NmkeeYW5gG-iwX3GeSWL7rYGv0,16956
39
39
  magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
40
40
  magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
41
- magic_pdf/libs/config_reader.py,sha256=9GXK7jtDyA_jxXKWZAQ69rB02v5UW4mOmo1IaOYOkW0,4374
41
+ magic_pdf/libs/config_reader.py,sha256=Z8C5o2uYfByB0Sj-jpgzu6VRobNp0y2gTheVXWkZV_0,4716
42
42
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
43
43
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
44
44
  magic_pdf/libs/draw_bbox.py,sha256=hpUmpPiQVu7UgWQa3M49dS22G6A9gcG2jpq4dQjTjzA,18331
@@ -52,17 +52,17 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
52
52
  magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
53
53
  magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
54
54
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
55
- magic_pdf/libs/version.py,sha256=SaWgUI6v92kVfF_Qdoxbfc38bwA34RuDGZmXMqa5g3c,22
55
+ magic_pdf/libs/version.py,sha256=cYqar-M7VsRkfFRByjCNAq9dYm1GavgcND7ugDzle7Q,23
56
56
  magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
57
57
  magic_pdf/model/batch_analyze.py,sha256=F0WsjbQ6z9txdiUiVy6n6zhyJWJ-4moljNx8fe8HFws,10977
58
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
58
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=R-d3_ZLZtlAztA23yaDvC0YC2-heXxrBBEyGlX7or7c,10547
59
59
  magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
60
60
  magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
61
61
  magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd05_fGU,10845
62
62
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
63
63
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
64
  magic_pdf/model/sub_modules/model_init.py,sha256=e2t95kxiuU47luOHByDokbQ2uob6oQhEA4b6UGVfMjY,8303
65
- magic_pdf/model/sub_modules/model_utils.py,sha256=HKRC9ubCs6O0nNqaztrZO0YKuFpRhs0LKWOaeZfDrTw,12166
65
+ magic_pdf/model/sub_modules/model_utils.py,sha256=Md5yOki9uqW31sWIi7AKRwAJNKnCJBVSfQx6LXRKngs,12166
66
66
  magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
67
67
  magic_pdf/model/sub_modules/language_detection/utils.py,sha256=Q__v6DdNJztt8GhVSuSB0txahVq-aj8RLhWn2VScx4w,3047
68
68
  magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=7T8eFl8zlZe6F0j0jSB3jrwOapDft320JQ1fuWxpvAY,5230
@@ -97,7 +97,7 @@ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
97
97
  magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=hq48TP1Ac8Y4FlK7GamnU-WZTQfdZotxBKuFhOIjrcM,5349
98
98
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
99
99
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
100
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=uRqyXsr3S0bw6CnkkJzAgne_MT3Q9Wz-npIXIMlRnlo,17986
100
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=v5vf3a4sBLZ11FKbhb9bE_GEXcGbmdLxgTtl2pRmW34,18260
101
101
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
102
102
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
103
103
  magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=Y5nwqDRzSb8zAbY1U0HGJGVtFggJW1zATP8RdnasNcA,113605
@@ -196,9 +196,9 @@ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,126
196
196
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
197
197
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
198
198
  magic_pdf/utils/office_to_pdf.py,sha256=bFRYe6v3-pfx5R8-bV8cmf12jPnOrYZsleKoECTXzbM,3958
199
- magic_pdf-1.3.9.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
200
- magic_pdf-1.3.9.dist-info/METADATA,sha256=L98_kmfvo1RrBO3LR3Np2ySVd9nkMmXDf3TA5LmTVcQ,47611
201
- magic_pdf-1.3.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
202
- magic_pdf-1.3.9.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
203
- magic_pdf-1.3.9.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
204
- magic_pdf-1.3.9.dist-info/RECORD,,
199
+ magic_pdf-1.3.11.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
200
+ magic_pdf-1.3.11.dist-info/METADATA,sha256=MFffMcv9-ZlVG8jBKDtTPofJC_F-x_OPU4aLwxd-aUs,47765
201
+ magic_pdf-1.3.11.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
202
+ magic_pdf-1.3.11.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
203
+ magic_pdf-1.3.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
204
+ magic_pdf-1.3.11.dist-info/RECORD,,