magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  7. magic_pdf/data/read_api.py +1 -1
  8. magic_pdf/dict2md/mkcontent.py +226 -185
  9. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  10. magic_pdf/filter/pdf_meta_scan.py +101 -79
  11. magic_pdf/integrations/rag/utils.py +4 -5
  12. magic_pdf/libs/config_reader.py +5 -5
  13. magic_pdf/libs/draw_bbox.py +3 -2
  14. magic_pdf/libs/pdf_image_tools.py +36 -12
  15. magic_pdf/libs/version.py +1 -1
  16. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  17. magic_pdf/model/magic_model.py +13 -13
  18. magic_pdf/model/pdf_extract_kit.py +122 -76
  19. magic_pdf/model/sub_modules/model_init.py +40 -35
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  21. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  22. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  23. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  24. magic_pdf/para/para_split.py +411 -248
  25. magic_pdf/para/para_split_v2.py +352 -182
  26. magic_pdf/para/para_split_v3.py +110 -53
  27. magic_pdf/pdf_parse_by_ocr.py +2 -0
  28. magic_pdf/pdf_parse_by_txt.py +2 -0
  29. magic_pdf/pdf_parse_union_core.py +174 -100
  30. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  31. magic_pdf/pipe/AbsPipe.py +28 -44
  32. magic_pdf/pipe/OCRPipe.py +5 -5
  33. magic_pdf/pipe/TXTPipe.py +5 -6
  34. magic_pdf/pipe/UNIPipe.py +24 -25
  35. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  36. magic_pdf/pre_proc/cut_image.py +9 -11
  37. magic_pdf/pre_proc/equations_replace.py +203 -212
  38. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  39. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  40. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  41. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  42. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  43. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  44. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  45. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  46. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  47. magic_pdf/spark/spark_api.py +15 -17
  48. magic_pdf/tools/cli.py +3 -4
  49. magic_pdf/tools/cli_dev.py +6 -9
  50. magic_pdf/tools/common.py +26 -36
  51. magic_pdf/user_api.py +29 -38
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
  53. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
  54. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
  55. magic_pdf/libs/Constants.py +0 -55
  56. magic_pdf/libs/MakeContentConfig.py +0 -11
  57. magic_pdf/libs/drop_reason.py +0 -27
  58. magic_pdf/libs/drop_tag.py +0 -19
  59. magic_pdf/para/para_pipeline.py +0 -297
  60. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
magic_pdf/user_api.py CHANGED
@@ -1,36 +1,28 @@
1
- """
2
- 用户输入:
3
- model数组,每个元素代表一个页面
4
- pdf在s3的路径
5
- 截图保存的s3位置
1
+ """用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
6
2
 
7
3
  然后:
8
4
  1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
9
5
  2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
10
6
 
11
7
  其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
12
-
13
8
  """
14
- import re
15
9
 
16
10
  from loguru import logger
17
11
 
12
+ from magic_pdf.data.data_reader_writer import DataWriter
18
13
  from magic_pdf.libs.version import __version__
19
14
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
20
- from magic_pdf.rw import AbsReaderWriter
21
15
  from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
22
16
  from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
23
17
 
24
- PARSE_TYPE_TXT = "txt"
25
- PARSE_TYPE_OCR = "ocr"
18
+ PARSE_TYPE_TXT = 'txt'
19
+ PARSE_TYPE_OCR = 'ocr'
26
20
 
27
21
 
28
- def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
22
+ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
29
23
  start_page_id=0, end_page_id=None, lang=None,
30
24
  *args, **kwargs):
31
- """
32
- 解析文本类pdf
33
- """
25
+ """解析文本类pdf."""
34
26
  pdf_info_dict = parse_pdf_by_txt(
35
27
  pdf_bytes,
36
28
  pdf_models,
@@ -38,24 +30,23 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
38
30
  start_page_id=start_page_id,
39
31
  end_page_id=end_page_id,
40
32
  debug_mode=is_debug,
33
+ lang=lang,
41
34
  )
42
35
 
43
- pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
36
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
44
37
 
45
- pdf_info_dict["_version_name"] = __version__
38
+ pdf_info_dict['_version_name'] = __version__
46
39
 
47
40
  if lang is not None:
48
- pdf_info_dict["_lang"] = lang
41
+ pdf_info_dict['_lang'] = lang
49
42
 
50
43
  return pdf_info_dict
51
44
 
52
45
 
53
- def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
46
+ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
54
47
  start_page_id=0, end_page_id=None, lang=None,
55
48
  *args, **kwargs):
56
- """
57
- 解析ocr类pdf
58
- """
49
+ """解析ocr类pdf."""
59
50
  pdf_info_dict = parse_pdf_by_ocr(
60
51
  pdf_bytes,
61
52
  pdf_models,
@@ -63,25 +54,24 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
63
54
  start_page_id=start_page_id,
64
55
  end_page_id=end_page_id,
65
56
  debug_mode=is_debug,
57
+ lang=lang,
66
58
  )
67
59
 
68
- pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
60
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
69
61
 
70
- pdf_info_dict["_version_name"] = __version__
62
+ pdf_info_dict['_version_name'] = __version__
71
63
 
72
64
  if lang is not None:
73
- pdf_info_dict["_lang"] = lang
65
+ pdf_info_dict['_lang'] = lang
74
66
 
75
67
  return pdf_info_dict
76
68
 
77
69
 
78
- def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
70
+ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
79
71
  input_model_is_empty: bool = False,
80
72
  start_page_id=0, end_page_id=None, lang=None,
81
73
  *args, **kwargs):
82
- """
83
- ocr和文本混合的pdf,全部解析出来
84
- """
74
+ """ocr和文本混合的pdf,全部解析出来."""
85
75
 
86
76
  def parse_pdf(method):
87
77
  try:
@@ -92,18 +82,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
92
82
  start_page_id=start_page_id,
93
83
  end_page_id=end_page_id,
94
84
  debug_mode=is_debug,
85
+ lang=lang,
95
86
  )
96
87
  except Exception as e:
97
88
  logger.exception(e)
98
89
  return None
99
90
 
100
91
  pdf_info_dict = parse_pdf(parse_pdf_by_txt)
101
- if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
102
- logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
92
+ if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
93
+ logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
103
94
  if input_model_is_empty:
104
- layout_model = kwargs.get("layout_model", None)
105
- formula_enable = kwargs.get("formula_enable", None)
106
- table_enable = kwargs.get("table_enable", None)
95
+ layout_model = kwargs.get('layout_model', None)
96
+ formula_enable = kwargs.get('formula_enable', None)
97
+ table_enable = kwargs.get('table_enable', None)
107
98
  pdf_models = doc_analyze(
108
99
  pdf_bytes,
109
100
  ocr=True,
@@ -116,15 +107,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
116
107
  )
117
108
  pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
118
109
  if pdf_info_dict is None:
119
- raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
110
+ raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
120
111
  else:
121
- pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
112
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
122
113
  else:
123
- pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
114
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
124
115
 
125
- pdf_info_dict["_version_name"] = __version__
116
+ pdf_info_dict['_version_name'] = __version__
126
117
 
127
118
  if lang is not None:
128
- pdf_info_dict["_lang"] = lang
119
+ pdf_info_dict['_lang'] = lang
129
120
 
130
121
  return pdf_info_dict
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.9.3
3
+ Version: 0.10.1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -38,6 +38,8 @@ Provides-Extra: lite
38
38
  Requires-Dist: paddleocr==2.7.3; extra == "lite"
39
39
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
40
40
  Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
41
+ Provides-Extra: old_linux
42
+ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
41
43
 
42
44
  <div align="center" xmlns="http://www.w3.org/1999/html">
43
45
  <!-- logo -->
@@ -83,6 +85,9 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
83
85
  </div>
84
86
 
85
87
  # Changelog
88
+ - 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities:
89
+ - Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
90
+ - Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
86
91
  - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
87
92
  - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
88
93
  - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
@@ -162,7 +167,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
162
167
  - Preserve the structure of the original document, including headings, paragraphs, lists, etc.
163
168
  - Extract images, image descriptions, tables, table titles, and footnotes.
164
169
  - Automatically recognize and convert formulas in the document to LaTeX format.
165
- - Automatically recognize and convert tables in the document to LaTeX or HTML format.
170
+ - Automatically recognize and convert tables in the document to HTML format.
166
171
  - Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
167
172
  - OCR supports detection and recognition of 84 languages.
168
173
  - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
@@ -226,17 +231,11 @@ There are three different ways to experience MinerU:
226
231
  </tr>
227
232
  <tr>
228
233
  <td rowspan="2">GPU Hardware Support List</td>
229
- <td colspan="2">Minimum Requirement 8G+ VRAM</td>
230
- <td colspan="2">3060ti/3070/4060<br>
231
- 8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
234
+ <td colspan="2">GPU VRAM 8GB or more</td>
235
+ <td colspan="2">2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
236
+ 8 GB of VRAM is enough to enable all acceleration features</td>
232
237
  <td rowspan="2">None</td>
233
238
  </tr>
234
- <tr>
235
- <td colspan="2">Recommended Configuration 10G+ VRAM</td>
236
- <td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
237
- 10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
238
- </td>
239
- </tr>
240
239
  </table>
241
240
 
242
241
  ### Online Demo
@@ -288,7 +287,7 @@ You can modify certain configurations in this file to enable or disable features
288
287
  "enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
289
288
  },
290
289
  "table-config": {
291
- "model": "rapid_table", // When using structEqTable, please change to "struct_eqtable".
290
+ "model": "rapid_table", // Defaults to "rapid_table"; can be switched to "tablemaster" or "struct_eqtable".
292
291
  "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
293
292
  "max_time": 400
294
293
  }
@@ -1,20 +1,26 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_by_ocr.py,sha256=E-AYHUXjzorFli0CEtmnAi09SI2STJ7FX58yjU0c9PI,810
3
- magic_pdf/pdf_parse_by_txt.py,sha256=YeFYVAdfwF1CXOHq0LVE5131nqPHA14nt5t_sb-CMMk,709
4
- magic_pdf/pdf_parse_union_core.py,sha256=AGIrP7ahc6Ycku0PxAlbjZhwqsdJ8iuRPIn-PFASKWY,11772
5
- magic_pdf/pdf_parse_union_core_v2.py,sha256=GAgSP0PqbPg4U_nJXUztr-uBmakIK5rKwuxv0o9nMK0,25228
6
- magic_pdf/user_api.py,sha256=gM-3RQYc6pMEsVvEPFXfWf5RBjAvHcUccugL6fXpP_U,3991
2
+ magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
3
+ magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
4
+ magic_pdf/pdf_parse_union_core.py,sha256=w90lFIMOYUMAq4iv8bpsbBtLXFphPV4HyYeqbOTYQUI,12420
5
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=EqEi9AahBBh2JbXoY8uOCmClvi9W_H_26U4jK8RwPwU,31308
6
+ magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
7
7
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
9
+ magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
10
+ magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
8
11
  magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
9
12
  magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zkE,622
13
+ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
14
+ magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
15
+ magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
10
16
  magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
17
  magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
12
- magic_pdf/data/read_api.py,sha256=3fKLsEYAow5RwAmGFMMgvcCh0-_WEEHem2uewukjXOA,3570
18
+ magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
13
19
  magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
14
20
  magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
15
21
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
16
22
  magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
17
- magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
23
+ magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
18
24
  magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
19
25
  magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
20
26
  magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
@@ -22,59 +28,53 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
22
28
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
23
29
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
24
30
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
26
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=lM5UBDueiZcm4_z-jtmcgbJH2jhaXhMVY5ubggaKqHU,12954
31
+ magic_pdf/dict2md/mkcontent.py,sha256=bMQK7uiay76YaWA92VIK57YajINV20SnOs65wOEXyKE,18667
32
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
27
33
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
34
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
29
- magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
35
+ magic_pdf/filter/pdf_meta_scan.py,sha256=h4D4O0OeAlEy2A8mJ6E0aQ8wIizIfsIxEagbjaomnAo,17823
30
36
  magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
37
  magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
38
  magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
33
39
  magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
34
- magic_pdf/integrations/rag/utils.py,sha256=UX_EySxi-WA1nwFLq6IpVQQ7mMAkMl257oEELaqpSzc,11833
40
+ magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
35
41
  magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
42
  magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
37
43
  magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
38
44
  magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
39
45
  magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
40
46
  magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
41
- magic_pdf/libs/Constants.py,sha256=ptiwMvWDUmzRZ0IbP1bM3PjGJ24BQVQQHO4sCeioPv8,1173
42
- magic_pdf/libs/MakeContentConfig.py,sha256=Do5VKNQp3gfUKyhrZStfzfBj7l-vbsYpsJFF1SsmEc0,248
43
- magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
44
47
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
48
  magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
46
49
  magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
47
50
  magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
48
51
  magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
49
- magic_pdf/libs/config_reader.py,sha256=7QIeUPLb8CNa7E3n8TT3MN61lZdYVTylxn5cyXPsPfA,4066
52
+ magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
50
53
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
51
54
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
52
55
  magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
53
- magic_pdf/libs/draw_bbox.py,sha256=Ri_jbOv3Tgnx6s1IscRIWiIKNfUHPkGW8v4q4jPtgo8,17623
54
- magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw,2148
55
- magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
56
+ magic_pdf/libs/draw_bbox.py,sha256=NhAfqib5HYuGjjrAG_SvJR-yOHZTy6tzDxLXdxKlULQ,17676
56
57
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
57
58
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
58
59
  magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
59
60
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
60
61
  magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
61
62
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
62
- magic_pdf/libs/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
63
63
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
64
64
  magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
65
- magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
65
+ magic_pdf/libs/pdf_image_tools.py,sha256=sh8hgBQu_83R71qBLodOFdByBUuQujsOMfgpSD9mrhE,1981
66
66
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
67
67
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
68
- magic_pdf/libs/version.py,sha256=xKd3pzbczuMsdB08eLAOqZDUd_q1IRxwZ_ccAFL4c4A,22
68
+ magic_pdf/libs/version.py,sha256=v7Gyp89umFzDtY45tTjCdXqZnQ2RN01AibdYNxEvxYo,23
69
69
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
70
70
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
71
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
72
- magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
71
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=HOT6chGx2VPyH6O9WB0c6xGPeDs9m_6oZn3iOa745yw,7125
72
+ magic_pdf/model/magic_model.py,sha256=8nJLzNCa0Ag4JhMAQbjj5qrkj617qKPCXVJAiT9DnaA,43472
73
73
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
74
- magic_pdf/model/pdf_extract_kit.py,sha256=6y8tQSwse8cAgqjDoJvJ-uSPdT8FYzyUeCW5g7j1Tyw,10126
74
+ magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
75
75
  magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
76
76
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- magic_pdf/model/sub_modules/model_init.py,sha256=iFugp79H_QLi-P7t_6Ug0qIs2oOc4zSnf-8hhZhezHA,5021
77
+ magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
78
78
  magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
79
79
  magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
80
  magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
@@ -107,8 +107,8 @@ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1
107
107
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
108
  magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
109
  magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=2QAxxs0awZ_osLMiL-oP8Ik6VQ3f2C4dgJ0EV93bxlQ,9202
111
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=BZ7wtkYvvcKtv8jUOI1n6wsSramt-Ob5faP7UeqrfCU,6710
110
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=UP7fADPGoxAMj2SUKmeW-fe_AcAQxlT9Mfy4WF6vHmU,9796
111
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=a6xkQHqLMUL4NCaORp8oo4Tfa8GB8PN9MVvG7Pj6jIE,7316
112
112
  magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
113
113
  magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
114
  magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,11 +117,11 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
117
117
  magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
118
  magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
119
119
  magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=UT__wzKQ4tVxlxgFacDqJfTyBU911CTJXD_6CTw6iS8,516
120
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx6DCnhqYzP-4b1zSWptrefimxFTmy8Q,583
121
121
  magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
122
  magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
123
123
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=keSvrxuTVqc8PbNenwb43VDhJqqzp0ayxK691kxClac,2702
124
+ magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=AdH3UGu4BEoII0uFjPKUf61W7HmG4fDlWgR1xxMeFlE,2775
125
125
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
126
126
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
127
127
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -130,26 +130,25 @@ magic_pdf/para/denoise.py,sha256=J7dM2KNnbdzAd2A3agB04U6L1GL9RrhAs-MLrq-_Ftg,104
130
130
  magic_pdf/para/draw.py,sha256=KyWc03do_WuBKQ028HYzepYwbIkel9ID0uqRhuPVOHc,5643
131
131
  magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,4978
132
132
  magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
133
- magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
134
- magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
135
- magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
136
- magic_pdf/para/para_split_v3.py,sha256=vSJ5_QqGKP1rbTbGQg5ONNpybidpTdbgXZgTGd2bGsw,14539
133
+ magic_pdf/para/para_split.py,sha256=z7nYeg86BjZOAdJNMwYKSu51W9evurtl3cy1ZUcQLlw,33222
134
+ magic_pdf/para/para_split_v2.py,sha256=vJJqqMMKbv8D702nODThL-5hjkgZ7Vl2BTmEIdwmmDw,39051
135
+ magic_pdf/para/para_split_v3.py,sha256=atfELVRx-90paAS3nZptgP0qG8UpTTaj3LG_2x3NAlQ,15977
137
136
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
138
137
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
139
138
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
140
- magic_pdf/pipe/AbsPipe.py,sha256=jpJD-8S5K89bCxI4FY2_tMKGV1lghlbtmpOeJXqEhz0,4487
141
- magic_pdf/pipe/OCRPipe.py,sha256=TswwfMkAYP-fpfsBoNIArtcCoNEqzCQCVWfwR4n8G-E,2159
142
- magic_pdf/pipe/TXTPipe.py,sha256=zcsiOreOHJBtaeYgEc5yGKMSiNzBED8HcsD7NGu5RaY,2218
143
- magic_pdf/pipe/UNIPipe.py,sha256=3ytYHoyTboHUGlRQpfyITiAP_mYXTOBjqy7wYHoUHpE,4944
139
+ magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
140
+ magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
141
+ magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
142
+ magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
144
143
  magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
145
144
  magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
145
  magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
147
- magic_pdf/post_proc/pdf_post_filter.py,sha256=FeZceyjGG_UvBrBoa51Ohge5edQzCoJtZTaocidKCHg,2530
146
+ magic_pdf/post_proc/pdf_post_filter.py,sha256=3EJDovQPckPKJaBY1wvAty-LGKyRG63WICY_bA_Kfbs,2501
148
147
  magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
149
148
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
150
149
  magic_pdf/pre_proc/citationmarker_remove.py,sha256=IitOERaK9fGaktsYMyiaaL_71uMIrlG5ZdmpZaR6dsA,6640
151
150
  magic_pdf/pre_proc/construct_page_dict.py,sha256=lp3zBmInlWYYIcGC1-NSqT9s44AjDvlnWxDPeZoBVSY,3043
152
- magic_pdf/pre_proc/cut_image.py,sha256=bbeELTg2-SFyHkVEnGAL_7S6k8hyy1xtDSoFmXDQDOA,2768
151
+ magic_pdf/pre_proc/cut_image.py,sha256=TghshkDTgdUbyLSbKZoFI9-n-xaFub02IYPyu0IAnRY,2761
153
152
  magic_pdf/pre_proc/detect_equation.py,sha256=9omDHKTI8QO9Qd46eVFHWhZeMmTNx7XDuWRgjXI-KFA,6627
154
153
  magic_pdf/pre_proc/detect_footer_by_model.py,sha256=_EghAM_zWBcqVY8XBkbSoprKqKUa0mlN1U8YNWxNNLI,2848
155
154
  magic_pdf/pre_proc/detect_footer_header_by_statistics.py,sha256=924soXZ51QVpitPgVgnwbC7BqOZI30j5hGW5zP86y-w,11250
@@ -158,21 +157,21 @@ magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1
158
157
  magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
159
158
  magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
160
159
  magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
161
- magic_pdf/pre_proc/equations_replace.py,sha256=fXj7ZV7F3YtkDYrAhE9g5tHk4_3pVUyLbhDtMjbxjWU,20386
160
+ magic_pdf/pre_proc/equations_replace.py,sha256=7mexRPwD9C_UJ-SbvO_-XnpcnN7YtGUUznmPjHbjhnw,20526
162
161
  magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
163
162
  magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
164
163
  magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
165
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=omR2aOQNerKmKUF13CFP5-Z6Hv4GZztmsAQgLpHiugc,11341
164
+ magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=DMc2H2xGqVePBReZu5AQbPdvDw3sxOssmujCLlNW3Vs,14143
166
165
  magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
167
- magic_pdf/pre_proc/ocr_dict_merge.py,sha256=KyXSW36lJ1PsK816J55vVQQtXTWODX1xG1_DvA7pOSo,14248
168
- magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=jqBheXF8EuYCfS9tn6typr-aE57nfMoeBC36J5GjpbQ,11519
169
- magic_pdf/pre_proc/pdf_pre_filter.py,sha256=FIMwe8Lei9LI2RmkqiaSyTHV5b7ViADbpyBwgVwZH-c,2687
166
+ magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Au8y1NBhbWpq_VuPLg3b9dAMUhyPS71xtTghtd21K5M,14273
167
+ magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=9DxEyy1pH87g4T_JEgI3cTVCL2TVrEBl38wsmqhQM4k,12758
168
+ magic_pdf/pre_proc/pdf_pre_filter.py,sha256=qvNlNyj4Mc3qa73mgfkp0PMR-ucABbx3mMcyVipaEpQ,2776
170
169
  magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
- magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=u_ObNLkZ8pPDNBUkSMpA9ffiSpfz42B4807cdBPZmLU,3085
172
- magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=QbFCsiFFracBNC_kLlZgwvRQ-OQ8saexgbYABlhjDQE,3633
173
- magic_pdf/pre_proc/remove_footer_header.py,sha256=nUC28KXkaIPQZL2g94omcnfeyB4s3MBqo_-8KKvcZxQ,5691
174
- magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=0FlBXeiEwjZAGAWo-DiMptclFOj04POuH_dovSA4HUI,7772
175
- magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6TOCW0TLXbPii_Q,7307
170
+ magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
171
+ magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=WVKhgeWifRdO-u2ETYffkcMOFVYIbiaZu5pMr1RpEdA,4090
172
+ magic_pdf/pre_proc/remove_footer_header.py,sha256=Igdr4jH7BUGuTcapWPiKEGKxhWH12c3VVmX5xwUVn7w,5680
173
+ magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=di7geS7AFhSaAvkWZHT6J3dlXEq8uu9Z4oBYtolQjl0,8803
174
+ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=ABl0vo8kkcCPSTI8dpXQTOH1b9R-lbzsJDDFONU6ELk,7313
176
175
  magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
177
176
  magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
178
177
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
@@ -184,16 +183,16 @@ magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1Co
184
183
  magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
185
184
  magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
186
185
  magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
187
- magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,1131
186
+ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
188
187
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
189
- magic_pdf/tools/cli.py,sha256=yl2E-DYxBN3XF7bWOBseYxptbmeE7tXWpwV-sp2aGIE,3140
190
- magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,4122
191
- magic_pdf/tools/common.py,sha256=oo6DsbriyQv0azRNZSt4B-13eXvsMsPgE_kwgO0-aM8,7364
188
+ magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
189
+ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
190
+ magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
192
191
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
193
192
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
194
- magic_pdf-0.9.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
195
- magic_pdf-0.9.3.dist-info/METADATA,sha256=IpWvg-cnoZ9euLIh_3PYmPGh-DCQ8n8Lp2Ar4oyUfuc,40128
196
- magic_pdf-0.9.3.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
197
- magic_pdf-0.9.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
198
- magic_pdf-0.9.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
199
- magic_pdf-0.9.3.dist-info/RECORD,,
193
+ magic_pdf-0.10.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
194
+ magic_pdf-0.10.1.dist-info/METADATA,sha256=QdRsUeX9lmB2tTEFLT92qEWnPcgxIu7L0GeqTOHBGms,40300
195
+ magic_pdf-0.10.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
196
+ magic_pdf-0.10.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
197
+ magic_pdf-0.10.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
198
+ magic_pdf-0.10.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.45.0)
2
+ Generator: bdist_wheel (0.45.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,55 +0,0 @@
1
- """
2
- span维度自定义字段
3
- """
4
- # span是否是跨页合并的
5
- CROSS_PAGE = "cross_page"
6
-
7
- """
8
- block维度自定义字段
9
- """
10
- # block中lines是否被删除
11
- LINES_DELETED = "lines_deleted"
12
-
13
- # table recognition max time default value
14
- TABLE_MAX_TIME_VALUE = 400
15
-
16
- # pp_table_result_max_length
17
- TABLE_MAX_LEN = 480
18
-
19
- # table master structure dict
20
- TABLE_MASTER_DICT = "table_master_structure_dict.txt"
21
-
22
- # table master dir
23
- TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
24
-
25
- # pp detect model dir
26
- DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
27
-
28
- # pp rec model dir
29
- REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
30
-
31
- # pp rec char dict path
32
- REC_CHAR_DICT = "ppocr_keys_v1.txt"
33
-
34
- # pp rec copy rec directory
35
- PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
36
-
37
- # pp rec copy det directory
38
- PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
39
-
40
-
41
- class MODEL_NAME:
42
- # pp table structure algorithm
43
- TABLE_MASTER = "tablemaster"
44
- # struct eqtable
45
- STRUCT_EQTABLE = "struct_eqtable"
46
-
47
- DocLayout_YOLO = "doclayout_yolo"
48
-
49
- LAYOUTLMv3 = "layoutlmv3"
50
-
51
- YOLO_V8_MFD = "yolo_v8_mfd"
52
-
53
- UniMerNet_v2_Small = "unimernet_small"
54
-
55
- RAPID_TABLE = "rapid_table"
@@ -1,11 +0,0 @@
1
- class MakeMode:
2
- MM_MD = "mm_markdown"
3
- NLP_MD = "nlp_markdown"
4
- STANDARD_FORMAT = "standard_format"
5
-
6
-
7
- class DropMode:
8
- WHOLE_PDF = "whole_pdf"
9
- SINGLE_PAGE = "single_page"
10
- NONE = "none"
11
- NONE_WITH_REASON = "none_with_reason"
@@ -1,27 +0,0 @@
1
-
2
- class DropReason:
3
- TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序
4
- USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
5
- COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持
6
- TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
7
- COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
8
- HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs" # 含特殊图片,计算量太大,从而丢弃
9
- HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃
10
- HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大
11
- MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
12
- Exception = "_exception" # 解析中发生异常
13
- ENCRYPTED = "encrypted" # PDF是加密的
14
- EMPTY_PDF = "total_page=0" # PDF页面总数为0
15
- NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析
16
- DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block" # 无法清晰的分段
17
- TITLE_DETECTION_FAILED = "title_detection_failed" # 探测标题失败
18
- TITLE_LEVEL_FAILED = "title_level_failed" # 分析标题级别失败(例如一级、二级、三级标题)
19
- PARA_SPLIT_FAILED = "para_split_failed" # 识别段落失败
20
- PARA_MERGE_FAILED = "para_merge_failed" # 段落合并失败
21
- NOT_ALLOW_LANGUAGE = "not_allow_language" # 不支持的语种
22
- SPECIAL_PDF = "special_pdf"
23
- PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # 无法精确判断文字分栏
24
- CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # 无法分析页面的版面
25
- NEGATIVE_BBOX_AREA = "negative_bbox_area" # 缩放导致 bbox 面积为负
26
- OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation" # 无法分离重叠的block
27
-
@@ -1,19 +0,0 @@
1
-
2
- COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
3
- PAGE_NO = "page-no" # 页码
4
- CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
5
- VERTICAL_TEXT = 'vertical-text' # 垂直文本
6
- ROTATE_TEXT = 'rotate-text' # 旋转文本
7
- EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
8
- ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
9
- ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
10
-
11
-
12
- class DropTag:
13
- PAGE_NUMBER = "page_no"
14
- HEADER = "header"
15
- FOOTER = "footer"
16
- FOOTNOTE = "footnote"
17
- NOT_IN_LAYOUT = "not_in_layout"
18
- SPAN_OVERLAP = "span_overlap"
19
- BLOCK_OVERLAP = "block_overlap"