magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
magic_pdf/user_api.py
CHANGED
@@ -1,36 +1,28 @@
|
|
1
|
-
"""
|
2
|
-
用户输入:
|
3
|
-
model数组,每个元素代表一个页面
|
4
|
-
pdf在s3的路径
|
5
|
-
截图保存的s3位置
|
1
|
+
"""用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
|
6
2
|
|
7
3
|
然后:
|
8
4
|
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
|
9
5
|
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
|
10
6
|
|
11
7
|
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
|
12
|
-
|
13
8
|
"""
|
14
|
-
import re
|
15
9
|
|
16
10
|
from loguru import logger
|
17
11
|
|
12
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
18
13
|
from magic_pdf.libs.version import __version__
|
19
14
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
20
|
-
from magic_pdf.rw import AbsReaderWriter
|
21
15
|
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
22
16
|
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
|
23
17
|
|
24
|
-
PARSE_TYPE_TXT =
|
25
|
-
PARSE_TYPE_OCR =
|
18
|
+
PARSE_TYPE_TXT = 'txt'
|
19
|
+
PARSE_TYPE_OCR = 'ocr'
|
26
20
|
|
27
21
|
|
28
|
-
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter:
|
22
|
+
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
|
29
23
|
start_page_id=0, end_page_id=None, lang=None,
|
30
24
|
*args, **kwargs):
|
31
|
-
"""
|
32
|
-
解析文本类pdf
|
33
|
-
"""
|
25
|
+
"""解析文本类pdf."""
|
34
26
|
pdf_info_dict = parse_pdf_by_txt(
|
35
27
|
pdf_bytes,
|
36
28
|
pdf_models,
|
@@ -38,24 +30,23 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
38
30
|
start_page_id=start_page_id,
|
39
31
|
end_page_id=end_page_id,
|
40
32
|
debug_mode=is_debug,
|
33
|
+
lang=lang,
|
41
34
|
)
|
42
35
|
|
43
|
-
pdf_info_dict[
|
36
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
|
44
37
|
|
45
|
-
pdf_info_dict[
|
38
|
+
pdf_info_dict['_version_name'] = __version__
|
46
39
|
|
47
40
|
if lang is not None:
|
48
|
-
pdf_info_dict[
|
41
|
+
pdf_info_dict['_lang'] = lang
|
49
42
|
|
50
43
|
return pdf_info_dict
|
51
44
|
|
52
45
|
|
53
|
-
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter:
|
46
|
+
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
|
54
47
|
start_page_id=0, end_page_id=None, lang=None,
|
55
48
|
*args, **kwargs):
|
56
|
-
"""
|
57
|
-
解析ocr类pdf
|
58
|
-
"""
|
49
|
+
"""解析ocr类pdf."""
|
59
50
|
pdf_info_dict = parse_pdf_by_ocr(
|
60
51
|
pdf_bytes,
|
61
52
|
pdf_models,
|
@@ -63,25 +54,24 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
63
54
|
start_page_id=start_page_id,
|
64
55
|
end_page_id=end_page_id,
|
65
56
|
debug_mode=is_debug,
|
57
|
+
lang=lang,
|
66
58
|
)
|
67
59
|
|
68
|
-
pdf_info_dict[
|
60
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
|
69
61
|
|
70
|
-
pdf_info_dict[
|
62
|
+
pdf_info_dict['_version_name'] = __version__
|
71
63
|
|
72
64
|
if lang is not None:
|
73
|
-
pdf_info_dict[
|
65
|
+
pdf_info_dict['_lang'] = lang
|
74
66
|
|
75
67
|
return pdf_info_dict
|
76
68
|
|
77
69
|
|
78
|
-
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter:
|
70
|
+
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
|
79
71
|
input_model_is_empty: bool = False,
|
80
72
|
start_page_id=0, end_page_id=None, lang=None,
|
81
73
|
*args, **kwargs):
|
82
|
-
"""
|
83
|
-
ocr和文本混合的pdf,全部解析出来
|
84
|
-
"""
|
74
|
+
"""ocr和文本混合的pdf,全部解析出来."""
|
85
75
|
|
86
76
|
def parse_pdf(method):
|
87
77
|
try:
|
@@ -92,18 +82,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
92
82
|
start_page_id=start_page_id,
|
93
83
|
end_page_id=end_page_id,
|
94
84
|
debug_mode=is_debug,
|
85
|
+
lang=lang,
|
95
86
|
)
|
96
87
|
except Exception as e:
|
97
88
|
logger.exception(e)
|
98
89
|
return None
|
99
90
|
|
100
91
|
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
|
101
|
-
if pdf_info_dict is None or pdf_info_dict.get(
|
102
|
-
logger.warning(
|
92
|
+
if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
|
93
|
+
logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
|
103
94
|
if input_model_is_empty:
|
104
|
-
layout_model = kwargs.get(
|
105
|
-
formula_enable = kwargs.get(
|
106
|
-
table_enable = kwargs.get(
|
95
|
+
layout_model = kwargs.get('layout_model', None)
|
96
|
+
formula_enable = kwargs.get('formula_enable', None)
|
97
|
+
table_enable = kwargs.get('table_enable', None)
|
107
98
|
pdf_models = doc_analyze(
|
108
99
|
pdf_bytes,
|
109
100
|
ocr=True,
|
@@ -116,15 +107,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
116
107
|
)
|
117
108
|
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
118
109
|
if pdf_info_dict is None:
|
119
|
-
raise Exception(
|
110
|
+
raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
|
120
111
|
else:
|
121
|
-
pdf_info_dict[
|
112
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
|
122
113
|
else:
|
123
|
-
pdf_info_dict[
|
114
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
|
124
115
|
|
125
|
-
pdf_info_dict[
|
116
|
+
pdf_info_dict['_version_name'] = __version__
|
126
117
|
|
127
118
|
if lang is not None:
|
128
|
-
pdf_info_dict[
|
119
|
+
pdf_info_dict['_lang'] = lang
|
129
120
|
|
130
121
|
return pdf_info_dict
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.10.1
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -38,6 +38,8 @@ Provides-Extra: lite
|
|
38
38
|
Requires-Dist: paddleocr==2.7.3; extra == "lite"
|
39
39
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
|
40
40
|
Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
|
41
|
+
Provides-Extra: old_linux
|
42
|
+
Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
41
43
|
|
42
44
|
<div align="center" xmlns="http://www.w3.org/1999/html">
|
43
45
|
<!-- logo -->
|
@@ -83,6 +85,9 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
83
85
|
</div>
|
84
86
|
|
85
87
|
# Changelog
|
88
|
+
- 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities,
|
89
|
+
- Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
|
90
|
+
- Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
|
86
91
|
- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
|
87
92
|
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
|
88
93
|
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
|
@@ -162,7 +167,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
|
162
167
|
- Preserve the structure of the original document, including headings, paragraphs, lists, etc.
|
163
168
|
- Extract images, image descriptions, tables, table titles, and footnotes.
|
164
169
|
- Automatically recognize and convert formulas in the document to LaTeX format.
|
165
|
-
- Automatically recognize and convert tables in the document to
|
170
|
+
- Automatically recognize and convert tables in the document to HTML format.
|
166
171
|
- Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
|
167
172
|
- OCR supports detection and recognition of 84 languages.
|
168
173
|
- Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
|
@@ -226,17 +231,11 @@ There are three different ways to experience MinerU:
|
|
226
231
|
</tr>
|
227
232
|
<tr>
|
228
233
|
<td rowspan="2">GPU Hardware Support List</td>
|
229
|
-
<td colspan="2">
|
230
|
-
<td colspan="2">
|
231
|
-
8G VRAM
|
234
|
+
<td colspan="2">GPU VRAM 8GB or more</td>
|
235
|
+
<td colspan="2">2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
|
236
|
+
8G VRAM can enable all acceleration features</td>
|
232
237
|
<td rowspan="2">None</td>
|
233
238
|
</tr>
|
234
|
-
<tr>
|
235
|
-
<td colspan="2">Recommended Configuration 10G+ VRAM</td>
|
236
|
-
<td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
|
237
|
-
10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
|
238
|
-
</td>
|
239
|
-
</tr>
|
240
239
|
</table>
|
241
240
|
|
242
241
|
### Online Demo
|
@@ -288,7 +287,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
288
287
|
"enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
|
289
288
|
},
|
290
289
|
"table-config": {
|
291
|
-
"model": "rapid_table", //
|
290
|
+
"model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable".
|
292
291
|
"enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
|
293
292
|
"max_time": 400
|
294
293
|
}
|
@@ -1,20 +1,26 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/pdf_parse_by_ocr.py,sha256=
|
3
|
-
magic_pdf/pdf_parse_by_txt.py,sha256=
|
4
|
-
magic_pdf/pdf_parse_union_core.py,sha256=
|
5
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=
|
6
|
-
magic_pdf/user_api.py,sha256=
|
2
|
+
magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
|
3
|
+
magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
|
4
|
+
magic_pdf/pdf_parse_union_core.py,sha256=w90lFIMOYUMAq4iv8bpsbBtLXFphPV4HyYeqbOTYQUI,12420
|
5
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=EqEi9AahBBh2JbXoY8uOCmClvi9W_H_26U4jK8RwPwU,31308
|
6
|
+
magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
|
7
7
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
+
magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
|
9
|
+
magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
|
10
|
+
magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
|
8
11
|
magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
|
9
12
|
magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zkE,622
|
13
|
+
magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
|
14
|
+
magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
|
15
|
+
magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
|
10
16
|
magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
17
|
magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
|
12
|
-
magic_pdf/data/read_api.py,sha256=
|
18
|
+
magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
|
13
19
|
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
14
20
|
magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
|
15
21
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
16
22
|
magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
|
17
|
-
magic_pdf/data/data_reader_writer/filebase.py,sha256=
|
23
|
+
magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
|
18
24
|
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
|
19
25
|
magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
|
20
26
|
magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
|
@@ -22,59 +28,53 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
22
28
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
23
29
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
24
30
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
|
-
magic_pdf/dict2md/mkcontent.py,sha256=
|
26
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
31
|
+
magic_pdf/dict2md/mkcontent.py,sha256=bMQK7uiay76YaWA92VIK57YajINV20SnOs65wOEXyKE,18667
|
32
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
|
27
33
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
34
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
29
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
35
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=h4D4O0OeAlEy2A8mJ6E0aQ8wIizIfsIxEagbjaomnAo,17823
|
30
36
|
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
37
|
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
38
|
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
33
39
|
magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
|
34
|
-
magic_pdf/integrations/rag/utils.py,sha256=
|
40
|
+
magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
|
35
41
|
magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
42
|
magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
|
37
43
|
magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
|
38
44
|
magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
|
39
45
|
magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
|
40
46
|
magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
|
41
|
-
magic_pdf/libs/Constants.py,sha256=ptiwMvWDUmzRZ0IbP1bM3PjGJ24BQVQQHO4sCeioPv8,1173
|
42
|
-
magic_pdf/libs/MakeContentConfig.py,sha256=Do5VKNQp3gfUKyhrZStfzfBj7l-vbsYpsJFF1SsmEc0,248
|
43
|
-
magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
|
44
47
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
48
|
magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
|
46
49
|
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
47
50
|
magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
|
48
51
|
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
49
|
-
magic_pdf/libs/config_reader.py,sha256=
|
52
|
+
magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
|
50
53
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
51
54
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
52
55
|
magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
|
53
|
-
magic_pdf/libs/draw_bbox.py,sha256=
|
54
|
-
magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw,2148
|
55
|
-
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
56
|
+
magic_pdf/libs/draw_bbox.py,sha256=NhAfqib5HYuGjjrAG_SvJR-yOHZTy6tzDxLXdxKlULQ,17676
|
56
57
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
57
58
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
58
59
|
magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
|
59
60
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
60
61
|
magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
|
61
62
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
62
|
-
magic_pdf/libs/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
|
63
63
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
64
64
|
magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
|
65
|
-
magic_pdf/libs/pdf_image_tools.py,sha256=
|
65
|
+
magic_pdf/libs/pdf_image_tools.py,sha256=sh8hgBQu_83R71qBLodOFdByBUuQujsOMfgpSD9mrhE,1981
|
66
66
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
67
67
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
68
|
-
magic_pdf/libs/version.py,sha256=
|
68
|
+
magic_pdf/libs/version.py,sha256=v7Gyp89umFzDtY45tTjCdXqZnQ2RN01AibdYNxEvxYo,23
|
69
69
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
70
70
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
71
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
72
|
-
magic_pdf/model/magic_model.py,sha256=
|
71
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=HOT6chGx2VPyH6O9WB0c6xGPeDs9m_6oZn3iOa745yw,7125
|
72
|
+
magic_pdf/model/magic_model.py,sha256=8nJLzNCa0Ag4JhMAQbjj5qrkj617qKPCXVJAiT9DnaA,43472
|
73
73
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
74
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
74
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
|
75
75
|
magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
|
76
76
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
|
-
magic_pdf/model/sub_modules/model_init.py,sha256=
|
77
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
|
78
78
|
magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
|
79
79
|
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
80
80
|
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
|
@@ -107,8 +107,8 @@ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1
|
|
107
107
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
108
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
109
|
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=
|
111
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=
|
110
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=UP7fADPGoxAMj2SUKmeW-fe_AcAQxlT9Mfy4WF6vHmU,9796
|
111
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=a6xkQHqLMUL4NCaORp8oo4Tfa8GB8PN9MVvG7Pj6jIE,7316
|
112
112
|
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
113
113
|
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
114
|
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -117,11 +117,11 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
|
|
117
117
|
magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
118
118
|
magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
|
119
119
|
magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
-
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=
|
120
|
+
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx6DCnhqYzP-4b1zSWptrefimxFTmy8Q,583
|
121
121
|
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
122
|
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
123
123
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
124
|
-
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=
|
124
|
+
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=AdH3UGu4BEoII0uFjPKUf61W7HmG4fDlWgR1xxMeFlE,2775
|
125
125
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
126
126
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
127
127
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -130,26 +130,25 @@ magic_pdf/para/denoise.py,sha256=J7dM2KNnbdzAd2A3agB04U6L1GL9RrhAs-MLrq-_Ftg,104
|
|
130
130
|
magic_pdf/para/draw.py,sha256=KyWc03do_WuBKQ028HYzepYwbIkel9ID0uqRhuPVOHc,5643
|
131
131
|
magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,4978
|
132
132
|
magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
|
133
|
-
magic_pdf/para/
|
134
|
-
magic_pdf/para/
|
135
|
-
magic_pdf/para/
|
136
|
-
magic_pdf/para/para_split_v3.py,sha256=vSJ5_QqGKP1rbTbGQg5ONNpybidpTdbgXZgTGd2bGsw,14539
|
133
|
+
magic_pdf/para/para_split.py,sha256=z7nYeg86BjZOAdJNMwYKSu51W9evurtl3cy1ZUcQLlw,33222
|
134
|
+
magic_pdf/para/para_split_v2.py,sha256=vJJqqMMKbv8D702nODThL-5hjkgZ7Vl2BTmEIdwmmDw,39051
|
135
|
+
magic_pdf/para/para_split_v3.py,sha256=atfELVRx-90paAS3nZptgP0qG8UpTTaj3LG_2x3NAlQ,15977
|
137
136
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
138
137
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
139
138
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
140
|
-
magic_pdf/pipe/AbsPipe.py,sha256=
|
141
|
-
magic_pdf/pipe/OCRPipe.py,sha256=
|
142
|
-
magic_pdf/pipe/TXTPipe.py,sha256=
|
143
|
-
magic_pdf/pipe/UNIPipe.py,sha256=
|
139
|
+
magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
|
140
|
+
magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
|
141
|
+
magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
|
142
|
+
magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
|
144
143
|
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
145
144
|
magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
146
145
|
magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
|
147
|
-
magic_pdf/post_proc/pdf_post_filter.py,sha256=
|
146
|
+
magic_pdf/post_proc/pdf_post_filter.py,sha256=3EJDovQPckPKJaBY1wvAty-LGKyRG63WICY_bA_Kfbs,2501
|
148
147
|
magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
|
149
148
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
150
149
|
magic_pdf/pre_proc/citationmarker_remove.py,sha256=IitOERaK9fGaktsYMyiaaL_71uMIrlG5ZdmpZaR6dsA,6640
|
151
150
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=lp3zBmInlWYYIcGC1-NSqT9s44AjDvlnWxDPeZoBVSY,3043
|
152
|
-
magic_pdf/pre_proc/cut_image.py,sha256=
|
151
|
+
magic_pdf/pre_proc/cut_image.py,sha256=TghshkDTgdUbyLSbKZoFI9-n-xaFub02IYPyu0IAnRY,2761
|
153
152
|
magic_pdf/pre_proc/detect_equation.py,sha256=9omDHKTI8QO9Qd46eVFHWhZeMmTNx7XDuWRgjXI-KFA,6627
|
154
153
|
magic_pdf/pre_proc/detect_footer_by_model.py,sha256=_EghAM_zWBcqVY8XBkbSoprKqKUa0mlN1U8YNWxNNLI,2848
|
155
154
|
magic_pdf/pre_proc/detect_footer_header_by_statistics.py,sha256=924soXZ51QVpitPgVgnwbC7BqOZI30j5hGW5zP86y-w,11250
|
@@ -158,21 +157,21 @@ magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1
|
|
158
157
|
magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
|
159
158
|
magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
|
160
159
|
magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
|
161
|
-
magic_pdf/pre_proc/equations_replace.py,sha256=
|
160
|
+
magic_pdf/pre_proc/equations_replace.py,sha256=7mexRPwD9C_UJ-SbvO_-XnpcnN7YtGUUznmPjHbjhnw,20526
|
162
161
|
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
163
162
|
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
164
163
|
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
165
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=
|
164
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=DMc2H2xGqVePBReZu5AQbPdvDw3sxOssmujCLlNW3Vs,14143
|
166
165
|
magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
|
167
|
-
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=
|
168
|
-
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=
|
169
|
-
magic_pdf/pre_proc/pdf_pre_filter.py,sha256=
|
166
|
+
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Au8y1NBhbWpq_VuPLg3b9dAMUhyPS71xtTghtd21K5M,14273
|
167
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=9DxEyy1pH87g4T_JEgI3cTVCL2TVrEBl38wsmqhQM4k,12758
|
168
|
+
magic_pdf/pre_proc/pdf_pre_filter.py,sha256=qvNlNyj4Mc3qa73mgfkp0PMR-ucABbx3mMcyVipaEpQ,2776
|
170
169
|
magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
171
|
-
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=
|
172
|
-
magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=
|
173
|
-
magic_pdf/pre_proc/remove_footer_header.py,sha256=
|
174
|
-
magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=
|
175
|
-
magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=
|
170
|
+
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
|
171
|
+
magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=WVKhgeWifRdO-u2ETYffkcMOFVYIbiaZu5pMr1RpEdA,4090
|
172
|
+
magic_pdf/pre_proc/remove_footer_header.py,sha256=Igdr4jH7BUGuTcapWPiKEGKxhWH12c3VVmX5xwUVn7w,5680
|
173
|
+
magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=di7geS7AFhSaAvkWZHT6J3dlXEq8uu9Z4oBYtolQjl0,8803
|
174
|
+
magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=ABl0vo8kkcCPSTI8dpXQTOH1b9R-lbzsJDDFONU6ELk,7313
|
176
175
|
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
177
176
|
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
178
177
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
@@ -184,16 +183,16 @@ magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1Co
|
|
184
183
|
magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
|
185
184
|
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
186
185
|
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
187
|
-
magic_pdf/spark/spark_api.py,sha256=
|
186
|
+
magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
|
188
187
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
189
|
-
magic_pdf/tools/cli.py,sha256=
|
190
|
-
magic_pdf/tools/cli_dev.py,sha256=
|
191
|
-
magic_pdf/tools/common.py,sha256=
|
188
|
+
magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
|
189
|
+
magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
|
190
|
+
magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
|
192
191
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
193
192
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
194
|
-
magic_pdf-0.
|
195
|
-
magic_pdf-0.
|
196
|
-
magic_pdf-0.
|
197
|
-
magic_pdf-0.
|
198
|
-
magic_pdf-0.
|
199
|
-
magic_pdf-0.
|
193
|
+
magic_pdf-0.10.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
194
|
+
magic_pdf-0.10.1.dist-info/METADATA,sha256=QdRsUeX9lmB2tTEFLT92qEWnPcgxIu7L0GeqTOHBGms,40300
|
195
|
+
magic_pdf-0.10.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
196
|
+
magic_pdf-0.10.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
197
|
+
magic_pdf-0.10.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
198
|
+
magic_pdf-0.10.1.dist-info/RECORD,,
|
magic_pdf/libs/Constants.py
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
span维度自定义字段
|
3
|
-
"""
|
4
|
-
# span是否是跨页合并的
|
5
|
-
CROSS_PAGE = "cross_page"
|
6
|
-
|
7
|
-
"""
|
8
|
-
block维度自定义字段
|
9
|
-
"""
|
10
|
-
# block中lines是否被删除
|
11
|
-
LINES_DELETED = "lines_deleted"
|
12
|
-
|
13
|
-
# table recognition max time default value
|
14
|
-
TABLE_MAX_TIME_VALUE = 400
|
15
|
-
|
16
|
-
# pp_table_result_max_length
|
17
|
-
TABLE_MAX_LEN = 480
|
18
|
-
|
19
|
-
# table master structure dict
|
20
|
-
TABLE_MASTER_DICT = "table_master_structure_dict.txt"
|
21
|
-
|
22
|
-
# table master dir
|
23
|
-
TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
|
24
|
-
|
25
|
-
# pp detect model dir
|
26
|
-
DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
|
27
|
-
|
28
|
-
# pp rec model dir
|
29
|
-
REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
|
30
|
-
|
31
|
-
# pp rec char dict path
|
32
|
-
REC_CHAR_DICT = "ppocr_keys_v1.txt"
|
33
|
-
|
34
|
-
# pp rec copy rec directory
|
35
|
-
PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
|
36
|
-
|
37
|
-
# pp rec copy det directory
|
38
|
-
PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
|
39
|
-
|
40
|
-
|
41
|
-
class MODEL_NAME:
|
42
|
-
# pp table structure algorithm
|
43
|
-
TABLE_MASTER = "tablemaster"
|
44
|
-
# struct eqtable
|
45
|
-
STRUCT_EQTABLE = "struct_eqtable"
|
46
|
-
|
47
|
-
DocLayout_YOLO = "doclayout_yolo"
|
48
|
-
|
49
|
-
LAYOUTLMv3 = "layoutlmv3"
|
50
|
-
|
51
|
-
YOLO_V8_MFD = "yolo_v8_mfd"
|
52
|
-
|
53
|
-
UniMerNet_v2_Small = "unimernet_small"
|
54
|
-
|
55
|
-
RAPID_TABLE = "rapid_table"
|
magic_pdf/libs/drop_reason.py
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
|
2
|
-
class DropReason:
|
3
|
-
TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序
|
4
|
-
USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
|
5
|
-
COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持
|
6
|
-
TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
|
7
|
-
COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
|
8
|
-
HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs" # 含特殊图片,计算量太大,从而丢弃
|
9
|
-
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃
|
10
|
-
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大
|
11
|
-
MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
|
12
|
-
Exception = "_exception" # 解析中发生异常
|
13
|
-
ENCRYPTED = "encrypted" # PDF是加密的
|
14
|
-
EMPTY_PDF = "total_page=0" # PDF页面总数为0
|
15
|
-
NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析
|
16
|
-
DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block" # 无法清晰的分段
|
17
|
-
TITLE_DETECTION_FAILED = "title_detection_failed" # 探测标题失败
|
18
|
-
TITLE_LEVEL_FAILED = "title_level_failed" # 分析标题级别失败(例如一级、二级、三级标题)
|
19
|
-
PARA_SPLIT_FAILED = "para_split_failed" # 识别段落失败
|
20
|
-
PARA_MERGE_FAILED = "para_merge_failed" # 段落合并失败
|
21
|
-
NOT_ALLOW_LANGUAGE = "not_allow_language" # 不支持的语种
|
22
|
-
SPECIAL_PDF = "special_pdf"
|
23
|
-
PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # 无法精确判断文字分栏
|
24
|
-
CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # 无法分析页面的版面
|
25
|
-
NEGATIVE_BBOX_AREA = "negative_bbox_area" # 缩放导致 bbox 面积为负
|
26
|
-
OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation" # 无法分离重叠的block
|
27
|
-
|
magic_pdf/libs/drop_tag.py
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
|
2
|
-
COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
|
3
|
-
PAGE_NO = "page-no" # 页码
|
4
|
-
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
|
5
|
-
VERTICAL_TEXT = 'vertical-text' # 垂直文本
|
6
|
-
ROTATE_TEXT = 'rotate-text' # 旋转文本
|
7
|
-
EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
|
8
|
-
ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
|
9
|
-
ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
|
10
|
-
|
11
|
-
|
12
|
-
class DropTag:
|
13
|
-
PAGE_NUMBER = "page_no"
|
14
|
-
HEADER = "header"
|
15
|
-
FOOTER = "footer"
|
16
|
-
FOOTNOTE = "footnote"
|
17
|
-
NOT_IN_LAYOUT = "not_in_layout"
|
18
|
-
SPAN_OVERLAP = "span_overlap"
|
19
|
-
BLOCK_OVERLAP = "block_overlap"
|