magic-pdf 0.10.4__py3-none-any.whl → 0.10.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. magic_pdf/config/constants.py +5 -0
  2. magic_pdf/data/data_reader_writer/base.py +13 -1
  3. magic_pdf/data/dataset.py +175 -4
  4. magic_pdf/data/utils.py +2 -2
  5. magic_pdf/dict2md/ocr_mkcontent.py +2 -2
  6. magic_pdf/filter/__init__.py +32 -0
  7. magic_pdf/filter/pdf_meta_scan.py +3 -2
  8. magic_pdf/libs/draw_bbox.py +11 -10
  9. magic_pdf/libs/pdf_check.py +30 -30
  10. magic_pdf/libs/version.py +1 -1
  11. magic_pdf/model/__init__.py +124 -0
  12. magic_pdf/model/doc_analyze_by_custom_model.py +119 -60
  13. magic_pdf/model/operators.py +190 -0
  14. magic_pdf/model/pdf_extract_kit.py +20 -1
  15. magic_pdf/model/sub_modules/model_init.py +13 -3
  16. magic_pdf/model/sub_modules/model_utils.py +11 -5
  17. magic_pdf/para/para_split_v3.py +2 -2
  18. magic_pdf/pdf_parse_by_ocr.py +4 -5
  19. magic_pdf/pdf_parse_by_txt.py +4 -5
  20. magic_pdf/pdf_parse_union_core_v2.py +10 -11
  21. magic_pdf/pipe/AbsPipe.py +3 -2
  22. magic_pdf/pipe/OCRPipe.py +54 -15
  23. magic_pdf/pipe/TXTPipe.py +5 -4
  24. magic_pdf/pipe/UNIPipe.py +82 -30
  25. magic_pdf/pipe/operators.py +138 -0
  26. magic_pdf/pre_proc/cut_image.py +2 -2
  27. magic_pdf/tools/common.py +108 -59
  28. magic_pdf/user_api.py +47 -24
  29. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/METADATA +7 -4
  30. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/RECORD +34 -32
  31. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py CHANGED
@@ -1,5 +1,3 @@
1
- import copy
2
- import json as json_parse
3
1
  import os
4
2
 
5
3
  import click
@@ -7,13 +5,12 @@ import fitz
7
5
  from loguru import logger
8
6
 
9
7
  import magic_pdf.model as model_config
8
+ from magic_pdf.config.enums import SupportedPdfParseMethod
10
9
  from magic_pdf.config.make_content_config import DropMode, MakeMode
11
10
  from magic_pdf.data.data_reader_writer import FileBasedDataWriter
12
- from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
13
- draw_model_bbox, draw_span_bbox)
14
- from magic_pdf.pipe.OCRPipe import OCRPipe
15
- from magic_pdf.pipe.TXTPipe import TXTPipe
16
- from magic_pdf.pipe.UNIPipe import UNIPipe
11
+ from magic_pdf.data.dataset import PymuDocDataset
12
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
13
+ from magic_pdf.model.operators import InferenceResult
17
14
 
18
15
  # from io import BytesIO
19
16
  # from pypdf import PdfReader, PdfWriter
@@ -56,7 +53,11 @@ def prepare_env(output_dir, pdf_file_name, method):
56
53
  def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
57
54
  document = fitz.open('pdf', pdf_bytes)
58
55
  output_document = fitz.open()
59
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
56
+ end_page_id = (
57
+ end_page_id
58
+ if end_page_id is not None and end_page_id >= 0
59
+ else len(document) - 1
60
+ )
60
61
  if end_page_id > len(document) - 1:
61
62
  logger.warning('end_page_id is out of range, use pdf_docs length')
62
63
  end_page_id = len(document) - 1
@@ -94,78 +95,126 @@ def do_parse(
94
95
  f_draw_model_bbox = True
95
96
  f_draw_line_sort_bbox = True
96
97
 
97
- if lang == "":
98
+ if lang == '':
98
99
  lang = None
99
100
 
100
- pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
101
+ pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
102
+ pdf_bytes, start_page_id, end_page_id
103
+ )
101
104
 
102
- orig_model_list = copy.deepcopy(model_list)
103
- local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
104
- parse_method)
105
+ local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
105
106
 
106
- image_writer, md_writer = FileBasedDataWriter(
107
- local_image_dir), FileBasedDataWriter(local_md_dir)
107
+ image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
108
+ local_md_dir
109
+ )
108
110
  image_dir = str(os.path.basename(local_image_dir))
109
111
 
110
- if parse_method == 'auto':
111
- jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
112
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
113
- # start_page_id=start_page_id, end_page_id=end_page_id,
114
- lang=lang,
115
- layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
116
- elif parse_method == 'txt':
117
- pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
118
- # start_page_id=start_page_id, end_page_id=end_page_id,
119
- lang=lang,
120
- layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
121
- elif parse_method == 'ocr':
122
- pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
123
- # start_page_id=start_page_id, end_page_id=end_page_id,
124
- lang=lang,
125
- layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
126
- else:
127
- logger.error('unknown parse method')
128
- exit(1)
129
-
130
- pipe.pipe_classify()
112
+ ds = PymuDocDataset(pdf_bytes)
131
113
 
132
114
  if len(model_list) == 0:
133
115
  if model_config.__use_inside_model__:
134
- pipe.pipe_analyze()
135
- orig_model_list = copy.deepcopy(pipe.model_list)
116
+ if parse_method == 'auto':
117
+ if ds.classify() == SupportedPdfParseMethod.TXT:
118
+ infer_result = ds.apply(
119
+ doc_analyze,
120
+ ocr=False,
121
+ lang=lang,
122
+ layout_model=layout_model,
123
+ formula_enable=formula_enable,
124
+ table_enable=table_enable,
125
+ )
126
+ pipe_result = infer_result.pipe_txt_mode(
127
+ image_writer, debug_mode=True, lang=lang
128
+ )
129
+ else:
130
+ infer_result = ds.apply(
131
+ doc_analyze,
132
+ ocr=True,
133
+ lang=lang,
134
+ layout_model=layout_model,
135
+ formula_enable=formula_enable,
136
+ table_enable=table_enable,
137
+ )
138
+ pipe_result = infer_result.pipe_ocr_mode(
139
+ image_writer, debug_mode=True, lang=lang
140
+ )
141
+
142
+ elif parse_method == 'txt':
143
+ infer_result = ds.apply(
144
+ doc_analyze,
145
+ ocr=False,
146
+ lang=lang,
147
+ layout_model=layout_model,
148
+ formula_enable=formula_enable,
149
+ table_enable=table_enable,
150
+ )
151
+ pipe_result = infer_result.pipe_txt_mode(
152
+ image_writer, debug_mode=True, lang=lang
153
+ )
154
+ elif parse_method == 'ocr':
155
+ infer_result = ds.apply(
156
+ doc_analyze,
157
+ ocr=True,
158
+ lang=lang,
159
+ layout_model=layout_model,
160
+ formula_enable=formula_enable,
161
+ table_enable=table_enable,
162
+ )
163
+ pipe_result = infer_result.pipe_ocr_mode(
164
+ image_writer, debug_mode=True, lang=lang
165
+ )
166
+ else:
167
+ logger.error('unknown parse method')
168
+ exit(1)
136
169
  else:
137
170
  logger.error('need model list input')
138
171
  exit(2)
172
+ else:
173
+ infer_result = InferenceResult(model_list, ds)
174
+ if parse_method == 'ocr':
175
+ pipe_result = infer_result.pipe_ocr_mode(
176
+ image_writer, debug_mode=True, lang=lang
177
+ )
178
+ elif parse_method == 'txt':
179
+ pipe_result = infer_result.pipe_txt_mode(
180
+ image_writer, debug_mode=True, lang=lang
181
+ )
182
+ else:
183
+ pipe_result = infer_result.pipe_auto_mode(
184
+ image_writer, debug_mode=True, lang=lang
185
+ )
186
+
187
+ if f_draw_model_bbox:
188
+ infer_result.draw_model(
189
+ os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
190
+ )
139
191
 
140
- pipe.pipe_parse()
141
- pdf_info = pipe.pdf_mid_data['pdf_info']
142
192
  if f_draw_layout_bbox:
143
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
193
+ pipe_result.draw_layout(
194
+ os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
195
+ )
144
196
  if f_draw_span_bbox:
145
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
146
- if f_draw_model_bbox:
147
- draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
197
+ pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
198
+
148
199
  if f_draw_line_sort_bbox:
149
- draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
200
+ pipe_result.draw_line_sort(
201
+ os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
202
+ )
150
203
 
151
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
152
204
  if f_dump_md:
153
- md_writer.write_string(
205
+ pipe_result.dump_md(
206
+ md_writer,
154
207
  f'{pdf_file_name}.md',
155
- md_content
208
+ image_dir,
209
+ drop_mode=DropMode.NONE,
210
+ md_make_mode=f_make_md_mode,
156
211
  )
157
212
 
158
213
  if f_dump_middle_json:
159
- md_writer.write_string(
160
- f'{pdf_file_name}_middle.json',
161
- json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
162
- )
214
+ pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')
163
215
 
164
216
  if f_dump_model_json:
165
- md_writer.write_string(
166
- f'{pdf_file_name}_model.json',
167
- json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
168
- )
217
+ infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')
169
218
 
170
219
  if f_dump_orig_pdf:
171
220
  md_writer.write(
@@ -173,11 +222,11 @@ def do_parse(
173
222
  pdf_bytes,
174
223
  )
175
224
 
176
- content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
177
225
  if f_dump_content_list:
178
- md_writer.write_string(
226
+ pipe_result.dump_content_list(
227
+ md_writer,
179
228
  f'{pdf_file_name}_content_list.json',
180
- json_parse.dumps(content_list, ensure_ascii=False, indent=4)
229
+ image_dir
181
230
  )
182
231
 
183
232
  logger.info(f'local output dir is {local_md_dir}')
magic_pdf/user_api.py CHANGED
@@ -10,22 +10,29 @@
10
10
  from loguru import logger
11
11
 
12
12
  from magic_pdf.data.data_reader_writer import DataWriter
13
+ from magic_pdf.data.dataset import Dataset
13
14
  from magic_pdf.libs.version import __version__
14
15
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
15
16
  from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
16
17
  from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
17
-
18
- PARSE_TYPE_TXT = 'txt'
19
- PARSE_TYPE_OCR = 'ocr'
20
-
21
-
22
- def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
23
- start_page_id=0, end_page_id=None, lang=None,
24
- *args, **kwargs):
18
+ from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
19
+
20
+
21
+ def parse_txt_pdf(
22
+ dataset: Dataset,
23
+ model_list: list,
24
+ imageWriter: DataWriter,
25
+ is_debug=False,
26
+ start_page_id=0,
27
+ end_page_id=None,
28
+ lang=None,
29
+ *args,
30
+ **kwargs
31
+ ):
25
32
  """解析文本类pdf."""
26
33
  pdf_info_dict = parse_pdf_by_txt(
27
- pdf_bytes,
28
- pdf_models,
34
+ dataset,
35
+ model_list,
29
36
  imageWriter,
30
37
  start_page_id=start_page_id,
31
38
  end_page_id=end_page_id,
@@ -43,13 +50,21 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, i
43
50
  return pdf_info_dict
44
51
 
45
52
 
46
- def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
47
- start_page_id=0, end_page_id=None, lang=None,
48
- *args, **kwargs):
53
+ def parse_ocr_pdf(
54
+ dataset: Dataset,
55
+ model_list: list,
56
+ imageWriter: DataWriter,
57
+ is_debug=False,
58
+ start_page_id=0,
59
+ end_page_id=None,
60
+ lang=None,
61
+ *args,
62
+ **kwargs
63
+ ):
49
64
  """解析ocr类pdf."""
50
65
  pdf_info_dict = parse_pdf_by_ocr(
51
- pdf_bytes,
52
- pdf_models,
66
+ dataset,
67
+ model_list,
53
68
  imageWriter,
54
69
  start_page_id=start_page_id,
55
70
  end_page_id=end_page_id,
@@ -67,17 +82,24 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, i
67
82
  return pdf_info_dict
68
83
 
69
84
 
70
- def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
71
- input_model_is_empty: bool = False,
72
- start_page_id=0, end_page_id=None, lang=None,
73
- *args, **kwargs):
85
+ def parse_union_pdf(
86
+ dataset: Dataset,
87
+ model_list: list,
88
+ imageWriter: DataWriter,
89
+ is_debug=False,
90
+ start_page_id=0,
91
+ end_page_id=None,
92
+ lang=None,
93
+ *args,
94
+ **kwargs
95
+ ):
74
96
  """ocr和文本混合的pdf,全部解析出来."""
75
97
 
76
98
  def parse_pdf(method):
77
99
  try:
78
100
  return method(
79
- pdf_bytes,
80
- pdf_models,
101
+ dataset,
102
+ model_list,
81
103
  imageWriter,
82
104
  start_page_id=start_page_id,
83
105
  end_page_id=end_page_id,
@@ -91,12 +113,12 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter,
91
113
  pdf_info_dict = parse_pdf(parse_pdf_by_txt)
92
114
  if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
93
115
  logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
94
- if input_model_is_empty:
116
+ if len(model_list) == 0:
95
117
  layout_model = kwargs.get('layout_model', None)
96
118
  formula_enable = kwargs.get('formula_enable', None)
97
119
  table_enable = kwargs.get('table_enable', None)
98
- pdf_models = doc_analyze(
99
- pdf_bytes,
120
+ infer_res = doc_analyze(
121
+ dataset,
100
122
  ocr=True,
101
123
  start_page_id=start_page_id,
102
124
  end_page_id=end_page_id,
@@ -105,6 +127,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter,
105
127
  formula_enable=formula_enable,
106
128
  table_enable=table_enable,
107
129
  )
130
+ model_list = infer_res.get_infer_res()
108
131
  pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
109
132
  if pdf_info_dict is None:
110
133
  raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
{magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.10.4
3
+ Version: 0.10.6
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -15,11 +15,14 @@ Requires-Dist: numpy<2.0.0,>=1.21.6
15
15
  Requires-Dist: pydantic<2.8.0,>=2.7.2
16
16
  Requires-Dist: PyMuPDF>=1.24.9
17
17
  Requires-Dist: scikit-learn>=1.0.2
18
- Requires-Dist: torch<=2.3.1,>=2.2.2
18
+ Requires-Dist: torch>=2.2.2
19
19
  Requires-Dist: transformers
20
+ Requires-Dist: pdfminer.six==20231228
20
21
  Provides-Extra: full
21
- Requires-Dist: unimernet==0.2.1; extra == "full"
22
- Requires-Dist: ultralytics; extra == "full"
22
+ Requires-Dist: unimernet==0.2.2; extra == "full"
23
+ Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
24
+ Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
25
+ Requires-Dist: ultralytics>=8.3.48; extra == "full"
23
26
  Requires-Dist: paddleocr==2.7.3; extra == "full"
24
27
  Requires-Dist: struct-eqtable==0.3.2; extra == "full"
25
28
  Requires-Dist: einops; extra == "full"
{magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/RECORD CHANGED
@@ -1,10 +1,10 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
3
- magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
4
- magic_pdf/pdf_parse_union_core_v2.py,sha256=6Apku7-pW450HbHNTtbVLDyroRSKlQ57w9f0ScOaZv4,30879
5
- magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
2
+ magic_pdf/pdf_parse_by_ocr.py,sha256=WFk6jhHSGvy8-hU2Qlpo5q-VORdSK_5Erh9IA_H7ZbQ,840
3
+ magic_pdf/pdf_parse_by_txt.py,sha256=1-xieVOP8qmAC957ftzSzaeviv0-QC4yL6Lv6Pcg_6Y,722
4
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=Hl8PSJOJFHAGCdTwX4YY2_MMgjAuat47yALLb_E-DYg,30879
5
+ magic_pdf/user_api.py,sha256=EAalk3WfQTfBq4qKMcISuHSjQg2Ku61ox_WiOPeFfuY,4060
6
6
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
7
+ magic_pdf/config/constants.py,sha256=CEhNtP8o_2zcK6DesO6cNDlpS9fUdRv-QUyHw0_vsso,1222
8
8
  magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
9
9
  magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
10
10
  magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
@@ -13,12 +13,12 @@ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLx
13
13
  magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
14
14
  magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
15
15
  magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
16
+ magic_pdf/data/dataset.py,sha256=NpljxcttgRk4_Rl8Rf191t_vNIdbqIpK5x1xHAGE2iI,10686
17
17
  magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
18
18
  magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
19
- magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
19
+ magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
20
20
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
21
- magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
21
+ magic_pdf/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
22
22
  magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
23
23
  magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
24
24
  magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
@@ -27,10 +27,10 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
27
27
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
28
28
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
29
29
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=pE-lEUsYAhZC3nSmbgYO42Kvk_bW8Ds-AL-QMPHFu8c,12941
31
- magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=zmEbxuIdFPfy3W72Zx_EEgyYtIOKcTa-0JoXHgXkEJ8,13046
31
+ magic_pdf/filter/__init__.py,sha256=rV4dvUxfKyVErDx9ZbUp8DVq_fRIlv0lfSXp1ND4STc,1503
32
32
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
33
- magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
33
+ magic_pdf/filter/pdf_meta_scan.py,sha256=rqTuStW2_ICr3HmV_9IQ5jnsl4JnSh7-bL11vbtH3i0,17470
34
34
  magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
@@ -43,26 +43,27 @@ magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,117
43
43
  magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
44
44
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
45
45
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
46
- magic_pdf/libs/draw_bbox.py,sha256=2IXr4TUxm0-pXYIPkNaELWo9pOysZC6etpqzTE5eg-w,17588
46
+ magic_pdf/libs/draw_bbox.py,sha256=Z7-OOETUo90yj3tCV8MwbiJwckThcC0bjs4MXI9ocac,17561
47
47
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
48
48
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
49
49
  magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
50
50
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
51
51
  magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
52
52
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
53
- magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
53
+ magic_pdf/libs/pdf_check.py,sha256=zBwUThKKBtnrNPmgE10lYsTy1Kq7j_6IejO7JR0J4pA,3118
54
54
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
55
55
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
56
- magic_pdf/libs/version.py,sha256=fGZMaoPHZfTX9I4TDkr07gp-kj_1U_SD-gjQC_2flQs,23
57
- magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
58
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
56
+ magic_pdf/libs/version.py,sha256=7qmFu9Qmzy5OxKJPN-LQOkzV_2T4cJYrUSLTfq7F3kE,23
57
+ magic_pdf/model/__init__.py,sha256=R6uhAQucHJa87V81ahYHWEffG0-3F1792J4kaSxZpi8,3698
58
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=KAPRDgWUAzsXbofZ6i0ll9eaanPdPnfjM1nn4Pl8-Zo,7588
59
59
  magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
60
60
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
61
- magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
61
+ magic_pdf/model/operators.py,sha256=qcacETf6j-gDUj9g0zYJgBrkq0YWe6ZlfoPjJhCMUYU,6628
62
+ magic_pdf/model/pdf_extract_kit.py,sha256=6JdWkdKOgL9UyAlI5znPMexs0AMZzn1SgrIpJUxWiGs,11839
62
63
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
63
64
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
- magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
65
- magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
65
+ magic_pdf/model/sub_modules/model_init.py,sha256=Sp4I2tQ2oFsTIBRHXv8-44WU1PvPSx4L3VfwnQUaRFo,5438
66
+ magic_pdf/model/sub_modules/model_utils.py,sha256=svV5bn_Xw3QqSa22h7OrmlQQQySSqe3DdE6KMEURr2c,2219
66
67
  magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
68
  magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
68
69
  magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -110,15 +111,16 @@ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-u
110
111
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
112
  magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
112
113
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
- magic_pdf/para/para_split_v3.py,sha256=UOQe0HUVX7FAlMbJp1OkGfdM7JECWeqscv3s8Hge7ps,16922
114
- magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
115
- magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
116
- magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
117
- magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
114
+ magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
115
+ magic_pdf/pipe/AbsPipe.py,sha256=_Lx4Ags5suEvmJEvgHEvg6n0RP4Yqjc1VBWaCP0la2o,4410
116
+ magic_pdf/pipe/OCRPipe.py,sha256=nH21Rq7mQEw7pS7AVD2MRFdSE0DxGc1wk9VXB6T0m3A,2396
117
+ magic_pdf/pipe/TXTPipe.py,sha256=JXJ7hzD7TNq5VnCt33dck2FM15GpozJoHibaRlYD14s,2196
118
+ magic_pdf/pipe/UNIPipe.py,sha256=i0kWflZ5BFHrx8p8vDntRcN6jecaxOfGq11ANtYvrZY,5011
118
119
  magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
+ magic_pdf/pipe/operators.py,sha256=5z7kF95IWyBGxs4tIhqJml2YMlfDkU9B5xy__NiUxz0,4962
119
121
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
122
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
121
- magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
123
+ magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
122
124
  magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
123
125
  magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
124
126
  magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
@@ -136,12 +138,12 @@ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,
136
138
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
139
  magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
138
140
  magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
139
- magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
141
+ magic_pdf/tools/common.py,sha256=x3dNHT9wEpdmkkEb4Y70DmUMMPavre5C82T0v9OmA2g,7894
140
142
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
143
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
142
- magic_pdf-0.10.4.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-0.10.4.dist-info/METADATA,sha256=pujqC_qUWiPT-L6R065MoL0QO9q4IEra0iW4BCRkxr4,36992
144
- magic_pdf-0.10.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
- magic_pdf-0.10.4.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-0.10.4.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-0.10.4.dist-info/RECORD,,
144
+ magic_pdf-0.10.6.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
145
+ magic_pdf-0.10.6.dist-info/METADATA,sha256=CbT8tghajhhMHEawiHakbU-ndjeJ_J9J1011PFoYDbA,37144
146
+ magic_pdf-0.10.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
147
+ magic_pdf-0.10.6.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
148
+ magic_pdf-0.10.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
149
+ magic_pdf-0.10.6.dist-info/RECORD,,