magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +16 -22
- magic_pdf/filter/pdf_meta_scan.py +5 -19
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_check.py +52 -25
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/pp_structure_v2.py +23 -3
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
- magic_pdf/para/para_split_v3.py +21 -7
- magic_pdf/pdf_parse_union_core_v2.py +134 -146
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,7 @@
|
|
1
1
|
|
2
2
|
from magic_pdf.config.drop_tag import DropTag
|
3
|
-
from magic_pdf.config.ocr_content_type import BlockType
|
4
|
-
from magic_pdf.libs.boxbase import
|
5
|
-
calculate_iou,
|
6
|
-
calculate_overlap_area_in_bbox1_area_ratio,
|
7
|
-
get_minbox_if_overlap_by_ratio)
|
3
|
+
from magic_pdf.config.ocr_content_type import BlockType
|
4
|
+
from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
|
8
5
|
|
9
6
|
|
10
7
|
def remove_overlaps_low_confidence_spans(spans):
|
@@ -59,253 +56,6 @@ def remove_overlaps_min_spans(spans):
|
|
59
56
|
return spans, dropped_spans
|
60
57
|
|
61
58
|
|
62
|
-
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
|
63
|
-
# 遍历spans, 判断是否在removed_span_block_bboxes中
|
64
|
-
# 如果是, 则删除该span 否则, 保留该span
|
65
|
-
need_remove_spans = []
|
66
|
-
for span in spans:
|
67
|
-
for removed_bbox in need_remove_spans_bboxes:
|
68
|
-
if (
|
69
|
-
calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
|
70
|
-
> 0.5
|
71
|
-
):
|
72
|
-
if span not in need_remove_spans:
|
73
|
-
need_remove_spans.append(span)
|
74
|
-
break
|
75
|
-
|
76
|
-
if len(need_remove_spans) > 0:
|
77
|
-
for span in need_remove_spans:
|
78
|
-
spans.remove(span)
|
79
|
-
|
80
|
-
return spans
|
81
|
-
|
82
|
-
|
83
|
-
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
|
84
|
-
dropped_spans = []
|
85
|
-
for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
|
86
|
-
# logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
|
87
|
-
need_remove_spans = []
|
88
|
-
for span in spans:
|
89
|
-
# 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
|
90
|
-
for removed_bbox in removed_bboxes:
|
91
|
-
if (
|
92
|
-
calculate_overlap_area_in_bbox1_area_ratio(
|
93
|
-
span['bbox'], removed_bbox
|
94
|
-
)
|
95
|
-
> 0.5
|
96
|
-
):
|
97
|
-
need_remove_spans.append(span)
|
98
|
-
break
|
99
|
-
# 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
|
100
|
-
elif (
|
101
|
-
drop_tag == DropTag.FOOTNOTE
|
102
|
-
and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
|
103
|
-
and removed_bbox[0]
|
104
|
-
< (span['bbox'][0] + span['bbox'][2]) / 2
|
105
|
-
< removed_bbox[2]
|
106
|
-
):
|
107
|
-
need_remove_spans.append(span)
|
108
|
-
break
|
109
|
-
|
110
|
-
for span in need_remove_spans:
|
111
|
-
spans.remove(span)
|
112
|
-
span['tag'] = drop_tag
|
113
|
-
dropped_spans.append(span)
|
114
|
-
|
115
|
-
return spans, dropped_spans
|
116
|
-
|
117
|
-
|
118
|
-
def adjust_bbox_for_standalone_block(spans):
|
119
|
-
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
|
120
|
-
for sb_span in spans:
|
121
|
-
if sb_span['type'] in [
|
122
|
-
ContentType.InterlineEquation,
|
123
|
-
ContentType.Image,
|
124
|
-
ContentType.Table,
|
125
|
-
]:
|
126
|
-
for text_span in spans:
|
127
|
-
if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
|
128
|
-
# 判断span2的纵向高度是否被span所覆盖
|
129
|
-
if (
|
130
|
-
sb_span['bbox'][1] < text_span['bbox'][1]
|
131
|
-
and sb_span['bbox'][3] > text_span['bbox'][3]
|
132
|
-
):
|
133
|
-
# 判断span2是否在span左边
|
134
|
-
if text_span['bbox'][0] < sb_span['bbox'][0]:
|
135
|
-
# 调整span的y0和span2的y0一致
|
136
|
-
sb_span['bbox'][1] = text_span['bbox'][1]
|
137
|
-
return spans
|
138
|
-
|
139
|
-
|
140
|
-
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
|
141
|
-
# displayed_list = []
|
142
|
-
# 如果spans为空,则不处理
|
143
|
-
if len(spans) == 0:
|
144
|
-
pass
|
145
|
-
else:
|
146
|
-
spans.sort(key=lambda span: span['bbox'][1])
|
147
|
-
|
148
|
-
lines = []
|
149
|
-
current_line = [spans[0]]
|
150
|
-
if spans[0]['type'] in [
|
151
|
-
ContentType.InterlineEquation,
|
152
|
-
ContentType.Image,
|
153
|
-
ContentType.Table,
|
154
|
-
]:
|
155
|
-
displayed_list.append(spans[0])
|
156
|
-
|
157
|
-
line_first_y0 = spans[0]['bbox'][1]
|
158
|
-
line_first_y = spans[0]['bbox'][3]
|
159
|
-
# 用于给行间公式搜索
|
160
|
-
# text_inline_lines = []
|
161
|
-
for span in spans[1:]:
|
162
|
-
# if span.get("content","") == "78.":
|
163
|
-
# print("debug")
|
164
|
-
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
|
165
|
-
# image和table类型,同上
|
166
|
-
if span['type'] in [
|
167
|
-
ContentType.InterlineEquation,
|
168
|
-
ContentType.Image,
|
169
|
-
ContentType.Table,
|
170
|
-
] or any(
|
171
|
-
s['type']
|
172
|
-
in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
|
173
|
-
for s in current_line
|
174
|
-
):
|
175
|
-
# 传入
|
176
|
-
if span['type'] in [
|
177
|
-
ContentType.InterlineEquation,
|
178
|
-
ContentType.Image,
|
179
|
-
ContentType.Table,
|
180
|
-
]:
|
181
|
-
displayed_list.append(span)
|
182
|
-
# 则开始新行
|
183
|
-
lines.append(current_line)
|
184
|
-
if len(current_line) > 1 or current_line[0]['type'] in [
|
185
|
-
ContentType.Text,
|
186
|
-
ContentType.InlineEquation,
|
187
|
-
]:
|
188
|
-
text_inline_lines.append(
|
189
|
-
(current_line, (line_first_y0, line_first_y))
|
190
|
-
)
|
191
|
-
current_line = [span]
|
192
|
-
line_first_y0 = span['bbox'][1]
|
193
|
-
line_first_y = span['bbox'][3]
|
194
|
-
continue
|
195
|
-
|
196
|
-
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
197
|
-
if __is_overlaps_y_exceeds_threshold(
|
198
|
-
span['bbox'], current_line[-1]['bbox']
|
199
|
-
):
|
200
|
-
if span['type'] == 'text':
|
201
|
-
line_first_y0 = span['bbox'][1]
|
202
|
-
line_first_y = span['bbox'][3]
|
203
|
-
current_line.append(span)
|
204
|
-
|
205
|
-
else:
|
206
|
-
# 否则,开始新行
|
207
|
-
lines.append(current_line)
|
208
|
-
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
209
|
-
current_line = [span]
|
210
|
-
line_first_y0 = span['bbox'][1]
|
211
|
-
line_first_y = span['bbox'][3]
|
212
|
-
|
213
|
-
# 添加最后一行
|
214
|
-
if current_line:
|
215
|
-
lines.append(current_line)
|
216
|
-
if len(current_line) > 1 or current_line[0]['type'] in [
|
217
|
-
ContentType.Text,
|
218
|
-
ContentType.InlineEquation,
|
219
|
-
]:
|
220
|
-
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
221
|
-
for line in text_inline_lines:
|
222
|
-
# 按照x0坐标排序
|
223
|
-
current_line = line[0]
|
224
|
-
current_line.sort(key=lambda span: span['bbox'][0])
|
225
|
-
|
226
|
-
# 调整每一个文字行内bbox统一
|
227
|
-
for line in text_inline_lines:
|
228
|
-
current_line, (line_first_y0, line_first_y) = line
|
229
|
-
for span in current_line:
|
230
|
-
span['bbox'][1] = line_first_y0
|
231
|
-
span['bbox'][3] = line_first_y
|
232
|
-
|
233
|
-
# return spans, displayed_list, text_inline_lines
|
234
|
-
|
235
|
-
|
236
|
-
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
|
237
|
-
# 错误行间公式转行内公式
|
238
|
-
j = 0
|
239
|
-
for i in range(len(displayed_list)):
|
240
|
-
# if i == 8:
|
241
|
-
# print("debug")
|
242
|
-
span = displayed_list[i]
|
243
|
-
span_y0, span_y = span['bbox'][1], span['bbox'][3]
|
244
|
-
|
245
|
-
while j < len(text_inline_lines):
|
246
|
-
text_line = text_inline_lines[j]
|
247
|
-
y0, y1 = text_line[1]
|
248
|
-
if (
|
249
|
-
span_y0 < y0 < span_y
|
250
|
-
or span_y0 < y1 < span_y
|
251
|
-
or span_y0 < y0
|
252
|
-
and span_y > y1
|
253
|
-
) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
|
254
|
-
# 调整公式类型
|
255
|
-
if span['type'] == ContentType.InterlineEquation:
|
256
|
-
# 最后一行是行间公式
|
257
|
-
if j + 1 >= len(text_inline_lines):
|
258
|
-
span['type'] = ContentType.InlineEquation
|
259
|
-
span['bbox'][1] = y0
|
260
|
-
span['bbox'][3] = y1
|
261
|
-
else:
|
262
|
-
# 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
|
263
|
-
y0_next, y1_next = text_inline_lines[j + 1][1]
|
264
|
-
if (
|
265
|
-
not __is_overlaps_y_exceeds_threshold(
|
266
|
-
span['bbox'], (0, y0_next, 0, y1_next)
|
267
|
-
)
|
268
|
-
and 3 * (y1 - y0) > span_y - span_y0
|
269
|
-
):
|
270
|
-
span['type'] = ContentType.InlineEquation
|
271
|
-
span['bbox'][1] = y0
|
272
|
-
span['bbox'][3] = y1
|
273
|
-
break
|
274
|
-
elif (
|
275
|
-
span_y < y0
|
276
|
-
or span_y0 < y0 < span_y
|
277
|
-
and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
|
278
|
-
):
|
279
|
-
break
|
280
|
-
else:
|
281
|
-
j += 1
|
282
|
-
|
283
|
-
return spans
|
284
|
-
|
285
|
-
|
286
|
-
def get_qa_need_list(blocks):
|
287
|
-
# 创建 images, tables, interline_equations, inline_equations 的副本
|
288
|
-
images = []
|
289
|
-
tables = []
|
290
|
-
interline_equations = []
|
291
|
-
inline_equations = []
|
292
|
-
|
293
|
-
for block in blocks:
|
294
|
-
for line in block['lines']:
|
295
|
-
for span in line['spans']:
|
296
|
-
if span['type'] == ContentType.Image:
|
297
|
-
images.append(span)
|
298
|
-
elif span['type'] == ContentType.Table:
|
299
|
-
tables.append(span)
|
300
|
-
elif span['type'] == ContentType.InlineEquation:
|
301
|
-
inline_equations.append(span)
|
302
|
-
elif span['type'] == ContentType.InterlineEquation:
|
303
|
-
interline_equations.append(span)
|
304
|
-
else:
|
305
|
-
continue
|
306
|
-
return images, tables, interline_equations, inline_equations
|
307
|
-
|
308
|
-
|
309
59
|
def get_qa_need_list_v2(blocks):
|
310
60
|
# 创建 images, tables, interline_equations, inline_equations 的副本
|
311
61
|
images = []
|
magic_pdf/rw/S3ReaderWriter.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
2
|
-
from magic_pdf.libs.commons import
|
2
|
+
from magic_pdf.libs.commons import parse_bucket_key, join_path
|
3
3
|
import boto3
|
4
4
|
from loguru import logger
|
5
5
|
from botocore.config import Config
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.10.
|
3
|
+
Version: 0.10.3
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -12,7 +12,6 @@ Requires-Dist: click>=8.1.7
|
|
12
12
|
Requires-Dist: fast-langdetect==0.2.0
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
14
|
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
-
Requires-Dist: pdfminer.six==20231228
|
16
15
|
Requires-Dist: pydantic<2.8.0,>=2.7.2
|
17
16
|
Requires-Dist: PyMuPDF>=1.24.9
|
18
17
|
Requires-Dist: scikit-learn>=1.0.2
|
@@ -320,88 +319,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
320
319
|
|
321
320
|
### Command Line
|
322
321
|
|
323
|
-
|
324
|
-
magic-pdf --help
|
325
|
-
Usage: magic-pdf [OPTIONS]
|
326
|
-
|
327
|
-
Options:
|
328
|
-
-v, --version display the version and exit
|
329
|
-
-p, --path PATH local pdf filepath or directory [required]
|
330
|
-
-o, --output-dir PATH output local directory [required]
|
331
|
-
-m, --method [ocr|txt|auto] the method for parsing pdf. ocr: using ocr
|
332
|
-
technique to extract information from pdf. txt:
|
333
|
-
suitable for the text-based pdf only and
|
334
|
-
outperform ocr. auto: automatically choose the
|
335
|
-
best method for parsing pdf from ocr and txt.
|
336
|
-
without method specified, auto will be used by
|
337
|
-
default.
|
338
|
-
-l, --lang TEXT Input the languages in the pdf (if known) to
|
339
|
-
improve OCR accuracy. Optional. You should
|
340
|
-
input "Abbreviation" with language form url: ht
|
341
|
-
tps://paddlepaddle.github.io/PaddleOCR/latest/en
|
342
|
-
/ppocr/blog/multi_languages.html#5-support-languages-
|
343
|
-
and-abbreviations
|
344
|
-
-d, --debug BOOLEAN Enables detailed debugging information during
|
345
|
-
the execution of the CLI commands.
|
346
|
-
-s, --start INTEGER The starting page for PDF parsing, beginning
|
347
|
-
from 0.
|
348
|
-
-e, --end INTEGER The ending page for PDF parsing, beginning from
|
349
|
-
0.
|
350
|
-
--help Show this message and exit.
|
351
|
-
|
352
|
-
|
353
|
-
## show version
|
354
|
-
magic-pdf -v
|
355
|
-
|
356
|
-
## command line example
|
357
|
-
magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
|
358
|
-
```
|
322
|
+
[Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/command_line.html)
|
359
323
|
|
360
|
-
`{some_pdf}` can be a single PDF file or a directory containing multiple PDFs.
|
361
|
-
The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
|
362
|
-
|
363
|
-
```text
|
364
|
-
├── some_pdf.md # markdown file
|
365
|
-
├── images # directory for storing images
|
366
|
-
├── some_pdf_layout.pdf # layout diagram (Include layout reading order)
|
367
|
-
├── some_pdf_middle.json # MinerU intermediate processing result
|
368
|
-
├── some_pdf_model.json # model inference result
|
369
|
-
├── some_pdf_origin.pdf # original PDF file
|
370
|
-
├── some_pdf_spans.pdf # smallest granularity bbox position information diagram
|
371
|
-
└── some_pdf_content_list.json # Rich text JSON arranged in reading order
|
372
|
-
```
|
373
324
|
> [!TIP]
|
374
325
|
> For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
375
326
|
|
376
327
|
### API
|
377
328
|
|
378
|
-
|
379
|
-
|
380
|
-
```python
|
381
|
-
image_writer = DiskReaderWriter(local_image_dir)
|
382
|
-
image_dir = str(os.path.basename(local_image_dir))
|
383
|
-
jso_useful_key = {"_pdf_type": "", "model_list": []}
|
384
|
-
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
|
385
|
-
pipe.pipe_classify()
|
386
|
-
pipe.pipe_analyze()
|
387
|
-
pipe.pipe_parse()
|
388
|
-
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
389
|
-
```
|
390
|
-
|
391
|
-
Processing files from object storage
|
392
|
-
|
393
|
-
```python
|
394
|
-
s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
|
395
|
-
image_dir = "s3://img_bucket/"
|
396
|
-
s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
|
397
|
-
pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
|
398
|
-
jso_useful_key = {"_pdf_type": "", "model_list": []}
|
399
|
-
pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
|
400
|
-
pipe.pipe_classify()
|
401
|
-
pipe.pipe_analyze()
|
402
|
-
pipe.pipe_parse()
|
403
|
-
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
404
|
-
```
|
329
|
+
[Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/to_markdown.html)
|
405
330
|
|
406
331
|
For detailed implementation, refer to:
|
407
332
|
|
@@ -1,8 +1,7 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
|
4
|
-
magic_pdf/
|
5
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=EqEi9AahBBh2JbXoY8uOCmClvi9W_H_26U4jK8RwPwU,31308
|
4
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=6Apku7-pW450HbHNTtbVLDyroRSKlQ57w9f0ScOaZv4,30879
|
6
5
|
magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
|
7
6
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
7
|
magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
|
@@ -28,51 +27,39 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
28
27
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
29
28
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
30
29
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
magic_pdf/dict2md/
|
32
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
|
30
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=hwcHTEx1tbIlM9ukmPBOAyH0G6rmbOTu87nVtZ1gE6k,12354
|
33
31
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
32
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
35
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
33
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
|
36
34
|
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
35
|
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
36
|
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
39
37
|
magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
|
40
38
|
magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
|
41
|
-
magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
-
magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
|
43
|
-
magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
|
44
|
-
magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
|
45
|
-
magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
|
46
|
-
magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
|
47
39
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
48
40
|
magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
|
49
|
-
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
50
41
|
magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
|
51
|
-
magic_pdf/libs/commons.py,sha256=
|
42
|
+
magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
|
52
43
|
magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
|
53
44
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
54
45
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
55
|
-
magic_pdf/libs/
|
56
|
-
magic_pdf/libs/draw_bbox.py,sha256=NhAfqib5HYuGjjrAG_SvJR-yOHZTy6tzDxLXdxKlULQ,17676
|
46
|
+
magic_pdf/libs/draw_bbox.py,sha256=2IXr4TUxm0-pXYIPkNaELWo9pOysZC6etpqzTE5eg-w,17588
|
57
47
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
58
48
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
59
49
|
magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
|
60
50
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
61
|
-
magic_pdf/libs/markdown_utils.py,sha256=
|
62
|
-
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
51
|
+
magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
|
63
52
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
64
|
-
magic_pdf/libs/pdf_check.py,sha256=
|
65
|
-
magic_pdf/libs/pdf_image_tools.py,sha256=
|
53
|
+
magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
|
54
|
+
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
66
55
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
67
|
-
magic_pdf/libs/
|
68
|
-
magic_pdf/libs/version.py,sha256=v7Gyp89umFzDtY45tTjCdXqZnQ2RN01AibdYNxEvxYo,23
|
69
|
-
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
56
|
+
magic_pdf/libs/version.py,sha256=0C8KcY1dzs3hdkAre06v0NCQ0Uxcqv6g9a93bRcVLW0,23
|
70
57
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
71
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
72
|
-
magic_pdf/model/magic_model.py,sha256=
|
58
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
|
59
|
+
magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
|
73
60
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
74
61
|
magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
|
75
|
-
magic_pdf/model/pp_structure_v2.py,sha256=
|
62
|
+
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
76
63
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
64
|
magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
|
78
65
|
magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
|
@@ -107,8 +94,8 @@ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1
|
|
107
94
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
95
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
96
|
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=
|
111
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=
|
97
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=rwKphio9SZgiNgqASWOBWZIf6PPi3kvgQO_qJLc_diE,10726
|
98
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=d__xICejA_Q-Cz4cfajwroDjfA0dT4TL18XAFYYc4OQ,7265
|
112
99
|
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
113
100
|
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
101
|
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -121,66 +108,28 @@ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx
|
|
121
108
|
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
109
|
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
123
110
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
124
|
-
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=
|
111
|
+
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
|
125
112
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
126
|
-
magic_pdf/para/
|
127
|
-
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
128
|
-
magic_pdf/para/commons.py,sha256=VdJ8SY9qJTtcRyx8HH-PFeZSJwL4Tsf50197RD_-dwc,5414
|
129
|
-
magic_pdf/para/denoise.py,sha256=J7dM2KNnbdzAd2A3agB04U6L1GL9RrhAs-MLrq-_Ftg,10443
|
130
|
-
magic_pdf/para/draw.py,sha256=KyWc03do_WuBKQ028HYzepYwbIkel9ID0uqRhuPVOHc,5643
|
131
|
-
magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,4978
|
132
|
-
magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
|
133
|
-
magic_pdf/para/para_split.py,sha256=z7nYeg86BjZOAdJNMwYKSu51W9evurtl3cy1ZUcQLlw,33222
|
134
|
-
magic_pdf/para/para_split_v2.py,sha256=vJJqqMMKbv8D702nODThL-5hjkgZ7Vl2BTmEIdwmmDw,39051
|
135
|
-
magic_pdf/para/para_split_v3.py,sha256=atfELVRx-90paAS3nZptgP0qG8UpTTaj3LG_2x3NAlQ,15977
|
136
|
-
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
137
|
-
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
138
|
-
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
113
|
+
magic_pdf/para/para_split_v3.py,sha256=UOQe0HUVX7FAlMbJp1OkGfdM7JECWeqscv3s8Hge7ps,16922
|
139
114
|
magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
|
140
115
|
magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
|
141
116
|
magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
|
142
117
|
magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
|
143
118
|
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
144
|
-
magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
145
|
-
magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
|
146
|
-
magic_pdf/post_proc/pdf_post_filter.py,sha256=3EJDovQPckPKJaBY1wvAty-LGKyRG63WICY_bA_Kfbs,2501
|
147
|
-
magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
|
148
119
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
149
|
-
magic_pdf/pre_proc/
|
150
|
-
magic_pdf/pre_proc/
|
151
|
-
magic_pdf/pre_proc/
|
152
|
-
magic_pdf/pre_proc/
|
153
|
-
magic_pdf/pre_proc/
|
154
|
-
magic_pdf/pre_proc/detect_footer_header_by_statistics.py,sha256=924soXZ51QVpitPgVgnwbC7BqOZI30j5hGW5zP86y-w,11250
|
155
|
-
magic_pdf/pre_proc/detect_footnote.py,sha256=UxFuTCRwXdAv3wKCgRQJJVt12hM9O9oPTwzPAChQXoM,8309
|
156
|
-
magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1itbY7g,2848
|
157
|
-
magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
|
158
|
-
magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
|
159
|
-
magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
|
160
|
-
magic_pdf/pre_proc/equations_replace.py,sha256=7mexRPwD9C_UJ-SbvO_-XnpcnN7YtGUUznmPjHbjhnw,20526
|
161
|
-
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
162
|
-
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
163
|
-
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
164
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=DMc2H2xGqVePBReZu5AQbPdvDw3sxOssmujCLlNW3Vs,14143
|
165
|
-
magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
|
166
|
-
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Au8y1NBhbWpq_VuPLg3b9dAMUhyPS71xtTghtd21K5M,14273
|
167
|
-
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=9DxEyy1pH87g4T_JEgI3cTVCL2TVrEBl38wsmqhQM4k,12758
|
168
|
-
magic_pdf/pre_proc/pdf_pre_filter.py,sha256=qvNlNyj4Mc3qa73mgfkp0PMR-ucABbx3mMcyVipaEpQ,2776
|
169
|
-
magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
+
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
121
|
+
magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
|
122
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=xQ1L6pwQjN4xBSKEXslheip1aMFaiB0grqlX3BF-kh0,9282
|
123
|
+
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
|
124
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
|
170
125
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
|
171
|
-
magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=WVKhgeWifRdO-u2ETYffkcMOFVYIbiaZu5pMr1RpEdA,4090
|
172
|
-
magic_pdf/pre_proc/remove_footer_header.py,sha256=Igdr4jH7BUGuTcapWPiKEGKxhWH12c3VVmX5xwUVn7w,5680
|
173
|
-
magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=di7geS7AFhSaAvkWZHT6J3dlXEq8uu9Z4oBYtolQjl0,8803
|
174
|
-
magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=ABl0vo8kkcCPSTI8dpXQTOH1b9R-lbzsJDDFONU6ELk,7313
|
175
|
-
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
176
|
-
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
177
126
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
178
127
|
magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
|
179
128
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
|
180
129
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
181
130
|
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
182
131
|
magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
|
183
|
-
magic_pdf/rw/S3ReaderWriter.py,sha256=
|
132
|
+
magic_pdf/rw/S3ReaderWriter.py,sha256=LmbtA-pZlC745nnSUs67C1iqSrBDS7IzE6QC8YMB644,5293
|
184
133
|
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
185
134
|
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
186
135
|
magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
|
@@ -190,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
|
|
190
139
|
magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
|
191
140
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
192
141
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
193
|
-
magic_pdf-0.10.
|
194
|
-
magic_pdf-0.10.
|
195
|
-
magic_pdf-0.10.
|
196
|
-
magic_pdf-0.10.
|
197
|
-
magic_pdf-0.10.
|
198
|
-
magic_pdf-0.10.
|
142
|
+
magic_pdf-0.10.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
+
magic_pdf-0.10.3.dist-info/METADATA,sha256=R86XDaSfj1tcu3etkvhQfg3FSoARv8mKW2KpwjsdqWs,36992
|
144
|
+
magic_pdf-0.10.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
+
magic_pdf-0.10.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
+
magic_pdf-0.10.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
+
magic_pdf-0.10.3.dist-info/RECORD,,
|