magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +130 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/boxbase.py +188 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +283 -166
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +105 -15
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/para/para_split_v2.py +26 -27
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
@@ -133,6 +133,7 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
|
|
133
133
|
|
134
134
|
|
135
135
|
def remove_overlaps_min_blocks(all_bboxes):
|
136
|
+
# 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
|
136
137
|
# 删除重叠blocks中较小的那些
|
137
138
|
need_remove = []
|
138
139
|
for block1 in all_bboxes:
|
@@ -142,9 +143,17 @@ def remove_overlaps_min_blocks(all_bboxes):
|
|
142
143
|
block2_bbox = block2[:4]
|
143
144
|
overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
|
144
145
|
if overlap_box is not None:
|
145
|
-
|
146
|
-
if
|
147
|
-
|
146
|
+
block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
|
147
|
+
if block_to_remove is not None and block_to_remove not in need_remove:
|
148
|
+
large_block = block1 if block1 != block_to_remove else block2
|
149
|
+
x1, y1, x2, y2 = large_block[:4]
|
150
|
+
sx1, sy1, sx2, sy2 = block_to_remove[:4]
|
151
|
+
x1 = min(x1, sx1)
|
152
|
+
y1 = min(y1, sy1)
|
153
|
+
x2 = max(x2, sx2)
|
154
|
+
y2 = max(y2, sy2)
|
155
|
+
large_block[:4] = [x1, y1, x2, y2]
|
156
|
+
need_remove.append(block_to_remove)
|
148
157
|
|
149
158
|
if len(need_remove) > 0:
|
150
159
|
for block in need_remove:
|
@@ -1,18 +1,15 @@
|
|
1
|
-
from
|
2
|
-
|
3
|
-
|
4
|
-
calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
|
1
|
+
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
|
2
|
+
_is_in_or_part_overlap_with_area_ratio,
|
3
|
+
calculate_overlap_area_in_bbox1_area_ratio)
|
5
4
|
from magic_pdf.libs.drop_tag import DropTag
|
6
|
-
from magic_pdf.libs.ocr_content_type import
|
7
|
-
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
|
8
|
-
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
|
5
|
+
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
|
9
6
|
|
10
7
|
|
11
8
|
# 将每一个line中的span从左到右排序
|
12
9
|
def line_sort_spans_by_left_to_right(lines):
|
13
10
|
line_objects = []
|
14
11
|
for line in lines:
|
15
|
-
#
|
12
|
+
# 按照x0坐标排序
|
16
13
|
line.sort(key=lambda span: span['bbox'][0])
|
17
14
|
line_bbox = [
|
18
15
|
min(span['bbox'][0] for span in line), # x0
|
@@ -21,8 +18,8 @@ def line_sort_spans_by_left_to_right(lines):
|
|
21
18
|
max(span['bbox'][3] for span in line), # y1
|
22
19
|
]
|
23
20
|
line_objects.append({
|
24
|
-
|
25
|
-
|
21
|
+
'bbox': line_bbox,
|
22
|
+
'spans': line,
|
26
23
|
})
|
27
24
|
return line_objects
|
28
25
|
|
@@ -39,16 +36,21 @@ def merge_spans_to_line(spans):
|
|
39
36
|
for span in spans[1:]:
|
40
37
|
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
|
41
38
|
# image和table类型,同上
|
42
|
-
if span['type'] in [
|
43
|
-
|
44
|
-
|
39
|
+
if span['type'] in [
|
40
|
+
ContentType.InterlineEquation, ContentType.Image,
|
41
|
+
ContentType.Table
|
42
|
+
] or any(s['type'] in [
|
43
|
+
ContentType.InterlineEquation, ContentType.Image,
|
44
|
+
ContentType.Table
|
45
|
+
] for s in current_line):
|
45
46
|
# 则开始新行
|
46
47
|
lines.append(current_line)
|
47
48
|
current_line = [span]
|
48
49
|
continue
|
49
50
|
|
50
51
|
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
51
|
-
if __is_overlaps_y_exceeds_threshold(span['bbox'],
|
52
|
+
if __is_overlaps_y_exceeds_threshold(span['bbox'],
|
53
|
+
current_line[-1]['bbox']):
|
52
54
|
current_line.append(span)
|
53
55
|
else:
|
54
56
|
# 否则,开始新行
|
@@ -71,7 +73,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
|
|
71
73
|
# 遍历spans,将每个span放入对应的layout中
|
72
74
|
layout_sapns = []
|
73
75
|
for span in spans:
|
74
|
-
if calculate_overlap_area_in_bbox1_area_ratio(
|
76
|
+
if calculate_overlap_area_in_bbox1_area_ratio(
|
77
|
+
span['bbox'], layout_bbox) > 0.6:
|
75
78
|
layout_sapns.append(span)
|
76
79
|
# 如果layout_sapns不为空,则放入new_spans中
|
77
80
|
if len(layout_sapns) > 0:
|
@@ -99,12 +102,10 @@ def merge_lines_to_block(lines):
|
|
99
102
|
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
|
100
103
|
blocks = []
|
101
104
|
for line in lines:
|
102
|
-
blocks.append(
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
}
|
107
|
-
)
|
105
|
+
blocks.append({
|
106
|
+
'bbox': line['bbox'],
|
107
|
+
'lines': [line],
|
108
|
+
})
|
108
109
|
return blocks
|
109
110
|
|
110
111
|
|
@@ -121,7 +122,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
|
|
121
122
|
if block[7] == BlockType.Footnote:
|
122
123
|
continue
|
123
124
|
block_bbox = block[:4]
|
124
|
-
if calculate_overlap_area_in_bbox1_area_ratio(
|
125
|
+
if calculate_overlap_area_in_bbox1_area_ratio(
|
126
|
+
block_bbox, layout_bbox) > 0.8:
|
125
127
|
layout_blocks.append(block)
|
126
128
|
|
127
129
|
# 如果layout_blocks不为空,则放入new_blocks中
|
@@ -134,7 +136,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
|
|
134
136
|
# 如果new_blocks不为空,则对new_blocks中每个block进行排序
|
135
137
|
if len(new_blocks) > 0:
|
136
138
|
for bboxes_in_layout_block in new_blocks:
|
137
|
-
bboxes_in_layout_block.sort(
|
139
|
+
bboxes_in_layout_block.sort(
|
140
|
+
key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
|
138
141
|
sort_blocks.extend(bboxes_in_layout_block)
|
139
142
|
|
140
143
|
# sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
|
@@ -142,9 +145,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
|
|
142
145
|
|
143
146
|
|
144
147
|
def fill_spans_in_blocks(blocks, spans, radio):
|
145
|
-
|
146
|
-
将allspans中的span按位置关系,放入blocks中
|
147
|
-
'''
|
148
|
+
"""将allspans中的span按位置关系,放入blocks中."""
|
148
149
|
block_with_spans = []
|
149
150
|
for block in blocks:
|
150
151
|
block_type = block[7]
|
@@ -156,17 +157,15 @@ def fill_spans_in_blocks(blocks, spans, radio):
|
|
156
157
|
block_spans = []
|
157
158
|
for span in spans:
|
158
159
|
span_bbox = span['bbox']
|
159
|
-
if calculate_overlap_area_in_bbox1_area_ratio(
|
160
|
+
if calculate_overlap_area_in_bbox1_area_ratio(
|
161
|
+
span_bbox, block_bbox) > radio:
|
160
162
|
block_spans.append(span)
|
161
|
-
|
162
163
|
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
|
163
164
|
# displayed_list = []
|
164
165
|
# text_inline_lines = []
|
165
166
|
# modify_y_axis(block_spans, displayed_list, text_inline_lines)
|
166
|
-
|
167
167
|
'''模型识别错误的行间公式, type类型转换成行内公式'''
|
168
168
|
# block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
|
169
|
-
|
170
169
|
'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
|
171
170
|
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
|
172
171
|
|
@@ -182,12 +181,9 @@ def fill_spans_in_blocks(blocks, spans, radio):
|
|
182
181
|
|
183
182
|
|
184
183
|
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
caption_block和footnote_block中
|
189
|
-
2、同时需要删除block中的spans字段
|
190
|
-
'''
|
184
|
+
"""1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
|
185
|
+
需要将caption和footnote的text_span放入相应img_block和table_block内的
|
186
|
+
caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
|
191
187
|
fix_blocks = []
|
192
188
|
for block in block_with_spans:
|
193
189
|
block_type = block['type']
|
@@ -218,16 +214,13 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
|
|
218
214
|
block_spans = []
|
219
215
|
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
|
220
216
|
for span in spans:
|
221
|
-
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
|
217
|
+
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
|
218
|
+
block_bbox) > 0.6:
|
222
219
|
block_spans.append(span)
|
223
220
|
block_lines = merge_spans_to_line(block_spans)
|
224
221
|
# 对line中的span进行排序
|
225
222
|
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
|
226
|
-
block = {
|
227
|
-
'bbox': block_bbox,
|
228
|
-
'type': block_type,
|
229
|
-
'lines': sort_block_lines
|
230
|
-
}
|
223
|
+
block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
|
231
224
|
return block, block_spans
|
232
225
|
|
233
226
|
|
@@ -237,11 +230,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
|
|
237
230
|
'bbox': block_bbox,
|
238
231
|
'spans': [span],
|
239
232
|
}
|
240
|
-
body_block = {
|
241
|
-
'bbox': block_bbox,
|
242
|
-
'type': block_type,
|
243
|
-
'lines': [body_line]
|
244
|
-
}
|
233
|
+
body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
|
245
234
|
return body_block
|
246
235
|
|
247
236
|
|
@@ -249,13 +238,16 @@ def fix_image_block(block, img_blocks):
|
|
249
238
|
block['blocks'] = []
|
250
239
|
# 遍历img_blocks,找到与当前block匹配的img_block
|
251
240
|
for img_block in img_blocks:
|
252
|
-
if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
|
241
|
+
if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
|
242
|
+
img_block['bbox'], 0.95):
|
253
243
|
|
254
244
|
# 创建img_body_block
|
255
245
|
for span in block['spans']:
|
256
|
-
if span['type'] == ContentType.Image and img_block[
|
246
|
+
if span['type'] == ContentType.Image and img_block[
|
247
|
+
'img_body_bbox'] == span['bbox']:
|
257
248
|
# 创建img_body_block
|
258
|
-
img_body_block = make_body_block(
|
249
|
+
img_body_block = make_body_block(
|
250
|
+
span, img_block['img_body_bbox'], BlockType.ImageBody)
|
259
251
|
block['blocks'].append(img_body_block)
|
260
252
|
|
261
253
|
# 从spans中移除img_body_block中已经放入的span
|
@@ -265,10 +257,15 @@ def fix_image_block(block, img_blocks):
|
|
265
257
|
# 根据list长度,判断img_block中是否有img_caption
|
266
258
|
if img_block['img_caption_bbox'] is not None:
|
267
259
|
img_caption_block, img_caption_spans = merge_spans_to_block(
|
268
|
-
block['spans'], img_block['img_caption_bbox'],
|
269
|
-
|
260
|
+
block['spans'], img_block['img_caption_bbox'],
|
261
|
+
BlockType.ImageCaption)
|
270
262
|
block['blocks'].append(img_caption_block)
|
271
263
|
|
264
|
+
if img_block['img_footnote_bbox'] is not None:
|
265
|
+
img_footnote_block, img_footnote_spans = merge_spans_to_block(
|
266
|
+
block['spans'], img_block['img_footnote_bbox'],
|
267
|
+
BlockType.ImageFootnote)
|
268
|
+
block['blocks'].append(img_footnote_block)
|
272
269
|
break
|
273
270
|
del block['spans']
|
274
271
|
return block
|
@@ -278,13 +275,17 @@ def fix_table_block(block, table_blocks):
|
|
278
275
|
block['blocks'] = []
|
279
276
|
# 遍历table_blocks,找到与当前block匹配的table_block
|
280
277
|
for table_block in table_blocks:
|
281
|
-
if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
|
278
|
+
if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
|
279
|
+
table_block['bbox'], 0.95):
|
282
280
|
|
283
281
|
# 创建table_body_block
|
284
282
|
for span in block['spans']:
|
285
|
-
if span['type'] == ContentType.Table and table_block[
|
283
|
+
if span['type'] == ContentType.Table and table_block[
|
284
|
+
'table_body_bbox'] == span['bbox']:
|
286
285
|
# 创建table_body_block
|
287
|
-
table_body_block = make_body_block(
|
286
|
+
table_body_block = make_body_block(
|
287
|
+
span, table_block['table_body_bbox'],
|
288
|
+
BlockType.TableBody)
|
288
289
|
block['blocks'].append(table_body_block)
|
289
290
|
|
290
291
|
# 从spans中移除img_body_block中已经放入的span
|
@@ -294,8 +295,8 @@ def fix_table_block(block, table_blocks):
|
|
294
295
|
# 根据list长度,判断table_block中是否有caption
|
295
296
|
if table_block['table_caption_bbox'] is not None:
|
296
297
|
table_caption_block, table_caption_spans = merge_spans_to_block(
|
297
|
-
block['spans'], table_block['table_caption_bbox'],
|
298
|
-
|
298
|
+
block['spans'], table_block['table_caption_bbox'],
|
299
|
+
BlockType.TableCaption)
|
299
300
|
block['blocks'].append(table_caption_block)
|
300
301
|
|
301
302
|
# 如果table_caption_block_spans不为空
|
@@ -307,8 +308,8 @@ def fix_table_block(block, table_blocks):
|
|
307
308
|
# 根据list长度,判断table_block中是否有table_note
|
308
309
|
if table_block['table_footnote_bbox'] is not None:
|
309
310
|
table_footnote_block, table_footnote_spans = merge_spans_to_block(
|
310
|
-
block['spans'], table_block['table_footnote_bbox'],
|
311
|
-
|
311
|
+
block['spans'], table_block['table_footnote_bbox'],
|
312
|
+
BlockType.TableFootnote)
|
312
313
|
block['blocks'].append(table_footnote_block)
|
313
314
|
|
314
315
|
break
|
magic_pdf/tools/cli.py
CHANGED
@@ -1,53 +1,77 @@
|
|
1
1
|
import os
|
2
|
+
from pathlib import Path
|
3
|
+
|
2
4
|
import click
|
3
5
|
from loguru import logger
|
4
|
-
from pathlib import Path
|
5
6
|
|
6
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
7
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
8
7
|
import magic_pdf.model as model_config
|
9
|
-
from magic_pdf.tools.common import parse_pdf_methods, do_parse
|
10
8
|
from magic_pdf.libs.version import __version__
|
9
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
10
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
11
|
+
from magic_pdf.tools.common import do_parse, parse_pdf_methods
|
11
12
|
|
12
13
|
|
13
14
|
@click.command()
|
14
|
-
@click.version_option(__version__,
|
15
|
+
@click.version_option(__version__,
|
16
|
+
'--version',
|
17
|
+
'-v',
|
18
|
+
help='display the version and exit')
|
15
19
|
@click.option(
|
16
|
-
|
17
|
-
|
18
|
-
|
20
|
+
'-p',
|
21
|
+
'--path',
|
22
|
+
'path',
|
19
23
|
type=click.Path(exists=True),
|
20
24
|
required=True,
|
21
|
-
help=
|
25
|
+
help='local pdf filepath or directory',
|
22
26
|
)
|
23
27
|
@click.option(
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
type=
|
28
|
-
|
29
|
-
|
28
|
+
'-o',
|
29
|
+
'--output-dir',
|
30
|
+
'output_dir',
|
31
|
+
type=click.Path(),
|
32
|
+
required=True,
|
33
|
+
help='output local directory',
|
30
34
|
)
|
31
35
|
@click.option(
|
32
|
-
|
33
|
-
|
34
|
-
|
36
|
+
'-m',
|
37
|
+
'--method',
|
38
|
+
'method',
|
35
39
|
type=parse_pdf_methods,
|
36
|
-
help="""the method for parsing pdf.
|
40
|
+
help="""the method for parsing pdf.
|
37
41
|
ocr: using ocr technique to extract information from pdf.
|
38
42
|
txt: suitable for the text-based pdf only and outperform ocr.
|
39
43
|
auto: automatically choose the best method for parsing pdf from ocr and txt.
|
40
44
|
without method specified, auto will be used by default.""",
|
41
|
-
default=
|
45
|
+
default='auto',
|
46
|
+
)
|
47
|
+
@click.option(
|
48
|
+
'-d',
|
49
|
+
'--debug',
|
50
|
+
'debug_able',
|
51
|
+
type=bool,
|
52
|
+
help='Enables detailed debugging information during the execution of the CLI commands.',
|
53
|
+
default=False,
|
54
|
+
)
|
55
|
+
@click.option(
|
56
|
+
'-s',
|
57
|
+
'--start',
|
58
|
+
'start_page_id',
|
59
|
+
type=int,
|
60
|
+
help='The starting page for PDF parsing, beginning from 0.',
|
61
|
+
default=0,
|
62
|
+
)
|
63
|
+
@click.option(
|
64
|
+
'-e',
|
65
|
+
'--end',
|
66
|
+
'end_page_id',
|
67
|
+
type=int,
|
68
|
+
help='The ending page for PDF parsing, beginning from 0.',
|
69
|
+
default=None,
|
42
70
|
)
|
43
|
-
def cli(path, output_dir, method):
|
71
|
+
def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
|
44
72
|
model_config.__use_inside_model__ = True
|
45
|
-
model_config.__model_mode__ =
|
46
|
-
|
47
|
-
if os.path.isdir(path):
|
48
|
-
output_dir = os.path.join(path, "output")
|
49
|
-
else:
|
50
|
-
output_dir = os.path.join(os.path.dirname(path), "output")
|
73
|
+
model_config.__model_mode__ = 'full'
|
74
|
+
os.makedirs(output_dir, exist_ok=True)
|
51
75
|
|
52
76
|
def read_fn(path):
|
53
77
|
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
@@ -63,17 +87,20 @@ def cli(path, output_dir, method):
|
|
63
87
|
pdf_data,
|
64
88
|
[],
|
65
89
|
method,
|
90
|
+
debug_able,
|
91
|
+
start_page_id=start_page_id,
|
92
|
+
end_page_id=end_page_id,
|
66
93
|
)
|
67
94
|
|
68
95
|
except Exception as e:
|
69
96
|
logger.exception(e)
|
70
97
|
|
71
98
|
if os.path.isdir(path):
|
72
|
-
for doc_path in Path(path).glob(
|
99
|
+
for doc_path in Path(path).glob('*.pdf'):
|
73
100
|
parse_doc(doc_path)
|
74
101
|
else:
|
75
102
|
parse_doc(path)
|
76
103
|
|
77
104
|
|
78
|
-
if __name__ ==
|
105
|
+
if __name__ == '__main__':
|
79
106
|
cli()
|
magic_pdf/tools/cli_dev.py
CHANGED
@@ -1,35 +1,32 @@
|
|
1
|
-
import os
|
2
1
|
import json as json_parse
|
3
|
-
import
|
2
|
+
import os
|
4
3
|
from pathlib import Path
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
remove_non_official_s3_args,
|
9
|
-
)
|
10
|
-
from magic_pdf.libs.config_reader import (
|
11
|
-
get_s3_config,
|
12
|
-
)
|
13
|
-
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
|
14
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
15
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
4
|
+
|
5
|
+
import click
|
6
|
+
|
16
7
|
import magic_pdf.model as model_config
|
17
|
-
from magic_pdf.
|
8
|
+
from magic_pdf.libs.config_reader import get_s3_config
|
9
|
+
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
|
10
|
+
remove_non_official_s3_args)
|
18
11
|
from magic_pdf.libs.version import __version__
|
12
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
13
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
14
|
+
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
|
15
|
+
from magic_pdf.tools.common import do_parse, parse_pdf_methods
|
19
16
|
|
20
17
|
|
21
18
|
def read_s3_path(s3path):
|
22
19
|
bucket, key = parse_s3path(s3path)
|
23
20
|
|
24
21
|
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
|
25
|
-
s3_rw = S3ReaderWriter(
|
26
|
-
|
27
|
-
)
|
22
|
+
s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, 'auto',
|
23
|
+
remove_non_official_s3_args(s3path))
|
28
24
|
may_range_params = parse_s3_range_params(s3path)
|
29
25
|
if may_range_params is None or 2 != len(may_range_params):
|
30
26
|
byte_start, byte_end = 0, None
|
31
27
|
else:
|
32
|
-
byte_start, byte_end = int(may_range_params[0]), int(
|
28
|
+
byte_start, byte_end = int(may_range_params[0]), int(
|
29
|
+
may_range_params[1])
|
33
30
|
return s3_rw.read_offset(
|
34
31
|
remove_non_official_s3_args(s3path),
|
35
32
|
byte_start,
|
@@ -38,51 +35,47 @@ def read_s3_path(s3path):
|
|
38
35
|
|
39
36
|
|
40
37
|
@click.group()
|
41
|
-
@click.version_option(__version__,
|
38
|
+
@click.version_option(__version__, '--version', '-v', help='显示版本信息')
|
42
39
|
def cli():
|
43
40
|
pass
|
44
41
|
|
45
42
|
|
46
43
|
@cli.command()
|
47
44
|
@click.option(
|
48
|
-
|
49
|
-
|
50
|
-
|
45
|
+
'-j',
|
46
|
+
'--jsonl',
|
47
|
+
'jsonl',
|
51
48
|
type=str,
|
52
|
-
help=
|
49
|
+
help='输入 jsonl 路径,本地或者 s3 上的文件',
|
53
50
|
required=True,
|
54
51
|
)
|
55
52
|
@click.option(
|
56
|
-
|
57
|
-
|
58
|
-
|
53
|
+
'-m',
|
54
|
+
'--method',
|
55
|
+
'method',
|
59
56
|
type=parse_pdf_methods,
|
60
|
-
help=
|
61
|
-
default=
|
57
|
+
help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
|
58
|
+
default='auto',
|
62
59
|
)
|
63
60
|
@click.option(
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
type=
|
68
|
-
|
69
|
-
|
61
|
+
'-o',
|
62
|
+
'--output-dir',
|
63
|
+
'output_dir',
|
64
|
+
type=click.Path(),
|
65
|
+
required=True,
|
66
|
+
help='输出到本地目录',
|
70
67
|
)
|
71
68
|
def jsonl(jsonl, method, output_dir):
|
72
69
|
model_config.__use_inside_model__ = False
|
73
|
-
if jsonl.startswith(
|
74
|
-
jso = json_parse.loads(read_s3_path(jsonl).decode(
|
75
|
-
full_jsonl_path = "."
|
70
|
+
if jsonl.startswith('s3://'):
|
71
|
+
jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
|
76
72
|
else:
|
77
|
-
full_jsonl_path = os.path.realpath(jsonl)
|
78
73
|
with open(jsonl) as f:
|
79
74
|
jso = json_parse.loads(f.readline())
|
80
|
-
|
81
|
-
|
82
|
-
output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
|
83
|
-
s3_file_path = jso.get("file_location")
|
75
|
+
os.makedirs(output_dir, exist_ok=True)
|
76
|
+
s3_file_path = jso.get('file_location')
|
84
77
|
if s3_file_path is None:
|
85
|
-
s3_file_path = jso.get(
|
78
|
+
s3_file_path = jso.get('path')
|
86
79
|
pdf_file_name = Path(s3_file_path).stem
|
87
80
|
pdf_data = read_s3_path(s3_file_path)
|
88
81
|
|
@@ -91,8 +84,9 @@ def jsonl(jsonl, method, output_dir):
|
|
91
84
|
output_dir,
|
92
85
|
pdf_file_name,
|
93
86
|
pdf_data,
|
94
|
-
jso[
|
87
|
+
jso['doc_layout_result'],
|
95
88
|
method,
|
89
|
+
False,
|
96
90
|
f_dump_content_list=True,
|
97
91
|
f_draw_model_bbox=True,
|
98
92
|
)
|
@@ -100,43 +94,45 @@ def jsonl(jsonl, method, output_dir):
|
|
100
94
|
|
101
95
|
@cli.command()
|
102
96
|
@click.option(
|
103
|
-
|
104
|
-
|
105
|
-
|
97
|
+
'-p',
|
98
|
+
'--pdf',
|
99
|
+
'pdf',
|
106
100
|
type=click.Path(exists=True),
|
107
101
|
required=True,
|
108
|
-
help=
|
102
|
+
help='本地 PDF 文件',
|
109
103
|
)
|
110
104
|
@click.option(
|
111
|
-
|
112
|
-
|
113
|
-
|
105
|
+
'-j',
|
106
|
+
'--json',
|
107
|
+
'json_data',
|
114
108
|
type=click.Path(exists=True),
|
115
109
|
required=True,
|
116
|
-
help=
|
117
|
-
)
|
118
|
-
@click.option(
|
119
|
-
"-o", "--output-dir", "output_dir", type=str, help="本地输出目录", default=""
|
110
|
+
help='本地模型推理出的 json 数据',
|
120
111
|
)
|
112
|
+
@click.option('-o',
|
113
|
+
'--output-dir',
|
114
|
+
'output_dir',
|
115
|
+
type=click.Path(),
|
116
|
+
required=True,
|
117
|
+
help='本地输出目录')
|
121
118
|
@click.option(
|
122
|
-
|
123
|
-
|
124
|
-
|
119
|
+
'-m',
|
120
|
+
'--method',
|
121
|
+
'method',
|
125
122
|
type=parse_pdf_methods,
|
126
|
-
help=
|
127
|
-
default=
|
123
|
+
help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
|
124
|
+
default='auto',
|
128
125
|
)
|
129
126
|
def pdf(pdf, json_data, output_dir, method):
|
130
127
|
model_config.__use_inside_model__ = False
|
131
128
|
full_pdf_path = os.path.realpath(pdf)
|
132
|
-
|
133
|
-
output_dir = os.path.join(os.path.dirname(full_pdf_path), "output")
|
129
|
+
os.makedirs(output_dir, exist_ok=True)
|
134
130
|
|
135
131
|
def read_fn(path):
|
136
132
|
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
137
133
|
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
138
134
|
|
139
|
-
model_json_list = json_parse.loads(read_fn(json_data).decode(
|
135
|
+
model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
|
140
136
|
|
141
137
|
file_name = str(Path(full_pdf_path).stem)
|
142
138
|
pdf_data = read_fn(full_pdf_path)
|
@@ -146,10 +142,11 @@ def pdf(pdf, json_data, output_dir, method):
|
|
146
142
|
pdf_data,
|
147
143
|
model_json_list,
|
148
144
|
method,
|
145
|
+
False,
|
149
146
|
f_dump_content_list=True,
|
150
147
|
f_draw_model_bbox=True,
|
151
148
|
)
|
152
149
|
|
153
150
|
|
154
|
-
if __name__ ==
|
151
|
+
if __name__ == '__main__':
|
155
152
|
cli()
|