magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +188 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +283 -166
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py CHANGED
@@ -133,6 +133,7 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
133
133
 
134
134
 
135
135
  def remove_overlaps_min_blocks(all_bboxes):
136
+ # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
136
137
  # 删除重叠blocks中较小的那些
137
138
  need_remove = []
138
139
  for block1 in all_bboxes:
@@ -142,9 +143,17 @@ def remove_overlaps_min_blocks(all_bboxes):
142
143
  block2_bbox = block2[:4]
143
144
  overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
144
145
  if overlap_box is not None:
145
- bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
146
- if bbox_to_remove is not None and bbox_to_remove not in need_remove:
147
- need_remove.append(bbox_to_remove)
146
+ block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
147
+ if block_to_remove is not None and block_to_remove not in need_remove:
148
+ large_block = block1 if block1 != block_to_remove else block2
149
+ x1, y1, x2, y2 = large_block[:4]
150
+ sx1, sy1, sx2, sy2 = block_to_remove[:4]
151
+ x1 = min(x1, sx1)
152
+ y1 = min(y1, sy1)
153
+ x2 = max(x2, sx2)
154
+ y2 = max(y2, sy2)
155
+ large_block[:4] = [x1, y1, x2, y2]
156
+ need_remove.append(block_to_remove)
148
157
 
149
158
  if len(need_remove) > 0:
150
159
  for block in need_remove:
magic_pdf/pre_proc/ocr_dict_merge.py CHANGED
@@ -1,18 +1,15 @@
1
- from loguru import logger
2
-
3
- from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
4
- calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
1
+ from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
2
+ _is_in_or_part_overlap_with_area_ratio,
3
+ calculate_overlap_area_in_bbox1_area_ratio)
5
4
  from magic_pdf.libs.drop_tag import DropTag
6
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType
7
- from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
8
- from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
5
+ from magic_pdf.libs.ocr_content_type import BlockType, ContentType
9
6
 
10
7
 
11
8
  # 将每一个line中的span从左到右排序
12
9
  def line_sort_spans_by_left_to_right(lines):
13
10
  line_objects = []
14
11
  for line in lines:
15
- # 按照x0坐标排序
12
+ # 按照x0坐标排序
16
13
  line.sort(key=lambda span: span['bbox'][0])
17
14
  line_bbox = [
18
15
  min(span['bbox'][0] for span in line), # x0
@@ -21,8 +18,8 @@ def line_sort_spans_by_left_to_right(lines):
21
18
  max(span['bbox'][3] for span in line), # y1
22
19
  ]
23
20
  line_objects.append({
24
- "bbox": line_bbox,
25
- "spans": line,
21
+ 'bbox': line_bbox,
22
+ 'spans': line,
26
23
  })
27
24
  return line_objects
28
25
 
@@ -39,16 +36,21 @@ def merge_spans_to_line(spans):
39
36
  for span in spans[1:]:
40
37
  # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
41
38
  # image和table类型,同上
42
- if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
43
- s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
44
- current_line):
39
+ if span['type'] in [
40
+ ContentType.InterlineEquation, ContentType.Image,
41
+ ContentType.Table
42
+ ] or any(s['type'] in [
43
+ ContentType.InterlineEquation, ContentType.Image,
44
+ ContentType.Table
45
+ ] for s in current_line):
45
46
  # 则开始新行
46
47
  lines.append(current_line)
47
48
  current_line = [span]
48
49
  continue
49
50
 
50
51
  # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
51
- if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
52
+ if __is_overlaps_y_exceeds_threshold(span['bbox'],
53
+ current_line[-1]['bbox']):
52
54
  current_line.append(span)
53
55
  else:
54
56
  # 否则,开始新行
@@ -71,7 +73,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
71
73
  # 遍历spans,将每个span放入对应的layout中
72
74
  layout_sapns = []
73
75
  for span in spans:
74
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
76
+ if calculate_overlap_area_in_bbox1_area_ratio(
77
+ span['bbox'], layout_bbox) > 0.6:
75
78
  layout_sapns.append(span)
76
79
  # 如果layout_sapns不为空,则放入new_spans中
77
80
  if len(layout_sapns) > 0:
@@ -99,12 +102,10 @@ def merge_lines_to_block(lines):
99
102
  # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
100
103
  blocks = []
101
104
  for line in lines:
102
- blocks.append(
103
- {
104
- "bbox": line["bbox"],
105
- "lines": [line],
106
- }
107
- )
105
+ blocks.append({
106
+ 'bbox': line['bbox'],
107
+ 'lines': [line],
108
+ })
108
109
  return blocks
109
110
 
110
111
 
@@ -121,7 +122,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
121
122
  if block[7] == BlockType.Footnote:
122
123
  continue
123
124
  block_bbox = block[:4]
124
- if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
125
+ if calculate_overlap_area_in_bbox1_area_ratio(
126
+ block_bbox, layout_bbox) > 0.8:
125
127
  layout_blocks.append(block)
126
128
 
127
129
  # 如果layout_blocks不为空,则放入new_blocks中
@@ -134,7 +136,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
134
136
  # 如果new_blocks不为空,则对new_blocks中每个block进行排序
135
137
  if len(new_blocks) > 0:
136
138
  for bboxes_in_layout_block in new_blocks:
137
- bboxes_in_layout_block.sort(key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
139
+ bboxes_in_layout_block.sort(
140
+ key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
138
141
  sort_blocks.extend(bboxes_in_layout_block)
139
142
 
140
143
  # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
@@ -142,9 +145,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
142
145
 
143
146
 
144
147
  def fill_spans_in_blocks(blocks, spans, radio):
145
- '''
146
- 将allspans中的span按位置关系,放入blocks中
147
- '''
148
+ """将allspans中的span按位置关系,放入blocks中."""
148
149
  block_with_spans = []
149
150
  for block in blocks:
150
151
  block_type = block[7]
@@ -156,17 +157,15 @@ def fill_spans_in_blocks(blocks, spans, radio):
156
157
  block_spans = []
157
158
  for span in spans:
158
159
  span_bbox = span['bbox']
159
- if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio:
160
+ if calculate_overlap_area_in_bbox1_area_ratio(
161
+ span_bbox, block_bbox) > radio:
160
162
  block_spans.append(span)
161
-
162
163
  '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
163
164
  # displayed_list = []
164
165
  # text_inline_lines = []
165
166
  # modify_y_axis(block_spans, displayed_list, text_inline_lines)
166
-
167
167
  '''模型识别错误的行间公式, type类型转换成行内公式'''
168
168
  # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
169
-
170
169
  '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
171
170
  # block_spans = remove_overlap_between_bbox_for_span(block_spans)
172
171
 
@@ -182,12 +181,9 @@ def fill_spans_in_blocks(blocks, spans, radio):
182
181
 
183
182
 
184
183
  def fix_block_spans(block_with_spans, img_blocks, table_blocks):
185
- '''
186
- 1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
187
- 需要将captionfootnote的text_span放入相应img_block和table_block内的
188
- caption_block和footnote_block中
189
- 2、同时需要删除block中的spans字段
190
- '''
184
+ """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
185
+ 需要将caption和footnote的text_span放入相应img_block和table_block内的
186
+ caption_blockfootnote_block中 2、同时需要删除block中的spans字段."""
191
187
  fix_blocks = []
192
188
  for block in block_with_spans:
193
189
  block_type = block['type']
@@ -218,16 +214,13 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
218
214
  block_spans = []
219
215
  # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
220
216
  for span in spans:
221
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6:
217
+ if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
218
+ block_bbox) > 0.6:
222
219
  block_spans.append(span)
223
220
  block_lines = merge_spans_to_line(block_spans)
224
221
  # 对line中的span进行排序
225
222
  sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
226
- block = {
227
- 'bbox': block_bbox,
228
- 'type': block_type,
229
- 'lines': sort_block_lines
230
- }
223
+ block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
231
224
  return block, block_spans
232
225
 
233
226
 
@@ -237,11 +230,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
237
230
  'bbox': block_bbox,
238
231
  'spans': [span],
239
232
  }
240
- body_block = {
241
- 'bbox': block_bbox,
242
- 'type': block_type,
243
- 'lines': [body_line]
244
- }
233
+ body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
245
234
  return body_block
246
235
 
247
236
 
@@ -249,13 +238,16 @@ def fix_image_block(block, img_blocks):
249
238
  block['blocks'] = []
250
239
  # 遍历img_blocks,找到与当前block匹配的img_block
251
240
  for img_block in img_blocks:
252
- if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95):
241
+ if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
242
+ img_block['bbox'], 0.95):
253
243
 
254
244
  # 创建img_body_block
255
245
  for span in block['spans']:
256
- if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']:
246
+ if span['type'] == ContentType.Image and img_block[
247
+ 'img_body_bbox'] == span['bbox']:
257
248
  # 创建img_body_block
258
- img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
249
+ img_body_block = make_body_block(
250
+ span, img_block['img_body_bbox'], BlockType.ImageBody)
259
251
  block['blocks'].append(img_body_block)
260
252
 
261
253
  # 从spans中移除img_body_block中已经放入的span
@@ -265,10 +257,15 @@ def fix_image_block(block, img_blocks):
265
257
  # 根据list长度,判断img_block中是否有img_caption
266
258
  if img_block['img_caption_bbox'] is not None:
267
259
  img_caption_block, img_caption_spans = merge_spans_to_block(
268
- block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
269
- )
260
+ block['spans'], img_block['img_caption_bbox'],
261
+ BlockType.ImageCaption)
270
262
  block['blocks'].append(img_caption_block)
271
263
 
264
+ if img_block['img_footnote_bbox'] is not None:
265
+ img_footnote_block, img_footnote_spans = merge_spans_to_block(
266
+ block['spans'], img_block['img_footnote_bbox'],
267
+ BlockType.ImageFootnote)
268
+ block['blocks'].append(img_footnote_block)
272
269
  break
273
270
  del block['spans']
274
271
  return block
@@ -278,13 +275,17 @@ def fix_table_block(block, table_blocks):
278
275
  block['blocks'] = []
279
276
  # 遍历table_blocks,找到与当前block匹配的table_block
280
277
  for table_block in table_blocks:
281
- if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95):
278
+ if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
279
+ table_block['bbox'], 0.95):
282
280
 
283
281
  # 创建table_body_block
284
282
  for span in block['spans']:
285
- if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']:
283
+ if span['type'] == ContentType.Table and table_block[
284
+ 'table_body_bbox'] == span['bbox']:
286
285
  # 创建table_body_block
287
- table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
286
+ table_body_block = make_body_block(
287
+ span, table_block['table_body_bbox'],
288
+ BlockType.TableBody)
288
289
  block['blocks'].append(table_body_block)
289
290
 
290
291
  # 从spans中移除img_body_block中已经放入的span
@@ -294,8 +295,8 @@ def fix_table_block(block, table_blocks):
294
295
  # 根据list长度,判断table_block中是否有caption
295
296
  if table_block['table_caption_bbox'] is not None:
296
297
  table_caption_block, table_caption_spans = merge_spans_to_block(
297
- block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
298
- )
298
+ block['spans'], table_block['table_caption_bbox'],
299
+ BlockType.TableCaption)
299
300
  block['blocks'].append(table_caption_block)
300
301
 
301
302
  # 如果table_caption_block_spans不为空
@@ -307,8 +308,8 @@ def fix_table_block(block, table_blocks):
307
308
  # 根据list长度,判断table_block中是否有table_note
308
309
  if table_block['table_footnote_bbox'] is not None:
309
310
  table_footnote_block, table_footnote_spans = merge_spans_to_block(
310
- block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
311
- )
311
+ block['spans'], table_block['table_footnote_bbox'],
312
+ BlockType.TableFootnote)
312
313
  block['blocks'].append(table_footnote_block)
313
314
 
314
315
  break
magic_pdf/tools/cli.py CHANGED
@@ -1,53 +1,77 @@
1
1
  import os
2
+ from pathlib import Path
3
+
2
4
  import click
3
5
  from loguru import logger
4
- from pathlib import Path
5
6
 
6
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
7
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
8
7
  import magic_pdf.model as model_config
9
- from magic_pdf.tools.common import parse_pdf_methods, do_parse
10
8
  from magic_pdf.libs.version import __version__
9
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
10
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
11
+ from magic_pdf.tools.common import do_parse, parse_pdf_methods
11
12
 
12
13
 
13
14
  @click.command()
14
- @click.version_option(__version__, "--version", "-v", help="display the version and exit")
15
+ @click.version_option(__version__,
16
+ '--version',
17
+ '-v',
18
+ help='display the version and exit')
15
19
  @click.option(
16
- "-p",
17
- "--path",
18
- "path",
20
+ '-p',
21
+ '--path',
22
+ 'path',
19
23
  type=click.Path(exists=True),
20
24
  required=True,
21
- help="local pdf filepath or directory",
25
+ help='local pdf filepath or directory',
22
26
  )
23
27
  @click.option(
24
- "-o",
25
- "--output-dir",
26
- "output_dir",
27
- type=str,
28
- help="output local directory",
29
- default="",
28
+ '-o',
29
+ '--output-dir',
30
+ 'output_dir',
31
+ type=click.Path(),
32
+ required=True,
33
+ help='output local directory',
30
34
  )
31
35
  @click.option(
32
- "-m",
33
- "--method",
34
- "method",
36
+ '-m',
37
+ '--method',
38
+ 'method',
35
39
  type=parse_pdf_methods,
36
- help="""the method for parsing pdf.
40
+ help="""the method for parsing pdf.
37
41
  ocr: using ocr technique to extract information from pdf.
38
42
  txt: suitable for the text-based pdf only and outperform ocr.
39
43
  auto: automatically choose the best method for parsing pdf from ocr and txt.
40
44
  without method specified, auto will be used by default.""",
41
- default="auto",
45
+ default='auto',
46
+ )
47
+ @click.option(
48
+ '-d',
49
+ '--debug',
50
+ 'debug_able',
51
+ type=bool,
52
+ help='Enables detailed debugging information during the execution of the CLI commands.',
53
+ default=False,
54
+ )
55
+ @click.option(
56
+ '-s',
57
+ '--start',
58
+ 'start_page_id',
59
+ type=int,
60
+ help='The starting page for PDF parsing, beginning from 0.',
61
+ default=0,
62
+ )
63
+ @click.option(
64
+ '-e',
65
+ '--end',
66
+ 'end_page_id',
67
+ type=int,
68
+ help='The ending page for PDF parsing, beginning from 0.',
69
+ default=None,
42
70
  )
43
- def cli(path, output_dir, method):
71
+ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
44
72
  model_config.__use_inside_model__ = True
45
- model_config.__model_mode__ = "full"
46
- if output_dir == "":
47
- if os.path.isdir(path):
48
- output_dir = os.path.join(path, "output")
49
- else:
50
- output_dir = os.path.join(os.path.dirname(path), "output")
73
+ model_config.__model_mode__ = 'full'
74
+ os.makedirs(output_dir, exist_ok=True)
51
75
 
52
76
  def read_fn(path):
53
77
  disk_rw = DiskReaderWriter(os.path.dirname(path))
@@ -63,17 +87,20 @@ def cli(path, output_dir, method):
63
87
  pdf_data,
64
88
  [],
65
89
  method,
90
+ debug_able,
91
+ start_page_id=start_page_id,
92
+ end_page_id=end_page_id,
66
93
  )
67
94
 
68
95
  except Exception as e:
69
96
  logger.exception(e)
70
97
 
71
98
  if os.path.isdir(path):
72
- for doc_path in Path(path).glob("*.pdf"):
99
+ for doc_path in Path(path).glob('*.pdf'):
73
100
  parse_doc(doc_path)
74
101
  else:
75
102
  parse_doc(path)
76
103
 
77
104
 
78
- if __name__ == "__main__":
105
+ if __name__ == '__main__':
79
106
  cli()
magic_pdf/tools/cli_dev.py CHANGED
@@ -1,35 +1,32 @@
1
- import os
2
1
  import json as json_parse
3
- import click
2
+ import os
4
3
  from pathlib import Path
5
- from magic_pdf.libs.path_utils import (
6
- parse_s3path,
7
- parse_s3_range_params,
8
- remove_non_official_s3_args,
9
- )
10
- from magic_pdf.libs.config_reader import (
11
- get_s3_config,
12
- )
13
- from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
14
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
15
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
4
+
5
+ import click
6
+
16
7
  import magic_pdf.model as model_config
17
- from magic_pdf.tools.common import parse_pdf_methods, do_parse
8
+ from magic_pdf.libs.config_reader import get_s3_config
9
+ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
10
+ remove_non_official_s3_args)
18
11
  from magic_pdf.libs.version import __version__
12
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
+ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
15
+ from magic_pdf.tools.common import do_parse, parse_pdf_methods
19
16
 
20
17
 
21
18
  def read_s3_path(s3path):
22
19
  bucket, key = parse_s3path(s3path)
23
20
 
24
21
  s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
25
- s3_rw = S3ReaderWriter(
26
- s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
27
- )
22
+ s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, 'auto',
23
+ remove_non_official_s3_args(s3path))
28
24
  may_range_params = parse_s3_range_params(s3path)
29
25
  if may_range_params is None or 2 != len(may_range_params):
30
26
  byte_start, byte_end = 0, None
31
27
  else:
32
- byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
28
+ byte_start, byte_end = int(may_range_params[0]), int(
29
+ may_range_params[1])
33
30
  return s3_rw.read_offset(
34
31
  remove_non_official_s3_args(s3path),
35
32
  byte_start,
@@ -38,51 +35,47 @@ def read_s3_path(s3path):
38
35
 
39
36
 
40
37
  @click.group()
41
- @click.version_option(__version__, "--version", "-v", help="显示版本信息")
38
+ @click.version_option(__version__, '--version', '-v', help='显示版本信息')
42
39
  def cli():
43
40
  pass
44
41
 
45
42
 
46
43
  @cli.command()
47
44
  @click.option(
48
- "-j",
49
- "--jsonl",
50
- "jsonl",
45
+ '-j',
46
+ '--jsonl',
47
+ 'jsonl',
51
48
  type=str,
52
- help="输入 jsonl 路径,本地或者 s3 上的文件",
49
+ help='输入 jsonl 路径,本地或者 s3 上的文件',
53
50
  required=True,
54
51
  )
55
52
  @click.option(
56
- "-m",
57
- "--method",
58
- "method",
53
+ '-m',
54
+ '--method',
55
+ 'method',
59
56
  type=parse_pdf_methods,
60
- help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
61
- default="auto",
57
+ help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
58
+ default='auto',
62
59
  )
63
60
  @click.option(
64
- "-o",
65
- "--output-dir",
66
- "output_dir",
67
- type=str,
68
- help="输出到本地目录",
69
- default="",
61
+ '-o',
62
+ '--output-dir',
63
+ 'output_dir',
64
+ type=click.Path(),
65
+ required=True,
66
+ help='输出到本地目录',
70
67
  )
71
68
  def jsonl(jsonl, method, output_dir):
72
69
  model_config.__use_inside_model__ = False
73
- if jsonl.startswith("s3://"):
74
- jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
75
- full_jsonl_path = "."
70
+ if jsonl.startswith('s3://'):
71
+ jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
76
72
  else:
77
- full_jsonl_path = os.path.realpath(jsonl)
78
73
  with open(jsonl) as f:
79
74
  jso = json_parse.loads(f.readline())
80
-
81
- if output_dir == "":
82
- output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
83
- s3_file_path = jso.get("file_location")
75
+ os.makedirs(output_dir, exist_ok=True)
76
+ s3_file_path = jso.get('file_location')
84
77
  if s3_file_path is None:
85
- s3_file_path = jso.get("path")
78
+ s3_file_path = jso.get('path')
86
79
  pdf_file_name = Path(s3_file_path).stem
87
80
  pdf_data = read_s3_path(s3_file_path)
88
81
 
@@ -91,8 +84,9 @@ def jsonl(jsonl, method, output_dir):
91
84
  output_dir,
92
85
  pdf_file_name,
93
86
  pdf_data,
94
- jso["doc_layout_result"],
87
+ jso['doc_layout_result'],
95
88
  method,
89
+ False,
96
90
  f_dump_content_list=True,
97
91
  f_draw_model_bbox=True,
98
92
  )
@@ -100,43 +94,45 @@ def jsonl(jsonl, method, output_dir):
100
94
 
101
95
  @cli.command()
102
96
  @click.option(
103
- "-p",
104
- "--pdf",
105
- "pdf",
97
+ '-p',
98
+ '--pdf',
99
+ 'pdf',
106
100
  type=click.Path(exists=True),
107
101
  required=True,
108
- help="本地 PDF 文件",
102
+ help='本地 PDF 文件',
109
103
  )
110
104
  @click.option(
111
- "-j",
112
- "--json",
113
- "json_data",
105
+ '-j',
106
+ '--json',
107
+ 'json_data',
114
108
  type=click.Path(exists=True),
115
109
  required=True,
116
- help="本地模型推理出的 json 数据",
117
- )
118
- @click.option(
119
- "-o", "--output-dir", "output_dir", type=str, help="本地输出目录", default=""
110
+ help='本地模型推理出的 json 数据',
120
111
  )
112
+ @click.option('-o',
113
+ '--output-dir',
114
+ 'output_dir',
115
+ type=click.Path(),
116
+ required=True,
117
+ help='本地输出目录')
121
118
  @click.option(
122
- "-m",
123
- "--method",
124
- "method",
119
+ '-m',
120
+ '--method',
121
+ 'method',
125
122
  type=parse_pdf_methods,
126
- help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
127
- default="auto",
123
+ help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
124
+ default='auto',
128
125
  )
129
126
  def pdf(pdf, json_data, output_dir, method):
130
127
  model_config.__use_inside_model__ = False
131
128
  full_pdf_path = os.path.realpath(pdf)
132
- if output_dir == "":
133
- output_dir = os.path.join(os.path.dirname(full_pdf_path), "output")
129
+ os.makedirs(output_dir, exist_ok=True)
134
130
 
135
131
  def read_fn(path):
136
132
  disk_rw = DiskReaderWriter(os.path.dirname(path))
137
133
  return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
138
134
 
139
- model_json_list = json_parse.loads(read_fn(json_data).decode("utf-8"))
135
+ model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
140
136
 
141
137
  file_name = str(Path(full_pdf_path).stem)
142
138
  pdf_data = read_fn(full_pdf_path)
@@ -146,10 +142,11 @@ def pdf(pdf, json_data, output_dir, method):
146
142
  pdf_data,
147
143
  model_json_list,
148
144
  method,
145
+ False,
149
146
  f_dump_content_list=True,
150
147
  f_draw_model_bbox=True,
151
148
  )
152
149
 
153
150
 
154
- if __name__ == "__main__":
151
+ if __name__ == '__main__':
155
152
  cli()