magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. magic_pdf/config/constants.py +7 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/base.py +13 -1
  4. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  5. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  6. magic_pdf/data/dataset.py +188 -5
  7. magic_pdf/data/read_api.py +59 -12
  8. magic_pdf/data/utils.py +35 -0
  9. magic_pdf/dict2md/ocr_mkcontent.py +16 -15
  10. magic_pdf/filter/__init__.py +32 -0
  11. magic_pdf/filter/pdf_meta_scan.py +3 -2
  12. magic_pdf/libs/clean_memory.py +11 -4
  13. magic_pdf/libs/config_reader.py +9 -0
  14. magic_pdf/libs/draw_bbox.py +19 -22
  15. magic_pdf/libs/language.py +3 -0
  16. magic_pdf/libs/pdf_check.py +30 -30
  17. magic_pdf/libs/version.py +1 -1
  18. magic_pdf/model/__init__.py +1 -1
  19. magic_pdf/model/batch_analyze.py +275 -0
  20. magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
  21. magic_pdf/model/magic_model.py +4 -435
  22. magic_pdf/model/model_list.py +1 -0
  23. magic_pdf/model/pdf_extract_kit.py +35 -5
  24. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  25. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  26. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  27. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  29. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  30. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  31. magic_pdf/model/sub_modules/model_init.py +43 -7
  32. magic_pdf/model/sub_modules/model_utils.py +17 -5
  33. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  34. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  35. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  36. magic_pdf/operators/__init__.py +94 -0
  37. magic_pdf/operators/models.py +154 -0
  38. magic_pdf/operators/pipes.py +191 -0
  39. magic_pdf/pdf_parse_union_core_v2.py +77 -27
  40. magic_pdf/post_proc/__init__.py +1 -0
  41. magic_pdf/post_proc/llm_aided.py +133 -0
  42. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  43. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  44. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  45. magic_pdf/tools/cli.py +36 -11
  46. magic_pdf/tools/common.py +120 -61
  47. magic_pdf/utils/office_to_pdf.py +29 -0
  48. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
  49. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
  50. magic_pdf/para/__init__.py +0 -0
  51. magic_pdf/pdf_parse_by_ocr.py +0 -23
  52. magic_pdf/pdf_parse_by_txt.py +0 -24
  53. magic_pdf/pipe/AbsPipe.py +0 -98
  54. magic_pdf/pipe/OCRPipe.py +0 -41
  55. magic_pdf/pipe/TXTPipe.py +0 -41
  56. magic_pdf/pipe/UNIPipe.py +0 -98
  57. magic_pdf/pipe/__init__.py +0 -0
  58. magic_pdf/rw/AbsReaderWriter.py +0 -17
  59. magic_pdf/rw/DiskReaderWriter.py +0 -74
  60. magic_pdf/rw/S3ReaderWriter.py +0 -142
  61. magic_pdf/rw/__init__.py +0 -0
  62. magic_pdf/user_api.py +0 -121
  63. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  64. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  65. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  66. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
magic_pdf/pdf_parse_union_core_v2.py CHANGED
@@ -1,11 +1,12 @@
1
1
  import copy
2
2
  import os
3
+ import re
3
4
  import statistics
4
5
  import time
5
6
  from typing import List
6
7
 
7
- import torch
8
8
  import fitz
9
+ import torch
9
10
  from loguru import logger
10
11
 
11
12
  from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -13,31 +14,31 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
13
14
  from magic_pdf.data.dataset import Dataset, PageableData
14
15
  from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
15
16
  from magic_pdf.libs.clean_memory import clean_memory
16
- from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
17
+ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config, get_device
17
18
  from magic_pdf.libs.convert_utils import dict_to_list
18
19
  from magic_pdf.libs.hash_utils import compute_md5
19
-
20
20
  from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
21
21
  from magic_pdf.model.magic_model import MagicModel
22
-
23
- os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
24
- os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
22
+ from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
25
23
 
26
24
  try:
27
25
  import torchtext
28
26
 
29
- if torchtext.__version__ >= "0.18.0":
27
+ if torchtext.__version__ >= '0.18.0':
30
28
  torchtext.disable_torchtext_deprecation_warning()
31
29
  except ImportError:
32
30
  pass
33
31
 
34
32
  from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
35
- from magic_pdf.para.para_split_v3 import para_split
33
+ from magic_pdf.post_proc.para_split_v3 import para_split
36
34
  from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
37
35
  from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
38
36
  from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
39
37
  from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
40
- from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
38
+ from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
39
+ remove_overlaps_min_spans, check_chars_is_overlap_in_span
40
+
41
+ os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
41
42
 
42
43
 
43
44
  def __replace_STX_ETX(text_str: str):
@@ -64,11 +65,22 @@ def __replace_0xfffd(text_str: str):
64
65
  return s
65
66
  return text_str
66
67
 
68
+
69
+ # 连写字符拆分
70
+ def __replace_ligatures(text: str):
71
+ ligatures = {
72
+ 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬅ': 'ft', 'ﬆ': 'st'
73
+ }
74
+ return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)
75
+
76
+
67
77
  def chars_to_content(span):
68
78
  # 检查span中的char是否为空
69
79
  if len(span['chars']) == 0:
70
80
  pass
71
81
  # span['content'] = ''
82
+ elif check_chars_is_overlap_in_span(span['chars']):
83
+ pass
72
84
  else:
73
85
  # 先给chars按char['bbox']的中心点的x坐标排序
74
86
  span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
@@ -79,11 +91,16 @@ def chars_to_content(span):
79
91
 
80
92
  content = ''
81
93
  for char in span['chars']:
82
- # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
83
- if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
84
- content += ' '
85
- content += char['c']
86
94
 
95
+ # 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
96
+ char1 = char
97
+ char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
98
+ if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
99
+ content += f"{char['c']} "
100
+ else:
101
+ content += char['c']
102
+
103
+ content = __replace_ligatures(content)
87
104
  span['content'] = __replace_0xfffd(content)
88
105
 
89
106
  del span['chars']
@@ -99,6 +116,10 @@ def fill_char_in_spans(spans, all_chars):
99
116
  spans = sorted(spans, key=lambda x: x['bbox'][1])
100
117
 
101
118
  for char in all_chars:
119
+ # 跳过非法bbox的char
120
+ x1, y1, x2, y2 = char['bbox']
121
+ if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01:
122
+ continue
102
123
  for span in spans:
103
124
  if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
104
125
  span['chars'].append(char)
@@ -153,14 +174,16 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
153
174
 
154
175
 
155
176
  def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
177
+ # cid用0xfffd表示,连字符拆开
178
+ # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
156
179
 
157
- text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
158
-
180
+ # cid用0xfffd表示,连字符不拆开
181
+ text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
159
182
  all_pymu_chars = []
160
183
  for block in text_blocks_raw:
161
184
  for line in block['lines']:
162
185
  cosine, sine = line['dir']
163
- if abs (cosine) < 0.9 or abs(sine) > 0.1:
186
+ if abs(cosine) < 0.9 or abs(sine) > 0.1:
164
187
  continue
165
188
  for span in line['spans']:
166
189
  all_pymu_chars.extend(span['chars'])
@@ -233,7 +256,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
233
256
  # 初始化ocr模型
234
257
  atom_model_manager = AtomModelSingleton()
235
258
  ocr_model = atom_model_manager.get_atom_model(
236
- atom_model_name="ocr",
259
+ atom_model_name='ocr',
237
260
  ocr_show_log=False,
238
261
  det_db_box_thresh=0.3,
239
262
  lang=lang
@@ -241,7 +264,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
241
264
 
242
265
  for span in empty_spans:
243
266
  # 对span的bbox截图再ocr
244
- span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
267
+ span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
245
268
  ocr_res = ocr_model.ocr(span_img, det=False)
246
269
  if ocr_res and len(ocr_res) > 0:
247
270
  if len(ocr_res[0]) > 0:
@@ -256,19 +279,23 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
256
279
  return spans
257
280
 
258
281
 
259
- def replace_text_span(pymu_spans, ocr_spans):
260
- return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
261
-
262
-
263
282
  def model_init(model_name: str):
264
283
  from transformers import LayoutLMv3ForTokenClassification
265
-
284
+ device = get_device()
266
285
  if torch.cuda.is_available():
267
286
  device = torch.device('cuda')
268
287
  if torch.cuda.is_bf16_supported():
269
288
  supports_bfloat16 = True
270
289
  else:
271
290
  supports_bfloat16 = False
291
+ elif str(device).startswith("npu"):
292
+ import torch_npu
293
+ if torch_npu.npu.is_available():
294
+ device = torch.device('npu')
295
+ supports_bfloat16 = False
296
+ else:
297
+ device = torch.device('cpu')
298
+ supports_bfloat16 = False
272
299
  else:
273
300
  device = torch.device('cpu')
274
301
  supports_bfloat16 = False
@@ -346,6 +373,8 @@ def cal_block_index(fix_blocks, sorted_bboxes):
346
373
  # 使用xycut排序
347
374
  block_bboxes = []
348
375
  for block in fix_blocks:
376
+ # 如果block['bbox']任意值小于0,将其置为0
377
+ block['bbox'] = [max(0, x) for x in block['bbox']]
349
378
  block_bboxes.append(block['bbox'])
350
379
 
351
380
  # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
@@ -681,7 +710,7 @@ def parse_page_core(
681
710
  """根据parse_mode,构造spans,主要是文本类的字符填充"""
682
711
  if parse_mode == SupportedPdfParseMethod.TXT:
683
712
 
684
- """使用新版本的混合ocr方案"""
713
+ """使用新版本的混合ocr方案."""
685
714
  spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
686
715
 
687
716
  elif parse_mode == SupportedPdfParseMethod.OCR:
@@ -689,7 +718,6 @@ def parse_page_core(
689
718
  else:
690
719
  raise Exception('parse_mode must be txt or ocr')
691
720
 
692
-
693
721
  """先处理不需要排版的discarded_blocks"""
694
722
  discarded_block_with_spans, spans = fill_spans_in_blocks(
695
723
  all_discarded_blocks, spans, 0.4
@@ -740,6 +768,11 @@ def parse_page_core(
740
768
  """重排block"""
741
769
  sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
742
770
 
771
+ """block内重排(img和table的block内多个caption或footnote的排序)"""
772
+ for block in sorted_blocks:
773
+ if block['type'] in [BlockType.Image, BlockType.Table]:
774
+ block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])
775
+
743
776
  """获取QA需要外置的list"""
744
777
  images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)
745
778
 
@@ -762,8 +795,8 @@ def parse_page_core(
762
795
 
763
796
 
764
797
  def pdf_parse_union(
765
- dataset: Dataset,
766
798
  model_list,
799
+ dataset: Dataset,
767
800
  imageWriter,
768
801
  parse_mode,
769
802
  start_page_id=0,
@@ -771,6 +804,7 @@ def pdf_parse_union(
771
804
  debug_mode=False,
772
805
  lang=None,
773
806
  ):
807
+
774
808
  pdf_bytes_md5 = compute_md5(dataset.data_bits())
775
809
 
776
810
  """初始化空的pdf_info_dict"""
@@ -820,13 +854,29 @@ def pdf_parse_union(
820
854
  """分段"""
821
855
  para_split(pdf_info_dict)
822
856
 
857
+ """llm优化"""
858
+ llm_aided_config = get_llm_aided_config()
859
+ if llm_aided_config is not None:
860
+ """公式优化"""
861
+ formula_aided_config = llm_aided_config.get('formula_aided', None)
862
+ if formula_aided_config is not None:
863
+ llm_aided_formula(pdf_info_dict, formula_aided_config)
864
+ """文本优化"""
865
+ text_aided_config = llm_aided_config.get('text_aided', None)
866
+ if text_aided_config is not None:
867
+ llm_aided_text(pdf_info_dict, text_aided_config)
868
+ """标题优化"""
869
+ title_aided_config = llm_aided_config.get('title_aided', None)
870
+ if title_aided_config is not None:
871
+ llm_aided_title(pdf_info_dict, title_aided_config)
872
+
823
873
  """dict转list"""
824
874
  pdf_info_list = dict_to_list(pdf_info_dict)
825
875
  new_pdf_info_dict = {
826
876
  'pdf_info': pdf_info_list,
827
877
  }
828
878
 
829
- clean_memory()
879
+ clean_memory(get_device())
830
880
 
831
881
  return new_pdf_info_dict
832
882
 
magic_pdf/post_proc/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
magic_pdf/post_proc/llm_aided.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
2
+ import json
3
+ from loguru import logger
4
+ from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
5
+ from openai import OpenAI
6
+
7
+
8
+ #@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
9
+ formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容:
10
+
11
+ 1. 修正渲染或编译错误:
12
+ - Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
13
+ - 包含KaTeX不支持的关键词等原因导致的无法编译或渲染的错误
14
+
15
+ 2. 保留原始信息:
16
+ - 保留原始公式中的所有重要信息
17
+ - 不要添加任何原始公式中没有的新信息
18
+
19
+ IMPORTANT:请仅返回修正后的公式,不要包含任何介绍、解释或元数据。
20
+
21
+ LaTeX recognition result:
22
+ $FORMULA
23
+
24
+ Your corrected result:
25
+ """
26
+
27
+ text_optimize_prompt = f"""请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容:
28
+
29
+ 1. 修正OCR引起的拼写错误和错误:
30
+ - 修正常见的OCR错误(例如,'rn' 被误读为 'm')
31
+ - 使用上下文和常识进行修正
32
+ - 只修正明显的错误,不要不必要的修改内容
33
+ - 不要添加额外的句号或其他不必要的标点符号
34
+
35
+ 2. 保持原始结构:
36
+ - 保留所有标题和子标题
37
+
38
+ 3. 保留原始内容:
39
+ - 保留原始文本中的所有重要信息
40
+ - 不要添加任何原始文本中没有的新信息
41
+ - 保留段落之间的换行符
42
+
43
+ 4. 保持连贯性:
44
+ - 确保内容与前文顺畅连接
45
+ - 适当处理在句子中间开始或结束的文本
46
+
47
+ 5. 修正行内公式:
48
+ - 去除行内公式前后多余的空格
49
+ - 修正公式中的OCR错误
50
+ - 确保公式能够通过KaTeX渲染
51
+
52
+ 6. 修正全角字符
53
+ - 修正全角标点符号为半角标点符号
54
+ - 修正全角字母为半角字母
55
+ - 修正全角数字为半角数字
56
+
57
+ IMPORTANT:请仅返回修正后的文本,保留所有原始格式,包括换行符。不要包含任何介绍、解释或元数据。
58
+
59
+ Previous context:
60
+
61
+ Current chunk to process:
62
+
63
+ Corrected text:
64
+ """
65
+
66
+ def llm_aided_formula(pdf_info_dict, formula_aided_config):
67
+ pass
68
+
69
+ def llm_aided_text(pdf_info_dict, text_aided_config):
70
+ pass
71
+
72
+ def llm_aided_title(pdf_info_dict, title_aided_config):
73
+ client = OpenAI(
74
+ api_key=title_aided_config["api_key"],
75
+ base_url=title_aided_config["base_url"],
76
+ )
77
+ title_dict = {}
78
+ origin_title_list = []
79
+ i = 0
80
+ for page_num, page in pdf_info_dict.items():
81
+ blocks = page["para_blocks"]
82
+ for block in blocks:
83
+ if block["type"] == "title":
84
+ origin_title_list.append(block)
85
+ title_text = merge_para_with_text(block)
86
+ title_dict[f"{i}"] = title_text
87
+ i += 1
88
+ # logger.info(f"Title list: {title_dict}")
89
+
90
+ title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构:
91
+
92
+ 1. 保留原始内容:
93
+ - 输入的字典中所有元素都是有效的,不能删除字典中的任何元素
94
+ - 请务必保证输出的字典中元素的数量和输入的数量一致
95
+
96
+ 2. 保持字典内key-value的对应关系不变
97
+
98
+ 3. 优化层次结构:
99
+ - 为每个标题元素添加适当的层次结构
100
+ - 标题层级应具有连续性,不能跳过某一层级
101
+ - 标题层级最多为4级,不要添加过多的层级
102
+ - 优化后的标题为一个整数,代表该标题的层级
103
+
104
+ IMPORTANT:
105
+ 请直接返回优化过的由标题层级组成的json,返回的json不需要格式化。
106
+
107
+ Input title list:
108
+ {title_dict}
109
+
110
+ Corrected title list:
111
+ """
112
+
113
+ completion = client.chat.completions.create(
114
+ model=title_aided_config["model"],
115
+ messages=[
116
+ {'role': 'user', 'content': title_optimize_prompt}],
117
+ temperature=0.7,
118
+ )
119
+
120
+ json_completion = json.loads(completion.choices[0].message.content)
121
+
122
+ # logger.info(f"Title completion: {json_completion}")
123
+
124
+ # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
125
+ if len(json_completion) == len(title_dict):
126
+ try:
127
+ for i, origin_title_block in enumerate(origin_title_list):
128
+ origin_title_block["level"] = int(json_completion[str(i)])
129
+ except Exception as e:
130
+ logger.exception(e)
131
+ else:
132
+ logger.error("The number of titles in the optimized result is not equal to the number of titles in the input.")
133
+
magic_pdf/pre_proc/ocr_span_list_modify.py CHANGED
@@ -33,6 +33,14 @@ def remove_overlaps_low_confidence_spans(spans):
33
33
  return spans, dropped_spans
34
34
 
35
35
 
36
+ def check_chars_is_overlap_in_span(chars):
37
+ for i in range(len(chars)):
38
+ for j in range(i + 1, len(chars)):
39
+ if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.9:
40
+ return True
41
+ return False
42
+
43
+
36
44
  def remove_overlaps_min_spans(spans):
37
45
  dropped_spans = []
38
46
  # 删除重叠spans中较小的那些
magic_pdf/pre_proc/remove_bbox_overlap.py CHANGED
@@ -70,7 +70,7 @@ def _remove_overlap_between_bboxes(arr):
70
70
  res[i] = None
71
71
  else:
72
72
  keeps[idx] = False
73
- drop_reasons.append(drop_reasons)
73
+ drop_reasons.append(drop_reason)
74
74
  if keeps[idx]:
75
75
  res[idx] = v
76
76
  return res, drop_reasons
magic_pdf/tools/cli.py CHANGED
@@ -1,13 +1,20 @@
1
1
  import os
2
- from pathlib import Path
3
-
2
+ import shutil
3
+ import tempfile
4
4
  import click
5
+ import fitz
5
6
  from loguru import logger
7
+ from pathlib import Path
6
8
 
7
9
  import magic_pdf.model as model_config
8
10
  from magic_pdf.data.data_reader_writer import FileBasedDataReader
9
11
  from magic_pdf.libs.version import __version__
10
12
  from magic_pdf.tools.common import do_parse, parse_pdf_methods
13
+ from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
14
+
15
+ pdf_suffixes = ['.pdf']
16
+ ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
17
+ image_suffixes = ['.png', '.jpeg', '.jpg']
11
18
 
12
19
 
13
20
  @click.command()
@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
21
28
  'path',
22
29
  type=click.Path(exists=True),
23
30
  required=True,
24
- help='local pdf filepath or directory',
31
+ help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
25
32
  )
26
33
  @click.option(
27
34
  '-o',
@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
83
90
  model_config.__use_inside_model__ = True
84
91
  model_config.__model_mode__ = 'full'
85
92
  os.makedirs(output_dir, exist_ok=True)
93
+ temp_dir = tempfile.mkdtemp()
94
+ def read_fn(path: Path):
95
+ if path.suffix in ms_office_suffixes:
96
+ convert_file_to_pdf(str(path), temp_dir)
97
+ fn = os.path.join(temp_dir, f"{path.stem}.pdf")
98
+ elif path.suffix in image_suffixes:
99
+ with open(str(path), 'rb') as f:
100
+ bits = f.read()
101
+ pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
102
+ fn = os.path.join(temp_dir, f"{path.stem}.pdf")
103
+ with open(fn, 'wb') as f:
104
+ f.write(pdf_bytes)
105
+ elif path.suffix in pdf_suffixes:
106
+ fn = str(path)
107
+ else:
108
+ raise Exception(f"Unknown file suffix: {path.suffix}")
109
+
110
+ disk_rw = FileBasedDataReader(os.path.dirname(fn))
111
+ return disk_rw.read(os.path.basename(fn))
86
112
 
87
- def read_fn(path):
88
- disk_rw = FileBasedDataReader(os.path.dirname(path))
89
- return disk_rw.read(os.path.basename(path))
90
-
91
- def parse_doc(doc_path: str):
113
+ def parse_doc(doc_path: Path):
92
114
  try:
93
115
  file_name = str(Path(doc_path).stem)
94
116
  pdf_data = read_fn(doc_path)
@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
108
130
  logger.exception(e)
109
131
 
110
132
  if os.path.isdir(path):
111
- for doc_path in Path(path).glob('*.pdf'):
112
- parse_doc(doc_path)
133
+ for doc_path in Path(path).glob('*'):
134
+ if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
135
+ parse_doc(doc_path)
113
136
  else:
114
- parse_doc(path)
137
+ parse_doc(Path(path))
138
+
139
+ shutil.rmtree(temp_dir)
115
140
 
116
141
 
117
142
  if __name__ == '__main__':