magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -2,21 +2,20 @@ import re
2
2
 
3
3
  from loguru import logger
4
4
 
5
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
6
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
5
7
  from magic_pdf.libs.commons import join_path
6
8
  from magic_pdf.libs.language import detect_lang
7
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
8
9
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
9
- from magic_pdf.libs.ocr_content_type import BlockType, ContentType
10
10
  from magic_pdf.para.para_split_v3 import ListLineTag
11
11
 
12
12
 
13
13
  def __is_hyphen_at_line_end(line):
14
- """
15
- Check if a line ends with one or more letters followed by a hyphen.
16
-
14
+ """Check if a line ends with one or more letters followed by a hyphen.
15
+
17
16
  Args:
18
17
  line (str): The line of text to check.
19
-
18
+
20
19
  Returns:
21
20
  bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
22
21
  """
@@ -142,9 +141,10 @@ def merge_para_with_text(para_block):
142
141
  span_type = span['type']
143
142
  if span_type == ContentType.Text:
144
143
  line_text += span['content'].strip()
144
+
145
145
  if line_text != '':
146
146
  line_lang = detect_lang(line_text)
147
- for span in line['spans']:
147
+ for j, span in enumerate(line['spans']):
148
148
 
149
149
  span_type = span['type']
150
150
  content = ''
@@ -162,16 +162,16 @@ def merge_para_with_text(para_block):
162
162
  if span_type in [ContentType.Text, ContentType.InterlineEquation]:
163
163
  para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
164
164
  elif span_type == ContentType.InlineEquation:
165
- para_text += f" {content} "
165
+ para_text += f' {content} '
166
166
  else:
167
167
  if span_type in [ContentType.Text, ContentType.InlineEquation]:
168
- # 如果是前一行带有-连字符,那么末尾不应该加空格
169
- if __is_hyphen_at_line_end(content):
168
+ # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
169
+ if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
170
170
  para_text += content[:-1]
171
- elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
171
+ elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
172
172
  para_text += content
173
173
  else: # 西方文本语境下 content间需要空格分隔
174
- para_text += f"{content} "
174
+ para_text += f'{content} '
175
175
  elif span_type == ContentType.InterlineEquation:
176
176
  para_text += content
177
177
  else:
@@ -1,16 +1,13 @@
1
- """
2
- 输入: s3路径,每行一个
3
- 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
4
- """
1
+ """输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
2
+
5
3
  import sys
6
- import click
4
+ from collections import Counter
7
5
 
8
- from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list
9
- from magic_pdf.libs.commons import fitz
6
+ import click
10
7
  from loguru import logger
11
- from collections import Counter
12
8
 
13
- from magic_pdf.libs.drop_reason import DropReason
9
+ from magic_pdf.config.drop_reason import DropReason
10
+ from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
14
11
  from magic_pdf.libs.language import detect_lang
15
12
  from magic_pdf.libs.pdf_check import detect_invalid_chars
16
13
 
@@ -19,8 +16,10 @@ junk_limit_min = 10
19
16
 
20
17
 
21
18
  def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
22
- max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
23
- result]
19
+ max_image_area_per_page = [
20
+ mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz])
21
+ for page_img_sz in result
22
+ ]
24
23
  page_area = int(page_width_pts) * int(page_height_pts)
25
24
  max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
26
25
  max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
@@ -32,8 +31,10 @@ def process_image(page, junk_img_bojids=[]):
32
31
  items = page.get_images()
33
32
  dedup = set()
34
33
  for img in items:
35
- # 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
36
- img_bojid = img[0] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
34
+ # 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
35
+ img_bojid = img[
36
+ 0
37
+ ] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
37
38
  if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过
38
39
  continue
39
40
  recs = page.get_image_rects(img, transform=True)
@@ -42,9 +43,17 @@ def process_image(page, junk_img_bojids=[]):
42
43
  x0, y0, x1, y1 = map(int, rec)
43
44
  width = x1 - x0
44
45
  height = y1 - y0
45
- if (x0, y0, x1, y1, img_bojid) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉
46
+ if (
47
+ x0,
48
+ y0,
49
+ x1,
50
+ y1,
51
+ img_bojid,
52
+ ) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉
46
53
  continue
47
- if not all([width, height]): # 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义
54
+ if not all(
55
+ [width, height]
56
+ ): # 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义
48
57
  continue
49
58
  dedup.add((x0, y0, x1, y1, img_bojid))
50
59
  page_result.append([x0, y0, x1, y1, img_bojid])
@@ -52,29 +61,33 @@ def process_image(page, junk_img_bojids=[]):
52
61
 
53
62
 
54
63
  def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
55
- """
56
- 返回每个页面里的图片的四元组,每个页面多个图片。
64
+ """返回每个页面里的图片的四元组,每个页面多个图片。
65
+
57
66
  :param doc:
58
67
  :return:
59
68
  """
60
- # 使用 Counter 计数 img_bojid 的出现次数
69
+ # 使用 Counter 计数 img_bojid 的出现次数
61
70
  img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
62
- # 找出出现次数超过 len(doc) 半数的 img_bojid
71
+ # 找出出现次数超过 len(doc) 半数的 img_bojid
63
72
 
64
73
  junk_limit = max(len(doc) * 0.5, junk_limit_min) # 对一些页数比较少的进行豁免
65
74
 
66
- junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
67
-
68
- #todo 加个判断,用前十页就行,这些垃圾图片需要满足两个条件,不止出现的次数要足够多,而且图片占书页面积的比例要足够大,且图与图大小都差不多
69
- #有两种扫描版,一种文字版,这里可能会有误判
70
- #扫描版1:每页都有所有扫描页图片,特点是图占比大,每页展示1张
71
- #扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断
72
- #文字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist
75
+ junk_img_bojids = [
76
+ img_bojid
77
+ for img_bojid, count in img_bojid_counter.items()
78
+ if count >= junk_limit
79
+ ]
80
+
81
+ # todo 加个判断,用前十页就行,这些垃圾图片需要满足两个条件,不止出现的次数要足够多,而且图片占书页面积的比例要足够大,且图与图大小都差不多
82
+ # 有两种扫描版,一种文字版,这里可能会有误判
83
+ # 扫描版1:每页都有所有扫描页图片,特点是图占比大,每页展示1张
84
+ # 扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断
85
+ # 文 字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist
73
86
  imgs_len_list = [len(page.get_images()) for page in doc]
74
87
 
75
88
  special_limit_pages = 10
76
89
 
77
- # 统一用前十页结果做判断
90
+ # 统一用前十页结果做判断
78
91
  result = []
79
92
  break_loop = False
80
93
  for i, page in enumerate(doc):
@@ -82,12 +95,18 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
82
95
  break
83
96
  if i >= special_limit_pages:
84
97
  break
85
- page_result = process_image(page) # 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析
98
+ page_result = process_image(
99
+ page
100
+ ) # 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析
86
101
  result.append(page_result)
87
102
  for item in result:
88
- if not any(item): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
89
- if max(imgs_len_list) == min(imgs_len_list) and max(
90
- imgs_len_list) >= junk_limit_min: # 如果是特殊文字版,就把junklist置空并break
103
+ if not any(
104
+ item
105
+ ): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
106
+ if (
107
+ max(imgs_len_list) == min(imgs_len_list)
108
+ and max(imgs_len_list) >= junk_limit_min
109
+ ): # 如果是特殊文字版,就把junklist置空并break
91
110
  junk_img_bojids = []
92
111
  else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
93
112
  pass
@@ -98,20 +117,23 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
98
117
  top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
99
118
  # 检查前80%的元素是否都相等
100
119
  if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
101
-
102
120
  # # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
103
121
  # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
104
122
 
105
- #前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
106
- max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
107
- if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
123
+ # 前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
124
+ max_image_area_per_page = calculate_max_image_area_per_page(
125
+ result, page_width_pts, page_height_pts
126
+ )
127
+ if (
128
+ len(max_image_area_per_page) < 0.8 * special_limit_pages
129
+ ): # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
108
130
  junk_img_bojids = []
109
131
  else: # 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
110
132
  pass
111
133
  else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片
112
134
  junk_img_bojids = []
113
135
 
114
- #正式进入取前50页图片的信息流程
136
+ # 正式进入取前50页图片的信息流程
115
137
  result = []
116
138
  for i, page in enumerate(doc):
117
139
  if i >= scan_max_page:
@@ -126,7 +148,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
126
148
  def get_pdf_page_size_pts(doc: fitz.Document):
127
149
  page_cnt = len(doc)
128
150
  l: int = min(page_cnt, 50)
129
- #把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了)
151
+ # 把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了)
130
152
  page_width_list = []
131
153
  page_height_list = []
132
154
  for i in range(l):
@@ -152,8 +174,8 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
152
174
  # 拿所有text的blocks
153
175
  # text_block = page.get_text("words")
154
176
  # text_block_len = sum([len(t[4]) for t in text_block])
155
- #拿所有text的str
156
- text_block = page.get_text("text")
177
+ # 拿所有text的str
178
+ text_block = page.get_text('text')
157
179
  text_block_len = len(text_block)
158
180
  # logger.info(f"page {page.number} text_block_len: {text_block_len}")
159
181
  text_len_lst.append(text_block_len)
@@ -162,15 +184,13 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
162
184
 
163
185
 
164
186
  def get_pdf_text_layout_per_page(doc: fitz.Document):
165
- """
166
- 根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
187
+ """根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
167
188
 
168
189
  Args:
169
190
  doc (fitz.Document): PDF文档对象。
170
191
 
171
192
  Returns:
172
193
  List[str]: 每一页的文本布局(横向、纵向、未知)。
173
-
174
194
  """
175
195
  text_layout_list = []
176
196
 
@@ -180,11 +200,11 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
180
200
  # 创建每一页的纵向和横向的文本行数计数器
181
201
  vertical_count = 0
182
202
  horizontal_count = 0
183
- text_dict = page.get_text("dict")
184
- if "blocks" in text_dict:
185
- for block in text_dict["blocks"]:
203
+ text_dict = page.get_text('dict')
204
+ if 'blocks' in text_dict:
205
+ for block in text_dict['blocks']:
186
206
  if 'lines' in block:
187
- for line in block["lines"]:
207
+ for line in block['lines']:
188
208
  # 获取line的bbox顶点坐标
189
209
  x0, y0, x1, y1 = line['bbox']
190
210
  # 计算bbox的宽高
@@ -199,8 +219,12 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
199
219
  if len(font_sizes) > 0:
200
220
  average_font_size = sum(font_sizes) / len(font_sizes)
201
221
  else:
202
- average_font_size = 10 # 有的line拿不到font_size,先定一个阈值100
203
- if area <= average_font_size ** 2: # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向
222
+ average_font_size = (
223
+ 10 # 有的line拿不到font_size,先定一个阈值100
224
+ )
225
+ if (
226
+ area <= average_font_size**2
227
+ ): # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向
204
228
  continue
205
229
  else:
206
230
  if 'wmode' in line: # 通过wmode判断文本方向
@@ -228,22 +252,22 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
228
252
  # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
229
253
  # 判断每一页的文本布局
230
254
  if vertical_count == 0 and horizontal_count == 0: # 该页没有文本,无法判断
231
- text_layout_list.append("unknow")
255
+ text_layout_list.append('unknow')
232
256
  continue
233
257
  else:
234
258
  if vertical_count > horizontal_count: # 该页的文本纵向行数大于横向的
235
- text_layout_list.append("vertical")
259
+ text_layout_list.append('vertical')
236
260
  else: # 该页的文本横向行数大于纵向的
237
- text_layout_list.append("horizontal")
261
+ text_layout_list.append('horizontal')
238
262
  # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
239
263
  return text_layout_list
240
264
 
241
265
 
242
- '''定义一个自定义异常用来抛出单页svg太多的pdf'''
266
+ """定义一个自定义异常用来抛出单页svg太多的pdf"""
243
267
 
244
268
 
245
269
  class PageSvgsTooManyError(Exception):
246
- def __init__(self, message="Page SVGs are too many"):
270
+ def __init__(self, message='Page SVGs are too many'):
247
271
  self.message = message
248
272
  super().__init__(self.message)
249
273
 
@@ -285,7 +309,7 @@ def get_language(doc: fitz.Document):
285
309
  if page_id >= scan_max_page:
286
310
  break
287
311
  # 拿所有text的str
288
- text_block = page.get_text("text")
312
+ text_block = page.get_text('text')
289
313
  page_language = detect_lang(text_block)
290
314
  language_lst.append(page_language)
291
315
 
@@ -299,9 +323,7 @@ def get_language(doc: fitz.Document):
299
323
 
300
324
 
301
325
  def check_invalid_chars(pdf_bytes):
302
- """
303
- 乱码检测
304
- """
326
+ """乱码检测."""
305
327
  return detect_invalid_chars(pdf_bytes)
306
328
 
307
329
 
@@ -311,13 +333,13 @@ def pdf_meta_scan(pdf_bytes: bytes):
311
333
  :param pdf_bytes: pdf文件的二进制数据
312
334
  几个维度来评价:是否加密,是否需要密码,纸张大小,总页数,是否文字可提取
313
335
  """
314
- doc = fitz.open("pdf", pdf_bytes)
336
+ doc = fitz.open('pdf', pdf_bytes)
315
337
  is_needs_password = doc.needs_pass
316
338
  is_encrypted = doc.is_encrypted
317
339
  total_page = len(doc)
318
340
  if total_page == 0:
319
- logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
320
- result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
341
+ logger.warning(f'drop this pdf, drop_reason: {DropReason.EMPTY_PDF}')
342
+ result = {'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}
321
343
  return result
322
344
  else:
323
345
  page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
@@ -328,7 +350,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
328
350
  imgs_per_page = get_imgs_per_page(doc)
329
351
  # logger.info(f"imgs_per_page: {imgs_per_page}")
330
352
 
331
- image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
353
+ image_info_per_page, junk_img_bojids = get_image_info(
354
+ doc, page_width_pts, page_height_pts
355
+ )
332
356
  # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
333
357
  text_len_per_page = get_pdf_textlen_per_page(doc)
334
358
  # logger.info(f"text_len_per_page: {text_len_per_page}")
@@ -341,20 +365,20 @@ def pdf_meta_scan(pdf_bytes: bytes):
341
365
 
342
366
  # 最后输出一条json
343
367
  res = {
344
- "is_needs_password": is_needs_password,
345
- "is_encrypted": is_encrypted,
346
- "total_page": total_page,
347
- "page_width_pts": int(page_width_pts),
348
- "page_height_pts": int(page_height_pts),
349
- "image_info_per_page": image_info_per_page,
350
- "text_len_per_page": text_len_per_page,
351
- "text_layout_per_page": text_layout_per_page,
352
- "text_language": text_language,
368
+ 'is_needs_password': is_needs_password,
369
+ 'is_encrypted': is_encrypted,
370
+ 'total_page': total_page,
371
+ 'page_width_pts': int(page_width_pts),
372
+ 'page_height_pts': int(page_height_pts),
373
+ 'image_info_per_page': image_info_per_page,
374
+ 'text_len_per_page': text_len_per_page,
375
+ 'text_layout_per_page': text_layout_per_page,
376
+ 'text_language': text_language,
353
377
  # "svgs_per_page": svgs_per_page,
354
- "imgs_per_page": imgs_per_page, # 增加每页img数量list
355
- "junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
356
- "invalid_chars": invalid_chars,
357
- "metadata": doc.metadata
378
+ 'imgs_per_page': imgs_per_page, # 增加每页img数量list
379
+ 'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list
380
+ 'invalid_chars': invalid_chars,
381
+ 'metadata': doc.metadata,
358
382
  }
359
383
  # logger.info(json.dumps(res, ensure_ascii=False))
360
384
  return res
@@ -364,14 +388,12 @@ def pdf_meta_scan(pdf_bytes: bytes):
364
388
  @click.option('--s3-pdf-path', help='s3上pdf文件的路径')
365
389
  @click.option('--s3-profile', help='s3上的profile')
366
390
  def main(s3_pdf_path: str, s3_profile: str):
367
- """
368
-
369
- """
391
+ """"""
370
392
  try:
371
393
  file_content = read_file(s3_pdf_path, s3_profile)
372
394
  pdf_meta_scan(file_content)
373
395
  except Exception as e:
374
- print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
396
+ print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
375
397
  logger.exception(e)
376
398
 
377
399
 
@@ -381,7 +403,7 @@ if __name__ == '__main__':
381
403
  # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
382
404
  # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
383
405
  # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
384
- # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")
406
+ # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","") # noqa: E501
385
407
  # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
386
408
  # doc = fitz.open("pdf", file_content)
387
409
  # text_layout_lst = get_pdf_text_layout_per_page(doc)
@@ -5,14 +5,13 @@ from pathlib import Path
5
5
  from loguru import logger
6
6
 
7
7
  import magic_pdf.model as model_config
8
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
9
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader
8
10
  from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
9
11
  from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
10
12
  ElementRelation, ElementRelType,
11
13
  LayoutElements,
12
14
  LayoutElementsExtra, PageInfo)
13
- from magic_pdf.libs.ocr_content_type import BlockType, ContentType
14
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
15
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
16
15
  from magic_pdf.tools.common import do_parse, prepare_env
17
16
 
18
17
 
@@ -224,8 +223,8 @@ def inference(path, output_dir, method):
224
223
  str(Path(path).stem), method)
225
224
 
226
225
  def read_fn(path):
227
- disk_rw = DiskReaderWriter(os.path.dirname(path))
228
- return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
226
+ disk_rw = FileBasedDataReader(os.path.dirname(path))
227
+ return disk_rw.read(os.path.basename(path))
229
228
 
230
229
  def parse_doc(doc_path: str):
231
230
  try:
@@ -5,7 +5,7 @@ import os
5
5
 
6
6
  from loguru import logger
7
7
 
8
- from magic_pdf.libs.Constants import MODEL_NAME
8
+ from magic_pdf.config.constants import MODEL_NAME
9
9
  from magic_pdf.libs.commons import parse_bucket_key
10
10
 
11
11
  # 定义配置文件名常量
@@ -92,14 +92,14 @@ def get_table_recog_config():
92
92
  table_config = config.get('table-config')
93
93
  if table_config is None:
94
94
  logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
95
- return json.loads(f'{{"model": "{MODEL_NAME.TABLE_MASTER}","enable": false, "max_time": 400}}')
95
+ return json.loads(f'{{"model": "{MODEL_NAME.RAPID_TABLE}","enable": false, "max_time": 400}}')
96
96
  else:
97
97
  return table_config
98
98
 
99
99
 
100
100
  def get_layout_config():
101
101
  config = read_config()
102
- layout_config = config.get("layout-config")
102
+ layout_config = config.get('layout-config')
103
103
  if layout_config is None:
104
104
  logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default")
105
105
  return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}')
@@ -109,7 +109,7 @@ def get_layout_config():
109
109
 
110
110
  def get_formula_config():
111
111
  config = read_config()
112
- formula_config = config.get("formula-config")
112
+ formula_config = config.get('formula-config')
113
113
  if formula_config is None:
114
114
  logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default")
115
115
  return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}')
@@ -117,5 +117,5 @@ def get_formula_config():
117
117
  return formula_config
118
118
 
119
119
 
120
- if __name__ == "__main__":
121
- ak, sk, endpoint = get_s3_config("llm-raw")
120
+ if __name__ == '__main__':
121
+ ak, sk, endpoint = get_s3_config('llm-raw')
@@ -1,7 +1,8 @@
1
+ from magic_pdf.config.constants import CROSS_PAGE
2
+ from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
3
+ ContentType)
1
4
  from magic_pdf.data.dataset import PymuDocDataset
2
5
  from magic_pdf.libs.commons import fitz # PyMuPDF
3
- from magic_pdf.libs.Constants import CROSS_PAGE
4
- from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
5
6
  from magic_pdf.model.magic_model import MagicModel
6
7
 
7
8
 
@@ -369,10 +370,16 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
369
370
  if block['type'] in [BlockType.Image, BlockType.Table]:
370
371
  for sub_block in block['blocks']:
371
372
  if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
372
- for line in sub_block['virtual_lines']:
373
- bbox = line['bbox']
374
- index = line['index']
375
- page_line_list.append({'index': index, 'bbox': bbox})
373
+ if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
374
+ for line in sub_block['virtual_lines']:
375
+ bbox = line['bbox']
376
+ index = line['index']
377
+ page_line_list.append({'index': index, 'bbox': bbox})
378
+ else:
379
+ for line in sub_block['lines']:
380
+ bbox = line['bbox']
381
+ index = line['index']
382
+ page_line_list.append({'index': index, 'bbox': bbox})
376
383
  elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
377
384
  for line in sub_block['lines']:
378
385
  bbox = line['bbox']
@@ -1,23 +1,24 @@
1
-
2
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
3
- from magic_pdf.libs.commons import fitz
4
- from magic_pdf.libs.commons import join_path
1
+ from io import BytesIO
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+ from magic_pdf.data.data_reader_writer import DataWriter
6
+ from magic_pdf.libs.commons import fitz, join_path
5
7
  from magic_pdf.libs.hash_utils import compute_sha256
6
8
 
7
9
 
8
- def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
9
- """
10
- 从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
11
- save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
12
- """
10
+ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: DataWriter):
11
+ """从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
12
+ 图片存放在save_path下,文件名是:
13
+ {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
13
14
  # 拼接文件名
14
- filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
15
+ filename = f'{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}'
15
16
 
16
17
  # 老版本返回不带bucket的路径
17
18
  img_path = join_path(return_path, filename) if return_path is not None else None
18
19
 
19
20
  # 新版本生成平铺路径
20
- img_hash256_path = f"{compute_sha256(img_path)}.jpg"
21
+ img_hash256_path = f'{compute_sha256(img_path)}.jpg'
21
22
 
22
23
  # 将坐标转换为fitz.Rect对象
23
24
  rect = fitz.Rect(*bbox)
@@ -28,6 +29,29 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
28
29
 
29
30
  byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
30
31
 
31
- imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
32
+ imageWriter.write(img_hash256_path, byte_data)
32
33
 
33
34
  return img_hash256_path
35
+
36
+
37
+ def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"):
38
+
39
+ # 将坐标转换为fitz.Rect对象
40
+ rect = fitz.Rect(*bbox)
41
+ # 配置缩放倍数为3倍
42
+ zoom = fitz.Matrix(3, 3)
43
+ # 截取图片
44
+ pix = page.get_pixmap(clip=rect, matrix=zoom)
45
+
46
+ # 将字节数据转换为文件对象
47
+ image_file = BytesIO(pix.tobytes(output='png'))
48
+ # 使用 Pillow 打开图像
49
+ pil_image = Image.open(image_file)
50
+ if mode == "cv2":
51
+ image_result = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
52
+ elif mode == "pillow":
53
+ image_result = pil_image
54
+ else:
55
+ raise ValueError(f"mode: {mode} is not supported.")
56
+
57
+ return image_result
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.9.2"
1
+ __version__ = "0.10.0"
@@ -163,7 +163,9 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
163
163
  page_width = img_dict["width"]
164
164
  page_height = img_dict["height"]
165
165
  if start_page_id <= index <= end_page_id:
166
+ page_start = time.time()
166
167
  result = custom_model(img)
168
+ logger.info(f'-----page_id : {index}, page total time: {round(time.time() - page_start, 2)}-----')
167
169
  else:
168
170
  result = []
169
171
  page_info = {"page_no": index, "height": page_height, "width": page_width}