magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,65 @@
1
def convert_to_train_format(jso: dict) -> list:
    """Convert a parsed-PDF result dict into a list of per-page training records.

    Only entries whose key starts with ``"page_"`` are converted; every other
    key in ``jso`` is ignored.

    Each returned record has the shape::

        {
            "page_info": {"page_no": ..., "height": ..., "width": ...},
            "bboxes": [{"category_id": int, "bbox": ...}, ...],
            "layout_tree": <the page's "layout_bboxes" value>,
        }

    Category ids emitted here: 0 for a paragraph flagged as title, 1 image,
    2 any other paragraph, 3 header, 4 page number, 5 footnote, 6 footer,
    7 table, 10 interline equation, 13 inline equation. Image and table
    entries additionally carry ``"caption_bbox"`` when the source item has a
    ``"caption"`` key.

    Args:
        jso: Mapping of keys to page dicts. Every ``page_*`` value must contain
            all the keys read below; a missing key raises ``KeyError``.

    Returns:
        One record per ``page_*`` entry, in the iteration order of ``jso``.
    """
    # (source key, category id) for items that are dicts with an optional caption.
    captioned_sources = (
        ("image_bboxes_with_caption", 1),
        ("table_bboxes_with_caption", 7),
    )
    # (source key, category id) for items that are already bare bboxes.
    plain_sources = (
        ("bak_page_no_bboxes", 4),
        ("bak_header_bboxes", 3),
        ("bak_footer_bboxes", 6),
    )
    # (source key, category id) for items that are dicts exposing a "bbox" key.
    equation_sources = (
        ("inline_equations", 13),
        ("interline_equations", 10),
    )

    pages = []
    for key, page in jso.items():
        if not key.startswith("page_"):
            continue

        page_idx = page["page_idx"]
        width, height = page["page_size"]
        info = {"page_info": {"page_no": page_idx, "height": height, "width": width}}

        bboxes = []
        for source_key, category_id in captioned_sources:
            for item in page[source_key]:
                entry = {"category_id": category_id, "bbox": item["bbox"]}
                if "caption" in item:
                    entry["caption_bbox"] = item["caption"]
                bboxes.append(entry)

        for source_key, category_id in plain_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": bbox} for bbox in page[source_key]
            )

        for block in page["para_blocks"]:
            if "paras" not in block:
                continue
            for para in block["paras"].values():
                para_bbox = para["para_bbox"]
                category_id = 0 if para["is_para_title"] else 2
                bboxes.append({"category_id": category_id, "bbox": para_bbox})

        for source_key, category_id in equation_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": item["bbox"]}
                for item in page[source_key]
            )

        # Footnotes (the original author noted that no real example had been
        # seen yet). Each bbox is copied into a fresh list.
        bboxes.extend(
            {"category_id": 5, "bbox": list(footnote_bbox)}
            for footnote_bbox in page["bak_footer_note_bboxes"]
        )

        info["bboxes"] = bboxes
        info["layout_tree"] = page["layout_bboxes"]
        pages.append(info)

    return pages
@@ -0,0 +1,59 @@
1
+ from magic_pdf.libs.boxbase import _is_in
2
+
3
+
4
def extract_caption_bbox(outer: list, inner: list) -> list:
    """Pair each inner box with an enclosing outer box and derive a caption box.

    For every box in ``inner`` the first still-unused box of ``outer`` that
    satisfies ``_is_in(inner_box, outer_box)`` and is not (within 0.01 on all
    four coordinates) the same box is selected. The four strips of the outer
    box lying left of, above, below and right of the inner box are built, and
    the one with the largest area is stored as the caption.

    NOTE(review): ``_is_in`` comes from ``magic_pdf.libs.boxbase``; presumably
    it tests whether the first box lies inside the second -- confirm there.

    Args:
        outer: Candidate enclosing boxes, each ``[x0, y0, x1, y1]``.
        inner: Boxes to look up, each exactly four coordinates.

    Returns:
        One dict per inner box, in input order::

            {
                "bbox": [1, 2, 3, 4],
                "caption": [5, 6, 7, 8],  # only when an outer box matched
            }

    Side effects:
        Prints both inputs and the number of matches (debug output).
    """
    print(outer, inner)

    def nearly_equal(a, b):
        # Non-strict float comparison.
        return abs(a - b) < 0.01

    def area(rect):
        return abs(rect[0] - rect[2]) * abs(rect[1] - rect[3])

    available = dict(enumerate(outer))
    matches = 0  # for debug
    results = []
    for inner_box in inner:
        ix0, iy0, ix1, iy1 = inner_box
        entry = {"bbox": inner_box[:4]}

        matched_key = None
        for key, outer_box in available.items():
            ox0, oy0, ox1, oy1 = outer_box
            coords_equal = [
                nearly_equal(ix0, ox0),
                nearly_equal(iy0, oy0),
                nearly_equal(ix1, ox1),
                nearly_equal(iy1, oy1),
            ]
            if _is_in(inner_box, outer_box) and not all(coords_equal):
                matched_key = key
                break

        if matched_key is not None:
            matches += 1
            ox0, oy0, ox1, oy1 = available[matched_key]
            strips = [
                [ox0, oy0, ix0, oy1],  # left of the inner box
                [ox0, oy0, ox1, iy0],  # above the inner box
                [ox0, iy1, ox1, oy1],  # below the inner box
                [ix1, oy0, ox1, oy1],  # right of the inner box
            ]
            # The strip with the largest area is the caption. A stable
            # ascending sort followed by taking the last element keeps the
            # tie-break (last of the equal-largest strips wins).
            strips.sort(key=area)
            entry["caption"] = strips[-1]
            # One outer box may determine the caption of only one inner box.
            del available[matched_key]

        results.append(entry)

    print("found_count: ", matches)
    return results
@@ -0,0 +1,159 @@
1
+ import re
2
+
3
+ from magic_pdf.libs.boxbase import _is_in_or_part_overlap
4
+ from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
5
+
6
+
7
+ """
8
+ copy from pre_proc/remove_footer_header.py
9
+ """
10
+
11
+
12
def remove_headder_footer_one_page(
    text_raw_blocks,
    image_bboxes,
    table_bboxes,
    header_bboxs,
    footer_bboxs,
    page_no_bboxs,
    page_w,
    page_h,
):
    """Pass one page through without removing headers, footers or page numbers.

    This training-side copy of the routine in
    ``pre_proc/remove_footer_header.py`` began with an unconditional
    ``if 1: return ...``, so the line-level removal logic that followed it
    could never run. That unreachable code has been dropped here and the
    pass-through made explicit; behaviour is unchanged.

    Args:
        text_raw_blocks: Text blocks of the page; returned as-is.
        image_bboxes: Image bounding boxes of the page; returned as-is.
        table_bboxes: Table bounding boxes of the page; returned as-is.
        header_bboxs: Unused. Kept for signature compatibility.
        footer_bboxs: Unused. Kept for signature compatibility.
        page_no_bboxs: Unused. Kept for signature compatibility.
        page_w: Unused. Kept for signature compatibility.
        page_h: Unused. Kept for signature compatibility.

    Returns:
        A 6-tuple ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
        The first three are the very objects passed in (not copies); the last
        three are always fresh empty lists.
    """
    return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
@@ -0,0 +1,327 @@
1
+ from magic_pdf.libs.commons import fitz
2
+ import os
3
+ from magic_pdf.libs.coordinate_transform import get_scale_ratio
4
+
5
+
6
def draw_model_output(
    raw_pdf_doc: fitz.Document, paras_dict_arr: list[dict], save_path: str
):
    """Draw every model-detected block onto its page and save the PDF.

    ``paras_dict_arr[i]`` describes page ``i`` of ``raw_pdf_doc`` and looks
    like::

        {"layout_dets": [], "subfield_dets": [],
         "page_info": {"page_no": 22, "height": 1650, "width": 1275}}

    Category ids used by the model output:

        0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
        6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
        11 full column, 12 sub column, 13 embedding (inline formula),
        14 isolated (single-line formula)

    Blocks whose category is 3, 4, 5, 6 or 0 are outlined in red; every other
    block is outlined in green.

    Args:
        raw_pdf_doc: Document that is drawn on in place and then saved.
        paras_dict_arr: Per-page model output, indexed by page position.
        save_path: Destination file. Its parent directory is created if
            needed; a bare file name (no directory part) is accepted.
    """
    non_body_categories = (3, 4, 5, 6, 0)
    color_map = {
        "body": fitz.pdfcolor["green"],
        "non_body": fitz.pdfcolor["red"],
    }

    for i, page in enumerate(raw_pdf_doc):
        page_model = paras_dict_arr[i]
        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
            page_model, page
        )

        for block in page_model["layout_dets"]:
            poly = block["poly"]
            # NOTE(review): presumably ``poly`` is a flat 4-point polygon
            # (x0, y0, x1, y1, x2, y2, x3, y3); only indices 0, 1, 2 and 5
            # are read -- confirm against the model output format.
            left = poly[0] / horizontal_scale_ratio
            top = poly[1] / vertical_scale_ratio
            right = poly[2] / horizontal_scale_ratio
            bottom = poly[5] / vertical_scale_ratio
            # Normalise so the rectangle is well-formed whatever the point order.
            left, right = min(left, right), max(left, right)
            top, bottom = min(top, bottom), max(top, bottom)

            if block["category_id"] in non_body_categories:
                color = color_map["non_body"]
            else:
                color = color_map["body"]

            rect = fitz.Rect([left, top, right, bottom])
            page.draw_rect(rect, fill=None, width=0.5, overlay=True, color=color)

    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there is one to create.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    raw_pdf_doc.save(save_path)
72
+
73
+
74
def debug_show_bbox(
    raw_pdf_doc: fitz.Document,
    page_idx: int,
    bboxes: list,
    droped_bboxes: list,
    expect_drop_bboxes: list,
    save_path: str,
    expected_page_id: int,
):
    """Write a one-page debug PDF showing three groups of boxes (overwrites).

    Nothing happens unless ``page_idx == expected_page_id``. Otherwise any
    existing file at ``save_path`` is deleted and a new single-page PDF with
    the same size as page ``page_idx`` of ``raw_pdf_doc`` is written:

    * ``bboxes``: red outline, blue fill at 0.2 opacity.
    * ``droped_bboxes``: no outline, yellow fill at 0.2 opacity.
    * ``expect_drop_bboxes``: red outline, no fill.

    Only the first four items of each box are used.

    Args:
        raw_pdf_doc: Source document; only the page size is read from it.
        page_idx: Index of the page being processed.
        bboxes: Original boxes.
        droped_bboxes: Boxes that were dropped.
        expect_drop_bboxes: Boxes that were expected to be dropped.
        save_path: Output file. Its parent directory is created if needed; a
            bare file name (no directory part) is accepted.
        expected_page_id: The only page index for which output is produced.
    """
    if page_idx != expected_page_id:
        return

    if os.path.exists(save_path):
        # Remove the stale debug file so the new one replaces it.
        os.remove(save_path)

    red = fitz.pdfcolor["red"]
    styled_groups = (
        (bboxes, {"color": red, "fill": fitz.pdfcolor["blue"], "fill_opacity": 0.2}),
        (
            droped_bboxes,
            {"color": None, "fill": fitz.pdfcolor["yellow"], "fill_opacity": 0.2},
        ),
        (expect_drop_bboxes, {"color": red, "fill": None}),
    )

    # Start from a new, empty PDF document.
    doc = fitz.open("")
    try:
        source_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=source_rect.width, height=source_rect.height)

        for group, style in styled_groups:
            for bbox in group:
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                # NOTE(review): kept from the original; nothing is pending at
                # this point, so this call looks redundant -- confirm before
                # removing.
                shape.finish()
                shape.commit()

        # os.path.dirname() is "" for a bare file name and os.makedirs("")
        # raises, so only create the directory when there is one to create.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
140
+
141
+
142
def debug_show_page(
    page,
    bboxes1: list,
    bboxes2: list,
    bboxes3: list,
):
    """Write ``./tmp/debug.pdf`` showing three groups of boxes (overwrites).

    Any existing ``./tmp/debug.pdf`` is deleted and a new single-page PDF with
    the same size as ``page`` is written:

    * ``bboxes1``: red outline, blue fill at 0.2 opacity.
    * ``bboxes2``: no outline, yellow fill at 0.2 opacity.
    * ``bboxes3``: red outline, no fill.

    Only the first four items of each box are used.

    Args:
        page: Object exposing ``rect.width`` and ``rect.height``; only the
            size is read.
        bboxes1: First group of boxes.
        bboxes2: Second group of boxes.
        bboxes3: Third group of boxes.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        # Remove the stale debug file so the new one replaces it.
        os.remove(save_path)

    red = fitz.pdfcolor["red"]
    styled_groups = (
        (bboxes1, {"color": red, "fill": fitz.pdfcolor["blue"], "fill_opacity": 0.2}),
        (
            bboxes2,
            {"color": None, "fill": fitz.pdfcolor["yellow"], "fill_opacity": 0.2},
        ),
        (bboxes3, {"color": red, "fill": None}),
    )

    # Start from a new, empty PDF document.
    doc = fitz.open("")
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        for group, style in styled_groups:
            for bbox in group:
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                # NOTE(review): kept from the original; nothing is pending at
                # this point, so this call looks redundant -- confirm before
                # removing.
                shape.finish()
                shape.commit()

        # exist_ok avoids failing when the directory appears between a
        # separate existence check and the creation call.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
195
+
196
+
197
def draw_layout_bbox_on_page(
    raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str
):
    """Draw each page's layout boxes, plus header/footer bands, into ``pdf_path``.

    If ``pdf_path`` exists it is opened and saved incrementally; otherwise a
    new empty document is created and saved to ``pdf_path``.

    NOTE(review): in the "new document" case the document has no pages when
    ``doc[page_idx]`` is evaluated, and ``raw_pdf_doc`` is never used --
    confirm whether that branch is expected to work.

    Args:
        raw_pdf_doc: Not used by this function.
        paras_dict: Mapping whose values each provide ``"page_idx"`` and
            ``"layout_bboxes"`` (a list of dicts with ``"layout_bbox"`` and
            ``"layout_label"``).
        header: Rectangle drawn as a translucent black band on every page when
            truthy.
        footer: Same as ``header``, for the footer.
        pdf_path: PDF file to draw into.
    """
    # Decide between reusing the PDF on disk and starting from an empty one.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open("") if is_new_pdf else fitz.open(pdf_path)

    for page_entry in paras_dict.values():
        page_idx = page_entry["page_idx"]
        layouts = page_entry["layout_bboxes"]
        shape = doc[page_idx].new_shape()

        for order, layout in enumerate(layouts):
            border_offset = 1
            raw_box = layout["layout_bbox"]
            # Layouts labelled "U" are highlighted with a pink fill.
            if layout["layout_label"] == "U":
                fill_color = fitz.pdfcolor["pink"]
            else:
                fill_color = None
            # Pull the box in by 1 horizontally and push it out by
            # border_offset vertically.
            left = raw_box[0] + 1
            top = raw_box[1] - border_offset
            right = raw_box[2] - 1
            bottom = raw_box[3] + border_offset
            shape.draw_rect(fitz.Rect(left, top, right, bottom))
            shape.finish(color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.4)

            # Write the layout's ordinal inside the box.
            font_size = 10
            shape.insert_text(
                (left + 1, top + font_size),
                f"{order}",
                fontsize=font_size,
                color=(0, 0, 0),
            )

        # Draw the footer and header bands.
        for band in (header, footer):
            if band:
                shape.draw_rect(fitz.Rect(band))
                shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2)

        shape.commit()

    if is_new_pdf:
        doc.save(pdf_path)
    else:
        doc.saveIncr()
    doc.close()
258
+
259
+
260
def draw_layout_on_page(
    raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str
):
    """Deprecated: outline the layout boxes of one page of ``pdf_path`` in red.

    The function used to be decorated with ``@DeprecationWarning``, which
    rebinds the name to an exception instance and makes it impossible to call.
    It now stays callable and emits a ``DeprecationWarning`` on each call.

    Only leaf layouts (empty ``"sub_layout"``) are drawn; top-level leaves
    labelled ``"U"`` are filled yellow, nested ones pink.

    NOTE(review): when ``pdf_path`` does not exist a new empty document is
    opened, which has no pages when ``doc[page_idx]`` is evaluated, and
    ``raw_pdf_doc`` is never used -- confirm whether that branch is expected
    to work.

    Args:
        raw_pdf_doc: Not used by this function.
        page_idx: Index of the page to draw on.
        page_layout: List of layout dicts, each with ``"layout_bbox"``,
            ``"layout_label"`` and ``"sub_layout"`` (a list of the same shape).
        pdf_path: PDF file to draw into. Its parent directory is created if
            needed; a bare file name (no directory part) is accepted.
    """
    import warnings

    warnings.warn(
        "draw_layout_on_page is deprecated",
        DeprecationWarning,
        stacklevel=2,
    )

    def draw(shape, layout, fill_color=fitz.pdfcolor["pink"]):
        """Draw ``layout`` if it is a leaf, then recurse into its sub-layouts."""
        border_offset = 1
        rect_box = layout["layout_bbox"]
        layout_label = layout["layout_label"]
        sub_layouts = layout["sub_layout"]
        if len(sub_layouts) == 0:
            # Only layouts labelled "U" get a fill colour.
            fill = fill_color if layout_label == "U" else None
            # Pull the box in by 1 horizontally and push it out by
            # border_offset vertically.
            rect = fitz.Rect(
                rect_box[0] + 1,
                rect_box[1] - border_offset,
                rect_box[2] - 1,
                rect_box[3] + border_offset,
            )
            shape.draw_rect(rect)
            shape.finish(color=fitz.pdfcolor["red"], fill=fill, fill_opacity=0.2)

        for child in sub_layouts:
            draw(shape, child)
        shape.commit()

    # Reuse the PDF on disk when it exists; otherwise start from an empty one.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open("") if is_new_pdf else fitz.open(pdf_path)
    try:
        shape = doc[page_idx].new_shape()
        for layout in page_layout:
            draw(shape, layout, fitz.pdfcolor["yellow"])

        # os.path.dirname() is "" for a bare file name and os.makedirs("")
        # raises, so only create the directory when there is one to create.
        parent_dir = os.path.dirname(pdf_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()