magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. magic_pdf/filter/pdf_meta_scan.py +3 -17
  2. magic_pdf/libs/commons.py +0 -161
  3. magic_pdf/libs/draw_bbox.py +2 -3
  4. magic_pdf/libs/markdown_utils.py +0 -21
  5. magic_pdf/libs/pdf_image_tools.py +2 -1
  6. magic_pdf/libs/version.py +1 -1
  7. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  8. magic_pdf/model/magic_model.py +0 -30
  9. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  11. magic_pdf/para/para_split_v3.py +7 -2
  12. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  13. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  14. magic_pdf/pre_proc/cut_image.py +0 -37
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  16. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  17. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  18. magic_pdf/rw/S3ReaderWriter.py +1 -1
  19. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  20. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
  21. magic_pdf/dict2md/mkcontent.py +0 -438
  22. magic_pdf/layout/__init__.py +0 -0
  23. magic_pdf/layout/bbox_sort.py +0 -681
  24. magic_pdf/layout/layout_det_utils.py +0 -182
  25. magic_pdf/layout/layout_sort.py +0 -921
  26. magic_pdf/layout/layout_spiler_recog.py +0 -101
  27. magic_pdf/layout/mcol_sort.py +0 -336
  28. magic_pdf/libs/calc_span_stats.py +0 -239
  29. magic_pdf/libs/detect_language_from_model.py +0 -21
  30. magic_pdf/libs/nlp_utils.py +0 -203
  31. magic_pdf/libs/textbase.py +0 -33
  32. magic_pdf/libs/vis_utils.py +0 -308
  33. magic_pdf/para/block_continuation_processor.py +0 -562
  34. magic_pdf/para/block_termination_processor.py +0 -480
  35. magic_pdf/para/commons.py +0 -222
  36. magic_pdf/para/denoise.py +0 -246
  37. magic_pdf/para/draw.py +0 -121
  38. magic_pdf/para/exceptions.py +0 -198
  39. magic_pdf/para/layout_match_processor.py +0 -40
  40. magic_pdf/para/para_split.py +0 -807
  41. magic_pdf/para/para_split_v2.py +0 -959
  42. magic_pdf/para/raw_processor.py +0 -207
  43. magic_pdf/para/stats.py +0 -268
  44. magic_pdf/para/title_processor.py +0 -1014
  45. magic_pdf/pdf_parse_union_core.py +0 -345
  46. magic_pdf/post_proc/__init__.py +0 -0
  47. magic_pdf/post_proc/detect_para.py +0 -3472
  48. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  49. magic_pdf/post_proc/remove_footnote.py +0 -153
  50. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  51. magic_pdf/pre_proc/detect_equation.py +0 -134
  52. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  53. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  54. magic_pdf/pre_proc/detect_footnote.py +0 -170
  55. magic_pdf/pre_proc/detect_header.py +0 -64
  56. magic_pdf/pre_proc/detect_images.py +0 -647
  57. magic_pdf/pre_proc/detect_page_number.py +0 -64
  58. magic_pdf/pre_proc/detect_tables.py +0 -62
  59. magic_pdf/pre_proc/equations_replace.py +0 -550
  60. magic_pdf/pre_proc/fix_image.py +0 -244
  61. magic_pdf/pre_proc/fix_table.py +0 -270
  62. magic_pdf/pre_proc/main_text_font.py +0 -23
  63. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  64. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  65. magic_pdf/pre_proc/post_layout_split.py +0 -0
  66. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  67. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  68. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  69. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  70. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  71. magic_pdf/pre_proc/statistics.py +0 -12
  72. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  73. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
  74. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  75. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,203 +0,0 @@
1
- import re
2
- from os import path
3
-
4
- from collections import Counter
5
-
6
- from loguru import logger
7
-
8
- # from langdetect import detect
9
- import spacy
10
- import en_core_web_sm
11
- import zh_core_web_sm
12
-
13
- from magic_pdf.libs.language import detect_lang
14
-
15
-
16
class NLPModels:
    r"""Holder for the spaCy pipelines used for simple language and entity detection.

    The small English and small Chinese pipelines are both loaded in ``__init__``.

    How to upload local models to s3:
        - config aws cli:
            doc\SETUP-CLI.md
            doc\setup_cli.sh
            app\config\__init__.py
        - $ cd {local_dir_storing_models}
        - $ ls models
            en_core_web_sm-3.7.1/
            zh_core_web_sm-3.7.0/
        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
        - $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
            PRE en_core_web_sm-3.7.1/
            PRE zh_core_web_sm-3.7.0/
    """

    def __init__(self):
        """Record the default model locations, the supported model table, and load both pipelines."""
        # if OS is windows, set "TMP_DIR" to "D:/tmp"

        home_dir = path.expanduser("~")
        self.default_local_path = path.join(home_dir, ".nlp_models")
        self.default_shared_path = "/share/pdf_processor/nlp_models"
        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
        self.default_s3_path = "s3://llm-infra/models"
        # Supported models, keyed by package name; consulted by load_model().
        self.nlp_models = {
            "en_core_web_sm": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_md": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_lg": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "zh_core_web_sm": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_md": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_lg": {
                "type": "spacy",
                "version": "3.7.0",
            },
        }
        self.en_core_web_sm_model = en_core_web_sm.load()
        self.zh_core_web_sm_model = zh_core_web_sm.load()

    def load_model(self, model_name, model_type, model_version):
        """Load a supported spaCy model by name.

        Returns the loaded pipeline, or ``None`` when the name/type/version
        triple is not in ``self.nlp_models`` (an error is logged) or when the
        model is not installed as a package.
        """
        spec = self.nlp_models.get(model_name)
        if spec is None or spec["type"] != model_type or spec["version"] != model_version:
            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
            return None

        return spacy.load(model_name) if spacy.util.is_package(model_name) else None

    def detect_language(self, text, use_langdetect=False):
        """Classify ``text`` as ``"en"`` or ``"zh"``.

        With ``use_langdetect=True`` the decision is delegated to
        ``detect_lang``; anything it does not report as ``"zh"`` is treated as
        ``"en"``. Otherwise ASCII letters are counted against characters in the
        U+4E00..U+9FFF range and the larger count wins.

        Returns ``None`` for empty/``None`` input and, in counting mode, when
        the two counts are equal (including when both are zero).
        """
        if not text:
            return None

        if use_langdetect:
            return "zh" if detect_lang(text) == "zh" else "en"

        en_count = len(re.findall(r"[a-zA-Z]", text))
        cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))

        if en_count > cn_count:
            return "en"
        if cn_count > en_count:
            return "zh"
        return None

    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
        """
        Detect entity categories using NLP models and return the most frequent entity type.

        Parameters
        ----------
        text : str
            Text to be processed.
        threshold : float, optional
            Minimum share of the entity words that the most common entity
            label must hold for it to be returned.

        Returns
        -------
        str or None or dict
            The most frequent entity label; ``None`` when no label qualifies
            or when entity detection raises; an empty dict when the language
            of ``text`` could not be determined.
        """
        lang = self.detect_language(text, use_langdetect=True)

        if lang == "en":
            nlp_model = self.en_core_web_sm_model
        elif lang == "zh":
            nlp_model = self.zh_core_web_sm_model
        else:
            # NOTE(review): this path returns {} while every other "no result"
            # path returns None; kept as-is so existing callers are unaffected.
            return {}

        # Splitting text into smaller parts
        text_parts = re.split(r"[,;,;、\s & |]+", text)

        # re.match anchors at the start only, so a part is dropped when it
        # *begins* with a digit or a non-word character.
        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]
        text_combined = " ".join(text_parts)

        try:
            doc = nlp_model(text_combined)

            # Whitespace-separated words covered by each entity label.
            word_counts_in_entities = Counter()
            for ent in doc.ents:
                word_counts_in_entities[ent.label_] += len(ent.text.split())

            total_words_in_entities = sum(word_counts_in_entities.values())
            total_words = sum(1 for token in doc if not token.is_punct)

            if total_words_in_entities == 0 or total_words == 0:
                return None

            # Entities must cover at least half of the non-punctuation tokens.
            entity_percentage = total_words_in_entities / total_words
            if entity_percentage < 0.5:
                return None

            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
            entity_percentage = word_count / total_words_in_entities

            return most_common_entity if entity_percentage >= threshold else None
        except Exception as e:
            # Best effort: log the failure and report "no entity".
            logger.error(f"Error in entity detection: {e}")
            return None
162
-
163
-
164
def __main__():
    """Run the entity-category detector over a fixed set of sample strings and print each result."""
    samples = (
        "张三",
        "张三, 李四,王五; 赵六",
        "John Doe",
        "Jane Smith",
        "Lee, John",
        "John Doe, Jane Smith; Alice Johnson,Bob Lee",
        "孙七, Michael Jordan;赵八",
        "David Smith Michael O'Connor; Kevin ßáçøñ",
        "李雷·韩梅梅, 张三·李四",
        "Charles Robert Darwin, Isaac Newton",
        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
        "John Doe, Jane Smith; Alice Johnson",
        "张三, 李四,王五; 赵六",
        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
        "Rachel Mills & William Barry & Susanne B. Haga",
        "Claire Chabut* and Jean-François Bussières",
        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
        "Changchun",
        "china",
        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
        "Synergistic Effect of Supported Nickel Catalyst with",
        "Intumescent Flame-Retardants on Flame Retardancy",
        "and Thermal Stability of Polypropylene",
    )

    detector = NLPModels()

    for sample in samples:
        # Blank separator line, then the input, then what was detected for it.
        print()
        print(f"Original String: {sample}")
        print(f"Detected entities: {detector.detect_entity_catgr_using_nlp(sample)}")


if __name__ == "__main__":
    __main__()
@@ -1,33 +0,0 @@
1
- import math
2
-
3
-
4
def __inc_dict_val(mp, key, val_inc: int):
    """Add ``val_inc`` to ``mp[key]``; a missing or falsy entry counts as zero."""
    mp[key] = (mp.get(key) or 0) + val_inc


def get_text_block_base_info(block):
    """
    Return the dominant ``(color, size, font)`` of a text block.

    Every span contributes the length of its text to the tally of its
    ``(color, size rounded to 2 decimals, font)`` combination; the combination
    with the largest total is returned.

    Raises:
        ValueError: if the block contains no spans at all.
    """
    counter = {}

    for line in block['lines']:
        for span in line['spans']:
            style = (span['color'], round(span['size'], 2), span['font'])
            __inc_dict_val(counter, style, len(span['text']))

    if not counter:
        # max() on an empty mapping would raise the same exception type, but
        # with a message that says nothing about the actual cause.
        raise ValueError("text block has no spans; cannot determine its base font info")

    color, size, font = max(counter, key=counter.get)
    return color, size, font
33
-
@@ -1,308 +0,0 @@
1
- from magic_pdf.libs.commons import fitz
2
- import os
3
-
4
-
5
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
    """
    Draw the bounding boxes recorded for each page and save the result to ``save_path``.

    For every value of ``paras_dict`` a new page, sized like page ``page_idx``
    of ``raw_pdf_doc``, is appended to the output document, and the boxes of
    its ``preproc_blocks``, ``images``, ``image_backup``, ``droped_text_block``
    and ``tables`` entries are drawn on it. When ``save_path`` already exists
    it is opened and saved incrementally; otherwise a new document is created
    and written to ``save_path``.

    Args:
        raw_pdf_doc: source document; only its page sizes are read.
        paras_dict: mapping whose values hold ``page_idx`` plus the box lists
            named above; each list item carries a ``bbox`` entry.
        save_path: path of the PDF to create or extend.
    """
    def draw_rect(page, bbox, **style):
        # One shape per rectangle, committed straight away.
        shape = page.new_shape()
        shape.draw_rect(fitz.Rect(bbox))
        shape.finish(**style)
        shape.commit()

    yellow = fitz.pdfcolor['yellow']
    # (key in the page record, finish() styling), in drawing order.
    layers = (
        ('preproc_blocks', dict(color=None, fill=fitz.pdfcolor['blue'], fill_opacity=0.2)),
        ('images', dict(color=None, fill=yellow)),
        ('image_backup', dict(color=yellow, fill=None)),
        ('droped_text_block', dict(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)),
        # TODO table
        ('tables', dict(color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)),
    )

    # Extend an existing file, otherwise start from a new blank document.
    is_new_pdf = not os.path.exists(save_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(save_path)

    try:
        for page_info in paras_dict.values():
            source_page = raw_pdf_doc[page_info['page_idx']]
            new_page = doc.new_page(width=source_page.rect.width, height=source_page.rect.height)

            for key, style in layers:
                for item in page_info[key]:
                    draw_rect(new_page, item['bbox'], **style)

        # A bare file name has no directory part; os.makedirs('') would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(save_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
86
-
87
-
88
def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list, expect_drop_bboxes:list, save_path: str, expected_page_id:int):
    """
    Write a throw-away single-page PDF showing three groups of boxes, for debugging.

    Nothing happens unless ``page_idx`` equals ``expected_page_id``. Any file
    already at ``save_path`` is deleted and replaced.

    Args:
        raw_pdf_doc: source document; only the size of page ``page_idx`` is read.
        page_idx: index of the page being inspected.
        bboxes: boxes drawn with a red outline and a translucent blue fill.
        droped_bboxes: boxes drawn with a translucent yellow fill and no outline.
        expect_drop_bboxes: boxes drawn with a red outline only.
        save_path: path of the PDF to write.
        expected_page_id: the only page index for which output is produced.

    Only the first four items of each box are used as rectangle coordinates.
    """
    if page_idx != expected_page_id:
        return

    # Overwrite mode: drop whatever an earlier run left behind.
    if os.path.exists(save_path):
        os.remove(save_path)

    red = fitz.pdfcolor['red']
    groups = (
        (bboxes, dict(color=red, fill=fitz.pdfcolor['blue'], fill_opacity=0.2)),
        (droped_bboxes, dict(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)),
        (expect_drop_bboxes, dict(color=red, fill=None)),
    )

    doc = fitz.open('')
    try:
        source_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=source_rect.width, height=source_rect.height)

        for boxes, style in groups:
            for bbox in boxes:
                # One shape per rectangle, committed straight away.
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                shape.commit()

        # A bare file name has no directory part; os.makedirs('') would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
144
-
145
-
146
def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
    """
    Write ``./tmp/debug.pdf`` with three groups of boxes drawn on a blank page sized like ``page``.

    Any existing ``./tmp/debug.pdf`` is deleted and replaced.

    Args:
        page: page whose ``rect`` supplies the width and height of the output page.
        bboxes1: boxes drawn with a red outline and a translucent blue fill.
        bboxes2: boxes drawn with a translucent yellow fill and no outline.
        bboxes3: boxes drawn with a red outline only.

    Only the first four items of each box are used as rectangle coordinates.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        os.remove(save_path)

    red = fitz.pdfcolor['red']
    groups = (
        (bboxes1, dict(color=red, fill=fitz.pdfcolor['blue'], fill_opacity=0.2)),
        (bboxes2, dict(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)),
        (bboxes3, dict(color=red, fill=None)),
    )

    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        for boxes, style in groups:
            for bbox in boxes:
                # One shape per rectangle, committed straight away.
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                shape.commit()

        # exist_ok avoids failing when the directory appears between a check and the creation.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
192
-
193
-
194
-
195
-
196
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
    """
    Draw each page's layout boxes, plus the header and footer areas, into the PDF at ``pdf_path``.

    When ``pdf_path`` exists it is opened and saved incrementally; otherwise a
    blank document is created and saved to ``pdf_path``.

    Args:
        raw_pdf_doc: not used by this function.
        paras_dict: mapping whose values hold ``page_idx`` and ``layout_bboxes``;
            each layout carries ``layout_bbox`` and ``layout_label``.
        header: rectangle of the header area; skipped when falsy.
        footer: rectangle of the footer area; skipped when falsy.
        pdf_path: path of the PDF to annotate.

    NOTE(review): pages are looked up with ``doc[page_idx]``, so the target
    document presumably must already contain them — confirm against callers.
    """
    # Annotate an existing file, otherwise start from a new blank document.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    border_offset = 1
    font_size = 10
    red = fitz.pdfcolor['red']
    black = fitz.pdfcolor['black']

    try:
        for page_info in paras_dict.values():
            layouts = page_info['layout_bboxes']
            shape = doc[page_info['page_idx']].new_shape()

            for order, layout in enumerate(layouts):
                x0, y0, x1, y1 = layout['layout_bbox'][:4]
                # Only layouts labelled 'U' get a fill.
                fill_color = fitz.pdfcolor['pink'] if layout['layout_label'] == 'U' else None
                # Inset horizontally by 1 and extend vertically by border_offset.
                rect_box = [x0 + 1, y0 - border_offset, x1 - 1, y1 + border_offset]
                shape.draw_rect(fitz.Rect(*rect_box))
                shape.finish(color=red, fill=fill_color, fill_opacity=0.4)

                # draw order text on layout box
                shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))

            # Draw the footer and header areas.
            # NOTE(review): placed once per page, after the layout loop; the
            # original nesting could not be read from the flattened source — confirm.
            if header:
                shape.draw_rect(fitz.Rect(header))
                shape.finish(color=None, fill=black, fill_opacity=0.2)
            if footer:
                shape.draw_rect(fitz.Rect(footer))
                shape.finish(color=None, fill=black, fill_opacity=0.2)

            shape.commit()

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
245
-
246
-
247
def draw_layout_on_page(raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str):
    """
    Draw the layout boxes of one page, with red borders, onto page ``page_idx`` of ``pdf_path``.

    Deprecated: a ``DeprecationWarning`` is emitted on every call.

    Args:
        raw_pdf_doc: not used by this function.
        page_idx: index of the page to draw on.
        page_layout: layouts to draw; each carries ``layout_bbox``,
            ``layout_label`` and ``sub_layout`` (a list of nested layouts).
        pdf_path: path of the PDF to annotate. When it exists it is saved
            incrementally, otherwise a blank document is saved to this path.
    """
    # Local import: the module header does not import ``warnings``.
    import warnings

    # The original used ``@DeprecationWarning`` as a decorator, which rebinds
    # the name to an exception instance and makes the function uncallable.
    warnings.warn("draw_layout_on_page is deprecated", DeprecationWarning, stacklevel=2)

    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
        border_offset = 1
        children = layout['sub_layout']
        if len(children) == 0:
            # Leaf layout: only those labelled 'U' get a fill.
            leaf_fill = fill_color if layout['layout_label'] == 'U' else None
            x0, y0, x1, y1 = layout['layout_bbox'][:4]
            rect = fitz.Rect(x0 + 1, y0 - border_offset, x1 - 1, y1 + border_offset)
            shape.draw_rect(rect)
            shape.finish(color=fitz.pdfcolor['red'], fill=leaf_fill, fill_opacity=0.2)

        # Nested layouts are drawn with the default fill colour.
        for child in children:
            draw(shape, child)
        # NOTE(review): commit placed at the end of every draw() call; the
        # original nesting could not be read from the flattened source — confirm.
        shape.commit()

    # Annotate an existing file, otherwise start from a new blank document.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    try:
        shape = doc[page_idx].new_shape()
        for layout in page_layout:
            draw(shape, layout, fitz.pdfcolor['yellow'])

        # A bare file name has no directory part; os.makedirs('') would raise.
        parent_dir = os.path.dirname(pdf_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
308
-