magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,31 @@
1
+ import re
2
+
3
+
4
def escape_special_markdown_char(pymu_blocks):
    """
    Backslash-escape characters in body text that have a special meaning in markdown.

    Every span under ``blk['lines'][i]['spans']`` is rewritten in place so that
    ``*``, `````, ``~`` and ``$`` are prefixed with a backslash. Spans whose
    ``_type`` is ``inline-equation`` or ``interline-equation`` and spans with
    empty text are left untouched. The same ``pymu_blocks`` object is returned.
    """
    equation_types = ['inline-equation', 'interline-equation']
    for block in pymu_blocks:
        for text_line in block['lines']:
            for span in text_line['spans']:
                text = span['text']
                # Equation spans must stay verbatim; empty text has nothing to escape.
                if span.get("_type", None) in equation_types or not text:
                    continue
                for ch in ("*", "`", "~", "$"):
                    text = text.replace(ch, "\\" + ch)
                span['text'] = text

    return pymu_blocks
21
+
22
+
23
def ocr_escape_special_markdown_char(content):
    """
    Backslash-escape characters in body text that have a special meaning in markdown.

    Returns a copy of ``content`` in which ``*``, `````, ``~`` and ``$`` are
    each prefixed with a backslash.
    """
    escape_table = str.maketrans({ch: "\\" + ch for ch in "*`~$"})
    return content.translate(escape_table)
magic_pdf/libs/math.py ADDED
@@ -0,0 +1,9 @@
1
def float_gt(a, b):
    """Return whether ``a`` exceeds ``b``; differences of at most 0.0001 count as equal."""
    return False if abs(a - b) <= 0.0001 else a > b
5
+
6
def float_equal(a, b):
    """Return True when ``a`` and ``b`` differ by at most 0.0001, otherwise False."""
    return bool(abs(a - b) <= 0.0001)
@@ -0,0 +1,203 @@
1
+ import re
2
+ from os import path
3
+
4
+ from collections import Counter
5
+
6
+ from loguru import logger
7
+
8
+ # from langdetect import detect
9
+ import spacy
10
+ import en_core_web_sm
11
+ import zh_core_web_sm
12
+
13
+ from magic_pdf.libs.language import detect_lang
14
+
15
+
16
class NLPModels:
    r"""
    Holder for the English and Chinese small spaCy models used for entity detection.

    How to upload local models to s3:
        - config aws cli:
            doc\SETUP-CLI.md
            doc\setup_cli.sh
            app\config\__init__.py
        - $ cd {local_dir_storing_models}
        - $ ls models
            en_core_web_sm-3.7.1/
            zh_core_web_sm-3.7.0/
        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
        - $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
            PRE en_core_web_sm-3.7.1/
            PRE zh_core_web_sm-3.7.0/
    """

    def __init__(self):
        """Record the default model locations and load both small spaCy models."""
        # if OS is windows, set "TMP_DIR" to "D:/tmp"

        home_dir = path.expanduser("~")
        self.default_local_path = path.join(home_dir, ".nlp_models")
        self.default_shared_path = "/share/pdf_processor/nlp_models"
        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
        self.default_s3_path = "s3://llm-infra/models"
        # Supported models, keyed by package name: sm/md/lg for each language.
        self.nlp_models = {
            f"{lang}_core_web_{size}": {"type": "spacy", "version": version}
            for lang, version in (("en", "3.7.1"), ("zh", "3.7.0"))
            for size in ("sm", "md", "lg")
        }
        self.en_core_web_sm_model = en_core_web_sm.load()
        self.zh_core_web_sm_model = zh_core_web_sm.load()

    def load_model(self, model_name, model_type, model_version):
        """
        Load a supported spaCy model by name.

        Returns the loaded model when ``model_name`` is listed in
        ``self.nlp_models`` with a matching type and version and is installed
        as a package. Returns None when the package is not installed, and logs
        an error and returns None when the name/type/version is not supported.
        """
        spec = self.nlp_models.get(model_name)
        if spec is None or spec["type"] != model_type or spec["version"] != model_version:
            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
            return None

        return spacy.load(model_name) if spacy.util.is_package(model_name) else None

    def detect_language(self, text, use_langdetect=False):
        """
        Classify ``text`` as ``"en"`` or ``"zh"``.

        Parameters
        ----------
        text : str
            Text to classify.
        use_langdetect : bool
            When True, defer to ``detect_lang`` and map every result other than
            ``"zh"`` to ``"en"``. When False, compare the number of ASCII
            letters with the number of characters in U+4E00..U+9FFF.

        Returns
        -------
        str or None
            ``"en"`` or ``"zh"``; None for empty text or, in counting mode,
            when both counts are equal.
        """
        if len(text) == 0:
            return None

        if use_langdetect:
            return "zh" if detect_lang(text) == "zh" else "en"

        en_count = len(re.findall(r"[a-zA-Z]", text))
        cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))

        if en_count > cn_count:
            return "en"
        if cn_count > en_count:
            return "zh"
        # Tie (including text with neither kind of character): undecided.
        return None

    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
        """
        Detect entity categories using NLP models and return the most frequent entity type.

        Parameters
        ----------
        text : str
            Text to be processed.
        threshold : float
            Minimum share of entity words that the most common entity label
            must account for.

        Returns
        -------
        str or None
            The most frequent entity label, or None when entities cover less
            than half of the non-punctuation tokens, when the dominant label
            falls below ``threshold``, or when detection raises. An empty dict
            is returned when the language is neither "en" nor "zh" (kept for
            backward compatibility; it is falsy like None).
        """
        lang = self.detect_language(text, use_langdetect=True)

        if lang == "en":
            nlp_model = self.en_core_web_sm_model
        elif lang == "zh":
            nlp_model = self.zh_core_web_sm_model
        else:
            # logger.error(f"Unsupported language: {lang}")
            return {}

        # Splitting text into smaller parts
        text_parts = re.split(r"[,;,;、\s & |]+", text)

        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]  # Remove non-words
        text_combined = " ".join(text_parts)

        try:
            doc = nlp_model(text_combined)

            # Whitespace-separated word count of entity text, per entity label.
            word_counts_in_entities = Counter()
            for ent in doc.ents:
                word_counts_in_entities[ent.label_] += len(ent.text.split())

            total_words_in_entities = sum(word_counts_in_entities.values())
            total_words = len([token for token in doc if not token.is_punct])

            if total_words_in_entities == 0 or total_words == 0:
                return None

            # Entities must make up at least half of the text to be considered.
            entity_percentage = total_words_in_entities / total_words
            if entity_percentage < 0.5:
                return None

            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
            entity_percentage = word_count / total_words_in_entities

            if entity_percentage >= threshold:
                return most_common_entity
            return None
        except Exception as e:
            # Best effort: a model failure is logged and reported as "no entity".
            logger.error(f"Error in entity detection: {e}")
            return None
162
+
163
+
164
def __main__():
    """Run entity-category detection over a set of sample strings and print each result."""
    samples = [
        "张三",
        "张三, 李四,王五; 赵六",
        "John Doe",
        "Jane Smith",
        "Lee, John",
        "John Doe, Jane Smith; Alice Johnson,Bob Lee",
        "孙七, Michael Jordan;赵八",
        "David Smith Michael O'Connor; Kevin ßáçøñ",
        "李雷·韩梅梅, 张三·李四",
        "Charles Robert Darwin, Isaac Newton",
        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
        "John Doe, Jane Smith; Alice Johnson",
        "张三, 李四,王五; 赵六",
        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
        "Rachel Mills & William Barry & Susanne B. Haga",
        "Claire Chabut* and Jean-François Bussières",
        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
        "Changchun",
        "china",
        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
        "Synergistic Effect of Supported Nickel Catalyst with",
        "Intumescent Flame-Retardants on Flame Retardancy",
        "and Thermal Stability of Polypropylene",
    ]

    models = NLPModels()
    for sample in samples:
        print()
        print(f"Original String: {sample}")

        detected = models.detect_entity_catgr_using_nlp(sample)
        print(f"Detected entities: {detected}")


if __name__ == "__main__":
    __main__()
@@ -0,0 +1,21 @@
1
class ContentType:
    """String constants naming the kinds of content."""

    # Non-text content.
    Image = "image"
    Table = "table"

    # Textual content, including the two equation variants.
    Text = "text"
    InlineEquation = "inline_equation"
    InterlineEquation = "interline_equation"
7
+
8
class BlockType:
    """String constants naming the kinds of block."""

    # Image block and its parts.
    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"

    # Table block and its parts.
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"

    # Remaining block kinds.
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"
    Discarded = "discarded"
21
+
@@ -0,0 +1,23 @@
1
+
2
+
3
+ from s3pathlib import S3Path
4
+
5
def remove_non_official_s3_args(s3path):
    """
    Drop everything from the first "?" onwards.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
    """
    head, _, _ = s3path.partition("?")
    return head
11
+
12
def parse_s3path(s3path: str):
    """Return the ``(bucket, key)`` pair of ``s3path`` after removing any "?..." suffix."""
    clean_path = remove_non_official_s3_args(s3path)
    s3_obj = S3Path(clean_path)
    return s3_obj.bucket, s3_obj.key
15
+
16
def parse_s3_range_params(s3path: str):
    """
    Extract the comma-separated values following "?bytes=".

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ["0", "81350"]

    The values are returned as strings. None is returned when the path has no
    "?bytes=" part.
    """
    pieces = s3path.split("?bytes=")
    return pieces[1].split(",") if len(pieces) > 1 else None
@@ -0,0 +1,33 @@
1
+
2
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
3
+ from magic_pdf.libs.commons import fitz
4
+ from magic_pdf.libs.commons import join_path
5
+ from magic_pdf.libs.hash_utils import compute_sha256
6
+
7
+
8
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
    """
    Crop the ``bbox`` region of ``page`` into a JPEG, write it and return its path.

    A legacy-style name ``{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}``
    (bbox values truncated to int) is joined onto ``return_path``. The SHA-256
    of that path plus ``.jpg`` is the flat name the image is written under via
    ``imageWriter`` and the value returned.
    """
    # Build the legacy file name from the page number and the integer bbox corners.
    corners = [int(bbox[i]) for i in range(4)]
    filename = f"{page_num}_" + "_".join(str(c) for c in corners)

    # Old versions returned this path (without bucket).
    if return_path is not None:
        img_path = join_path(return_path, filename)
    else:
        img_path = None

    # New versions use a flat, hash-based path.
    img_hash256_path = f"{compute_sha256(img_path)}.jpg"

    # Render the clipped rectangle at 3x zoom.
    clip_rect = fitz.Rect(*bbox)
    zoom_matrix = fitz.Matrix(3, 3)
    pixmap = page.get_pixmap(clip=clip_rect, matrix=zoom_matrix)

    jpeg_bytes = pixmap.tobytes(output='jpeg', jpg_quality=95)
    imageWriter.write(jpeg_bytes, img_hash256_path, AbsReaderWriter.MODE_BIN)

    return img_hash256_path
@@ -0,0 +1,11 @@
1
+ import os
2
+
3
+
4
def sanitize_filename(filename, replacement="_"):
    """
    Replace characters that are invalid in Windows file names.

    On Windows (``os.name == 'nt'``) each of ``<>:"|?*`` is replaced by
    ``replacement``; on every other platform ``filename`` is returned unchanged.
    """
    if os.name != 'nt':
        return filename

    for bad_char in '<>:"|?*':
        filename = filename.replace(bad_char, replacement)
    return filename
@@ -0,0 +1,33 @@
1
+ import math
2
+
3
+
4
def __inc_dict_val(mp, key, val_inc:int):
    """Add ``val_inc`` to ``mp[key]``, starting from ``val_inc`` when the entry is missing or falsy."""
    mp[key] = (mp[key] + val_inc) if mp.get(key) else val_inc


def get_text_block_base_info(block):
    """
    Return the dominant ``(color, size, font)`` of a text block.

    Each span contributes the length of its text to the tally of its
    (color, size rounded to 2 decimals, font) combination; the combination with
    the largest total is returned.
    """
    char_tally = {}

    for text_line in block['lines']:
        for span in text_line['spans']:
            style_key = (span['color'], round(span['size'], 2), span['font'])
            __inc_dict_val(char_tally, style_key, len(span['text']))

    color, size, font = max(char_tally, key=char_tally.get)
    return color, size, font
33
+
@@ -0,0 +1 @@
1
+ __version__ = "0.5.4"
@@ -0,0 +1,308 @@
1
+ from magic_pdf.libs.commons import fitz
2
+ import os
3
+
4
+
5
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
    """
    Draw the bounding boxes of every page in ``paras_dict`` and save them to ``save_path``.

    For each entry a new page with the size of ``raw_pdf_doc[page_idx]`` is
    appended to the output document (opened from ``save_path`` if it exists,
    otherwise created) and the following boxes are drawn on it:

    - ``preproc_blocks``: blue fill, 20% opacity
    - ``images``: yellow fill
    - ``image_backup``: yellow outline
    - ``droped_text_block``: black fill, 40% opacity
    - ``tables``: green fill, 20% opacity

    A new document is written with ``save``; an existing one is updated with
    ``saveIncr``.
    """
    def draw_box(page, bbox, **style):
        # One shape per rectangle, committed immediately.
        shape = page.new_shape()
        shape.draw_rect(fitz.Rect(bbox))
        shape.finish(**style)
        shape.finish()
        shape.commit()

    yellow = fitz.pdfcolor['yellow']

    # Append to an existing PDF, otherwise start from a blank document.
    is_new_pdf = not os.path.exists(save_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(save_path)

    try:
        for page_info in paras_dict.values():
            source_rect = raw_pdf_doc[page_info['page_idx']].rect
            new_page = doc.new_page(width=source_rect.width, height=source_rect.height)

            for block in page_info['preproc_blocks']:
                draw_box(new_page, block['bbox'], color=None, fill=fitz.pdfcolor['blue'], fill_opacity=0.2)

            for img in page_info['images']:
                draw_box(new_page, img['bbox'], color=None, fill=yellow)

            for img in page_info['image_backup']:
                draw_box(new_page, img['bbox'], color=yellow, fill=None)

            for tb in page_info['droped_text_block']:
                draw_box(new_page, tb['bbox'], color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)

            # TODO table
            for tb in page_info['tables']:
                draw_box(new_page, tb['bbox'], color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)

        # A bare file name has an empty dirname, which os.makedirs rejects.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(save_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
86
+
87
+
88
def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list, expect_drop_bboxes:list, save_path: str, expected_page_id:int):
    """
    Write a temporary single-page PDF for debugging, overwriting ``save_path``.

    Nothing happens unless ``page_idx`` equals ``expected_page_id``. Otherwise
    a blank page with the size of ``raw_pdf_doc[page_idx]`` is created and the
    first four values of each box are drawn as a rectangle:

    - ``bboxes``: red outline, blue fill, 20% opacity
    - ``droped_bboxes``: yellow fill, 20% opacity
    - ``expect_drop_bboxes``: red outline only
    """
    if page_idx != expected_page_id:
        return

    def draw_boxes(page, boxes, **style):
        # One shape per rectangle, committed immediately.
        for bbox in boxes:
            shape = page.new_shape()
            shape.draw_rect(fitz.Rect(*bbox[0:4]))
            shape.finish(**style)
            shape.finish()
            shape.commit()

    # Remove any previous output so the file is rewritten from scratch.
    if os.path.exists(save_path):
        os.remove(save_path)

    red = fitz.pdfcolor['red']
    doc = fitz.open('')
    try:
        source_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=source_rect.width, height=source_rect.height)

        draw_boxes(new_page, bboxes, color=red, fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
        draw_boxes(new_page, droped_bboxes, color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
        draw_boxes(new_page, expect_drop_bboxes, color=red, fill=None)

        # A bare file name has an empty dirname, which os.makedirs rejects.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
144
+
145
+
146
def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
    """
    Write a single-page debug PDF to ``./tmp/debug.pdf``, replacing any previous one.

    A blank page with the size of ``page`` is created and the first four values
    of each box are drawn as a rectangle:

    - ``bboxes1``: red outline, blue fill, 20% opacity
    - ``bboxes2``: yellow fill, 20% opacity
    - ``bboxes3``: red outline only
    """
    def draw_boxes(target, boxes, **style):
        # One shape per rectangle, committed immediately.
        for bbox in boxes:
            shape = target.new_shape()
            shape.draw_rect(fitz.Rect(*bbox[0:4]))
            shape.finish(**style)
            shape.finish()
            shape.commit()

    save_path = "./tmp/debug.pdf"
    # Remove any previous output so the file is rewritten from scratch.
    if os.path.exists(save_path):
        os.remove(save_path)

    red = fitz.pdfcolor['red']
    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        draw_boxes(new_page, bboxes1, color=red, fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
        draw_boxes(new_page, bboxes2, color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
        draw_boxes(new_page, bboxes3, color=red, fill=None)

        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
192
+
193
+
194
+
195
+
196
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
    """
    Draw the layout boxes of every page in ``paras_dict`` onto the PDF at ``pdf_path``.

    For each entry the page ``doc[page_idx]`` receives one red-outlined
    rectangle per item in ``layout_bboxes`` (pink fill at 40% opacity when its
    ``layout_label`` is ``'U'``), annotated with the item's order number. When
    ``header`` / ``footer`` are truthy they are drawn as black rectangles at
    20% opacity. A new document is written with ``save``; an existing one is
    updated with ``saveIncr``. ``raw_pdf_doc`` is not used.
    """
    border_offset = 1
    font_size = 10

    # Draw onto the existing PDF, otherwise start from a blank document.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    try:
        for page_info in paras_dict.values():
            page_idx = page_info['page_idx']
            layouts = page_info['layout_bboxes']
            # NOTE(review): a freshly created document has no pages, so this
            # lookup presumably only works when pdf_path already exists — confirm.
            page = doc[page_idx]
            shape = page.new_shape()

            for order, layout in enumerate(layouts):
                rect_box = layout['layout_bbox']
                layout_label = layout['layout_label']
                fill_color = fitz.pdfcolor['pink'] if layout_label == 'U' else None
                # Shrink horizontally and grow vertically by one unit.
                rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
                shape.draw_rect(fitz.Rect(*rect_box))
                shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
                # Draw the order number inside the layout box.
                shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))

            # Draw the header and footer regions.
            if header:
                shape.draw_rect(fitz.Rect(header))
                shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
            if footer:
                shape.draw_rect(fitz.Rect(footer))
                shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)

            shape.commit()

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
245
+
246
+
247
def draw_layout_on_page(raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str):
    """
    Draw the layout boxes with a red border on page ``page_idx`` of ``pdf_path``.

    Deprecated: a ``DeprecationWarning`` is emitted on every call. The warning
    is raised from inside the function because using ``DeprecationWarning`` as
    a decorator would replace the function with a non-callable exception
    instance.

    Every layout without sub-layouts is drawn as a red-outlined rectangle,
    filled at 20% opacity when its ``layout_label`` is ``'U'`` (yellow for
    top-level layouts, pink for nested ones); sub-layouts are drawn
    recursively. ``raw_pdf_doc`` is not used.
    """
    import warnings

    warnings.warn("draw_layout_on_page is deprecated", DeprecationWarning, stacklevel=2)

    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
        border_offset = 1
        rect_box = layout['layout_bbox']
        layout_label = layout['layout_label']
        sub_layouts = layout['sub_layout']
        if len(sub_layouts) == 0:
            fill_color = fill_color if layout_label == 'U' else None
            # Shrink horizontally and grow vertically by one unit.
            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
            shape.draw_rect(fitz.Rect(*rect_box))
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)

        for child in sub_layouts:
            draw(shape, child)
        shape.commit()

    # Draw onto the existing PDF, otherwise start from a blank document.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    try:
        # NOTE(review): a freshly created document has no pages, so this lookup
        # presumably only works when pdf_path already exists — confirm.
        page = doc[page_idx]
        shape = page.new_shape()
        for layout in page_layout:
            draw(shape, layout, fitz.pdfcolor['yellow'])

        # A bare file name has an empty dirname, which os.makedirs rejects.
        parent_dir = os.path.dirname(pdf_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
308
+
File without changes
@@ -0,0 +1,8 @@
1
+ from ultralytics import YOLO
2
+
3
# Path of the image to run prediction on.
image_path = ''
# Path of the model weights.
model_path = ''
model = YOLO(model_path)

result = model(
    image_path,
    save=True,
    conf=0.5,
    save_crop=False,
    line_width=2,
)
print(result)