magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,101 @@
1
+ """
2
+ 找到能分割布局的水平的横线、色块
3
+ """
4
+
5
+ import os
6
+ from magic_pdf.libs.commons import fitz
7
+ from magic_pdf.libs.boxbase import _is_in_or_part_overlap
8
+
9
+
10
def __rect_filter_by_width(rect, page_w, page_h):
    """Tell whether ``rect`` straddles the vertical midline of the page.

    Args:
        rect: Indexable box; item 0 is read as the left x-coordinate and
            item 2 as the right x-coordinate.
        page_w: Page width; the midline is ``page_w / 2``.
        page_h: Page height. Accepted but not used by this check.

    Returns:
        bool: True only when the left edge lies strictly left of the midline
        and the right edge lies strictly right of it.
    """
    half_width = page_w / 2
    return bool(rect[0] < half_width < rect[2])
15
+
16
+
17
def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
    """Tell whether ``rect`` stays clear of every image and table box.

    A candidate must not appear where a table or an image is located.

    Args:
        rect: Candidate box, passed unchanged to ``_is_in_or_part_overlap``.
        image_bboxes: Iterable of image boxes, checked first.
        table_bboxes: Iterable of table boxes, checked only if no image box
            matched.

    Returns:
        bool: False as soon as ``_is_in_or_part_overlap(rect, box)`` is truthy
        for any box, otherwise True.
    """
    hits_image = any(_is_in_or_part_overlap(rect, box) for box in image_bboxes)
    # ``or`` keeps the original order: tables are only inspected when no
    # image box matched.
    return not (
        hits_image
        or any(_is_in_or_part_overlap(rect, box) for box in table_bboxes)
    )
30
+
31
+
32
def __debug_show_page(page, bboxes1: list, bboxes2: list, bboxes3: list):
    """Debug helper: draw three groups of boxes on a blank page and save it.

    A new one-page PDF with the same width and height as ``page`` is created,
    every box is drawn as a rectangle, and the result is written to
    ``./tmp/debug.pdf`` (an existing file at that path is removed first).

    Args:
        page: Source page; only ``page.rect.width`` and ``page.rect.height``
            are read.
        bboxes1: Boxes drawn with a red outline and a 20% opaque blue fill.
        bboxes2: Boxes drawn without outline and a 20% opaque yellow fill.
        bboxes3: Boxes drawn with a red outline and no fill.

    Each box must be indexable; its first four items are passed to
    ``fitz.Rect``.
    """
    save_path = "./tmp/debug.pdf"
    # Delete a file left over from a previous run.
    if os.path.exists(save_path):
        os.remove(save_path)

    # One ``Shape.finish`` style per box group, listed in drawing order.
    styled_groups = (
        (bboxes1, dict(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)),
        (bboxes2, dict(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)),
        (bboxes3, dict(color=fitz.pdfcolor['red'], fill=None)),
    )

    # Create a new, empty PDF document.
    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        for boxes, style in styled_groups:
            for bbox in boxes:
                # Draw the raw box; one shape per box, committed right away.
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                shape.commit()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
78
+
79
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """Collect the filled rectangles (color blocks / horizontal rules) of a page.

    Every drawing returned by ``page.get_cdrawings()`` is kept when it

    * has a truthy ``'fill'`` value other than pure white ``(1.0, 1.0, 1.0)``,
    * passes ``__rect_filter_by_width`` for the page size, and
    * passes ``__rect_filter_by_pos`` against the image and table boxes.

    Args:
        page: Page object providing ``get_cdrawings()`` and ``rect``.
        image_bboxes: Image boxes forwarded to ``__rect_filter_by_pos``.
        table_bboxes: Table boxes forwarded to ``__rect_filter_by_pos``.

    Returns:
        list[list]: One ``[x0, y0, x1, y1]`` list per accepted rectangle, with
        non-positive heights corrected to 1.
    """
    spilter_bbox = []
    for block in page.get_cdrawings():
        fill = block.get('fill')
        # Skip drawings without a fill and those filled with pure white.
        if not fill or fill == (1.0, 1.0, 1.0):
            continue
        rect = block['rect']
        if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
            spilter_bbox.append(list(rect))

    # Filter/repair the boxes: some rectangles have a height of zero or a
    # negative height, which makes the layout computation loop forever.
    # Any such box is uniformly corrected to a height of 1.
    for box in spilter_bbox:
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1

    #__debug_show_page(page, spilter_bbox, [], [])

    return spilter_bbox
@@ -0,0 +1,336 @@
1
+ """
2
+ This is an advanced PyMuPDF utility for detecting multi-column pages.
3
+ It can be used in a shell script, or its main function can be imported and
4
+ invoked as described below.
5
+
6
+ Features
7
+ ---------
8
+ - Identify text belonging to (a variable number of) columns on the page.
9
+ - Text with different background color is handled separately, allowing for
10
+ easier treatment of side remarks, comment boxes, etc.
11
+ - Uses text block detection capability to identify text blocks and
12
+ uses the block bboxes as primary structuring principle.
13
+ - Supports ignoring footers via a footer margin parameter.
14
+ - Returns re-created text boundary boxes (integer coordinates), sorted ascending
15
+ by the top, then by the left coordinates.
16
+
17
+ Restrictions
18
+ -------------
19
+ - Only supporting horizontal, left-to-right text
20
+ - Returns a list of text boundary boxes - not the text itself. The caller is
21
+ expected to extract text from within the returned boxes.
22
+ - Text written above images is ignored altogether (option).
23
+ - This utility works as expected in most cases. The following situation cannot
24
+ be handled correctly:
25
+ * overlapping (non-disjoint) text blocks
26
+ * image captions are not recognized and are handled like normal text
27
+
28
+ Usage
29
+ ------
30
+ - As a CLI shell command use
31
+
32
+ python multi_column.py input.pdf footer_margin
33
+
34
+ Where footer margin is the height of the bottom stripe to ignore on each page.
35
+ This code is intended to be modified according to your need.
36
+
37
+ - Use in a Python script as follows:
38
+
39
+ ----------------------------------------------------------------------------------
40
+ from multi_column import column_boxes
41
+
42
+ # for each page execute
43
+ bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
44
+
45
+ # bboxes is a list of fitz.IRect objects, that are sorted ascending by their y0,
46
+ # then x0 coordinates. Their text content can be extracted by all PyMuPDF
47
+ # get_text() variants, like for instance the following:
48
+ for rect in bboxes:
49
+ print(page.get_text(clip=rect, sort=True))
50
+ ----------------------------------------------------------------------------------
51
+ """
52
+ import sys
53
+ from magic_pdf.libs.commons import fitz
54
+
55
+
56
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Text blocks inside the page area (minus header and footer stripes) are
    reduced to the union of their non-trivial lines, extended to the right
    page border where nothing is in the way, and then joined into larger
    boxes as long as a join does not cross columns, background rectangles
    or other text.

    Args:
        page: PyMuPDF page to analyse.
        footer_margin: Height of the bottom stripe that is excluded.
        header_margin: Height of the top stripe that is excluded.
        no_image_text: When true, text blocks contained in an image bbox are
            ignored.

    Returns:
        list: The joined text boundary boxes (``fitz.IRect``), or an empty
        list when no text box was found.
    """
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area (unary plus makes a copy of page.rect)
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        for b in bboxlist:
            if not intersects_bboxes(temp, vert_bboxes) and (
                b is None or b == bb or (temp & b).is_empty
            ):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            check = can_extend(temp, bb, bboxes)
            if check:
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b is not None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        # Walk backwards comparing each block with its predecessor.  Stop at
        # index 1: index 0 has no predecessor, and comparing it with
        # nblocks[-1] would wrap around to the last block (or to itself once
        # a single block is left, which emptied the list and made the
        # nblocks[0] access below fail).
        for i in range(blen - 1, 0, -1):
            if nblocks[i - 1] == nblocks[i]:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        # join the bboxes of all lines holding more than one non-blank character
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return if no text found
    if bboxes == []:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if not check:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None  # mark this old bbox as consumed

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks
285
+
286
+
287
if __name__ == "__main__":
    # Only for debugging purposes, currently.
    #
    # Draw red borders around the returned text bboxes and insert
    # the bbox number.
    # Then save the file under the name "input-blocks.pdf".

    # positional arguments: file name, optional footer margin, optional header margin
    filename = sys.argv[1]
    argc = len(sys.argv)

    # margins fall back to the default value 50 when not given
    footer_margin = int(sys.argv[2]) if argc > 2 else 50
    header_margin = int(sys.argv[3]) if argc > 3 else 50

    # open document
    doc = fitz.open(filename)
    red = fitz.pdfcolor["red"]

    for page in doc:
        # remove any geometry issues
        page.wrap_contents()

        # text bboxes of this page
        bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)

        # canvas for rectangles and text
        shape = page.new_shape()

        for i, rect in enumerate(bboxes):
            # border plus sequence number near the top-left corner
            shape.draw_rect(rect)
            shape.insert_text(rect.tl + (5, 15), str(i), color=red)

        # finish drawing / text with color red, then store to the page
        shape.finish(color=red)
        shape.commit()

    # save document with text bboxes
    doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
@@ -0,0 +1,11 @@
1
+ """
2
+ span维度自定义字段
3
+ """
4
# Whether the span was merged across pages
CROSS_PAGE = "cross_page"

# --- Custom fields at block level ---
# Whether the lines inside the block have been deleted
LINES_DELETED = "lines_deleted"
@@ -0,0 +1,10 @@
1
class MakeMode:
    """Namespace of string identifiers for the supported output modes."""

    # NOTE(review): names suggest two markdown flavours and a structured
    # format; confirm exact semantics against the code that consumes them.
    MM_MD = "mm_markdown"
    NLP_MD = "nlp_markdown"
    STANDARD_FORMAT = "standard_format"
5
+
6
+
7
class DropMode:
    """Namespace of string identifiers for the supported drop modes."""

    # NOTE(review): presumably selects what is discarded when a problem is
    # detected (whole PDF, a single page, or nothing); verify with callers.
    WHOLE_PDF = "whole_pdf"
    SINGLE_PAGE = "single_page"
    NONE = "none"
@@ -0,0 +1,9 @@
1
+ from enum import Enum
2
+
3
class ModelBlockTypeEnum(Enum):
    """Integer codes for block types.

    NOTE(review): the values are presumably the category ids emitted by the
    layout model; confirm against the model output before relying on them.
    """

    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    ISOLATED = 14
File without changes