magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. magic_pdf/filter/pdf_meta_scan.py +3 -17
  2. magic_pdf/libs/commons.py +0 -161
  3. magic_pdf/libs/draw_bbox.py +2 -3
  4. magic_pdf/libs/markdown_utils.py +0 -21
  5. magic_pdf/libs/pdf_image_tools.py +2 -1
  6. magic_pdf/libs/version.py +1 -1
  7. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  8. magic_pdf/model/magic_model.py +0 -30
  9. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  11. magic_pdf/para/para_split_v3.py +7 -2
  12. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  13. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  14. magic_pdf/pre_proc/cut_image.py +0 -37
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  16. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  17. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  18. magic_pdf/rw/S3ReaderWriter.py +1 -1
  19. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  20. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
  21. magic_pdf/dict2md/mkcontent.py +0 -438
  22. magic_pdf/layout/__init__.py +0 -0
  23. magic_pdf/layout/bbox_sort.py +0 -681
  24. magic_pdf/layout/layout_det_utils.py +0 -182
  25. magic_pdf/layout/layout_sort.py +0 -921
  26. magic_pdf/layout/layout_spiler_recog.py +0 -101
  27. magic_pdf/layout/mcol_sort.py +0 -336
  28. magic_pdf/libs/calc_span_stats.py +0 -239
  29. magic_pdf/libs/detect_language_from_model.py +0 -21
  30. magic_pdf/libs/nlp_utils.py +0 -203
  31. magic_pdf/libs/textbase.py +0 -33
  32. magic_pdf/libs/vis_utils.py +0 -308
  33. magic_pdf/para/block_continuation_processor.py +0 -562
  34. magic_pdf/para/block_termination_processor.py +0 -480
  35. magic_pdf/para/commons.py +0 -222
  36. magic_pdf/para/denoise.py +0 -246
  37. magic_pdf/para/draw.py +0 -121
  38. magic_pdf/para/exceptions.py +0 -198
  39. magic_pdf/para/layout_match_processor.py +0 -40
  40. magic_pdf/para/para_split.py +0 -807
  41. magic_pdf/para/para_split_v2.py +0 -959
  42. magic_pdf/para/raw_processor.py +0 -207
  43. magic_pdf/para/stats.py +0 -268
  44. magic_pdf/para/title_processor.py +0 -1014
  45. magic_pdf/pdf_parse_union_core.py +0 -345
  46. magic_pdf/post_proc/__init__.py +0 -0
  47. magic_pdf/post_proc/detect_para.py +0 -3472
  48. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  49. magic_pdf/post_proc/remove_footnote.py +0 -153
  50. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  51. magic_pdf/pre_proc/detect_equation.py +0 -134
  52. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  53. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  54. magic_pdf/pre_proc/detect_footnote.py +0 -170
  55. magic_pdf/pre_proc/detect_header.py +0 -64
  56. magic_pdf/pre_proc/detect_images.py +0 -647
  57. magic_pdf/pre_proc/detect_page_number.py +0 -64
  58. magic_pdf/pre_proc/detect_tables.py +0 -62
  59. magic_pdf/pre_proc/equations_replace.py +0 -550
  60. magic_pdf/pre_proc/fix_image.py +0 -244
  61. magic_pdf/pre_proc/fix_table.py +0 -270
  62. magic_pdf/pre_proc/main_text_font.py +0 -23
  63. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  64. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  65. magic_pdf/pre_proc/post_layout_split.py +0 -0
  66. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  67. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  68. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  69. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  70. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  71. magic_pdf/pre_proc/statistics.py +0 -12
  72. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  73. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
  74. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  75. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,101 +0,0 @@
1
- """
2
- Find the horizontal lines and color blocks that can split the layout
3
- """
4
-
5
- import os
6
- from magic_pdf.libs.commons import fitz
7
- from magic_pdf.libs.boxbase import _is_in_or_part_overlap
8
-
9
-
10
def __rect_filter_by_width(rect, page_w, page_h):
    """Return True when ``rect`` straddles the vertical centre line of the page.

    Args:
        rect: Box whose items 0 and 2 are its left and right x coordinates.
        page_w: Page width; the centre line is ``page_w / 2``.
        page_h: Page height. Unused; kept so existing callers keep working.

    Returns:
        True if the centre line lies strictly between the left and right
        edges of ``rect``, otherwise False.
    """
    mid_x = page_w / 2
    return bool(rect[0] < mid_x < rect[2])
15
-
16
-
17
def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
    """Return True when ``rect`` does not touch any image or table box.

    A rectangle must not appear at the position of a table or an image:
    it is rejected as soon as ``_is_in_or_part_overlap`` reports that it
    lies inside, or partly overlaps, one of the given boxes.

    Args:
        rect: Candidate rectangle.
        image_bboxes: Iterable of image boxes (checked first).
        table_bboxes: Iterable of table boxes.

    Returns:
        False if ``rect`` overlaps any box, otherwise True.
    """
    return not any(
        _is_in_or_part_overlap(rect, box)
        for boxes in (image_bboxes, table_bboxes)
        for box in boxes
    )
30
-
31
-
32
def __debug_show_page(page, bboxes1: list, bboxes2: list, bboxes3: list):
    """Draw three groups of boxes on a blank page and save it for inspection.

    Debug helper. Writes a one-page PDF to ``./tmp/debug.pdf`` whose page
    has the same width and height as ``page``; a previous file at that
    path is removed first.

    Args:
        page: Source page; only ``page.rect.width`` / ``.height`` are read.
        bboxes1: Boxes drawn with a red border and a translucent blue fill.
        bboxes2: Boxes drawn without border and a translucent yellow fill.
        bboxes3: Boxes drawn with a red border and no fill.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        # remove the file left over from a previous run
        os.remove(save_path)

    # create a new, empty PDF document
    doc = fitz.open('')
    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        # one drawing style per box group; only the first four items of a
        # box are used as rectangle coordinates
        styled_groups = (
            (bboxes1, dict(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)),
            (bboxes2, dict(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)),
            (bboxes3, dict(color=fitz.pdfcolor['red'], fill=None)),
        )
        for bboxes, style in styled_groups:
            for bbox in bboxes:
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                shape.commit()

        # makedirs(exist_ok=True) avoids the check-then-create race
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        doc.save(save_path)
    finally:
        # release the document even when drawing or saving fails
        doc.close()
78
-
79
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """Collect the filled drawings (color blocks and rules) that split the page.

    A drawing returned by ``page.get_cdrawings()`` is kept when it has a
    non-empty fill other than pure white, crosses the vertical centre line
    of the page, and does not overlap any image or table box.

    Args:
        page: Page providing ``get_cdrawings()`` and ``rect.width/height``.
        image_bboxes: Image boxes the splitters must not overlap.
        table_bboxes: Table boxes the splitters must not overlap.

    Returns:
        List of ``[x0, y0, x1, y1]`` lists, each with a height of at least 1.
    """
    spilter_bbox = []
    for block in page.get_cdrawings():
        fill = block.get('fill')
        # ignore drawings without a fill and pure white fills
        if not fill or fill == (1.0, 1.0, 1.0):
            continue
        rect = block['rect']
        if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(
            rect, image_bboxes, table_bboxes
        ):
            spilter_bbox.append(list(rect))

    # Some rectangles have zero or negative height, which sends the layout
    # computation into an endless loop; normalise those to a height of 1.
    for box in spilter_bbox:
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1

    # __debug_show_page(page, spilter_bbox, [], [])

    return spilter_bbox
@@ -1,336 +0,0 @@
1
- """
2
- This is an advanced PyMuPDF utility for detecting multi-column pages.
3
- It can be used in a shell script, or its main function can be imported and
4
- invoked as described below.
5
-
6
- Features
7
- ---------
8
- - Identify text belonging to (a variable number of) columns on the page.
9
- - Text with different background color is handled separately, allowing for
10
- easier treatment of side remarks, comment boxes, etc.
11
- - Uses text block detection capability to identify text blocks and
12
- uses the block bboxes as primary structuring principle.
13
- - Supports ignoring footers via a footer margin parameter.
14
- - Returns re-created text boundary boxes (integer coordinates), sorted ascending
15
- by the top, then by the left coordinates.
16
-
17
- Restrictions
18
- -------------
19
- - Only supporting horizontal, left-to-right text
20
- - Returns a list of text boundary boxes - not the text itself. The caller is
21
- expected to extract text from within the returned boxes.
22
- - Text written above images is ignored altogether (option).
23
- - This utility works as expected in most cases. The following situation cannot
24
- be handled correctly:
25
- * overlapping (non-disjoint) text blocks
26
- * image captions are not recognized and are handled like normal text
27
-
28
- Usage
29
- ------
30
- - As a CLI shell command use
31
-
32
- python multi_column.py input.pdf footer_margin
33
-
34
- Where footer margin is the height of the bottom stripe to ignore on each page.
35
- This code is intended to be modified according to your need.
36
-
37
- - Use in a Python script as follows:
38
-
39
- ----------------------------------------------------------------------------------
40
- from multi_column import column_boxes
41
-
42
- # for each page execute
43
- bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
44
-
45
- # bboxes is a list of fitz.IRect objects, that are sort ascending by their y0,
46
- # then x0 coordinates. Their text content can be extracted by all PyMuPDF
47
- # get_text() variants, like for instance the following:
48
- for rect in bboxes:
49
- print(page.get_text(clip=rect, sort=True))
50
- ----------------------------------------------------------------------------------
51
- """
52
- import sys
53
- from magic_pdf.libs.commons import fitz
54
-
55
-
56
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Args:
        page: PyMuPDF page to analyse.
        footer_margin: Height of the bottom stripe of the page to ignore.
        header_margin: Height of the top stripe of the page to ignore.
        no_image_text: If True, ignore text blocks contained in an image bbox.

    Returns:
        A list of ``fitz.IRect`` text boundary boxes; an empty list when the
        clipped page area contains no usable horizontal text.
    """
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'
            and does not intersect any vertical-text bbox.
        """
        # does not depend on the loop variable, so evaluate it only once
        hits_vertical = intersects_bboxes(temp, vert_bboxes)
        for b in bboxlist:
            if not hits_vertical and (b is None or b == bb or (temp & b).is_empty):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            if can_extend(temp, bb, bboxes):
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b is not None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        start = blen - 1
        # Stop at index 1: at index 0 the predecessor nblocks[-1] would wrap
        # around to the last item, which could delete the only remaining
        # block and make nblocks[0] below raise IndexError.
        for i in range(start, 0, -1):
            bb1 = nblocks[i]
            bb0 = nblocks[i - 1]
            if bb0 == bb1:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        # rebuild the block bbox from lines holding more than one character
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return if no text found
    if not bboxes:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if not check:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks
285
-
286
-
287
- if __name__ == "__main__":
288
- """Only for debugging purposes, currently.
289
-
290
- Draw red borders around the returned text bboxes and insert
291
- the bbox number.
292
- Then save the file under the name "input-blocks.pdf".
293
- """
294
-
295
- # get the file name
296
- filename = sys.argv[1]
297
-
298
- # check if footer margin is given
299
- if len(sys.argv) > 2:
300
- footer_margin = int(sys.argv[2])
301
- else: # use default vaue
302
- footer_margin = 50
303
-
304
- # check if header margin is given
305
- if len(sys.argv) > 3:
306
- header_margin = int(sys.argv[3])
307
- else: # use default vaue
308
- header_margin = 50
309
-
310
- # open document
311
- doc = fitz.open(filename)
312
-
313
- # iterate over the pages
314
- for page in doc:
315
- # remove any geometry issues
316
- page.wrap_contents()
317
-
318
- # get the text bboxes
319
- bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
320
-
321
- # prepare a canvas to draw rectangles and text
322
- shape = page.new_shape()
323
-
324
- # iterate over the bboxes
325
- for i, rect in enumerate(bboxes):
326
- shape.draw_rect(rect) # draw a border
327
-
328
- # write sequence number
329
- shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
330
-
331
- # finish drawing / text with color red
332
- shape.finish(color=fitz.pdfcolor["red"])
333
- shape.commit() # store to the page
334
-
335
- # save document with text bboxes
336
- doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
@@ -1,239 +0,0 @@
1
- import os
2
- import csv
3
- import json
4
- import pandas as pd
5
- from pandas import DataFrame as df
6
- from matplotlib import pyplot as plt
7
- from termcolor import cprint
8
-
9
- """
10
- Execute this script in the following way:
11
-
12
- 1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
13
-
14
- code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
15
-
16
- 2. Under the directory code-clean, execute the following command:
17
-
18
- $ python -m libs.calc_span_stats
19
-
20
- """
21
-
22
-
23
def print_green_on_red(text):
    """Print ``text`` in bold green on a red background, followed by a blank line."""
    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
25
-
26
-
27
def print_green(text):
    """Print a blank line, then ``text`` in bold green followed by a blank line."""
    print()
    cprint(text, color="green", attrs=["bold"], end="\n\n")
30
-
31
-
32
def print_red(text):
    """Print a blank line, then ``text`` in bold red followed by a blank line."""
    print()
    cprint(text, color="red", attrs=["bold"], end="\n\n")
35
-
36
-
37
def safe_get(dict_obj, key, default):
    """Return ``dict_obj[key]``, or ``default`` if the key is missing or maps to None.

    Unlike ``dict.get(key, default)``, a key that is present with the value
    ``None`` also yields ``default``. Other falsy values (``0``, ``""``,
    ``False``) are returned unchanged.
    """
    val = dict_obj.get(key)
    return default if val is None else val
43
-
44
-
45
class SpanStatsCalc:
    """Calculate statistics of span."""

    # Column order of the frame returned by calc_stats_per_dict. Passing it
    # explicitly keeps the columns present even when no span was found, so
    # callers can index e.g. "span_is_superscript" on an empty result.
    _COLUMNS = (
        "span_id",
        "page_id",
        "span_text",
        "span_font_name",
        "span_font_size",
        "span_font_color",
        "span_font_flags",
        "span_is_superscript",
        "span_is_italic",
        "span_is_serifed",
        "span_is_sans_serifed",
        "span_is_monospaced",
        "span_is_proportional",
        "span_is_bold",
    )

    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
        """Draw multiple figures in one figure.

        Currently a stub: it only creates (or activates) figure ``fig_num``
        as a 20x20 canvas. ``span_stats`` and ``save_path`` are not used.
        """
        # make a canvas
        plt.figure(fig_num, figsize=(20, 20))

    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
        """Calculate statistics per pdf_dict.

        Args:
            pdf_dict: Mapping whose ``"page_*"`` entries may hold a
                ``"para_blocks"`` list of blocks with ``lines`` -> ``spans``.
                Entries with other keys, and pages without
                ``"para_blocks"``, are skipped.

        Returns:
            DataFrame with one row per span and the columns listed in
            ``_COLUMNS``; boolean font flags are stored as 0/1 integers.
            Missing or ``None`` span fields fall back to ``""``, ``0`` or
            ``False``. The frame is empty, but keeps its columns, when no
            span is found.
        """
        rows = []
        span_id = 0
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_") or "para_blocks" not in blocks:
                continue
            for para_block in blocks["para_blocks"]:
                for line in para_block["lines"]:
                    for span in line["spans"]:
                        flags = safe_get(span, "decomposed_flags", {})
                        rows.append(
                            {
                                "span_id": span_id,  # id of span
                                "page_id": page_id,  # page number of pdf
                                "span_text": safe_get(span, "text", ""),  # text of span
                                "span_font_name": safe_get(span, "font", ""),  # font name of span
                                "span_font_size": safe_get(span, "size", 0),  # font size of span
                                "span_font_color": safe_get(span, "color", ""),  # font color of span
                                "span_font_flags": safe_get(span, "flags", 0),  # font flags of span
                                # 0/1 indicators decoded from the font flags
                                "span_is_superscript": int(safe_get(flags, "is_superscript", False)),
                                "span_is_italic": int(safe_get(flags, "is_italic", False)),
                                "span_is_serifed": int(safe_get(flags, "is_serifed", False)),
                                "span_is_sans_serifed": int(safe_get(flags, "is_sans_serifed", False)),
                                "span_is_monospaced": int(safe_get(flags, "is_monospaced", False)),
                                "span_is_proportional": int(safe_get(flags, "is_proportional", False)),
                                "span_is_bold": int(safe_get(flags, "is_bold", False)),
                            }
                        )
                        span_id += 1

        return pd.DataFrame(rows, columns=list(self._COLUMNS))
115
-
116
-
117
def __find_pdf_dic_files(
    jf_name="pdf_dic.json",
    base_code_name="code-clean",
    tgt_base_dir_name="tmp",
    unittest_dir_name="unittest",
    md_dir_name="md",
    book_names=("scihub",),  # other possible values: "zlib", "arxiv" and so on
):
    """Find every ``jf_name`` file below the unit-test markdown directories.

    The directory of this module is searched for the first occurrence of
    ``base_code_name``; the path up to and including that occurrence is the
    code base directory. For each book name the tree
    ``<base>/<tgt_base_dir_name>/<unittest_dir_name>/<md_dir_name>/<book>``
    is walked.

    NOTE(review): indentation was lost in the text this was reconstructed
    from, so the exact position of the original ``break`` is uncertain;
    this version stops at the first occurrence of ``base_code_name``.

    Returns:
        List of paths to the files named ``jf_name``; empty if the base
        directory cannot be located.
    """
    # abspath: __file__ may be relative, which would hide the base directory
    curr_dir = os.path.dirname(os.path.abspath(__file__))

    start = curr_dir.find(base_code_name)
    if start < 0:
        return []
    base_code_dir_name = curr_dir[: start + len(base_code_name)]
    if not os.path.exists(base_code_dir_name):
        return []

    pdf_dict_files = []
    for book_name in book_names:
        search_dir_name = os.path.join(
            base_code_dir_name, tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name
        )
        for root, _dirs, files in os.walk(search_dir_name):
            pdf_dict_files.extend(os.path.join(root, name) for name in files if name == jf_name)

    return pdf_dict_files
145
-
146
-
147
def combine_span_texts(group_df, span_stats):
    """Render every span of ``group_df`` together with its neighbouring spans.

    Args:
        group_df: Rows to render; each row's index label is its span id.
        span_stats: Frame indexed by span id with a ``"span_text"`` column,
            used to look up the previous and next span.

    Returns:
        One three-line entry per row (previous, current and next span text,
        each prefixed with ``"→ → → "``); entries are separated by a blank
        line. A missing neighbour is rendered as an empty text.
    """
    pointer_sign = "→ → → "

    def text_of(span_id):
        # neighbours outside span_stats (first / last span) have no text
        return span_stats.at[span_id, "span_text"] if span_id in span_stats.index else ""

    combined_span_texts = []
    for _, row in group_df.iterrows():
        curr_span_id = row.name
        context = (text_of(curr_span_id - 1), row["span_text"], text_of(curr_span_id + 1))
        combined_span_texts.append("\n".join(pointer_sign + text for text in context))

    return "\n\n".join(combined_span_texts)
165
-
166
-
167
# pd.set_option("display.max_colwidth", None)  # set to None to show the complete text
pd.set_option("display.max_rows", None)  # set to None to show more rows
169
-
170
-
171
def main():
    """Write span statistics for every ``pdf_dic.json`` found by ``__find_pdf_dic_files``.

    For each file the following outputs are written next to it:

    * ``span_stats_raw.csv``: one row per span;
    * ``span_stats_final.csv``: superscript spans grouped by font name,
      size and color, with the texts of the neighbouring spans;
    * ``span_stats_combined.png``: bar charts of the grouped counts.

    Files without any superscript span only get the raw CSV.
    """
    pdf_dict_files = __find_pdf_dic_files()
    # print(pdf_dict_files)

    span_stats_calc = SpanStatsCalc()

    for pdf_dict_file in pdf_dict_files:
        print("-" * 100)
        print_green_on_red(f"Processing {pdf_dict_file}")

        with open(pdf_dict_file, "r", encoding="utf-8") as f:
            pdf_dict = json.load(f)

        raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
        raw_df.to_csv(save_path, index=False)

        # an empty frame may have no columns at all, so do not index into it
        filtered_df = raw_df if raw_df.empty else raw_df[raw_df["span_is_superscript"] == 1]
        if filtered_df.empty:
            print("No superscript span found!")
            continue

        filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])

        combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore

        final_df = filtered_grouped_df.size().reset_index(name="count")
        final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)

        print(final_df)

        final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))

        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
        # UTF-8 with BOM, and every field wrapped in double quotes
        final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

        # a 2x2 grid of charts
        fig, axs = plt.subplots(2, 2, figsize=(15, 10))

        # counts by span_font_name
        final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")

        # counts by span_font_size
        final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")

        # counts by span_font_color
        final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")

        # counts by span_font_name, span_font_size and span_font_color together
        grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
        grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")

        # adjust the layout
        plt.tight_layout()

        # show the charts
        # plt.show()

        # save the charts to a PNG file
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
        plt.savefig(save_path)

        # Close the figure instead of only clearing it: plt.subplots creates
        # a new figure per file, which would otherwise pile up in memory.
        plt.close(fig)


if __name__ == "__main__":
    main()
@@ -1,21 +0,0 @@
1
- from collections import Counter
2
-
3
- from magic_pdf.libs.language import detect_lang
4
-
5
def get_language_from_model(model_list: list):
    """Return the language detected most often across the pages of ``model_list``.

    For every page, the ``"text"`` of all ``layout_dets`` whose
    ``category_id`` is in the allow list (currently only ``15``) is
    concatenated and passed to ``detect_lang``; the most frequent
    per-page result wins (on a tie, the one seen first).

    Args:
        model_list: Per-page model output; each item has a ``"layout_dets"``
            list of dicts with ``"category_id"`` and, for allowed
            categories, ``"text"``.

    Returns:
        The most common page language, or ``""`` when ``model_list`` is
        empty (previously this raised ``ValueError`` from ``max()``).
    """
    allow_category_id_list = [15]
    language_lst = []
    for ocr_page_info in model_list:
        page_text = "".join(
            layout_det["text"]
            for layout_det in ocr_page_info["layout_dets"]
            if layout_det["category_id"] in allow_category_id_list
        )
        language_lst.append(detect_lang(page_text))

    if not language_lst:
        # no pages, hence nothing to vote on
        return ""

    # count how often each language occurs and pick the most frequent one
    count_dict = Counter(language_lst)
    return max(count_dict, key=count_dict.get)