magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,60 +0,0 @@
1
- from loguru import logger
2
-
3
- from magic_pdf.config.drop_reason import DropReason
4
- from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
5
-
6
-
7
def __is_pseudo_single_column(page_info) -> tuple:
    """Decide whether a page is a "pseudo single column" page.

    The check only runs when ``get_columns_cnt_of_layout`` reports exactly one
    column for the page's layout tree. Each text block is then inspected: a
    line pair "satisfies" the condition when the next line's bbox is
    horizontally disjoint from the current line's bbox. A block with more than
    20 lines in which at least half of the lines satisfy the condition marks
    the page as pseudo single column.

    Args:
        page_info (dict): Page information; the keys '_layout_tree' and
            'preproc_blocks' are read.

    Returns:
        tuple: ``(True, extra_info)`` for the first offending block, where
        ``extra_info`` is a string holding the line counts; ``(False, None)``
        otherwise.
    """
    layout_tree = page_info['_layout_tree']
    if get_columns_cnt_of_layout(layout_tree) != 1:
        return False, None

    for text_block in page_info['preproc_blocks']:
        lines = text_block['lines']
        num_lines = len(lines)

        # Count consecutive line pairs that do not overlap horizontally.
        num_satisfying_lines = 0
        for current_line, next_line in zip(lines, lines[1:]):
            current_bbox = current_line['bbox']
            next_bbox = next_line['bbox']
            if next_bbox[0] > current_bbox[2] or next_bbox[2] < current_bbox[0]:
                num_satisfying_lines += 1

        # Only long blocks are judged; short ones are too noisy.
        if num_lines <= 20:
            continue

        ratio = num_satisfying_lines / num_lines
        if ratio >= 0.5:
            extra_info = f'{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}'
            block_text = []
            for line in lines:
                if line['spans']:
                    for span in line['spans']:
                        block_text.append(span['text'])
            logger.warning(f'pseudo_single_column block_text: {block_text}')
            return True, extra_info

    return False, None
52
-
53
-
54
def pdf_post_filter(page_info) -> tuple:
    """Post-filter a parsed page.

    Returns:
        tuple: ``(True, None)`` when the page is acceptable; otherwise
        ``(False, info)`` where ``info`` is a dict carrying the drop flag,
        the drop reason and the extra details of the failed check.
    """
    is_pseudo, details = __is_pseudo_single_column(page_info)
    if not is_pseudo:
        return True, None

    drop_info = {
        '_need_drop': True,
        '_drop_reason': DropReason.PSEUDO_SINGLE_COLUMN,
        'extra_info': details,
    }
    return False, drop_info
@@ -1,153 +0,0 @@
1
- from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
2
- import collections # 统计库
3
-
4
-
5
-
6
def is_below(bbox1, bbox2):
    """Return True when bbox1 starts strictly below the bottom edge of bbox2.

    Both boxes are indexed as ``[x0, y0, x1, y1]``; only ``bbox1[1]`` (top)
    and ``bbox2[3]`` (bottom) are compared.
    """
    top_of_first = bbox1[1]
    bottom_of_second = bbox2[3]
    return top_of_first > bottom_of_second
9
-
10
-
11
def merge_bboxes(bboxes):
    """Return the smallest ``[x0, y0, x1, y1]`` box enclosing every input box.

    Raises ValueError when ``bboxes`` is empty.
    """
    corners = [(box[0], box[1], box[2], box[3]) for box in bboxes]
    lefts, tops, rights, bottoms = zip(*corners)
    return [min(lefts), min(tops), max(rights), max(bottoms)]
18
-
19
-
20
def merge_footnote_blocks(page_info, main_text_font):
    """Compute, per layout, one merged footnote box and store it on the page.

    For every layout in ``page_info['layout_bboxes']`` the candidate footnote
    boxes (``page_info['footnote_bboxes_tmp']``) and text blocks
    (``page_info['preproc_blocks']``) inside that layout are collected. A
    candidate is rejected when a "main text" block lies below it. The
    top-most surviving candidate is merged with every sized text block below
    it, and the result is appended to ``page_info['merged_bboxes']``.

    Args:
        page_info (dict): Page data; mutated in place ('merged_bboxes' is
            reset to a new list on every call).
        main_text_font: Font name regarded as the body-text font.

    Returns:
        dict: The same ``page_info`` object.
    """
    page_info['merged_bboxes'] = []
    for layout in page_info['layout_bboxes']:
        # Footnote candidates located inside this layout.
        footnote_bboxes = [
            bbox for bbox in page_info['footnote_bboxes_tmp']
            if _is_in(bbox, layout['layout_bbox'])
        ]
        if not footnote_bboxes:
            continue

        preproc_blocks = [
            block for block in page_info['preproc_blocks']
            if _is_in(block['bbox'], layout['layout_bbox'])
        ]
        if not preproc_blocks:
            continue

        line_sizes = []   # average span size of every line in the layout
        block_sizes = []  # (block, average line size, dominant font) per block
        for block in preproc_blocks:
            block_line_sizes = []
            block_fonts = collections.Counter()
            for line in block['lines']:
                span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
                if span_sizes:
                    line_size = sum(span_sizes) / len(span_sizes)
                    line_sizes.append(line_size)
                    block_line_sizes.append(line_size)
                # Weight every font by the number of characters it renders.
                for span in line['spans']:
                    if 'font' in span and len(span['text']) > 0:
                        block_fonts[span['font']] += len(span['text'])
            if block_line_sizes:
                block_size = sum(block_line_sizes) / len(block_line_sizes)
                # A block may carry sizes but no font info; do not crash on it.
                top_font = block_fonts.most_common(1)
                block_font = top_font[0][0] if top_font else None
                block_sizes.append((block, block_size, block_font))

        # Without any sized line there is no main text size to compare with.
        if not line_sizes:
            continue
        main_text_size = collections.Counter(line_sizes).most_common(1)[0][0]

        # A footnote candidate with a main-text block below it is a false
        # positive. A block counts as main text when at least two of these
        # hold: size >= main_text_size, at least 5 lines, font is the main font.
        need_merge_bboxes = []
        for footnote_bbox in footnote_bboxes:
            has_main_text_below = any(
                is_below(block['bbox'], footnote_bbox)
                and sum([size >= main_text_size,
                         len(block['lines']) >= 5,
                         block_font is not None and block_font == main_text_font]) >= 2
                for block, size, block_font in block_sizes
            )
            if not has_main_text_below:
                need_merge_bboxes.append(footnote_bbox)
        if not need_merge_bboxes:
            continue

        # Merge the top-most confirmed footnote with every sized block below it.
        # NOTE(review): no size filter is applied to the blocks below.
        top_footnote_bbox = min(need_merge_bboxes, key=lambda bbox: bbox[1])
        bboxes_below = [
            block['bbox'] for block, _size, _font in block_sizes
            if is_below(block['bbox'], top_footnote_bbox)
        ]
        merged_bbox = merge_bboxes([top_footnote_bbox] + bboxes_below)
        page_info['merged_bboxes'].append(merged_bbox)
    return page_info
99
-
100
-
101
def remove_footnote_blocks(page_info):
    """Drop text and image blocks covered by the page's merged footnote boxes.

    When ``page_info['merged_bboxes']`` is non-empty, the matching text blocks
    and images are moved out of 'preproc_blocks' / 'images' and appended to
    'droped_text_block' / 'droped_image_block'. The temporary keys
    'merged_bboxes' and 'footnote_bboxes_tmp' are removed afterwards.

    Args:
        page_info (dict): Page data; mutated in place.

    Returns:
        dict: The same ``page_info`` object.
    """
    merged_bboxes = page_info.get('merged_bboxes')
    if merged_bboxes:
        # Remove footnotes from the text blocks.
        remain_text_blocks, removed_footnote_text_blocks = remove_footnote_text(
            page_info['preproc_blocks'], merged_bboxes)
        # Remove footnotes from the images.
        image_blocks, removed_footnote_imgs_blocks = remove_footnote_image(
            page_info['images'], merged_bboxes)
        # Write the results back.
        page_info['preproc_blocks'] = remain_text_blocks
        page_info['images'] = image_blocks
        page_info['droped_text_block'].extend(removed_footnote_text_blocks)
        page_info['droped_image_block'].extend(removed_footnote_imgs_blocks)

    # pop() instead of del: a missing temporary key must not raise KeyError,
    # matching the tolerant .get() used for the check above.
    page_info.pop('merged_bboxes', None)
    page_info.pop('footnote_bboxes_tmp', None)
    return page_info
116
-
117
-
118
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """Split the page's text blocks into kept blocks and footnote blocks.

    :param raw_text_block: list of text blocks of the current page (each with
        a 'bbox'); modified in place.
    :param footnote_bboxes: list of footnote bboxes of the current page.
    :return: ``(raw_text_block, footnote_text_blocks)``; every removed block
        gets ``block['tag'] = 'footnote'``.
    """
    def touches_footnote(block):
        # TODO: a stricter version would test at line level.
        return any([_is_in_or_part_overlap(block['bbox'], footnote_bbox)
                    for footnote_bbox in footnote_bboxes])

    footnote_text_blocks = [block for block in raw_text_block if touches_footnote(block)]
    for block in footnote_text_blocks:
        block['tag'] = 'footnote'

    # Removal happens after the scan; removing while iterating would skip items.
    for block in footnote_text_blocks:
        raw_text_block.remove(block)

    return raw_text_block, footnote_text_blocks
138
-
139
-
140
def remove_footnote_image(image_blocks, footnote_bboxes):
    """Split the page's image blocks into kept images and footnote images.

    :param image_blocks: list of image blocks of the current page (each with
        a 'bbox'); modified in place.
    :param footnote_bboxes: list of footnote bboxes of the current page.
    :return: ``(image_blocks, footnote_imgs_blocks)``.
    """
    footnote_imgs_blocks = [
        image_block for image_block in image_blocks
        if any([_is_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes])
    ]

    # Remove only after the scan is complete.
    for doomed in footnote_imgs_blocks:
        image_blocks.remove(doomed)

    return image_blocks, footnote_imgs_blocks
@@ -1,161 +0,0 @@
1
- """
2
- 去掉正文的引文引用marker
3
- https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
4
- """
5
- import re
6
- # from magic_pdf.libs.nlp_utils import NLPModels
7
-
8
-
9
- # __NLP_MODEL = NLPModels()
10
-
11
def check_1(spans, cur_span_i):
    """Return True when the character right before the span is a full stop or comma.

    Such a position indicates a citation superscript rather than a formula.

    Args:
        spans (list): Spans of one line; each has a 'chars' list of dicts
            with the character under key 'c'.
        cur_span_i (int): Index of the span under inspection.

    Returns:
        bool: False for the first span of the line or when the previous span
        holds no characters; otherwise whether the previous span's last
        character is one of ``。 ， . ,``.
    """
    if cur_span_i == 0:
        return False  # nothing precedes it, so not a superscript

    pre_chars = spans[cur_span_i - 1]['chars']
    if not pre_chars:
        return False  # an empty span gives no evidence; avoid IndexError

    # The original list named ',' twice; the full-width comma is the evident
    # counterpart of the full-width full stop.
    return pre_chars[-1]['c'] in ('。', '，', '.', ',')
21
-
22
-
23
- # def check_2(spans, cur_span_i):
24
- # """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
25
- # pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
26
- #
27
- # if cur_span_i==0 and len(spans)>1:
28
- # next_span = spans[cur_span_i+1]
29
- # next_txt = "".join([c['c'] for c in next_span['chars']])
30
- # result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
31
- # if result in ["PERSON", "GPE", "ORG"]:
32
- # return True
33
- #
34
- # if re.findall(pattern, next_txt):
35
- # return True
36
- #
37
- # return False # 不是角标
38
- # elif cur_span_i==0 and len(spans)==1: # 角标占用了整行?谨慎删除
39
- # return False
40
- #
41
- # # 如果这个span是最后一个span,
42
- # if cur_span_i==len(spans)-1:
43
- # pre_span = spans[cur_span_i-1]
44
- # pre_txt = "".join([c['c'] for c in pre_span['chars']])
45
- # pre_word = pre_txt.split(' ')[-1]
46
- # result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
47
- # if result in ["PERSON", "GPE", "ORG"]:
48
- # return True
49
- #
50
- # if re.findall(pattern, pre_txt):
51
- # return True
52
- #
53
- # return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
54
- # else: # 既不是第一个span,也不是最后一个span,那么此时检查一下这个角标距离前后哪个单词更近就属于谁的角标
55
- # pre_span = spans[cur_span_i-1]
56
- # next_span = spans[cur_span_i+1]
57
- # cur_span = spans[cur_span_i]
58
- # # 找到前一个和后一个span里的距离最近的单词
59
- # pre_distance = 10000 # 一个很大的数
60
- # next_distance = 10000 # 一个很大的数
61
- # for c in pre_span['chars'][::-1]:
62
- # if c['c'].isalpha():
63
- # pre_distance = cur_span['bbox'][0] - c['bbox'][2]
64
- # break
65
- # for c in next_span['chars']:
66
- # if c['c'].isalpha():
67
- # next_distance = c['bbox'][0] - cur_span['bbox'][2]
68
- # break
69
- #
70
- # if pre_distance<next_distance:
71
- # belong_to_span = pre_span
72
- # else:
73
- # belong_to_span = next_span
74
- #
75
- # txt = "".join([c['c'] for c in belong_to_span['chars']])
76
- # pre_word = txt.split(' ')[-1]
77
- # result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
78
- # if result in ["PERSON", "GPE", "ORG"]:
79
- # return True
80
- #
81
- # if re.findall(pattern, txt):
82
- # return True
83
- #
84
- # return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
85
-
86
-
87
def check_3(spans, cur_span_i):
    """Return True when the span's text looks like a citation marker.

    Two shapes are accepted:
    * the text contains a digit together with one of ``[ ] * ,``
      (e.g. ``[2-3]``, ``[22]``, ``2,3,4``);
    * the text starts with ``digits-digits``, ``letter-letter`` or
      ``letter,letter`` (e.g. ``2-3``, ``a-b``).
    """
    text = ''.join(ch['c'] for ch in spans[cur_span_i]['chars']).strip()

    contains_symbol = any(symbol in text for symbol in '[]*,')
    if contains_symbol and any(ch.isdigit() for ch in text):
        return True

    # re.match anchors at the start only, exactly like trying each pattern in turn.
    range_like = re.match(r'\d+-\d+|[a-zA-Z]-[a-zA-Z]|[a-zA-Z],[a-zA-Z]', text)
    return range_like is not None
105
-
106
-
107
def remove_citation_marker(with_char_text_blcoks):
    """Strip citation superscript spans from every line of the given blocks.

    A span is a candidate when its font is at least 1 unit smaller than the
    largest font of the line and it either sits clearly above the middle of
    the tallest span, or sits above it while being close to the smallest font
    of the line. Candidates confirmed by ``check_1`` or ``check_3`` are
    removed from ``line['spans']`` and ``line['text']`` is rebuilt from the
    remaining characters. Blocks are modified in place and returned.
    """
    for blk in with_char_text_blcoks:
        for line in blk['lines']:
            spans = line['spans']
            # A marker never occupies a line alone, so short lines are skipped.
            if len(spans) < 2:
                continue

            # The tallest span (first one on ties) is the vertical reference.
            tallest_bbox = max((s['bbox'] for s in spans), key=lambda b: b[3] - b[1])
            sizes = [s['size'] for s in spans]
            min_font_sz = min([10000] + sizes)  # smallest font in the line
            max_font_sz = max([0] + sizes)      # largest font in the line
            base_span_mid_y = (tallest_bbox[3] + tallest_bbox[1]) / 2

            span_to_del = []
            for idx, span in enumerate(spans):
                top, bottom = span['bbox'][1], span['bbox'][3]
                span_hi = bottom - top
                span_mid_y = (bottom + top) / 2
                span_font_sz = span['size']

                # Body-sized text is never a marker.
                if max_font_sz - span_font_sz < 1:
                    continue
                # Guard the divisions below.
                if span_hi == 0 or min_font_sz == 0:
                    continue

                lift = base_span_mid_y - span_mid_y
                clearly_raised = lift / span_hi > 0.2
                raised_and_smallest = lift > 0 and abs(span_font_sz - min_font_sz) / min_font_sz < 0.1
                if not (clearly_raised or raised_and_smallest):
                    continue

                # Confirm by the preceding punctuation or by the span's own shape.
                if check_1(spans, idx) or check_3(spans, idx):
                    span_to_del.append(span)

            if span_to_del:
                for span in span_to_del:
                    spans.remove(span)
                line['text'] = ''.join([c['c'] for s in line['spans'] for c in s['chars']])

    return with_char_text_blcoks
@@ -1,134 +0,0 @@
1
- from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则
2
- from magic_pdf.libs.commons import fitz # pyMuPDF库
3
-
4
-
5
def __solve_contain_bboxs(all_bbox_list: list):
    """Remove the smaller box of every contained or heavily overlapping pair.

    Every pair of entries is compared on its first four values
    (``[:4]``, the bbox):
    * if one box is inside the other (``_is_in``), the inner one is dropped;
    * otherwise, if ``calculate_overlap_area_2_minbox_area_ratio`` exceeds
      0.7, the box with the smaller area is dropped.

    Args:
        all_bbox_list (list): Entries whose first four items are a bbox;
            modified in place.

    Returns:
        list: The same list object with the dropped entries removed.
    """
    dump_list = []
    count = len(all_bbox_list)
    for i in range(count):
        for j in range(i + 1, count):
            bbox1 = all_bbox_list[i][:4]
            bbox2 = all_bbox_list[j][:4]

            if _is_in(bbox1, bbox2):
                dump_list.append(all_bbox_list[i])
            elif _is_in(bbox2, bbox1):
                dump_list.append(all_bbox_list[j])
            else:
                ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
                if ratio > 0.7:
                    s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
                    s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
                    # Drop the smaller box. The original dropped entry i in
                    # both branches, discarding the larger box when s1 >= s2.
                    if s2 > s1:
                        dump_list.append(all_bbox_list[i])
                    else:
                        dump_list.append(all_bbox_list[j])

    # Remove every occurrence of every dumped entry, keeping the list object.
    all_bbox_list[:] = [item for item in all_bbox_list if item not in dump_list]
    return all_bbox_list
37
-
38
-
39
def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """Extract inline and isolated equation boxes for one page from model output.

    :param page_ID: index of the page within the PDF. Kept for interface
        compatibility; it no longer influences the result.
    :param page: the fitz page; it is rendered at 72 dpi to obtain the pixel
        size used for scaling, and its cropbox/rect give the offset.
    :param json_from_DocXchain_obj: model output for the page; the keys
        'page_info' ('width', 'height') and 'layout_dets' (entries with
        'poly', 'category_id', 'score' and optionally 'latex') are read.
    :return: ``(inline_boxes, isolated_boxes)``; every entry is
        ``(L, U, R, D, latex_text)``, both lists are sorted by ``(U, L)`` and
        the inline list has contained/overlapping duplicates removed.
    """
    DPI = 72  # use this resolution
    pix = page.get_pixmap(dpi=DPI)
    page_width = int(pix.w)
    page_height = int(pix.h)

    xf_json = json_from_DocXchain_obj
    # Ratio between model-image coordinates and the 72 dpi page coordinates.
    LR_scaleRatio = xf_json['page_info']['width'] / page_width
    UD_scaleRatio = xf_json['page_info']['height'] / page_height

    equationEmbedding_from_DocXChain_bboxs = []
    equationIsolated_from_DocXChain_bboxs = []
    for xf in xf_json['layout_dets']:
        L = xf['poly'][0] / LR_scaleRatio
        U = xf['poly'][1] / UD_scaleRatio
        R = xf['poly'][2] / LR_scaleRatio
        D = xf['poly'][5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)

        # category 13: inline (embedding) equation, 14: isolated equation.
        if xf['category_id'] == 13 and xf['score'] >= 0.3:
            latex_text = xf.get("latex", "EmptyInlineEquationResult")
            equationEmbedding_from_DocXChain_bboxs.append((L, U, R, D, latex_text))
        if xf['category_id'] == 14 and xf['score'] >= 0.3:
            latex_text = xf.get("latex", "EmptyInterlineEquationResult")
            equationIsolated_from_DocXChain_bboxs.append((L, U, R, D, latex_text))

    # Reading order: top to bottom, then left to right.
    equationEmbedding_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    equationIsolated_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))

    # Shift the boxes by the cropbox origin when it differs from the page
    # rect origin. The original loops only rebound their loop variable, so
    # the shift was silently lost; rebuild the lists instead.
    cropbox = page.cropbox
    if cropbox[0] != page.rect[0] or cropbox[1] != page.rect[1]:
        dx, dy = cropbox[0], cropbox[1]
        equationEmbedding_from_DocXChain_bboxs = [
            (box[0] + dx, box[1] + dy, box[2] + dx, box[3] + dy, box[4])
            for box in equationEmbedding_from_DocXChain_bboxs
        ]
        equationIsolated_from_DocXChain_bboxs = [
            (box[0] + dx, box[1] + dy, box[2] + dx, box[3] + dy, box[4])
            for box in equationIsolated_from_DocXChain_bboxs
        ]

    deduped_embedding_eq_bboxes = __solve_contain_bboxs(equationEmbedding_from_DocXChain_bboxs)
    return deduped_embedding_eq_bboxes, equationIsolated_from_DocXChain_bboxs
@@ -1,64 +0,0 @@
1
- from magic_pdf.libs.commons import fitz # pyMuPDF库
2
- from magic_pdf.libs.coordinate_transform import get_scale_ratio
3
-
4
-
5
def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """Collect the footer boxes of one page from the layout-model output.

    :param page_ID: index of the page within the PDF (does not affect the result).
    :param page: the fitz page, passed to ``get_scale_ratio``.
    :param json_from_DocXchain_obj: model output for the page; 'layout_dets'
        entries with 'poly', 'category_id' and 'score' are read.
    :return: list of ``(L, U, R, D)`` tuples for detections with category 6
        (footer) and score >= 0.3, sorted by ``(U, L)``.
    """
    layout_result = json_from_DocXchain_obj
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(layout_result, page)

    footer_boxes = []
    for det in layout_result['layout_dets']:
        poly = det['poly']
        # Scale the two opposite corners into page coordinates.
        x_a = poly[0] / horizontal_scale_ratio
        y_a = poly[1] / vertical_scale_ratio
        x_b = poly[2] / horizontal_scale_ratio
        y_b = poly[5] / vertical_scale_ratio
        box = (min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b))
        # Category 6 is 'footer'; low-confidence detections are ignored.
        if det['category_id'] == 6 and det['score'] >= 0.3:
            footer_boxes.append(box)

    # Top-to-bottom, then left-to-right.
    return sorted(footer_boxes, key=lambda box: (box[1], box[0]))
64
-