magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,78 +0,0 @@
1
- from magic_pdf.config.drop_reason import DropReason
2
- from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
3
- from magic_pdf.libs.commons import fitz
4
-
5
-
6
def __area(box):
    """Return the area of an axis-aligned box.

    Args:
        box: Indexable coordinates in the order ``(x0, y0, x1, y1)``.

    Returns:
        ``(x1 - x0) * (y1 - y0)``. The result is not clamped, so a box with
        swapped corners is not normalised first.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
8
-
9
-
10
def __is_contain_color_background_rect(
    page: fitz.Page, text_blocks, image_bboxes
) -> bool:
    """Check whether text sits on the page's largest colored background rectangle.

    Args:
        page: Page whose vector drawings are read via ``get_cdrawings()``.
        text_blocks: Iterable of dicts exposing a ``'bbox'`` entry.
        image_bboxes: Bounding boxes of images. Filled rectangles touching
            any of them are ignored, because they may be color patches that
            belong to an SVG image.

    Returns:
        True if the largest non-white filled rectangle is wider than 20% of
        the page, taller than 10% of the page, and fully contains at least
        one text block (both boxes truncated to integer coordinates);
        otherwise False.
    """
    p_width, p_height = page.rect.width, page.rect.height

    # Collect every filled, non-white, reasonably large rectangle that does
    # not touch an image.
    color_bg_rect = []
    for drawing in page.get_cdrawings():
        fill = drawing.get('fill')
        if not fill:  # no fill at all, i.e. transparent
            continue
        # Compare tuple with tuple: a list never equals a tuple, so a
        # list-vs-tuple check would let white fills slip through.
        if tuple(int(channel) for channel in fill[:3]) == (1, 1, 1):
            continue
        rect = drawing['rect']
        if __area(rect) < 10 * 10:  # ignore tiny rectangles
            continue
        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
            continue
        color_bg_rect.append(rect)

    if not color_bg_rect:
        return False

    # Only the largest background rectangle is examined.
    max_rect = max(color_bg_rect, key=__area)
    wide_enough = max_rect[2] - max_rect[0] > 0.2 * p_width
    tall_enough = max_rect[3] - max_rect[1] > 0.1 * p_height
    if not (wide_enough and tall_enough):
        return False

    # TODO (carried over): decide by "more than 3 lines or 50 characters of
    # text" instead of "any text block inside".
    max_rect_int = (
        int(max_rect[0]),
        int(max_rect[1]),
        int(max_rect[2]),
        int(max_rect[3]),
    )
    for text_block in text_blocks:
        box = text_block['bbox']
        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        if _is_in(box_int, max_rect_int):
            return True

    return False
59
-
60
-
61
def __is_table_overlap_text_block(text_blocks, table_bbox):
    """Tell whether ``table_bbox`` covers or partly overlaps any text block.

    Args:
        text_blocks: Iterable of dicts exposing a ``'bbox'`` entry.
        table_bbox: Bounding box of the table.

    Returns:
        True as soon as ``_is_in_or_part_overlap(table_bbox, bbox)`` holds
        for one text block, otherwise False. (Marked TODO upstream.)
    """
    return any(
        _is_in_or_part_overlap(table_bbox, text_block['bbox'])
        for text_block in text_blocks
    )
68
-
69
-
70
def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
    """Decide whether a page meets the requirements for further processing.

    Args:
        page: Page to inspect.
        text_blocks: Text blocks of the page (dicts with a ``'bbox'`` entry).
        table_bboxes: Accepted for interface compatibility; not used here.
        image_bboxes: Bounding boxes of the images on the page.

    Returns:
        ``(True, None)`` when the page is acceptable, otherwise
        ``(False, info)`` where ``info`` flags the page for dropping because
        text sits on a colored background box.
    """
    has_colored_text_box = __is_contain_color_background_rect(
        page, text_blocks, image_bboxes
    )
    if not has_colored_text_box:
        return True, None

    drop_info = {
        '_need_drop': True,
        '_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
    }
    return False, drop_info
File without changes
@@ -1,101 +0,0 @@
1
- from loguru import logger
2
-
3
- from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
4
- from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
5
- calculate_overlap_area_2_minbox_area_ratio)
6
-
7
-
8
def __area(box):
    """Compute width times height of a box given as ``(x0, y0, x1, y1)``.

    Args:
        box: Indexable with at least four numeric entries.

    Returns:
        The product of the horizontal and vertical extents.
    """
    x_extent, y_extent = box[2] - box[0], box[3] - box[1]
    return x_extent * y_extent
10
-
11
-
12
def rectangle_position_determination(rect, p_width):
    """Tell whether a rectangle lies near the vertical center line of the page.

    Args:
        rect (list): Rectangle coordinates as ``[x1, y1, x2, y2]``.
        p_width (int): Page width.

    Returns:
        bool: True if the rectangle straddles the center line, or if the
        distance from its nearer vertical edge to the center line is less
        than 20% of the page width; otherwise False.
    """
    center_x = p_width / 2

    # A rectangle crossing the center line is near it by definition.
    if rect[0] < center_x < rect[2]:
        return True

    # Otherwise measure the gap on the side facing the center line.
    gap = rect[0] - center_x if rect[0] > center_x else center_x - rect[2]
    return gap < p_width * 0.2
36
-
37
-
38
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Move text blocks that sit on a colored header strip out of the remaining blocks.

    A drawing from ``page.get_cdrawings()`` counts as a colored strip when it
    is filled with a non-white color, has an area above 100, is near the
    vertical center line of the page (``rectangle_position_determination``),
    ends within the top 30% of the page, and is more than four times as wide
    as it is tall.

    A text block is removed when it lies inside a strip, or overlaps one with
    an overlap-to-smaller-box area ratio above 0.6.

    Args:
        remain_text_blocks (list): Remaining text blocks (dicts with a
            ``'bbox'`` entry). Modified in place.
        page (Page): Page object providing ``rect`` and ``get_cdrawings()``.

    Returns:
        tuple: ``(remain_text_blocks, colored_strip_textblocks)``. The first
        item is the same list object that was passed in. Each removed block
        is tagged with ``COLOR_BG_HEADER_TXT_BLOCK`` and listed exactly once,
        even if it matches several strips.
    """
    colored_strip_textblocks = []
    if not remain_text_blocks:
        return remain_text_blocks, colored_strip_textblocks

    p_width, p_height = page.rect.width, page.rect.height

    colored_strip_bg_rect = []
    for drawing in page.get_cdrawings():
        fill = drawing.get('fill')
        rect = drawing['rect']
        if (
            fill  # skip transparent drawings
            and fill != (1.0, 1.0, 1.0)  # skip white fills
            and __area(rect) > 100  # skip tiny rectangles
            and rectangle_position_determination(rect, p_width)
            and rect[3] < p_height * 0.3  # bottom edge within the top 30%
            and (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4  # aspect ratio > 4
        ):
            colored_strip_bg_rect.append(rect)

    for text_block in remain_text_blocks:
        text_bbox = text_block['bbox']
        for strip_bbox in colored_strip_bg_rect:
            if _is_in(text_bbox, strip_bbox) or (
                _is_in_or_part_overlap(text_bbox, strip_bbox)
                and calculate_overlap_area_2_minbox_area_ratio(text_bbox, strip_bbox) > 0.6
            ):
                logger.info(
                    f'remove_colored_strip_textblock: {text_bbox}, {strip_bbox}'
                )
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)
                # One matching strip is enough; stopping here keeps the
                # block from being reported once per overlapping strip.
                break

    if colored_strip_textblocks:
        # Drop by identity so an equal-looking but distinct block survives,
        # and keep the caller's list object by assigning through a slice.
        removed_ids = {id(blk) for blk in colored_strip_textblocks}
        remain_text_blocks[:] = [
            blk for blk in remain_text_blocks if id(blk) not in removed_ids
        ]

    return remain_text_blocks, colored_strip_textblocks
@@ -1,114 +0,0 @@
1
- import re
2
-
3
- from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
4
- from magic_pdf.libs.boxbase import _is_in_or_part_overlap
5
-
6
-
7
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Remove header, footer and page-number content from a single page.

    Deletion happens at span level. Lines left without spans are dropped,
    and text blocks left without lines are tagged ``CONTENT_IN_FOOT_OR_HEADER``
    and moved to the removed list.

    Args:
        text_raw_blocks: List of text block dicts (``'bbox'``, ``'lines'`` ->
            ``'spans'`` -> ``'bbox'``/``'text'``). Modified in place.
        image_bboxes: Image bounding boxes on the page.
        table_bboxes: Table bounding boxes on the page.
        header_bboxs: Header boxes ``(x0, y0, x1, y1)``; may be empty.
        footer_bboxs: Footer boxes ``(x0, y0, x1, y1)``; may be empty.
        page_no_bboxs: Page-number boxes; may be empty.
        page_w: Page width.
        page_h: Page height.

    Returns:
        A 6-tuple ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
        ``text_block_remain`` is the ``text_raw_blocks`` list itself.
    """
    # Body limits: bottom edge of the lowest header box and top edge of the
    # highest footer box; the full page when nothing was detected.
    header_y0 = max(b[3] for b in header_bboxs) if header_bboxs else 0
    footer_y0 = min(b[1] for b in footer_bboxs) if footer_bboxs else page_h

    if page_no_bboxs:
        half_h = page_h / 2
        top_part = [b for b in page_no_bboxs if b[3] < half_h]
        btn_part = [b for b in page_no_bboxs if b[1] > half_h]

        top_max_y0 = max(b[1] for b in top_part) if top_part else 0
        btn_min_y1 = min(b[3] for b in btn_part) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    def _in_header_or_footer(bbox):
        """Tell whether a span box belongs to the header or footer band."""
        return (
            bbox[3] < header_y0
            or _is_in_or_part_overlap(bbox, header)
            or _is_in_or_part_overlap(bbox, footer)
        )

    # The bands are known; now delete what falls into them. Every span of
    # every line is examined (the collectors must not be reset per item),
    # and the lists are rebuilt in place instead of removed-from while
    # being iterated.
    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            line['spans'][:] = [
                span for span in line['spans'] if not _in_header_or_footer(span['bbox'])
            ]
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
        if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)

    # A page number is often so small that it slightly overlaps the content
    # boundary and ends up in the body text, so page numbers are additionally
    # handled at span granularity.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = PAGE_NO
                            # A block consisting of just this span goes away entirely.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                page_no_block_2_remove.append(block)
    elif text_raw_blocks:
        # No page-number boxes given: guess from the bottom-most block. It is
        # treated as a page number when it has exactly one line with one span
        # whose text is non-blank, contains a digit and contains no letters.
        # NOTE: this sorts the caller's list in place (largest y0 first).
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1 and len(last_block['lines'][0]['spans']) == 1:
            last_span = last_block['lines'][0]['spans'][0]
            text = last_span['text']
            if text.strip() and not re.search('[a-zA-Z]', text) and re.search('[0-9]', text):
                last_span['tag'] = PAGE_NO
                page_no_block_2_remove.append(last_block)

    # Merge both removal lists without reporting a block twice.
    seen_ids = {id(blk) for blk in text_block_to_remove}
    for blk in page_no_block_2_remove:
        if id(blk) not in seen_ids:
            seen_ids.add(id(blk))
            text_block_to_remove.append(blk)

    # Remove by identity, keeping the caller's list object.
    text_raw_blocks[:] = [blk for blk in text_raw_blocks if id(blk) not in seen_ids]
    text_block_remain = text_raw_blocks

    def _split_by_content(bboxes):
        """Partition boxes into (touching the content area, outside it)."""
        keep, drop = [], []
        for bbox in bboxes:
            target = keep if _is_in_or_part_overlap(bbox, content_boundry) else drop
            target.append(bbox)
        return keep, drop

    image_bbox_remain, image_bbox_to_remove = _split_by_content(image_bboxes)
    table_bbox_remain, table_bbox_to_remove = _split_by_content(table_bboxes)

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
@@ -1,236 +0,0 @@
1
- import math
2
- import re
3
-
4
- from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
5
- VERTICAL_TEXT)
6
- from magic_pdf.libs.boxbase import is_vbox_on_side
7
-
8
-
9
def detect_non_horizontal_texts(result_dict):
    """Flag watermarks and vertical margin notes in the document.

    Blocks are grouped by ``(bbox, text)``. A group whose direction is
    slanted (between 5 and 85 degrees, exclusive) is a watermark candidate;
    a group whose direction is roughly vertical (between 85 and 105 degrees,
    exclusive) is a margin-note candidate. A candidate is confirmed when it
    occurs more than ``len(result_dict) // 2`` times. Repetition at the same
    position separates these from headers and footers, which can change from
    page to page.

    Only keys starting with ``'page_'`` are treated as pages and only page
    entries starting with ``'block_'`` are treated as blocks; everything
    else is left untouched.

    Parameters
    ----------
    result_dict : dict
        The result dictionary. Blocks need ``'bbox'`` and ``'text'``;
        ``'dir'`` is optional and holds an ``(x, y)`` direction.

    Returns
    -------
    result_dict : dict
        The same dictionary, with ``'is_watermark'`` and
        ``'is_vertical_margin_note'`` (0 or 1) set on every block.
    """

    def _iter_blocks():
        """Yield the data dict of every block on every page."""
        for page_id, page_content in result_dict.items():
            if not page_id.startswith('page_'):
                continue
            for block_id, block_data in page_content.items():
                if block_id.startswith('block_'):
                    yield block_data

    def _key(block_data):
        # tuple() keeps the key hashable when bbox is stored as a list.
        return (tuple(block_data['bbox']), block_data['text'])

    potential_watermarks = {}
    potential_margin_notes = {}

    for block_data in _iter_blocks():
        if 'dir' not in block_data:
            continue
        coordinates_text = _key(block_data)
        direction = block_data['dir']
        angle = abs(math.degrees(math.atan2(direction[1], direction[0])))

        if 5 < angle < 85:  # slanted text: watermark candidate
            potential_watermarks[coordinates_text] = (
                potential_watermarks.get(coordinates_text, 0) + 1
            )
        if 85 < angle < 105:  # roughly vertical text: margin-note candidate
            potential_margin_notes[coordinates_text] = (
                potential_margin_notes.get(coordinates_text, 0) + 1
            )

    # NOTE: the threshold counts every top-level key, not only 'page_' keys.
    threshold = len(result_dict) // 2
    watermarks = {k for k, v in potential_watermarks.items() if v > threshold}
    margin_notes = {k for k, v in potential_margin_notes.items() if v > threshold}

    # Second pass uses the same page/block filter as the first one, so
    # non-block page entries are never indexed like blocks.
    for block_data in _iter_blocks():
        coordinates_text = _key(block_data)
        block_data['is_watermark'] = 1 if coordinates_text in watermarks else 0
        block_data['is_vertical_margin_note'] = (
            1 if coordinates_text in margin_notes else 0
        )

    return result_dict
93
-
94
-
95
- """
96
- 1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
97
- 2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
98
- """
99
-
100
-
101
def __is_a_word(sentence):
    """Tell whether ``sentence`` is a single character or a very short token.

    Returns True for exactly one character in the range U+4E00..U+9FA5, or
    for a run of one or two ASCII letters/digits. Everything else, including
    punctuation and the empty string, gives False.
    """
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    is_alnum_run = re.fullmatch(r'[a-zA-Z0-9]+', sentence)
    return bool(is_alnum_run and len(sentence) <= 2)
110
-
111
-
112
def __get_text_color(num):
    """Split a packed integer color into its ``(red, green, blue)`` bytes.

    Red is taken from bits 16-23, green from bits 8-15 and blue from bits
    0-7; higher bits are ignored.
    """
    red, green, blue = ((num >> shift) & 255 for shift in (16, 8, 0))
    return red, green, blue
118
-
119
-
120
def __is_empty_side_box(text_block):
    """Tell whether a text block carries no visible content.

    A span counts as visible when its text is non-blank after stripping and
    its color is not pure white ``(255, 255, 255)``. The block is empty when
    no span is visible, which includes a block without lines or spans.
    """
    for line in text_block['lines']:
        for span in line['spans']:
            rgb = __get_text_color(span['color'])
            has_text = len(span['text'].strip()) > 0
            if has_text and rgb != (255, 255, 255):
                return False
    return True
130
-
131
-
132
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical and rotated text blocks found at the page sides.

    Only blocks accepted by ``is_vbox_on_side(bbox, page_width, page_height,
    0.2)`` are considered (presumably: boxes near the left/right page edge;
    confirm against ``boxbase``). Such a block is removed when

    * it has more than one line, every line has exactly one span, every
      span is a single "word" per ``__is_a_word`` and all spans share the
      same integer x0 (tagged ``VERTICAL_TEXT``), or
    * at least one of its lines has a direction other than ``(1, 0)``
      (tagged ``ROTATE_TEXT``).

    Returns:
        ``(pymu_text_block, removed_text_block)``; the first item is the
        input list with the removed blocks taken out in place.
    """
    removed_text_block = []

    # Block format reference: test/assets/papre/pymu_textblocks.json
    for block in pymu_text_block:
        lines = block['lines']
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue

        first_span_is_word = [
            __is_a_word(line['spans'][0]['text'])
            for line in lines
            if len(line['spans']) > 0
        ]
        one_word_per_line = (
            all(first_span_is_word)
            and len(lines) > 1
            and all([len(line['spans']) == 1 for line in lines])
        )
        if one_word_per_line:
            # Vertically stacked text: every first span starts at the same
            # integer x0, and there are at least two of them.
            first_x0s = [
                int(line['spans'][0]['bbox'][0])
                for line in lines
                if len(line['spans']) > 0
            ]
            if len(set(first_x0s)) == 1 and len(first_x0s) > 1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        # A single non-horizontal line is enough to drop the whole block.
        if any(line['dir'] != (1, 0) for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
199
-
200
-
201
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left and right body-text limits from side blocks.

    Args:
        rotate_bbox: Iterable of dicts with a ``'bbox'`` entry
            ``(x0, y0, x1, y1)``.
        page_width: Page width.
        page_height: Unused; kept for interface compatibility.

    Returns:
        ``(left, right)``. ``left`` is one past the largest right edge among
        boxes that end left of the page middle (0 if none). ``right`` is one
        before the smallest left edge among all other boxes (``page_width``
        if none).
    """
    middle = page_width / 2
    left_x, right_x = 0, page_width
    for item in rotate_bbox:
        bbox = item['bbox']
        ends_left_of_middle = bbox[2] < middle
        if ends_left_of_middle:
            left_x = max(left_x, bbox[2])
            continue
        right_x = min(right_x, bbox[0])

    return left_x + 1, right_x - 1
213
-
214
-
215
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blank text blocks found at the page sides.

    A block is removed when ``is_vbox_on_side(bbox, page_width, page_height,
    0.2)`` accepts it (presumably: boxes near the left/right page edge;
    confirm against ``boxbase``) and ``__is_empty_side_box`` reports no
    visible content. Removed blocks are tagged ``EMPTY_SIDE_BLOCK``.

    Returns:
        ``(pymu_text_block, removed_text_block)``; the first item is the
        input list with the removed blocks taken out in place.
    """
    # Block format reference: test/assets/papre/pymu_textblocks.json
    removed_text_block = []
    for block in pymu_text_block:
        on_side = is_vbox_on_side(block['bbox'], page_width, page_height, 0.2)
        if on_side and __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)

    for blank_block in removed_text_block:
        pymu_text_block.remove(blank_block)

    return pymu_text_block, removed_text_block