magic-pdf 0.10.2__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,6 @@ from loguru import logger
5
5
  from magic_pdf.config.make_content_config import DropMode, MakeMode
6
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
7
  from magic_pdf.libs.commons import join_path
8
- from magic_pdf.libs.language import detect_lang
9
8
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
10
9
  from magic_pdf.para.para_split_v3 import ListLineTag
11
10
 
@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
30
29
  for page_info in pdf_info_dict:
31
30
  paras_of_layout = page_info.get('para_blocks')
32
31
  if not paras_of_layout:
32
+ markdown_with_para_and_pagination.append({
33
+ 'page_no':
34
+ page_no,
35
+ 'md_content':
36
+ '',
37
+ })
38
+ page_no += 1
33
39
  continue
34
40
  page_markdown = ocr_mk_markdown_with_para_core_v2(
35
41
  paras_of_layout, 'mm', img_buket_path)
@@ -136,14 +142,11 @@ def merge_para_with_text(para_block):
136
142
  para_text += ' \n'
137
143
 
138
144
  line_text = ''
139
- line_lang = ''
140
145
  for span in line['spans']:
141
146
  span_type = span['type']
142
147
  if span_type == ContentType.Text:
143
148
  line_text += span['content'].strip()
144
149
 
145
- if line_text != '':
146
- line_lang = detect_lang(line_text)
147
150
  for j, span in enumerate(line['spans']):
148
151
 
149
152
  span_type = span['type']
@@ -157,27 +160,18 @@ def merge_para_with_text(para_block):
157
160
 
158
161
  content = content.strip()
159
162
  if content != '':
160
- langs = ['zh', 'ja', 'ko']
161
- if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
162
- if span_type in [ContentType.Text, ContentType.InterlineEquation]:
163
- para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
164
- elif span_type == ContentType.InlineEquation:
165
- para_text += f' {content} '
166
- else:
167
- if span_type in [ContentType.Text, ContentType.InlineEquation]:
168
- # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
169
- if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
170
- para_text += content[:-1]
171
- elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
172
- para_text += content
173
- else: # 西方文本语境下 content间需要空格分隔
174
- para_text += f'{content} '
175
- elif span_type == ContentType.InterlineEquation:
176
- para_text += content
163
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
164
+ # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
165
+ if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
166
+ para_text += content[:-1]
167
+ else: # content间需要空格分隔
168
+ para_text += f'{content} '
169
+ elif span_type == ContentType.InterlineEquation:
170
+ para_text += content
177
171
  else:
178
172
  continue
179
173
  # 连写字符拆分
180
- para_text = __replace_ligatures(para_text)
174
+ # para_text = __replace_ligatures(para_text)
181
175
 
182
176
  return para_text
183
177
 
@@ -8,7 +8,7 @@ from loguru import logger
8
8
  from magic_pdf.config.drop_reason import DropReason
9
9
  from magic_pdf.libs.commons import get_top_percent_list, mymax
10
10
  from magic_pdf.libs.language import detect_lang
11
- from magic_pdf.libs.pdf_check import detect_invalid_chars
11
+ from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
12
12
 
13
13
  scan_max_page = 50
14
14
  junk_limit_min = 10
@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document):
323
323
 
324
324
  def check_invalid_chars(pdf_bytes):
325
325
  """乱码检测."""
326
- return detect_invalid_chars(pdf_bytes)
326
+ return detect_invalid_chars_by_pymupdf(pdf_bytes)
327
327
 
328
328
 
329
329
  def pdf_meta_scan(pdf_bytes: bytes):
@@ -1,9 +1,9 @@
1
- from io import BytesIO
2
- import re
3
1
  import fitz
4
2
  import numpy as np
5
3
  from loguru import logger
6
- from pdfminer.high_level import extract_text
4
+ # import re
5
+ # from io import BytesIO
6
+ # from pdfminer.high_level import extract_text
7
7
 
8
8
 
9
9
  def calculate_sample_count(total_page: int):
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
14
14
  return select_page_cnt
15
15
 
16
16
 
17
- def extract_pages(src_pdf_bytes: bytes):
17
+ def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
18
18
  pdf_docs = fitz.open("pdf", src_pdf_bytes)
19
19
  total_page = len(pdf_docs)
20
20
  if total_page == 0:
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
33
33
  return sample_docs
34
34
 
35
35
 
36
- def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
37
- """"
38
- 检测PDF中是否包含非法字符
36
+ # def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
37
+ # """"
38
+ # 检测PDF中是否包含非法字符
39
+ # """
40
+ # '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
41
+ # sample_docs = extract_pages(src_pdf_bytes)
42
+ # sample_pdf_bytes = sample_docs.tobytes()
43
+ # sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
44
+ # text = extract_text(sample_pdf_file_like_object)
45
+ # text = text.replace("\n", "")
46
+ # # logger.info(text)
47
+ # '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
48
+ # cid_pattern = re.compile(r'\(cid:\d+\)')
49
+ # matches = cid_pattern.findall(text)
50
+ # cid_count = len(matches)
51
+ # cid_len = sum(len(match) for match in matches)
52
+ # text_len = len(text)
53
+ # if text_len == 0:
54
+ # cid_chars_radio = 0
55
+ # else:
56
+ # cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
57
+ # logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
58
+ # '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
59
+ # if cid_chars_radio > 0.05:
60
+ # return False # 乱码文档
61
+ # else:
62
+ # return True # 正常文档
63
+
64
+
65
+ def count_replacement_characters(text: str) -> int:
66
+ """
67
+ 统计字符串中 0xfffd 字符的数量。
39
68
  """
40
- '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
69
+ return text.count('\ufffd')
70
+
71
+
72
+ def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
41
73
  sample_docs = extract_pages(src_pdf_bytes)
42
- sample_pdf_bytes = sample_docs.tobytes()
43
- sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
44
- text = extract_text(sample_pdf_file_like_object)
45
- text = text.replace("\n", "")
46
- # logger.info(text)
47
- '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
48
- cid_pattern = re.compile(r'\(cid:\d+\)')
49
- matches = cid_pattern.findall(text)
50
- cid_count = len(matches)
51
- cid_len = sum(len(match) for match in matches)
52
- text_len = len(text)
74
+ doc_text = ""
75
+ for page in sample_docs:
76
+ page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
77
+ doc_text += page_text
78
+ text_len = len(doc_text)
79
+ uffd_count = count_replacement_characters(doc_text)
53
80
  if text_len == 0:
54
- cid_chars_radio = 0
81
+ uffd_chars_radio = 0
55
82
  else:
56
- cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
57
- logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
58
- '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
59
- if cid_chars_radio > 0.05:
83
+ uffd_chars_radio = uffd_count / text_len
84
+ logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
85
+ '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
86
+ if uffd_chars_radio > 0.01:
60
87
  return False # 乱码文档
61
88
  else:
62
- return True # 正常文档
89
+ return True # 正常文档
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.10.2"
1
+ __version__ = "0.10.3"
@@ -18,11 +18,31 @@ def region_to_bbox(region):
18
18
 
19
19
 
20
20
  class CustomPaddleModel:
21
- def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
21
+ def __init__(self,
22
+ ocr: bool = False,
23
+ show_log: bool = False,
24
+ lang=None,
25
+ det_db_box_thresh=0.3,
26
+ use_dilation=True,
27
+ det_db_unclip_ratio=1.8
28
+ ):
22
29
  if lang is not None:
23
- self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
30
+ self.model = PPStructure(table=False,
31
+ ocr=True,
32
+ show_log=show_log,
33
+ lang=lang,
34
+ det_db_box_thresh=det_db_box_thresh,
35
+ use_dilation=use_dilation,
36
+ det_db_unclip_ratio=det_db_unclip_ratio,
37
+ )
24
38
  else:
25
- self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
39
+ self.model = PPStructure(table=False,
40
+ ocr=True,
41
+ show_log=show_log,
42
+ det_db_box_thresh=det_db_box_thresh,
43
+ use_dilation=use_dilation,
44
+ det_db_unclip_ratio=det_db_unclip_ratio,
45
+ )
26
46
 
27
47
  def __call__(self, img):
28
48
  try:
@@ -1,9 +1,55 @@
1
+ import cv2
1
2
  import numpy as np
2
3
  from loguru import logger
3
-
4
+ from io import BytesIO
5
+ from PIL import Image
6
+ import base64
4
7
  from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
5
8
  from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
6
9
 
10
+ from ppocr.utils.utility import check_and_read
11
+
12
+
13
+ def img_decode(content: bytes):
14
+ np_arr = np.frombuffer(content, dtype=np.uint8)
15
+ return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
16
+
17
+
18
+ def check_img(img):
19
+ if isinstance(img, bytes):
20
+ img = img_decode(img)
21
+ if isinstance(img, str):
22
+ image_file = img
23
+ img, flag_gif, flag_pdf = check_and_read(image_file)
24
+ if not flag_gif and not flag_pdf:
25
+ with open(image_file, 'rb') as f:
26
+ img_str = f.read()
27
+ img = img_decode(img_str)
28
+ if img is None:
29
+ try:
30
+ buf = BytesIO()
31
+ image = BytesIO(img_str)
32
+ im = Image.open(image)
33
+ rgb = im.convert('RGB')
34
+ rgb.save(buf, 'jpeg')
35
+ buf.seek(0)
36
+ image_bytes = buf.read()
37
+ data_base64 = str(base64.b64encode(image_bytes),
38
+ encoding="utf-8")
39
+ image_decode = base64.b64decode(data_base64)
40
+ img_array = np.frombuffer(image_decode, np.uint8)
41
+ img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
42
+ except:
43
+ logger.error("error in loading image:{}".format(image_file))
44
+ return None
45
+ if img is None:
46
+ logger.error("error in loading image:{}".format(image_file))
47
+ return None
48
+ if isinstance(img, np.ndarray) and len(img.shape) == 2:
49
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
50
+
51
+ return img
52
+
7
53
 
8
54
  def bbox_to_points(bbox):
9
55
  """ 将bbox格式转换为四个顶点的数组 """
@@ -1,15 +1,17 @@
1
1
  import copy
2
2
  import time
3
-
4
3
  import cv2
5
4
  import numpy as np
5
+
6
6
  from paddleocr import PaddleOCR
7
- from paddleocr.paddleocr import check_img, logger
8
- from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
9
- from paddleocr.tools.infer.predict_system import sorted_boxes
10
- from paddleocr.tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
7
+ from ppocr.utils.logging import get_logger
8
+ from ppocr.utils.utility import alpha_to_color, binarize_img
9
+ from tools.infer.predict_system import sorted_boxes
10
+ from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
11
+
12
+ from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
11
13
 
12
- from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes
14
+ logger = get_logger()
13
15
 
14
16
 
15
17
  class ModifiedPaddleOCR(PaddleOCR):
@@ -2,8 +2,8 @@ import os
2
2
 
3
3
  import cv2
4
4
  import numpy as np
5
- from paddleocr.ppstructure.table.predict_table import TableSystem
6
- from paddleocr.ppstructure.utility import init_args
5
+ from ppstructure.table.predict_table import TableSystem
6
+ from ppstructure.utility import init_args
7
7
  from PIL import Image
8
8
 
9
9
  from magic_pdf.config.constants import * # noqa: F403
@@ -1,7 +1,10 @@
1
1
  import copy
2
2
 
3
+ from loguru import logger
4
+
3
5
  from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
4
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
+ from magic_pdf.libs.language import detect_lang
5
8
 
6
9
  LINE_STOP_FLAG = (
7
10
  '.',
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
125
128
 
126
129
  # 添加所有文本,包括空行,保持与block['lines']长度一致
127
130
  lines_text_list.append(line_text)
131
+ block_text = ''.join(lines_text_list)
132
+ block_lang = detect_lang(block_text)
133
+ # logger.info(f"block_lang: {block_lang}")
128
134
 
129
135
  # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
130
136
  if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
136
142
  if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
137
143
  right_close_num += 1
138
144
  else:
139
- # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
140
- # block宽的阈值可以小些,block窄的阈值要大
141
-
142
- if block_weight_radio >= 0.5:
145
+ # 类中文没有超长单词的情况,可以用统一的阈值
146
+ if block_lang in ['zh', 'ja', 'ko']:
143
147
  closed_area = 0.26 * block_weight
144
148
  else:
145
- closed_area = 0.36 * block_weight
149
+ # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
150
+ # block宽的阈值可以小些,block窄的阈值要大
151
+ if block_weight_radio >= 0.5:
152
+ closed_area = 0.26 * block_weight
153
+ else:
154
+ closed_area = 0.36 * block_weight
146
155
  if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
147
156
  right_not_close_num += 1
148
157
 
@@ -30,22 +30,14 @@ try:
30
30
  torchtext.disable_torchtext_deprecation_warning()
31
31
  except ImportError:
32
32
  pass
33
- from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
34
33
 
34
+ from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
35
35
  from magic_pdf.para.para_split_v3 import para_split
36
-
37
- from magic_pdf.pre_proc.construct_page_dict import \
38
- ocr_construct_page_component_v2
36
+ from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
39
37
  from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
40
-
41
- from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
42
- ocr_prepare_bboxes_for_layout_split_v2
43
- from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
44
- fix_block_spans_v2,
45
- fix_discarded_block)
46
- from magic_pdf.pre_proc.ocr_span_list_modify import (
47
- get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
48
- remove_overlaps_min_spans)
38
+ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
39
+ from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
40
+ from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
49
41
 
50
42
 
51
43
  def __replace_STX_ETX(text_str: str):
@@ -65,10 +57,18 @@ def __replace_STX_ETX(text_str: str):
65
57
  return text_str
66
58
 
67
59
 
60
+ def __replace_0xfffd(text_str: str):
61
+ """Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
62
+ if text_str:
63
+ s = text_str.replace('\ufffd', " ")
64
+ return s
65
+ return text_str
66
+
68
67
  def chars_to_content(span):
69
68
  # 检查span中的char是否为空
70
69
  if len(span['chars']) == 0:
71
- span['content'] = ''
70
+ pass
71
+ # span['content'] = ''
72
72
  else:
73
73
  # 先给chars按char['bbox']的中心点的x坐标排序
74
74
  span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
@@ -83,22 +83,24 @@ def chars_to_content(span):
83
83
  if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
84
84
  content += ' '
85
85
  content += char['c']
86
- span['content'] = __replace_STX_ETX(content)
86
+
87
+ span['content'] = __replace_0xfffd(content)
87
88
 
88
89
  del span['chars']
89
90
 
90
91
 
91
92
  LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
93
+ LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
94
+
95
+
92
96
  def fill_char_in_spans(spans, all_chars):
93
97
 
98
+ # 简单从上到下排一下序
99
+ spans = sorted(spans, key=lambda x: x['bbox'][1])
100
+
94
101
  for char in all_chars:
95
102
  for span in spans:
96
- # 判断char是否属于LINE_STOP_FLAG
97
- if char['c'] in LINE_STOP_FLAG:
98
- char_is_line_stop_flag = True
99
- else:
100
- char_is_line_stop_flag = False
101
- if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
103
+ if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
102
104
  span['chars'].append(char)
103
105
  break
104
106
 
@@ -106,13 +108,16 @@ def fill_char_in_spans(spans, all_chars):
106
108
 
107
109
  for span in spans:
108
110
  chars_to_content(span)
109
- if len(span['content']) == 0:
111
+ # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
112
+ if len(span['content']) * span['height'] < span['width'] * 0.5:
113
+ # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
110
114
  empty_spans.append(span)
115
+ del span['height'], span['width']
111
116
  return empty_spans
112
117
 
113
118
 
114
119
  # 使用鲁棒性更强的中心点坐标判断
115
- def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
120
+ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
116
121
  char_center_x = (char_bbox[0] + char_bbox[2]) / 2
117
122
  char_center_y = (char_bbox[1] + char_bbox[3]) / 2
118
123
  span_center_y = (span_bbox[1] + span_bbox[3]) / 2
@@ -121,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
121
126
  if (
122
127
  span_bbox[0] < char_center_x < span_bbox[2]
123
128
  and span_bbox[1] < char_center_y < span_bbox[3]
124
- and abs(char_center_y - span_center_y) < span_height / 4 # 字符的中轴和span的中轴高度差不能超过1/4span高度
129
+ and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
125
130
  ):
126
131
  return True
127
132
  else:
128
133
  # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
129
134
  # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
130
- if char_is_line_stop_flag:
135
+ if char in LINE_STOP_FLAG:
131
136
  if (
132
137
  (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
133
138
  and char_center_x > span_bbox[0]
134
139
  and span_bbox[1] < char_center_y < span_bbox[3]
135
- and abs(char_center_y - span_center_y) < span_height / 4
140
+ and abs(char_center_y - span_center_y) < span_height * span_height_radio
141
+ ):
142
+ return True
143
+ elif char in LINE_START_FLAG:
144
+ if (
145
+ span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
146
+ and char_center_x < span_bbox[2]
147
+ and span_bbox[1] < char_center_y < span_bbox[3]
148
+ and abs(char_center_y - span_center_y) < span_height * span_height_radio
136
149
  ):
137
150
  return True
138
151
  else:
@@ -141,12 +154,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
141
154
 
142
155
  def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
143
156
 
144
- text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
157
+ text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
145
158
 
146
- # @todo: 拿到char之后把倾斜角度较大的先删一遍
147
159
  all_pymu_chars = []
148
160
  for block in text_blocks_raw:
149
161
  for line in block['lines']:
162
+ cosine, sine = line['dir']
163
+ if abs (cosine) < 0.9 or abs(sine) > 0.1:
164
+ continue
150
165
  for span in line['spans']:
151
166
  all_pymu_chars.extend(span['chars'])
152
167
 
@@ -157,6 +172,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
157
172
  continue
158
173
  span_height = span['bbox'][3] - span['bbox'][1]
159
174
  span['height'] = span_height
175
+ span['width'] = span['bbox'][2] - span['bbox'][0]
160
176
  span_height_list.append(span_height)
161
177
  if len(span_height_list) == 0:
162
178
  return spans
@@ -174,15 +190,13 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
174
190
  if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
175
191
  continue
176
192
  if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
177
- if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3:
193
+ if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
178
194
  vertical_spans.append(span)
179
195
  elif block in all_bboxes:
180
196
  useful_spans.append(span)
181
197
  else:
182
198
  unuseful_spans.append(span)
183
199
 
184
- del span['height']
185
-
186
200
  break
187
201
 
188
202
  """垂直的span框直接用pymu的line进行填充"""
@@ -232,6 +246,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
232
246
  if ocr_res and len(ocr_res) > 0:
233
247
  if len(ocr_res[0]) > 0:
234
248
  ocr_text, ocr_score = ocr_res[0][0]
249
+ # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
235
250
  if ocr_score > 0.5 and len(ocr_text) > 0:
236
251
  span['content'] = ocr_text
237
252
  span['score'] = ocr_score
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.10.2
3
+ Version: 0.10.3
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -12,7 +12,6 @@ Requires-Dist: click>=8.1.7
12
12
  Requires-Dist: fast-langdetect==0.2.0
13
13
  Requires-Dist: loguru>=0.6.0
14
14
  Requires-Dist: numpy<2.0.0,>=1.21.6
15
- Requires-Dist: pdfminer.six==20231228
16
15
  Requires-Dist: pydantic<2.8.0,>=2.7.2
17
16
  Requires-Dist: PyMuPDF>=1.24.9
18
17
  Requires-Dist: scikit-learn>=1.0.2
@@ -1,7 +1,7 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
3
3
  magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
4
- magic_pdf/pdf_parse_union_core_v2.py,sha256=-4yJwcSMcGwQKJhmK_MbBMa-fexzkqeD1CQHWpzGC3I,29920
4
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=6Apku7-pW450HbHNTtbVLDyroRSKlQ57w9f0ScOaZv4,30879
5
5
  magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
6
6
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
@@ -27,10 +27,10 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
27
27
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
28
28
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
29
29
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
30
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=hwcHTEx1tbIlM9ukmPBOAyH0G6rmbOTu87nVtZ1gE6k,12354
31
31
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
33
- magic_pdf/filter/pdf_meta_scan.py,sha256=3I-t3PSrQUZ3PZAPl_NGoEhxLmIUE9Fpc0jueEXP7Xw,17381
33
+ magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
34
34
  magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
@@ -50,16 +50,16 @@ magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,10
50
50
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
51
51
  magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
52
52
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
53
- magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
53
+ magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
54
54
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
55
55
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
56
- magic_pdf/libs/version.py,sha256=A_AARqtxTOj_AQTpjpgOxNx-UOBio5wYFfZ2mrdMKfs,23
56
+ magic_pdf/libs/version.py,sha256=0C8KcY1dzs3hdkAre06v0NCQ0Uxcqv6g9a93bRcVLW0,23
57
57
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
58
58
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
59
59
  magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
60
60
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
61
61
  magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
62
- magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
62
+ magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
63
63
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
64
  magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
65
65
  magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
@@ -94,8 +94,8 @@ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1
94
94
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
95
  magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
96
96
  magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=bya-KGr5OPCmE8KC8K5Pp6OlGigCmUmB9xpm59nExaM,9056
98
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=Deoth86bltlLz1Y-1jpyhLCwCaRfq-KKI0tiFyKKqA8,7268
97
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=rwKphio9SZgiNgqASWOBWZIf6PPi3kvgQO_qJLc_diE,10726
98
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=d__xICejA_Q-Cz4cfajwroDjfA0dT4TL18XAFYYc4OQ,7265
99
99
  magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
100
100
  magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
101
101
  magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -108,9 +108,9 @@ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx
108
108
  magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
109
  magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
110
110
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=AdH3UGu4BEoII0uFjPKUf61W7HmG4fDlWgR1xxMeFlE,2775
111
+ magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
112
112
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
- magic_pdf/para/para_split_v3.py,sha256=x6nfjyt38W-wdxXjo6Chd18eiqLzmhbTNyGHhBQcEHs,16459
113
+ magic_pdf/para/para_split_v3.py,sha256=UOQe0HUVX7FAlMbJp1OkGfdM7JECWeqscv3s8Hge7ps,16922
114
114
  magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
115
115
  magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
116
116
  magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
@@ -139,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
139
139
  magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
140
140
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
141
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
142
- magic_pdf-0.10.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-0.10.2.dist-info/METADATA,sha256=5pYglDeTXZaIsMRAHSfNl57Yq3gPXdcexNxt1zdvmu4,37030
144
- magic_pdf-0.10.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
- magic_pdf-0.10.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-0.10.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-0.10.2.dist-info/RECORD,,
142
+ magic_pdf-0.10.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-0.10.3.dist-info/METADATA,sha256=R86XDaSfj1tcu3etkvhQfg3FSoARv8mKW2KpwjsdqWs,36992
144
+ magic_pdf-0.10.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-0.10.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-0.10.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-0.10.3.dist-info/RECORD,,