magic-pdf 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
magic_pdf/cli/magicpdf.py CHANGED
@@ -31,7 +31,6 @@ from magic_pdf.libs.version import __version__
 
  from magic_pdf.libs.MakeContentConfig import DropMode
  from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
- from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
  from magic_pdf.pipe.UNIPipe import UNIPipe
  from magic_pdf.pipe.OCRPipe import OCRPipe
  from magic_pdf.pipe.TXTPipe import TXTPipe
@@ -101,18 +100,34 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
  # [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
 
  md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
+ '''write markdown'''
  md_writer.write(
      content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
  )
+ '''write middle_json'''
  md_writer.write(
      content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
-     path=f"{pdf_file_name}.json",
+     path=f"{pdf_file_name}_middle.json",
      mode=AbsReaderWriter.MODE_TXT,
  )
-
+ '''write model_json'''
+ md_writer.write(
+     content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
+     path=f"{pdf_file_name}_model.json",
+     mode=AbsReaderWriter.MODE_TXT,
+ )
+ '''write the source pdf'''
+ md_writer.write(
+     content=pdf_bytes,
+     path=f"{pdf_file_name}_origin.pdf",
+     mode=AbsReaderWriter.MODE_BIN,
+ )
  content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
+ '''write content_list'''
  md_writer.write(
-     str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
+     content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
+     path=f"{pdf_file_name}_content_list.json",
+     mode=AbsReaderWriter.MODE_TXT
  )
 
 
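Taken together, _do_parse in 0.5.6 writes five artifacts per input instead of the three files 0.5.4 produced. A minimal sketch of the expected output names for an input called demo.pdf (the base name is illustrative; actual paths depend on how md_writer is configured):

    pdf_file_name = "demo"  # illustrative base name
    expected_outputs = [
        f"{pdf_file_name}.md",                 # markdown
        f"{pdf_file_name}_middle.json",        # intermediate parse data (was {name}.json in 0.5.4)
        f"{pdf_file_name}_model.json",         # layout-model output, new in 0.5.6
        f"{pdf_file_name}_origin.pdf",         # copy of the source PDF, new in 0.5.6
        f"{pdf_file_name}_content_list.json",  # uniform content list (was {name}.txt in 0.5.4)
    ]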
magic_pdf/dict2md/ocr_mkcontent.py CHANGED
@@ -144,10 +144,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
  def merge_para_with_text(para_block):
      para_text = ''
      for line in para_block['lines']:
+         line_text = ""
+         line_lang = ""
+         for span in line['spans']:
+             span_type = span['type']
+             if span_type == ContentType.Text:
+                 line_text += span['content'].strip()
+         if line_text != "":
+             line_lang = detect_lang(line_text)
          for span in line['spans']:
              span_type = span['type']
              content = ''
-             language = ''
              if span_type == ContentType.Text:
                  content = span['content']
                  language = detect_lang(content)
@@ -161,7 +168,7 @@ def merge_para_with_text(para_block):
                  content = f"\n$$\n{span['content']}\n$$\n"
 
              if content != '':
-                 if 'zh' in language:
+                 if 'zh' in line_lang:  # some documents put a single character in each span; language detection on one character is unreliable, so use the whole line's text
                      para_text += content  # in a Chinese context, no space is needed between content pieces
                  else:
                      para_text += content + ' '  # in an English context, content pieces need to be separated by spaces
magic_pdf/filter/pdf_classify_by_type.py CHANGED
@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
  from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
 
  TEXT_LEN_THRESHOLD = 100
- AVG_TEXT_LEN_THRESHOLD = 200
+ AVG_TEXT_LEN_THRESHOLD = 100
  TEXT_LEN_SAMPLE_RATIO = 0.1  # sample 10% of the pages for text-length statistics
 
 
@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
  # if the width qualifies, check whether the images can be stitched vertically
  if full_width:
      # vertical stitching has two preconditions: the left and right edges may each shift by at most max_offset, and the gap between the first image's bottom edge and the second image's top edge may be at most max_gap
-     close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
+     close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
+             last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
 
  # if the height qualifies, check whether the images can be stitched horizontally
  if full_height:
      # horizontal stitching has two preconditions: the top and bottom edges may each shift by at most max_offset, and the gap between the first image's right edge and the second image's left edge may be at most max_gap
-     close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
+     close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
+             last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
 
  # Check if the image can be merged with the last image
  if (full_width and close1) or (full_height and close2):
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
  # first count how many times each objid appears
  objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
  # then drop the ones that appear more than 10 times
- if total_page >= scan_max_page:  # the new meta_scan only scans the first scan_max_page pages; if the page count exceeds scan_max_page, treat total_page as scan_max_page
+ if total_page >= scan_max_page:  # the new meta_scan only scans the first scan_max_page pages; if the page count exceeds scan_max_page, treat total_page as scan_max_page
      total_page = scan_max_page
 
-
  repeat_threshold = 2  # set the bad_image threshold to 2
  # repeat_threshold = min(2, total_page)  # when total_page is 1, repeat_threshold becomes 1, which misclassifies every img as bad_img
  bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
  # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]):  # the pages holding these transparent images have text above the threshold
  #     return True
 
- img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list]  # filter out images that appear repeatedly
-
+ img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
+                img_sz_list]  # filter out images that appear repeatedly
 
  # some scanned PDFs split one page image into many pieces; stitch them back together before measuring
  img_sz_list = merge_images(img_sz_list, page_width, page_height)
 
  # compute the largest image area on each page, then its ratio to the page area
- max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
+ max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
+                            img_sz_list]
  page_area = page_width * page_height
  max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
  max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
 
- if len(max_image_area_per_page) >= 0.5 * total_page:  # threshold lowered from 0.8 to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
+ if len(max_image_area_per_page) >= 0.5 * total_page:  # threshold lowered from 0.8 to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
      # this only holds once the repeatedly appearing images have been removed; those are hidden transparent layers whose ids are all identical
      return False
  else:
      return True
 
 
-
  def classify_by_text_len(text_len_list: list, total_page: int):
      """
      Randomly sample 10% of the pages; if that is fewer than 5 pages, take all pages.
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
  is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
  return is_text_pdf
 
+
  def classify_by_avg_words(text_len_list: list):
      """
      Supplementary rule: if the average character count per page is below AVG_TEXT_LEN_THRESHOLD, it is not a text PDF
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):
 
  return is_text_pdf
 
+
  def classify_by_img_num(img_sz_list: list, img_num_list: list):
      """
      Supplementary rule: one kind of scanned PDF embeds every scanned page on every page; those images get deduplicated during metascan,
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
  # the number of non-empty elements in img_sz_list is at most 1, the first 80% of the elements are all equal, and the maximum is at least junk_limit_min
  if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
 
-     # take max and min to check whether all values in the list are equal
-     # min_imgs = min(img_num_list)
-     # max_imgs = max(img_num_list)
-     #
-     # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
+     # take max and min to check whether all values in the list are equal
+     # min_imgs = min(img_num_list)
+     # max_imgs = max(img_num_list)
+     #
+     # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
      return False  # if this condition holds, it is definitely not a text PDF
  else:
      return True  # if these three conditions are not met, it may be a text PDF; let the other rules decide
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
  else:
      return False  # text layout unknown; assume it is not a text PDF by default
 
+
  def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
      """
      Determine whether a page is made up of narrow strips; there are two conditions:
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
      Returns:
          bool: True if the proportion of pages meeting the conditions is below 0.5, otherwise False
      """
+
      def is_narrow_strip(img):
          x0, y0, x1, y1, _ = img
          width, height = x1 - x0, y1 - y0
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
  return narrow_strip_pages_ratio < 0.5
 
 
- def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
+ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
+              text_layout_list: list, invalid_chars: bool):
      """
      Image and page dimensions here are in pts
      :param total_page:
@@ -316,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
      'by_avg_words': classify_by_avg_words(text_len_list),
      'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
      'by_text_layout': classify_by_text_layout(text_layout_list),
-     'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
+     'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
+     'by_invalid_chars': invalid_chars,
      }
 
  if all(results.values()):
@@ -324,7 +331,12 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
  elif not any(results.values()):
      return False, results
  else:
-     logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr)  # this case makes it easy to spot unusual PDFs and tune the classification rules for them
+     logger.warning(
+         f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+         f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
+         f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+         f" by_invalid_chars: {results['by_invalid_chars']}",
+         file=sys.stderr)  # this case makes it easy to spot unusual PDFs and tune the classification rules for them
  return False, results
 
 
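The classify() change above adds the pre-computed garbled-text check as a seventh vote. A minimal sketch of the voting scheme it implements, with illustrative boolean values rather than real classifier output:

    results = {
        'by_image_area': True,
        'by_text_len': True,
        'by_avg_words': True,
        'by_img_num': True,
        'by_text_layout': True,
        'by_img_narrow_strips': True,
        'by_invalid_chars': False,  # new in 0.5.6; False means garbled text was detected
    }
    if all(results.values()):
        is_text_pdf = True    # every rule agrees it is a text PDF
    elif not any(results.values()):
        is_text_pdf = False   # every rule agrees it is not
    else:
        is_text_pdf = False   # mixed votes: log a warning and fall back to False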
magic_pdf/filter/pdf_meta_scan.py CHANGED
@@ -12,12 +12,13 @@ from collections import Counter
 
  from magic_pdf.libs.drop_reason import DropReason
  from magic_pdf.libs.language import detect_lang
+ from magic_pdf.libs.pdf_check import detect_invalid_chars
 
  scan_max_page = 50
  junk_limit_min = 10
 
 
- def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
+ def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
      max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
                                 result]
      page_area = int(page_width_pts) * int(page_height_pts)
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
      max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
      return max_image_area_per_page
 
+
  def process_image(page, junk_img_bojids=[]):
-     page_result = []# stores the quadruples of the multiple images on each page
+     page_result = []  # stores the quadruples of the multiple images on each page
      items = page.get_images()
      dedup = set()
      for img in items:
          # this returns the size at which the image is actually displayed on the page; returns an array, the first part of each element is
-         img_bojid = img[0]# globally unique within the pdf file; if the image appears repeatedly it is probably junk, e.g. a watermark or header/footer
-         if img_bojid in junk_img_bojids:# skip junk images
+         img_bojid = img[0]  # globally unique within the pdf file; if the image appears repeatedly it is probably junk, e.g. a watermark or header/footer
+         if img_bojid in junk_img_bojids:  # skip junk images
              continue
          recs = page.get_image_rects(img, transform=True)
          if recs:
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
          dedup.add((x0, y0, x1, y1, img_bojid))
          page_result.append([x0, y0, x1, y1, img_bojid])
      return page_result
+
+
  def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
      """
      Return the image quadruples for each page; a page may contain multiple images.
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
      img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
      # find the img_bojid values that appear more than half of len(doc) times
 
-     junk_limit = max(len(doc)*0.5, junk_limit_min)# exempt documents with few pages
+     junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with few pages
 
      junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
 
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
          result.append(page_result)
      for item in result:
          if not any(item):  # if any page has no images, it is a text PDF; still check whether it is a special text PDF
-             if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# if it is a special text PDF, clear the junk list and break
+             if max(imgs_len_list) == min(imgs_len_list) and max(
+                     imgs_len_list) >= junk_limit_min:  # if it is a special text PDF, clear the junk list and break
                  junk_img_bojids = []
-             else:# not a special text PDF, just an ordinary one that contains junk images; keep the junk list
+             else:  # not a special text PDF, just an ordinary one that contains junk images; keep the junk list
                  pass
              break_loop = True
              break
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
      # check whether the first 80% of the elements are all equal
      if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
 
-         # # if the first 10 pages all have images, decide whether to clear the junk list based on whether the per-page image counts are equal
-         # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
+         # # if the first 10 pages all have images, decide whether to clear the junk list based on whether the per-page image counts are equal
+         # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
 
          # the first 10 pages all have images with equal counts; check the image-to-page area ratio to decide whether to clear the junk list
          max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
          if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not all of the first 10 pages are large images, so it may be a text PDF; clear the junk image list
              junk_img_bojids = []
-         else:# the first 10 pages all have images, 80% of them are large, and the per-page counts are equal and high, so this is scanned type 1; keep the junk list
+         else:  # the first 10 pages all have images, 80% of them are large, and the per-page counts are equal and high, so this is scanned type 1; keep the junk list
              pass
-     else:# the per-page image counts differ; clear the junk list and scan the images of the first 50 pages in full
+     else:  # the per-page image counts differ; clear the junk list and scan the images of the first 50 pages in full
          junk_img_bojids = []
 
      # now start collecting image info for the first 50 pages
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
      median_width = page_width_list[len(page_width_list) // 2]
      median_height = page_height_list[len(page_height_list) // 2]
 
-
      return median_width, median_height
 
 
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
 
      return text_len_lst
 
+
  def get_pdf_text_layout_per_page(doc: fitz.Document):
      """
      Based on each page's text layout, decide whether the page layout is horizontal, vertical, or unknown.
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
          # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
      return text_layout_list
 
+
  '''custom exception raised for PDFs that have too many SVGs on a single page'''
+
+
  class PageSvgsTooManyError(Exception):
      def __init__(self, message="Page SVGs are too many"):
          self.message = message
          super().__init__(self.message)
+
+
  def get_svgs_per_page(doc: fitz.Document):
      svgs_len_list = []
      for page_id, page in enumerate(doc):
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
          # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
      return svgs_len_list
 
+
  def get_imgs_per_page(doc: fitz.Document):
      imgs_len_list = []
      for page_id, page in enumerate(doc):
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
      return language
 
 
+ def check_invalid_chars(pdf_bytes):
+     """
+     Garbled-text detection
+     """
+     return detect_invalid_chars(pdf_bytes)
+
+
  def pdf_meta_scan(pdf_bytes: bytes):
      """
      :param s3_pdf_path:
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
      # logger.info(f"text_layout_per_page: {text_layout_per_page}")
      text_language = get_language(doc)
      # logger.info(f"text_language: {text_language}")
-
+     invalid_chars = check_invalid_chars(pdf_bytes)
+     # logger.info(f"invalid_chars: {invalid_chars}")
 
      # finally emit one json record
      res = {
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
          # "svgs_per_page": svgs_per_page,
          "imgs_per_page": imgs_per_page,  # added: per-page img count list
          "junk_img_bojids": junk_img_bojids,  # added: bojid list of junk images
+         "invalid_chars": invalid_chars,
          "metadata": doc.metadata
      }
      # logger.info(json.dumps(res, ensure_ascii=False))
@@ -365,4 +385,4 @@ if __name__ == '__main__':
      # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
      # doc = fitz.open("pdf", file_content)
      # text_layout_lst = get_pdf_text_layout_per_page(doc)
-     # print(text_layout_lst)
+     # print(text_layout_lst)
magic_pdf/libs/pdf_check.py ADDED
@@ -0,0 +1,59 @@
+ from io import BytesIO
+ import re
+ import fitz
+ import numpy as np
+ from loguru import logger
+ from pdfminer.high_level import extract_text
+
+
+ def calculate_sample_count(total_page: int, sample_ratio=0.1):
+     """
+     Compute the number of pages to sample from the total page count and the sampling ratio.
+     """
+     select_page_cnt = int(total_page * sample_ratio)
+     if select_page_cnt < 5:
+         select_page_cnt = min(10, total_page)
+     elif select_page_cnt > 10:
+         select_page_cnt = 10
+     return select_page_cnt
+
+
+ def extract_pages(src_pdf_bytes: bytes):
+     pdf_docs = fitz.open("pdf", src_pdf_bytes)
+     total_page = len(pdf_docs)
+     if total_page == 0:
+         # if the PDF has no pages, return an empty document directly
+         logger.warning("PDF is empty, return empty document")
+         return fitz.Document()
+     select_page_cnt = calculate_sample_count(total_page)
+
+     page_num = np.random.choice(total_page, select_page_cnt, replace=False)
+     sample_docs = fitz.Document()
+     try:
+         for index in page_num:
+             sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
+     except Exception as e:
+         logger.exception(e)
+     return sample_docs
+
+
+ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
+     """
+     Detect whether the PDF contains invalid (garbled) characters
+     """
+     sample_docs = extract_pages(src_pdf_bytes)
+     sample_pdf_bytes = sample_docs.tobytes()
+     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
+     text = extract_text(sample_pdf_file_like_object)
+     # logger.info(text)
+     '''text that pdfminer extracts from garbled PDFs shows the pattern (cid:xxx)'''
+     cid_pattern = re.compile(r'\(cid:\d+\)')
+     matches = cid_pattern.findall(text)
+     cid_count = len(matches)
+     text_len = len(text)
+     logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
+     if cid_count > 10:
+         return False  # garbled document
+     else:
+         return True  # normal document
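A minimal usage sketch of the new check; the input path is illustrative. detect_invalid_chars samples up to 10 pages, extracts their text with pdfminer, and treats more than 10 "(cid:xxx)" matches as a garbled text layer:

    from magic_pdf.libs.pdf_check import detect_invalid_chars

    with open("example.pdf", "rb") as f:  # illustrative path
        pdf_bytes = f.read()

    if detect_invalid_chars(pdf_bytes):
        print("text layer looks usable")         # True is returned for normal documents
    else:
        print("garbled text layer, prefer OCR")  # False is returned for garbled documents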
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.5.4"
+ __version__ = "0.5.6"
magic_pdf/model/doc_analyze_by_custom_model.py ADDED
@@ -0,0 +1,61 @@
+ import fitz
+ import cv2
+ from PIL import Image
+ import numpy as np
+
+ from magic_pdf.model.model_list import MODEL
+ from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
+
+
+ def dict_compare(d1, d2):
+     return d1.items() == d2.items()
+
+
+ def remove_duplicates_dicts(lst):
+     unique_dicts = []
+     for dict_item in lst:
+         if not any(
+             dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
+         ):
+             unique_dicts.append(dict_item)
+     return unique_dicts
+
+
+ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
+     images = []
+     with fitz.open("pdf", pdf_bytes) as doc:
+         for index in range(0, doc.page_count):
+             page = doc[index]
+             mat = fitz.Matrix(dpi / 72, dpi / 72)
+             pm = page.get_pixmap(matrix=mat, alpha=False)
+
+             # if width or height > 2000 pixels, don't enlarge the image
+             # if pm.width > 2000 or pm.height > 2000:
+             #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+             img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+             img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+             img_dict = {"img": img, "width": pm.width, "height": pm.height}
+             images.append(img_dict)
+     return images
+
+
+ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
+     images = load_images_from_pdf(pdf_bytes)
+     custom_model = None
+     if model == MODEL.Paddle:
+         custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
+     else:
+         pass
+     model_json = []
+     for index, img_dict in enumerate(images):
+         img = img_dict["img"]
+         page_width = img_dict["width"]
+         page_height = img_dict["height"]
+         result = custom_model(img)
+         page_info = {"page_no": index, "height": page_height, "width": page_width}
+         page_dict = {"layout_dets": result, "page_info": page_info}
+
+         model_json.append(page_dict)
+
+     return model_json
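A short sketch of driving the new entry point directly; the input path is illustrative. Each element of the returned list pairs the per-page layout detections with the page geometry at the rendered DPI:

    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    with open("example.pdf", "rb") as f:  # illustrative path
        pdf_bytes = f.read()

    model_json = doc_analyze(pdf_bytes, ocr=True, show_log=False)
    for page in model_json:
        info = page["page_info"]
        print(info["page_no"], info["width"], info["height"], len(page["layout_dets"]))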
magic_pdf/model/model_list.py ADDED
@@ -0,0 +1,2 @@
+ class MODEL:
+     Paddle = "pp_structure_v2"
magic_pdf/model/pp_structure_v2.py ADDED
@@ -0,0 +1,75 @@
+ import random
+
+ from loguru import logger
+ from paddleocr import PPStructure
+
+
+ def region_to_bbox(region):
+     x0 = region[0][0]
+     y0 = region[0][1]
+     x1 = region[2][0]
+     y1 = region[2][1]
+     return [x0, y0, x1, y1]
+
+
+ class CustomPaddleModel:
+     def __init__(self, ocr: bool = False, show_log: bool = False):
+         self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
+
+     def __call__(self, img):
+         result = self.model(img)
+         spans = []
+         for line in result:
+             line.pop("img")
+             """
+             Map paddle output types to category numbers:
+             title: 0            # title
+             text: 1             # text
+             header: 2           # abandon
+             footer: 2           # abandon
+             reference: 1        # text or abandon
+             equation: 8         # interline equation (block)
+             equation: 14        # interline equation (text)
+             figure: 3           # figure
+             figure_caption: 4   # figure caption
+             table: 5            # table
+             table_caption: 6    # table caption
+             """
+             if line["type"] == "title":
+                 line["category_id"] = 0
+             elif line["type"] in ["text", "reference"]:
+                 line["category_id"] = 1
+             elif line["type"] == "figure":
+                 line["category_id"] = 3
+             elif line["type"] == "figure_caption":
+                 line["category_id"] = 4
+             elif line["type"] == "table":
+                 line["category_id"] = 5
+             elif line["type"] == "table_caption":
+                 line["category_id"] = 6
+             elif line["type"] == "equation":
+                 line["category_id"] = 8
+             elif line["type"] in ["header", "footer"]:
+                 line["category_id"] = 2
+             else:
+                 logger.warning(f"unknown type: {line['type']}")
+
+             # compatibility with paddleocr versions that do not output a score
+             if line.get("score") is None:
+                 line["score"] = 0.5 + random.random() * 0.5
+
+             res = line.pop("res", None)
+             if res is not None and len(res) > 0:
+                 for span in res:
+                     new_span = {
+                         "category_id": 15,
+                         "bbox": region_to_bbox(span["text_region"]),
+                         "score": span["confidence"],
+                         "text": span["text"],
+                     }
+                     spans.append(new_span)
+
+         if len(spans) > 0:
+             result.extend(spans)
+
+         return result
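For reference, region_to_bbox reduces PaddleOCR's four-corner text_region polygon to the [x0, y0, x1, y1] form used elsewhere in magic-pdf by keeping the first (top-left) and third (bottom-right) points. The coordinates below are made up:

    region = [[10, 20], [110, 20], [110, 45], [10, 45]]  # made-up 4-point region
    assert region_to_bbox(region) == [10, 20, 110, 45]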
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -83,6 +83,7 @@ class AbsPipe(ABC):
  pdf_meta["text_len_per_page"],
  pdf_meta["imgs_per_page"],
  pdf_meta["text_layout_per_page"],
+ pdf_meta["invalid_chars"],
  )
  if is_text_pdf:
      return AbsPipe.PIP_TXT
magic_pdf/pipe/OCRPipe.py CHANGED
@@ -1,5 +1,5 @@
  from magic_pdf.libs.MakeContentConfig import DropMode
- from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  from magic_pdf.pipe.AbsPipe import AbsPipe
  from magic_pdf.user_api import parse_ocr_pdf
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -1,5 +1,5 @@
  from magic_pdf.libs.MakeContentConfig import DropMode
- from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  from magic_pdf.libs.json_compressor import JsonCompressor
  from magic_pdf.pipe.AbsPipe import AbsPipe
magic_pdf/pipe/UNIPipe.py CHANGED
@@ -3,7 +3,7 @@ import json
  from loguru import logger
 
  from magic_pdf.libs.MakeContentConfig import DropMode
- from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  from magic_pdf.libs.commons import join_path
magic_pdf/user_api.py CHANGED
@@ -16,7 +16,7 @@ import re
  from loguru import logger
 
  from magic_pdf.libs.version import __version__
- from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.rw import AbsReaderWriter
  from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
  from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
@@ -86,41 +86,46 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
  return None
 
  pdf_info_dict = parse_pdf(parse_pdf_by_txt)
- text_all = ""
- for page_dict in pdf_info_dict['pdf_info']:
-     for para_block in page_dict['para_blocks']:
-         if para_block['type'] in ['title', 'text']:
-             for line in para_block['lines']:
-                 for span in line['spans']:
-                     text_all += span['content']
-
- def calculate_not_common_character_rate(text):
-     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
-     # count the garbled characters
-     garbage_count = len(garbage_regex.findall(text))
-     total = len(text)
-     if total == 0:
-         return 0  # avoid division by zero
-     return garbage_count / total
-
- def calculate_not_printable_rate(text):
-     printable = sum(1 for c in text if c.isprintable())
-     total = len(text)
-     if total == 0:
-         return 0  # avoid division by zero
-     return (total - printable) / total
-
- not_common_character_rate = calculate_not_common_character_rate(text_all)
- not_printable_rate = calculate_not_printable_rate(text_all)
- pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
- pdf_info_dict["_not_printable_rate"] = not_printable_rate
- logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+ # text_all = ""
+ # for page_dict in pdf_info_dict['pdf_info']:
+ #     for para_block in page_dict['para_blocks']:
+ #         if para_block['type'] in ['title', 'text']:
+ #             for line in para_block['lines']:
+ #                 for span in line['spans']:
+ #                     text_all += span['content']
+
+ # def calculate_not_common_character_rate(text):
+ #     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
+ #     # count the garbled characters
+ #     garbage_count = len(garbage_regex.findall(text))
+ #     total = len(text)
+ #     if total == 0:
+ #         return 0  # avoid division by zero
+ #     return garbage_count / total
+ #
+ # def calculate_not_printable_rate(text):
+ #     printable_text = ""
+ #     for c in text:
+ #         if c.isprintable():
+ #             printable_text += c
+ #     printable_total = len(printable_text)
+ #     total = len(text)
+ #     if total == 0:
+ #         return 0  # avoid division by zero
+ #     return (total - printable_total) / total
+ #
+ # not_common_character_rate = calculate_not_common_character_rate(text_all)
+ # not_printable_rate = calculate_not_printable_rate(text_all)
+ # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
+ # pdf_info_dict["_not_printable_rate"] = not_printable_rate
+ # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+ '''the new logic uses pdfminer to detect garbled PDFs; it is accurate, avoids false positives, and already runs before the parsing flow'''
  # not_common_character_rate may misfire on minority languages; not_printable_rate is friendlier to them
  if (pdf_info_dict is None
-     or pdf_info_dict.get("_need_drop", False)
-     or not_printable_rate > 0.02  # in some normal PDFs this value never exceeds 0.01, so the threshold is set to 0.02
+         or pdf_info_dict.get("_need_drop", False)
+         # or not_printable_rate > 0.02  # in some normal PDFs this value never exceeds 0.01, so the threshold is set to 0.02
  ):
-     logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
+     logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
      if input_model_is_empty:
          pdf_models = doc_analyze(pdf_bytes, ocr=True)
      pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: magic-pdf
- Version: 0.5.4
+ Version: 0.5.6
  Requires-Python: >=3.9
  License-File: LICENSE.md
  Requires-Dist: boto3 >=1.28.43
@@ -19,6 +19,10 @@ Requires-Dist: wordninja >=2.0.0
  Requires-Dist: scikit-learn >=1.0.2
  Requires-Dist: nltk ==3.8.1
  Requires-Dist: s3pathlib >=2.1.1
- Requires-Dist: paddlepaddle
  Requires-Dist: paddleocr
+ Requires-Dist: pdfminer.six >=20231228
+ Provides-Extra: cpu
+ Requires-Dist: paddlepaddle ; extra == 'cpu'
+ Provides-Extra: gpu
+ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
 
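With paddlepaddle moved behind extras and pdfminer.six added as a hard dependency, the Paddle backend is now chosen at install time. Assuming standard pip extras syntax (exact install instructions are not part of this diff):

    pip install "magic-pdf[cpu]"   # pulls paddlepaddle
    pip install "magic-pdf[gpu]"   # pulls paddlepaddle-gpu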
@@ -5,15 +5,15 @@ magic_pdf/pdf_parse_by_txt.py,sha256=5_kdfvDkv_XwDove2AW7SopGysYLJ1-tsOQy2yuII1Y
5
5
  magic_pdf/pdf_parse_by_txt_v2.py,sha256=mGadyYamoCNGNsKOQM1uXQR65zMUKyL24yURGHADmVs,1908
6
6
  magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
7
7
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
8
- magic_pdf/user_api.py,sha256=qaPK7A_VG10tPjHgmURDVOmVkeilCVnRqBbL27LaF7Q,4694
8
+ magic_pdf/user_api.py,sha256=R4onPBXlA8GARwlqBh5wmxUtTxwQb-PUeFK7eTAWzoU,4971
9
9
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- magic_pdf/cli/magicpdf.py,sha256=rTGrXc0hWv-nCqE--QRm1jwiGHaxnxUKf9jlAYWyXoQ,10885
10
+ magic_pdf/cli/magicpdf.py,sha256=IoyuWsnJp5lLDS4G9brtCqNdIWKb57Ini4uftkCl2Mg,11357
11
11
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
13
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=Y0nFbCX7zSVLq-vQqJvR8azumd0003ixrk5wy0vIJxU,15068
13
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
14
14
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- magic_pdf/filter/pdf_classify_by_type.py,sha256=c11aGI-I5zhLdZQnRLeTVPcmoK14ahr8aP7hTHl8_kM,41990
16
- magic_pdf/filter/pdf_meta_scan.py,sha256=KLih7jfVqABhdeZ9tAu9-WZm0W0wX-PKCws4mFBGtYk,17001
15
+ magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
16
+ magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
17
17
  magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
19
19
  magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
@@ -42,15 +42,18 @@ magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
42
42
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
43
43
  magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
44
44
  magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
45
+ magic_pdf/libs/pdf_check.py,sha256=LeCoMTVaVPWTgE0MSD6OnyXbpdjV7HfiX1RD6xesIWM,1911
45
46
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
46
47
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
47
48
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
48
- magic_pdf/libs/version.py,sha256=DITpct-LrdIsTgwx2NgH5Ghx5y8Xgz1YMimy1ZV5RTY,22
49
+ magic_pdf/libs/version.py,sha256=CMH34Gt1AqO7z_TqRj94XwohGoVCf8aes0djkqm45mk,22
49
50
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
51
+ magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
50
52
  magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
- magic_pdf/model/doc_analyze_by_360layout.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
52
- magic_pdf/model/doc_analyze_by_pp_structurev2.py,sha256=ry2sLGt10ShgvHZvhpf_QA0QGG9kXRdoAsYmxLNcPWE,4082
53
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=QD4NWEIz8UXdIG4V_3P8EaYesxk6PvC1SOtTWEy2GEY,2007
53
54
  magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
55
+ magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
56
+ magic_pdf/model/pp_structure_v2.py,sha256=qsyt9vFDGaVizBMiSaeFVHTDsJTrIHx46Ec2J8SOj1A,2469
54
57
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
58
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
56
59
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -65,10 +68,10 @@ magic_pdf/para/para_split_v2.py,sha256=a04dsUFE3JD4DA9e2DULJgbKrcqWuCfK58de1p-T3
65
68
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
66
69
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
67
70
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
68
- magic_pdf/pipe/AbsPipe.py,sha256=jUngTfYeVeltp03QwTcZvmBYghTgA5Gd7SdZSsFUr0o,3932
69
- magic_pdf/pipe/OCRPipe.py,sha256=av5-mJ-tYSdV13v1DbVruf1rUZ72YeAxTkN-x52mSfo,1198
70
- magic_pdf/pipe/TXTPipe.py,sha256=-RSHV-CJfod4bvOghd3XsaQ1iMiSPkrW5R5j4VSTJW4,1257
71
- magic_pdf/pipe/UNIPipe.py,sha256=FRnB2L1swqhsT8gGjeV4NCx9_ACtcRKGeUQcTQp_uRM,3509
71
+ magic_pdf/pipe/AbsPipe.py,sha256=28e3HxybBO86npy_L4WD6F7hfjKHHc86-IhiwzAnLdk,3979
72
+ magic_pdf/pipe/OCRPipe.py,sha256=iKnNveVfsrBGl_2Xtd4hAAS5HntYyjwfBeVIKGc8V5U,1196
73
+ magic_pdf/pipe/TXTPipe.py,sha256=R0UzMZ7Z_59Vh7cPdBAO4gvHtgA5wLoODnCPnpEjbPM,1255
74
+ magic_pdf/pipe/UNIPipe.py,sha256=47a9jx1a_zO4m3sVnhcOnrmNc_QT-TI-9mv2x7L6SrQ,3507
72
75
  magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
76
  magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
77
  magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
@@ -114,8 +117,8 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
114
117
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
115
118
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
116
119
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
117
- magic_pdf-0.5.4.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
118
- magic_pdf-0.5.4.dist-info/METADATA,sha256=hIojO9tsfaKc8J5UhdZR8dGWsg19vSFTEKj7bM-V80M,669
119
- magic_pdf-0.5.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
120
- magic_pdf-0.5.4.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
121
- magic_pdf-0.5.4.dist-info/RECORD,,
120
+ magic_pdf-0.5.6.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
+ magic_pdf-0.5.6.dist-info/METADATA,sha256=R1Rjdsta6IJ197EPwgSb7c-LtgPg2HnLibsGKRUa-i4,814
122
+ magic_pdf-0.5.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
+ magic_pdf-0.5.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
+ magic_pdf-0.5.6.dist-info/RECORD,,
@@ -1,125 +0,0 @@
1
- import random
2
-
3
- import fitz
4
- import cv2
5
- from paddleocr import PPStructure
6
- from PIL import Image
7
- from loguru import logger
8
- import numpy as np
9
-
10
- def region_to_bbox(region):
11
- x0 = region[0][0]
12
- y0 = region[0][1]
13
- x1 = region[2][0]
14
- y1 = region[2][1]
15
- return [x0, y0, x1, y1]
16
-
17
-
18
- def dict_compare(d1, d2):
19
- return d1.items() == d2.items()
20
-
21
-
22
- def remove_duplicates_dicts(lst):
23
- unique_dicts = []
24
- for dict_item in lst:
25
- if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts):
26
- unique_dicts.append(dict_item)
27
- return unique_dicts
28
- def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
29
- ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)
30
-
31
- imgs = []
32
- with fitz.open("pdf", pdf_bytes) as doc:
33
- for index in range(0, doc.page_count):
34
- page = doc[index]
35
- dpi = 200
36
- mat = fitz.Matrix(dpi / 72, dpi / 72)
37
- pm = page.get_pixmap(matrix=mat, alpha=False)
38
-
39
- # if width or height > 2000 pixels, don't enlarge the image
40
- # if pm.width > 2000 or pm.height > 2000:
41
- # pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
42
-
43
- img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
44
- img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
45
- img_dict = {
46
- "img": img,
47
- "width": pm.width,
48
- "height": pm.height
49
- }
50
- imgs.append(img_dict)
51
-
52
- model_json = []
53
- for index, img_dict in enumerate(imgs):
54
- img = img_dict['img']
55
- page_width = img_dict['width']
56
- page_height = img_dict['height']
57
- result = ocr_engine(img)
58
- spans = []
59
- for line in result:
60
- line.pop('img')
61
- '''
62
- 为paddle输出适配type no.
63
- title: 0 # 标题
64
- text: 1 # 文本
65
- header: 2 # abandon
66
- footer: 2 # abandon
67
- reference: 1 # 文本 or abandon
68
- equation: 8 # 行间公式 block
69
- equation: 14 # 行间公式 text
70
- figure: 3 # 图片
71
- figure_caption: 4 # 图片描述
72
- table: 5 # 表格
73
- table_caption: 6 # 表格描述
74
- '''
75
- if line['type'] == 'title':
76
- line['category_id'] = 0
77
- elif line['type'] in ['text', 'reference']:
78
- line['category_id'] = 1
79
- elif line['type'] == 'figure':
80
- line['category_id'] = 3
81
- elif line['type'] == 'figure_caption':
82
- line['category_id'] = 4
83
- elif line['type'] == 'table':
84
- line['category_id'] = 5
85
- elif line['type'] == 'table_caption':
86
- line['category_id'] = 6
87
- elif line['type'] == 'equation':
88
- line['category_id'] = 8
89
- elif line['type'] in ['header', 'footer']:
90
- line['category_id'] = 2
91
- else:
92
- logger.warning(f"unknown type: {line['type']}")
93
-
94
- # 兼容不输出score的paddleocr版本
95
- if line.get("score") is None:
96
- line['score'] = 0.5 + random.random() * 0.5
97
-
98
- res = line.pop('res', None)
99
- if res is not None and len(res) > 0:
100
- for span in res:
101
- new_span = {'category_id': 15,
102
- 'bbox': region_to_bbox(span['text_region']),
103
- 'score': span['confidence'],
104
- 'text': span['text']
105
- }
106
- spans.append(new_span)
107
-
108
- if len(spans) > 0:
109
- result.extend(spans)
110
-
111
- result = remove_duplicates_dicts(result)
112
-
113
- page_info = {
114
- "page_no": index,
115
- "height": page_height,
116
- "width": page_width
117
- }
118
- page_dict = {
119
- "layout_dets": result,
120
- "page_info": page_info
121
- }
122
-
123
- model_json.append(page_dict)
124
-
125
- return model_json