magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +5 -5
  12. magic_pdf/libs/draw_bbox.py +3 -2
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +122 -76
  18. magic_pdf/model/sub_modules/model_init.py +40 -35
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  21. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  22. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  23. magic_pdf/para/para_split.py +411 -248
  24. magic_pdf/para/para_split_v2.py +352 -182
  25. magic_pdf/para/para_split_v3.py +110 -53
  26. magic_pdf/pdf_parse_by_ocr.py +2 -0
  27. magic_pdf/pdf_parse_by_txt.py +2 -0
  28. magic_pdf/pdf_parse_union_core.py +174 -100
  29. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  30. magic_pdf/pipe/AbsPipe.py +28 -44
  31. magic_pdf/pipe/OCRPipe.py +5 -5
  32. magic_pdf/pipe/TXTPipe.py +5 -6
  33. magic_pdf/pipe/UNIPipe.py +24 -25
  34. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  35. magic_pdf/pre_proc/cut_image.py +9 -11
  36. magic_pdf/pre_proc/equations_replace.py +203 -212
  37. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  38. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  39. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  40. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  41. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  42. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  43. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  44. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  45. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  46. magic_pdf/spark/spark_api.py +15 -17
  47. magic_pdf/tools/cli.py +3 -4
  48. magic_pdf/tools/cli_dev.py +6 -9
  49. magic_pdf/tools/common.py +26 -36
  50. magic_pdf/user_api.py +29 -38
  51. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
  53. magic_pdf/libs/Constants.py +0 -55
  54. magic_pdf/libs/MakeContentConfig.py +0 -11
  55. magic_pdf/libs/drop_reason.py +0 -27
  56. magic_pdf/libs/drop_tag.py +0 -19
  57. magic_pdf/para/para_pipeline.py +0 -297
  58. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  59. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  60. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,21 @@
1
1
  import math
2
+ import re
2
3
 
4
+ from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
5
+ VERTICAL_TEXT)
3
6
  from magic_pdf.libs.boxbase import is_vbox_on_side
4
- from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
5
7
 
6
8
 
7
9
  def detect_non_horizontal_texts(result_dict):
8
- """
9
- This function detects watermarks and vertical margin notes in the document.
10
+ """This function detects watermarks and vertical margin notes in the
11
+ document.
10
12
 
11
13
  Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
12
14
  If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
13
15
  If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
14
16
 
15
17
  Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
16
- If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
18
+ If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page. # noqa: E501
17
19
  If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
18
20
 
19
21
 
@@ -32,13 +34,16 @@ def detect_non_horizontal_texts(result_dict):
32
34
  potential_margin_notes = {}
33
35
 
34
36
  for page_id, page_content in result_dict.items():
35
- if page_id.startswith("page_"):
37
+ if page_id.startswith('page_'):
36
38
  for block_id, block_data in page_content.items():
37
- if block_id.startswith("block_"):
38
- if "dir" in block_data:
39
- coordinates_text = (block_data["bbox"], block_data["text"]) # Tuple of coordinates and text
40
-
41
- angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
39
+ if block_id.startswith('block_'):
40
+ if 'dir' in block_data:
41
+ coordinates_text = (
42
+ block_data['bbox'],
43
+ block_data['text'],
44
+ ) # Tuple of coordinates and text
45
+
46
+ angle = math.atan2(block_data['dir'][1], block_data['dir'][0])
42
47
  angle = abs(math.degrees(angle))
43
48
 
44
49
  if angle > 5 and angle < 85: # Check if direction is watermarks
@@ -49,32 +54,40 @@ def detect_non_horizontal_texts(result_dict):
49
54
 
50
55
  if angle > 85 and angle < 105: # Check if direction is vertical
51
56
  if coordinates_text in potential_margin_notes:
52
- potential_margin_notes[coordinates_text] += 1 # Increment count
57
+ potential_margin_notes[coordinates_text] += (
58
+ 1 # Increment count
59
+ )
53
60
  else:
54
- potential_margin_notes[coordinates_text] = 1 # Initialize count
61
+ potential_margin_notes[coordinates_text] = (
62
+ 1 # Initialize count
63
+ )
55
64
 
56
65
  # Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
57
66
  watermark_threshold = len(result_dict) // 2
58
- watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
67
+ watermarks = {
68
+ k: v for k, v in potential_watermarks.items() if v > watermark_threshold
69
+ }
59
70
 
60
71
  # Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
61
72
  margin_note_threshold = len(result_dict) // 2
62
- margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
73
+ margin_notes = {
74
+ k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold
75
+ }
63
76
 
64
77
  # Add watermark information to the result dictionary
65
78
  for page_id, blocks in result_dict.items():
66
- if page_id.startswith("page_"):
79
+ if page_id.startswith('page_'):
67
80
  for block_id, block_data in blocks.items():
68
- coordinates_text = (block_data["bbox"], block_data["text"])
81
+ coordinates_text = (block_data['bbox'], block_data['text'])
69
82
  if coordinates_text in watermarks:
70
- block_data["is_watermark"] = 1
83
+ block_data['is_watermark'] = 1
71
84
  else:
72
- block_data["is_watermark"] = 0
85
+ block_data['is_watermark'] = 0
73
86
 
74
87
  if coordinates_text in margin_notes:
75
- block_data["is_vertical_margin_note"] = 1
88
+ block_data['is_vertical_margin_note'] = 1
76
89
  else:
77
- block_data["is_vertical_margin_note"] = 0
90
+ block_data['is_vertical_margin_note'] = 0
78
91
 
79
92
  return result_dict
80
93
 
@@ -83,21 +96,21 @@ def detect_non_horizontal_texts(result_dict):
83
96
  1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
84
97
  2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
85
98
  """
86
- import re
99
+
87
100
 
88
101
  def __is_a_word(sentence):
89
102
  # 如果输入是中文并且长度为1,则返回True
90
103
  if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
91
104
  return True
92
105
  # 判断是否为单个英文单词或字符(包括ASCII标点)
93
- elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <=2:
106
+ elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <= 2:
94
107
  return True
95
108
  else:
96
109
  return False
97
110
 
98
111
 
99
112
  def __get_text_color(num):
100
- """获取字体的颜色RGB"""
113
+ """获取字体的颜色RGB值."""
101
114
  blue = num & 255
102
115
  green = (num >> 8) & 255
103
116
  red = (num >> 16) & 255
@@ -105,84 +118,119 @@ def __get_text_color(num):
105
118
 
106
119
 
107
120
  def __is_empty_side_box(text_block):
108
- """
109
- 是否是边缘上的空白没有任何内容的block
110
- """
121
+ """是否是边缘上的空白没有任何内容的block."""
111
122
  for line in text_block['lines']:
112
123
  for span in line['spans']:
113
124
  font_color = span['color']
114
- r,g,b = __get_text_color(font_color)
115
- if len(span['text'].strip())>0 and (r,g,b)!=(255,255,255):
125
+ r, g, b = __get_text_color(font_color)
126
+ if len(span['text'].strip()) > 0 and (r, g, b) != (255, 255, 255):
116
127
  return False
117
-
128
+
118
129
  return True
119
130
 
120
131
 
121
132
  def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
122
- """
123
- 返回删除了垂直,水印,旋转的textblock
124
- 删除的内容打上tag返回
125
- """
133
+ """返回删除了垂直,水印,旋转的textblock 删除的内容打上tag返回."""
126
134
  removed_text_block = []
127
-
128
- for i, block in enumerate(pymu_text_block): # 格式参考test/assets/papre/pymu_textblocks.json
135
+
136
+ for i, block in enumerate(
137
+ pymu_text_block
138
+ ): # 格式参考test/assets/papre/pymu_textblocks.json
129
139
  lines = block['lines']
130
140
  block_bbox = block['bbox']
131
- if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # 保证这些box必须在页面的两边
132
- continue
133
-
134
- if all([__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans'])>0]) and len(lines)>1 and all([len(line['spans'])==1 for line in lines]):
135
- is_box_valign = (len(set([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0]))==1) and (len([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0])>1) # 测试bbox在垂直方向是不是x0都相等,也就是在垂直方向排列.同时必须大于等于2个字
136
-
141
+ if not is_vbox_on_side(
142
+ block_bbox, page_width, page_height, 0.2
143
+ ): # 保证这些box必须在页面的两边
144
+ continue
145
+
146
+ if (
147
+ all(
148
+ [
149
+ __is_a_word(line['spans'][0]['text'])
150
+ for line in lines
151
+ if len(line['spans']) > 0
152
+ ]
153
+ )
154
+ and len(lines) > 1
155
+ and all([len(line['spans']) == 1 for line in lines])
156
+ ):
157
+ is_box_valign = (
158
+ (
159
+ len(
160
+ set(
161
+ [
162
+ int(line['spans'][0]['bbox'][0])
163
+ for line in lines
164
+ if len(line['spans']) > 0
165
+ ]
166
+ )
167
+ )
168
+ == 1
169
+ )
170
+ and (
171
+ len(
172
+ [
173
+ int(line['spans'][0]['bbox'][0])
174
+ for line in lines
175
+ if len(line['spans']) > 0
176
+ ]
177
+ )
178
+ > 1
179
+ )
180
+ ) # 测试bbox在垂直方向是不是x0都相等,也就是在垂直方向排列.同时必须大于等于2个字
181
+
137
182
  if is_box_valign:
138
183
  block['tag'] = VERTICAL_TEXT
139
184
  removed_text_block.append(block)
140
185
  continue
141
-
186
+
142
187
  for line in lines:
143
- if line['dir']!=(1,0):
188
+ if line['dir'] != (1, 0):
144
189
  block['tag'] = ROTATE_TEXT
145
- removed_text_block.append(block) # 只要有一个line不是dir=(1,0),就把整个block都删掉
190
+ removed_text_block.append(
191
+ block
192
+ ) # 只要有一个line不是dir=(1,0),就把整个block都删掉
146
193
  break
147
-
194
+
148
195
  for block in removed_text_block:
149
196
  pymu_text_block.remove(block)
150
-
197
+
151
198
  return pymu_text_block, removed_text_block
152
199
 
200
+
153
201
  def get_side_boundry(rotate_bbox, page_width, page_height):
154
- """
155
- 根据rotate_bbox,返回页面的左右正文边界
156
- """
202
+ """根据rotate_bbox,返回页面的左右正文边界."""
157
203
  left_x = 0
158
204
  right_x = page_width
159
205
  for x in rotate_bbox:
160
206
  box = x['bbox']
161
- if box[2]<page_width/2:
207
+ if box[2] < page_width / 2:
162
208
  left_x = max(left_x, box[2])
163
209
  else:
164
210
  right_x = min(right_x, box[0])
165
-
166
- return left_x+1, right_x-1
211
+
212
+ return left_x + 1, right_x - 1
167
213
 
168
214
 
169
215
  def remove_side_blank_block(pymu_text_block, page_width, page_height):
170
- """
171
- 删除页面两侧的空白block
172
- """
216
+ """删除页面两侧的空白block."""
173
217
  removed_text_block = []
174
-
175
- for i, block in enumerate(pymu_text_block): # 格式参考test/assets/papre/pymu_textblocks.json
218
+
219
+ for i, block in enumerate(
220
+ pymu_text_block
221
+ ): # 格式参考test/assets/papre/pymu_textblocks.json
176
222
  block_bbox = block['bbox']
177
- if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # 保证这些box必须在页面的两边
178
- continue
179
-
223
+ if not is_vbox_on_side(
224
+ block_bbox, page_width, page_height, 0.2
225
+ ): # 保证这些box必须在页面的两边
226
+ continue
227
+
180
228
  if __is_empty_side_box(block):
181
229
  block['tag'] = EMPTY_SIDE_BLOCK
182
230
  removed_text_block.append(block)
183
231
  continue
184
-
232
+
185
233
  for block in removed_text_block:
186
234
  pymu_text_block.remove(block)
187
-
188
- return pymu_text_block, removed_text_block
235
+
236
+ return pymu_text_block, removed_text_block
@@ -4,8 +4,9 @@
4
4
  2. 然后去掉出现在文字blcok上的图片bbox
5
5
  """
6
6
 
7
- from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
8
- from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
7
+ from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
8
+ from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
9
+ _is_left_overlap)
9
10
 
10
11
 
11
12
  def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
@@ -26,14 +27,14 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
26
27
  # 去掉位于图片上的文字block
27
28
  for image_box in images:
28
29
  for text_block in text_raw_blocks:
29
- text_bbox = text_block["bbox"]
30
+ text_bbox = text_block['bbox']
30
31
  if _is_in(text_bbox, image_box):
31
32
  text_block['tag'] = ON_IMAGE_TEXT
32
33
  text_block_removed.append(text_block)
33
34
  # 去掉table上的文字block
34
35
  for table_box in tables:
35
36
  for text_block in text_raw_blocks:
36
- text_bbox = text_block["bbox"]
37
+ text_bbox = text_block['bbox']
37
38
  if _is_in(text_bbox, table_box):
38
39
  text_block['tag'] = ON_TABLE_TEXT
39
40
  text_block_removed.append(text_block)
@@ -77,7 +78,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
77
78
  # 图片和文字重叠,丢掉图片
78
79
  for image_box in images:
79
80
  for text_block in text_raw_blocks:
80
- text_bbox = text_block["bbox"]
81
+ text_bbox = text_block['bbox']
81
82
  if _is_in_or_part_overlap(image_box, text_bbox):
82
83
  images_backup.append(image_box)
83
84
  break
@@ -122,11 +123,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
122
123
 
123
124
 
124
125
  def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
125
- """
126
- 检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
127
- 因为这种情况大概率发生了公式没有被检测出来。
128
-
129
- """
126
+ """检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
130
127
  if len(text_blocks) == 0:
131
128
  return False
132
129
 
@@ -148,7 +145,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
148
145
 
149
146
  txt_bboxes = []
150
147
  for text_block in text_blocks:
151
- bbox = text_block["bbox"]
148
+ bbox = text_block['bbox']
152
149
  if bbox[1] >= clip_y0 and bbox[3] <= clip_y1:
153
150
  txt_bboxes.append(bbox)
154
151
 
@@ -161,11 +158,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
161
158
 
162
159
 
163
160
  def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
164
- """
165
- 检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
166
- 因为这种情况大概率发生了公式没有被检测出来。
167
-
168
- """
161
+ """检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
169
162
  if len(useful_blocks) == 0:
170
163
  return False
171
164
 
@@ -174,7 +167,7 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
174
167
 
175
168
  useful_bboxes = []
176
169
  for text_block in useful_blocks:
177
- bbox = text_block["bbox"]
170
+ bbox = text_block['bbox']
178
171
  if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
179
172
  useful_bboxes.append(bbox)
180
173
 
@@ -1,51 +1,49 @@
1
1
  from loguru import logger
2
2
 
3
- from magic_pdf.libs.drop_reason import DropReason
3
+ from magic_pdf.config.drop_reason import DropReason
4
4
 
5
5
 
6
6
  def get_data_source(jso: dict):
7
- data_source = jso.get("data_source")
7
+ data_source = jso.get('data_source')
8
8
  if data_source is None:
9
- data_source = jso.get("file_source")
9
+ data_source = jso.get('file_source')
10
10
  return data_source
11
11
 
12
12
 
13
13
  def get_data_type(jso: dict):
14
- data_type = jso.get("data_type")
14
+ data_type = jso.get('data_type')
15
15
  if data_type is None:
16
- data_type = jso.get("file_type")
16
+ data_type = jso.get('file_type')
17
17
  return data_type
18
18
 
19
19
 
20
20
  def get_bookid(jso: dict):
21
- book_id = jso.get("bookid")
21
+ book_id = jso.get('bookid')
22
22
  if book_id is None:
23
- book_id = jso.get("original_file_id")
23
+ book_id = jso.get('original_file_id')
24
24
  return book_id
25
25
 
26
26
 
27
27
  def exception_handler(jso: dict, e):
28
28
  logger.exception(e)
29
- jso["_need_drop"] = True
30
- jso["_drop_reason"] = DropReason.Exception
31
- jso["_exception"] = f"ERROR: {e}"
29
+ jso['_need_drop'] = True
30
+ jso['_drop_reason'] = DropReason.Exception
31
+ jso['_exception'] = f'ERROR: {e}'
32
32
  return jso
33
33
 
34
34
 
35
35
  def get_bookname(jso: dict):
36
36
  data_source = get_data_source(jso)
37
- file_id = jso.get("file_id")
38
- book_name = f"{data_source}/{file_id}"
37
+ file_id = jso.get('file_id')
38
+ book_name = f'{data_source}/{file_id}'
39
39
  return book_name
40
40
 
41
41
 
42
42
  def spark_json_extractor(jso: dict) -> dict:
43
43
 
44
- """
45
- 从json中提取数据,返回一个dict
46
- """
44
+ """从json中提取数据,返回一个dict."""
47
45
 
48
46
  return {
49
- "_pdf_type": jso["_pdf_type"],
50
- "model_list": jso["doc_layout_result"],
47
+ '_pdf_type': jso['_pdf_type'],
48
+ 'model_list': jso['doc_layout_result'],
51
49
  }
magic_pdf/tools/cli.py CHANGED
@@ -5,9 +5,8 @@ import click
5
5
  from loguru import logger
6
6
 
7
7
  import magic_pdf.model as model_config
8
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader
8
9
  from magic_pdf.libs.version import __version__
9
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
10
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
11
10
  from magic_pdf.tools.common import do_parse, parse_pdf_methods
12
11
 
13
12
 
@@ -86,8 +85,8 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
86
85
  os.makedirs(output_dir, exist_ok=True)
87
86
 
88
87
  def read_fn(path):
89
- disk_rw = DiskReaderWriter(os.path.dirname(path))
90
- return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
88
+ disk_rw = FileBasedDataReader(os.path.dirname(path))
89
+ return disk_rw.read(os.path.basename(path))
91
90
 
92
91
  def parse_doc(doc_path: str):
93
92
  try:
@@ -5,13 +5,11 @@ from pathlib import Path
5
5
  import click
6
6
 
7
7
  import magic_pdf.model as model_config
8
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
8
9
  from magic_pdf.libs.config_reader import get_s3_config
9
10
  from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
10
11
  remove_non_official_s3_args)
11
12
  from magic_pdf.libs.version import __version__
12
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
- from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
15
13
  from magic_pdf.tools.common import do_parse, parse_pdf_methods
16
14
 
17
15
 
@@ -19,15 +17,14 @@ def read_s3_path(s3path):
19
17
  bucket, key = parse_s3path(s3path)
20
18
 
21
19
  s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
22
- s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, 'auto',
23
- remove_non_official_s3_args(s3path))
20
+ s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')
24
21
  may_range_params = parse_s3_range_params(s3path)
25
22
  if may_range_params is None or 2 != len(may_range_params):
26
- byte_start, byte_end = 0, None
23
+ byte_start, byte_end = 0, -1
27
24
  else:
28
25
  byte_start, byte_end = int(may_range_params[0]), int(
29
26
  may_range_params[1])
30
- return s3_rw.read_offset(
27
+ return s3_rw.read_at(
31
28
  remove_non_official_s3_args(s3path),
32
29
  byte_start,
33
30
  byte_end,
@@ -129,8 +126,8 @@ def pdf(pdf, json_data, output_dir, method):
129
126
  os.makedirs(output_dir, exist_ok=True)
130
127
 
131
128
  def read_fn(path):
132
- disk_rw = DiskReaderWriter(os.path.dirname(path))
133
- return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
129
+ disk_rw = FileBasedDataReader(os.path.dirname(path))
130
+ return disk_rw.read(os.path.basename(path))
134
131
 
135
132
  model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
136
133
 
magic_pdf/tools/common.py CHANGED
@@ -3,18 +3,18 @@ import json as json_parse
3
3
  import os
4
4
 
5
5
  import click
6
+ import fitz
6
7
  from loguru import logger
7
8
 
8
9
  import magic_pdf.model as model_config
10
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
11
+ from magic_pdf.data.data_reader_writer import FileBasedDataWriter
9
12
  from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
10
13
  draw_model_bbox, draw_span_bbox)
11
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
12
14
  from magic_pdf.pipe.OCRPipe import OCRPipe
13
15
  from magic_pdf.pipe.TXTPipe import TXTPipe
14
16
  from magic_pdf.pipe.UNIPipe import UNIPipe
15
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
17
- import fitz
17
+
18
18
  # from io import BytesIO
19
19
  # from pypdf import PdfReader, PdfWriter
20
20
 
@@ -54,11 +54,11 @@ def prepare_env(output_dir, pdf_file_name, method):
54
54
 
55
55
 
56
56
  def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
57
- document = fitz.open("pdf", pdf_bytes)
57
+ document = fitz.open('pdf', pdf_bytes)
58
58
  output_document = fitz.open()
59
59
  end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
60
60
  if end_page_id > len(document) - 1:
61
- logger.warning("end_page_id is out of range, use pdf_docs length")
61
+ logger.warning('end_page_id is out of range, use pdf_docs length')
62
62
  end_page_id = len(document) - 1
63
63
  output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
64
64
  output_bytes = output_document.tobytes()
@@ -94,14 +94,17 @@ def do_parse(
94
94
  f_draw_model_bbox = True
95
95
  f_draw_line_sort_bbox = True
96
96
 
97
+ if lang == "":
98
+ lang = None
99
+
97
100
  pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
98
101
 
99
102
  orig_model_list = copy.deepcopy(model_list)
100
103
  local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
101
104
  parse_method)
102
105
 
103
- image_writer, md_writer = DiskReaderWriter(
104
- local_image_dir), DiskReaderWriter(local_md_dir)
106
+ image_writer, md_writer = FileBasedDataWriter(
107
+ local_image_dir), FileBasedDataWriter(local_md_dir)
105
108
  image_dir = str(os.path.basename(local_image_dir))
106
109
 
107
110
  if parse_method == 'auto':
@@ -145,49 +148,36 @@ def do_parse(
145
148
  if f_draw_line_sort_bbox:
146
149
  draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
147
150
 
148
- md_content = pipe.pipe_mk_markdown(image_dir,
149
- drop_mode=DropMode.NONE,
150
- md_make_mode=f_make_md_mode)
151
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
151
152
  if f_dump_md:
152
- md_writer.write(
153
- content=md_content,
154
- path=f'{pdf_file_name}.md',
155
- mode=AbsReaderWriter.MODE_TXT,
153
+ md_writer.write_string(
154
+ f'{pdf_file_name}.md',
155
+ md_content
156
156
  )
157
157
 
158
158
  if f_dump_middle_json:
159
- md_writer.write(
160
- content=json_parse.dumps(pipe.pdf_mid_data,
161
- ensure_ascii=False,
162
- indent=4),
163
- path=f'{pdf_file_name}_middle.json',
164
- mode=AbsReaderWriter.MODE_TXT,
159
+ md_writer.write_string(
160
+ f'{pdf_file_name}_middle.json',
161
+ json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
165
162
  )
166
163
 
167
164
  if f_dump_model_json:
168
- md_writer.write(
169
- content=json_parse.dumps(orig_model_list,
170
- ensure_ascii=False,
171
- indent=4),
172
- path=f'{pdf_file_name}_model.json',
173
- mode=AbsReaderWriter.MODE_TXT,
165
+ md_writer.write_string(
166
+ f'{pdf_file_name}_model.json',
167
+ json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
174
168
  )
175
169
 
176
170
  if f_dump_orig_pdf:
177
171
  md_writer.write(
178
- content=pdf_bytes,
179
- path=f'{pdf_file_name}_origin.pdf',
180
- mode=AbsReaderWriter.MODE_BIN,
172
+ f'{pdf_file_name}_origin.pdf',
173
+ pdf_bytes,
181
174
  )
182
175
 
183
176
  content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
184
177
  if f_dump_content_list:
185
- md_writer.write(
186
- content=json_parse.dumps(content_list,
187
- ensure_ascii=False,
188
- indent=4),
189
- path=f'{pdf_file_name}_content_list.json',
190
- mode=AbsReaderWriter.MODE_TXT,
178
+ md_writer.write_string(
179
+ f'{pdf_file_name}_content_list.json',
180
+ json_parse.dumps(content_list, ensure_ascii=False, indent=4)
191
181
  )
192
182
 
193
183
  logger.info(f'local output dir is {local_md_dir}')