magic-pdf 0.6.2b1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. magic_pdf/dict2md/ocr_mkcontent.py +10 -3
  2. magic_pdf/libs/config_reader.py +10 -10
  3. magic_pdf/libs/version.py +1 -1
  4. magic_pdf/model/doc_analyze_by_custom_model.py +8 -2
  5. magic_pdf/model/magic_model.py +4 -0
  6. magic_pdf/model/pdf_extract_kit.py +45 -2
  7. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
  8. magic_pdf/resources/model_config/model_configs.yaml +4 -0
  9. magic_pdf/rw/AbsReaderWriter.py +1 -18
  10. magic_pdf/rw/DiskReaderWriter.py +32 -24
  11. magic_pdf/rw/S3ReaderWriter.py +83 -48
  12. magic_pdf/tools/cli.py +79 -0
  13. magic_pdf/tools/cli_dev.py +156 -0
  14. magic_pdf/tools/common.py +119 -0
  15. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +49 -31
  16. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +22 -24
  17. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
  18. magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
  19. magic_pdf/cli/magicpdf.py +0 -359
  20. magic_pdf/pdf_parse_for_train.py +0 -685
  21. magic_pdf/train_utils/convert_to_train_format.py +0 -65
  22. magic_pdf/train_utils/extract_caption.py +0 -59
  23. magic_pdf/train_utils/remove_footer_header.py +0 -159
  24. magic_pdf/train_utils/vis_utils.py +0 -327
  25. magic_pdf-0.6.2b1.dist-info/entry_points.txt +0 -2
  26. /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
  27. /magic_pdf/{train_utils → tools}/__init__.py +0 -0
  28. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
  29. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
@@ -120,15 +120,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
120
120
  if mode == 'nlp':
121
121
  continue
122
122
  elif mode == 'mm':
123
+ table_caption = ''
123
124
  for block in para_block['blocks']: # 1st.拼table_caption
124
125
  if block['type'] == BlockType.TableCaption:
125
- para_text += merge_para_with_text(block)
126
+ table_caption = merge_para_with_text(block)
126
127
  for block in para_block['blocks']: # 2nd.拼table_body
127
128
  if block['type'] == BlockType.TableBody:
128
129
  for line in block['lines']:
129
130
  for span in line['spans']:
130
131
  if span['type'] == ContentType.Table:
131
- para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
132
+ # if processed by table model
133
+ if span.get('latex', ''):
134
+ para_text += f"\n\n$\n {span['latex']}\n$\n\n"
135
+ else:
136
+ para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])}) \n"
132
137
  for block in para_block['blocks']: # 3rd.拼table_footnote
133
138
  if block['type'] == BlockType.TableFootnote:
134
139
  para_text += merge_para_with_text(block)
@@ -163,7 +168,7 @@ def merge_para_with_text(para_block):
163
168
  else:
164
169
  content = ocr_escape_special_markdown_char(content)
165
170
  elif span_type == ContentType.InlineEquation:
166
- content = f"${span['content']}$"
171
+ content = f" ${span['content']}$ "
167
172
  elif span_type == ContentType.InterlineEquation:
168
173
  content = f"\n$$\n{span['content']}\n$$\n"
169
174
 
@@ -249,6 +254,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
249
254
  }
250
255
  for block in para_block['blocks']:
251
256
  if block['type'] == BlockType.TableBody:
257
+ if block["lines"][0]["spans"][0].get('latex', ''):
258
+ para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
252
259
  para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
253
260
  if block['type'] == BlockType.TableCaption:
254
261
  para_content['table_caption'] = merge_para_with_text(block)
@@ -57,16 +57,6 @@ def get_bucket_name(path):
57
57
  return bucket
58
58
 
59
59
 
60
- def get_local_dir():
61
- config = read_config()
62
- local_dir = config.get("temp-output-dir")
63
- if local_dir is None:
64
- logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
65
- return "/tmp"
66
- else:
67
- return local_dir
68
-
69
-
70
60
  def get_local_models_dir():
71
61
  config = read_config()
72
62
  models_dir = config.get("models-dir")
@@ -87,5 +77,15 @@ def get_device():
87
77
  return device
88
78
 
89
79
 
80
def get_table_recog_config():
    """Return the table-recognition settings from the user configuration.

    Looks up the ``"table-config"`` entry in the mapping returned by
    ``read_config()``. When the entry is missing, a warning is logged and a
    default with table recognition disabled is returned instead.

    Returns:
        The configured ``"table-config"`` value, or
        ``{"is_table_recog_enable": False, "max_time": 400}`` when it is absent.
    """
    config = read_config()
    table_config = config.get("table-config")
    if table_config is None:
        logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
        # Build the default directly instead of parsing a JSON literal; a new
        # dict is still created on every call, so callers never share state.
        return {"is_table_recog_enable": False, "max_time": 400}
    return table_config
88
+
89
+
90
90
  if __name__ == "__main__":
91
91
  ak, sk, endpoint = get_s3_config("llm-raw")
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.6.2b1"
1
+ __version__ = "0.7.0a1"
@@ -4,7 +4,7 @@ import fitz
4
4
  import numpy as np
5
5
  from loguru import logger
6
6
 
7
- from magic_pdf.libs.config_reader import get_local_models_dir, get_device
7
+ from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
8
8
  from magic_pdf.model.model_list import MODEL
9
9
  import magic_pdf.model as model_config
10
10
 
@@ -84,7 +84,13 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
84
84
  # 从配置文件读取model-dir和device
85
85
  local_models_dir = get_local_models_dir()
86
86
  device = get_device()
87
- custom_model = CustomPEKModel(ocr=ocr, show_log=show_log, models_dir=local_models_dir, device=device)
87
+ table_config = get_table_recog_config()
88
+ model_input = {"ocr": ocr,
89
+ "show_log": show_log,
90
+ "models_dir": local_models_dir,
91
+ "device": device,
92
+ "table_config": table_config}
93
+ custom_model = CustomPEKModel(**model_input)
88
94
  else:
89
95
  logger.error("Not allow model_name!")
90
96
  exit(1)
@@ -560,6 +560,10 @@ class MagicModel:
560
560
  if category_id == 3:
561
561
  span["type"] = ContentType.Image
562
562
  elif category_id == 5:
563
+ # 获取table模型结果
564
+ latex = layout_det.get("latex", None)
565
+ if latex:
566
+ span["latex"] = latex
563
567
  span["type"] = ContentType.Table
564
568
  elif category_id == 13:
565
569
  span["content"] = layout_det["latex"]
@@ -2,6 +2,7 @@ from loguru import logger
2
2
  import os
3
3
  import time
4
4
 
5
+
5
6
  os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
6
7
  try:
7
8
  import cv2
@@ -10,6 +11,7 @@ try:
10
11
  import numpy as np
11
12
  import torch
12
13
  import torchtext
14
+
13
15
  if torchtext.__version__ >= "0.18.0":
14
16
  torchtext.disable_torchtext_deprecation_warning()
15
17
  from PIL import Image
@@ -30,6 +32,12 @@ except ImportError as e:
30
32
  from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
31
33
  from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
32
34
  from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
35
+ from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
36
+
37
+
38
def table_model_init(model_path, max_time=400, _device_='cpu'):
    """Create the table-recognition model wrapper.

    Args:
        model_path: Location of the table model weights, passed through as-is.
        max_time: Forwarded to ``StructTableModel`` as ``max_time``.
        _device_: Forwarded to ``StructTableModel`` as ``device``.

    Returns:
        A new ``StructTableModel`` instance.
    """
    return StructTableModel(model_path, max_time=max_time, device=_device_)
33
41
 
34
42
 
35
43
  def mfd_model_init(weight):
@@ -95,10 +103,12 @@ class CustomPEKModel:
95
103
  # 初始化解析配置
96
104
  self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
97
105
  self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
106
+ self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
107
+ self.apply_table = self.table_config.get("is_table_recog_enable", False)
98
108
  self.apply_ocr = ocr
99
109
  logger.info(
100
- "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}".format(
101
- self.apply_layout, self.apply_formula, self.apply_ocr
110
+ "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
111
+ self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table
102
112
  )
103
113
  )
104
114
  assert self.apply_layout, "DocAnalysis must contain layout model."
@@ -129,6 +139,11 @@ class CustomPEKModel:
129
139
  if self.apply_ocr:
130
140
  self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
131
141
 
142
+ # init structeqtable
143
+ if self.apply_table:
144
+ max_time = self.table_config.get("max_time", 400)
145
+ self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
146
+ max_time=max_time, _device_=self.device)
132
147
  logger.info('DocAnalysis init done!')
133
148
 
134
149
  def __call__(self, image):
@@ -249,4 +264,32 @@ class CustomPEKModel:
249
264
  ocr_cost = round(time.time() - ocr_start, 2)
250
265
  logger.info(f"ocr cost: {ocr_cost}")
251
266
 
267
+ # 表格识别 table recognition
268
+ if self.apply_table:
269
+ pil_img = Image.fromarray(image)
270
+ for layout in layout_res:
271
+ if layout.get("category_id", -1) == 5:
272
+ poly = layout["poly"]
273
+ xmin, ymin = int(poly[0]), int(poly[1])
274
+ xmax, ymax = int(poly[4]), int(poly[5])
275
+
276
+ paste_x = 50
277
+ paste_y = 50
278
+ # 创建一个宽高各多50的白色背景 create a whiteboard with 50 larger width and length
279
+ new_width = xmax - xmin + paste_x * 2
280
+ new_height = ymax - ymin + paste_y * 2
281
+ new_image = Image.new('RGB', (new_width, new_height), 'white')
282
+
283
+ # 裁剪图像 crop image
284
+ crop_box = (xmin, ymin, xmax, ymax)
285
+ cropped_img = pil_img.crop(crop_box)
286
+ new_image.paste(cropped_img, (paste_x, paste_y))
287
+ start_time = time.time()
288
+ logger.info("------------------table recognition processing begins-----------------")
289
+ latex_code = self.table_model.image2latex(new_image)[0]
290
+ end_time = time.time()
291
+ run_time = end_time - start_time
292
+ logger.info(f"------------table recognition processing ends within {run_time}s-----")
293
+ layout["latex"] = latex_code
294
+
252
295
  return layout_res
@@ -0,0 +1,22 @@
1
+ from struct_eqtable.model import StructTable
2
+ from pypandoc import convert_text
3
class StructTableModel:
    """Thin wrapper around ``StructTable`` that converts a table image to LaTeX or HTML."""

    def __init__(self, model_path, max_new_tokens=2048, max_time=400, device='cpu'):
        """Build the underlying ``StructTable`` model.

        Args:
            model_path: Passed to ``StructTable`` as its first argument.
            max_new_tokens: Maximum output tokens length.
            max_time: Timeout for processing, in seconds.
            device: When exactly ``'cuda'``, ``.cuda()`` is called on the model.
        """
        self.model_path = model_path
        self.max_new_tokens = max_new_tokens  # maximum output tokens length
        self.max_time = max_time  # timeout for processing in seconds
        model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
        # NOTE(review): only the exact string 'cuda' triggers .cuda(); a value
        # such as 'cuda:0' leaves the model untouched — confirm this is intended.
        self.model = model.cuda() if device == 'cuda' else model

    def image2latex(self, image):
        """Run the model on ``image`` and return its ``forward`` output unchanged.

        NOTE(review): the caller in pdf_extract_kit indexes the result with
        ``[0]``, so this is presumably a sequence of LaTeX strings rather than a
        single ``str`` — confirm against ``StructTable.forward``.
        """
        return self.model.forward(image)

    def image2html(self, image) -> str:
        """Recognize the table in ``image`` and convert its LaTeX to HTML.

        Returns:
            The HTML produced by ``pypandoc.convert_text`` from the LaTeX source.
        """
        table_latex = self.image2latex(image)
        # convert_text needs the source text itself; when the model hands back
        # a sequence (one entry per input image), convert its first entry.
        if not isinstance(table_latex, str):
            table_latex = table_latex[0]
        return convert_text(table_latex, 'html', format='latex')
@@ -2,8 +2,12 @@ config:
2
2
  device: cpu
3
3
  layout: True
4
4
  formula: True
5
+ table_config:
6
+ is_table_recog_enable: False
7
+ max_time: 400
5
8
 
6
9
  weights:
7
10
  layout: Layout/model_final.pth
8
11
  mfd: MFD/weights.pt
9
12
  mfr: MFR/UniMERNet
13
+ table: TabRec/StructEqTable
@@ -2,33 +2,16 @@ from abc import ABC, abstractmethod
2
2
 
3
3
 
4
4
  class AbsReaderWriter(ABC):
5
- """
6
- 同时支持二进制和文本读写的抽象类
7
- """
8
5
  MODE_TXT = "text"
9
6
  MODE_BIN = "binary"
10
-
11
- def __init__(self, parent_path):
12
- # 初始化代码可以在这里添加,如果需要的话
13
- self.parent_path = parent_path # 对于本地目录是父目录,对于s3是会写到这个path下。
14
-
15
7
  @abstractmethod
16
8
  def read(self, path: str, mode=MODE_TXT):
17
- """
18
- 无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
19
- """
20
9
  raise NotImplementedError
21
10
 
22
11
  @abstractmethod
23
12
  def write(self, content: str, path: str, mode=MODE_TXT):
24
- """
25
- 无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
26
- """
27
13
  raise NotImplementedError
28
14
 
29
15
  @abstractmethod
30
- def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
31
- """
32
- 无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
33
- """
16
+ def read_offset(self, path: str, offset=0, limit=None) -> bytes:
34
17
  raise NotImplementedError
@@ -3,34 +3,29 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
3
3
  from loguru import logger
4
4
 
5
5
 
6
- MODE_TXT = "text"
7
- MODE_BIN = "binary"
8
-
9
-
10
6
  class DiskReaderWriter(AbsReaderWriter):
11
-
12
7
  def __init__(self, parent_path, encoding="utf-8"):
13
8
  self.path = parent_path
14
9
  self.encoding = encoding
15
10
 
16
- def read(self, path, mode=MODE_TXT):
11
+ def read(self, path, mode=AbsReaderWriter.MODE_TXT):
17
12
  if os.path.isabs(path):
18
13
  abspath = path
19
14
  else:
20
15
  abspath = os.path.join(self.path, path)
21
16
  if not os.path.exists(abspath):
22
- logger.error(f"文件 {abspath} 不存在")
23
- raise Exception(f"文件 {abspath} 不存在")
24
- if mode == MODE_TXT:
17
+ logger.error(f"file {abspath} not exists")
18
+ raise Exception(f"file {abspath} no exists")
19
+ if mode == AbsReaderWriter.MODE_TXT:
25
20
  with open(abspath, "r", encoding=self.encoding) as f:
26
21
  return f.read()
27
- elif mode == MODE_BIN:
22
+ elif mode == AbsReaderWriter.MODE_BIN:
28
23
  with open(abspath, "rb") as f:
29
24
  return f.read()
30
25
  else:
31
26
  raise ValueError("Invalid mode. Use 'text' or 'binary'.")
32
27
 
33
- def write(self, content, path, mode=MODE_TXT):
28
+ def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
34
29
  if os.path.isabs(path):
35
30
  abspath = path
36
31
  else:
@@ -38,29 +33,42 @@ class DiskReaderWriter(AbsReaderWriter):
38
33
  directory_path = os.path.dirname(abspath)
39
34
  if not os.path.exists(directory_path):
40
35
  os.makedirs(directory_path)
41
- if mode == MODE_TXT:
36
+ if mode == AbsReaderWriter.MODE_TXT:
42
37
  with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
43
38
  f.write(content)
44
39
 
45
- elif mode == MODE_BIN:
40
+ elif mode == AbsReaderWriter.MODE_BIN:
46
41
  with open(abspath, "wb") as f:
47
42
  f.write(content)
48
43
  else:
49
44
  raise ValueError("Invalid mode. Use 'text' or 'binary'.")
50
45
 
51
- def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
52
- return self.read(path)
46
def read_offset(self, path: str, offset=0, limit=None) -> bytes:
    """Read raw bytes from a file, starting at a byte offset.

    Args:
        path: Absolute path, or a path relative to ``self.path``.
        offset: Byte position to seek to before reading.
        limit: Maximum number of bytes to read; ``None`` reads to end of file.

    Returns:
        The bytes read from the file.
    """
    # Absolute paths are used as given; relative ones resolve under self.path.
    abspath = path if os.path.isabs(path) else os.path.join(self.path, path)
    with open(abspath, "rb") as f:
        f.seek(offset)
        return f.read(limit)
53
53
 
54
54
 
55
- # 使用示例
56
55
  if __name__ == "__main__":
57
- file_path = "io/test/example.txt"
58
- drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
56
+ if 0:
57
+ file_path = "io/test/example.txt"
58
+ drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
59
+
60
+ # 写入内容到文件
61
+ drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
62
+
63
+ # 从文件读取内容
64
+ content = drw.read(path=file_path)
65
+ if content:
66
+ logger.info(f"从 {file_path} 读取的内容: {content}")
67
+ if 1:
68
+ drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
69
+ content_bin = drw.read_offset("1.txt")
70
+ assert content_bin == b"ABCD!"
59
71
 
60
- # 写入内容到文件
61
- drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
72
+ content_bin = drw.read_offset("1.txt", offset=1, limit=2)
73
+ assert content_bin == b"BC"
62
74
 
63
- # 从文件读取内容
64
- content = drw.read(path=file_path)
65
- if content:
66
- logger.info(f"从 {file_path} 读取的内容: {content}")
@@ -2,16 +2,18 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
2
2
  from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
3
3
  import boto3
4
4
  from loguru import logger
5
- from boto3.s3.transfer import TransferConfig
6
5
  from botocore.config import Config
7
- import os
8
-
9
- MODE_TXT = "text"
10
- MODE_BIN = "binary"
11
6
 
12
7
 
13
8
  class S3ReaderWriter(AbsReaderWriter):
14
- def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
9
+ def __init__(
10
+ self,
11
+ ak: str,
12
+ sk: str,
13
+ endpoint_url: str,
14
+ addressing_style: str = "auto",
15
+ parent_path: str = "",
16
+ ):
15
17
  self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
16
18
  self.path = parent_path
17
19
 
@@ -21,12 +23,14 @@ class S3ReaderWriter(AbsReaderWriter):
21
23
  aws_access_key_id=ak,
22
24
  aws_secret_access_key=sk,
23
25
  endpoint_url=endpoint_url,
24
- config=Config(s3={"addressing_style": addressing_style},
25
- retries={'max_attempts': 5, 'mode': 'standard'}),
26
+ config=Config(
27
+ s3={"addressing_style": addressing_style},
28
+ retries={"max_attempts": 5, "mode": "standard"},
29
+ ),
26
30
  )
27
31
  return s3_client
28
32
 
29
- def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
33
+ def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
30
34
  if s3_relative_path.startswith("s3://"):
31
35
  s3_path = s3_relative_path
32
36
  else:
@@ -34,22 +38,22 @@ class S3ReaderWriter(AbsReaderWriter):
34
38
  bucket_name, key = parse_bucket_key(s3_path)
35
39
  res = self.client.get_object(Bucket=bucket_name, Key=key)
36
40
  body = res["Body"].read()
37
- if mode == MODE_TXT:
41
+ if mode == AbsReaderWriter.MODE_TXT:
38
42
  data = body.decode(encoding) # Decode bytes to text
39
- elif mode == MODE_BIN:
43
+ elif mode == AbsReaderWriter.MODE_BIN:
40
44
  data = body
41
45
  else:
42
46
  raise ValueError("Invalid mode. Use 'text' or 'binary'.")
43
47
  return data
44
48
 
45
- def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
49
+ def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
46
50
  if s3_relative_path.startswith("s3://"):
47
51
  s3_path = s3_relative_path
48
52
  else:
49
53
  s3_path = join_path(self.path, s3_relative_path)
50
- if mode == MODE_TXT:
54
+ if mode == AbsReaderWriter.MODE_TXT:
51
55
  body = content.encode(encoding) # Encode text data as bytes
52
- elif mode == MODE_BIN:
56
+ elif mode == AbsReaderWriter.MODE_BIN:
53
57
  body = content
54
58
  else:
55
59
  raise ValueError("Invalid mode. Use 'text' or 'binary'.")
@@ -57,51 +61,82 @@ class S3ReaderWriter(AbsReaderWriter):
57
61
  self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
58
62
  logger.info(f"内容已写入 {s3_path} ")
59
63
 
60
- def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
64
+ def read_offset(self, path: str, offset=0, limit=None) -> bytes:
61
65
  if path.startswith("s3://"):
62
66
  s3_path = path
63
67
  else:
64
68
  s3_path = join_path(self.path, path)
65
69
  bucket_name, key = parse_bucket_key(s3_path)
66
70
 
67
- range_header = f'bytes={byte_start}-{byte_end}' if byte_end else f'bytes={byte_start}-'
71
+ range_header = (
72
+ f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
73
+ )
68
74
  res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
69
- body = res["Body"].read()
70
- if mode == MODE_TXT:
71
- data = body.decode(encoding) # Decode bytes to text
72
- elif mode == MODE_BIN:
73
- data = body
74
- else:
75
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
76
- return data
75
+ return res["Body"].read()
77
76
 
78
77
 
79
78
  if __name__ == "__main__":
80
- # Config the connection info
81
- ak = ""
82
- sk = ""
83
- endpoint_url = ""
84
- addressing_style = "auto"
85
- bucket_name = ""
86
- # Create an S3ReaderWriter object
87
- s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")
79
+ if 0:
80
+ # Config the connection info
81
+ ak = ""
82
+ sk = ""
83
+ endpoint_url = ""
84
+ addressing_style = "auto"
85
+ bucket_name = ""
86
+ # Create an S3ReaderWriter object
87
+ s3_reader_writer = S3ReaderWriter(
88
+ ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
89
+ )
88
90
 
89
- # Write text data to S3
90
- text_data = "This is some text data"
91
- s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
91
+ # Write text data to S3
92
+ text_data = "This is some text data"
93
+ s3_reader_writer.write(
94
+ text_data,
95
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
96
+ mode=AbsReaderWriter.MODE_TXT,
97
+ )
98
+
99
+ # Read text data from S3
100
+ text_data_read = s3_reader_writer.read(
101
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
102
+ )
103
+ logger.info(f"Read text data from S3: {text_data_read}")
104
+ # Write binary data to S3
105
+ binary_data = b"This is some binary data"
106
+ s3_reader_writer.write(
107
+ text_data,
108
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
109
+ mode=AbsReaderWriter.MODE_BIN,
110
+ )
92
111
 
93
- # Read text data from S3
94
- text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
95
- logger.info(f"Read text data from S3: {text_data_read}")
96
- # Write binary data to S3
97
- binary_data = b"This is some binary data"
98
- s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
112
+ # Read binary data from S3
113
+ binary_data_read = s3_reader_writer.read(
114
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
115
+ )
116
+ logger.info(f"Read binary data from S3: {binary_data_read}")
117
+
118
+ # Range Read text data from S3
119
+ binary_data_read = s3_reader_writer.read_offset(
120
+ path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
121
+ )
122
+ logger.info(f"Read binary data from S3: {binary_data_read}")
123
+ if 1:
124
+ import os
125
+ import json
99
126
 
100
- # Read binary data from S3
101
- binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
102
- logger.info(f"Read binary data from S3: {binary_data_read}")
127
+ ak = os.getenv("AK", "")
128
+ sk = os.getenv("SK", "")
129
+ endpoint_url = os.getenv("ENDPOINT", "")
130
+ bucket = os.getenv("S3_BUCKET", "")
131
+ prefix = os.getenv("S3_PREFIX", "")
132
+ key_basename = os.getenv("S3_KEY_BASENAME", "")
133
+ s3_reader_writer = S3ReaderWriter(
134
+ ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
135
+ )
136
+ content_bin = s3_reader_writer.read_offset(key_basename)
137
+ assert content_bin[:10] == b'{"track_id'
138
+ assert content_bin[-10:] == b'r":null}}\n'
103
139
 
104
- # Range Read text data from S3
105
- binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
106
- byte_start=0, byte_end=10, mode=MODE_BIN)
107
- logger.info(f"Read binary data from S3: {binary_data_read}")
140
+ content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
141
+ jso = json.dumps(content_bin.decode("utf-8"))
142
+ print(jso)
magic_pdf/tools/cli.py ADDED
@@ -0,0 +1,79 @@
1
+ import os
2
+ import click
3
+ from loguru import logger
4
+ from pathlib import Path
5
+
6
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
7
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
8
+ import magic_pdf.model as model_config
9
+ from magic_pdf.tools.common import parse_pdf_methods, do_parse
10
+ from magic_pdf.libs.version import __version__
11
+
12
+
13
+ @click.command()
14
+ @click.version_option(__version__, "--version", "-v", help="display the version and exit")
15
+ @click.option(
16
+ "-p",
17
+ "--path",
18
+ "path",
19
+ type=click.Path(exists=True),
20
+ required=True,
21
+ help="local pdf filepath or directory",
22
+ )
23
+ @click.option(
24
+ "-o",
25
+ "--output-dir",
26
+ "output_dir",
27
+ type=str,
28
+ help="output local directory",
29
+ default="",
30
+ )
31
+ @click.option(
32
+ "-m",
33
+ "--method",
34
+ "method",
35
+ type=parse_pdf_methods,
36
+ help="""the method for parsing pdf.
37
+ ocr: using ocr technique to extract information from pdf.
38
+ txt: suitable for the text-based pdf only and outperform ocr.
39
+ auto: automatically choose the best method for parsing pdf from ocr and txt.
40
+ without method specified, auto will be used by default.""",
41
+ default="auto",
42
+ )
43
+ def cli(path, output_dir, method):
44
+ model_config.__use_inside_model__ = True
45
+ model_config.__model_mode__ = "full"
46
+ if output_dir == "":
47
+ if os.path.isdir(path):
48
+ output_dir = os.path.join(path, "output")
49
+ else:
50
+ output_dir = os.path.join(os.path.dirname(path), "output")
51
+
52
+ def read_fn(path):
53
+ disk_rw = DiskReaderWriter(os.path.dirname(path))
54
+ return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
55
+
56
+ def parse_doc(doc_path: str):
57
+ try:
58
+ file_name = str(Path(doc_path).stem)
59
+ pdf_data = read_fn(doc_path)
60
+ do_parse(
61
+ output_dir,
62
+ file_name,
63
+ pdf_data,
64
+ [],
65
+ method,
66
+ )
67
+
68
+ except Exception as e:
69
+ logger.exception(e)
70
+
71
+ if os.path.isdir(path):
72
+ for doc_path in Path(path).glob("*.pdf"):
73
+ parse_doc(doc_path)
74
+ else:
75
+ parse_doc(path)
76
+
77
+
78
+ if __name__ == "__main__":
79
+ cli()