pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +91 -68
  31. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/utils/gotocr_pdf.py
@@ -18,11 +18,11 @@ from transformers import AutoTokenizer
 
  fitz = try_import("fitz")
 
- DEFAULT_IMAGE_TOKEN = "<image>" # nosec B105 - 这是技术常量,不是密码
- DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>" # nosec B105 - 这是技术常量,不是密码
+ DEFAULT_IMAGE_TOKEN = "<image>" # nosec B105 - technical const,not a passward
+ DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>" # nosec B105 - technical const,not a passward
 
- DEFAULT_IM_START_TOKEN = "<img>" # nosec B105 - 这是技术常量,不是密码
- DEFAULT_IM_END_TOKEN = "</img>" # nosec B105 - 这是技术常量,不是密码
+ DEFAULT_IM_START_TOKEN = "<img>" # nosec B105 - technical const,not a passward
+ DEFAULT_IM_END_TOKEN = "</img>" # nosec B105 - technical const,not a passward
 
  translation_table = str.maketrans(punctuation_dict)
 
@@ -50,7 +50,7 @@ def covert_pdf_to_image(image_path: str):
  with fitz.open(image_path) as pdf:
  for pg in range(0, pdf.page_count):
  page = pdf[pg]
- mat = fitz.Matrix(4, 4) # 全程放大四倍
+ mat = fitz.Matrix(4, 4) # Magnify by four times throughout the process
  pm = page.get_pixmap(matrix=mat, alpha=False)
  # if pm.width > 2000 or pm.height > 2000:
  # pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
@@ -64,16 +64,16 @@ def covert_pdf_to_image(image_path: str):
  output = "output"
  img_paths = []
  for index, pdf_img in enumerate(imgs):
- # 图片处理
+ # img processing
 
  gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)
 
- # 二值化处理
+ # Binarization processing
  _, binary_img = cv2.threshold(
  gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
  )
 
- # 去噪
+ # denoise
  filtered_img = cv2.medianBlur(binary_img, 3)
  processed_img = filtered_img
 
@@ -100,7 +100,7 @@ def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
  model_name = os.path.expanduser(model_path)
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
- # 加载模型
+ # load model
  model = GOTQwenForCausalLM.from_pretrained(
  model_name,
  low_cpu_mem_usage=True,
@@ -109,13 +109,13 @@ def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
  pad_token_id=151643,
  ).eval()
 
- # 确保模型和张量都移动到目标设备
+ # Ensure that both the model and the tensor are moved to the target device.
  device = torch.device(f"cuda:{gpu_id}")
  model.to(device=device, dtype=torch.bfloat16)
 
- # 确保分词器的输出也在目标设备上
- tokenizer.model_max_length = 512 # 设置最大长度,根据需要调整
- tokenizer.padding_side = "right" # 设置填充方向,根据需要调整
+ # Ensure that the output of the tokenizer is also on the target device.
+ tokenizer.model_max_length = 512 # maxlength,adjust to need
+ tokenizer.padding_side = "right" # padding side,adjust to need
 
  return model, tokenizer
 
datamax/utils/lifecycle_types.py
@@ -0,0 +1,18 @@
+ from enum import Enum
+
+
+ class LifeType(Enum):
+ # 数据处理阶段
+ DATA_PROCESSING = "DATA_PROCESSING" # 正在处理
+ DATA_PROCESSED = "DATA_PROCESSED" # 处理完成
+ DATA_PROCESS_FAILED = "DATA_PROCESS_FAILED" # 处理失败
+
+ # 数据清洗阶段
+ DATA_CLEANING = "DATA_CLEANING" # 正在清洗
+ DATA_CLEANED = "DATA_CLEANED" # 清洗完成
+ DATA_CLEAN_FAILED = "DATA_CLEAN_FAILED" # 清洗失败
+
+ # 数据标注阶段
+ DATA_LABELLING = "DATA_LABELLING" # 正在标注
+ DATA_LABELLED = "DATA_LABELLED" # 标注完成
+ DATA_LABEL_FAILED = "DATA_LABEL_FAILED" # 标注失败
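
The new lifecycle_types.py module adds a LifeType enum covering the processing, cleaning and labelling stages. A minimal sketch of how such an enum could be consumed, assuming a caller that keeps a plain per-document dict; the record_stage helper below is illustrative and not part of the package:

from datamax.utils.lifecycle_types import LifeType

def record_stage(record: dict, stage: LifeType) -> dict:
    # Append the stage's string value (e.g. "DATA_CLEANED") to a hypothetical lifecycle log.
    record.setdefault("lifecycle", []).append(stage.value)
    return record

doc = record_stage({}, LifeType.DATA_PROCESSING)
doc = record_stage(doc, LifeType.DATA_PROCESSED)
print(doc["lifecycle"])  # ['DATA_PROCESSING', 'DATA_PROCESSED']
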
datamax/utils/mineru_operator.py
@@ -1,8 +1,9 @@
  import os
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+
+ from magic_pdf.config.enums import SupportedPdfParseMethod
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
  from magic_pdf.data.dataset import PymuDocDataset
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
- from magic_pdf.config.enums import SupportedPdfParseMethod
 
 
  class PdfProcessor:
@@ -31,32 +32,33 @@ class PdfProcessor:
 
  # 处理流程
  ds = PymuDocDataset(pdf_bytes)
- markdown_path = os.path.join(local_md_dir, f"{name_without_suff}.md") # 完整路径
- image_dir = os.path.basename(local_image_dir) # 保持相对路径为 "images"
+ markdown_path = os.path.join(
+ local_md_dir, f"{name_without_suff}.md"
+ ) # absolute path
+ image_dir = os.path.basename(local_image_dir) # keep relative path as "images"
 
  if ds.classify() == SupportedPdfParseMethod.OCR:
  ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
- md_writer,
- os.path.basename(markdown_path), # 文件名部分
- image_dir
+ md_writer, os.path.basename(markdown_path), image_dir # filename
  )
  else:
  ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
- md_writer,
- os.path.basename(markdown_path), # 文件名部分
- image_dir
+ md_writer, os.path.basename(markdown_path), image_dir # filename
  )
 
- with open(markdown_path, "r", encoding='utf-8') as f:
+ with open(markdown_path, "r", encoding="utf-8") as f:
  markdown_content = f.read()
  return markdown_content
 
 
+
  pdf_processor = PdfProcessor()
 
- # 使用示例
+ # usage example
  if __name__ == "__main__":
  # pdf_processor = PdfProcessor()
- print(pdf_processor.process_pdf(
- "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
- ))
+ print(
+ pdf_processor.process_pdf(
+ "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
+ )
+ )
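
mineru_operator.py keeps its module-level pdf_processor singleton and the process_pdf entry point; the changes above are formatting plus the reordered magic_pdf imports. A brief usage sketch, assuming a local PDF instead of the hard-coded server path in the __main__ block; the file name is a placeholder:

from datamax.utils.mineru_operator import pdf_processor

# process_pdf runs doc_analyze in OCR or text mode (per ds.classify()) and
# returns the generated Markdown; "./sample.pdf" is an assumed path.
markdown = pdf_processor.process_pdf("./sample.pdf")
print(markdown[:200])
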
datamax/utils/paddleocr_pdf_operator.py
@@ -1,22 +1,24 @@
  """layout_recovery"""
+
  import os
  import pathlib
  import sys
+ from copy import deepcopy
+ from datetime import datetime
+
  import cv2
  import numpy as np
  from PIL import Image
- from copy import deepcopy
- from datetime import datetime
 
- os.environ['KMP_DUPLICATE_LIB_OK']='True'
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
  ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
  sys.path.append(str(ROOT_DIR))
 
  from paddle.utils import try_import
  from paddleocr import PPStructure, save_structure_res
- sys.path.append('/usr/local/lib/python3.10/dist-packages/paddleocr')
- from ppstructure.recovery.recovery_to_doc import convert_info_docx
- from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
+
+ sys.path.append("/usr/local/lib/python3.10/dist-packages/paddleocr")
+ from ppstructure.recovery.recovery_to_doc import convert_info_docx, sorted_layout_boxes
 
 
  def recovery(img_path, output, use_gpu, gpu_id):
@@ -48,20 +50,24 @@ def recovery(img_path, output, use_gpu, gpu_id):
  img_paths = []
  for index, pdf_img in enumerate(imgs):
  os.makedirs(os.path.join(output, img_name), exist_ok=True)
- pdf_img_path = os.path.join(output, img_name, img_name + "_" + str(index) + ".jpg")
+ pdf_img_path = os.path.join(
+ output, img_name, img_name + "_" + str(index) + ".jpg"
+ )
  cv2.imwrite(pdf_img_path, pdf_img)
  img_paths.append([pdf_img_path, pdf_img])
 
  # step3: Convert images to DOCX
  all_res = []
- engine = PPStructure(recovery=True,
- use_gpu=use_gpu,
- gpu_id=gpu_id,
- det_model_dir=f'{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer',
- rec_model_dir=f'{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer',
- table_model_dir=f'{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer',
- layout_model_dir=f'{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer',
- formula_model_dir=f'{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer')
+ engine = PPStructure(
+ recovery=True,
+ use_gpu=use_gpu,
+ gpu_id=gpu_id,
+ det_model_dir=f"{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer",
+ rec_model_dir=f"{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer",
+ table_model_dir=f"{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer",
+ layout_model_dir=f"{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer",
+ formula_model_dir=f"{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer",
+ )
  for index, (new_img_path, imgs) in enumerate(img_paths):
  print("processing {}/{} page:".format(index + 1, len(img_paths)))
  result = engine(imgs, img_idx=index)
@@ -72,18 +78,27 @@ def recovery(img_path, output, use_gpu, gpu_id):
  all_res += result_sorted
  try:
  convert_info_docx(imgs, all_res, output, img_name)
- os.rename(f'./output/{img_name}_ocr.docx',
- f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx')
+ os.rename(
+ f"./output/{img_name}_ocr.docx",
+ f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx',
+ )
  except Exception as e:
  raise e
 
 
- def use_paddleocr(input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6):
+ def use_paddleocr(
+ input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6
+ ):
  try:
  if not os.path.exists(output_files):
  os.makedirs(output_files)
  try:
- recovery(img_path=input_files, output=output_files, use_gpu=use_gpu, gpu_id=gpu_id)
+ recovery(
+ img_path=input_files,
+ output=output_files,
+ use_gpu=use_gpu,
+ gpu_id=gpu_id,
+ )
  except Exception as e:
  raise e
  except Exception as e:
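
The use_paddleocr entry point is only re-wrapped here and its signature is unchanged. A minimal call sketch, assuming a local PDF and an output directory; both paths are placeholders, not taken from the diff:

from datamax.utils.paddleocr_pdf_operator import use_paddleocr

# Recover the layout of ./sample.pdf into ./recovered via PPStructure; CPU-only in this sketch.
use_paddleocr(input_files="./sample.pdf", output_files="./recovered", use_gpu=False)
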
datamax/utils/ppt_extract.py
@@ -1,17 +1,18 @@
  import os
- from loguru import logger
  from functools import lru_cache
- from typing import List, Dict, Union
  from pathlib import Path
+ from typing import Dict, List, Union
+
+ from loguru import logger
  from PIL.Image import Image
  from pptx import Presentation
  from pptx.enum.shapes import MSO_SHAPE_TYPE
  from pptx.shapes.base import BaseShape as Shape
- from pptx.shapes.picture import Picture
  from pptx.shapes.graphfrm import GraphicFrame
  from pptx.shapes.group import GroupShape
+ from pptx.shapes.picture import Picture
  from pptx.slide import Slide
- from pptx.table import Table, _Row, _Cell
+ from pptx.table import Table, _Cell, _Row
  from werkzeug.datastructures import FileStorage
 
 
@@ -25,7 +26,15 @@ class PPtExtractor:
  raise ValueError("img_name must be a string")
  return f"media/{id}/{img_name}"
 
- def handle_shape(self, shape: Shape, content_list: List[Dict[str, str]], media_dir: Path, img_map: Dict[Path, str], id: str, skip_image: bool):
+ def handle_shape(
+ self,
+ shape: Shape,
+ content_list: List[Dict[str, str]],
+ media_dir: Path,
+ img_map: Dict[Path, str],
+ id: str,
+ skip_image: bool,
+ ):
  if not isinstance(shape, Shape):
  raise ValueError("Invalid shape object")
  if not isinstance(content_list, list):
@@ -53,7 +62,9 @@ class PPtExtractor:
  shape: Picture
  image: Image = shape.image
  image_bytes = image.blob
- img_path = media_dir.resolve().joinpath(f"pic-{len(img_map)}.{image.ext}")
+ img_path = media_dir.resolve().joinpath(
+ f"pic-{len(img_map)}.{image.ext}"
+ )
  if not media_dir.exists():
  media_dir.mkdir(parents=True, exist_ok=True)
  if not os.access(media_dir, os.W_OK):
@@ -76,14 +87,18 @@ class PPtExtractor:
  md += "\n|"
  for col in row.cells:
  cell: _Cell = col
- md += " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
+ md += (
+ " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
+ )
  md += "\n"
  md += "\n"
  content_list.append({"type": "md", "data": md})
  elif shape_type == MSO_SHAPE_TYPE.GROUP:
  shape: GroupShape
  for sub_shape in shape.shapes:
- self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image)
+ self.handle_shape(
+ sub_shape, content_list, media_dir, img_map, id, skip_image
+ )
  else:
  logger.info(f"Unknown shape type: {shape_type}, {type(shape)}")
  except PermissionError as pe:
@@ -93,8 +108,14 @@ class PPtExtractor:
  except Exception as e:
  logger.error(f"Error handling shape: {e}")
 
- def extract(self, presentation_source: Union[FileStorage, Path], id: str, dir: Path, media_dir: Path,
- skip_image: bool):
+ def extract(
+ self,
+ presentation_source: Union[FileStorage, Path],
+ id: str,
+ dir: Path,
+ media_dir: Path,
+ skip_image: bool,
+ ):
  if not isinstance(presentation_source, (FileStorage, Path)):
  raise ValueError("presentation_source must be a FileStorage or Path object")
  if not isinstance(id, str):
@@ -115,7 +136,9 @@ class PPtExtractor:
  slide: Slide
  page = {"page_no": page_no, "content_list": []}
  for shape in slide.shapes:
- self.handle_shape(shape, page["content_list"], media_dir, img_map, id, skip_image)
+ self.handle_shape(
+ shape, page["content_list"], media_dir, img_map, id, skip_image
+ )
  pages.append(page)
  except FileNotFoundError as fnfe:
  logger.error(f"File not found: {fnfe}")
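
The reformatted extract signature takes a FileStorage or Path source plus an id, an output dir, a media dir and a skip_image flag. A hedged sketch of a call, assuming PPtExtractor can be constructed without arguments and that extract returns the per-slide pages list assembled in this hunk; the paths and id are placeholders:

from pathlib import Path

from datamax.utils.ppt_extract import PPtExtractor

extractor = PPtExtractor()
pages = extractor.extract(
    Path("deck.pptx"),  # placeholder presentation
    id="demo",
    dir=Path("out"),
    media_dir=Path("out/media"),
    skip_image=True,  # skip exporting embedded pictures
)
print(pages[0]["page_no"], len(pages[0]["content_list"]))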