pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
  4. datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +525 -61
  10. datamax/parser/docx_parser.py +512 -62
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -208
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. pydatamax-0.1.15.dist-info/METADATA +340 -0
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.13.dist-info/METADATA +0 -280
  38. pydatamax-0.1.13.dist-info/RECORD +0 -39
  39. tests/__init__.py +0 -0
  40. tests/test_basic.py +0 -20
  41. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/utils/paddleocr_pdf_operator.py +90 -90
@@ -1,90 +1,90 @@
1
- """layout_recovery"""
2
- import os
3
- import pathlib
4
- import sys
5
- import cv2
6
- import numpy as np
7
- from PIL import Image
8
- from copy import deepcopy
9
- from datetime import datetime
10
-
11
- os.environ['KMP_DUPLICATE_LIB_OK']='True'
12
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
13
- sys.path.append(str(ROOT_DIR))
14
-
15
- from paddle.utils import try_import
16
- from paddleocr import PPStructure, save_structure_res
17
- sys.path.append('/usr/local/lib/python3.10/dist-packages/paddleocr')
18
- from ppstructure.recovery.recovery_to_doc import convert_info_docx
19
- from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
20
-
21
-
22
- def recovery(img_path, output, use_gpu, gpu_id):
23
- """
24
- Convert a PDF file to a Word document with layout recovery.
25
-
26
- :param img_path: Path to the PDF file
27
- :param output: Path to the output folder
28
- """
29
- fitz = try_import("fitz")
30
-
31
- # step1: Convert PDF to images
32
- imgs = []
33
- with fitz.open(img_path) as pdf:
34
- for pg in range(0, pdf.page_count):
35
- page = pdf[pg]
36
- mat = fitz.Matrix(2, 2)
37
- pm = page.get_pixmap(matrix=mat, alpha=False)
38
- if pm.width > 2000 or pm.height > 2000:
39
- pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
40
-
41
- img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
42
- img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
43
- imgs.append(img)
44
-
45
- img_name = datetime.now().strftime("%Y%m%d%H%M%S")
46
-
47
- # step2: Process images
48
- img_paths = []
49
- for index, pdf_img in enumerate(imgs):
50
- os.makedirs(os.path.join(output, img_name), exist_ok=True)
51
- pdf_img_path = os.path.join(output, img_name, img_name + "_" + str(index) + ".jpg")
52
- cv2.imwrite(pdf_img_path, pdf_img)
53
- img_paths.append([pdf_img_path, pdf_img])
54
-
55
- # step3: Convert images to DOCX
56
- all_res = []
57
- engine = PPStructure(recovery=True,
58
- use_gpu=use_gpu,
59
- gpu_id=gpu_id,
60
- det_model_dir=f'{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer',
61
- rec_model_dir=f'{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer',
62
- table_model_dir=f'{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer',
63
- layout_model_dir=f'{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer',
64
- formula_model_dir=f'{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer')
65
- for index, (new_img_path, imgs) in enumerate(img_paths):
66
- print("processing {}/{} page:".format(index + 1, len(img_paths)))
67
- result = engine(imgs, img_idx=index)
68
- save_structure_res(result, output, img_name, index)
69
- h, w, _ = imgs.shape
70
- result_cp = deepcopy(result)
71
- result_sorted = sorted_layout_boxes(result_cp, w)
72
- all_res += result_sorted
73
- try:
74
- convert_info_docx(imgs, all_res, output, img_name)
75
- os.rename(f'./output/{img_name}_ocr.docx',
76
- f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx')
77
- except Exception as e:
78
- raise e
79
-
80
-
81
- def use_paddleocr(input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6):
82
- try:
83
- if not os.path.exists(output_files):
84
- os.makedirs(output_files)
85
- try:
86
- recovery(img_path=input_files, output=output_files, use_gpu=use_gpu, gpu_id=gpu_id)
87
- except Exception as e:
88
- raise e
89
- except Exception as e:
90
- raise e
1
+ """layout_recovery"""
2
+ import os
3
+ import pathlib
4
+ import sys
5
+ import cv2
6
+ import numpy as np
7
+ from PIL import Image
8
+ from copy import deepcopy
9
+ from datetime import datetime
10
+
11
+ os.environ['KMP_DUPLICATE_LIB_OK']='True'
12
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
13
+ sys.path.append(str(ROOT_DIR))
14
+
15
+ from paddle.utils import try_import
16
+ from paddleocr import PPStructure, save_structure_res
17
+ sys.path.append('/usr/local/lib/python3.10/dist-packages/paddleocr')
18
+ from ppstructure.recovery.recovery_to_doc import convert_info_docx
19
+ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
20
+
21
+
22
+ def recovery(img_path, output, use_gpu, gpu_id):
23
+ """
24
+ Convert a PDF file to a Word document with layout recovery.
25
+
26
+ :param img_path: Path to the PDF file
27
+ :param output: Path to the output folder
28
+ """
29
+ fitz = try_import("fitz")
30
+
31
+ # step1: Convert PDF to images
32
+ imgs = []
33
+ with fitz.open(img_path) as pdf:
34
+ for pg in range(0, pdf.page_count):
35
+ page = pdf[pg]
36
+ mat = fitz.Matrix(2, 2)
37
+ pm = page.get_pixmap(matrix=mat, alpha=False)
38
+ if pm.width > 2000 or pm.height > 2000:
39
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
40
+
41
+ img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
42
+ img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
43
+ imgs.append(img)
44
+
45
+ img_name = datetime.now().strftime("%Y%m%d%H%M%S")
46
+
47
+ # step2: Process images
48
+ img_paths = []
49
+ for index, pdf_img in enumerate(imgs):
50
+ os.makedirs(os.path.join(output, img_name), exist_ok=True)
51
+ pdf_img_path = os.path.join(output, img_name, img_name + "_" + str(index) + ".jpg")
52
+ cv2.imwrite(pdf_img_path, pdf_img)
53
+ img_paths.append([pdf_img_path, pdf_img])
54
+
55
+ # step3: Convert images to DOCX
56
+ all_res = []
57
+ engine = PPStructure(recovery=True,
58
+ use_gpu=use_gpu,
59
+ gpu_id=gpu_id,
60
+ det_model_dir=f'{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer',
61
+ rec_model_dir=f'{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer',
62
+ table_model_dir=f'{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer',
63
+ layout_model_dir=f'{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer',
64
+ formula_model_dir=f'{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer')
65
+ for index, (new_img_path, imgs) in enumerate(img_paths):
66
+ print("processing {}/{} page:".format(index + 1, len(img_paths)))
67
+ result = engine(imgs, img_idx=index)
68
+ save_structure_res(result, output, img_name, index)
69
+ h, w, _ = imgs.shape
70
+ result_cp = deepcopy(result)
71
+ result_sorted = sorted_layout_boxes(result_cp, w)
72
+ all_res += result_sorted
73
+ try:
74
+ convert_info_docx(imgs, all_res, output, img_name)
75
+ os.rename(f'./output/{img_name}_ocr.docx',
76
+ f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx')
77
+ except Exception as e:
78
+ raise e
79
+
80
+
81
+ def use_paddleocr(input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6):
82
+ try:
83
+ if not os.path.exists(output_files):
84
+ os.makedirs(output_files)
85
+ try:
86
+ recovery(img_path=input_files, output=output_files, use_gpu=use_gpu, gpu_id=gpu_id)
87
+ except Exception as e:
88
+ raise e
89
+ except Exception as e:
90
+ raise e
datamax/utils/ppt_extract.py +140 -140
@@ -1,140 +1,140 @@
1
- import os
2
- import logging
3
- from functools import lru_cache
4
- from typing import List, Dict, Union
5
- from pathlib import Path
6
- from PIL.Image import Image
7
- from pptx import Presentation
8
- from pptx.enum.shapes import MSO_SHAPE_TYPE
9
- from pptx.shapes.base import BaseShape as Shape
10
- from pptx.shapes.picture import Picture
11
- from pptx.shapes.graphfrm import GraphicFrame
12
- from pptx.shapes.group import GroupShape
13
- from pptx.slide import Slide
14
- from pptx.table import Table, _Row, _Cell
15
- from werkzeug.datastructures import FileStorage
16
-
17
-
18
- class PPtExtractor:
19
-
20
- @lru_cache(maxsize=128)
21
- def generate_img_path(self, id: str, img_name: str) -> str:
22
- if not isinstance(id, str):
23
- raise ValueError("id must be a string")
24
- if not isinstance(img_name, str):
25
- raise ValueError("img_name must be a string")
26
- return f"media/{id}/{img_name}"
27
-
28
- def handle_shape(self, shape: Shape, content_list: List[Dict[str, str]], media_dir: Path, img_map: Dict[Path, str], id: str, skip_image: bool):
29
- if not isinstance(shape, Shape):
30
- raise ValueError("Invalid shape object")
31
- if not isinstance(content_list, list):
32
- raise ValueError("content_list must be a list")
33
- if not isinstance(media_dir, Path):
34
- raise ValueError("media_dir must be a Path object")
35
- if not isinstance(img_map, dict):
36
- raise ValueError("img_map must be a dictionary")
37
- if not isinstance(id, str):
38
- raise ValueError("id must be a string")
39
- if not isinstance(skip_image, bool):
40
- raise ValueError("skip_image must be a boolean")
41
-
42
- try:
43
- shape_type = shape.shape_type
44
- if shape.has_text_frame:
45
- for paragraph in shape.text_frame.paragraphs:
46
- content_list.append(
47
- {
48
- "type": "text",
49
- "data": paragraph.text + "\n",
50
- }
51
- )
52
- elif shape_type == MSO_SHAPE_TYPE.PICTURE and not skip_image:
53
- shape: Picture
54
- image: Image = shape.image
55
- image_bytes = image.blob
56
- img_path = media_dir.resolve().joinpath(f"pic-{len(img_map)}.{image.ext}")
57
- if not media_dir.exists():
58
- media_dir.mkdir(parents=True, exist_ok=True)
59
- if not os.access(media_dir, os.W_OK):
60
- raise PermissionError(f"Cannot write to directory: {media_dir}")
61
- img_s3_path = self.generate_img_path(id, img_path.name)
62
- img_map[img_path] = img_s3_path
63
- content_list.append({"type": "image", "data": img_s3_path})
64
- with open(img_path, "wb") as file:
65
- file.write(image_bytes)
66
- elif shape_type == MSO_SHAPE_TYPE.TABLE:
67
- shape: GraphicFrame
68
- table: Table = shape.table
69
- md = "\n"
70
- for row_no, row in enumerate(table.rows):
71
- row: _Row
72
- md += "|"
73
- if row_no == 1:
74
- for col in row.cells:
75
- md += "---|"
76
- md += "\n|"
77
- for col in row.cells:
78
- cell: _Cell = col
79
- md += " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
80
- md += "\n"
81
- md += "\n"
82
- content_list.append({"type": "md", "data": md})
83
- elif shape_type == MSO_SHAPE_TYPE.GROUP:
84
- shape: GroupShape
85
- for sub_shape in shape.shapes:
86
- self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image)
87
- else:
88
- logging.info(f"Unknown shape type: {shape_type}, {type(shape)}")
89
- except PermissionError as pe:
90
- logging.error(f"Permission error: {pe}")
91
- except IOError as ie:
92
- logging.error(f"IO error: {ie}")
93
- except Exception as e:
94
- logging.error(f"Error handling shape: {e}")
95
-
96
- def extract(self, presentation_source: Union[FileStorage, Path], id: str, dir: Path, media_dir: Path,
97
- skip_image: bool):
98
- if not isinstance(presentation_source, (FileStorage, Path)):
99
- raise ValueError("presentation_source must be a FileStorage or Path object")
100
- if not isinstance(id, str):
101
- raise ValueError("id must be a string")
102
- if not isinstance(dir, Path):
103
- raise ValueError("dir must be a Path object")
104
- if not isinstance(media_dir, Path):
105
- raise ValueError("media_dir must be a Path object")
106
- if not isinstance(skip_image, bool):
107
- raise ValueError("skip_image must be a boolean")
108
-
109
- pages = []
110
- img_map = {}
111
-
112
- try:
113
- presentation: Presentation = Presentation(presentation_source)
114
- for page_no, slide in enumerate(presentation.slides):
115
- slide: Slide
116
- page = {"page_no": page_no, "content_list": []}
117
- for shape in slide.shapes:
118
- self.handle_shape(shape, page["content_list"], media_dir, img_map, id, skip_image)
119
- pages.append(page)
120
- except FileNotFoundError as fnfe:
121
- logging.error(f"File not found: {fnfe}")
122
- except PermissionError as pe:
123
- logging.error(f"Permission error: {pe}")
124
- except IOError as ie:
125
- logging.error(f"IO error: {ie}")
126
- except Exception as e:
127
- logging.error(f"Error extracting presentation: {e}")
128
-
129
- return pages
130
-
131
- def run(self, id: str, file_path: Path, skip_image: bool = False):
132
- if not isinstance(id, str):
133
- raise ValueError("id must be a string")
134
- if not isinstance(file_path, Path):
135
- raise ValueError("file_path must be a Path object")
136
- if not isinstance(skip_image, bool):
137
- raise ValueError("skip_image must be a boolean")
138
-
139
- media_dir = Path("media").resolve()
140
- return self.extract(file_path, id, Path("."), media_dir, skip_image)
1
+ import os
2
+ from loguru import logger
3
+ from functools import lru_cache
4
+ from typing import List, Dict, Union
5
+ from pathlib import Path
6
+ from PIL.Image import Image
7
+ from pptx import Presentation
8
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
9
+ from pptx.shapes.base import BaseShape as Shape
10
+ from pptx.shapes.picture import Picture
11
+ from pptx.shapes.graphfrm import GraphicFrame
12
+ from pptx.shapes.group import GroupShape
13
+ from pptx.slide import Slide
14
+ from pptx.table import Table, _Row, _Cell
15
+ from werkzeug.datastructures import FileStorage
16
+
17
+
18
+ class PPtExtractor:
19
+
20
+ @lru_cache(maxsize=128)
21
+ def generate_img_path(self, id: str, img_name: str) -> str:
22
+ if not isinstance(id, str):
23
+ raise ValueError("id must be a string")
24
+ if not isinstance(img_name, str):
25
+ raise ValueError("img_name must be a string")
26
+ return f"media/{id}/{img_name}"
27
+
28
+ def handle_shape(self, shape: Shape, content_list: List[Dict[str, str]], media_dir: Path, img_map: Dict[Path, str], id: str, skip_image: bool):
29
+ if not isinstance(shape, Shape):
30
+ raise ValueError("Invalid shape object")
31
+ if not isinstance(content_list, list):
32
+ raise ValueError("content_list must be a list")
33
+ if not isinstance(media_dir, Path):
34
+ raise ValueError("media_dir must be a Path object")
35
+ if not isinstance(img_map, dict):
36
+ raise ValueError("img_map must be a dictionary")
37
+ if not isinstance(id, str):
38
+ raise ValueError("id must be a string")
39
+ if not isinstance(skip_image, bool):
40
+ raise ValueError("skip_image must be a boolean")
41
+
42
+ try:
43
+ shape_type = shape.shape_type
44
+ if shape.has_text_frame:
45
+ for paragraph in shape.text_frame.paragraphs:
46
+ content_list.append(
47
+ {
48
+ "type": "text",
49
+ "data": paragraph.text + "\n",
50
+ }
51
+ )
52
+ elif shape_type == MSO_SHAPE_TYPE.PICTURE and not skip_image:
53
+ shape: Picture
54
+ image: Image = shape.image
55
+ image_bytes = image.blob
56
+ img_path = media_dir.resolve().joinpath(f"pic-{len(img_map)}.{image.ext}")
57
+ if not media_dir.exists():
58
+ media_dir.mkdir(parents=True, exist_ok=True)
59
+ if not os.access(media_dir, os.W_OK):
60
+ raise PermissionError(f"Cannot write to directory: {media_dir}")
61
+ img_s3_path = self.generate_img_path(id, img_path.name)
62
+ img_map[img_path] = img_s3_path
63
+ content_list.append({"type": "image", "data": img_s3_path})
64
+ with open(img_path, "wb") as file:
65
+ file.write(image_bytes)
66
+ elif shape_type == MSO_SHAPE_TYPE.TABLE:
67
+ shape: GraphicFrame
68
+ table: Table = shape.table
69
+ md = "\n"
70
+ for row_no, row in enumerate(table.rows):
71
+ row: _Row
72
+ md += "|"
73
+ if row_no == 1:
74
+ for col in row.cells:
75
+ md += "---|"
76
+ md += "\n|"
77
+ for col in row.cells:
78
+ cell: _Cell = col
79
+ md += " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
80
+ md += "\n"
81
+ md += "\n"
82
+ content_list.append({"type": "md", "data": md})
83
+ elif shape_type == MSO_SHAPE_TYPE.GROUP:
84
+ shape: GroupShape
85
+ for sub_shape in shape.shapes:
86
+ self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image)
87
+ else:
88
+ logger.info(f"Unknown shape type: {shape_type}, {type(shape)}")
89
+ except PermissionError as pe:
90
+ logger.error(f"Permission error: {pe}")
91
+ except IOError as ie:
92
+ logger.error(f"IO error: {ie}")
93
+ except Exception as e:
94
+ logger.error(f"Error handling shape: {e}")
95
+
96
+ def extract(self, presentation_source: Union[FileStorage, Path], id: str, dir: Path, media_dir: Path,
97
+ skip_image: bool):
98
+ if not isinstance(presentation_source, (FileStorage, Path)):
99
+ raise ValueError("presentation_source must be a FileStorage or Path object")
100
+ if not isinstance(id, str):
101
+ raise ValueError("id must be a string")
102
+ if not isinstance(dir, Path):
103
+ raise ValueError("dir must be a Path object")
104
+ if not isinstance(media_dir, Path):
105
+ raise ValueError("media_dir must be a Path object")
106
+ if not isinstance(skip_image, bool):
107
+ raise ValueError("skip_image must be a boolean")
108
+
109
+ pages = []
110
+ img_map = {}
111
+
112
+ try:
113
+ presentation: Presentation = Presentation(presentation_source)
114
+ for page_no, slide in enumerate(presentation.slides):
115
+ slide: Slide
116
+ page = {"page_no": page_no, "content_list": []}
117
+ for shape in slide.shapes:
118
+ self.handle_shape(shape, page["content_list"], media_dir, img_map, id, skip_image)
119
+ pages.append(page)
120
+ except FileNotFoundError as fnfe:
121
+ logger.error(f"File not found: {fnfe}")
122
+ except PermissionError as pe:
123
+ logger.error(f"Permission error: {pe}")
124
+ except IOError as ie:
125
+ logger.error(f"IO error: {ie}")
126
+ except Exception as e:
127
+ logger.error(f"Error extracting presentation: {e}")
128
+
129
+ return pages
130
+
131
+ def run(self, id: str, file_path: Path, skip_image: bool = False):
132
+ if not isinstance(id, str):
133
+ raise ValueError("id must be a string")
134
+ if not isinstance(file_path, Path):
135
+ raise ValueError("file_path must be a Path object")
136
+ if not isinstance(skip_image, bool):
137
+ raise ValueError("skip_image must be a boolean")
138
+
139
+ media_dir = Path("media").resolve()
140
+ return self.extract(file_path, id, Path("."), media_dir, skip_image)