pydatamax 0.1.5__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/OssHandler.py +85 -51
- datamax/parser/__init__.py +1 -1
- datamax/parser/base.py +2 -2
- datamax/parser/core.py +205 -31
- datamax/parser/doc_parser.py +2 -5
- datamax/parser/docx_parser.py +3 -6
- datamax/parser/epub_parser.py +2 -5
- datamax/parser/html_parser.py +2 -5
- datamax/parser/image_parser.py +18 -14
- datamax/parser/md_parser.py +67 -4
- datamax/parser/pdf_parser.py +59 -20
- datamax/parser/ppt_parser.py +3 -5
- datamax/parser/pptx_parser.py +10 -13
- datamax/parser/txt_parser.py +2 -5
- datamax/parser/xls_parser.py +26 -0
- datamax/parser/xlsx_parser.py +65 -4
- datamax/utils/__init__.py +1 -0
- datamax/utils/constants.py +58 -0
- datamax/utils/data_cleaner.py +45 -28
- datamax/utils/env_setup.py +80 -0
- datamax/utils/gotocr_pdf.py +265 -0
- datamax/utils/mineru_operator.py +62 -0
- datamax/utils/paddleocr_pdf_operator.py +2 -1
- datamax/utils/qa_generator.py +376 -0
- datamax/utils/tokenizer.py +1 -1
- pydatamax-0.1.12.dist-info/METADATA +281 -0
- pydatamax-0.1.12.dist-info/RECORD +39 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.12.dist-info}/WHEEL +1 -1
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.12.dist-info/licenses}/LICENSE +0 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.12.dist-info}/top_level.txt +1 -0
- tests/__init__.py +0 -0
- tests/test_basic.py +20 -0
- pydatamax-0.1.5.dist-info/METADATA +0 -282
- pydatamax-0.1.5.dist-info/RECORD +0 -31
datamax/__init__.py
CHANGED
@@ -1 +1 @@
-from .parser import
+from .parser import DataMax
datamax/loader/OssHandler.py
CHANGED
@@ -1,9 +1,11 @@
+import datetime
 import os
+import subprocess
+
 import oss2
-import datetime
-from tqdm import tqdm
-from loguru import logger
 from dotenv import load_dotenv
+from loguru import logger
+from tqdm import tqdm

 load_dotenv()

@@ -11,8 +13,9 @@ load_dotenv()
 def removing(path):
     for root, dirs, files in os.walk(path):
         for dir in dirs:
-            if dir ==
-
+            if dir == "__pycache__":
+                pycache_path = os.path.join(root, dir)
+                subprocess.run(["rm", "-rf", pycache_path], check=False)


 def format_size_adaptive(value):
@@ -25,103 +28,125 @@ def format_size_adaptive(value):


 def format_datetime_into_isoformat(date_time: datetime.datetime) -> str:
-    return
-
+    return (
+        date_time.replace(tzinfo=datetime.timezone.utc)
+        .isoformat()
+        .replace("+00:00", "Z")
+    )


 class OssClient:
-    def __init__(
+    def __init__(
+        self, oss_access_key_id, oss_access_key_secret, oss_endpoint, oss_bucket_name
+    ):
         self.bucket_name = oss_bucket_name
-        self.auth = oss2.Auth(
-
+        self.auth = oss2.Auth(
+            os.getenv("OSS_ACCESS_KEY_ID", oss_access_key_id),
+            os.getenv("OSS_ACCESS_KEY_SECRET", oss_access_key_secret),
+        )
         self.endpoint = os.getenv("OSS_ENDPOINT", oss_endpoint)
-        self.bucket = oss2.Bucket(
+        self.bucket = oss2.Bucket(
+            self.auth, self.endpoint, os.getenv("OSS_BUCKET_NAME", oss_bucket_name)
+        )

     # Upload a file
     # Usage: ossBucket.put_object_from_file("my-object-key", "path/to/local/file.txt")
     def put_object_from_file(self, object_name, file_path, progress_callback=None):
-        self.bucket.put_object_from_file(
+        self.bucket.put_object_from_file(
+            object_name, file_path, progress_callback=progress_callback
+        )

     # Download a file
     # Usage: ossBucket.get_object_to_file("my-object-key", "path/to/local/output-file.txt")
     def get_object_to_file(self, object_name, file_path, progress_callback=None):
         try:
-            self.bucket.get_object_to_file(
-
+            self.bucket.get_object_to_file(
+                object_name, file_path, progress_callback=progress_callback
+            )
+        except oss2.exceptions.NoSuchKey:
             raise

     # Upload a folder

     # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
-    def put_pdf_word_from_folder(
+    def put_pdf_word_from_folder(
+        self, object_folder_name, local_folder_path, progress_callback=None
+    ):
         for root, dirs, files in os.walk(local_folder_path):
             for file in tqdm(files, desc=root):
-                if file.endswith(
+                if file.endswith(".pdf") or file.endswith(".word"):
                     file_path = os.path.join(root, file)
                     object_name = os.path.join(
-                        object_folder_name, file_path[len(local_folder_path) + 1:]
-
+                        object_folder_name, file_path[len(local_folder_path) + 1 :]
+                    )
+                    self.bucket.put_object_from_file(
+                        object_name, file_path, progress_callback=progress_callback
+                    )
                     # logger.info("object name: {}, file path: {}".format(
                     #     object_name, file_path))

     # Upload a folder
     # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
-    def put_object_from_folder(
+    def put_object_from_folder(
+        self, object_folder_name, local_folder_path, progress_callback=None
+    ):
         for root, dirs, files in os.walk(local_folder_path):
             for file in tqdm(files, desc=root):
                 file_path = os.path.join(root, file)
                 object_name = os.path.join(
-                    object_folder_name, file_path[len(local_folder_path) + 1:]
-
-
-                    object_name, file_path
+                    object_folder_name, file_path[len(local_folder_path) + 1 :]
+                )
+                self.bucket.put_object_from_file(
+                    object_name, file_path, progress_callback=progress_callback
+                )
+                logger.info(
+                    "object name: {}, file path: {}".format(object_name, file_path)
+                )

     # Download a folder
     # Usage: ossBucket.get_object_to_folder("my-object-folder", "path/to/local/output-folder")
-    def get_object_to_folder(
-
-
-                             progress_callback=None):
+    def get_object_to_folder(
+        self, object_folder_name, local_folder_path, progress_callback=None
+    ):
         os.makedirs(local_folder_path, exist_ok=True)
         for obj in oss2.ObjectIterator(self.bucket, prefix=object_folder_name):
-            file_path = os.path.join(
-
-
-
-
+            file_path = os.path.join(
+                local_folder_path, obj.key[len(object_folder_name) + 1 :]
+            )
+            self.bucket.get_object_to_file(
+                obj.key, file_path, progress_callback=progress_callback
+            )

     # Get all objects in the bucket
     # Usage: ossBucket.get_all_objects_in_bucket()
     def get_all_objects_in_bucket(self, prefix=None, delimiter=None):
-        for obj in oss2.ObjectIterator(self.bucket,
-                                       prefix=prefix,
-                                       delimiter=delimiter):
+        for obj in oss2.ObjectIterator(self.bucket, prefix=prefix, delimiter=delimiter):
             if obj.is_prefix():  # obj is folder
                 logger.info("directory key: {}".format(obj.key))
             else:  # obj is file
                 logger.info(
-                    "file key: {}, object last modified: {}, object size: {}".
-                    format(
+                    "file key: {}, object last modified: {}, object size: {}".format(
                         obj.key,
                         format_datetime_into_isoformat(
-                            datetime.datetime.fromtimestamp(
-
-                        format_size_adaptive(obj.size)
+                            datetime.datetime.fromtimestamp(obj.last_modified)
+                        ),
+                        format_size_adaptive(obj.size),
+                    )
+                )

     def get_objects_in_folders(self, prefix: str):
         all_keys = []
-        for obj in oss2.ObjectIterator(self.bucket,
-                                       prefix=prefix):
+        for obj in oss2.ObjectIterator(self.bucket, prefix=prefix):
             if obj.is_prefix():  # obj is folder
                 pass
             else:  # obj is file
-                if obj.key.endswith(
+                if obj.key.endswith("/"):
                     continue
                 all_keys.append(obj.key)
         return all_keys

-    def delete_object(self, object_name=
-        if object_name is None or object_name ==
+    def delete_object(self, object_name="test"):
+        if object_name is None or object_name == "":
             raise Exception(
                 "Danger! object name is None or '' Will delete all objects in bucket!"
             )
@@ -129,8 +154,8 @@ class OssClient:

     # Delete a folder
     # Usage: ossBucket.delete_object_folder("my-object-folder")
-    def delete_object_folder(self, object_folder_name=
-        if object_folder_name is None or object_folder_name ==
+    def delete_object_folder(self, object_folder_name="test"):
+        if object_folder_name is None or object_folder_name == "":
             raise Exception(
                 "Danger! object name is None or '' Will delete all objects in bucket!"
             )
@@ -138,20 +163,29 @@ class OssClient:
         self.bucket.delete_object(obj.key)
         logger.info("delete object key: {}".format(obj.key))

-    def get_oss_url(
+    def get_oss_url(
+        self, object_name, url_expires_time, aliyun_oss_url_prefix, csnt_url_prefix
+    ):
         oss_prefix = "oss://" + os.getenv("OSS_BUCKET_NAME", self.bucket_name) + "/"
         if object_name.__contains__(oss_prefix):
             object_name = object_name.replace(oss_prefix, "")
         aliyun_url = self.bucket.sign_url(
-            "GET",
+            "GET",
+            object_name,
+            int(os.getenv("URL_EXPIRES_TIME", url_expires_time)),
+            slash_safe=True,
         )
         csnt_url = aliyun_url.replace(
-            os.getenv("ALIYUN_OSS_URL_PREFIX", aliyun_oss_url_prefix),
+            os.getenv("ALIYUN_OSS_URL_PREFIX", aliyun_oss_url_prefix),
+            os.getenv("CSNT_URL_PREFIX", csnt_url_prefix),
         )
         return csnt_url

     def get_default_oss_url(self, object_name: str, url_expires_time):
         aliyun_url = self.bucket.sign_url(
-            "GET",
+            "GET",
+            object_name,
+            int(os.getenv("url_expires_time", url_expires_time)),
+            slash_safe=True,
         )
         return aliyun_url
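The `OssClient` changes are mostly formatting (black-style line wrapping); the public surface is unchanged: the constructor plus `put_object_from_file`, `get_object_to_file`, the folder helpers, and the signed-URL helpers. A minimal usage sketch, assuming valid Aliyun OSS credentials; the key, endpoint, bucket, and object names below are placeholders:

```python
from datamax.loader.OssHandler import OssClient

# Credentials normally come from the environment (OSS_ACCESS_KEY_ID,
# OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT, OSS_BUCKET_NAME); the constructor
# arguments act as fallbacks via os.getenv().
client = OssClient(
    oss_access_key_id="your-access-key-id",
    oss_access_key_secret="your-access-key-secret",
    oss_endpoint="oss-cn-hangzhou.aliyuncs.com",
    oss_bucket_name="my-bucket",
)

# Upload a single file, then download it back.
client.put_object_from_file("docs/report.pdf", "path/to/local/report.pdf")
client.get_object_to_file("docs/report.pdf", "path/to/local/downloaded.pdf")

# Signed download URL, expiry in seconds.
url = client.get_default_oss_url("docs/report.pdf", url_expires_time=3600)
print(url)
```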
datamax/parser/__init__.py
CHANGED
datamax/parser/base.py
CHANGED
@@ -58,12 +58,12 @@ class BaseLife:
     tk_client = DashScopeClient()

     @staticmethod
-    def generate_lifecycle(source_file,
+    def generate_lifecycle(source_file, domain, life_type, usage_purpose) -> LifeCycle:
         update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         life_type = [life_type]
         storage = os.stat(source_file)
         life_metadata = {
-            "token_count": token_count,  # Token count of the text
+            # "token_count": token_count,  # Token count of the text
             "storage_size": storage.st_size,  # Storage size in bytes
             "source_file": source_file,  # Source file
             "domain": domain,  # Domain
datamax/parser/core.py
CHANGED
@@ -1,24 +1,49 @@
 import os
 import importlib
+from typing import List, Union, Dict
+from openai import OpenAI
+from datamax.utils import data_cleaner
+from datamax.utils.qa_generator import generatr_qa_pairs
+
+
+class ModelInvoker:
+    def __init__(self):
+        self.client = None
+
+    def invoke_model(self, api_key, base_url, model_name, messages):
+        self.client = OpenAI(
+            api_key=api_key,
+            base_url=base_url,
+        )
+
+        completion = self.client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+        )
+        json_data = completion.model_dump()
+        return json_data.get("choices")[0].get("message").get("content", "")


 class ParserFactory:
     @staticmethod
-    def create_parser(
-
+    def create_parser(
+            file_path: str,
+            use_mineru: bool = False,
+            to_markdown: bool = False,
+            timeout: int = 1200
+    ):
         """
         Create a parser instance based on the file extension.
-
         :param file_path: The path to the file to be parsed.
-        :param use_ocr: Flag to indicate whether OCR should be used.
-        :param use_gpu: Flag to indicate whether GPU should be used.
-        :param gpu_id: The ID of the GPU to use.
         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
                             (only supported files in .doc or .docx format)
+        :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
+        :param timeout: Timeout for the request .(only supported files in .xlsx format)
         :return: An instance of the parser class corresponding to the file extension.
         """
         file_extension = os.path.splitext(file_path)[1].lower()
         parser_class_name = {
+            '.md': 'MarkdownParser',
             '.docx': 'DocxParser',
             '.doc': 'DocParser',
             '.epub': 'EpubParser',
@@ -28,13 +53,17 @@ class ParserFactory:
             '.ppt': 'PPtParser',
             '.pdf': 'PdfParser',
             '.jpg': 'ImageParser',
-            '.
+            '.jpeg': 'ImageParser',
+            '.png': 'ImageParser',
+            '.webp': 'ImageParser',
+            '.xlsx': 'XlsxParser',
+            '.xls': 'XlsParser'
         }.get(file_extension)

         if not parser_class_name:
             return None

-        if file_extension
+        if file_extension in ['.jpg', 'jpeg', '.png', '.webp']:
             module_name = f'datamax.parser.image_parser'
         else:
             # Dynamically determine the module name based on the file extension
@@ -47,33 +76,55 @@ class ParserFactory:

             # Special handling for PdfParser arguments
             if parser_class_name == 'PdfParser':
-                return parser_class(
+                return parser_class(
+                    file_path=file_path,
+                    use_mineru=use_mineru,
+                )
             elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
-                return parser_class(
+                return parser_class(
+                    file_path=file_path, to_markdown=to_markdown
+                )
+            elif parser_class_name == 'XlsxParser':
+                return parser_class(
+                    file_path=file_path,
+                    timeout=timeout
+                )
             else:
-                return parser_class(
+                return parser_class(
+                    file_path=file_path
+                )

         except (ImportError, AttributeError) as e:
             raise e


-class
-    def __init__(self,
-
+class DataMax:
+    def __init__(self,
+                 file_path: Union[str, list] = '',
+                 use_mineru: bool = False,
+                 to_markdown: bool = False,
+                 timeout: int = 1200
+                 ):
         """
         Initialize the DataMaxParser with file path and parsing options.

+        # <Abandon>
+        # :param use_paddle_ocr: Flag to indicate whether PaddleOCR should be used.
+        # :param use_paddle_gpu: Flag to indicate whether PaddleOCR-GPU should be used.
+        # :param use_got_ocr: Flag to indicate whether GOT-OCR should be used.
+        # :param got_weights_path: GOT-OCR Weights Path.
+        # :param gpu_id: The ID of the GPU to use.
+
         :param file_path: The path to the file or directory to be parsed.
-        :param
-        :param use_gpu: Flag to indicate whether GPU should be used.
-        :param gpu_id: The ID of the GPU to use.
+        :param use_mineru: Flag to indicate whether MinerU should be used.
         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
         """
         self.file_path = file_path
-        self.
-        self.use_gpu = use_gpu
-        self.gpu_id = gpu_id
+        self.use_mineru = use_mineru
         self.to_markdown = to_markdown
+        self.parsed_data = None
+        self.model_invoker = ModelInvoker()
+        self.timeout = timeout

     def get_data(self):
         """
@@ -83,19 +134,136 @@ class DataMaxParser:
         """
         try:
             if isinstance(self.file_path, list):
-
-
+                parsed_data = [self._parse_file(f) for f in self.file_path]
+                self.parsed_data = parsed_data
+                return parsed_data

             elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
-
+                parsed_data = self._parse_file(self.file_path)
+                self.parsed_data = parsed_data
+                return parsed_data

             elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
                 file_list = [os.path.join(self.file_path, file) for file in os.listdir(self.file_path)]
-
-
+                parsed_data = [self._parse_file(f) for f in file_list if os.path.isfile(f)]
+                self.parsed_data = parsed_data
+                return parsed_data
+            else:
+                raise ValueError("Invalid file path.")
+
         except Exception as e:
             raise e

+    def clean_data(self, method_list: List[str], text: str = None):
+        """
+        Clean data
+
+        methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which is 1 2 3
+
+        :return:
+        """
+        if text:
+            cleaned_text = text
+        elif self.parsed_data:
+            cleaned_text = self.parsed_data.get('content')
+        else:
+            raise ValueError("No data to clean.")
+
+        for method in method_list:
+            if method == 'abnormal':
+                cleaned_text = data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
+            elif method == 'filter':
+                cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
+                cleaned_text = cleaned_text.get("text") if cleaned_text else ''
+            elif method == 'private':
+                cleaned_text = data_cleaner.PrivacyDesensitization(cleaned_text).to_private().get("text")
+
+        if self.parsed_data:
+            origin_dict = self.parsed_data
+            origin_dict['content'] = cleaned_text
+            self.parsed_data = None
+            return origin_dict
+        else:
+            return cleaned_text
+
+    def get_pre_label(self,
+                      api_key: str,
+                      base_url: str,
+                      model_name: str,
+                      chunk_size: int = 500,
+                      chunk_overlap: int = 100,
+                      question_number: int = 5,
+                      max_workers: int = 5,
+                      messages: List[Dict[str, str]] = None):
+        return generatr_qa_pairs(
+            api_key=api_key,
+            base_url=base_url,
+            model_name=model_name,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            question_number=question_number,
+            max_workers=max_workers,
+            message=messages,
+            file_path=self.file_path
+        )
+
+    ## <Abandon>
+    # def enhance_with_model(self, api_key: str, base_url: str, model_name: str, iteration: int = 1,
+    #                        messages: List[Dict[str, str]] = None):
+    #     """
+    #     Enhance the parsed content using a large language model.
+    #
+    #     :param api_key: API key for the large model service.
+    #     :param base_url: Base URL for the large model service.
+    #     :param model_name: Name of the model to use.
+    #     :param iteration: Number of iterations
+    #     :param messages: Custom messages list [{"role": "system", "content": "..."}, ...]
+    #     :return: Enhanced text.
+    #     """
+    #     if not messages:
+    #         # If no custom message is provided, the default message structure is used, but only if there is parsed data
+    #         if self.parsed_data:
+    #             system_prompt = get_system_prompt(self.parsed_data)
+    #             default_message_user = {"role": "user", "content": "按照json格式给出问答对"}
+    #             messages = [
+    #                 {"role": "system", "content": system_prompt},
+    #                 default_message_user
+    #             ]
+    #         else:
+    #             raise ValueError("No data to enhance and no custom messages provided.")
+    #     try:
+    #         if isinstance(iteration, int) and iteration >= 1:
+    #             results = []
+    #             current_messages = messages.copy()  # Avoid modifying the original message during iteration
+    #
+    #             for _ in range(iteration):
+    #                 enhanced_text = self.model_invoker.invoke_model(
+    #                     api_key=api_key,
+    #                     base_url=base_url,
+    #                     model_name=model_name,
+    #                     messages=current_messages
+    #                 )
+    #
+    #                 # Append the generated content to the conversation history in multiple iterations
+    #                 if iteration > 1:
+    #                     current_messages.append({"role": "assistant", "content": enhanced_text})
+    #                     current_messages.append(
+    #                         {"role": "user", "content": "请继续生成, 生成要求不变, 结果是jsonlist, 且长度不超过5"})
+    #
+    #                 # If there is parsed data, update the contents and return a copy of the original dictionary; Otherwise, return the enhanced text directly
+    #                 if self.parsed_data:
+    #                     origin_dict = self.parsed_data.copy()
+    #                     origin_dict['content'] = enhanced_text
+    #                     results.append(origin_dict)
+    #                 else:
+    #                     results.append(enhanced_text)
+    #
+    #             return results if iteration > 1 else results[0]
+    #         else:
+    #             raise ValueError("Invalid iteration parameter.")
+    #     except Exception as e:
+    #         raise Exception(f"An error occurred while enhancing with the model: {e}")
+
     def _parse_file(self, file_path):
         """
         Create a parser instance using ParserFactory and parse the file.
@@ -103,12 +271,18 @@ class DataMaxParser:
         :param file_path: The path to the file to be parsed.
         :return: The parsed data.
         """
-
-
-
+        try:
+            parser = ParserFactory.create_parser(
+                use_mineru=self.use_mineru,
+                file_path=file_path,
+                to_markdown=self.to_markdown,
+                timeout=self.timeout
+            )
+            if parser:
+                return parser.parse(file_path=file_path)
+        except Exception as e:
+            raise e


 if __name__ == '__main__':
-
-    data = data.get_data()
-    print(data)
+    pass
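The rewritten core.py replaces the old DataMaxParser entry point with the DataMax class shown above, which parses files, cleans the parsed content, and generates QA pairs. A rough usage sketch based only on the methods visible in this diff; the file path, model name, and endpoint are placeholders:

```python
from datamax import DataMax

dm = DataMax(file_path="example.docx", to_markdown=True)

# Parse into the MarkdownOutputVo dict structure.
parsed = dm.get_data()

# Clean the parsed content; 'abnormal', 'filter' and 'private' map to
# AbnormalCleaner, TextFilter and PrivacyDesensitization in data_cleaner.
cleaned = dm.clean_data(method_list=["abnormal", "filter", "private"])

# Generate QA pairs through an OpenAI-compatible endpoint (placeholder values).
qa_pairs = dm.get_pre_label(
    api_key="sk-...",
    base_url="https://api.example.com/v1",
    model_name="your-model-name",
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
)
```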
datamax/parser/doc_parser.py
CHANGED
@@ -9,7 +9,6 @@ from typing import Union
 from docx import Document
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text


 class DocParser(BaseLife):
@@ -68,10 +67,8 @@ class DocParser(BaseLife):
                 mk_content = self.read_docx_file(doc_path=file_path, to_mk=True)
             else:
                 content = self.read_docx_file(doc_path=file_path, to_mk=False)
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+                mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
datamax/parser/docx_parser.py
CHANGED
@@ -4,7 +4,6 @@ from docx import Document
 from typing import Union
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text


 class DocxParser(BaseLife):
@@ -34,13 +33,11 @@ class DocxParser(BaseLife):
                 mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
             else:
                 content = self.read_docx_file(file_path=file_path)
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+                mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
             return output_vo.to_dict()
         except Exception as e:
-            raise e
+            raise e
datamax/parser/epub_parser.py
CHANGED
@@ -4,7 +4,6 @@ from bs4 import BeautifulSoup
 from ebooklib import epub
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text


 class EpubParser(BaseLife):
@@ -32,10 +31,8 @@ class EpubParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_epub_file(file_path=file_path)
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
datamax/parser/html_parser.py
CHANGED
@@ -7,7 +7,6 @@ sys.path.insert(0, str(ROOT_DIR))
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
 from bs4 import BeautifulSoup
-from datamax.utils import clean_original_text


 class HtmlParser(BaseLife):
@@ -29,10 +28,8 @@ class HtmlParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_html_file(file_path=file_path)
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)