pydatamax 0.1.5__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamax/__init__.py CHANGED
@@ -1 +1 @@
- from .parser import DataMaxParser
+ from .parser import DataMax
@@ -1,9 +1,11 @@
+ import datetime
  import os
+ import subprocess
+
  import oss2
- import datetime
- from tqdm import tqdm
- from loguru import logger
  from dotenv import load_dotenv
+ from loguru import logger
+ from tqdm import tqdm

  load_dotenv()

@@ -11,8 +13,9 @@ load_dotenv()
  def removing(path):
      for root, dirs, files in os.walk(path):
          for dir in dirs:
-             if dir == '__pycache__':
-                 os.system(f'rm -rf {os.path.join(root, dir)}')
+             if dir == "__pycache__":
+                 pycache_path = os.path.join(root, dir)
+                 subprocess.run(["rm", "-rf", pycache_path], check=False)


  def format_size_adaptive(value):
@@ -25,103 +28,125 @@ def format_size_adaptive(value):


  def format_datetime_into_isoformat(date_time: datetime.datetime) -> str:
-     return date_time.replace(tzinfo=datetime.timezone.utc).isoformat().replace(
-         "+00:00", "Z")
+     return (
+         date_time.replace(tzinfo=datetime.timezone.utc)
+         .isoformat()
+         .replace("+00:00", "Z")
+     )


  class OssClient:
-     def __init__(self, oss_access_key_id, oss_access_key_secret, oss_endpoint, oss_bucket_name):
+     def __init__(
+         self, oss_access_key_id, oss_access_key_secret, oss_endpoint, oss_bucket_name
+     ):
          self.bucket_name = oss_bucket_name
-         self.auth = oss2.Auth(os.getenv("OSS_ACCESS_KEY_ID", oss_access_key_id),
-                               os.getenv("OSS_ACCESS_KEY_SECRET", oss_access_key_secret))
+         self.auth = oss2.Auth(
+             os.getenv("OSS_ACCESS_KEY_ID", oss_access_key_id),
+             os.getenv("OSS_ACCESS_KEY_SECRET", oss_access_key_secret),
+         )
          self.endpoint = os.getenv("OSS_ENDPOINT", oss_endpoint)
-         self.bucket = oss2.Bucket(self.auth, self.endpoint, os.getenv("OSS_BUCKET_NAME", oss_bucket_name))
+         self.bucket = oss2.Bucket(
+             self.auth, self.endpoint, os.getenv("OSS_BUCKET_NAME", oss_bucket_name)
+         )

      # Upload a file
      # Usage: ossBucket.put_object_from_file("my-object-key", "path/to/local/file.txt")
      def put_object_from_file(self, object_name, file_path, progress_callback=None):
-         self.bucket.put_object_from_file(object_name, file_path, progress_callback=progress_callback)
+         self.bucket.put_object_from_file(
+             object_name, file_path, progress_callback=progress_callback
+         )

      # Download a file
      # Usage: ossBucket.get_object_to_file("my-object-key", "path/to/local/output-file.txt")
      def get_object_to_file(self, object_name, file_path, progress_callback=None):
          try:
-             self.bucket.get_object_to_file(object_name, file_path, progress_callback=progress_callback)
-         except oss2.exceptions.NoSuchKey as e:
+             self.bucket.get_object_to_file(
+                 object_name, file_path, progress_callback=progress_callback
+             )
+         except oss2.exceptions.NoSuchKey:
              raise

      # Upload a folder

      # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
-     def put_pdf_word_from_folder(self, object_folder_name, local_folder_path, progress_callback=None):
+     def put_pdf_word_from_folder(
+         self, object_folder_name, local_folder_path, progress_callback=None
+     ):
          for root, dirs, files in os.walk(local_folder_path):
              for file in tqdm(files, desc=root):
-                 if file.endswith('.pdf') or file.endswith('.word'):
+                 if file.endswith(".pdf") or file.endswith(".word"):
                      file_path = os.path.join(root, file)
                      object_name = os.path.join(
-                         object_folder_name, file_path[len(local_folder_path) + 1:])
-                     self.bucket.put_object_from_file(object_name, file_path, progress_callback=progress_callback)
+                         object_folder_name, file_path[len(local_folder_path) + 1 :]
+                     )
+                     self.bucket.put_object_from_file(
+                         object_name, file_path, progress_callback=progress_callback
+                     )
                  # logger.info("object name: {}, file path: {}".format(
                  #     object_name, file_path))

      # Upload a folder
      # Usage: ossBucket.put_object_from_folder("my-object-folder", "path/to/local/folder")
-     def put_object_from_folder(self, object_folder_name, local_folder_path, progress_callback=None):
+     def put_object_from_folder(
+         self, object_folder_name, local_folder_path, progress_callback=None
+     ):
          for root, dirs, files in os.walk(local_folder_path):
              for file in tqdm(files, desc=root):
                  file_path = os.path.join(root, file)
                  object_name = os.path.join(
-                     object_folder_name, file_path[len(local_folder_path) + 1:])
-                 self.bucket.put_object_from_file(object_name, file_path, progress_callback=progress_callback)
-                 logger.info("object name: {}, file path: {}".format(
-                     object_name, file_path))
+                     object_folder_name, file_path[len(local_folder_path) + 1 :]
+                 )
+                 self.bucket.put_object_from_file(
+                     object_name, file_path, progress_callback=progress_callback
+                 )
+                 logger.info(
+                     "object name: {}, file path: {}".format(object_name, file_path)
+                 )

      # Download a folder
      # Usage: ossBucket.get_object_to_folder("my-object-folder", "path/to/local/output-folder")
-     def get_object_to_folder(self,
-                              object_folder_name,
-                              local_folder_path,
-                              progress_callback=None):
+     def get_object_to_folder(
+         self, object_folder_name, local_folder_path, progress_callback=None
+     ):
          os.makedirs(local_folder_path, exist_ok=True)
          for obj in oss2.ObjectIterator(self.bucket, prefix=object_folder_name):
-             file_path = os.path.join(local_folder_path,
-                                      obj.key[len(object_folder_name) + 1:])
-             self.bucket.get_object_to_file(obj.key,
-                                            file_path,
-                                            progress_callback=progress_callback)
+             file_path = os.path.join(
+                 local_folder_path, obj.key[len(object_folder_name) + 1 :]
+             )
+             self.bucket.get_object_to_file(
+                 obj.key, file_path, progress_callback=progress_callback
+             )

      # Get all objects in the bucket
      # Usage: ossBucket.get_all_objects_in_bucket()
      def get_all_objects_in_bucket(self, prefix=None, delimiter=None):
-         for obj in oss2.ObjectIterator(self.bucket,
-                                        prefix=prefix,
-                                        delimiter=delimiter):
+         for obj in oss2.ObjectIterator(self.bucket, prefix=prefix, delimiter=delimiter):
              if obj.is_prefix():  # obj is folder
                  logger.info("directory key: {}".format(obj.key))
              else:  # obj is file
                  logger.info(
-                     "file key: {}, object last modified: {}, object size: {}".
-                     format(
+                     "file key: {}, object last modified: {}, object size: {}".format(
                          obj.key,
                          format_datetime_into_isoformat(
-                             datetime.datetime.fromtimestamp(
-                                 obj.last_modified)),
-                         format_size_adaptive(obj.size)))
+                             datetime.datetime.fromtimestamp(obj.last_modified)
+                         ),
+                         format_size_adaptive(obj.size),
+                     )
+                 )

      def get_objects_in_folders(self, prefix: str):
          all_keys = []
-         for obj in oss2.ObjectIterator(self.bucket,
-                                        prefix=prefix):
+         for obj in oss2.ObjectIterator(self.bucket, prefix=prefix):
              if obj.is_prefix():  # obj is folder
                  pass
              else:  # obj is file
-                 if obj.key.endswith('/'):
+                 if obj.key.endswith("/"):
                      continue
                  all_keys.append(obj.key)
          return all_keys

-     def delete_object(self, object_name='test'):
-         if object_name is None or object_name == '':
+     def delete_object(self, object_name="test"):
+         if object_name is None or object_name == "":
              raise Exception(
                  "Danger! object name is None or '' Will delete all objects in bucket!"
              )
@@ -129,8 +154,8 @@ class OssClient:

      # Delete a folder
      # Usage: ossBucket.delete_object_folder("my-object-folder")
-     def delete_object_folder(self, object_folder_name='test'):
-         if object_folder_name is None or object_folder_name == '':
+     def delete_object_folder(self, object_folder_name="test"):
+         if object_folder_name is None or object_folder_name == "":
              raise Exception(
                  "Danger! object name is None or '' Will delete all objects in bucket!"
              )
@@ -138,20 +163,29 @@ class OssClient:
              self.bucket.delete_object(obj.key)
              logger.info("delete object key: {}".format(obj.key))

-     def get_oss_url(self, object_name, url_expires_time, aliyun_oss_url_prefix, csnt_url_prefix):
+     def get_oss_url(
+         self, object_name, url_expires_time, aliyun_oss_url_prefix, csnt_url_prefix
+     ):
          oss_prefix = "oss://" + os.getenv("OSS_BUCKET_NAME", self.bucket_name) + "/"
          if object_name.__contains__(oss_prefix):
              object_name = object_name.replace(oss_prefix, "")
          aliyun_url = self.bucket.sign_url(
-             "GET", object_name, int(os.getenv("URL_EXPIRES_TIME", url_expires_time)), slash_safe=True
+             "GET",
+             object_name,
+             int(os.getenv("URL_EXPIRES_TIME", url_expires_time)),
+             slash_safe=True,
          )
          csnt_url = aliyun_url.replace(
-             os.getenv("ALIYUN_OSS_URL_PREFIX", aliyun_oss_url_prefix), os.getenv("CSNT_URL_PREFIX", csnt_url_prefix)
+             os.getenv("ALIYUN_OSS_URL_PREFIX", aliyun_oss_url_prefix),
+             os.getenv("CSNT_URL_PREFIX", csnt_url_prefix),
          )
          return csnt_url

      def get_default_oss_url(self, object_name: str, url_expires_time):
          aliyun_url = self.bucket.sign_url(
-             "GET", object_name, int(os.getenv("url_expires_time", url_expires_time)), slash_safe=True
+             "GET",
+             object_name,
+             int(os.getenv("url_expires_time", url_expires_time)),
+             slash_safe=True,
          )
          return aliyun_url
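
The OssClient changes above are almost entirely formatting (black-style line wrapping and double quotes); behavior is unchanged. For orientation, a minimal usage sketch consistent with the methods shown above — the credentials, endpoint, and object keys are placeholders, and the import path is omitted because this diff view does not show the module's file name:

    # Hypothetical usage; replace the placeholders with real values.
    client = OssClient(
        oss_access_key_id="YOUR_ACCESS_KEY_ID",
        oss_access_key_secret="YOUR_ACCESS_KEY_SECRET",
        oss_endpoint="oss-cn-hangzhou.aliyuncs.com",   # example Aliyun OSS endpoint
        oss_bucket_name="my-bucket",
    )
    # Upload a local file, then download it back.
    client.put_object_from_file("docs/report.pdf", "path/to/local/report.pdf")
    client.get_object_to_file("docs/report.pdf", "path/to/local/output.pdf")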
@@ -1,4 +1,4 @@
- from .core import DataMaxParser
+ from .core import DataMax
  import logging
  logger = logging.getLogger()
  logger.addHandler(logging.NullHandler())
datamax/parser/base.py CHANGED
@@ -58,12 +58,12 @@ class BaseLife:
      tk_client = DashScopeClient()

      @staticmethod
-     def generate_lifecycle(source_file, token_count, domain, life_type, usage_purpose) -> LifeCycle:
+     def generate_lifecycle(source_file, domain, life_type, usage_purpose) -> LifeCycle:
          update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          life_type = [life_type]
          storage = os.stat(source_file)
          life_metadata = {
-             "token_count": token_count,  # Token count of the text
+             # "token_count": token_count,  # Token count of the text
              "storage_size": storage.st_size,  # Storage size in bytes
              "source_file": source_file,  # Source file
              "domain": domain,  # Domain
datamax/parser/core.py CHANGED
@@ -1,24 +1,49 @@
  import os
  import importlib
+ from typing import List, Union, Dict
+ from openai import OpenAI
+ from datamax.utils import data_cleaner
+ from datamax.utils.qa_generator import generatr_qa_pairs
+
+
+ class ModelInvoker:
+     def __init__(self):
+         self.client = None
+
+     def invoke_model(self, api_key, base_url, model_name, messages):
+         self.client = OpenAI(
+             api_key=api_key,
+             base_url=base_url,
+         )
+
+         completion = self.client.chat.completions.create(
+             model=model_name,
+             messages=messages,
+         )
+         json_data = completion.model_dump()
+         return json_data.get("choices")[0].get("message").get("content", "")


  class ParserFactory:
      @staticmethod
-     def create_parser(file_path: str, use_ocr: bool = False, use_gpu: bool = False, gpu_id: int = 6,
-                       to_markdown: bool = False):
+     def create_parser(
+         file_path: str,
+         use_mineru: bool = False,
+         to_markdown: bool = False,
+         timeout: int = 1200
+     ):
          """
          Create a parser instance based on the file extension.
-
          :param file_path: The path to the file to be parsed.
-         :param use_ocr: Flag to indicate whether OCR should be used.
-         :param use_gpu: Flag to indicate whether GPU should be used.
-         :param gpu_id: The ID of the GPU to use.
          :param to_markdown: Flag to indicate whether the output should be in Markdown format.
                              (only supported files in .doc or .docx format)
+         :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
+         :param timeout: Timeout for the request .(only supported files in .xlsx format)
          :return: An instance of the parser class corresponding to the file extension.
          """
          file_extension = os.path.splitext(file_path)[1].lower()
          parser_class_name = {
+             '.md': 'MarkdownParser',
              '.docx': 'DocxParser',
              '.doc': 'DocParser',
              '.epub': 'EpubParser',
@@ -28,13 +53,17 @@ class ParserFactory:
              '.ppt': 'PPtParser',
              '.pdf': 'PdfParser',
              '.jpg': 'ImageParser',
-             '.png': 'ImageParser'
+             '.jpeg': 'ImageParser',
+             '.png': 'ImageParser',
+             '.webp': 'ImageParser',
+             '.xlsx': 'XlsxParser',
+             '.xls': 'XlsParser'
          }.get(file_extension)

          if not parser_class_name:
              return None

-         if file_extension == '.jpg' or file_extension == '.png':
+         if file_extension in ['.jpg', 'jpeg', '.png', '.webp']:
              module_name = f'datamax.parser.image_parser'
          else:
              # Dynamically determine the module name based on the file extension
@@ -47,33 +76,55 @@ class ParserFactory:

              # Special handling for PdfParser arguments
              if parser_class_name == 'PdfParser':
-                 return parser_class(file_path, use_ocr, use_gpu, gpu_id)
+                 return parser_class(
+                     file_path=file_path,
+                     use_mineru=use_mineru,
+                 )
              elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
-                 return parser_class(file_path, to_markdown)
+                 return parser_class(
+                     file_path=file_path, to_markdown=to_markdown
+                 )
+             elif parser_class_name == 'XlsxParser':
+                 return parser_class(
+                     file_path=file_path,
+                     timeout=timeout
+                 )
              else:
-                 return parser_class(file_path)
+                 return parser_class(
+                     file_path=file_path
+                 )

          except (ImportError, AttributeError) as e:
              raise e


- class DataMaxParser:
-     def __init__(self, file_path, use_ocr: bool = False, use_gpu: bool = False, gpu_id: int = 6,
-                  to_markdown: bool = False):
+ class DataMax:
+     def __init__(self,
+                  file_path: Union[str, list] = '',
+                  use_mineru: bool = False,
+                  to_markdown: bool = False,
+                  timeout: int = 1200
+                  ):
          """
          Initialize the DataMaxParser with file path and parsing options.

+         # <Abandon>
+         # :param use_paddle_ocr: Flag to indicate whether PaddleOCR should be used.
+         # :param use_paddle_gpu: Flag to indicate whether PaddleOCR-GPU should be used.
+         # :param use_got_ocr: Flag to indicate whether GOT-OCR should be used.
+         # :param got_weights_path: GOT-OCR Weights Path.
+         # :param gpu_id: The ID of the GPU to use.
+
          :param file_path: The path to the file or directory to be parsed.
-         :param use_ocr: Flag to indicate whether OCR should be used.
-         :param use_gpu: Flag to indicate whether GPU should be used.
-         :param gpu_id: The ID of the GPU to use.
+         :param use_mineru: Flag to indicate whether MinerU should be used.
          :param to_markdown: Flag to indicate whether the output should be in Markdown format.
          """
          self.file_path = file_path
-         self.use_ocr = use_ocr
-         self.use_gpu = use_gpu
-         self.gpu_id = gpu_id
+         self.use_mineru = use_mineru
          self.to_markdown = to_markdown
+         self.parsed_data = None
+         self.model_invoker = ModelInvoker()
+         self.timeout = timeout

      def get_data(self):
          """
@@ -83,19 +134,136 @@ class DataMaxParser:
          """
          try:
              if isinstance(self.file_path, list):
-                 data = [self._parse_file(f) for f in self.file_path]
-                 return data
+                 parsed_data = [self._parse_file(f) for f in self.file_path]
+                 self.parsed_data = parsed_data
+                 return parsed_data

              elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
-                 return self._parse_file(self.file_path)
+                 parsed_data = self._parse_file(self.file_path)
+                 self.parsed_data = parsed_data
+                 return parsed_data

              elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
                  file_list = [os.path.join(self.file_path, file) for file in os.listdir(self.file_path)]
-                 data = [self._parse_file(f) for f in file_list if os.path.isfile(f)]
-                 return data
+                 parsed_data = [self._parse_file(f) for f in file_list if os.path.isfile(f)]
+                 self.parsed_data = parsed_data
+                 return parsed_data
+             else:
+                 raise ValueError("Invalid file path.")
+
          except Exception as e:
              raise e

+     def clean_data(self, method_list: List[str], text: str = None):
+         """
+         Clean data
+
+         methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which is 1 2 3
+
+         :return:
+         """
+         if text:
+             cleaned_text = text
+         elif self.parsed_data:
+             cleaned_text = self.parsed_data.get('content')
+         else:
+             raise ValueError("No data to clean.")
+
+         for method in method_list:
+             if method == 'abnormal':
+                 cleaned_text = data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
+             elif method == 'filter':
+                 cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
+                 cleaned_text = cleaned_text.get("text") if cleaned_text else ''
+             elif method == 'private':
+                 cleaned_text = data_cleaner.PrivacyDesensitization(cleaned_text).to_private().get("text")
+
+         if self.parsed_data:
+             origin_dict = self.parsed_data
+             origin_dict['content'] = cleaned_text
+             self.parsed_data = None
+             return origin_dict
+         else:
+             return cleaned_text
+
+     def get_pre_label(self,
+                       api_key: str,
+                       base_url: str,
+                       model_name: str,
+                       chunk_size: int = 500,
+                       chunk_overlap: int = 100,
+                       question_number: int = 5,
+                       max_workers: int = 5,
+                       messages: List[Dict[str, str]] = None):
+         return generatr_qa_pairs(
+             api_key=api_key,
+             base_url=base_url,
+             model_name=model_name,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             question_number=question_number,
+             max_workers=max_workers,
+             message=messages,
+             file_path=self.file_path
+         )
+
+     ## <Abandon>
+     # def enhance_with_model(self, api_key: str, base_url: str, model_name: str, iteration: int = 1,
+     #                        messages: List[Dict[str, str]] = None):
+     #     """
+     #     Enhance the parsed content using a large language model.
+     #
+     #     :param api_key: API key for the large model service.
+     #     :param base_url: Base URL for the large model service.
+     #     :param model_name: Name of the model to use.
+     #     :param iteration: Number of iterations
+     #     :param messages: Custom messages list [{"role": "system", "content": "..."}, ...]
+     #     :return: Enhanced text.
+     #     """
+     #     if not messages:
+     #         # If no custom message is provided, the default message structure is used, but only if there is parsed data
+     #         if self.parsed_data:
+     #             system_prompt = get_system_prompt(self.parsed_data)
+     #             default_message_user = {"role": "user", "content": "按照json格式给出问答对"}
+     #             messages = [
+     #                 {"role": "system", "content": system_prompt},
+     #                 default_message_user
+     #             ]
+     #         else:
+     #             raise ValueError("No data to enhance and no custom messages provided.")
+     #     try:
+     #         if isinstance(iteration, int) and iteration >= 1:
+     #             results = []
+     #             current_messages = messages.copy()  # Avoid modifying the original message during iteration
+     #
+     #             for _ in range(iteration):
+     #                 enhanced_text = self.model_invoker.invoke_model(
+     #                     api_key=api_key,
+     #                     base_url=base_url,
+     #                     model_name=model_name,
+     #                     messages=current_messages
+     #                 )
+     #
+     #                 # Append the generated content to the conversation history in multiple iterations
+     #                 if iteration > 1:
+     #                     current_messages.append({"role": "assistant", "content": enhanced_text})
+     #                     current_messages.append(
+     #                         {"role": "user", "content": "请继续生成, 生成要求不变, 结果是jsonlist, 且长度不超过5"})
+     #
+     #             # If there is parsed data, update the contents and return a copy of the original dictionary; Otherwise, return the enhanced text directly
+     #             if self.parsed_data:
+     #                 origin_dict = self.parsed_data.copy()
+     #                 origin_dict['content'] = enhanced_text
+     #                 results.append(origin_dict)
+     #             else:
+     #                 results.append(enhanced_text)
+     #
+     #             return results if iteration > 1 else results[0]
+     #         else:
+     #             raise ValueError("Invalid iteration parameter.")
+     #     except Exception as e:
+     #         raise Exception(f"An error occurred while enhancing with the model: {e}")
+
      def _parse_file(self, file_path):
          """
          Create a parser instance using ParserFactory and parse the file.
@@ -103,12 +271,18 @@ class DataMaxParser:
          :param file_path: The path to the file to be parsed.
          :return: The parsed data.
          """
-         parser = ParserFactory.create_parser(file_path, self.use_ocr, self.use_gpu, self.gpu_id, self.to_markdown)
-         if parser:
-             return parser.parse(file_path)
+         try:
+             parser = ParserFactory.create_parser(
+                 use_mineru=self.use_mineru,
+                 file_path=file_path,
+                 to_markdown=self.to_markdown,
+                 timeout=self.timeout
+             )
+             if parser:
+                 return parser.parse(file_path=file_path)
+         except Exception as e:
+             raise e


  if __name__ == '__main__':
-     data = DataMaxParser(file_path=r"C:\Users\cykro\Desktop\数据工厂.pdf", use_ocr=True)
-     data = data.get_data()
-     print(data)
+     pass
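
Taken together, the core.py changes rename DataMaxParser to DataMax, replace the OCR/GPU flags with a single use_mineru switch, keep the last parse result on the instance, and add clean_data and get_pre_label helpers. A minimal usage sketch based only on the API surface shown in this diff — the file name, credentials, endpoint, and model name below are placeholders:

    from datamax import DataMax

    dm = DataMax(file_path="sample.docx", to_markdown=False)   # placeholder input file
    parsed = dm.get_data()            # dict (or list of dicts) with a 'content' field
    cleaned = dm.clean_data(["abnormal", "filter", "private"])  # cleaners applied in order
    qa_pairs = dm.get_pre_label(
        api_key="YOUR_API_KEY",               # placeholder credentials
        base_url="https://example.com/v1",    # placeholder endpoint
        model_name="your-model-name",
        question_number=5,
    )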
@@ -9,7 +9,6 @@ from typing import Union
  from docx import Document
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
- from datamax.utils import clean_original_text


  class DocParser(BaseLife):
@@ -68,10 +67,8 @@ class DocParser(BaseLife):
                  mk_content = self.read_docx_file(doc_path=file_path, to_mk=True)
              else:
                  content = self.read_docx_file(doc_path=file_path, to_mk=False)
-                 clean_text = clean_original_text(content)
-                 mk_content = clean_text.get('text', '')
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+                 mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
@@ -4,7 +4,6 @@ from docx import Document
  from typing import Union
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
- from datamax.utils import clean_original_text


  class DocxParser(BaseLife):
@@ -34,13 +33,11 @@ class DocxParser(BaseLife):
                  mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
              else:
                  content = self.read_docx_file(file_path=file_path)
-                 clean_text = clean_original_text(content)
-                 mk_content = clean_text.get('text', '')
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+                 mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
              return output_vo.to_dict()
          except Exception as e:
-             raise e
+             raise e
@@ -4,7 +4,6 @@ from bs4 import BeautifulSoup
  from ebooklib import epub
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
- from datamax.utils import clean_original_text


  class EpubParser(BaseLife):
@@ -32,10 +31,8 @@ class EpubParser(BaseLife):
          try:
              title = self.get_file_extension(file_path)
              content = self.read_epub_file(file_path=file_path)
-             clean_text = clean_original_text(content)
-             mk_content = clean_text.get('text', '')
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+             mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
@@ -7,7 +7,6 @@ sys.path.insert(0, str(ROOT_DIR))
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
  from bs4 import BeautifulSoup
- from datamax.utils import clean_original_text


  class HtmlParser(BaseLife):
@@ -29,10 +28,8 @@ class HtmlParser(BaseLife):
          try:
              title = self.get_file_extension(file_path)
              content = self.read_html_file(file_path=file_path)
-             clean_text = clean_original_text(content)
-             mk_content = clean_text.get('text')
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+             mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)