pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +84 -72
  31. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/core.py CHANGED
@@ -1,14 +1,16 @@
- import os
+ import importlib
  import json
+ import os
  import time
- import importlib
+ from pathlib import Path
+ from typing import Dict, List, Union
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
  from loguru import logger
- from typing import List, Union, Dict
  from openai import OpenAI
- from pathlib import Path
+
  from datamax.utils import data_cleaner
- from datamax.utils.qa_generator import generatr_qa_pairs
- from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from datamax.utils.qa_generator import generate_qa_from_content


  class ModelInvoker:
@@ -32,10 +34,9 @@ class ModelInvoker:
  class ParserFactory:
  @staticmethod
  def create_parser(
- file_path: str,
- use_mineru: bool = False,
- to_markdown: bool = False,
- timeout: int = 1200
+ file_path: str,
+ use_mineru: bool = False,
+ to_markdown: bool = False,
  ):
  """
  Create a parser instance based on the file extension.
@@ -43,36 +44,35 @@ class ParserFactory:
  :param to_markdown: Flag to indicate whether the output should be in Markdown format.
  (only supported files in .doc or .docx format)
  :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
- :param timeout: Timeout for the request .(only supported files in .xlsx format)
  :return: An instance of the parser class corresponding to the file extension.
  """
  file_extension = os.path.splitext(file_path)[1].lower()
  parser_class_name = {
- '.md': 'MarkdownParser',
- '.docx': 'DocxParser',
- '.doc': 'DocParser',
- '.epub': 'EpubParser',
- '.html': 'HtmlParser',
- '.txt': 'TxtParser',
- '.pptx': 'PPtxParser',
- '.ppt': 'PPtParser',
- '.pdf': 'PdfParser',
- '.jpg': 'ImageParser',
- '.jpeg': 'ImageParser',
- '.png': 'ImageParser',
- '.webp': 'ImageParser',
- '.xlsx': 'XlsxParser',
- '.xls': 'XlsParser'
+ ".md": "MarkdownParser",
+ ".docx": "DocxParser",
+ ".doc": "DocParser",
+ ".epub": "EpubParser",
+ ".html": "HtmlParser",
+ ".txt": "TxtParser",
+ ".pptx": "PPtxParser",
+ ".ppt": "PPtParser",
+ ".pdf": "PdfParser",
+ ".jpg": "ImageParser",
+ ".jpeg": "ImageParser",
+ ".png": "ImageParser",
+ ".webp": "ImageParser",
+ ".xlsx": "XlsxParser",
+ ".xls": "XlsParser",
  }.get(file_extension)

  if not parser_class_name:
  return None

- if file_extension in ['.jpg', 'jpeg', '.png', '.webp']:
- module_name = f'datamax.parser.image_parser'
+ if file_extension in [".jpg", "jpeg", ".png", ".webp"]:
+ module_name = f"datamax.parser.image_parser"
  else:
  # Dynamically determine the module name based on the file extension
- module_name = f'datamax.parser.{file_extension[1:]}_parser'
+ module_name = f"datamax.parser.{file_extension[1:]}_parser"

  try:
  # Dynamically import the module and get the class
@@ -80,44 +80,38 @@ class ParserFactory:
  parser_class = getattr(module, parser_class_name)

  # Special handling for PdfParser arguments
- if parser_class_name == 'PdfParser':
+ if parser_class_name == "PdfParser":
  return parser_class(
  file_path=file_path,
  use_mineru=use_mineru,
  )
- elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
+ elif parser_class_name == "DocxParser" or parser_class_name == "DocParser":
  return parser_class(
  file_path=file_path, to_markdown=to_markdown, use_uno=True
  )
- elif parser_class_name == 'XlsxParser':
- return parser_class(
- file_path=file_path,
- timeout=timeout
- )
+ elif parser_class_name == "XlsxParser":
+ return parser_class(file_path=file_path)
  else:
- return parser_class(
- file_path=file_path
- )
+ return parser_class(file_path=file_path)

  except (ImportError, AttributeError) as e:
  raise e


  class DataMax:
- def __init__(self,
- file_path: Union[str, list] = '',
- use_mineru: bool = False,
- to_markdown: bool = False,
- timeout: int = 1200,
- ttl: int = 3600
- ):
+ def __init__(
+ self,
+ file_path: Union[str, list] = "",
+ use_mineru: bool = False,
+ to_markdown: bool = False,
+ ttl: int = 3600,
+ ):
  """
  Initialize the DataMaxParser with file path and parsing options.

  :param file_path: The path to the file or directory to be parsed.
  :param use_mineru: Flag to indicate whether MinerU should be used.
  :param to_markdown: Flag to indicate whether the output should be in Markdown format.
- :param timeout: Timeout for the request.
  :param ttl: Time to live for the cache.
  """
  self.file_path = file_path
@@ -125,10 +119,9 @@ class DataMax:
  self.to_markdown = to_markdown
  self.parsed_data = None
  self.model_invoker = ModelInvoker()
- self.timeout = timeout
  self._cache = {}
  self.ttl = ttl
-
+
  def set_data(self, file_name, parsed_data):
  """
  Set cached data
@@ -137,8 +130,13 @@ class DataMax:
  """
  logger.info(f"cache ttl is {self.ttl}s")
  if self.ttl > 0:
- self._cache[file_name] = {'data': parsed_data, 'ttl': time.time() + self.ttl}
- logger.info(f"✅ [Cache Updated] Cached data for {file_name}, ttl: {self._cache[file_name]['ttl']}")
+ self._cache[file_name] = {
+ "data": parsed_data,
+ "ttl": time.time() + self.ttl,
+ }
+ logger.info(
+ f"✅ [Cache Updated] Cached data for {file_name}, ttl: {self._cache[file_name]['ttl']}"
+ )

  def get_data(self):
  """
@@ -151,12 +149,21 @@ class DataMax:
  parsed_data = []
  for f in self.file_path:
  file_name = os.path.basename(f)
- if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+ if (
+ file_name in self._cache
+ and self._cache[file_name]["ttl"] > time.time()
+ ):
  logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
- parsed_data.append(self._cache[file_name]['data'])
+ parsed_data.append(self._cache[file_name]["data"])
  else:
- logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
- self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+ logger.info(
+ f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+ )
+ self._cache = {
+ k: v
+ for k, v in self._cache.items()
+ if v["ttl"] > time.time()
+ }
  res_data = self._parse_file(f)
  parsed_data.append(res_data)
  self.set_data(file_name, res_data)
@@ -164,29 +171,49 @@ class DataMax:

  elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
  file_name = os.path.basename(self.file_path)
- if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+ if (
+ file_name in self._cache
+ and self._cache[file_name]["ttl"] > time.time()
+ ):
  logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
- return self._cache[file_name]['data']
+ return self._cache[file_name]["data"]
  else:
- logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
- self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+ logger.info(
+ f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+ )
+ self._cache = {
+ k: v for k, v in self._cache.items() if v["ttl"] > time.time()
+ }
  parsed_data = self._parse_file(self.file_path)
  self.parsed_data = parsed_data
  self.set_data(file_name, parsed_data)
  return parsed_data

  elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
- file_list = [str(file) for file in list(Path(self.file_path).rglob('*.*'))]
+ file_list = [
+ str(file) for file in list(Path(self.file_path).rglob("*.*"))
+ ]
  parsed_data = []
  for f in file_list:
  if os.path.isfile(f):
  file_name = os.path.basename(f)
- if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
- logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
- parsed_data.append(self._cache[file_name]['data'])
+ if (
+ file_name in self._cache
+ and self._cache[file_name]["ttl"] > time.time()
+ ):
+ logger.info(
+ f"✅ [Cache Hit] Using cached data for {file_name}"
+ )
+ parsed_data.append(self._cache[file_name]["data"])
  else:
- logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
- self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+ logger.info(
+ f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+ )
+ self._cache = {
+ k: v
+ for k, v in self._cache.items()
+ if v["ttl"] > time.time()
+ }
  res_data = self._parse_file(f)
  parsed_data.append(res_data)
  self.set_data(file_name, res_data)
@@ -201,53 +228,99 @@ class DataMax:
  """
  Clean data

- methods include AbnormalCleaner TextFilter PrivacyDesensitization which is 1 2 3
+ methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which are 1, 2, 3

- :return:
+ :return: Cleaned data
  """
  if text:
  cleaned_text = text
  elif self.parsed_data:
- cleaned_text = self.parsed_data.get('content')
+ cleaned_text = self.parsed_data.get("content")
  else:
  raise ValueError("No data to clean.")

  for method in method_list:
- if method == 'abnormal':
- cleaned_text = data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
- elif method == 'filter':
+ if method == "abnormal":
+ cleaned_text = (
+ data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
+ )
+ elif method == "filter":
  cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
- cleaned_text = cleaned_text.get("text") if cleaned_text else ''
- elif method == 'private':
- cleaned_text = data_cleaner.PrivacyDesensitization(cleaned_text).to_private().get("text")
+ cleaned_text = cleaned_text.get("text") if cleaned_text else ""
+ elif method == "private":
+ cleaned_text = (
+ data_cleaner.PrivacyDesensitization(cleaned_text)
+ .to_private()
+ .get("text")
+ )

  if self.parsed_data:
  origin_dict = self.parsed_data
- origin_dict['content'] = cleaned_text
+ origin_dict["content"] = cleaned_text
  self.parsed_data = None
  return origin_dict
  else:
  return cleaned_text

- def get_pre_label(self,
- api_key: str,
- base_url: str,
- model_name: str,
- chunk_size: int = 500,
- chunk_overlap: int = 100,
- question_number: int = 5,
- max_workers: int = 5,
- messages: List[Dict[str, str]] = None):
- return generatr_qa_pairs(
+ def get_pre_label(
+ self,
+ api_key: str,
+ base_url: str,
+ model_name: str,
+ chunk_size: int = 500,
+ chunk_overlap: int = 100,
+ question_number: int = 5,
+ max_workers: int = 5,
+ language: str = "zh",
+ messages: List[Dict[str, str]] = None,
+ ):
+ """
+ Generate pre-labeling data based on processed document content instead of file path
+
+ :param api_key: API key
+ :param base_url: API base URL
+ :param model_name: Model name
+ :param chunk_size: Chunk size
+ :param chunk_overlap: Overlap length
+ :param question_number: Number of questions generated per chunk
+ :param max_workers: Number of concurrent workers
+ :param language: Language for QA generation ("zh" for Chinese, "en" for English)
+ :param messages: Custom messages
+ :return: List of QA pairs
+ """
+ # First get the processed data
+ processed_data = self.get_data()
+
+ # If it's a list (multiple files), merge all content
+ if isinstance(processed_data, list):
+ content_list = []
+ for data in processed_data:
+ if isinstance(data, dict) and "content" in data:
+ content_list.append(data["content"])
+ elif isinstance(data, str):
+ content_list.append(data)
+ content = "\n\n".join(content_list)
+ # If it's a dictionary for a single file
+ elif isinstance(processed_data, dict) and "content" in processed_data:
+ content = processed_data["content"]
+ # If it's a string
+ elif isinstance(processed_data, str):
+ content = processed_data
+ else:
+ raise ValueError("Unable to extract content field from processed data")
+
+ # Generate QA pairs using content instead of reading files
+ return generate_qa_from_content(
+ content=content,
  api_key=api_key,
  base_url=base_url,
  model_name=model_name,
  chunk_size=chunk_size,
  chunk_overlap=chunk_overlap,
  question_number=question_number,
+ language=language,
  max_workers=max_workers,
  message=messages,
- file_path=self.file_path
  )

  def save_label_data(self, label_data: list, save_file_name: str = None):
@@ -262,27 +335,30 @@ class DataMax:
  if isinstance(self.file_path, str):
  save_file_name = os.path.splitext(os.path.basename(self.file_path))[0]
  else:
- save_file_name = 'label_data'
+ save_file_name = "label_data"
  if isinstance(label_data, list):
- with open(save_file_name + '.jsonl', 'w', encoding='utf-8') as f:
+ with open(save_file_name + ".jsonl", "w", encoding="utf-8") as f:
  for qa_entry in label_data:
  f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")
- logger.info(f"✅ [Label Data Saved] Label data saved to {save_file_name}.jsonl")
-
+ logger.info(
+ f"✅ [Label Data Saved] Label data saved to {save_file_name}.jsonl"
+ )

- @staticmethod
- def split_text_into_paragraphs(text: str, max_length:int = 500, chunk_overlap: int = 100):
+ @staticmethod
+ def split_text_into_paragraphs(
+ text: str, max_length: int = 500, chunk_overlap: int = 100
+ ):
  """
  Split text into paragraphs by sentence boundaries, each paragraph not exceeding max_length characters.
  Paragraphs will have chunk_overlap characters of overlap between them.
  """
- import re
+ import re

  # Split sentences using Chinese punctuation marks
- sentences = re.split('(?<=[。!?])', text)
+ sentences = re.split("(?<=[。!?])", text)
  paragraphs = []
- current_paragraph = ''
- overlap_buffer = ''
+ current_paragraph = ""
+ overlap_buffer = ""

  for sentence in sentences:
  # If current paragraph plus new sentence doesn't exceed max length
@@ -293,20 +369,26 @@ class DataMax:
  # Add current paragraph to results
  paragraphs.append(current_paragraph)
  # Save overlap portion
- overlap_buffer = current_paragraph[-chunk_overlap:] if chunk_overlap > 0 else ''
+ overlap_buffer = (
+ current_paragraph[-chunk_overlap:] if chunk_overlap > 0 else ""
+ )
  # Start new paragraph with overlap
  current_paragraph = overlap_buffer + sentence
- overlap_buffer = ''
-
+ overlap_buffer = ""
+
  # Handle overly long sentences
  while len(current_paragraph) > max_length:
  # Split long paragraph
  split_point = max_length - len(overlap_buffer)
  paragraphs.append(current_paragraph[:split_point])
  # Update overlap buffer
- overlap_buffer = current_paragraph[split_point - chunk_overlap:split_point] if chunk_overlap > 0 else ''
+ overlap_buffer = (
+ current_paragraph[split_point - chunk_overlap : split_point]
+ if chunk_overlap > 0
+ else ""
+ )
  current_paragraph = overlap_buffer + current_paragraph[split_point:]
- overlap_buffer = ''
+ overlap_buffer = ""

  # Add the last paragraph
  if current_paragraph:
@@ -315,10 +397,12 @@ class DataMax:
  return paragraphs

  @staticmethod
- def split_with_langchain(text: str, chunk_size: int = 500, chunk_overlap: int = 100):
+ def split_with_langchain(
+ text: str, chunk_size: int = 500, chunk_overlap: int = 100
+ ):
  """
  Split text using LangChain's intelligent text splitting
-
+
  :param text: Text to be split
  :param chunk_size: Maximum length of each chunk
  :param chunk_overlap: Number of overlapping characters between chunks
@@ -333,14 +417,15 @@ class DataMax:
  return text_splitter.split_text(text)

  def split_data(
- self,
- parsed_data: Union[str, dict] = None,
- chunk_size: int = 500,
- chunk_overlap: int = 100,
- use_langchain: bool = False):
+ self,
+ parsed_data: Union[str, dict] = None,
+ chunk_size: int = 500,
+ chunk_overlap: int = 100,
+ use_langchain: bool = False,
+ ):
  """
  Improved splitting method with LangChain option
-
+
  :param use_langchain: Whether to use LangChain for splitting
  :param parsed_data: Data to be split, either string or dict
  :param chunk_size: Maximum length of each chunk
@@ -351,36 +436,41 @@ class DataMax:
  self.parsed_data = parsed_data
  if not self.parsed_data:
  raise ValueError("No data to split.")
-
+
  if use_langchain:
  if isinstance(self.parsed_data, str):
- return self.split_with_langchain(self.parsed_data, chunk_size, chunk_overlap)
+ return self.split_with_langchain(
+ self.parsed_data, chunk_size, chunk_overlap
+ )
  elif isinstance(self.parsed_data, dict):
- if 'content' not in self.parsed_data:
+ if "content" not in self.parsed_data:
  raise ValueError("Input dict must contain 'content' key")
- chunks = self.split_with_langchain(self.parsed_data['content'], chunk_size, chunk_overlap)
+ chunks = self.split_with_langchain(
+ self.parsed_data["content"], chunk_size, chunk_overlap
+ )
  result = self.parsed_data.copy()
- result['content'] = chunks
+ result["content"] = chunks
  return result
-
+
  # Handle string input
  if isinstance(self.parsed_data, str):
- return self.split_text_into_paragraphs(self.parsed_data, chunk_size, chunk_overlap)
-
+ return self.split_text_into_paragraphs(
+ self.parsed_data, chunk_size, chunk_overlap
+ )
+
  # Handle dict input
  elif isinstance(self.parsed_data, dict):
- if 'content' not in self.parsed_data:
+ if "content" not in self.parsed_data:
  raise ValueError("Input dict must contain 'content' key")
-
- content = self.parsed_data['content']
+
+ content = self.parsed_data["content"]
  chunks = self.split_text_into_paragraphs(content, chunk_size, chunk_overlap)
-
+
  result = self.parsed_data.copy()
- result['content'] = chunks
+ result["content"] = chunks
  return result
  else:
  raise ValueError("Unsupported input type")
-

  def _parse_file(self, file_path):
  """
@@ -394,7 +484,6 @@ class DataMax:
  use_mineru=self.use_mineru,
  file_path=file_path,
  to_markdown=self.to_markdown,
- timeout=self.timeout
  )
  if parser:
  return parser.parse(file_path=file_path)
@@ -402,5 +491,5 @@ class DataMax:
  raise e


- if __name__ == '__main__':
- pass
+ if __name__ == "__main__":
+ pass
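
A minimal usage sketch against the post2 signatures shown above (the file path, credentials, endpoint, and model name are placeholders; the import path follows the module shown in this diff):

from datamax.parser.core import DataMax

# DataMax no longer accepts a timeout argument; ttl only controls the parse cache
dm = DataMax(file_path="docs/example.docx", to_markdown=True, ttl=3600)
parsed = dm.get_data()  # parsed content (dict, or list of dicts), cached for ttl seconds

# get_pre_label now builds QA pairs from the parsed content and accepts a language flag
qa_pairs = dm.get_pre_label(
    api_key="sk-...",                       # placeholder credentials
    base_url="https://api.example.com/v1",  # placeholder endpoint
    model_name="qa-model",                  # placeholder model name
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    max_workers=5,
    language="en",                          # "zh" (default) or "en"
)
dm.save_label_data(qa_pairs)                # writes <basename>.jsonl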
datamax/parser/csv_parser.py CHANGED
@@ -1,6 +1,7 @@
  import pandas as pd

  from datamax.parser.base import BaseLife, MarkdownOutputVo
+ from datamax.utils.lifecycle_types import LifeType


  class CsvParser(BaseLife):
@@ -16,16 +17,35 @@ class CsvParser(BaseLife):

  def parse(self, file_path: str) -> MarkdownOutputVo:
  try:
+ # 1) Processing started
+ extension = self.get_file_extension(file_path)
+ lc_start = self.generate_lifecycle(
+ source_file=file_path,
+ domain="Technology",
+ life_type=LifeType.DATA_PROCESSING,
+ usage_purpose="Parsing",
+ )
+
+ # 2) Core parsing
  df = self.read_csv_file(file_path)
  mk_content = df.to_markdown(index=False)
- lifecycle = self.generate_lifecycle(
+
+ # 3) Processing finished or failed
+ lc_end = self.generate_lifecycle(
  source_file=file_path,
  domain="Technology",
- usage_purpose="Documentation",
- life_type="LLM_ORIGIN",
+ life_type=(
+ LifeType.DATA_PROCESSED
+ if mk_content.strip()
+ else LifeType.DATA_PROCESS_FAILED
+ ),
+ usage_purpose="Parsing",
  )
- output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
- output_vo.add_lifecycle(lifecycle)
+
+ # 4) Wrap the output and attach the lifecycle entries
+ output_vo = MarkdownOutputVo(extension, mk_content)
+ output_vo.add_lifecycle(lc_start)
+ output_vo.add_lifecycle(lc_end)
  return output_vo.to_dict()
  except Exception as e:
  raise e
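
The start/end lifecycle bookkeeping can be exercised directly; a minimal sketch, assuming a small CSV on disk (the path is a placeholder) and a constructor call mirroring how core.py instantiates parsers (CsvParser's exact __init__ signature is not shown in this diff):

from datamax.parser.csv_parser import CsvParser

parser = CsvParser(file_path="data/sample.csv")     # placeholder path; keyword assumed
result = parser.parse(file_path="data/sample.csv")  # dict built by MarkdownOutputVo.to_dict()
# result carries the CSV rendered as a Markdown table plus two lifecycle entries:
# DATA_PROCESSING at the start, then DATA_PROCESSED (or DATA_PROCESS_FAILED if the
# rendered table is empty).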