pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +84 -72
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/core.py
CHANGED
@@ -1,14 +1,16 @@
-import
+import importlib
 import json
+import os
 import time
-import
+from pathlib import Path
+from typing import Dict, List, Union
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from loguru import logger
-from typing import List, Union, Dict
 from openai import OpenAI
-
+
 from datamax.utils import data_cleaner
-from datamax.utils.qa_generator import
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from datamax.utils.qa_generator import generate_qa_from_content


 class ModelInvoker:
@@ -32,10 +34,9 @@ class ModelInvoker:
 class ParserFactory:
     @staticmethod
     def create_parser(
-
-
-
-        timeout: int = 1200
+        file_path: str,
+        use_mineru: bool = False,
+        to_markdown: bool = False,
     ):
         """
         Create a parser instance based on the file extension.
@@ -43,36 +44,35 @@ class ParserFactory:
         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
             (only supported files in .doc or .docx format)
         :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
-        :param timeout: Timeout for the request .(only supported files in .xlsx format)
         :return: An instance of the parser class corresponding to the file extension.
         """
         file_extension = os.path.splitext(file_path)[1].lower()
         parser_class_name = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            ".md": "MarkdownParser",
+            ".docx": "DocxParser",
+            ".doc": "DocParser",
+            ".epub": "EpubParser",
+            ".html": "HtmlParser",
+            ".txt": "TxtParser",
+            ".pptx": "PPtxParser",
+            ".ppt": "PPtParser",
+            ".pdf": "PdfParser",
+            ".jpg": "ImageParser",
+            ".jpeg": "ImageParser",
+            ".png": "ImageParser",
+            ".webp": "ImageParser",
+            ".xlsx": "XlsxParser",
+            ".xls": "XlsParser",
         }.get(file_extension)

         if not parser_class_name:
             return None

-        if file_extension in [
-            module_name = f
+        if file_extension in [".jpg", "jpeg", ".png", ".webp"]:
+            module_name = f"datamax.parser.image_parser"
         else:
             # Dynamically determine the module name based on the file extension
-            module_name = f
+            module_name = f"datamax.parser.{file_extension[1:]}_parser"

         try:
             # Dynamically import the module and get the class
@@ -80,44 +80,38 @@ class ParserFactory:
             parser_class = getattr(module, parser_class_name)

             # Special handling for PdfParser arguments
-            if parser_class_name ==
+            if parser_class_name == "PdfParser":
                 return parser_class(
                     file_path=file_path,
                     use_mineru=use_mineru,
                 )
-            elif parser_class_name ==
+            elif parser_class_name == "DocxParser" or parser_class_name == "DocParser":
                 return parser_class(
                     file_path=file_path, to_markdown=to_markdown, use_uno=True
                 )
-            elif parser_class_name ==
-                return parser_class(
-                    file_path=file_path,
-                    timeout=timeout
-                )
+            elif parser_class_name == "XlsxParser":
+                return parser_class(file_path=file_path)
             else:
-                return parser_class(
-                    file_path=file_path
-                )
+                return parser_class(file_path=file_path)

         except (ImportError, AttributeError) as e:
             raise e


 class DataMax:
-    def __init__(
-
-
-
-
-
-
+    def __init__(
+        self,
+        file_path: Union[str, list] = "",
+        use_mineru: bool = False,
+        to_markdown: bool = False,
+        ttl: int = 3600,
+    ):
         """
         Initialize the DataMaxParser with file path and parsing options.

         :param file_path: The path to the file or directory to be parsed.
         :param use_mineru: Flag to indicate whether MinerU should be used.
         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
-        :param timeout: Timeout for the request.
         :param ttl: Time to live for the cache.
         """
         self.file_path = file_path
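Taken together, the hunks above remove the xlsx-specific `timeout` plumbing: `ParserFactory.create_parser` now takes only `file_path`, `use_mineru`, and `to_markdown`, and `DataMax.__init__` swaps `timeout` for a cache `ttl`. (Note that the image branch above tests for `"jpeg"` without a leading dot, so `.jpeg` files fall through to the dynamic module-name path.) Below is a minimal usage sketch assuming only the signatures visible in this diff; the file paths and values are placeholders.

```python
# Usage sketch based on the signatures shown above; the paths are placeholders.
from datamax.parser.core import DataMax, ParserFactory

# New factory signature: no timeout, just file_path / use_mineru / to_markdown.
parser = ParserFactory.create_parser(
    file_path="reports/overview.docx",
    use_mineru=False,
    to_markdown=True,   # honored for .doc/.docx only, per the docstring
)

# New DataMax constructor: the xlsx timeout is replaced by a cache TTL in seconds.
dm = DataMax(file_path="reports/overview.docx", to_markdown=True, ttl=3600)
result = dm.get_data()   # dict with a "content" field, or a list of dicts for many files
```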
@@ -125,10 +119,9 @@ class DataMax:
         self.to_markdown = to_markdown
         self.parsed_data = None
         self.model_invoker = ModelInvoker()
-        self.timeout = timeout
         self._cache = {}
         self.ttl = ttl
-
+
     def set_data(self, file_name, parsed_data):
         """
         Set cached data
@@ -137,8 +130,13 @@ class DataMax:
         """
         logger.info(f"cache ttl is {self.ttl}s")
         if self.ttl > 0:
-            self._cache[file_name] = {
-
+            self._cache[file_name] = {
+                "data": parsed_data,
+                "ttl": time.time() + self.ttl,
+            }
+            logger.info(
+                f"✅ [Cache Updated] Cached data for {file_name}, ttl: {self._cache[file_name]['ttl']}"
+            )

     def get_data(self):
         """
@@ -151,12 +149,21 @@ class DataMax:
             parsed_data = []
             for f in self.file_path:
                 file_name = os.path.basename(f)
-                if
+                if (
+                    file_name in self._cache
+                    and self._cache[file_name]["ttl"] > time.time()
+                ):
                     logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
-                    parsed_data.append(self._cache[file_name][
+                    parsed_data.append(self._cache[file_name]["data"])
                 else:
-                    logger.info(
-
+                    logger.info(
+                        f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+                    )
+                    self._cache = {
+                        k: v
+                        for k, v in self._cache.items()
+                        if v["ttl"] > time.time()
+                    }
                     res_data = self._parse_file(f)
                     parsed_data.append(res_data)
                     self.set_data(file_name, res_data)
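The caching introduced here stores each parse result under its file name together with an absolute expiry timestamp (`time.time() + ttl`); a lookup counts as a hit only while that timestamp is in the future, and a miss prunes whatever has already expired before re-parsing. A standalone sketch of the same pattern with generic names (not pydatamax code):

```python
import time

# Generic TTL-cache sketch mirroring the pattern above: absolute expiry per entry,
# hits require the entry to still be valid, misses prune expired entries.
cache: dict[str, dict] = {}
TTL = 3600  # seconds, mirroring DataMax(ttl=3600)

def put(key: str, value) -> None:
    if TTL > 0:
        cache[key] = {"data": value, "ttl": time.time() + TTL}

def get(key: str):
    if key in cache and cache[key]["ttl"] > time.time():
        return cache[key]["data"]                          # cache hit
    # cache miss: drop expired entries, let the caller re-parse and re-put
    for k in [k for k, v in cache.items() if v["ttl"] <= time.time()]:
        del cache[k]
    return None

put("report.pdf", {"content": "..."})
assert get("report.pdf") == {"content": "..."}
```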
@@ -164,29 +171,49 @@ class DataMax:

         elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
             file_name = os.path.basename(self.file_path)
-            if
+            if (
+                file_name in self._cache
+                and self._cache[file_name]["ttl"] > time.time()
+            ):
                 logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
-                return self._cache[file_name][
+                return self._cache[file_name]["data"]
             else:
-                logger.info(
-
+                logger.info(
+                    f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+                )
+                self._cache = {
+                    k: v for k, v in self._cache.items() if v["ttl"] > time.time()
+                }
                 parsed_data = self._parse_file(self.file_path)
                 self.parsed_data = parsed_data
                 self.set_data(file_name, parsed_data)
                 return parsed_data

         elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
-            file_list = [
+            file_list = [
+                str(file) for file in list(Path(self.file_path).rglob("*.*"))
+            ]
             parsed_data = []
             for f in file_list:
                 if os.path.isfile(f):
                     file_name = os.path.basename(f)
-                    if
-
-
+                    if (
+                        file_name in self._cache
+                        and self._cache[file_name]["ttl"] > time.time()
+                    ):
+                        logger.info(
+                            f"✅ [Cache Hit] Using cached data for {file_name}"
+                        )
+                        parsed_data.append(self._cache[file_name]["data"])
                     else:
-                        logger.info(
-
+                        logger.info(
+                            f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+                        )
+                        self._cache = {
+                            k: v
+                            for k, v in self._cache.items()
+                            if v["ttl"] > time.time()
+                        }
                         res_data = self._parse_file(f)
                         parsed_data.append(res_data)
                         self.set_data(file_name, res_data)
@@ -201,53 +228,99 @@ class DataMax:
         """
         Clean data

-        methods include AbnormalCleaner
+        methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which are 1, 2, 3

-        :return:
+        :return: Cleaned data
         """
         if text:
             cleaned_text = text
         elif self.parsed_data:
-            cleaned_text = self.parsed_data.get(
+            cleaned_text = self.parsed_data.get("content")
         else:
             raise ValueError("No data to clean.")

         for method in method_list:
-            if method ==
-                cleaned_text =
-
+            if method == "abnormal":
+                cleaned_text = (
+                    data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
+                )
+            elif method == "filter":
                 cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
-                cleaned_text = cleaned_text.get("text") if cleaned_text else
-            elif method ==
-                cleaned_text =
+                cleaned_text = cleaned_text.get("text") if cleaned_text else ""
+            elif method == "private":
+                cleaned_text = (
+                    data_cleaner.PrivacyDesensitization(cleaned_text)
+                    .to_private()
+                    .get("text")
+                )

         if self.parsed_data:
             origin_dict = self.parsed_data
-            origin_dict[
+            origin_dict["content"] = cleaned_text
             self.parsed_data = None
             return origin_dict
         else:
             return cleaned_text

-    def get_pre_label(
-
-
-
-
-
-
-
-
-
+    def get_pre_label(
+        self,
+        api_key: str,
+        base_url: str,
+        model_name: str,
+        chunk_size: int = 500,
+        chunk_overlap: int = 100,
+        question_number: int = 5,
+        max_workers: int = 5,
+        language: str = "zh",
+        messages: List[Dict[str, str]] = None,
+    ):
+        """
+        Generate pre-labeling data based on processed document content instead of file path
+
+        :param api_key: API key
+        :param base_url: API base URL
+        :param model_name: Model name
+        :param chunk_size: Chunk size
+        :param chunk_overlap: Overlap length
+        :param question_number: Number of questions generated per chunk
+        :param max_workers: Number of concurrent workers
+        :param language: Language for QA generation ("zh" for Chinese, "en" for English)
+        :param messages: Custom messages
+        :return: List of QA pairs
+        """
+        # First get the processed data
+        processed_data = self.get_data()
+
+        # If it's a list (multiple files), merge all content
+        if isinstance(processed_data, list):
+            content_list = []
+            for data in processed_data:
+                if isinstance(data, dict) and "content" in data:
+                    content_list.append(data["content"])
+                elif isinstance(data, str):
+                    content_list.append(data)
+            content = "\n\n".join(content_list)
+        # If it's a dictionary for a single file
+        elif isinstance(processed_data, dict) and "content" in processed_data:
+            content = processed_data["content"]
+        # If it's a string
+        elif isinstance(processed_data, str):
+            content = processed_data
+        else:
+            raise ValueError("Unable to extract content field from processed data")
+
+        # Generate QA pairs using content instead of reading files
+        return generate_qa_from_content(
+            content=content,
             api_key=api_key,
             base_url=base_url,
             model_name=model_name,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             question_number=question_number,
+            language=language,
             max_workers=max_workers,
             message=messages,
-            file_path=self.file_path
         )

     def save_label_data(self, label_data: list, save_file_name: str = None):
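`get_pre_label` now builds QA pairs from the already parsed content (joining multiple files with blank lines) rather than handing `file_path` to the generator, and it forwards a new `language` flag ("zh" or "en") to `generate_qa_from_content`. A hedged call sketch using only the parameters shown above; the key, endpoint, and model name are placeholders.

```python
# Call sketch for the revised get_pre_label; the credentials, endpoint, and model
# name below are placeholders, not real values.
from datamax.parser.core import DataMax

dm = DataMax(file_path="docs/handbook.md")

qa_pairs = dm.get_pre_label(
    api_key="sk-...",                        # placeholder
    base_url="https://api.example.com/v1",   # placeholder OpenAI-compatible endpoint
    model_name="gpt-4o-mini",                # placeholder model name
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    max_workers=5,
    language="en",   # new in this release: "zh" (default) or "en"
)

# Persist the QA pairs as JSONL, using save_label_data from the next hunk.
dm.save_label_data(qa_pairs)
```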
@@ -262,27 +335,30 @@ class DataMax:
         if isinstance(self.file_path, str):
             save_file_name = os.path.splitext(os.path.basename(self.file_path))[0]
         else:
-            save_file_name =
+            save_file_name = "label_data"
         if isinstance(label_data, list):
-            with open(save_file_name +
+            with open(save_file_name + ".jsonl", "w", encoding="utf-8") as f:
                 for qa_entry in label_data:
                     f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")
-            logger.info(
-
+            logger.info(
+                f"✅ [Label Data Saved] Label data saved to {save_file_name}.jsonl"
+            )

-    @staticmethod
-    def split_text_into_paragraphs(
+    @staticmethod
+    def split_text_into_paragraphs(
+        text: str, max_length: int = 500, chunk_overlap: int = 100
+    ):
         """
         Split text into paragraphs by sentence boundaries, each paragraph not exceeding max_length characters.
         Paragraphs will have chunk_overlap characters of overlap between them.
         """
-        import re
+        import re

         # Split sentences using Chinese punctuation marks
-        sentences = re.split(
+        sentences = re.split("(?<=[。!?])", text)
         paragraphs = []
-        current_paragraph =
-        overlap_buffer =
+        current_paragraph = ""
+        overlap_buffer = ""

         for sentence in sentences:
             # If current paragraph plus new sentence doesn't exceed max length
@@ -293,20 +369,26 @@ class DataMax:
                 # Add current paragraph to results
                 paragraphs.append(current_paragraph)
                 # Save overlap portion
-                overlap_buffer =
+                overlap_buffer = (
+                    current_paragraph[-chunk_overlap:] if chunk_overlap > 0 else ""
+                )
                 # Start new paragraph with overlap
                 current_paragraph = overlap_buffer + sentence
-                overlap_buffer =
-
+                overlap_buffer = ""
+
                 # Handle overly long sentences
                 while len(current_paragraph) > max_length:
                     # Split long paragraph
                     split_point = max_length - len(overlap_buffer)
                     paragraphs.append(current_paragraph[:split_point])
                     # Update overlap buffer
-                    overlap_buffer =
+                    overlap_buffer = (
+                        current_paragraph[split_point - chunk_overlap : split_point]
+                        if chunk_overlap > 0
+                        else ""
+                    )
                     current_paragraph = overlap_buffer + current_paragraph[split_point:]
-                    overlap_buffer =
+                    overlap_buffer = ""

         # Add the last paragraph
         if current_paragraph:
@@ -315,10 +397,12 @@ class DataMax:
         return paragraphs

     @staticmethod
-    def split_with_langchain(
+    def split_with_langchain(
+        text: str, chunk_size: int = 500, chunk_overlap: int = 100
+    ):
         """
         Split text using LangChain's intelligent text splitting
-
+
         :param text: Text to be split
         :param chunk_size: Maximum length of each chunk
         :param chunk_overlap: Number of overlapping characters between chunks
@@ -333,14 +417,15 @@ class DataMax:
         return text_splitter.split_text(text)

     def split_data(
-
-
-
-
-
+        self,
+        parsed_data: Union[str, dict] = None,
+        chunk_size: int = 500,
+        chunk_overlap: int = 100,
+        use_langchain: bool = False,
+    ):
         """
         Improved splitting method with LangChain option
-
+
         :param use_langchain: Whether to use LangChain for splitting
         :param parsed_data: Data to be split, either string or dict
         :param chunk_size: Maximum length of each chunk
@@ -351,36 +436,41 @@ class DataMax:
         self.parsed_data = parsed_data
         if not self.parsed_data:
             raise ValueError("No data to split.")
-
+
         if use_langchain:
             if isinstance(self.parsed_data, str):
-                return self.split_with_langchain(
+                return self.split_with_langchain(
+                    self.parsed_data, chunk_size, chunk_overlap
+                )
             elif isinstance(self.parsed_data, dict):
-                if
+                if "content" not in self.parsed_data:
                     raise ValueError("Input dict must contain 'content' key")
-                chunks = self.split_with_langchain(
+                chunks = self.split_with_langchain(
+                    self.parsed_data["content"], chunk_size, chunk_overlap
+                )
                 result = self.parsed_data.copy()
-                result[
+                result["content"] = chunks
                 return result
-
+
         # Handle string input
         if isinstance(self.parsed_data, str):
-            return self.split_text_into_paragraphs(
-
+            return self.split_text_into_paragraphs(
+                self.parsed_data, chunk_size, chunk_overlap
+            )
+
         # Handle dict input
         elif isinstance(self.parsed_data, dict):
-            if
+            if "content" not in self.parsed_data:
                 raise ValueError("Input dict must contain 'content' key")
-
-            content = self.parsed_data[
+
+            content = self.parsed_data["content"]
             chunks = self.split_text_into_paragraphs(content, chunk_size, chunk_overlap)
-
+
             result = self.parsed_data.copy()
-            result[
+            result["content"] = chunks
             return result
         else:
             raise ValueError("Unsupported input type")
-

     def _parse_file(self, file_path):
         """
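`split_data` keeps both strategies — the in-house sentence-boundary splitter keyed on Chinese punctuation (。!?) with character overlap, and LangChain's `RecursiveCharacterTextSplitter` — behind an explicit keyword signature. A hedged sketch of both paths, with arbitrary sample text:

```python
# Sketch of the reworked split_data; the sample text is arbitrary.
from datamax.parser.core import DataMax

dm = DataMax()
text = "第一句。第二句!第三句?" * 200   # the default splitter keys on 。!?

# Default path: sentence-boundary splitting with character overlap.
chunks = dm.split_data(parsed_data=text, chunk_size=500, chunk_overlap=100)

# LangChain path: RecursiveCharacterTextSplitter with the same size/overlap knobs.
lc_chunks = dm.split_data(
    parsed_data=text, chunk_size=500, chunk_overlap=100, use_langchain=True
)

# A dict with a "content" key is also accepted; the chunk list replaces "content".
doc = dm.split_data(parsed_data={"content": text}, chunk_size=500)
```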
@@ -394,7 +484,6 @@ class DataMax:
                 use_mineru=self.use_mineru,
                 file_path=file_path,
                 to_markdown=self.to_markdown,
-                timeout=self.timeout
             )
             if parser:
                 return parser.parse(file_path=file_path)
@@ -402,5 +491,5 @@ class DataMax:
             raise e


-if __name__ ==
-    pass
+if __name__ == "__main__":
+    pass
datamax/parser/csv_parser.py
CHANGED
@@ -1,6 +1,7 @@
 import pandas as pd

 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType


 class CsvParser(BaseLife):
@@ -16,16 +17,35 @@ class CsvParser(BaseLife):

     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
+            # 1) Processing starts
+            extension = self.get_file_extension(file_path)
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=LifeType.DATA_PROCESSING,
+                usage_purpose="Parsing",
+            )
+
+            # 2) Core parsing
             df = self.read_csv_file(file_path)
             mk_content = df.to_markdown(index=False)
-
+
+            # 3) Processing finished or failed
+            lc_end = self.generate_lifecycle(
                 source_file=file_path,
                 domain="Technology",
-
-
+                life_type=(
+                    LifeType.DATA_PROCESSED
+                    if mk_content.strip()
+                    else LifeType.DATA_PROCESS_FAILED
+                ),
+                usage_purpose="Parsing",
             )
-
-
+
+            # 4) Wrap the output and attach the lifecycle entries
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
         except Exception as e:
             raise e
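The parser now brackets its work with two lifecycle records: `DATA_PROCESSING` when parsing starts, then `DATA_PROCESSED` or `DATA_PROCESS_FAILED` depending on whether any Markdown came out, both attached to the returned `MarkdownOutputVo`. Below is a standalone sketch of that bracketing pattern; `record_lifecycle` is a hypothetical stand-in for `BaseLife.generate_lifecycle`, whose full signature is outside this diff, and only the `LifeType` members shown above are assumed to exist.

```python
from datamax.utils.lifecycle_types import LifeType

def record_lifecycle(source_file: str, life_type: LifeType) -> dict:
    # Hypothetical helper standing in for BaseLife.generate_lifecycle(...).
    return {"source_file": source_file, "life_type": life_type}

def parse_with_lifecycle(source_file: str, do_parse):
    events = [record_lifecycle(source_file, LifeType.DATA_PROCESSING)]   # 1) start
    content = do_parse(source_file)                                      # 2) core parsing
    events.append(
        record_lifecycle(
            source_file,
            LifeType.DATA_PROCESSED if content.strip() else LifeType.DATA_PROCESS_FAILED,
        )
    )                                                                    # 3) finished or failed
    return content, events
```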