pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/minio_handler.py +171 -171
- datamax/loader/oss_handler.py +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +466 -10
- datamax/parser/docx_parser.py +449 -11
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -215
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/METADATA +117 -5
- pydatamax-0.1.15.post2.dist-info/RECORD +38 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.14.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/WHEEL +0 -0
datamax/parser/core.py
CHANGED
@@ -1,288 +1,406 @@
+import os
+import json
+import time
+import importlib
+from loguru import logger
+from typing import List, Union, Dict
+from openai import OpenAI
+from pathlib import Path
+from datamax.utils import data_cleaner
+from datamax.utils.qa_generator import generatr_qa_pairs
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+class ModelInvoker:
+    def __init__(self):
+        self.client = None
+
+    def invoke_model(self, api_key, base_url, model_name, messages):
+        self.client = OpenAI(
+            api_key=api_key,
+            base_url=base_url,
+        )
+
+        completion = self.client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+        )
+        json_data = completion.model_dump()
+        return json_data.get("choices")[0].get("message").get("content", "")
+
+
+class ParserFactory:
+    @staticmethod
+    def create_parser(
+            file_path: str,
+            use_mineru: bool = False,
+            to_markdown: bool = False,
+            timeout: int = 1200
+    ):
+        """
+        Create a parser instance based on the file extension.
+        :param file_path: The path to the file to be parsed.
+        :param to_markdown: Flag to indicate whether the output should be in Markdown format.
+                            (only supported files in .doc or .docx format)
+        :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
+        :param timeout: Timeout for the request. (only supported files in .xlsx format)
+        :return: An instance of the parser class corresponding to the file extension.
+        """
+        file_extension = os.path.splitext(file_path)[1].lower()
+        parser_class_name = {
+            '.md': 'MarkdownParser',
+            '.docx': 'DocxParser',
+            '.doc': 'DocParser',
+            '.epub': 'EpubParser',
+            '.html': 'HtmlParser',
+            '.txt': 'TxtParser',
+            '.pptx': 'PPtxParser',
+            '.ppt': 'PPtParser',
+            '.pdf': 'PdfParser',
+            '.jpg': 'ImageParser',
+            '.jpeg': 'ImageParser',
+            '.png': 'ImageParser',
+            '.webp': 'ImageParser',
+            '.xlsx': 'XlsxParser',
+            '.xls': 'XlsParser'
+        }.get(file_extension)
+
+        if not parser_class_name:
+            return None
+
+        if file_extension in ['.jpg', 'jpeg', '.png', '.webp']:
+            module_name = f'datamax.parser.image_parser'
+        else:
+            # Dynamically determine the module name based on the file extension
+            module_name = f'datamax.parser.{file_extension[1:]}_parser'
+
+        try:
+            # Dynamically import the module and get the class
+            module = importlib.import_module(module_name)
+            parser_class = getattr(module, parser_class_name)
+
+            # Special handling for PdfParser arguments
+            if parser_class_name == 'PdfParser':
+                return parser_class(
+                    file_path=file_path,
+                    use_mineru=use_mineru,
+                )
+            elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
+                return parser_class(
+                    file_path=file_path, to_markdown=to_markdown
+                )
+            elif parser_class_name == 'XlsxParser':
+                return parser_class(
+                    file_path=file_path,
+                    timeout=timeout
+                )
+            else:
+                return parser_class(
+                    file_path=file_path
+                )
+
+        except (ImportError, AttributeError) as e:
+            raise e
+
+
+class DataMax:
+    def __init__(self,
+                 file_path: Union[str, list] = '',
+                 use_mineru: bool = False,
+                 to_markdown: bool = False,
+                 timeout: int = 1200,
+                 ttl: int = 3600
+                 ):
+        """
+        Initialize the DataMaxParser with file path and parsing options.
+
+        :param file_path: The path to the file or directory to be parsed.
+        :param use_mineru: Flag to indicate whether MinerU should be used.
+        :param to_markdown: Flag to indicate whether the output should be in Markdown format.
+        :param timeout: Timeout for the request.
+        :param ttl: Time to live for the cache.
+        """
+        self.file_path = file_path
+        self.use_mineru = use_mineru
+        self.to_markdown = to_markdown
+        self.parsed_data = None
+        self.model_invoker = ModelInvoker()
+        self.timeout = timeout
+        self._cache = {}
+        self.ttl = ttl
+
+    def set_data(self, file_name, parsed_data):
+        """
+        Set cached data
+        :param file_name: File name as cache key
+        :param parsed_data: Parsed data as value
+        """
+        logger.info(f"cache ttl is {self.ttl}s")
+        if self.ttl > 0:
+            self._cache[file_name] = {'data': parsed_data, 'ttl': time.time() + self.ttl}
+            logger.info(f"✅ [Cache Updated] Cached data for {file_name}, ttl: {self._cache[file_name]['ttl']}")
+
+    def get_data(self):
+        """
+        Parse the file or directory specified in the file path and return the data.
+
+        :return: A list of parsed data if the file path is a directory, otherwise a single parsed data.
+        """
+        try:
+            if isinstance(self.file_path, list):
+                parsed_data = []
+                for f in self.file_path:
+                    file_name = os.path.basename(f)
+                    if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+                        logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
+                        parsed_data.append(self._cache[file_name]['data'])
+                    else:
+                        logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
+                        self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+                        res_data = self._parse_file(f)
+                        parsed_data.append(res_data)
+                        self.set_data(file_name, res_data)
+                return parsed_data
+
+            elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
+                file_name = os.path.basename(self.file_path)
+                if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+                    logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
+                    return self._cache[file_name]['data']
+                else:
+                    logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
+                    self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+                    parsed_data = self._parse_file(self.file_path)
+                    self.parsed_data = parsed_data
+                    self.set_data(file_name, parsed_data)
+                    return parsed_data
+
+            elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
+                file_list = [str(file) for file in list(Path(self.file_path).rglob('*.*'))]
+                parsed_data = []
+                for f in file_list:
+                    if os.path.isfile(f):
+                        file_name = os.path.basename(f)
+                        if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+                            logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
+                            parsed_data.append(self._cache[file_name]['data'])
+                        else:
+                            logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
+                            self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+                            res_data = self._parse_file(f)
+                            parsed_data.append(res_data)
+                            self.set_data(file_name, res_data)
+                return parsed_data
+            else:
+                raise ValueError("Invalid file path.")
+
+        except Exception as e:
+            raise e
+
+    def clean_data(self, method_list: List[str], text: str = None):
+        """
+        Clean data
+
+        methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which is 1 2 3
+
+        :return:
+        """
+        if text:
+            cleaned_text = text
+        elif self.parsed_data:
+            cleaned_text = self.parsed_data.get('content')
+        else:
+            raise ValueError("No data to clean.")
+
+        for method in method_list:
+            if method == 'abnormal':
+                cleaned_text = data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
+            elif method == 'filter':
+                cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
+                cleaned_text = cleaned_text.get("text") if cleaned_text else ''
+            elif method == 'private':
+                cleaned_text = data_cleaner.PrivacyDesensitization(cleaned_text).to_private().get("text")
+
+        if self.parsed_data:
+            origin_dict = self.parsed_data
+            origin_dict['content'] = cleaned_text
+            self.parsed_data = None
+            return origin_dict
+        else:
+            return cleaned_text
+
+    def get_pre_label(self,
+                      api_key: str,
+                      base_url: str,
+                      model_name: str,
+                      chunk_size: int = 500,
+                      chunk_overlap: int = 100,
+                      question_number: int = 5,
+                      max_workers: int = 5,
+                      messages: List[Dict[str, str]] = None):
+        return generatr_qa_pairs(
+            api_key=api_key,
+            base_url=base_url,
+            model_name=model_name,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            question_number=question_number,
+            max_workers=max_workers,
+            message=messages,
+            file_path=self.file_path
+        )
+
+    def save_label_data(self, label_data: list, save_file_name: str = None):
+        """
+        Save label data to file.
+        :param label_data: Label data to be saved.
+        :param save_file_name: File name to save the label data.
+        """
+        if not label_data:
+            raise ValueError("No data to save.")
+        if not save_file_name:
+            if isinstance(self.file_path, str):
+                save_file_name = os.path.splitext(os.path.basename(self.file_path))[0]
+            else:
+                save_file_name = 'label_data'
+        if isinstance(label_data, list):
+            with open(save_file_name + '.jsonl', 'w', encoding='utf-8') as f:
+                for qa_entry in label_data:
+                    f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")
+            logger.info(f"✅ [Label Data Saved] Label data saved to {save_file_name}.jsonl")
+
+    @staticmethod
+    def split_text_into_paragraphs(text: str, max_length: int = 500, chunk_overlap: int = 100):
+        """
+        Split text into paragraphs by sentence boundaries, each paragraph not exceeding max_length characters.
+        Paragraphs will have chunk_overlap characters of overlap between them.
+        """
+        import re
+
+        # Split sentences using Chinese punctuation marks
+        sentences = re.split('(?<=[。!?])', text)
+        paragraphs = []
+        current_paragraph = ''
+        overlap_buffer = ''
+
+        for sentence in sentences:
+            # If current paragraph plus new sentence doesn't exceed max length
+            if len(current_paragraph) + len(sentence) <= max_length:
+                current_paragraph += sentence
+            else:
+                if current_paragraph:
+                    # Add current paragraph to results
+                    paragraphs.append(current_paragraph)
+                    # Save overlap portion
+                    overlap_buffer = current_paragraph[-chunk_overlap:] if chunk_overlap > 0 else ''
+                # Start new paragraph with overlap
+                current_paragraph = overlap_buffer + sentence
+                overlap_buffer = ''
+
+                # Handle overly long sentences
+                while len(current_paragraph) > max_length:
+                    # Split long paragraph
+                    split_point = max_length - len(overlap_buffer)
+                    paragraphs.append(current_paragraph[:split_point])
+                    # Update overlap buffer
+                    overlap_buffer = current_paragraph[split_point - chunk_overlap:split_point] if chunk_overlap > 0 else ''
+                    current_paragraph = overlap_buffer + current_paragraph[split_point:]
+                    overlap_buffer = ''
+
+        # Add the last paragraph
+        if current_paragraph:
+            paragraphs.append(current_paragraph)
+
+        return paragraphs
+
+    @staticmethod
+    def split_with_langchain(text: str, chunk_size: int = 500, chunk_overlap: int = 100):
+        """
+        Split text using LangChain's intelligent text splitting
+
+        :param text: Text to be split
+        :param chunk_size: Maximum length of each chunk
+        :param chunk_overlap: Number of overlapping characters between chunks
+        :return: List of split text
+        """
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+            is_separator_regex=False,
+        )
+        return text_splitter.split_text(text)
+
+    def split_data(
+            self,
+            parsed_data: Union[str, dict] = None,
+            chunk_size: int = 500,
+            chunk_overlap: int = 100,
+            use_langchain: bool = False):
+        """
+        Improved splitting method with LangChain option
+
+        :param use_langchain: Whether to use LangChain for splitting
+        :param parsed_data: Data to be split, either string or dict
+        :param chunk_size: Maximum length of each chunk
+        :param chunk_overlap: Number of overlapping characters between chunks
+        :return: List or dict of split text
+        """
+        if parsed_data:
+            self.parsed_data = parsed_data
+        if not self.parsed_data:
+            raise ValueError("No data to split.")
+
+        if use_langchain:
+            if isinstance(self.parsed_data, str):
+                return self.split_with_langchain(self.parsed_data, chunk_size, chunk_overlap)
+            elif isinstance(self.parsed_data, dict):
+                if 'content' not in self.parsed_data:
+                    raise ValueError("Input dict must contain 'content' key")
+                chunks = self.split_with_langchain(self.parsed_data['content'], chunk_size, chunk_overlap)
+                result = self.parsed_data.copy()
+                result['content'] = chunks
+                return result
+
+        # Handle string input
+        if isinstance(self.parsed_data, str):
+            return self.split_text_into_paragraphs(self.parsed_data, chunk_size, chunk_overlap)
+
+        # Handle dict input
+        elif isinstance(self.parsed_data, dict):
+            if 'content' not in self.parsed_data:
+                raise ValueError("Input dict must contain 'content' key")
+
+            content = self.parsed_data['content']
+            chunks = self.split_text_into_paragraphs(content, chunk_size, chunk_overlap)
+
+            result = self.parsed_data.copy()
+            result['content'] = chunks
+            return result
+        else:
+            raise ValueError("Unsupported input type")
+
+    def _parse_file(self, file_path):
+        """
+        Create a parser instance using ParserFactory and parse the file.
+
+        :param file_path: The path to the file to be parsed.
+        :return: The parsed data.
+        """
+        try:
+            parser = ParserFactory.create_parser(
+                use_mineru=self.use_mineru,
+                file_path=file_path,
+                to_markdown=self.to_markdown,
+                timeout=self.timeout
+            )
+            if parser:
+                return parser.parse(file_path=file_path)
+        except Exception as e:
+            raise e
+
+
+if __name__ == '__main__':
+    pass
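
Usage note: a minimal sketch of driving the new ParserFactory dispatch directly, inferred solely from the code above; the file paths are placeholders, and real files of those types would be needed for the parsers to do actual work.

# Hypothetical usage of ParserFactory.create_parser; 'report.docx' and
# 'scan.pdf' are placeholder paths, not files shipped with the package.
from datamax.parser.core import ParserFactory

# The extension picks both the module (datamax.parser.docx_parser) and the
# class (DocxParser) via the mapping inside create_parser.
docx_parser = ParserFactory.create_parser('report.docx', to_markdown=True)

# Only PdfParser receives use_mineru; only XlsxParser receives timeout.
pdf_parser = ParserFactory.create_parser('scan.pdf', use_mineru=True)

# Unrecognized extensions return None instead of raising.
assert ParserFactory.create_parser('data.unknown') is None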
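
A sketch of the end-to-end DataMax flow (parse with the TTL cache, clean, split); the path is again a placeholder, and it assumes the parser output is a dict with a 'content' key, which is what clean_data and split_data expect per the code above.

from datamax.parser.core import DataMax

dm = DataMax(file_path='docs/manual.docx', to_markdown=True, ttl=3600)

parsed = dm.get_data()        # parses and caches under the file's basename
parsed_again = dm.get_data()  # within the TTL, served from the in-memory cache

# Chain cleaners by their short names: 'abnormal', 'filter', 'private'.
cleaned = dm.clean_data(method_list=['abnormal', 'filter'])

# Split with the built-in sentence splitter or LangChain's recursive splitter.
chunks = dm.split_data(parsed_data=cleaned, chunk_size=500,
                       chunk_overlap=100, use_langchain=True)

The sentence-based splitter is a static method, so it can also be exercised on its own:

text = '第一句话。第二句话。第三句话。第四句话。'
for p in DataMax.split_text_into_paragraphs(text, max_length=12, chunk_overlap=4):
    print(repr(p))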
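
Finally, a sketch of the QA pre-labeling path; the credentials and model name are placeholders for any OpenAI-compatible endpoint, and get_pre_label is assumed to return the list that save_label_data expects (it forwards to generatr_qa_pairs, whose spelling is as shipped).

from datamax.parser.core import DataMax

dm = DataMax(file_path='docs/manual.docx')  # placeholder path

qa_pairs = dm.get_pre_label(
    api_key='sk-...',                       # placeholder
    base_url='https://api.example.com/v1',  # placeholder
    model_name='gpt-4o-mini',               # placeholder
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    max_workers=5,
)

# Writes manual.jsonl (the basename of file_path) in the current working
# directory, one JSON object per line.
dm.save_label_data(qa_pairs)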