pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/parser/core.py CHANGED
@@ -1,288 +1,406 @@
- import os
- import importlib
- from typing import List, Union, Dict
- from openai import OpenAI
- from datamax.utils import data_cleaner
- from datamax.utils.qa_generator import generatr_qa_pairs
-
-
- class ModelInvoker:
-     def __init__(self):
-         self.client = None
-
-     def invoke_model(self, api_key, base_url, model_name, messages):
-         self.client = OpenAI(
-             api_key=api_key,
-             base_url=base_url,
-         )
-
-         completion = self.client.chat.completions.create(
-             model=model_name,
-             messages=messages,
-         )
-         json_data = completion.model_dump()
-         return json_data.get("choices")[0].get("message").get("content", "")
-
-
- class ParserFactory:
-     @staticmethod
-     def create_parser(
-             file_path: str,
-             use_mineru: bool = False,
-             to_markdown: bool = False,
-             timeout: int = 1200
-     ):
-         """
-         Create a parser instance based on the file extension.
-         :param file_path: The path to the file to be parsed.
-         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
-                             (only supported files in .doc or .docx format)
-         :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
-         :param timeout: Timeout for the request .(only supported files in .xlsx format)
-         :return: An instance of the parser class corresponding to the file extension.
-         """
-         file_extension = os.path.splitext(file_path)[1].lower()
-         parser_class_name = {
-             '.md': 'MarkdownParser',
-             '.docx': 'DocxParser',
-             '.doc': 'DocParser',
-             '.epub': 'EpubParser',
-             '.html': 'HtmlParser',
-             '.txt': 'TxtParser',
-             '.pptx': 'PPtxParser',
-             '.ppt': 'PPtParser',
-             '.pdf': 'PdfParser',
-             '.jpg': 'ImageParser',
-             '.jpeg': 'ImageParser',
-             '.png': 'ImageParser',
-             '.webp': 'ImageParser',
-             '.xlsx': 'XlsxParser',
-             '.xls': 'XlsParser'
-         }.get(file_extension)
-
-         if not parser_class_name:
-             return None
-
-         if file_extension in ['.jpg', 'jpeg', '.png', '.webp']:
-             module_name = f'datamax.parser.image_parser'
-         else:
-             # Dynamically determine the module name based on the file extension
-             module_name = f'datamax.parser.{file_extension[1:]}_parser'
-
-         try:
-             # Dynamically import the module and get the class
-             module = importlib.import_module(module_name)
-             parser_class = getattr(module, parser_class_name)
-
-             # Special handling for PdfParser arguments
-             if parser_class_name == 'PdfParser':
-                 return parser_class(
-                     file_path=file_path,
-                     use_mineru=use_mineru,
-                 )
-             elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
-                 return parser_class(
-                     file_path=file_path, to_markdown=to_markdown
-                 )
-             elif parser_class_name == 'XlsxParser':
-                 return parser_class(
-                     file_path=file_path,
-                     timeout=timeout
-                 )
-             else:
-                 return parser_class(
-                     file_path=file_path
-                 )
-
-         except (ImportError, AttributeError) as e:
-             raise e
-
-
- class DataMax:
-     def __init__(self,
-                  file_path: Union[str, list] = '',
-                  use_mineru: bool = False,
-                  to_markdown: bool = False,
-                  timeout: int = 1200
-                  ):
-         """
-         Initialize the DataMaxParser with file path and parsing options.
-
-         # <Abandon>
-         # :param use_paddle_ocr: Flag to indicate whether PaddleOCR should be used.
-         # :param use_paddle_gpu: Flag to indicate whether PaddleOCR-GPU should be used.
-         # :param use_got_ocr: Flag to indicate whether GOT-OCR should be used.
-         # :param got_weights_path: GOT-OCR Weights Path.
-         # :param gpu_id: The ID of the GPU to use.
-
-         :param file_path: The path to the file or directory to be parsed.
-         :param use_mineru: Flag to indicate whether MinerU should be used.
-         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
-         """
-         self.file_path = file_path
-         self.use_mineru = use_mineru
-         self.to_markdown = to_markdown
-         self.parsed_data = None
-         self.model_invoker = ModelInvoker()
-         self.timeout = timeout
-
-     def get_data(self):
-         """
-         Parse the file or directory specified in the file path and return the data.
-
-         :return: A list of parsed data if the file path is a directory, otherwise a single parsed data.
-         """
-         try:
-             if isinstance(self.file_path, list):
-                 parsed_data = [self._parse_file(f) for f in self.file_path]
-                 self.parsed_data = parsed_data
-                 return parsed_data
-
-             elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
-                 parsed_data = self._parse_file(self.file_path)
-                 self.parsed_data = parsed_data
-                 return parsed_data
-
-             elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
-                 file_list = [os.path.join(self.file_path, file) for file in os.listdir(self.file_path)]
-                 parsed_data = [self._parse_file(f) for f in file_list if os.path.isfile(f)]
-                 self.parsed_data = parsed_data
-                 return parsed_data
-             else:
-                 raise ValueError("Invalid file path.")
-
-         except Exception as e:
-             raise e
-
-     def clean_data(self, method_list: List[str], text: str = None):
-         """
-         Clean data
-
-         methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which is 1 2 3
-
-         :return:
-         """
-         if text:
-             cleaned_text = text
-         elif self.parsed_data:
-             cleaned_text = self.parsed_data.get('content')
-         else:
-             raise ValueError("No data to clean.")
-
-         for method in method_list:
-             if method == 'abnormal':
-                 cleaned_text = data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
-             elif method == 'filter':
-                 cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
-                 cleaned_text = cleaned_text.get("text") if cleaned_text else ''
-             elif method == 'private':
-                 cleaned_text = data_cleaner.PrivacyDesensitization(cleaned_text).to_private().get("text")
-
-         if self.parsed_data:
-             origin_dict = self.parsed_data
-             origin_dict['content'] = cleaned_text
-             self.parsed_data = None
-             return origin_dict
-         else:
-             return cleaned_text
-
-     def get_pre_label(self,
-                       api_key: str,
-                       base_url: str,
-                       model_name: str,
-                       chunk_size: int = 500,
-                       chunk_overlap: int = 100,
-                       question_number: int = 5,
-                       max_workers: int = 5,
-                       messages: List[Dict[str, str]] = None):
-         return generatr_qa_pairs(
-             api_key=api_key,
-             base_url=base_url,
-             model_name=model_name,
-             chunk_size=chunk_size,
-             chunk_overlap=chunk_overlap,
-             question_number=question_number,
-             max_workers=max_workers,
-             message=messages,
-             file_path=self.file_path
-         )
-
-     ## <Abandon>
-     # def enhance_with_model(self, api_key: str, base_url: str, model_name: str, iteration: int = 1,
-     #                        messages: List[Dict[str, str]] = None):
-     #     """
-     #     Enhance the parsed content using a large language model.
-     #
-     #     :param api_key: API key for the large model service.
-     #     :param base_url: Base URL for the large model service.
-     #     :param model_name: Name of the model to use.
-     #     :param iteration: Number of iterations
-     #     :param messages: Custom messages list [{"role": "system", "content": "..."}, ...]
-     #     :return: Enhanced text.
-     #     """
-     #     if not messages:
-     #         # If no custom message is provided, the default message structure is used, but only if there is parsed data
-     #         if self.parsed_data:
-     #             system_prompt = get_system_prompt(self.parsed_data)
-     #             default_message_user = {"role": "user", "content": "按照json格式给出问答对"}
-     #             messages = [
-     #                 {"role": "system", "content": system_prompt},
-     #                 default_message_user
-     #             ]
-     #         else:
-     #             raise ValueError("No data to enhance and no custom messages provided.")
-     #     try:
-     #         if isinstance(iteration, int) and iteration >= 1:
-     #             results = []
-     #             current_messages = messages.copy()  # Avoid modifying the original message during iteration
-     #
-     #             for _ in range(iteration):
-     #                 enhanced_text = self.model_invoker.invoke_model(
-     #                     api_key=api_key,
-     #                     base_url=base_url,
-     #                     model_name=model_name,
-     #                     messages=current_messages
-     #                 )
-     #
-     #                 # Append the generated content to the conversation history in multiple iterations
-     #                 if iteration > 1:
-     #                     current_messages.append({"role": "assistant", "content": enhanced_text})
-     #                     current_messages.append(
-     #                         {"role": "user", "content": "请继续生成, 生成要求不变, 结果是jsonlist, 且长度不超过5"})
-     #
-     #                 # If there is parsed data, update the contents and return a copy of the original dictionary; Otherwise, return the enhanced text directly
-     #                 if self.parsed_data:
-     #                     origin_dict = self.parsed_data.copy()
-     #                     origin_dict['content'] = enhanced_text
-     #                     results.append(origin_dict)
-     #                 else:
-     #                     results.append(enhanced_text)
-     #
-     #             return results if iteration > 1 else results[0]
-     #         else:
-     #             raise ValueError("Invalid iteration parameter.")
-     #     except Exception as e:
-     #         raise Exception(f"An error occurred while enhancing with the model: {e}")
-
-     def _parse_file(self, file_path):
-         """
-         Create a parser instance using ParserFactory and parse the file.
-
-         :param file_path: The path to the file to be parsed.
-         :return: The parsed data.
-         """
-         try:
-             parser = ParserFactory.create_parser(
-                 use_mineru=self.use_mineru,
-                 file_path=file_path,
-                 to_markdown=self.to_markdown,
-                 timeout=self.timeout
-             )
-             if parser:
-                 return parser.parse(file_path=file_path)
-         except Exception as e:
-             raise e
-
-
- if __name__ == '__main__':
-     pass
+ import os
+ import json
+ import time
+ import importlib
+ from loguru import logger
+ from typing import List, Union, Dict
+ from openai import OpenAI
+ from pathlib import Path
+ from datamax.utils import data_cleaner
+ from datamax.utils.qa_generator import generatr_qa_pairs
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+ class ModelInvoker:
+     def __init__(self):
+         self.client = None
+
+     def invoke_model(self, api_key, base_url, model_name, messages):
+         self.client = OpenAI(
+             api_key=api_key,
+             base_url=base_url,
+         )
+
+         completion = self.client.chat.completions.create(
+             model=model_name,
+             messages=messages,
+         )
+         json_data = completion.model_dump()
+         return json_data.get("choices")[0].get("message").get("content", "")
+
+
+ class ParserFactory:
+     @staticmethod
+     def create_parser(
+             file_path: str,
+             use_mineru: bool = False,
+             to_markdown: bool = False,
+             timeout: int = 1200
+     ):
+         """
+         Create a parser instance based on the file extension.
+         :param file_path: The path to the file to be parsed.
+         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
+                             (only supported files in .doc or .docx format)
+         :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
+         :param timeout: Timeout for the request .(only supported files in .xlsx format)
+         :return: An instance of the parser class corresponding to the file extension.
+         """
+         file_extension = os.path.splitext(file_path)[1].lower()
+         parser_class_name = {
+             '.md': 'MarkdownParser',
+             '.docx': 'DocxParser',
+             '.doc': 'DocParser',
+             '.epub': 'EpubParser',
+             '.html': 'HtmlParser',
+             '.txt': 'TxtParser',
+             '.pptx': 'PPtxParser',
+             '.ppt': 'PPtParser',
+             '.pdf': 'PdfParser',
+             '.jpg': 'ImageParser',
+             '.jpeg': 'ImageParser',
+             '.png': 'ImageParser',
+             '.webp': 'ImageParser',
+             '.xlsx': 'XlsxParser',
+             '.xls': 'XlsParser'
+         }.get(file_extension)
+
+         if not parser_class_name:
+             return None
+
+         if file_extension in ['.jpg', 'jpeg', '.png', '.webp']:
+             module_name = f'datamax.parser.image_parser'
+         else:
+             # Dynamically determine the module name based on the file extension
+             module_name = f'datamax.parser.{file_extension[1:]}_parser'
+
+         try:
+             # Dynamically import the module and get the class
+             module = importlib.import_module(module_name)
+             parser_class = getattr(module, parser_class_name)
+
+             # Special handling for PdfParser arguments
+             if parser_class_name == 'PdfParser':
+                 return parser_class(
+                     file_path=file_path,
+                     use_mineru=use_mineru,
+                 )
+             elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
+                 return parser_class(
+                     file_path=file_path, to_markdown=to_markdown
+                 )
+             elif parser_class_name == 'XlsxParser':
+                 return parser_class(
+                     file_path=file_path,
+                     timeout=timeout
+                 )
+             else:
+                 return parser_class(
+                     file_path=file_path
+                 )
+
+         except (ImportError, AttributeError) as e:
+             raise e
+
+
+ class DataMax:
+     def __init__(self,
+                  file_path: Union[str, list] = '',
+                  use_mineru: bool = False,
+                  to_markdown: bool = False,
+                  timeout: int = 1200,
+                  ttl: int = 3600
+                  ):
+         """
+         Initialize the DataMaxParser with file path and parsing options.
+
+         :param file_path: The path to the file or directory to be parsed.
+         :param use_mineru: Flag to indicate whether MinerU should be used.
+         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
+         :param timeout: Timeout for the request.
+         :param ttl: Time to live for the cache.
+         """
+         self.file_path = file_path
+         self.use_mineru = use_mineru
+         self.to_markdown = to_markdown
+         self.parsed_data = None
+         self.model_invoker = ModelInvoker()
+         self.timeout = timeout
+         self._cache = {}
+         self.ttl = ttl
+
+     def set_data(self, file_name, parsed_data):
+         """
+         Set cached data
+         :param file_name: File name as cache key
+         :param parsed_data: Parsed data as value
+         """
+         logger.info(f"cache ttl is {self.ttl}s")
+         if self.ttl > 0:
+             self._cache[file_name] = {'data': parsed_data, 'ttl': time.time() + self.ttl}
+             logger.info(f"✅ [Cache Updated] Cached data for {file_name}, ttl: {self._cache[file_name]['ttl']}")
+
+     def get_data(self):
+         """
+         Parse the file or directory specified in the file path and return the data.
+
+         :return: A list of parsed data if the file path is a directory, otherwise a single parsed data.
+         """
+         try:
+             if isinstance(self.file_path, list):
+                 parsed_data = []
+                 for f in self.file_path:
+                     file_name = os.path.basename(f)
+                     if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+                         logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
+                         parsed_data.append(self._cache[file_name]['data'])
+                     else:
+                         logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
+                         self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+                         res_data = self._parse_file(f)
+                         parsed_data.append(res_data)
+                         self.set_data(file_name, res_data)
+                 return parsed_data
+
+             elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
+                 file_name = os.path.basename(self.file_path)
+                 if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+                     logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
+                     return self._cache[file_name]['data']
+                 else:
+                     logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
+                     self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+                     parsed_data = self._parse_file(self.file_path)
+                     self.parsed_data = parsed_data
+                     self.set_data(file_name, parsed_data)
+                     return parsed_data
+
+             elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
+                 file_list = [str(file) for file in list(Path(self.file_path).rglob('*.*'))]
+                 parsed_data = []
+                 for f in file_list:
+                     if os.path.isfile(f):
+                         file_name = os.path.basename(f)
+                         if file_name in self._cache and self._cache[file_name]['ttl'] > time.time():
+                             logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
+                             parsed_data.append(self._cache[file_name]['data'])
+                         else:
+                             logger.info(f"⏳ [Cache Miss] No cached data for {file_name}, parsing...")
+                             self._cache = {k: v for k, v in self._cache.items() if v['ttl'] > time.time()}
+                             res_data = self._parse_file(f)
+                             parsed_data.append(res_data)
+                             self.set_data(file_name, res_data)
+                 return parsed_data
+             else:
+                 raise ValueError("Invalid file path.")
+
+         except Exception as e:
+             raise e
+
+     def clean_data(self, method_list: List[str], text: str = None):
+         """
+         Clean data
+
+         methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which is 1 2 3
+
+         :return:
+         """
+         if text:
+             cleaned_text = text
+         elif self.parsed_data:
+             cleaned_text = self.parsed_data.get('content')
+         else:
+             raise ValueError("No data to clean.")
+
+         for method in method_list:
+             if method == 'abnormal':
+                 cleaned_text = data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
+             elif method == 'filter':
+                 cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
+                 cleaned_text = cleaned_text.get("text") if cleaned_text else ''
+             elif method == 'private':
+                 cleaned_text = data_cleaner.PrivacyDesensitization(cleaned_text).to_private().get("text")
+
+         if self.parsed_data:
+             origin_dict = self.parsed_data
+             origin_dict['content'] = cleaned_text
+             self.parsed_data = None
+             return origin_dict
+         else:
+             return cleaned_text
+
+     def get_pre_label(self,
+                       api_key: str,
+                       base_url: str,
+                       model_name: str,
+                       chunk_size: int = 500,
+                       chunk_overlap: int = 100,
+                       question_number: int = 5,
+                       max_workers: int = 5,
+                       messages: List[Dict[str, str]] = None):
+         return generatr_qa_pairs(
+             api_key=api_key,
+             base_url=base_url,
+             model_name=model_name,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             question_number=question_number,
+             max_workers=max_workers,
+             message=messages,
+             file_path=self.file_path
+         )
+
+     def save_label_data(self, label_data: list, save_file_name: str = None):
+         """
+         Save label data to file.
+         :param label_data: Label data to be saved.
+         :param save_file_name: File name to save the label data.
+         """
+         if not label_data:
+             raise ValueError("No data to save.")
+         if not save_file_name:
+             if isinstance(self.file_path, str):
+                 save_file_name = os.path.splitext(os.path.basename(self.file_path))[0]
+             else:
+                 save_file_name = 'label_data'
+         if isinstance(label_data, list):
+             with open(save_file_name + '.jsonl', 'w', encoding='utf-8') as f:
+                 for qa_entry in label_data:
+                     f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")
+             logger.info(f"✅ [Label Data Saved] Label data saved to {save_file_name}.jsonl")
+
+
+     @staticmethod
+     def split_text_into_paragraphs(text: str, max_length:int = 500, chunk_overlap: int = 100):
+         """
+         Split text into paragraphs by sentence boundaries, each paragraph not exceeding max_length characters.
+         Paragraphs will have chunk_overlap characters of overlap between them.
+         """
+         import re
+
+         # Split sentences using Chinese punctuation marks
+         sentences = re.split('(?<=[。!?])', text)
+         paragraphs = []
+         current_paragraph = ''
+         overlap_buffer = ''
+
+         for sentence in sentences:
+             # If current paragraph plus new sentence doesn't exceed max length
+             if len(current_paragraph) + len(sentence) <= max_length:
+                 current_paragraph += sentence
+             else:
+                 if current_paragraph:
+                     # Add current paragraph to results
+                     paragraphs.append(current_paragraph)
+                     # Save overlap portion
+                     overlap_buffer = current_paragraph[-chunk_overlap:] if chunk_overlap > 0 else ''
+                 # Start new paragraph with overlap
+                 current_paragraph = overlap_buffer + sentence
+                 overlap_buffer = ''
+
+                 # Handle overly long sentences
+                 while len(current_paragraph) > max_length:
+                     # Split long paragraph
+                     split_point = max_length - len(overlap_buffer)
+                     paragraphs.append(current_paragraph[:split_point])
+                     # Update overlap buffer
+                     overlap_buffer = current_paragraph[split_point - chunk_overlap:split_point] if chunk_overlap > 0 else ''
+                     current_paragraph = overlap_buffer + current_paragraph[split_point:]
+                     overlap_buffer = ''
+
+         # Add the last paragraph
+         if current_paragraph:
+             paragraphs.append(current_paragraph)
+
+         return paragraphs
+
+     @staticmethod
+     def split_with_langchain(text: str, chunk_size: int = 500, chunk_overlap: int = 100):
+         """
+         Split text using LangChain's intelligent text splitting
+
+         :param text: Text to be split
+         :param chunk_size: Maximum length of each chunk
+         :param chunk_overlap: Number of overlapping characters between chunks
+         :return: List of split text
+         """
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             length_function=len,
+             is_separator_regex=False,
+         )
+         return text_splitter.split_text(text)
+
+     def split_data(
+             self,
+             parsed_data: Union[str, dict] = None,
+             chunk_size: int = 500,
+             chunk_overlap: int = 100,
+             use_langchain: bool = False):
+         """
+         Improved splitting method with LangChain option
+
+         :param use_langchain: Whether to use LangChain for splitting
+         :param parsed_data: Data to be split, either string or dict
+         :param chunk_size: Maximum length of each chunk
+         :param chunk_overlap: Number of overlapping characters between chunks
+         :return: List or dict of split text
+         """
+         if parsed_data:
+             self.parsed_data = parsed_data
+         if not self.parsed_data:
+             raise ValueError("No data to split.")
+
+         if use_langchain:
+             if isinstance(self.parsed_data, str):
+                 return self.split_with_langchain(self.parsed_data, chunk_size, chunk_overlap)
+             elif isinstance(self.parsed_data, dict):
+                 if 'content' not in self.parsed_data:
+                     raise ValueError("Input dict must contain 'content' key")
+                 chunks = self.split_with_langchain(self.parsed_data['content'], chunk_size, chunk_overlap)
+                 result = self.parsed_data.copy()
+                 result['content'] = chunks
+                 return result
+
+         # Handle string input
+         if isinstance(self.parsed_data, str):
+             return self.split_text_into_paragraphs(self.parsed_data, chunk_size, chunk_overlap)
+
+         # Handle dict input
+         elif isinstance(self.parsed_data, dict):
+             if 'content' not in self.parsed_data:
+                 raise ValueError("Input dict must contain 'content' key")
+
+             content = self.parsed_data['content']
+             chunks = self.split_text_into_paragraphs(content, chunk_size, chunk_overlap)
+
+             result = self.parsed_data.copy()
+             result['content'] = chunks
+             return result
+         else:
+             raise ValueError("Unsupported input type")
+
+
+     def _parse_file(self, file_path):
+         """
+         Create a parser instance using ParserFactory and parse the file.
+
+         :param file_path: The path to the file to be parsed.
+         :return: The parsed data.
+         """
+         try:
+             parser = ParserFactory.create_parser(
+                 use_mineru=self.use_mineru,
+                 file_path=file_path,
+                 to_markdown=self.to_markdown,
+                 timeout=self.timeout
+             )
+             if parser:
+                 return parser.parse(file_path=file_path)
+         except Exception as e:
+             raise e
+
+
+ if __name__ == '__main__':
+     pass
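
For context, a minimal usage sketch of the API surface introduced in this release: the TTL-based in-memory parse cache, split_data, and save_label_data. The file name, endpoint, credentials, and model name below are placeholders, the import path mirrors the module shown in this diff, and the sketch assumes get_pre_label returns the list of QA entries that save_label_data expects.

    from datamax.parser.core import DataMax

    # ttl > 0 enables the in-memory parse cache added in 0.1.15 (default 3600 s).
    dm = DataMax(file_path="example.docx", to_markdown=True, ttl=3600)

    data = dm.get_data()    # first call parses; repeat calls within the TTL are served from the cache
    chunks = dm.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False)

    # QA-pair generation and JSONL export (api_key, base_url, and model_name are placeholders).
    qa_pairs = dm.get_pre_label(
        api_key="sk-...",
        base_url="https://api.example.com/v1",
        model_name="your-model-name",
        chunk_size=500,
        chunk_overlap=100,
        question_number=5,
    )
    dm.save_label_data(qa_pairs)    # writes example.jsonl in the working directory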