pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.post2.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/WHEEL +0 -0
datamax/utils/tokenizer.py
@@ -1,22 +1,22 @@
- import tiktoken
-
-
- class DashScopeClient:
-     _instance = None
-     def __new__(cls, *args, **kwargs):
-         if not cls._instance:
-             cls._instance = super(DashScopeClient, cls).__new__(cls)
-         return cls._instance
-
-     def get_tokenizer(self, content):
-         '''
-         Note: tiktoken only supports the following models, with different token calculations for each.
-         tiktoken is a BPE tokenizer developed by OpenAI.
-         o200k_base corresponds to models: gpt-4o, gpt-4o-mini
-         cl100k_base corresponds to models: gpt-4-turbo, gpt-4, gpt-3.5-turbo, ...
-         p50k_base corresponds to models: text-davinci-002, text-davinci-003
-         r50k_base corresponds to the model: gpt2
-         '''
-         encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
-         num_tokens = len(encoding.encode(content))
+ import tiktoken
+
+
+ class DashScopeClient:
+     _instance = None
+     def __new__(cls, *args, **kwargs):
+         if not cls._instance:
+             cls._instance = super(DashScopeClient, cls).__new__(cls)
+         return cls._instance
+
+     def get_tokenizer(self, content):
+         '''
+         Note: tiktoken only supports the following models, with different token calculations for each.
+         tiktoken is a BPE tokenizer developed by OpenAI.
+         o200k_base corresponds to models: gpt-4o, gpt-4o-mini
+         cl100k_base corresponds to models: gpt-4-turbo, gpt-4, gpt-3.5-turbo, ...
+         p50k_base corresponds to models: text-davinci-002, text-davinci-003
+         r50k_base corresponds to the model: gpt2
+         '''
+         encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
+         num_tokens = len(encoding.encode(content))
          return num_tokens
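
For context, a minimal usage sketch of the singleton tokenizer in this hunk (assumes tiktoken is installed; the sample string and its token count are illustrative):

```python
# Usage sketch for DashScopeClient (requires tiktoken).
from datamax.utils.tokenizer import DashScopeClient

client = DashScopeClient()
assert client is DashScopeClient()  # __new__ always returns the same instance

# cl100k_base splits "Hello, world!" into ["Hello", ",", " world", "!"] -> 4 tokens
print(client.get_tokenizer("Hello, world!"))  # 4
```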
datamax/utils/uno_handler.py
@@ -0,0 +1,426 @@
+ from loguru import logger
+ import os
+ import subprocess
+ import threading
+ import time
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Optional
+
+ # Lazy-import flag and lock
+ _uno_imported = False
+ _import_error = None
+ _import_lock = threading.Lock()
+
+
+ def _lazy_import_uno():
+     """Lazily import the UNO modules to avoid conflicts with other libraries (thread-safe)."""
+     global _uno_imported, _import_error
+
+     # Fast path to avoid acquiring the lock unnecessarily
+     if _uno_imported:
+         return True
+
+     with _import_lock:
+         # Double-checked locking pattern
+         if _uno_imported:
+             return True
+
+         try:
+             # Import all UNO-related modules here
+             global uno, PropertyValue, NoConnectException
+             import uno
+             from com.sun.star.beans import PropertyValue
+             from com.sun.star.connection import NoConnectException
+
+             _uno_imported = True
+             logger.info("✅ UNO modules imported successfully")
+             return True
+         except ImportError as e:
+             _import_error = e
+             logger.error(f"❌ Failed to import UNO modules: {str(e)}")
+             return False
+
+
+ def ensure_uno_imported():
+     """Ensure UNO has been imported; for scenarios that need it imported up front."""
+     if not _lazy_import_uno():
+         raise ImportError(
+             f"python-uno is not installed or cannot be imported. Error: {_import_error}\n"
+             "Please install LibreOffice and make sure python-uno is available.\n"
+             "Ubuntu/Debian: apt-get install libreoffice python3-uno\n"
+             "For other systems see: https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
+         )
+
+
+ # Check whether UNO is available (without importing it immediately)
+ def check_uno_available():
+     """Check whether UNO is available (does not actually import it)."""
+     try:
+         import importlib.util
+         spec = importlib.util.find_spec("uno")
+         return spec is not None
+     except:
+         return False
+
+
+ HAS_UNO = check_uno_available()
+
+
+ class UnoManager:
+     """
+     UNO manager that manages LibreOffice service instances and document conversion.
+     Single-threaded version, suited to stable and efficient document processing.
+     """
+
+     def __init__(self, host: str = "localhost", port: int = 2002, timeout: int = 30):
+         """
+         Initialize the UNO manager.
+
+         Args:
+             host: LibreOffice service host address
+             port: LibreOffice service port
+             timeout: connection timeout in seconds
+         """
+         # Make sure UNO is imported (in a thread-safe way)
+         ensure_uno_imported()
+
+         self.host = host
+         self.port = port
+         self.timeout = timeout
+         self.connection_string = (
+             f"socket,host={host},port={port};urp;StarOffice.ComponentContext"
+         )
+         self._lock = threading.Lock()
+         self._desktop = None
+         self._ctx = None
+         self._soffice_process = None
+         self._connected = False
+         logger.info(f"🚀 UnoManager initialized - host: {host}, port: {port} (single-threaded mode)")
+
+     def _start_soffice_service(self):
+         """Start the LibreOffice service."""
+         logger.info(f"🌟 Starting LibreOffice service, listening on port {self.port}...")
+
+         # Check whether a service is already running
+         if self._check_soffice_running():
+             logger.info("✅ LibreOffice service is already running")
+             return
+
+         # Start a new service instance
+         cmd = [
+             "soffice",
+             "--headless",
+             "--invisible",
+             "--nocrashreport",
+             "--nodefault",
+             "--nofirststartwizard",
+             "--nologo",
+             "--norestore",
+             f"--accept={self.connection_string}",
+         ]
+
+         try:
+             self._soffice_process = subprocess.Popen(
+                 cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+             )
+             logger.info(f"⏳ Waiting for the LibreOffice service to start...")
+             time.sleep(5)  # give the service some time to start
+
+             if self._check_soffice_running():
+                 logger.info("✅ LibreOffice service started successfully")
+             else:
+                 raise Exception("Failed to start the LibreOffice service")
+
+         except Exception as e:
+             logger.error(f"❌ Failed to start the LibreOffice service: {str(e)}")
+             raise
+
+     def _check_soffice_running(self) -> bool:
+         """Check whether the LibreOffice service is running."""
+         try:
+             import socket
+
+             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+             result = sock.connect_ex((self.host, self.port))
+             sock.close()
+             return result == 0
+         except:
+             return False
+
+     def is_connected(self) -> bool:
+         """Check whether a connection has been established."""
+         with self._lock:
+             return self._connected and self._desktop is not None
+
+     def connect(self):
+         """Connect to the LibreOffice service."""
+         with self._lock:
+             if self._connected and self._desktop is not None:
+                 return  # already connected
+
+             self._start_soffice_service()
+
+             logger.info(f"🔌 Connecting to the LibreOffice service...")
+             start_time = time.time()
+
+             while time.time() - start_time < self.timeout:
+                 try:
+                     # Get the component context
+                     local_ctx = uno.getComponentContext()
+                     resolver = local_ctx.ServiceManager.createInstanceWithContext(
+                         "com.sun.star.bridge.UnoUrlResolver", local_ctx
+                     )
+
+                     # Connect to LibreOffice
+                     self._ctx = resolver.resolve(f"uno:{self.connection_string}")
+                     self._desktop = self._ctx.ServiceManager.createInstanceWithContext(
+                         "com.sun.star.frame.Desktop", self._ctx
+                     )
+
+                     self._connected = True
+                     logger.info("✅ Connected to the LibreOffice service")
+                     return
+
+                 except NoConnectException:
+                     logger.debug("⏳ Waiting for the LibreOffice service to become ready...")
+                     time.sleep(1)
+                 except Exception as e:
+                     logger.error(f"❌ Connection failed: {str(e)}")
+                     time.sleep(1)
+
+             raise TimeoutError(f"Timed out connecting to the LibreOffice service ({self.timeout}s)")
+
+     def disconnect(self):
+         """Disconnect from the LibreOffice service."""
+         with self._lock:
+             if self._desktop is not None:
+                 try:
+                     self._desktop.terminate()
+                 except:
+                     pass
+                 self._desktop = None
+                 self._ctx = None
+                 self._connected = False
+                 logger.info("🔌 Disconnected from the LibreOffice service")
+
+     def stop_service(self):
+         """Stop the LibreOffice service."""
+         self.disconnect()
+         if self._soffice_process:
+             try:
+                 self._soffice_process.terminate()
+                 self._soffice_process.wait(timeout=10)
+             except:
+                 self._soffice_process.kill()
+             self._soffice_process = None
+             logger.info("🛑 LibreOffice service stopped")
+
+     @contextmanager
+     def get_document(self, file_path: str):
+         """
+         Context manager that yields a document object.
+
+         Args:
+             file_path: path to the document
+
+         Yields:
+             the document object
+         """
+         self.connect()
+
+         # Convert the path to URL form
+         file_url = uno.systemPathToFileUrl(os.path.abspath(file_path))
+
+         # Open the document
+         properties = []
+         properties.append(self._make_property("Hidden", True))
+         properties.append(self._make_property("ReadOnly", True))
+
+         document = None
+         try:
+             document = self._desktop.loadComponentFromURL(
+                 file_url, "_blank", 0, properties
+             )
+             logger.debug(f"📄 Opened document: {file_path}")
+             yield document
+         finally:
+             if document:
+                 try:
+                     document.dispose()
+                     logger.debug(f"📄 Closed document: {file_path}")
+                 except:
+                     pass
+
+     def convert_document(
+         self,
+         input_path: str,
+         output_path: str,
+         output_format: str,
+         filter_name: Optional[str] = None,
+     ):
+         """
+         Convert a document to another format.
+
+         Args:
+             input_path: input file path
+             output_path: output file path
+             output_format: output format (e.g. 'txt', 'pdf', 'docx')
+             filter_name: filter name (optional)
+         """
+         logger.info(f"🔄 Starting document conversion: {input_path} -> {output_path} ({output_format})")
+
+         with self.get_document(input_path) as document:
+             if document is None:
+                 raise Exception(f"Cannot open document: {input_path}")
+
+             # Prepare output properties
+             properties = []
+
+             # Set the filter
+             if filter_name:
+                 properties.append(self._make_property("FilterName", filter_name))
+             else:
+                 # Choose a filter automatically based on the format
+                 if output_format == "txt":
+                     # For plain text, try several filters in turn
+                     filter_options = [
+                         ("Text (encoded)", "UTF8"),
+                         ("Text", None),
+                         ("HTML (StarWriter)", None)
+                     ]
+
+                     success = False
+                     for filter_name, filter_option in filter_options:
+                         try:
+                             properties = []
+                             properties.append(self._make_property("FilterName", filter_name))
+                             if filter_option:
+                                 properties.append(self._make_property("FilterOptions", filter_option))
+
+                             # Make sure the output directory exists
+                             output_dir = os.path.dirname(output_path)
+                             if output_dir and not os.path.exists(output_dir):
+                                 os.makedirs(output_dir)
+
+                             # Convert the path to URL form
+                             output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
+
+                             # Perform the conversion
+                             document.storeToURL(output_url, properties)
+                             logger.info(f"✅ Document converted successfully (filter: {filter_name}): {output_path}")
+                             success = True
+                             break
+                         except Exception as e:
+                             logger.debug(f"🔄 Filter {filter_name} failed: {str(e)}")
+                             continue
+
+                     if not success:
+                         raise Exception(f"All text filters failed; cannot convert document: {input_path}")
+
+                     return  # conversion finished, return early
+                 else:
+                     # Use the default filter for other formats
+                     filter_map = {
+                         "pdf": "writer_pdf_Export",
+                         "docx": "MS Word 2007 XML",
+                         "pptx": "Impress MS PowerPoint 2007 XML",
+                         "xlsx": "Calc MS Excel 2007 XML",
+                     }
+                     if output_format in filter_map:
+                         properties.append(
+                             self._make_property("FilterName", filter_map[output_format])
+                         )
+
+             # Make sure the output directory exists
+             output_dir = os.path.dirname(output_path)
+             if output_dir and not os.path.exists(output_dir):
+                 os.makedirs(output_dir)
+
+             # Convert the path to URL form
+             output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
+
+             # Perform the conversion
+             document.storeToURL(output_url, properties)
+             logger.info(f"✅ Document converted successfully: {output_path}")
+
+     def _make_property(self, name: str, value):
+         """Create a PropertyValue object."""
+         prop = PropertyValue()
+         prop.Name = name
+         prop.Value = value
+         return prop
+
+
+ # Global singleton UnoManager
+ _global_uno_manager: Optional[UnoManager] = None
+ _manager_lock = threading.Lock()
+
+
+ def get_uno_manager() -> UnoManager:
+     """Get the global singleton UNO manager."""
+     global _global_uno_manager
+
+     if _global_uno_manager is None:
+         with _manager_lock:
+             if _global_uno_manager is None:
+                 _global_uno_manager = UnoManager()
+                 logger.info("🎯 Created global singleton UnoManager (single-threaded mode)")
+
+     return _global_uno_manager
+
+
+ def cleanup_uno_manager():
+     """Clean up the global UNO manager."""
+     global _global_uno_manager
+
+     with _manager_lock:
+         if _global_uno_manager is not None:
+             try:
+                 _global_uno_manager.stop_service()
+             except:
+                 pass
+             _global_uno_manager = None
+             logger.info("🧹 Cleaned up the global UnoManager")
+
+
+ @contextmanager
+ def uno_manager_context():
+     """Context manager that acquires and manages the UNO manager automatically."""
+     manager = get_uno_manager()
+     try:
+         yield manager
+     finally:
+         # In single-threaded mode, keep the connection open for efficiency
+         pass
+
+
+ def convert_with_uno(
+     input_path: str,
+     output_format: str,
+     output_dir: Optional[str] = None
+ ) -> str:
+     """
+     Convert a document with UNO (convenience function).
+
+     Args:
+         input_path: input file path
+         output_format: output format
+         output_dir: output directory (optional; defaults to the input file's directory)
+
+     Returns:
+         the output file path
+     """
+     input_path = Path(input_path)
+
+     if output_dir is None:
+         output_dir = input_path.parent
+     else:
+         output_dir = Path(output_dir)
+
+     output_path = output_dir / f"{input_path.stem}.{output_format}"
+
+     with uno_manager_context() as manager:
+         manager.convert_document(str(input_path), str(output_path), output_format)
+
+     return str(output_path)
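
For orientation, a minimal usage sketch of the convenience API this new module exposes (assumes LibreOffice and python-uno are installed; `sample.docx` is a placeholder input):

```python
# Hypothetical usage of datamax.utils.uno_handler; "sample.docx" is illustrative.
from datamax.utils.uno_handler import convert_with_uno, cleanup_uno_manager

# Starts (or reuses) a headless soffice instance on localhost:2002, converts
# the document, and returns the output path next to the input file.
txt_path = convert_with_uno("sample.docx", "txt")
print(txt_path)  # sample.txt

cleanup_uno_manager()  # stop the shared LibreOffice service when done
```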
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pydatamax
- Version: 0.1.14
+ Version: 0.1.15.post2
  Summary: A library for parsing and converting various file formats.
  Home-page: https://github.com/Hi-Dolphin/datamax
  Author: ccy
@@ -105,10 +105,15 @@ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
  
  # AI annotation
  qa_data = dm.get_pre_label(
-     api_key="your-api-key",
-     base_url="https://api.openai.com/v1",
-     model_name="gpt-3.5-turbo"
+     api_key="sk-xxx",
+     base_url="https://api.provider.com/v1",
+     model_name="model-name",
+     chunk_size=500,        # text chunk size
+     chunk_overlap=100,     # overlap length
+     question_number=5,     # questions generated per chunk
+     max_workers=5          # concurrency
  )
+ dm.save_label_data(qa_data)
  ```
  
  ## 📖 Detailed Documentation
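
A quick sanity check on the new chunking parameters above, assuming the usual sliding-window semantics (each chunk advances by chunk_size − chunk_overlap characters): with chunk_size=500 and chunk_overlap=100, a 1,300-character document yields chunks covering roughly [0, 500), [400, 900), and [800, 1300), i.e. 1 + ⌈(1300 − 500) / 400⌉ = 3 chunks, so question_number=5 would produce about 15 candidate QA pairs.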
@@ -138,8 +143,54 @@ dm = DataMax(file_path="document.docx", to_markdown=True)
  # Image OCR
  dm = DataMax(file_path="image.jpg", use_ocr=True)
  ```
+ ### Batch Processing
+ ```python
+ # Parse multiple files in batch
+ dm = DataMax(
+     file_path=["file1.pdf", "file2.docx"],
+     use_mineru=True
+ )
+ data = dm.get_data()
+ ```
+
+ ### Caching Parsed Results
+ ```python
+ # Cache parsed results to avoid repeated parsing
+ dm = DataMax(
+     file_path=["file1.pdf", "file2.docx"],
+     ttl=3600  # cache duration in seconds; default 3600, 0 disables caching
+ )
+ data = dm.get_data()
+ ```
  
  ### Data Cleaning
+ #### Abnormal Cleaning
+
+ - remove_abnormal_chars: remove abnormal characters from the text
+ - remove_html_tags: remove HTML tags
+ - convert_newlines: convert \r to \n and merge runs of \n into a single \n
+ - single_space: collapse runs of two or more spaces into a single space
+ - tabs_to_spaces: convert tabs to 4 spaces
+ - remove_invisible_chars: remove invisible ASCII characters
+ - simplify_chinese: convert traditional Chinese to simplified Chinese
+
+ #### Text Filtering
+
+ - filter_by_word_repetition: filter by word repetition rate
+ - filter_by_char_count: filter by character count
+ - filter_by_numeric_content: filter by the ratio of numeric characters
+
+ #### Privacy Desensitization
+
+ - replace_ip
+ - replace_email
+ - replace_customer_number: mask hotline numbers such as 4008-123-123
+ - replace_bank_id
+ - replace_phone_number
+ - replace_qq
+ - replace_id_card
+
+
  
  
  ```python
@@ -148,6 +199,67 @@
      "private",  # Privacy information masking
      "filter"    # Text filtering and normalization
  ])
+
+ # Custom cleaning mode
+ from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
+ dm = DataMax(
+     file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
+ )
+ parsed_data = dm.get_data().get('content')
+ # 1. Text filtering
+ tf = TextFilter(parsed_data=parsed_data)
+ # Word repetition filtering - default threshold is 0.6 (at most 60% of the characters may be repeated)
+ tf_bool = tf.filter_by_word_repetition(threshold=0.6)
+ if tf_bool:
+     print("Text passed word repetition filtering")
+ else:
+     print("Text failed word repetition filtering")
+
+ # Character count filtering - defaults: min_chars=30, max_chars=500000
+ tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
+ if tf_bool:
+     print("Text passed character count filtering")
+ else:
+     print("Text failed character count filtering")
+
+ # Numeric content filtering - default threshold=0.6 (at most 60% of the characters may be digits)
+ tf_bool = tf.filter_by_numeric_content(threshold=0.6)
+ if tf_bool:
+     print("Text passed numeric ratio filtering")
+ else:
+     print("Text failed numeric ratio filtering")
+
+ # 2. Privacy desensitization
+ pd = PrivacyDesensitization(parsed_data=parsed_data)
+ res = pd.replace_ip(
+     token="MyIP"
+ )
+ print(res)
+
+ # 3. Abnormal character cleaning
+ ac = AbnormalCleaner(parsed_data=parsed_data)
+ res = ac.remove_abnormal_chars()
+ res = ac.remove_html_tags()
+ res = ac.convert_newlines()
+ res = ac.single_space()
+ res = ac.tabs_to_spaces()
+ res = ac.remove_invisible_chars()
+ res = ac.simplify_chinese()
+ print(res)
+ ```
+ ### Text Segmentation
+ ```python
+ dm.split_data(
+     chunk_size=500,      # chunk size
+     chunk_overlap=100,   # overlap length
+     use_langchain=True   # use LangChain for text segmentation
+ )
+
+ # When use_langchain is False, a custom segmentation method is used:
+ # 。!? act as separators, consecutive separators are merged,
+ # and chunk_size strictly limits the chunk length.
+ for chunk in dm.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
+     print(chunk)
  ```
  
  ### AI Annotation
@@ -225,4 +337,4 @@ This project is licensed under the [MIT License](LICENSE).
  
  ---
  
- ⭐ If this project helps you, please give us a star!
+ ⭐ If this project helps you, please give us a star!
pydatamax-0.1.15.post2.dist-info/RECORD
@@ -0,0 +1,38 @@
+ datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
+ datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamax/loader/core.py,sha256=NGnK2m59GRBauYxZST0kyX5f4zhvAOk4Z5bVoF0CjGo,5218
+ datamax/loader/minio_handler.py,sha256=e7ZUlwoStVe5iQfAVfNgEwRLxen4NbxwokpJZl6AR0U,6557
+ datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
+ datamax/parser/__init__.py,sha256=sIB1N4B_fVguSBN-Uw9tGxAO6s0oi4Tq4kRZ59YlUKo,52
+ datamax/parser/base.py,sha256=FamDV6csc3aXVbobMR1lPNtVpvYMO19koRJW9poj_gE,2590
+ datamax/parser/core.py,sha256=2Gkz08WrRPt9ga0DisXrV1Aa_Yu7jUMlECOXNaexpwQ,16831
+ datamax/parser/csv_parser.py,sha256=lHQs1MHK9WM4Vl0p9nsE3fFhewF0EoXZUhtk8ixznRw,1028
+ datamax/parser/doc_parser.py,sha256=g2vZEdwcA-5AM2y0IHBy0bx1cOflkvLBobQ6tljX1fo,27940
+ datamax/parser/docx_parser.py,sha256=fehvMdOexWRRm1HTOCkVa_zDWI8A_LbGJdEFwW-MGss,29613
+ datamax/parser/epub_parser.py,sha256=K4eCS4wIXJzDicvtVAfQT8yt1gFHeibZN5-EdQZfJe8,1621
+ datamax/parser/html_parser.py,sha256=5ACrVc03Q9pJqWI_b0EtRgOYy0eMYJq4podgHGD68Z8,1453
+ datamax/parser/image_parser.py,sha256=UH3duPvB7Xu6CFlEeAukX5uJ8VlqnMR89hcLsW2O-aU,1281
+ datamax/parser/json_parser.py,sha256=2Ns2Lm6sei9TnDaFFYvl-xhyhse12sMJBwjKNACw4po,1072
+ datamax/parser/md_parser.py,sha256=rHJqtRV78XgQuKtDdwn1LcgRSUEuhGBqN5uaHG6oPT4,2251
+ datamax/parser/pdf_parser.py,sha256=YOJFOEC8DxWLAE6yNd2x6qMvYJl3sWVcM1eA8R4uA30,4116
+ datamax/parser/ppt_parser.py,sha256=0OlsIrzZZJnYZqLmQkUz4J_Hiv-rQHLHJnHIsw345c8,4631
+ datamax/parser/pptx_parser.py,sha256=yWajVd8kpyTdFavR8XcbwxOj94uNBswDoMHfSOycn0o,1870
+ datamax/parser/txt_parser.py,sha256=NXs7aNpm1PUwiUSlN1RU23ittuuQSBaBNI3KeQjJFXs,1750
+ datamax/parser/xls_parser.py,sha256=iNMx8iPakjE984dkaFL-oUBYWpQwxbWoDnQdwfAeeGM,980
+ datamax/parser/xlsx_parser.py,sha256=hUOFqkqkI0XPcwwrp2cs4PFKbChpZtb8orGsZc9kxJ0,9089
+ datamax/utils/__init__.py,sha256=75D4WFE_FVG9MyT8qWtvtlgzuuRelTC7ObSqqfjDKIY,1476
+ datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
+ datamax/utils/data_cleaner.py,sha256=TrrxC1r0__wuOhrQSJZcJKoEIyB4eNKWZkA1IoBYhyQ,9937
+ datamax/utils/env_setup.py,sha256=p_7sqHwyXroeOI_yFZpUOK6wOGmPVmf-gBa6M3351O4,3539
+ datamax/utils/gotocr_pdf.py,sha256=A7sn77EQBDbAe-4edCBUlYXKuE2mY7JcsFGm8U3-xbE,8744
+ datamax/utils/mineru_operator.py,sha256=4i4FtDkDE61FWPyRoDjPujHYJq_kDUAkwlowmFWdEOA,2303
+ datamax/utils/paddleocr_pdf_operator.py,sha256=SW06dts1SxDnUvyf5zWYpAN_6t9PLtJXUSsYhSS317I,3645
+ datamax/utils/ppt_extract.py,sha256=jBVGYEsBGPjHqyq7drHTOM8MnFOwqKyHhbkKmEAryAk,6307
+ datamax/utils/qa_generator.py,sha256=pXxdFm_EnWgMuilfmLKgy2c6NDexQZN8nWxT-bYBt74,12548
+ datamax/utils/tokenizer.py,sha256=o78GPmeJ3vs3-SF0b2nMm35XtbrCKbrhDW0gI9gqGl4,880
+ datamax/utils/uno_handler.py,sha256=gDm42OQQQoCiOP0SB7xZ9TRF6A_XBHNavwG5ycj6kEQ,14807
+ pydatamax-0.1.15.post2.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
+ pydatamax-0.1.15.post2.dist-info/METADATA,sha256=iBXzrENUTG5pKYywsogIDOCv4_69iXAMZa5FWpaT48I,9801
+ pydatamax-0.1.15.post2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ pydatamax-0.1.15.post2.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
+ pydatamax-0.1.15.post2.dist-info/RECORD,,
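
For reference, each RECORD entry is a CSV row of file path, sha256 digest, and size in bytes; the RECORD file's own entry ends in ",," because a file cannot record its own hash and size.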