pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/minio_handler.py +171 -171
- datamax/loader/oss_handler.py +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +466 -10
- datamax/parser/docx_parser.py +449 -11
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -215
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.14.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/utils/tokenizer.py
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
import tiktoken
|
2
|
-
|
3
|
-
|
4
|
-
class DashScopeClient:
|
5
|
-
_instance = None
|
6
|
-
def __new__(cls, *args, **kwargs):
|
7
|
-
if not cls._instance:
|
8
|
-
cls._instance = super(DashScopeClient, cls).__new__(cls)
|
9
|
-
return cls._instance
|
10
|
-
|
11
|
-
def get_tokenizer(self, content):
|
12
|
-
'''
|
13
|
-
Note: tiktoken only supports the following models with different token calculations
|
14
|
-
A BPE word divider developed by tiktoken openai
|
15
|
-
o200k_base corresponds to models: gpt-4o, GPT-4O-MINI
|
16
|
-
cl100k_base models: GPT-4-Turbo, gpt-4, gpt-3.5-turbo...
|
17
|
-
p50k_base corresponds to models text-davinci-002 and text-davinci-003
|
18
|
-
r50k_base corresponds to model gpt2
|
19
|
-
'''
|
20
|
-
encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
|
21
|
-
num_tokens = len(encoding.encode(content))
|
1
|
+
import tiktoken
|
2
|
+
|
3
|
+
|
4
|
+
class DashScopeClient:
    """Singleton helper that counts tokens with tiktoken.

    ``__new__`` always returns the same instance, so every construction of
    this class yields one shared object.
    """

    # The single shared instance; created by the first construction.
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Explicit None test: the decision must not depend on the instance's
        # truthiness.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_tokenizer(self, content, encoding_name="cl100k_base"):
        """Return the number of tokens in *content*.

        tiktoken is OpenAI's BPE tokenizer; token counts differ between
        encodings, and each encoding corresponds to a model family:

        - ``o200k_base``: gpt-4o, gpt-4o-mini
        - ``cl100k_base``: gpt-4-turbo, gpt-4, gpt-3.5-turbo, ...
        - ``p50k_base``: text-davinci-002, text-davinci-003
        - ``r50k_base``: gpt2

        Args:
            content: Text to tokenize.
            encoding_name: Name of the tiktoken encoding to use. Defaults to
                ``"cl100k_base"``, the encoding this method always used
                before the parameter existed.

        Returns:
            The length of the token sequence produced for *content*.
        """
        encoding = tiktoken.get_encoding(encoding_name=encoding_name)
        num_tokens = len(encoding.encode(content))
        return num_tokens
|
@@ -0,0 +1,426 @@
|
|
1
|
+
from loguru import logger
|
2
|
+
import os
|
3
|
+
import subprocess
|
4
|
+
import threading
|
5
|
+
import time
|
6
|
+
from contextlib import contextmanager
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Optional
|
9
|
+
|
10
|
+
# Lazy-import state and the lock that guards it.
# _uno_imported: set to True by _lazy_import_uno() once the UNO imports succeed.
_uno_imported = False
# _import_error: the ImportError captured by the last failed import attempt;
# ensure_uno_imported() includes it in the error it raises.
_import_error = None
# _import_lock: serializes the import attempt inside _lazy_import_uno().
_import_lock = threading.Lock()
|
14
|
+
|
15
|
+
|
16
|
+
def _lazy_import_uno():
    """Import the UNO modules on first use and publish them as module globals.

    The import is deferred until needed to avoid conflicts with other
    libraries. On success ``uno``, ``PropertyValue`` and
    ``NoConnectException`` become module-level names and ``_uno_imported``
    is set.

    Returns:
        True when the UNO modules are imported (now or earlier), False when
        the import raised ImportError; the error is kept in
        ``_import_error``.
    """
    global _uno_imported, _import_error
    global uno, PropertyValue, NoConnectException

    # Unlocked check first so already-initialized callers skip the lock.
    if not _uno_imported:
        with _import_lock:
            # Check again under the lock: another thread may have completed
            # the import while this one was waiting.
            if not _uno_imported:
                try:
                    import uno
                    from com.sun.star.beans import PropertyValue
                    from com.sun.star.connection import NoConnectException
                except ImportError as e:
                    _import_error = e
                    logger.error(f"❌ UNO模块导入失败: {str(e)}")
                    return False

                _uno_imported = True
                logger.info("✅ UNO模块导入成功")

    return True
|
43
|
+
|
44
|
+
|
45
|
+
def ensure_uno_imported():
    """Import UNO right away, for callers that need it loaded up front.

    Raises:
        ImportError: If ``_lazy_import_uno()`` reports failure. The message
            carries the captured import error and installation hints.
    """
    if _lazy_import_uno():
        return

    install_hint = (
        f"python-uno未安装或无法导入。错误: {_import_error}\n"
        "请安装LibreOffice并确保python-uno可用。\n"
        "Ubuntu/Debian: apt-get install libreoffice python3-uno\n"
        "其他系统请参考: https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
    )
    raise ImportError(install_hint)
|
54
|
+
|
55
|
+
|
56
|
+
# Probe for UNO without importing it.
def check_uno_available():
    """Report whether a ``uno`` module can be located, without importing it.

    Returns:
        True if ``importlib.util.find_spec("uno")`` finds a spec, False if it
        finds none or the lookup itself raises.
    """
    try:
        import importlib.util

        spec = importlib.util.find_spec("uno")
        return spec is not None
    except Exception:
        # A failed lookup counts as "not available". Exception (not a bare
        # except) so KeyboardInterrupt and SystemExit still propagate.
        return False


# Evaluated once at import time; later changes to the environment are not
# reflected here.
HAS_UNO = check_uno_available()
|
68
|
+
|
69
|
+
|
70
|
+
class UnoManager:
    """Manage a LibreOffice (soffice) service and convert documents through UNO.

    Single-threaded variant: one Desktop connection is created lazily by
    ``connect()`` and reused for every document.
    """

    # Longest time to wait for a freshly spawned soffice to open its port.
    _STARTUP_WAIT_SECONDS = 5
    # Pause between two port probes while waiting for startup.
    _STARTUP_POLL_SECONDS = 0.25
    # Upper bound for one TCP probe so an unreachable host cannot block forever.
    _PROBE_TIMEOUT_SECONDS = 2.0
    # Export filters tried in order for "txt" output: (FilterName, FilterOptions).
    _TEXT_FILTER_CANDIDATES = (
        ("Text (encoded)", "UTF8"),
        ("Text", None),
        ("HTML (StarWriter)", None),
    )
    # Export filter chosen automatically for each non-text output format.
    _DEFAULT_FILTERS = {
        "pdf": "writer_pdf_Export",
        "docx": "MS Word 2007 XML",
        "pptx": "Impress MS PowerPoint 2007 XML",
        "xlsx": "Calc MS Excel 2007 XML",
    }

    def __init__(self, host: str = "localhost", port: int = 2002, timeout: int = 30):
        """Set up the manager; no service is started and no connection made yet.

        Args:
            host: Host of the LibreOffice service.
            port: Port of the LibreOffice service.
            timeout: Seconds ``connect()`` keeps retrying before giving up.

        Raises:
            ImportError: If the UNO modules cannot be imported.
        """
        # Publishes uno / PropertyValue / NoConnectException as module globals.
        ensure_uno_imported()

        self.host = host
        self.port = port
        self.timeout = timeout
        self.connection_string = (
            f"socket,host={host},port={port};urp;StarOffice.ComponentContext"
        )
        self._lock = threading.Lock()
        self._desktop = None
        self._ctx = None
        self._soffice_process = None
        self._connected = False
        logger.info(f"🚀 UnoManager初始化 - 主机: {host}, 端口: {port} (单线程模式)")

    def _start_soffice_service(self):
        """Start a headless soffice unless the port already accepts connections.

        Raises:
            Exception: If the process cannot be spawned or the port is still
                closed after ``_STARTUP_WAIT_SECONDS``.
        """
        logger.info(f"🌟 启动LibreOffice服务,监听端口 {self.port}...")

        if self._check_soffice_running():
            logger.info("✅ LibreOffice服务已在运行")
            return

        cmd = [
            "soffice",
            "--headless",
            "--invisible",
            "--nocrashreport",
            "--nodefault",
            "--nofirststartwizard",
            "--nologo",
            "--norestore",
            f"--accept={self.connection_string}",
        ]

        try:
            self._soffice_process = subprocess.Popen(
                cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )
            logger.info("⏳ 等待LibreOffice服务启动...")

            # Poll instead of sleeping the full window: return as soon as the
            # port opens, and still probe once after the deadline has passed.
            deadline = time.monotonic() + self._STARTUP_WAIT_SECONDS
            while not self._check_soffice_running():
                if time.monotonic() >= deadline:
                    raise Exception("LibreOffice服务启动失败")
                time.sleep(self._STARTUP_POLL_SECONDS)

            logger.info("✅ LibreOffice服务启动成功")

        except Exception as e:
            logger.error(f"❌ 启动LibreOffice服务失败: {str(e)}")
            raise

    def _check_soffice_running(self) -> bool:
        """Return True if a TCP connection to ``host:port`` succeeds.

        Any failure of the probe itself is reported as False.
        """
        import socket

        try:
            # The context manager closes the socket on every path, including
            # when connect_ex raises (e.g. host name resolution failure).
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.settimeout(self._PROBE_TIMEOUT_SECONDS)
                return sock.connect_ex((self.host, self.port)) == 0
        except Exception:
            return False

    def is_connected(self) -> bool:
        """Return True when a connection is flagged and a Desktop object is held."""
        with self._lock:
            return self._connected and self._desktop is not None

    def connect(self):
        """Connect to the LibreOffice service, starting it first if needed.

        Does nothing when already connected. Otherwise retries once per
        second until ``timeout`` seconds have elapsed.

        Raises:
            TimeoutError: If no connection was established in time.
            Exception: Propagated from ``_start_soffice_service()``.
        """
        with self._lock:
            if self._connected and self._desktop is not None:
                return

            self._start_soffice_service()

            logger.info("🔌 连接到LibreOffice服务...")
            deadline = time.monotonic() + self.timeout

            while time.monotonic() < deadline:
                try:
                    # Resolve the remote component context through a local one.
                    local_ctx = uno.getComponentContext()
                    resolver = local_ctx.ServiceManager.createInstanceWithContext(
                        "com.sun.star.bridge.UnoUrlResolver", local_ctx
                    )

                    self._ctx = resolver.resolve(f"uno:{self.connection_string}")
                    self._desktop = self._ctx.ServiceManager.createInstanceWithContext(
                        "com.sun.star.frame.Desktop", self._ctx
                    )

                    self._connected = True
                    logger.info("✅ 成功连接到LibreOffice服务")
                    return

                except NoConnectException:
                    # Port not ready for UNO yet: expected during startup.
                    logger.debug("⏳ 等待LibreOffice服务就绪...")
                    time.sleep(1)
                except Exception as e:
                    logger.error(f"❌ 连接失败: {str(e)}")
                    time.sleep(1)

            raise TimeoutError(f"连接LibreOffice服务超时({self.timeout}秒)")

    def disconnect(self):
        """Drop the connection state, calling ``terminate()`` on the Desktop first.

        NOTE(review): XDesktop.terminate() presumably asks the office itself
        to quit rather than only closing this connection - confirm that this
        is intended for services this manager did not start.
        """
        with self._lock:
            if self._desktop is not None:
                try:
                    self._desktop.terminate()
                except Exception as e:
                    # Best effort: failures here must not block the reset below.
                    logger.debug(f"🔌 终止LibreOffice桌面时出现异常(已忽略): {e}")
                self._desktop = None
                self._ctx = None
                self._connected = False
                logger.info("🔌 已断开LibreOffice服务连接")

    def stop_service(self):
        """Disconnect, then stop the soffice process this manager spawned (if any)."""
        self.disconnect()
        if self._soffice_process:
            try:
                self._soffice_process.terminate()
                self._soffice_process.wait(timeout=10)
            except Exception:
                # Graceful shutdown failed or timed out: force the process down.
                self._soffice_process.kill()
            self._soffice_process = None
            logger.info("🛑 LibreOffice服务已停止")

    @contextmanager
    def get_document(self, file_path: str):
        """Open *file_path* hidden and read-only, and dispose of it on exit.

        Args:
            file_path: Path of the document to open.

        Yields:
            The object returned by ``loadComponentFromURL`` (callers in this
            class treat ``None`` as "could not be opened").
        """
        self.connect()

        file_url = uno.systemPathToFileUrl(os.path.abspath(file_path))

        # Passed as a tuple: PyUNO documents tuple as the Python mapping of a
        # UNO sequence. NOTE(review): list support varies by bridge version.
        properties = (
            self._make_property("Hidden", True),
            self._make_property("ReadOnly", True),
        )

        document = None
        try:
            document = self._desktop.loadComponentFromURL(
                file_url, "_blank", 0, properties
            )
            logger.debug(f"📄 打开文档: {file_path}")
            yield document
        finally:
            if document is not None:
                try:
                    document.dispose()
                    logger.debug(f"📄 关闭文档: {file_path}")
                except Exception as e:
                    # Best effort: a failed dispose must not mask the caller's error.
                    logger.debug(f"📄 关闭文档时出现异常(已忽略): {e}")

    def convert_document(
        self,
        input_path: str,
        output_path: str,
        output_format: str,
        filter_name: Optional[str] = None,
    ):
        """Convert *input_path* and store the result at *output_path*.

        When *filter_name* is not given, the filter is chosen from
        *output_format*: "txt" tries each entry of
        ``_TEXT_FILTER_CANDIDATES`` in order until one succeeds; other
        formats use ``_DEFAULT_FILTERS`` (no filter property is set for
        formats missing from that table).

        Args:
            input_path: Path of the document to convert.
            output_path: Path to write; its directory is created if missing.
            output_format: Output format such as 'txt', 'pdf', 'docx'.
            filter_name: Explicit export filter name (optional).

        Raises:
            Exception: If the document cannot be opened or every text filter
                fails; store errors for other formats propagate unchanged.
        """
        logger.info(f"🔄 开始转换文档: {input_path} -> {output_path} ({output_format})")

        with self.get_document(input_path) as document:
            if document is None:
                raise Exception(f"无法打开文档: {input_path}")

            # Done once up front, so a directory or path problem surfaces
            # as itself instead of being reported as a filter failure.
            output_url = self._prepare_output_url(output_path)

            if not filter_name and output_format == "txt":
                for candidate, options in self._TEXT_FILTER_CANDIDATES:
                    properties = [self._make_property("FilterName", candidate)]
                    if options:
                        properties.append(
                            self._make_property("FilterOptions", options)
                        )
                    try:
                        document.storeToURL(output_url, tuple(properties))
                    except Exception as e:
                        logger.debug(f"🔄 过滤器 {candidate} 失败: {str(e)}")
                        continue
                    logger.info(
                        f"✅ 文档转换成功 (使用过滤器: {candidate}): {output_path}"
                    )
                    return

                raise Exception(f"所有文本过滤器都失败,无法转换文档: {input_path}")

            chosen_filter = filter_name or self._DEFAULT_FILTERS.get(output_format)
            properties = []
            if chosen_filter:
                properties.append(self._make_property("FilterName", chosen_filter))

            document.storeToURL(output_url, tuple(properties))
            logger.info(f"✅ 文档转换成功: {output_path}")

    @staticmethod
    def _prepare_output_url(output_path: str):
        """Create the directory of *output_path* if needed and return its file URL."""
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(output_dir, exist_ok=True)
        return uno.systemPathToFileUrl(os.path.abspath(output_path))

    def _make_property(self, name: str, value):
        """Build a ``PropertyValue`` with the given name and value."""
        prop = PropertyValue()
        prop.Name = name
        prop.Value = value
        return prop
|
353
|
+
|
354
|
+
|
355
|
+
# Process-wide UnoManager singleton: created by get_uno_manager() on first
# use and reset to None by cleanup_uno_manager().
_global_uno_manager: Optional[UnoManager] = None
# Held while the singleton is created or torn down.
_manager_lock = threading.Lock()
|
358
|
+
|
359
|
+
|
360
|
+
def get_uno_manager() -> UnoManager:
    """Return the shared UnoManager, constructing it on the first call."""
    global _global_uno_manager

    # Common case: the singleton already exists, no lock needed.
    if _global_uno_manager is not None:
        return _global_uno_manager

    with _manager_lock:
        # Re-test under the lock so only one caller constructs the manager.
        if _global_uno_manager is None:
            _global_uno_manager = UnoManager()
            logger.info("🎯 创建全局单例UnoManager (单线程模式)")

    return _global_uno_manager
|
371
|
+
|
372
|
+
|
373
|
+
def cleanup_uno_manager():
    """Stop the shared UnoManager's service and forget the instance.

    Does nothing when no shared manager exists. A failure while stopping the
    service is logged rather than raised, and the global is reset either way
    so the next ``get_uno_manager()`` call builds a fresh manager.
    """
    global _global_uno_manager

    with _manager_lock:
        if _global_uno_manager is None:
            return

        try:
            _global_uno_manager.stop_service()
        except Exception as e:
            # Cleanup stays best effort, but the failure is no longer silent.
            logger.warning(f"⚠️ 停止LibreOffice服务时出错: {e}")
        finally:
            _global_uno_manager = None
        logger.info("🧹 清理全局UnoManager")
|
385
|
+
|
386
|
+
|
387
|
+
@contextmanager
def uno_manager_context():
    """Yield the shared UnoManager; nothing is torn down on exit."""
    shared_manager = get_uno_manager()
    # No cleanup after the yield: in single-threaded mode the connection is
    # kept open on purpose so later conversions can reuse it.
    yield shared_manager
|
396
|
+
|
397
|
+
|
398
|
+
def convert_with_uno(
    input_path: str,
    output_format: str,
    output_dir: Optional[str] = None
) -> str:
    """Convert a document with the shared UnoManager (convenience wrapper).

    The output file is named after the input file's stem with
    *output_format* as its extension.

    Args:
        input_path: Path of the document to convert.
        output_format: Target format, also used as the output extension.
        output_dir: Directory for the result; defaults to the directory of
            the input file.

    Returns:
        The output file path as a string.
    """
    source = Path(input_path)
    target_dir = source.parent if output_dir is None else Path(output_dir)
    target = target_dir / f"{source.stem}.{output_format}"

    with uno_manager_context() as manager:
        manager.convert_document(str(source), str(target), output_format)

    return str(target)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pydatamax
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.15
|
4
4
|
Summary: A library for parsing and converting various file formats.
|
5
5
|
Home-page: https://github.com/Hi-Dolphin/datamax
|
6
6
|
Author: ccy
|
@@ -105,10 +105,15 @@ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
|
|
105
105
|
|
106
106
|
# AI annotation
|
107
107
|
qa_data = dm.get_pre_label(
|
108
|
-
api_key="
|
109
|
-
base_url="https://api.
|
110
|
-
model_name="
|
108
|
+
api_key="sk-xxx",
|
109
|
+
base_url="https://api.provider.com/v1",
|
110
|
+
model_name="model-name",
|
111
|
+
chunk_size=500, # Text chunk size
|
112
|
+
chunk_overlap=100, # Overlap length
|
113
|
+
question_number=5, # Number of questions generated per chunk
|
114
|
+
max_workers=5 # Number of concurrent workers
|
111
115
|
)
|
116
|
+
dm.save_label_data(qa_data)
|
112
117
|
```
|
113
118
|
|
114
119
|
## 📖 Detailed Documentation
|
@@ -138,8 +143,54 @@ dm = DataMax(file_path="document.docx", to_markdown=True)
|
|
138
143
|
# Image OCR
|
139
144
|
dm = DataMax(file_path="image.jpg", use_ocr=True)
|
140
145
|
```
|
146
|
+
### Batch Processing
|
147
|
+
```python
|
148
|
+
# Parse multiple files in batch
|
149
|
+
dm = DataMax(
|
150
|
+
file_path=["file1.pdf", "file2.docx"],
|
151
|
+
use_mineru=True
|
152
|
+
)
|
153
|
+
data = dm.get_data()
|
154
|
+
```
|
155
|
+
|
156
|
+
### Cache parsed results
|
157
|
+
```python
|
158
|
+
# Cache parsed results to avoid repeated parsing
|
159
|
+
dm = DataMax(
|
160
|
+
file_path=["file1.pdf", "file2.docx"],
|
161
|
+
ttl=3600 # Cache duration in seconds, default 3600s, 0 means no caching
|
162
|
+
)
|
163
|
+
data = dm.get_data()
|
164
|
+
```
|
141
165
|
|
142
166
|
### Data Cleaning
|
167
|
+
## Abnormal Character Cleaning
|
168
|
+
|
169
|
+
- remove_abnormal_chars Remove abnormal characters from text
|
170
|
+
- remove_html_tags Remove HTML tags
|
171
|
+
- convert_newlines Convert \r to \n and merge multiple \n into single \n
|
172
|
+
- single_space Convert multiple spaces (more than 2) to single space
|
173
|
+
- tabs_to_spaces Convert tabs to 4 spaces
|
174
|
+
- remove_invisible_chars Remove invisible ASCII characters
|
175
|
+
- simplify_chinese Convert traditional Chinese to simplified Chinese
|
176
|
+
|
177
|
+
## Text Filtering
|
178
|
+
|
179
|
+
- filter_by_word_repetition Filter by word repetition rate
|
180
|
+
- filter_by_char_count Filter by character count
|
181
|
+
- filter_by_numeric_content Filter by numeric content ratio
|
182
|
+
|
183
|
+
## Privacy Desensitization
|
184
|
+
|
185
|
+
- replace_ip
|
186
|
+
- replace_email
|
187
|
+
- replace_customer_number Clean hotline numbers like 4008-123-123
|
188
|
+
- replace_bank_id
|
189
|
+
- replace_phone_number
|
190
|
+
- replace_qq
|
191
|
+
- replace_id_card
|
192
|
+
|
193
|
+
|
143
194
|
|
144
195
|
```python
|
145
196
|
# Three cleaning modes
|
@@ -148,6 +199,67 @@ dm.clean_data(method_list=[
|
|
148
199
|
"private", # Privacy information masking
|
149
200
|
"filter" # Text filtering and normalization
|
150
201
|
])
|
202
|
+
|
203
|
+
# Custom cleaning mode
|
204
|
+
from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
|
205
|
+
dm = DataMax(
|
206
|
+
file_path="document.txt"
|
207
|
+
)
|
208
|
+
parsed_data = dm.get_data().get('content')
|
209
|
+
# 1. Text filtering
|
210
|
+
tf = TextFilter(parsed_data=parsed_data)
|
211
|
+
# Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
|
212
|
+
tf_bool = tf.filter_by_word_repetition(threshold=0.6)
|
213
|
+
if tf_bool:
|
214
|
+
print("Text passed word repetition filtering")
|
215
|
+
else:
|
216
|
+
print("Text failed word repetition filtering")
|
217
|
+
|
218
|
+
# Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
|
219
|
+
tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
|
220
|
+
if tf_bool:
|
221
|
+
print("Text passed character count filtering")
|
222
|
+
else:
|
223
|
+
print("Text failed character count filtering")
|
224
|
+
|
225
|
+
# Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
|
226
|
+
tf_bool = tf.filter_by_numeric_content(threshold=0.6)
|
227
|
+
if tf_bool:
|
228
|
+
print("Text passed numeric ratio filtering")
|
229
|
+
else:
|
230
|
+
print("Text failed numeric ratio filtering")
|
231
|
+
|
232
|
+
# 2. Privacy desensitization
|
233
|
+
pd = PrivacyDesensitization(parsed_data=parsed_data)
|
234
|
+
res = pd.replace_ip(
|
235
|
+
token="MyIP"
|
236
|
+
)
|
237
|
+
print(res)
|
238
|
+
|
239
|
+
# 3. Abnormal character cleaning
|
240
|
+
ac = AbnormalCleaner(parsed_data=parsed_data)
|
241
|
+
res = ac.remove_abnormal_chars()
|
242
|
+
res = ac.remove_html_tags()
|
243
|
+
res = ac.convert_newlines()
|
244
|
+
res = ac.single_space()
|
245
|
+
res = ac.tabs_to_spaces()
|
246
|
+
res = ac.remove_invisible_chars()
|
247
|
+
res = ac.simplify_chinese()
|
248
|
+
print(res)
|
249
|
+
```
|
250
|
+
### Text Segmentation
|
251
|
+
```python
|
252
|
+
dm.split_data(
|
253
|
+
chunk_size=500, # Chunk size
|
254
|
+
chunk_overlap=100, # Overlap length
|
255
|
+
use_langchain=True # Use LangChain for text segmentation
|
256
|
+
)
|
257
|
+
|
258
|
+
# When use_langchain is False, use custom segmentation method
|
259
|
+
# Using 。!? as separators, consecutive separators will be merged
|
260
|
+
# chunk_size strictly limits the string length
|
261
|
+
for chunk in dm.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
|
262
|
+
print(chunk)
|
151
263
|
```
|
152
264
|
|
153
265
|
### AI Annotation
|
@@ -225,4 +337,4 @@ This project is licensed under the [MIT License](LICENSE).
|
|
225
337
|
|
226
338
|
---
|
227
339
|
|
228
|
-
⭐ If this project helps you, please give us a star!
|
340
|
+
⭐ If this project helps you, please give us a star!
|
@@ -0,0 +1,38 @@
|
|
1
|
+
datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
|
2
|
+
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
datamax/loader/core.py,sha256=NGnK2m59GRBauYxZST0kyX5f4zhvAOk4Z5bVoF0CjGo,5218
|
4
|
+
datamax/loader/minio_handler.py,sha256=e7ZUlwoStVe5iQfAVfNgEwRLxen4NbxwokpJZl6AR0U,6557
|
5
|
+
datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
|
6
|
+
datamax/parser/__init__.py,sha256=sIB1N4B_fVguSBN-Uw9tGxAO6s0oi4Tq4kRZ59YlUKo,52
|
7
|
+
datamax/parser/base.py,sha256=FamDV6csc3aXVbobMR1lPNtVpvYMO19koRJW9poj_gE,2590
|
8
|
+
datamax/parser/core.py,sha256=2Gkz08WrRPt9ga0DisXrV1Aa_Yu7jUMlECOXNaexpwQ,16831
|
9
|
+
datamax/parser/csv_parser.py,sha256=lHQs1MHK9WM4Vl0p9nsE3fFhewF0EoXZUhtk8ixznRw,1028
|
10
|
+
datamax/parser/doc_parser.py,sha256=g2vZEdwcA-5AM2y0IHBy0bx1cOflkvLBobQ6tljX1fo,27940
|
11
|
+
datamax/parser/docx_parser.py,sha256=fehvMdOexWRRm1HTOCkVa_zDWI8A_LbGJdEFwW-MGss,29613
|
12
|
+
datamax/parser/epub_parser.py,sha256=K4eCS4wIXJzDicvtVAfQT8yt1gFHeibZN5-EdQZfJe8,1621
|
13
|
+
datamax/parser/html_parser.py,sha256=5ACrVc03Q9pJqWI_b0EtRgOYy0eMYJq4podgHGD68Z8,1453
|
14
|
+
datamax/parser/image_parser.py,sha256=UH3duPvB7Xu6CFlEeAukX5uJ8VlqnMR89hcLsW2O-aU,1281
|
15
|
+
datamax/parser/json_parser.py,sha256=2Ns2Lm6sei9TnDaFFYvl-xhyhse12sMJBwjKNACw4po,1072
|
16
|
+
datamax/parser/md_parser.py,sha256=rHJqtRV78XgQuKtDdwn1LcgRSUEuhGBqN5uaHG6oPT4,2251
|
17
|
+
datamax/parser/pdf_parser.py,sha256=YOJFOEC8DxWLAE6yNd2x6qMvYJl3sWVcM1eA8R4uA30,4116
|
18
|
+
datamax/parser/ppt_parser.py,sha256=0OlsIrzZZJnYZqLmQkUz4J_Hiv-rQHLHJnHIsw345c8,4631
|
19
|
+
datamax/parser/pptx_parser.py,sha256=yWajVd8kpyTdFavR8XcbwxOj94uNBswDoMHfSOycn0o,1870
|
20
|
+
datamax/parser/txt_parser.py,sha256=NXs7aNpm1PUwiUSlN1RU23ittuuQSBaBNI3KeQjJFXs,1750
|
21
|
+
datamax/parser/xls_parser.py,sha256=iNMx8iPakjE984dkaFL-oUBYWpQwxbWoDnQdwfAeeGM,980
|
22
|
+
datamax/parser/xlsx_parser.py,sha256=hUOFqkqkI0XPcwwrp2cs4PFKbChpZtb8orGsZc9kxJ0,9089
|
23
|
+
datamax/utils/__init__.py,sha256=75D4WFE_FVG9MyT8qWtvtlgzuuRelTC7ObSqqfjDKIY,1476
|
24
|
+
datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
|
25
|
+
datamax/utils/data_cleaner.py,sha256=TrrxC1r0__wuOhrQSJZcJKoEIyB4eNKWZkA1IoBYhyQ,9937
|
26
|
+
datamax/utils/env_setup.py,sha256=p_7sqHwyXroeOI_yFZpUOK6wOGmPVmf-gBa6M3351O4,3539
|
27
|
+
datamax/utils/gotocr_pdf.py,sha256=A7sn77EQBDbAe-4edCBUlYXKuE2mY7JcsFGm8U3-xbE,8744
|
28
|
+
datamax/utils/mineru_operator.py,sha256=4i4FtDkDE61FWPyRoDjPujHYJq_kDUAkwlowmFWdEOA,2303
|
29
|
+
datamax/utils/paddleocr_pdf_operator.py,sha256=SW06dts1SxDnUvyf5zWYpAN_6t9PLtJXUSsYhSS317I,3645
|
30
|
+
datamax/utils/ppt_extract.py,sha256=jBVGYEsBGPjHqyq7drHTOM8MnFOwqKyHhbkKmEAryAk,6307
|
31
|
+
datamax/utils/qa_generator.py,sha256=pXxdFm_EnWgMuilfmLKgy2c6NDexQZN8nWxT-bYBt74,12548
|
32
|
+
datamax/utils/tokenizer.py,sha256=o78GPmeJ3vs3-SF0b2nMm35XtbrCKbrhDW0gI9gqGl4,880
|
33
|
+
datamax/utils/uno_handler.py,sha256=gDm42OQQQoCiOP0SB7xZ9TRF6A_XBHNavwG5ycj6kEQ,14807
|
34
|
+
pydatamax-0.1.15.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
|
35
|
+
pydatamax-0.1.15.dist-info/METADATA,sha256=ySaiq1-bWbeW8W5ECuRTSPXzeNxTxaaJEAdqGnWvw0M,9795
|
36
|
+
pydatamax-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
37
|
+
pydatamax-0.1.15.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
|
38
|
+
pydatamax-0.1.15.dist-info/RECORD,,
|