pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
  4. datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +525 -61
  10. datamax/parser/docx_parser.py +512 -62
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -208
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. pydatamax-0.1.15.dist-info/METADATA +340 -0
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.13.dist-info/METADATA +0 -280
  38. pydatamax-0.1.13.dist-info/RECORD +0 -39
  39. tests/__init__.py +0 -0
  40. tests/test_basic.py +0 -20
  41. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
@@ -1,22 +1,22 @@
1
- import tiktoken
2
-
3
-
4
- class DashScopeClient:
5
- _instance = None
6
- def __new__(cls, *args, **kwargs):
7
- if not cls._instance:
8
- cls._instance = super(DashScopeClient, cls).__new__(cls)
9
- return cls._instance
10
-
11
- def get_tokenizer(self, content):
12
- '''
13
- Note: tiktoken only supports the following models with different token calculations
14
- A BPE word divider developed by tiktoken openai
15
- o200k_base corresponds to models: gpt-4o, GPT-4O-MINI
16
- cl100k_base models: GPT-4-Turbo, gpt-4, gpt-3.5-turbo...
17
- p50k_base corresponds to models text-davinci-002 and text-davinci-003
18
- r50k_base corresponds to model gpt2
19
- '''
20
- encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
21
- num_tokens = len(encoding.encode(content))
1
+ import tiktoken
2
+
3
+
4
class DashScopeClient:
    """Process-wide singleton that counts tokens with tiktoken.

    Every ``DashScopeClient()`` call returns the same shared instance.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Build the shared instance on first use; later calls reuse it.
        # Compare against None explicitly so the check never depends on the
        # truthiness of the instance itself.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_tokenizer(self, content, encoding_name="cl100k_base"):
        """Return the number of tokens in ``content``.

        Despite its name, this method returns a token count, not a
        tokenizer object.

        tiktoken is OpenAI's BPE tokenizer; token counts differ between
        encodings, so pick the one matching the target model:

        * ``o200k_base``  - gpt-4o, gpt-4o-mini
        * ``cl100k_base`` - gpt-4-turbo, gpt-4, gpt-3.5-turbo
        * ``p50k_base``   - text-davinci-002, text-davinci-003
        * ``r50k_base``   - gpt2

        Args:
            content: Text to tokenize.
            encoding_name: Name of the tiktoken encoding to use. Defaults to
                ``"cl100k_base"``, the value that was previously hard-coded.

        Returns:
            The number of tokens ``content`` encodes to.
        """
        encoding = tiktoken.get_encoding(encoding_name=encoding_name)
        return len(encoding.encode(content))
@@ -0,0 +1,426 @@
1
+ from loguru import logger
2
+ import os
3
+ import subprocess
4
+ import threading
5
+ import time
6
+ from contextlib import contextmanager
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
# Lazy-import state: whether the UNO modules have been imported, the most
# recent import failure (if any), and the lock serialising import attempts.
_uno_imported = False
_import_error = None
_import_lock = threading.Lock()


def _lazy_import_uno():
    """Import the UNO modules on first use instead of at module import time.

    Deferring the import avoids conflicts with other libraries. On success
    ``uno``, ``PropertyValue`` and ``NoConnectException`` are bound as module
    globals. Import attempts are serialised by ``_import_lock``.

    Returns:
        True if the UNO modules are imported, False if importing them raised
        ``ImportError`` (the error is kept in ``_import_error``).
    """
    global _uno_imported, _import_error
    global uno, PropertyValue, NoConnectException

    # Fast path: once the import has succeeded there is no need for the lock.
    if _uno_imported:
        return True

    with _import_lock:
        # Re-check under the lock; another thread may have finished meanwhile.
        if _uno_imported:
            return True

        try:
            # All UNO-related imports live here.
            import uno
            from com.sun.star.beans import PropertyValue
            from com.sun.star.connection import NoConnectException
        except ImportError as e:
            _import_error = e
            logger.error(f"❌ UNO模块导入失败: {str(e)}")
            return False

        # Drop any error left over from an earlier failed attempt so the
        # recorded state matches the successful import.
        _import_error = None
        _uno_imported = True
        logger.info("✅ UNO模块导入成功")
        return True
43
+
44
+
45
def ensure_uno_imported():
    """Make sure the UNO modules are imported, for callers that need them up front.

    Raises:
        ImportError: If the UNO modules cannot be imported. The message
            includes installation hints and the exception is chained to the
            original import failure.
    """
    if not _lazy_import_uno():
        raise ImportError(
            f"python-uno未安装或无法导入。错误: {_import_error}\n"
            "请安装LibreOffice并确保python-uno可用。\n"
            "Ubuntu/Debian: apt-get install libreoffice python3-uno\n"
            "其他系统请参考: https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
        ) from _import_error
54
+
55
+
56
# Probe for UNO availability without importing it.
def check_uno_available():
    """Report whether the ``uno`` module can be located, without importing it.

    Returns:
        True if ``importlib`` finds a spec for ``uno``, False if it finds none
        or the lookup itself fails.
    """
    try:
        import importlib.util

        return importlib.util.find_spec("uno") is not None
    except Exception:
        # Best effort: this runs at module import time (see HAS_UNO below),
        # so a failed lookup must read as "not available" rather than raise.
        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit still propagate.
        return False


HAS_UNO = check_uno_available()
68
+
69
+
70
class UnoManager:
    """Manage a LibreOffice service instance and convert documents through UNO.

    Single-threaded variant: one connection to one ``soffice`` listener is
    created on demand and reused for subsequent conversions.
    """

    # Fallback chain tried in order when exporting to "txt" without an
    # explicit filter: (filter name, filter options or None).
    _TEXT_FILTERS = (
        ("Text (encoded)", "UTF8"),
        ("Text", None),
        ("HTML (StarWriter)", None),
    )

    # Default export filter per output format for everything except "txt".
    _DEFAULT_FILTERS = {
        "pdf": "writer_pdf_Export",
        "docx": "MS Word 2007 XML",
        "pptx": "Impress MS PowerPoint 2007 XML",
        "xlsx": "Calc MS Excel 2007 XML",
    }

    def __init__(self, host: str = "localhost", port: int = 2002, timeout: int = 30):
        """Initialise the manager; no process is started and no connection is made yet.

        Args:
            host: Host of the LibreOffice service.
            port: Port of the LibreOffice service.
            timeout: Connection timeout in seconds.

        Raises:
            ImportError: If the UNO modules cannot be imported.
        """
        # The UNO modules must be importable before anything else is set up.
        ensure_uno_imported()

        self.host = host
        self.port = port
        self.timeout = timeout
        self.connection_string = (
            f"socket,host={host},port={port};urp;StarOffice.ComponentContext"
        )
        self._lock = threading.Lock()
        self._desktop = None
        self._ctx = None
        self._soffice_process = None
        self._connected = False
        logger.info(f"🚀 UnoManager初始化 - 主机: {host}, 端口: {port} (单线程模式)")

    def _start_soffice_service(self):
        """Start a headless ``soffice`` listener unless the port already accepts connections.

        Raises:
            RuntimeError: If the port does not start accepting connections
                before the startup deadline.
            OSError: If the ``soffice`` executable cannot be launched.
        """
        logger.info(f"🌟 启动LibreOffice服务,监听端口 {self.port}...")

        # Reuse a service that is already listening.
        if self._check_soffice_running():
            logger.info("✅ LibreOffice服务已在运行")
            return

        cmd = [
            "soffice",
            "--headless",
            "--invisible",
            "--nocrashreport",
            "--nodefault",
            "--nofirststartwizard",
            "--nologo",
            "--norestore",
            f"--accept={self.connection_string}",
        ]

        try:
            self._soffice_process = subprocess.Popen(
                cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )
            logger.info("⏳ 等待LibreOffice服务启动...")

            # Poll the port instead of sleeping a fixed 5 seconds: return as
            # soon as the listener is up, and tolerate slow starts. The
            # deadline is never shorter than the 5-second grace period the
            # fixed sleep used to give.
            deadline = time.monotonic() + max(self.timeout, 5)
            while not self._check_soffice_running():
                if time.monotonic() >= deadline:
                    raise RuntimeError("LibreOffice服务启动失败")
                time.sleep(0.5)

            logger.info("✅ LibreOffice服务启动成功")

        except Exception as e:
            logger.error(f"❌ 启动LibreOffice服务失败: {str(e)}")
            # Do not leave a half-started soffice process behind.
            self._terminate_process()
            raise

    def _terminate_process(self):
        """Terminate the ``soffice`` process spawned by this manager, if any."""
        process, self._soffice_process = self._soffice_process, None
        if process is None:
            return
        try:
            process.terminate()
            process.wait(timeout=10)
        except Exception:
            # Graceful shutdown failed or timed out; force-kill as a last resort.
            process.kill()

    def _check_soffice_running(self) -> bool:
        """Return True if a TCP connection to ``(host, port)`` succeeds."""
        try:
            import socket

            # The context manager closes the socket even if connect_ex raises.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                return sock.connect_ex((self.host, self.port)) == 0
        except Exception:
            # Any failure to probe is treated as "not running".
            return False

    def is_connected(self) -> bool:
        """Return True if a connection is established and a desktop object is held."""
        with self._lock:
            return self._connected and self._desktop is not None

    def connect(self):
        """Connect to the LibreOffice service, starting it first if needed.

        Returns immediately when already connected. Otherwise retries once
        per second until ``timeout`` seconds have elapsed.

        Raises:
            TimeoutError: If no connection is established within ``timeout``.
        """
        with self._lock:
            if self._connected and self._desktop is not None:
                return  # already connected

            self._start_soffice_service()

            logger.info("🔌 连接到LibreOffice服务...")
            start_time = time.monotonic()

            while time.monotonic() - start_time < self.timeout:
                try:
                    # Local component context, used only to obtain a resolver.
                    local_ctx = uno.getComponentContext()
                    resolver = local_ctx.ServiceManager.createInstanceWithContext(
                        "com.sun.star.bridge.UnoUrlResolver", local_ctx
                    )

                    # Resolve the remote context and obtain its Desktop service.
                    self._ctx = resolver.resolve(f"uno:{self.connection_string}")
                    self._desktop = self._ctx.ServiceManager.createInstanceWithContext(
                        "com.sun.star.frame.Desktop", self._ctx
                    )

                    self._connected = True
                    logger.info("✅ 成功连接到LibreOffice服务")
                    return

                except NoConnectException:
                    logger.debug("⏳ 等待LibreOffice服务就绪...")
                    time.sleep(1)
                except Exception as e:
                    logger.error(f"❌ 连接失败: {str(e)}")
                    time.sleep(1)

            raise TimeoutError(f"连接LibreOffice服务超时({self.timeout}秒)")

    def disconnect(self):
        """Drop the connection state held by this manager.

        NOTE(review): this calls ``terminate()`` on the remote Desktop, which
        per the UNO API asks the office instance itself to shut down, not
        just this client to detach. Confirm that is intended when the
        service was not started by this manager.
        """
        with self._lock:
            if self._desktop is not None:
                try:
                    self._desktop.terminate()
                except Exception as exc:
                    # Best effort: a failure here must not block the reset below.
                    logger.debug(f"🔌 终止LibreOffice桌面时出现异常(已忽略): {exc}")
            self._desktop = None
            self._ctx = None
            self._connected = False
            logger.info("🔌 已断开LibreOffice服务连接")

    def stop_service(self):
        """Disconnect and stop the ``soffice`` process started by this manager, if any."""
        self.disconnect()
        if self._soffice_process:
            self._terminate_process()
            logger.info("🛑 LibreOffice服务已停止")

    @contextmanager
    def get_document(self, file_path: str):
        """Context manager that opens a document hidden and read-only.

        Args:
            file_path: Path of the document to open.

        Yields:
            The loaded document object, i.e. whatever ``loadComponentFromURL``
            returned (callers in this class check it for ``None``).
        """
        self.connect()

        # UNO addresses files by URL, not by system path.
        file_url = uno.systemPathToFileUrl(os.path.abspath(file_path))

        # PyUNO maps UNO sequences to Python tuples, so pass a tuple rather
        # than a list.
        properties = (
            self._make_property("Hidden", True),
            self._make_property("ReadOnly", True),
        )

        document = None
        try:
            document = self._desktop.loadComponentFromURL(
                file_url, "_blank", 0, properties
            )
            logger.debug(f"📄 打开文档: {file_path}")
            yield document
        finally:
            # Identity check: do not rely on the truthiness of a UNO proxy.
            if document is not None:
                try:
                    document.dispose()
                    logger.debug(f"📄 关闭文档: {file_path}")
                except Exception as exc:
                    # Best effort: closing must not mask an error from the body.
                    logger.debug(f"📄 关闭文档时出现异常(已忽略): {exc}")

    def convert_document(
        self,
        input_path: str,
        output_path: str,
        output_format: str,
        filter_name: Optional[str] = None,
    ):
        """Convert a document to another format.

        Args:
            input_path: Path of the input file.
            output_path: Path of the output file; its directory is created if missing.
            output_format: Output format (e.g. 'txt', 'pdf', 'docx').
            filter_name: Export filter name (optional). When omitted, 'txt'
                tries a chain of text filters in order and other formats use
                a per-format default; formats without a default are stored
                with no filter specified.

        Raises:
            RuntimeError: If the document cannot be opened, or if every text
                filter fails for a 'txt' export.
        """
        logger.info(f"🔄 开始转换文档: {input_path} -> {output_path} ({output_format})")

        with self.get_document(input_path) as document:
            if document is None:
                raise RuntimeError(f"无法打开文档: {input_path}")

            # Make sure the output directory exists; exist_ok avoids the
            # check-then-create race.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # UNO addresses files by URL, not by system path.
            output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))

            if not filter_name and output_format == "txt":
                # Plain text: try each candidate filter until one succeeds.
                last_error = None
                for candidate, options in self._TEXT_FILTERS:
                    properties = [self._make_property("FilterName", candidate)]
                    if options:
                        properties.append(
                            self._make_property("FilterOptions", options)
                        )
                    try:
                        document.storeToURL(output_url, tuple(properties))
                    except Exception as e:
                        logger.debug(f"🔄 过滤器 {candidate} 失败: {str(e)}")
                        last_error = e
                        continue
                    logger.info(
                        f"✅ 文档转换成功 (使用过滤器: {candidate}): {output_path}"
                    )
                    return

                raise RuntimeError(
                    f"所有文本过滤器都失败,无法转换文档: {input_path}"
                ) from last_error

            properties = []
            if filter_name:
                properties.append(self._make_property("FilterName", filter_name))
            elif output_format in self._DEFAULT_FILTERS:
                properties.append(
                    self._make_property(
                        "FilterName", self._DEFAULT_FILTERS[output_format]
                    )
                )

            document.storeToURL(output_url, tuple(properties))
            logger.info(f"✅ 文档转换成功: {output_path}")

    def _make_property(self, name: str, value):
        """Build a UNO ``PropertyValue`` with the given name and value."""
        prop = PropertyValue()
        prop.Name = name
        prop.Value = value
        return prop
353
+
354
+
355
# Process-wide UnoManager singleton and the lock guarding its creation/teardown.
_global_uno_manager: Optional[UnoManager] = None
_manager_lock = threading.Lock()


def get_uno_manager() -> UnoManager:
    """Return the process-wide UnoManager, creating it on first use."""
    global _global_uno_manager

    # Nothing to create once the singleton exists.
    if _global_uno_manager is not None:
        return _global_uno_manager

    with _manager_lock:
        # Re-check under the lock before creating the instance.
        if _global_uno_manager is None:
            _global_uno_manager = UnoManager()
            logger.info("🎯 创建全局单例UnoManager (单线程模式)")

    return _global_uno_manager
371
+
372
+
373
def cleanup_uno_manager():
    """Stop and discard the process-wide UnoManager, if one exists.

    Errors raised while stopping the service are logged and otherwise
    ignored; the global reference is cleared either way.
    """
    global _global_uno_manager

    with _manager_lock:
        if _global_uno_manager is None:
            return

        try:
            _global_uno_manager.stop_service()
        except Exception as exc:
            # Best-effort cleanup: report the failure instead of hiding it.
            logger.warning(f"⚠️ 停止LibreOffice服务时出错(已忽略): {exc}")
        finally:
            _global_uno_manager = None

        logger.info("🧹 清理全局UnoManager")
385
+
386
+
387
@contextmanager
def uno_manager_context():
    """Context manager yielding the process-wide UnoManager.

    There is no teardown on exit: in single-threaded mode the manager is
    left as-is so the connection can be reused by later calls.
    """
    yield get_uno_manager()
396
+
397
+
398
def convert_with_uno(
    input_path: str,
    output_format: str,
    output_dir: Optional[str] = None
) -> str:
    """Convert a document with UNO (convenience wrapper).

    The output file is named ``<input stem>.<output_format>``.

    Args:
        input_path: Path of the input file.
        output_format: Output format, also used as the output file extension.
        output_dir: Output directory (optional; defaults to the input file's
            directory).

    Returns:
        Path of the output file.
    """
    source = Path(input_path)
    target_dir = source.parent if output_dir is None else Path(output_dir)
    target = target_dir / f"{source.stem}.{output_format}"

    with uno_manager_context() as manager:
        manager.convert_document(str(source), str(target), output_format)

    return str(target)