pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +91 -68
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/utils/uno_handler.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
from loguru import logger
|
2
1
|
import os
|
3
2
|
import subprocess
|
4
3
|
import threading
|
@@ -7,7 +6,9 @@ from contextlib import contextmanager
|
|
7
6
|
from pathlib import Path
|
8
7
|
from typing import Optional
|
9
8
|
|
10
|
-
|
9
|
+
from loguru import logger
|
10
|
+
|
11
|
+
# lock and state flags for the delayed (lazy) UNO import
|
11
12
|
_uno_imported = False
|
12
13
|
_import_error = None
|
13
14
|
_import_lock = threading.Lock()
|
@@ -16,23 +17,23 @@ _import_lock = threading.Lock()
|
|
16
17
|
def _lazy_import_uno():
|
17
18
|
"""延迟导入 UNO 模块,避免与其他库冲突(线程安全)"""
|
18
19
|
global _uno_imported, _import_error
|
19
|
-
|
20
|
-
#
|
20
|
+
|
21
|
+
# quick check, avoiding unnecessary acquisition of the lock
|
21
22
|
if _uno_imported:
|
22
23
|
return True
|
23
|
-
|
24
|
+
|
24
25
|
with _import_lock:
|
25
|
-
#
|
26
|
+
# check again after acquiring the lock
|
26
27
|
if _uno_imported:
|
27
28
|
return True
|
28
|
-
|
29
|
+
|
29
30
|
try:
|
30
|
-
#
|
31
|
+
# import the UNO-related modules
|
31
32
|
global uno, PropertyValue, NoConnectException
|
32
33
|
import uno
|
33
34
|
from com.sun.star.beans import PropertyValue
|
34
35
|
from com.sun.star.connection import NoConnectException
|
35
|
-
|
36
|
+
|
36
37
|
_uno_imported = True
|
37
38
|
logger.info("✅ UNO模块导入成功")
|
38
39
|
return True
|
@@ -53,11 +54,12 @@ def ensure_uno_imported():
|
|
53
54
|
)
|
54
55
|
|
55
56
|
|
56
|
-
#
|
57
|
+
# check whether uno is available (without importing it)
|
57
58
|
def check_uno_available():
|
58
59
|
"""检查 UNO 是否可用(不会真正导入)"""
|
59
60
|
try:
|
60
61
|
import importlib.util
|
62
|
+
|
61
63
|
spec = importlib.util.find_spec("uno")
|
62
64
|
return spec is not None
|
63
65
|
except:
|
@@ -72,7 +74,7 @@ class UnoManager:
|
|
72
74
|
UNO管理器,用于管理LibreOffice服务实例和文档转换
|
73
75
|
单线程版本,适合稳定高效的文档处理
|
74
76
|
"""
|
75
|
-
|
77
|
+
|
76
78
|
def __init__(self, host: str = "localhost", port: int = 2002, timeout: int = 30):
|
77
79
|
"""
|
78
80
|
初始化UNO管理器
|
@@ -82,9 +84,9 @@ class UnoManager:
|
|
82
84
|
port: LibreOffice服务端口
|
83
85
|
timeout: 连接超时时间(秒)
|
84
86
|
"""
|
85
|
-
#
|
87
|
+
# Ensure that UNO has been imported (in a thread-safe manner)
|
86
88
|
ensure_uno_imported()
|
87
|
-
|
89
|
+
|
88
90
|
self.host = host
|
89
91
|
self.port = port
|
90
92
|
self.timeout = timeout
|
@@ -102,12 +104,12 @@ class UnoManager:
|
|
102
104
|
"""启动LibreOffice服务"""
|
103
105
|
logger.info(f"🌟 启动LibreOffice服务,监听端口 {self.port}...")
|
104
106
|
|
105
|
-
#
|
107
|
+
# check whether soffice is already running
|
106
108
|
if self._check_soffice_running():
|
107
109
|
logger.info("✅ LibreOffice服务已在运行")
|
108
110
|
return
|
109
111
|
|
110
|
-
#
|
112
|
+
# start a new soffice process
|
111
113
|
cmd = [
|
112
114
|
"soffice",
|
113
115
|
"--headless",
|
@@ -125,12 +127,25 @@ class UnoManager:
|
|
125
127
|
cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
|
126
128
|
)
|
127
129
|
logger.info(f"⏳ 等待LibreOffice服务启动...")
|
128
|
-
time.sleep(5) # 给服务一些启动时间
|
129
130
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
131
|
+
# Intelligent waiting: Polling to check service status, providing flexible time for machines of different performance levels.
|
132
|
+
start_time = time.time()
|
133
|
+
check_interval = 1 # checking every sec
|
134
|
+
max_wait_time = 30 # wait for max 30 sec
|
135
|
+
|
136
|
+
while time.time() - start_time < max_wait_time:
|
137
|
+
if self._check_soffice_running():
|
138
|
+
elapsed = time.time() - start_time
|
139
|
+
logger.info(f"✅ LibreOffice服务启动成功 (耗时 {elapsed:.1f}秒)")
|
140
|
+
return
|
141
|
+
|
142
|
+
logger.debug(
|
143
|
+
f"🔄 服务未就绪,继续等待... (已等待 {time.time() - start_time:.1f}秒)"
|
144
|
+
)
|
145
|
+
time.sleep(check_interval)
|
146
|
+
|
147
|
+
# timed out
|
148
|
+
raise Exception(f"LibreOffice服务启动超时 (等待了{max_wait_time}秒)")
|
134
149
|
|
135
150
|
except Exception as e:
|
136
151
|
logger.error(f"❌ 启动LibreOffice服务失败: {str(e)}")
|
@@ -157,38 +172,38 @@ class UnoManager:
|
|
157
172
|
"""连接到LibreOffice服务"""
|
158
173
|
with self._lock:
|
159
174
|
if self._connected and self._desktop is not None:
|
160
|
-
return #
|
161
|
-
|
175
|
+
return # connected
|
176
|
+
|
162
177
|
self._start_soffice_service()
|
163
|
-
|
178
|
+
|
164
179
|
logger.info(f"🔌 连接到LibreOffice服务...")
|
165
180
|
start_time = time.time()
|
166
|
-
|
181
|
+
|
167
182
|
while time.time() - start_time < self.timeout:
|
168
183
|
try:
|
169
|
-
#
|
184
|
+
# get context
|
170
185
|
local_ctx = uno.getComponentContext()
|
171
186
|
resolver = local_ctx.ServiceManager.createInstanceWithContext(
|
172
187
|
"com.sun.star.bridge.UnoUrlResolver", local_ctx
|
173
188
|
)
|
174
|
-
|
175
|
-
#
|
189
|
+
|
190
|
+
# connect to LibreOffice
|
176
191
|
self._ctx = resolver.resolve(f"uno:{self.connection_string}")
|
177
192
|
self._desktop = self._ctx.ServiceManager.createInstanceWithContext(
|
178
193
|
"com.sun.star.frame.Desktop", self._ctx
|
179
194
|
)
|
180
|
-
|
195
|
+
|
181
196
|
self._connected = True
|
182
197
|
logger.info("✅ 成功连接到LibreOffice服务")
|
183
198
|
return
|
184
|
-
|
199
|
+
|
185
200
|
except NoConnectException:
|
186
201
|
logger.debug("⏳ 等待LibreOffice服务就绪...")
|
187
202
|
time.sleep(1)
|
188
203
|
except Exception as e:
|
189
204
|
logger.error(f"❌ 连接失败: {str(e)}")
|
190
205
|
time.sleep(1)
|
191
|
-
|
206
|
+
|
192
207
|
raise TimeoutError(f"连接LibreOffice服务超时({self.timeout}秒)")
|
193
208
|
|
194
209
|
def disconnect(self):
|
@@ -229,10 +244,10 @@ class UnoManager:
|
|
229
244
|
"""
|
230
245
|
self.connect()
|
231
246
|
|
232
|
-
#
|
247
|
+
# convert the path to a file URL
|
233
248
|
file_url = uno.systemPathToFileUrl(os.path.abspath(file_path))
|
234
249
|
|
235
|
-
#
|
250
|
+
# open file
|
236
251
|
properties = []
|
237
252
|
properties.append(self._make_property("Hidden", True))
|
238
253
|
properties.append(self._make_property("ReadOnly", True))
|
@@ -274,53 +289,63 @@ class UnoManager:
|
|
274
289
|
if document is None:
|
275
290
|
raise Exception(f"无法打开文档: {input_path}")
|
276
291
|
|
277
|
-
#
|
292
|
+
# prepare the output properties
|
278
293
|
properties = []
|
279
294
|
|
280
|
-
#
|
295
|
+
# set filter
|
281
296
|
if filter_name:
|
282
297
|
properties.append(self._make_property("FilterName", filter_name))
|
283
298
|
else:
|
284
|
-
#
|
299
|
+
# choose filter by format
|
285
300
|
if output_format == "txt":
|
286
|
-
#
|
301
|
+
# try several filters in turn until one succeeds
|
287
302
|
filter_options = [
|
288
303
|
("Text (encoded)", "UTF8"),
|
289
304
|
("Text", None),
|
290
|
-
("HTML (StarWriter)", None)
|
305
|
+
("HTML (StarWriter)", None),
|
291
306
|
]
|
292
|
-
|
307
|
+
|
293
308
|
success = False
|
294
309
|
for filter_name, filter_option in filter_options:
|
295
310
|
try:
|
296
311
|
properties = []
|
297
|
-
properties.append(
|
312
|
+
properties.append(
|
313
|
+
self._make_property("FilterName", filter_name)
|
314
|
+
)
|
298
315
|
if filter_option:
|
299
|
-
properties.append(
|
300
|
-
|
301
|
-
|
316
|
+
properties.append(
|
317
|
+
self._make_property("FilterOptions", filter_option)
|
318
|
+
)
|
319
|
+
|
320
|
+
# ensuring that the output directory exists.
|
302
321
|
output_dir = os.path.dirname(output_path)
|
303
322
|
if output_dir and not os.path.exists(output_dir):
|
304
323
|
os.makedirs(output_dir)
|
305
324
|
|
306
|
-
#
|
307
|
-
output_url = uno.systemPathToFileUrl(
|
325
|
+
# convert to a file URL
|
326
|
+
output_url = uno.systemPathToFileUrl(
|
327
|
+
os.path.abspath(output_path)
|
328
|
+
)
|
308
329
|
|
309
|
-
#
|
330
|
+
# convert the document
|
310
331
|
document.storeToURL(output_url, properties)
|
311
|
-
logger.info(
|
332
|
+
logger.info(
|
333
|
+
f"✅ 文档转换成功 (使用过滤器: {filter_name}): {output_path}"
|
334
|
+
)
|
312
335
|
success = True
|
313
336
|
break
|
314
337
|
except Exception as e:
|
315
338
|
logger.debug(f"🔄 过滤器 {filter_name} 失败: {str(e)}")
|
316
339
|
continue
|
317
|
-
|
340
|
+
|
318
341
|
if not success:
|
319
|
-
raise Exception(
|
320
|
-
|
321
|
-
|
342
|
+
raise Exception(
|
343
|
+
f"所有文本过滤器都失败,无法转换文档: {input_path}"
|
344
|
+
)
|
345
|
+
|
346
|
+
return # converted,return
|
322
347
|
else:
|
323
|
-
#
|
348
|
+
# Other formats use the default filter
|
324
349
|
filter_map = {
|
325
350
|
"pdf": "writer_pdf_Export",
|
326
351
|
"docx": "MS Word 2007 XML",
|
@@ -332,15 +357,15 @@ class UnoManager:
|
|
332
357
|
self._make_property("FilterName", filter_map[output_format])
|
333
358
|
)
|
334
359
|
|
335
|
-
#
|
360
|
+
# ensuring that the output directory exists
|
336
361
|
output_dir = os.path.dirname(output_path)
|
337
362
|
if output_dir and not os.path.exists(output_dir):
|
338
363
|
os.makedirs(output_dir)
|
339
364
|
|
340
|
-
#
|
365
|
+
# convert to a file URL
|
341
366
|
output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
|
342
367
|
|
343
|
-
#
|
368
|
+
# convert the document
|
344
369
|
document.storeToURL(output_url, properties)
|
345
370
|
logger.info(f"✅ 文档转换成功: {output_path}")
|
346
371
|
|
@@ -352,7 +377,7 @@ class UnoManager:
|
|
352
377
|
return prop
|
353
378
|
|
354
379
|
|
355
|
-
#
|
380
|
+
# global singleton UnoManager
|
356
381
|
_global_uno_manager: Optional[UnoManager] = None
|
357
382
|
_manager_lock = threading.Lock()
|
358
383
|
|
@@ -360,20 +385,20 @@ _manager_lock = threading.Lock()
|
|
360
385
|
def get_uno_manager() -> UnoManager:
|
361
386
|
"""获取全局单例UNO管理器"""
|
362
387
|
global _global_uno_manager
|
363
|
-
|
388
|
+
|
364
389
|
if _global_uno_manager is None:
|
365
390
|
with _manager_lock:
|
366
391
|
if _global_uno_manager is None:
|
367
392
|
_global_uno_manager = UnoManager()
|
368
393
|
logger.info("🎯 创建全局单例UnoManager (单线程模式)")
|
369
|
-
|
394
|
+
|
370
395
|
return _global_uno_manager
|
371
396
|
|
372
397
|
|
373
398
|
def cleanup_uno_manager():
|
374
399
|
"""清理全局UNO管理器"""
|
375
400
|
global _global_uno_manager
|
376
|
-
|
401
|
+
|
377
402
|
with _manager_lock:
|
378
403
|
if _global_uno_manager is not None:
|
379
404
|
try:
|
@@ -391,36 +416,34 @@ def uno_manager_context():
|
|
391
416
|
try:
|
392
417
|
yield manager
|
393
418
|
finally:
|
394
|
-
#
|
419
|
+
# Maintain connections to improve efficiency in single-threaded mode
|
395
420
|
pass
|
396
421
|
|
397
422
|
|
398
423
|
def convert_with_uno(
|
399
|
-
input_path: str,
|
400
|
-
output_format: str,
|
401
|
-
output_dir: Optional[str] = None
|
424
|
+
input_path: str, output_format: str, output_dir: Optional[str] = None
|
402
425
|
) -> str:
|
403
426
|
"""
|
404
427
|
使用UNO转换文档格式(便捷函数)
|
405
|
-
|
428
|
+
|
406
429
|
Args:
|
407
430
|
input_path: 输入文件路径
|
408
431
|
output_format: 输出格式
|
409
432
|
output_dir: 输出目录(可选,默认为输入文件所在目录)
|
410
|
-
|
433
|
+
|
411
434
|
Returns:
|
412
435
|
输出文件路径
|
413
436
|
"""
|
414
437
|
input_path = Path(input_path)
|
415
|
-
|
438
|
+
|
416
439
|
if output_dir is None:
|
417
440
|
output_dir = input_path.parent
|
418
441
|
else:
|
419
442
|
output_dir = Path(output_dir)
|
420
|
-
|
443
|
+
|
421
444
|
output_path = output_dir / f"{input_path.stem}.{output_format}"
|
422
|
-
|
445
|
+
|
423
446
|
with uno_manager_context() as manager:
|
424
447
|
manager.convert_document(str(input_path), str(output_path), output_format)
|
425
|
-
|
448
|
+
|
426
449
|
return str(output_path)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pydatamax
|
3
|
-
Version: 0.1.16
|
3
|
+
Version: 0.1.16.post2
|
4
4
|
Summary: A library for parsing and converting various file formats.
|
5
5
|
Home-page: https://github.com/Hi-Dolphin/datamax
|
6
6
|
Author: ccy
|
@@ -113,7 +113,7 @@ qa_data = dm.get_pre_label(
|
|
113
113
|
question_number=5, # 每块生成问题数
|
114
114
|
max_workers=5 # 并发数
|
115
115
|
)
|
116
|
-
dm.save_label_data(
|
116
|
+
dm.save_label_data(qa_data)
|
117
117
|
```
|
118
118
|
|
119
119
|
## 📖 Detailed Documentation
|
@@ -316,6 +316,58 @@ pip install -r requirements.txt
|
|
316
316
|
python setup.py install
|
317
317
|
```
|
318
318
|
|
319
|
+
### Developer Mode
|
320
|
+
|
321
|
+
For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
|
322
|
+
|
323
|
+
#### Setup Developer Mode
|
324
|
+
|
325
|
+
```bash
|
326
|
+
# Clone the repository
|
327
|
+
git clone https://github.com/Hi-Dolphin/datamax.git
|
328
|
+
cd datamax
|
329
|
+
|
330
|
+
# Create virtual environment (recommended)
|
331
|
+
python -m venv venv
|
332
|
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
333
|
+
|
334
|
+
# Install in developer mode
|
335
|
+
pip install -e .
|
336
|
+
```
|
337
|
+
|
338
|
+
#### Benefits of Developer Mode
|
339
|
+
|
340
|
+
- **Live Updates**: Changes to source code are immediately reflected without reinstallation
|
341
|
+
- **Easy Testing**: Test your modifications instantly
|
342
|
+
- **Debugging**: Better debugging experience with direct access to source code
|
343
|
+
- **Development Workflow**: Seamless integration with your development environment
|
344
|
+
|
345
|
+
#### Development Commands
|
346
|
+
|
347
|
+
```bash
|
348
|
+
# Run tests
|
349
|
+
pytest
|
350
|
+
|
351
|
+
# Install development dependencies
|
352
|
+
pip install -r requirements-dev.txt # if available
|
353
|
+
|
354
|
+
# Check code style
|
355
|
+
flake8 datamax/
|
356
|
+
black datamax/
|
357
|
+
|
358
|
+
# Build package
|
359
|
+
python setup.py sdist bdist_wheel
|
360
|
+
```
|
361
|
+
|
362
|
+
#### Making Changes
|
363
|
+
|
364
|
+
After installing in developer mode, you can:
|
365
|
+
|
366
|
+
1. Edit source code in the `datamax/` directory
|
367
|
+
2. Changes are automatically available when you import the module
|
368
|
+
3. Test your changes immediately without reinstalling
|
369
|
+
4. Submit pull requests with your improvements
|
370
|
+
|
319
371
|
## 📋 System Requirements
|
320
372
|
|
321
373
|
- Python >= 3.10
|
@@ -0,0 +1,39 @@
|
|
1
|
+
datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
|
2
|
+
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
datamax/loader/core.py,sha256=Ld4PmMcbKbsFyU_ynqXxpB9x3IJ34c3hfJBcUiSthrA,5370
|
4
|
+
datamax/loader/minio_handler.py,sha256=VpQ5EHZfLaw0e2JXflAbgPK_plmM_VkPXEiKtIZlQL0,6876
|
5
|
+
datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
|
6
|
+
datamax/parser/__init__.py,sha256=3tCt1bmTjJ0sroivt60AoQuZyHH8AtCvn664Qtoh-60,56
|
7
|
+
datamax/parser/base.py,sha256=yEPdk3K-vTf2JnIcTczxEDoMQtVKva9tp2nSACeXOB0,3153
|
8
|
+
datamax/parser/core.py,sha256=eglNe4Vk6U3XEUYT2oLovWfuL1XeFDc0KnRahYN24Mk,19208
|
9
|
+
datamax/parser/csv_parser.py,sha256=PPRqL4MKDymBgFYo0xrgrO8HB3jFcrXTizG27fXVEag,1698
|
10
|
+
datamax/parser/doc_parser.py,sha256=x6aJMQmNCUR2WA0hMvsro1atz6kWK3pRLPzlLfwCUw0,32273
|
11
|
+
datamax/parser/docx_parser.py,sha256=40xq86jLI3nayg0dthBnWSMN2qYQeyUNWh5wWJ8Lar8,37658
|
12
|
+
datamax/parser/epub_parser.py,sha256=zjLp1ha_oQBGIxvgzAyHzO0ZZJQt1JfoAZ9TM8liZ0o,2708
|
13
|
+
datamax/parser/html_parser.py,sha256=b5Rvonj0cScYI2gYfKAfBplp6C8kT2ataf3J0lO40Ok,2017
|
14
|
+
datamax/parser/image_parser.py,sha256=HYNN0oqA1LjI8XBQN09nnWfnVrLcDHcXHOytf8z6NAk,2536
|
15
|
+
datamax/parser/json_parser.py,sha256=wzUKv1lH35PtM8uXunxNZ6ykHEdI_m02SIW1Y7y-wxc,1826
|
16
|
+
datamax/parser/md_parser.py,sha256=62vBHAsHotCC1bNquh7jt8EZuoPyANrbkRTF7YlEOMs,3019
|
17
|
+
datamax/parser/pdf_parser.py,sha256=pm6WVNe2nP5K-XHwNfa3BoS_oNDDdJ5kOo5QERICfkI,5245
|
18
|
+
datamax/parser/ppt_parser.py,sha256=7KmSrxyPRYlYqclMcsxQmLTD3eR1mbED4qgm1mjF4Mg,5808
|
19
|
+
datamax/parser/pptx_parser.py,sha256=huMbv9JMGI7Nvs3v-TGn7MOpicb8-z3L5G24PxBVDzw,2546
|
20
|
+
datamax/parser/txt_parser.py,sha256=GyPFuYQ00mI1shmMFi_gtDJ8B-C2rMI9rI0sjay32Hs,2630
|
21
|
+
datamax/parser/xls_parser.py,sha256=J-Eumrh3oxwr06YHHfUplJlskwBlsEoj3sQ9OVXBFCo,1819
|
22
|
+
datamax/parser/xlsx_parser.py,sha256=Uj1OisEVAzO8mMRcTHpitBstS0M7aSS4UehnE78pvxU,9468
|
23
|
+
datamax/utils/__init__.py,sha256=elPbB7MSk5VfcmKmhaXCTUsVXP9vxd8C-DAMG3JqbDU,1491
|
24
|
+
datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
|
25
|
+
datamax/utils/data_cleaner.py,sha256=2sfjXkDaEXavr98Ezj1BWG4uJQPUzeR99172tH43-Yk,10454
|
26
|
+
datamax/utils/env_setup.py,sha256=lXPAL6WGkjOBgqTa2A0li5YS2TQ96cvAR4OhJjQP4pA,3638
|
27
|
+
datamax/utils/gotocr_pdf.py,sha256=LHQ4nIFNC47b7hLWzMGkk_UsLmIxMLnUhRa48iwJo48,8796
|
28
|
+
datamax/utils/lifecycle_types.py,sha256=rvHB4zwzS_nlWKUtWA37L9dJNvx6ol5F2-x2eEf6zJk,625
|
29
|
+
datamax/utils/mineru_operator.py,sha256=mBw9xuCwJZmmOLaUFhw2c3JPDB7KMjWqSlEzbKCRXc8,2276
|
30
|
+
datamax/utils/paddleocr_pdf_operator.py,sha256=5l7P7wCGd4-Qph3NMTDdHR6nStjafDMNpX4sSCFv5qQ,3637
|
31
|
+
datamax/utils/ppt_extract.py,sha256=Sf4H3TKdK6BnKRv0sw5JnfKSQH9l6u5XUwLTd78KB94,6619
|
32
|
+
datamax/utils/qa_generator.py,sha256=q7pzZ3DWItRQLBQH1jab2TBkjJvcKfkBuzlN0wxZ5Rs,24353
|
33
|
+
datamax/utils/tokenizer.py,sha256=j93Uky4bYDKZKT-MOtenZb36MoRPNnYk8sP9t_FSQqk,860
|
34
|
+
datamax/utils/uno_handler.py,sha256=xITU8FGeeBtHRc-Aj4lbKHGvKVslWEwWZOIUZiP_ghY,15447
|
35
|
+
pydatamax-0.1.16.post2.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
|
36
|
+
pydatamax-0.1.16.post2.dist-info/METADATA,sha256=pTi_avX8RBNYxHcPS6CmZnESFqGlX-TwvqMzF6Ilx0Q,11145
|
37
|
+
pydatamax-0.1.16.post2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
38
|
+
pydatamax-0.1.16.post2.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
|
39
|
+
pydatamax-0.1.16.post2.dist-info/RECORD,,
|
@@ -1,38 +0,0 @@
|
|
1
|
-
datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
|
2
|
-
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
datamax/loader/core.py,sha256=NGnK2m59GRBauYxZST0kyX5f4zhvAOk4Z5bVoF0CjGo,5218
|
4
|
-
datamax/loader/minio_handler.py,sha256=e7ZUlwoStVe5iQfAVfNgEwRLxen4NbxwokpJZl6AR0U,6557
|
5
|
-
datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
|
6
|
-
datamax/parser/__init__.py,sha256=sIB1N4B_fVguSBN-Uw9tGxAO6s0oi4Tq4kRZ59YlUKo,52
|
7
|
-
datamax/parser/base.py,sha256=FamDV6csc3aXVbobMR1lPNtVpvYMO19koRJW9poj_gE,2590
|
8
|
-
datamax/parser/core.py,sha256=pySissrF6kVVAzT5abIlQ-4cUliFu1HBWjcD6psNkYA,16845
|
9
|
-
datamax/parser/csv_parser.py,sha256=lHQs1MHK9WM4Vl0p9nsE3fFhewF0EoXZUhtk8ixznRw,1028
|
10
|
-
datamax/parser/doc_parser.py,sha256=qPKpZy_p1veV2AodqEQU6LzqmT7y1PANlPtt0CYoHeg,30837
|
11
|
-
datamax/parser/docx_parser.py,sha256=wdDGgeYIDg1Se493XZhlduxKjtYZ58Uqxltm2vt9Dy4,36691
|
12
|
-
datamax/parser/epub_parser.py,sha256=K4eCS4wIXJzDicvtVAfQT8yt1gFHeibZN5-EdQZfJe8,1621
|
13
|
-
datamax/parser/html_parser.py,sha256=5ACrVc03Q9pJqWI_b0EtRgOYy0eMYJq4podgHGD68Z8,1453
|
14
|
-
datamax/parser/image_parser.py,sha256=UH3duPvB7Xu6CFlEeAukX5uJ8VlqnMR89hcLsW2O-aU,1281
|
15
|
-
datamax/parser/json_parser.py,sha256=2Ns2Lm6sei9TnDaFFYvl-xhyhse12sMJBwjKNACw4po,1072
|
16
|
-
datamax/parser/md_parser.py,sha256=rHJqtRV78XgQuKtDdwn1LcgRSUEuhGBqN5uaHG6oPT4,2251
|
17
|
-
datamax/parser/pdf_parser.py,sha256=YOJFOEC8DxWLAE6yNd2x6qMvYJl3sWVcM1eA8R4uA30,4116
|
18
|
-
datamax/parser/ppt_parser.py,sha256=0OlsIrzZZJnYZqLmQkUz4J_Hiv-rQHLHJnHIsw345c8,4631
|
19
|
-
datamax/parser/pptx_parser.py,sha256=yWajVd8kpyTdFavR8XcbwxOj94uNBswDoMHfSOycn0o,1870
|
20
|
-
datamax/parser/txt_parser.py,sha256=NXs7aNpm1PUwiUSlN1RU23ittuuQSBaBNI3KeQjJFXs,1750
|
21
|
-
datamax/parser/xls_parser.py,sha256=iNMx8iPakjE984dkaFL-oUBYWpQwxbWoDnQdwfAeeGM,980
|
22
|
-
datamax/parser/xlsx_parser.py,sha256=hUOFqkqkI0XPcwwrp2cs4PFKbChpZtb8orGsZc9kxJ0,9089
|
23
|
-
datamax/utils/__init__.py,sha256=75D4WFE_FVG9MyT8qWtvtlgzuuRelTC7ObSqqfjDKIY,1476
|
24
|
-
datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
|
25
|
-
datamax/utils/data_cleaner.py,sha256=TrrxC1r0__wuOhrQSJZcJKoEIyB4eNKWZkA1IoBYhyQ,9937
|
26
|
-
datamax/utils/env_setup.py,sha256=p_7sqHwyXroeOI_yFZpUOK6wOGmPVmf-gBa6M3351O4,3539
|
27
|
-
datamax/utils/gotocr_pdf.py,sha256=A7sn77EQBDbAe-4edCBUlYXKuE2mY7JcsFGm8U3-xbE,8744
|
28
|
-
datamax/utils/mineru_operator.py,sha256=4i4FtDkDE61FWPyRoDjPujHYJq_kDUAkwlowmFWdEOA,2303
|
29
|
-
datamax/utils/paddleocr_pdf_operator.py,sha256=SW06dts1SxDnUvyf5zWYpAN_6t9PLtJXUSsYhSS317I,3645
|
30
|
-
datamax/utils/ppt_extract.py,sha256=jBVGYEsBGPjHqyq7drHTOM8MnFOwqKyHhbkKmEAryAk,6307
|
31
|
-
datamax/utils/qa_generator.py,sha256=pXxdFm_EnWgMuilfmLKgy2c6NDexQZN8nWxT-bYBt74,12548
|
32
|
-
datamax/utils/tokenizer.py,sha256=o78GPmeJ3vs3-SF0b2nMm35XtbrCKbrhDW0gI9gqGl4,880
|
33
|
-
datamax/utils/uno_handler.py,sha256=gDm42OQQQoCiOP0SB7xZ9TRF6A_XBHNavwG5ycj6kEQ,14807
|
34
|
-
pydatamax-0.1.16.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
|
35
|
-
pydatamax-0.1.16.dist-info/METADATA,sha256=nwHHLzsxwm_Za3aVbLMFfL-3JIZSZclp9KI0XL6rOHE,9795
|
36
|
-
pydatamax-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
37
|
-
pydatamax-0.1.16.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
|
38
|
-
pydatamax-0.1.16.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|