pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +84 -72
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/utils/uno_handler.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
from loguru import logger
|
2
1
|
import os
|
3
2
|
import subprocess
|
4
3
|
import threading
|
@@ -7,7 +6,9 @@ from contextlib import contextmanager
|
|
7
6
|
from pathlib import Path
|
8
7
|
from typing import Optional
|
9
8
|
|
10
|
-
|
9
|
+
from loguru import logger
|
10
|
+
|
11
|
+
# lock and flags for the delayed (lazy) import
|
11
12
|
_uno_imported = False
|
12
13
|
_import_error = None
|
13
14
|
_import_lock = threading.Lock()
|
@@ -16,23 +17,23 @@ _import_lock = threading.Lock()
|
|
16
17
|
def _lazy_import_uno():
|
17
18
|
"""延迟导入 UNO 模块,避免与其他库冲突(线程安全)"""
|
18
19
|
global _uno_imported, _import_error
|
19
|
-
|
20
|
-
#
|
20
|
+
|
21
|
+
# quick check, avoiding unnecessary acquisition of the lock
|
21
22
|
if _uno_imported:
|
22
23
|
return True
|
23
|
-
|
24
|
+
|
24
25
|
with _import_lock:
|
25
|
-
#
|
26
|
+
# re-check under the lock (double-checked locking)
|
26
27
|
if _uno_imported:
|
27
28
|
return True
|
28
|
-
|
29
|
+
|
29
30
|
try:
|
30
|
-
#
|
31
|
+
# import the UNO-related modules
|
31
32
|
global uno, PropertyValue, NoConnectException
|
32
33
|
import uno
|
33
34
|
from com.sun.star.beans import PropertyValue
|
34
35
|
from com.sun.star.connection import NoConnectException
|
35
|
-
|
36
|
+
|
36
37
|
_uno_imported = True
|
37
38
|
logger.info("✅ UNO模块导入成功")
|
38
39
|
return True
|
@@ -53,11 +54,12 @@ def ensure_uno_imported():
|
|
53
54
|
)
|
54
55
|
|
55
56
|
|
56
|
-
#
|
57
|
+
# check if uno is available (without importing it)
|
57
58
|
def check_uno_available():
|
58
59
|
"""检查 UNO 是否可用(不会真正导入)"""
|
59
60
|
try:
|
60
61
|
import importlib.util
|
62
|
+
|
61
63
|
spec = importlib.util.find_spec("uno")
|
62
64
|
return spec is not None
|
63
65
|
except:
|
@@ -72,7 +74,7 @@ class UnoManager:
|
|
72
74
|
UNO管理器,用于管理LibreOffice服务实例和文档转换
|
73
75
|
单线程版本,适合稳定高效的文档处理
|
74
76
|
"""
|
75
|
-
|
77
|
+
|
76
78
|
def __init__(self, host: str = "localhost", port: int = 2002, timeout: int = 30):
|
77
79
|
"""
|
78
80
|
初始化UNO管理器
|
@@ -82,9 +84,9 @@ class UnoManager:
|
|
82
84
|
port: LibreOffice服务端口
|
83
85
|
timeout: 连接超时时间(秒)
|
84
86
|
"""
|
85
|
-
#
|
87
|
+
# Ensure that UNO has been imported (in a thread-safe manner)
|
86
88
|
ensure_uno_imported()
|
87
|
-
|
89
|
+
|
88
90
|
self.host = host
|
89
91
|
self.port = port
|
90
92
|
self.timeout = timeout
|
@@ -102,12 +104,12 @@ class UnoManager:
|
|
102
104
|
"""启动LibreOffice服务"""
|
103
105
|
logger.info(f"🌟 启动LibreOffice服务,监听端口 {self.port}...")
|
104
106
|
|
105
|
-
#
|
107
|
+
# check if soffice running
|
106
108
|
if self._check_soffice_running():
|
107
109
|
logger.info("✅ LibreOffice服务已在运行")
|
108
110
|
return
|
109
111
|
|
110
|
-
#
|
112
|
+
# start a new soffice process
|
111
113
|
cmd = [
|
112
114
|
"soffice",
|
113
115
|
"--headless",
|
@@ -125,22 +127,24 @@ class UnoManager:
|
|
125
127
|
cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
|
126
128
|
)
|
127
129
|
logger.info(f"⏳ 等待LibreOffice服务启动...")
|
128
|
-
|
129
|
-
#
|
130
|
+
|
131
|
+
# Intelligent waiting: Polling to check service status, providing flexible time for machines of different performance levels.
|
130
132
|
start_time = time.time()
|
131
|
-
check_interval = 1 #
|
132
|
-
max_wait_time = 30
|
133
|
-
|
133
|
+
check_interval = 1 # checking every sec
|
134
|
+
max_wait_time = 30 # wait for max 30 sec
|
135
|
+
|
134
136
|
while time.time() - start_time < max_wait_time:
|
135
137
|
if self._check_soffice_running():
|
136
138
|
elapsed = time.time() - start_time
|
137
139
|
logger.info(f"✅ LibreOffice服务启动成功 (耗时 {elapsed:.1f}秒)")
|
138
140
|
return
|
139
|
-
|
140
|
-
logger.debug(
|
141
|
+
|
142
|
+
logger.debug(
|
143
|
+
f"🔄 服务未就绪,继续等待... (已等待 {time.time() - start_time:.1f}秒)"
|
144
|
+
)
|
141
145
|
time.sleep(check_interval)
|
142
|
-
|
143
|
-
#
|
146
|
+
|
147
|
+
# timed out
|
144
148
|
raise Exception(f"LibreOffice服务启动超时 (等待了{max_wait_time}秒)")
|
145
149
|
|
146
150
|
except Exception as e:
|
@@ -168,38 +172,38 @@ class UnoManager:
|
|
168
172
|
"""连接到LibreOffice服务"""
|
169
173
|
with self._lock:
|
170
174
|
if self._connected and self._desktop is not None:
|
171
|
-
return #
|
172
|
-
|
175
|
+
return # connected
|
176
|
+
|
173
177
|
self._start_soffice_service()
|
174
|
-
|
178
|
+
|
175
179
|
logger.info(f"🔌 连接到LibreOffice服务...")
|
176
180
|
start_time = time.time()
|
177
|
-
|
181
|
+
|
178
182
|
while time.time() - start_time < self.timeout:
|
179
183
|
try:
|
180
|
-
#
|
184
|
+
# get context
|
181
185
|
local_ctx = uno.getComponentContext()
|
182
186
|
resolver = local_ctx.ServiceManager.createInstanceWithContext(
|
183
187
|
"com.sun.star.bridge.UnoUrlResolver", local_ctx
|
184
188
|
)
|
185
|
-
|
186
|
-
#
|
189
|
+
|
190
|
+
# connect to LibreOffice
|
187
191
|
self._ctx = resolver.resolve(f"uno:{self.connection_string}")
|
188
192
|
self._desktop = self._ctx.ServiceManager.createInstanceWithContext(
|
189
193
|
"com.sun.star.frame.Desktop", self._ctx
|
190
194
|
)
|
191
|
-
|
195
|
+
|
192
196
|
self._connected = True
|
193
197
|
logger.info("✅ 成功连接到LibreOffice服务")
|
194
198
|
return
|
195
|
-
|
199
|
+
|
196
200
|
except NoConnectException:
|
197
201
|
logger.debug("⏳ 等待LibreOffice服务就绪...")
|
198
202
|
time.sleep(1)
|
199
203
|
except Exception as e:
|
200
204
|
logger.error(f"❌ 连接失败: {str(e)}")
|
201
205
|
time.sleep(1)
|
202
|
-
|
206
|
+
|
203
207
|
raise TimeoutError(f"连接LibreOffice服务超时({self.timeout}秒)")
|
204
208
|
|
205
209
|
def disconnect(self):
|
@@ -240,10 +244,10 @@ class UnoManager:
|
|
240
244
|
"""
|
241
245
|
self.connect()
|
242
246
|
|
243
|
-
#
|
247
|
+
# convert the path to a file URL
|
244
248
|
file_url = uno.systemPathToFileUrl(os.path.abspath(file_path))
|
245
249
|
|
246
|
-
#
|
250
|
+
# open file
|
247
251
|
properties = []
|
248
252
|
properties.append(self._make_property("Hidden", True))
|
249
253
|
properties.append(self._make_property("ReadOnly", True))
|
@@ -285,53 +289,63 @@ class UnoManager:
|
|
285
289
|
if document is None:
|
286
290
|
raise Exception(f"无法打开文档: {input_path}")
|
287
291
|
|
288
|
-
#
|
292
|
+
# prepare the output properties
|
289
293
|
properties = []
|
290
294
|
|
291
|
-
#
|
295
|
+
# set filter
|
292
296
|
if filter_name:
|
293
297
|
properties.append(self._make_property("FilterName", filter_name))
|
294
298
|
else:
|
295
|
-
#
|
299
|
+
# choose filter by format
|
296
300
|
if output_format == "txt":
|
297
|
-
#
|
301
|
+
# try several export filters in turn for text output
|
298
302
|
filter_options = [
|
299
303
|
("Text (encoded)", "UTF8"),
|
300
304
|
("Text", None),
|
301
|
-
("HTML (StarWriter)", None)
|
305
|
+
("HTML (StarWriter)", None),
|
302
306
|
]
|
303
|
-
|
307
|
+
|
304
308
|
success = False
|
305
309
|
for filter_name, filter_option in filter_options:
|
306
310
|
try:
|
307
311
|
properties = []
|
308
|
-
properties.append(
|
312
|
+
properties.append(
|
313
|
+
self._make_property("FilterName", filter_name)
|
314
|
+
)
|
309
315
|
if filter_option:
|
310
|
-
properties.append(
|
311
|
-
|
312
|
-
|
316
|
+
properties.append(
|
317
|
+
self._make_property("FilterOptions", filter_option)
|
318
|
+
)
|
319
|
+
|
320
|
+
# ensuring that the output directory exists.
|
313
321
|
output_dir = os.path.dirname(output_path)
|
314
322
|
if output_dir and not os.path.exists(output_dir):
|
315
323
|
os.makedirs(output_dir)
|
316
324
|
|
317
|
-
#
|
318
|
-
output_url = uno.systemPathToFileUrl(
|
325
|
+
# convert to a file URL
|
326
|
+
output_url = uno.systemPathToFileUrl(
|
327
|
+
os.path.abspath(output_path)
|
328
|
+
)
|
319
329
|
|
320
|
-
#
|
330
|
+
# perform the conversion
|
321
331
|
document.storeToURL(output_url, properties)
|
322
|
-
logger.info(
|
332
|
+
logger.info(
|
333
|
+
f"✅ 文档转换成功 (使用过滤器: {filter_name}): {output_path}"
|
334
|
+
)
|
323
335
|
success = True
|
324
336
|
break
|
325
337
|
except Exception as e:
|
326
338
|
logger.debug(f"🔄 过滤器 {filter_name} 失败: {str(e)}")
|
327
339
|
continue
|
328
|
-
|
340
|
+
|
329
341
|
if not success:
|
330
|
-
raise Exception(
|
331
|
-
|
332
|
-
|
342
|
+
raise Exception(
|
343
|
+
f"所有文本过滤器都失败,无法转换文档: {input_path}"
|
344
|
+
)
|
345
|
+
|
346
|
+
return # converted,return
|
333
347
|
else:
|
334
|
-
#
|
348
|
+
# Other formats use the default filter
|
335
349
|
filter_map = {
|
336
350
|
"pdf": "writer_pdf_Export",
|
337
351
|
"docx": "MS Word 2007 XML",
|
@@ -343,15 +357,15 @@ class UnoManager:
|
|
343
357
|
self._make_property("FilterName", filter_map[output_format])
|
344
358
|
)
|
345
359
|
|
346
|
-
#
|
360
|
+
# ensuring that the output directory exists
|
347
361
|
output_dir = os.path.dirname(output_path)
|
348
362
|
if output_dir and not os.path.exists(output_dir):
|
349
363
|
os.makedirs(output_dir)
|
350
364
|
|
351
|
-
#
|
365
|
+
# convert to a file URL
|
352
366
|
output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
|
353
367
|
|
354
|
-
#
|
368
|
+
# perform the conversion
|
355
369
|
document.storeToURL(output_url, properties)
|
356
370
|
logger.info(f"✅ 文档转换成功: {output_path}")
|
357
371
|
|
@@ -363,7 +377,7 @@ class UnoManager:
|
|
363
377
|
return prop
|
364
378
|
|
365
379
|
|
366
|
-
#
|
380
|
+
# global Singleton UnoManager
|
367
381
|
_global_uno_manager: Optional[UnoManager] = None
|
368
382
|
_manager_lock = threading.Lock()
|
369
383
|
|
@@ -371,20 +385,20 @@ _manager_lock = threading.Lock()
|
|
371
385
|
def get_uno_manager() -> UnoManager:
|
372
386
|
"""获取全局单例UNO管理器"""
|
373
387
|
global _global_uno_manager
|
374
|
-
|
388
|
+
|
375
389
|
if _global_uno_manager is None:
|
376
390
|
with _manager_lock:
|
377
391
|
if _global_uno_manager is None:
|
378
392
|
_global_uno_manager = UnoManager()
|
379
393
|
logger.info("🎯 创建全局单例UnoManager (单线程模式)")
|
380
|
-
|
394
|
+
|
381
395
|
return _global_uno_manager
|
382
396
|
|
383
397
|
|
384
398
|
def cleanup_uno_manager():
|
385
399
|
"""清理全局UNO管理器"""
|
386
400
|
global _global_uno_manager
|
387
|
-
|
401
|
+
|
388
402
|
with _manager_lock:
|
389
403
|
if _global_uno_manager is not None:
|
390
404
|
try:
|
@@ -402,36 +416,34 @@ def uno_manager_context():
|
|
402
416
|
try:
|
403
417
|
yield manager
|
404
418
|
finally:
|
405
|
-
#
|
419
|
+
# Maintain connections to improve efficiency in single-threaded mode
|
406
420
|
pass
|
407
421
|
|
408
422
|
|
409
423
|
def convert_with_uno(
|
410
|
-
input_path: str,
|
411
|
-
output_format: str,
|
412
|
-
output_dir: Optional[str] = None
|
424
|
+
input_path: str, output_format: str, output_dir: Optional[str] = None
|
413
425
|
) -> str:
|
414
426
|
"""
|
415
427
|
使用UNO转换文档格式(便捷函数)
|
416
|
-
|
428
|
+
|
417
429
|
Args:
|
418
430
|
input_path: 输入文件路径
|
419
431
|
output_format: 输出格式
|
420
432
|
output_dir: 输出目录(可选,默认为输入文件所在目录)
|
421
|
-
|
433
|
+
|
422
434
|
Returns:
|
423
435
|
输出文件路径
|
424
436
|
"""
|
425
437
|
input_path = Path(input_path)
|
426
|
-
|
438
|
+
|
427
439
|
if output_dir is None:
|
428
440
|
output_dir = input_path.parent
|
429
441
|
else:
|
430
442
|
output_dir = Path(output_dir)
|
431
|
-
|
443
|
+
|
432
444
|
output_path = output_dir / f"{input_path.stem}.{output_format}"
|
433
|
-
|
445
|
+
|
434
446
|
with uno_manager_context() as manager:
|
435
447
|
manager.convert_document(str(input_path), str(output_path), output_format)
|
436
|
-
|
448
|
+
|
437
449
|
return str(output_path)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pydatamax
|
3
|
-
Version: 0.1.16.
|
3
|
+
Version: 0.1.16.post2
|
4
4
|
Summary: A library for parsing and converting various file formats.
|
5
5
|
Home-page: https://github.com/Hi-Dolphin/datamax
|
6
6
|
Author: ccy
|
@@ -113,7 +113,7 @@ qa_data = dm.get_pre_label(
|
|
113
113
|
question_number=5, # 每块生成问题数
|
114
114
|
max_workers=5 # 并发数
|
115
115
|
)
|
116
|
-
dm.save_label_data(
|
116
|
+
dm.save_label_data(qa_data)
|
117
117
|
```
|
118
118
|
|
119
119
|
## 📖 Detailed Documentation
|
@@ -316,6 +316,58 @@ pip install -r requirements.txt
|
|
316
316
|
python setup.py install
|
317
317
|
```
|
318
318
|
|
319
|
+
### Developer Mode
|
320
|
+
|
321
|
+
For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
|
322
|
+
|
323
|
+
#### Setup Developer Mode
|
324
|
+
|
325
|
+
```bash
|
326
|
+
# Clone the repository
|
327
|
+
git clone https://github.com/Hi-Dolphin/datamax.git
|
328
|
+
cd datamax
|
329
|
+
|
330
|
+
# Create virtual environment (recommended)
|
331
|
+
python -m venv venv
|
332
|
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
333
|
+
|
334
|
+
# Install in developer mode
|
335
|
+
pip install -e .
|
336
|
+
```
|
337
|
+
|
338
|
+
#### Benefits of Developer Mode
|
339
|
+
|
340
|
+
- **Live Updates**: Changes to source code are immediately reflected without reinstallation
|
341
|
+
- **Easy Testing**: Test your modifications instantly
|
342
|
+
- **Debugging**: Better debugging experience with direct access to source code
|
343
|
+
- **Development Workflow**: Seamless integration with your development environment
|
344
|
+
|
345
|
+
#### Development Commands
|
346
|
+
|
347
|
+
```bash
|
348
|
+
# Run tests
|
349
|
+
pytest
|
350
|
+
|
351
|
+
# Install development dependencies
|
352
|
+
pip install -r requirements-dev.txt # if available
|
353
|
+
|
354
|
+
# Check code style
|
355
|
+
flake8 datamax/
|
356
|
+
black datamax/
|
357
|
+
|
358
|
+
# Build package
|
359
|
+
python setup.py sdist bdist_wheel
|
360
|
+
```
|
361
|
+
|
362
|
+
#### Making Changes
|
363
|
+
|
364
|
+
After installing in developer mode, you can:
|
365
|
+
|
366
|
+
1. Edit source code in the `datamax/` directory
|
367
|
+
2. Changes are automatically available when you import the module
|
368
|
+
3. Test your changes immediately without reinstalling
|
369
|
+
4. Submit pull requests with your improvements
|
370
|
+
|
319
371
|
## 📋 System Requirements
|
320
372
|
|
321
373
|
- Python >= 3.10
|
@@ -0,0 +1,39 @@
|
|
1
|
+
datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
|
2
|
+
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
datamax/loader/core.py,sha256=Ld4PmMcbKbsFyU_ynqXxpB9x3IJ34c3hfJBcUiSthrA,5370
|
4
|
+
datamax/loader/minio_handler.py,sha256=VpQ5EHZfLaw0e2JXflAbgPK_plmM_VkPXEiKtIZlQL0,6876
|
5
|
+
datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
|
6
|
+
datamax/parser/__init__.py,sha256=3tCt1bmTjJ0sroivt60AoQuZyHH8AtCvn664Qtoh-60,56
|
7
|
+
datamax/parser/base.py,sha256=yEPdk3K-vTf2JnIcTczxEDoMQtVKva9tp2nSACeXOB0,3153
|
8
|
+
datamax/parser/core.py,sha256=eglNe4Vk6U3XEUYT2oLovWfuL1XeFDc0KnRahYN24Mk,19208
|
9
|
+
datamax/parser/csv_parser.py,sha256=PPRqL4MKDymBgFYo0xrgrO8HB3jFcrXTizG27fXVEag,1698
|
10
|
+
datamax/parser/doc_parser.py,sha256=x6aJMQmNCUR2WA0hMvsro1atz6kWK3pRLPzlLfwCUw0,32273
|
11
|
+
datamax/parser/docx_parser.py,sha256=40xq86jLI3nayg0dthBnWSMN2qYQeyUNWh5wWJ8Lar8,37658
|
12
|
+
datamax/parser/epub_parser.py,sha256=zjLp1ha_oQBGIxvgzAyHzO0ZZJQt1JfoAZ9TM8liZ0o,2708
|
13
|
+
datamax/parser/html_parser.py,sha256=b5Rvonj0cScYI2gYfKAfBplp6C8kT2ataf3J0lO40Ok,2017
|
14
|
+
datamax/parser/image_parser.py,sha256=HYNN0oqA1LjI8XBQN09nnWfnVrLcDHcXHOytf8z6NAk,2536
|
15
|
+
datamax/parser/json_parser.py,sha256=wzUKv1lH35PtM8uXunxNZ6ykHEdI_m02SIW1Y7y-wxc,1826
|
16
|
+
datamax/parser/md_parser.py,sha256=62vBHAsHotCC1bNquh7jt8EZuoPyANrbkRTF7YlEOMs,3019
|
17
|
+
datamax/parser/pdf_parser.py,sha256=pm6WVNe2nP5K-XHwNfa3BoS_oNDDdJ5kOo5QERICfkI,5245
|
18
|
+
datamax/parser/ppt_parser.py,sha256=7KmSrxyPRYlYqclMcsxQmLTD3eR1mbED4qgm1mjF4Mg,5808
|
19
|
+
datamax/parser/pptx_parser.py,sha256=huMbv9JMGI7Nvs3v-TGn7MOpicb8-z3L5G24PxBVDzw,2546
|
20
|
+
datamax/parser/txt_parser.py,sha256=GyPFuYQ00mI1shmMFi_gtDJ8B-C2rMI9rI0sjay32Hs,2630
|
21
|
+
datamax/parser/xls_parser.py,sha256=J-Eumrh3oxwr06YHHfUplJlskwBlsEoj3sQ9OVXBFCo,1819
|
22
|
+
datamax/parser/xlsx_parser.py,sha256=Uj1OisEVAzO8mMRcTHpitBstS0M7aSS4UehnE78pvxU,9468
|
23
|
+
datamax/utils/__init__.py,sha256=elPbB7MSk5VfcmKmhaXCTUsVXP9vxd8C-DAMG3JqbDU,1491
|
24
|
+
datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
|
25
|
+
datamax/utils/data_cleaner.py,sha256=2sfjXkDaEXavr98Ezj1BWG4uJQPUzeR99172tH43-Yk,10454
|
26
|
+
datamax/utils/env_setup.py,sha256=lXPAL6WGkjOBgqTa2A0li5YS2TQ96cvAR4OhJjQP4pA,3638
|
27
|
+
datamax/utils/gotocr_pdf.py,sha256=LHQ4nIFNC47b7hLWzMGkk_UsLmIxMLnUhRa48iwJo48,8796
|
28
|
+
datamax/utils/lifecycle_types.py,sha256=rvHB4zwzS_nlWKUtWA37L9dJNvx6ol5F2-x2eEf6zJk,625
|
29
|
+
datamax/utils/mineru_operator.py,sha256=mBw9xuCwJZmmOLaUFhw2c3JPDB7KMjWqSlEzbKCRXc8,2276
|
30
|
+
datamax/utils/paddleocr_pdf_operator.py,sha256=5l7P7wCGd4-Qph3NMTDdHR6nStjafDMNpX4sSCFv5qQ,3637
|
31
|
+
datamax/utils/ppt_extract.py,sha256=Sf4H3TKdK6BnKRv0sw5JnfKSQH9l6u5XUwLTd78KB94,6619
|
32
|
+
datamax/utils/qa_generator.py,sha256=q7pzZ3DWItRQLBQH1jab2TBkjJvcKfkBuzlN0wxZ5Rs,24353
|
33
|
+
datamax/utils/tokenizer.py,sha256=j93Uky4bYDKZKT-MOtenZb36MoRPNnYk8sP9t_FSQqk,860
|
34
|
+
datamax/utils/uno_handler.py,sha256=xITU8FGeeBtHRc-Aj4lbKHGvKVslWEwWZOIUZiP_ghY,15447
|
35
|
+
pydatamax-0.1.16.post2.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
|
36
|
+
pydatamax-0.1.16.post2.dist-info/METADATA,sha256=pTi_avX8RBNYxHcPS6CmZnESFqGlX-TwvqMzF6Ilx0Q,11145
|
37
|
+
pydatamax-0.1.16.post2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
38
|
+
pydatamax-0.1.16.post2.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
|
39
|
+
pydatamax-0.1.16.post2.dist-info/RECORD,,
|
@@ -1,38 +0,0 @@
|
|
1
|
-
datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
|
2
|
-
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
datamax/loader/core.py,sha256=NGnK2m59GRBauYxZST0kyX5f4zhvAOk4Z5bVoF0CjGo,5218
|
4
|
-
datamax/loader/minio_handler.py,sha256=e7ZUlwoStVe5iQfAVfNgEwRLxen4NbxwokpJZl6AR0U,6557
|
5
|
-
datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
|
6
|
-
datamax/parser/__init__.py,sha256=sIB1N4B_fVguSBN-Uw9tGxAO6s0oi4Tq4kRZ59YlUKo,52
|
7
|
-
datamax/parser/base.py,sha256=FamDV6csc3aXVbobMR1lPNtVpvYMO19koRJW9poj_gE,2590
|
8
|
-
datamax/parser/core.py,sha256=pySissrF6kVVAzT5abIlQ-4cUliFu1HBWjcD6psNkYA,16845
|
9
|
-
datamax/parser/csv_parser.py,sha256=lHQs1MHK9WM4Vl0p9nsE3fFhewF0EoXZUhtk8ixznRw,1028
|
10
|
-
datamax/parser/doc_parser.py,sha256=qPKpZy_p1veV2AodqEQU6LzqmT7y1PANlPtt0CYoHeg,30837
|
11
|
-
datamax/parser/docx_parser.py,sha256=wdDGgeYIDg1Se493XZhlduxKjtYZ58Uqxltm2vt9Dy4,36691
|
12
|
-
datamax/parser/epub_parser.py,sha256=K4eCS4wIXJzDicvtVAfQT8yt1gFHeibZN5-EdQZfJe8,1621
|
13
|
-
datamax/parser/html_parser.py,sha256=5ACrVc03Q9pJqWI_b0EtRgOYy0eMYJq4podgHGD68Z8,1453
|
14
|
-
datamax/parser/image_parser.py,sha256=UH3duPvB7Xu6CFlEeAukX5uJ8VlqnMR89hcLsW2O-aU,1281
|
15
|
-
datamax/parser/json_parser.py,sha256=2Ns2Lm6sei9TnDaFFYvl-xhyhse12sMJBwjKNACw4po,1072
|
16
|
-
datamax/parser/md_parser.py,sha256=rHJqtRV78XgQuKtDdwn1LcgRSUEuhGBqN5uaHG6oPT4,2251
|
17
|
-
datamax/parser/pdf_parser.py,sha256=YOJFOEC8DxWLAE6yNd2x6qMvYJl3sWVcM1eA8R4uA30,4116
|
18
|
-
datamax/parser/ppt_parser.py,sha256=0OlsIrzZZJnYZqLmQkUz4J_Hiv-rQHLHJnHIsw345c8,4631
|
19
|
-
datamax/parser/pptx_parser.py,sha256=yWajVd8kpyTdFavR8XcbwxOj94uNBswDoMHfSOycn0o,1870
|
20
|
-
datamax/parser/txt_parser.py,sha256=NXs7aNpm1PUwiUSlN1RU23ittuuQSBaBNI3KeQjJFXs,1750
|
21
|
-
datamax/parser/xls_parser.py,sha256=iNMx8iPakjE984dkaFL-oUBYWpQwxbWoDnQdwfAeeGM,980
|
22
|
-
datamax/parser/xlsx_parser.py,sha256=hUOFqkqkI0XPcwwrp2cs4PFKbChpZtb8orGsZc9kxJ0,9089
|
23
|
-
datamax/utils/__init__.py,sha256=75D4WFE_FVG9MyT8qWtvtlgzuuRelTC7ObSqqfjDKIY,1476
|
24
|
-
datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
|
25
|
-
datamax/utils/data_cleaner.py,sha256=TrrxC1r0__wuOhrQSJZcJKoEIyB4eNKWZkA1IoBYhyQ,9937
|
26
|
-
datamax/utils/env_setup.py,sha256=p_7sqHwyXroeOI_yFZpUOK6wOGmPVmf-gBa6M3351O4,3539
|
27
|
-
datamax/utils/gotocr_pdf.py,sha256=A7sn77EQBDbAe-4edCBUlYXKuE2mY7JcsFGm8U3-xbE,8744
|
28
|
-
datamax/utils/mineru_operator.py,sha256=4i4FtDkDE61FWPyRoDjPujHYJq_kDUAkwlowmFWdEOA,2303
|
29
|
-
datamax/utils/paddleocr_pdf_operator.py,sha256=SW06dts1SxDnUvyf5zWYpAN_6t9PLtJXUSsYhSS317I,3645
|
30
|
-
datamax/utils/ppt_extract.py,sha256=jBVGYEsBGPjHqyq7drHTOM8MnFOwqKyHhbkKmEAryAk,6307
|
31
|
-
datamax/utils/qa_generator.py,sha256=pXxdFm_EnWgMuilfmLKgy2c6NDexQZN8nWxT-bYBt74,12548
|
32
|
-
datamax/utils/tokenizer.py,sha256=o78GPmeJ3vs3-SF0b2nMm35XtbrCKbrhDW0gI9gqGl4,880
|
33
|
-
datamax/utils/uno_handler.py,sha256=ehUyk3I8dxMzjK8IzNO5nKcmc-t97ERMUqmSbYPeABc,15435
|
34
|
-
pydatamax-0.1.16.post1.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
|
35
|
-
pydatamax-0.1.16.post1.dist-info/METADATA,sha256=6I4bYRn8noQbBVURScRDut0fFksMDiU3wAXSNgpavDg,9801
|
36
|
-
pydatamax-0.1.16.post1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
37
|
-
pydatamax-0.1.16.post1.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
|
38
|
-
pydatamax-0.1.16.post1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|