pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +84 -72
  31. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,3 @@
1
- from loguru import logger
2
1
  import os
3
2
  import subprocess
4
3
  import threading
@@ -7,7 +6,9 @@ from contextlib import contextmanager
7
6
  from pathlib import Path
8
7
  from typing import Optional
9
8
 
10
- # 延迟导入标志和锁
9
+ from loguru import logger
10
+
11
+ # flag and lock used for the lazy (deferred) import
11
12
  _uno_imported = False
12
13
  _import_error = None
13
14
  _import_lock = threading.Lock()
@@ -16,23 +17,23 @@ _import_lock = threading.Lock()
16
17
  def _lazy_import_uno():
17
18
  """延迟导入 UNO 模块,避免与其他库冲突(线程安全)"""
18
19
  global _uno_imported, _import_error
19
-
20
- # 快速检查,避免不必要的锁获取
20
+
21
+ # quick check, avoiding unnecessary lock acquisition
21
22
  if _uno_imported:
22
23
  return True
23
-
24
+
24
25
  with _import_lock:
25
- # 双重检查锁定模式
26
+ # double-checked locking pattern
26
27
  if _uno_imported:
27
28
  return True
28
-
29
+
29
30
  try:
30
- # 在这里导入所有 UNO 相关的模块
31
+ # import all UNO-related modules here
31
32
  global uno, PropertyValue, NoConnectException
32
33
  import uno
33
34
  from com.sun.star.beans import PropertyValue
34
35
  from com.sun.star.connection import NoConnectException
35
-
36
+
36
37
  _uno_imported = True
37
38
  logger.info("✅ UNO模块导入成功")
38
39
  return True
@@ -53,11 +54,12 @@ def ensure_uno_imported():
53
54
  )
54
55
 
55
56
 
56
- # 检查 UNO 是否可用(但不立即导入)
57
+ # check whether UNO is available (without importing it immediately)
57
58
  def check_uno_available():
58
59
  """检查 UNO 是否可用(不会真正导入)"""
59
60
  try:
60
61
  import importlib.util
62
+
61
63
  spec = importlib.util.find_spec("uno")
62
64
  return spec is not None
63
65
  except:
@@ -72,7 +74,7 @@ class UnoManager:
72
74
  UNO管理器,用于管理LibreOffice服务实例和文档转换
73
75
  单线程版本,适合稳定高效的文档处理
74
76
  """
75
-
77
+
76
78
  def __init__(self, host: str = "localhost", port: int = 2002, timeout: int = 30):
77
79
  """
78
80
  初始化UNO管理器
@@ -82,9 +84,9 @@ class UnoManager:
82
84
  port: LibreOffice服务端口
83
85
  timeout: 连接超时时间(秒)
84
86
  """
85
- # 确保UNO已导入(使用线程安全的方式)
87
+ # Ensure that UNO has been imported (in a thread-safe manner)
86
88
  ensure_uno_imported()
87
-
89
+
88
90
  self.host = host
89
91
  self.port = port
90
92
  self.timeout = timeout
@@ -102,12 +104,12 @@ class UnoManager:
102
104
  """启动LibreOffice服务"""
103
105
  logger.info(f"🌟 启动LibreOffice服务,监听端口 {self.port}...")
104
106
 
105
- # 检查是否已有服务在运行
107
+ # check whether a soffice service is already running
106
108
  if self._check_soffice_running():
107
109
  logger.info("✅ LibreOffice服务已在运行")
108
110
  return
109
111
 
110
- # 启动新的服务实例
112
+ # start a new soffice service instance
111
113
  cmd = [
112
114
  "soffice",
113
115
  "--headless",
@@ -125,22 +127,24 @@ class UnoManager:
125
127
  cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
126
128
  )
127
129
  logger.info(f"⏳ 等待LibreOffice服务启动...")
128
-
129
- # 智能等待:轮询检查服务状态,给不同性能机器弹性时间
130
+
131
+ # Intelligent waiting: Polling to check service status, providing flexible time for machines of different performance levels.
130
132
  start_time = time.time()
131
- check_interval = 1 # 每1秒检查一次
132
- max_wait_time = 30 # 最大等待30
133
-
133
+ check_interval = 1 # checking every sec
134
+ max_wait_time = 30 # wait for max 30 sec
135
+
134
136
  while time.time() - start_time < max_wait_time:
135
137
  if self._check_soffice_running():
136
138
  elapsed = time.time() - start_time
137
139
  logger.info(f"✅ LibreOffice服务启动成功 (耗时 {elapsed:.1f}秒)")
138
140
  return
139
-
140
- logger.debug(f"🔄 服务未就绪,继续等待... (已等待 {time.time() - start_time:.1f}秒)")
141
+
142
+ logger.debug(
143
+ f"🔄 服务未就绪,继续等待... (已等待 {time.time() - start_time:.1f}秒)"
144
+ )
141
145
  time.sleep(check_interval)
142
-
143
- # 超时仍未启动
146
+
147
+ # timed out: the service still has not started
144
148
  raise Exception(f"LibreOffice服务启动超时 (等待了{max_wait_time}秒)")
145
149
 
146
150
  except Exception as e:
@@ -168,38 +172,38 @@ class UnoManager:
168
172
  """连接到LibreOffice服务"""
169
173
  with self._lock:
170
174
  if self._connected and self._desktop is not None:
171
- return # 已连接
172
-
175
+ return # connected
176
+
173
177
  self._start_soffice_service()
174
-
178
+
175
179
  logger.info(f"🔌 连接到LibreOffice服务...")
176
180
  start_time = time.time()
177
-
181
+
178
182
  while time.time() - start_time < self.timeout:
179
183
  try:
180
- # 获取组件上下文
184
+ # get context
181
185
  local_ctx = uno.getComponentContext()
182
186
  resolver = local_ctx.ServiceManager.createInstanceWithContext(
183
187
  "com.sun.star.bridge.UnoUrlResolver", local_ctx
184
188
  )
185
-
186
- # 连接到LibreOffice
189
+
190
+ # connect to LibreOffice
187
191
  self._ctx = resolver.resolve(f"uno:{self.connection_string}")
188
192
  self._desktop = self._ctx.ServiceManager.createInstanceWithContext(
189
193
  "com.sun.star.frame.Desktop", self._ctx
190
194
  )
191
-
195
+
192
196
  self._connected = True
193
197
  logger.info("✅ 成功连接到LibreOffice服务")
194
198
  return
195
-
199
+
196
200
  except NoConnectException:
197
201
  logger.debug("⏳ 等待LibreOffice服务就绪...")
198
202
  time.sleep(1)
199
203
  except Exception as e:
200
204
  logger.error(f"❌ 连接失败: {str(e)}")
201
205
  time.sleep(1)
202
-
206
+
203
207
  raise TimeoutError(f"连接LibreOffice服务超时({self.timeout}秒)")
204
208
 
205
209
  def disconnect(self):
@@ -240,10 +244,10 @@ class UnoManager:
240
244
  """
241
245
  self.connect()
242
246
 
243
- # 将路径转换为URL格式
247
+ # convert the path to URL format
244
248
  file_url = uno.systemPathToFileUrl(os.path.abspath(file_path))
245
249
 
246
- # 打开文档
250
+ # open file
247
251
  properties = []
248
252
  properties.append(self._make_property("Hidden", True))
249
253
  properties.append(self._make_property("ReadOnly", True))
@@ -285,53 +289,63 @@ class UnoManager:
285
289
  if document is None:
286
290
  raise Exception(f"无法打开文档: {input_path}")
287
291
 
288
- # 准备输出属性
292
+ # prepare to output properties
289
293
  properties = []
290
294
 
291
- # 设置过滤器
295
+ # set filter
292
296
  if filter_name:
293
297
  properties.append(self._make_property("FilterName", filter_name))
294
298
  else:
295
- # 根据格式自动选择过滤器
299
+ # choose filter by format
296
300
  if output_format == "txt":
297
- # 对于文本格式,尝试多个过滤器
301
+ # for text output, try several filters in turn
298
302
  filter_options = [
299
303
  ("Text (encoded)", "UTF8"),
300
304
  ("Text", None),
301
- ("HTML (StarWriter)", None)
305
+ ("HTML (StarWriter)", None),
302
306
  ]
303
-
307
+
304
308
  success = False
305
309
  for filter_name, filter_option in filter_options:
306
310
  try:
307
311
  properties = []
308
- properties.append(self._make_property("FilterName", filter_name))
312
+ properties.append(
313
+ self._make_property("FilterName", filter_name)
314
+ )
309
315
  if filter_option:
310
- properties.append(self._make_property("FilterOptions", filter_option))
311
-
312
- # 确保输出目录存在
316
+ properties.append(
317
+ self._make_property("FilterOptions", filter_option)
318
+ )
319
+
320
+ # ensuring that the output directory exists.
313
321
  output_dir = os.path.dirname(output_path)
314
322
  if output_dir and not os.path.exists(output_dir):
315
323
  os.makedirs(output_dir)
316
324
 
317
- # 转换为URL格式
318
- output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
325
+ # convert to URL format
326
+ output_url = uno.systemPathToFileUrl(
327
+ os.path.abspath(output_path)
328
+ )
319
329
 
320
- # 执行转换
330
+ # perform the conversion
321
331
  document.storeToURL(output_url, properties)
322
- logger.info(f"✅ 文档转换成功 (使用过滤器: {filter_name}): {output_path}")
332
+ logger.info(
333
+ f"✅ 文档转换成功 (使用过滤器: {filter_name}): {output_path}"
334
+ )
323
335
  success = True
324
336
  break
325
337
  except Exception as e:
326
338
  logger.debug(f"🔄 过滤器 {filter_name} 失败: {str(e)}")
327
339
  continue
328
-
340
+
329
341
  if not success:
330
- raise Exception(f"所有文本过滤器都失败,无法转换文档: {input_path}")
331
-
332
- return # 已经完成转换,直接返回
342
+ raise Exception(
343
+ f"所有文本过滤器都失败,无法转换文档: {input_path}"
344
+ )
345
+
346
+ return # conversion already completed, return directly
333
347
  else:
334
- # 其他格式使用默认过滤器
348
+ # Other formats use the default filter
335
349
  filter_map = {
336
350
  "pdf": "writer_pdf_Export",
337
351
  "docx": "MS Word 2007 XML",
@@ -343,15 +357,15 @@ class UnoManager:
343
357
  self._make_property("FilterName", filter_map[output_format])
344
358
  )
345
359
 
346
- # 确保输出目录存在
360
+ # ensuring that the output directory exists
347
361
  output_dir = os.path.dirname(output_path)
348
362
  if output_dir and not os.path.exists(output_dir):
349
363
  os.makedirs(output_dir)
350
364
 
351
- # 转换为URL格式
365
+ # convert to URL format
352
366
  output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
353
367
 
354
- # 执行转换
368
+ # perform the conversion
355
369
  document.storeToURL(output_url, properties)
356
370
  logger.info(f"✅ 文档转换成功: {output_path}")
357
371
 
@@ -363,7 +377,7 @@ class UnoManager:
363
377
  return prop
364
378
 
365
379
 
366
- # 全局单例UnoManager
380
+ # global Singleton UnoManager
367
381
  _global_uno_manager: Optional[UnoManager] = None
368
382
  _manager_lock = threading.Lock()
369
383
 
@@ -371,20 +385,20 @@ _manager_lock = threading.Lock()
371
385
  def get_uno_manager() -> UnoManager:
372
386
  """获取全局单例UNO管理器"""
373
387
  global _global_uno_manager
374
-
388
+
375
389
  if _global_uno_manager is None:
376
390
  with _manager_lock:
377
391
  if _global_uno_manager is None:
378
392
  _global_uno_manager = UnoManager()
379
393
  logger.info("🎯 创建全局单例UnoManager (单线程模式)")
380
-
394
+
381
395
  return _global_uno_manager
382
396
 
383
397
 
384
398
  def cleanup_uno_manager():
385
399
  """清理全局UNO管理器"""
386
400
  global _global_uno_manager
387
-
401
+
388
402
  with _manager_lock:
389
403
  if _global_uno_manager is not None:
390
404
  try:
@@ -402,36 +416,34 @@ def uno_manager_context():
402
416
  try:
403
417
  yield manager
404
418
  finally:
405
- # 在单线程模式下,保持连接以提高效率
419
+ # Maintain connections to improve efficiency in single-threaded mode
406
420
  pass
407
421
 
408
422
 
409
423
  def convert_with_uno(
410
- input_path: str,
411
- output_format: str,
412
- output_dir: Optional[str] = None
424
+ input_path: str, output_format: str, output_dir: Optional[str] = None
413
425
  ) -> str:
414
426
  """
415
427
  使用UNO转换文档格式(便捷函数)
416
-
428
+
417
429
  Args:
418
430
  input_path: 输入文件路径
419
431
  output_format: 输出格式
420
432
  output_dir: 输出目录(可选,默认为输入文件所在目录)
421
-
433
+
422
434
  Returns:
423
435
  输出文件路径
424
436
  """
425
437
  input_path = Path(input_path)
426
-
438
+
427
439
  if output_dir is None:
428
440
  output_dir = input_path.parent
429
441
  else:
430
442
  output_dir = Path(output_dir)
431
-
443
+
432
444
  output_path = output_dir / f"{input_path.stem}.{output_format}"
433
-
445
+
434
446
  with uno_manager_context() as manager:
435
447
  manager.convert_document(str(input_path), str(output_path), output_format)
436
-
448
+
437
449
  return str(output_path)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pydatamax
3
- Version: 0.1.16.post1
3
+ Version: 0.1.16.post2
4
4
  Summary: A library for parsing and converting various file formats.
5
5
  Home-page: https://github.com/Hi-Dolphin/datamax
6
6
  Author: ccy
@@ -113,7 +113,7 @@ qa_data = dm.get_pre_label(
113
113
  question_number=5, # 每块生成问题数
114
114
  max_workers=5 # 并发数
115
115
  )
116
- dm.save_label_data(res)
116
+ dm.save_label_data(qa_data)
117
117
  ```
118
118
 
119
119
  ## 📖 Detailed Documentation
@@ -316,6 +316,58 @@ pip install -r requirements.txt
316
316
  python setup.py install
317
317
  ```
318
318
 
319
+ ### Developer Mode
320
+
321
+ For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
322
+
323
+ #### Setup Developer Mode
324
+
325
+ ```bash
326
+ # Clone the repository
327
+ git clone https://github.com/Hi-Dolphin/datamax.git
328
+ cd datamax
329
+
330
+ # Create virtual environment (recommended)
331
+ python -m venv venv
332
+ source venv/bin/activate # On Windows: venv\Scripts\activate
333
+
334
+ # Install in developer mode
335
+ pip install -e .
336
+ ```
337
+
338
+ #### Benefits of Developer Mode
339
+
340
+ - **Live Updates**: Changes to source code are immediately reflected without reinstallation
341
+ - **Easy Testing**: Test your modifications instantly
342
+ - **Debugging**: Better debugging experience with direct access to source code
343
+ - **Development Workflow**: Seamless integration with your development environment
344
+
345
+ #### Development Commands
346
+
347
+ ```bash
348
+ # Run tests
349
+ pytest
350
+
351
+ # Install development dependencies
352
+ pip install -r requirements-dev.txt # if available
353
+
354
+ # Check code style
355
+ flake8 datamax/
356
+ black datamax/
357
+
358
+ # Build package
359
+ python setup.py sdist bdist_wheel
360
+ ```
361
+
362
+ #### Making Changes
363
+
364
+ After installing in developer mode, you can:
365
+
366
+ 1. Edit source code in the `datamax/` directory
367
+ 2. Changes are automatically available when you import the module
368
+ 3. Test your changes immediately without reinstalling
369
+ 4. Submit pull requests with your improvements
370
+
319
371
  ## 📋 System Requirements
320
372
 
321
373
  - Python >= 3.10
@@ -0,0 +1,39 @@
1
+ datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
2
+ datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ datamax/loader/core.py,sha256=Ld4PmMcbKbsFyU_ynqXxpB9x3IJ34c3hfJBcUiSthrA,5370
4
+ datamax/loader/minio_handler.py,sha256=VpQ5EHZfLaw0e2JXflAbgPK_plmM_VkPXEiKtIZlQL0,6876
5
+ datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
6
+ datamax/parser/__init__.py,sha256=3tCt1bmTjJ0sroivt60AoQuZyHH8AtCvn664Qtoh-60,56
7
+ datamax/parser/base.py,sha256=yEPdk3K-vTf2JnIcTczxEDoMQtVKva9tp2nSACeXOB0,3153
8
+ datamax/parser/core.py,sha256=eglNe4Vk6U3XEUYT2oLovWfuL1XeFDc0KnRahYN24Mk,19208
9
+ datamax/parser/csv_parser.py,sha256=PPRqL4MKDymBgFYo0xrgrO8HB3jFcrXTizG27fXVEag,1698
10
+ datamax/parser/doc_parser.py,sha256=x6aJMQmNCUR2WA0hMvsro1atz6kWK3pRLPzlLfwCUw0,32273
11
+ datamax/parser/docx_parser.py,sha256=40xq86jLI3nayg0dthBnWSMN2qYQeyUNWh5wWJ8Lar8,37658
12
+ datamax/parser/epub_parser.py,sha256=zjLp1ha_oQBGIxvgzAyHzO0ZZJQt1JfoAZ9TM8liZ0o,2708
13
+ datamax/parser/html_parser.py,sha256=b5Rvonj0cScYI2gYfKAfBplp6C8kT2ataf3J0lO40Ok,2017
14
+ datamax/parser/image_parser.py,sha256=HYNN0oqA1LjI8XBQN09nnWfnVrLcDHcXHOytf8z6NAk,2536
15
+ datamax/parser/json_parser.py,sha256=wzUKv1lH35PtM8uXunxNZ6ykHEdI_m02SIW1Y7y-wxc,1826
16
+ datamax/parser/md_parser.py,sha256=62vBHAsHotCC1bNquh7jt8EZuoPyANrbkRTF7YlEOMs,3019
17
+ datamax/parser/pdf_parser.py,sha256=pm6WVNe2nP5K-XHwNfa3BoS_oNDDdJ5kOo5QERICfkI,5245
18
+ datamax/parser/ppt_parser.py,sha256=7KmSrxyPRYlYqclMcsxQmLTD3eR1mbED4qgm1mjF4Mg,5808
19
+ datamax/parser/pptx_parser.py,sha256=huMbv9JMGI7Nvs3v-TGn7MOpicb8-z3L5G24PxBVDzw,2546
20
+ datamax/parser/txt_parser.py,sha256=GyPFuYQ00mI1shmMFi_gtDJ8B-C2rMI9rI0sjay32Hs,2630
21
+ datamax/parser/xls_parser.py,sha256=J-Eumrh3oxwr06YHHfUplJlskwBlsEoj3sQ9OVXBFCo,1819
22
+ datamax/parser/xlsx_parser.py,sha256=Uj1OisEVAzO8mMRcTHpitBstS0M7aSS4UehnE78pvxU,9468
23
+ datamax/utils/__init__.py,sha256=elPbB7MSk5VfcmKmhaXCTUsVXP9vxd8C-DAMG3JqbDU,1491
24
+ datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
25
+ datamax/utils/data_cleaner.py,sha256=2sfjXkDaEXavr98Ezj1BWG4uJQPUzeR99172tH43-Yk,10454
26
+ datamax/utils/env_setup.py,sha256=lXPAL6WGkjOBgqTa2A0li5YS2TQ96cvAR4OhJjQP4pA,3638
27
+ datamax/utils/gotocr_pdf.py,sha256=LHQ4nIFNC47b7hLWzMGkk_UsLmIxMLnUhRa48iwJo48,8796
28
+ datamax/utils/lifecycle_types.py,sha256=rvHB4zwzS_nlWKUtWA37L9dJNvx6ol5F2-x2eEf6zJk,625
29
+ datamax/utils/mineru_operator.py,sha256=mBw9xuCwJZmmOLaUFhw2c3JPDB7KMjWqSlEzbKCRXc8,2276
30
+ datamax/utils/paddleocr_pdf_operator.py,sha256=5l7P7wCGd4-Qph3NMTDdHR6nStjafDMNpX4sSCFv5qQ,3637
31
+ datamax/utils/ppt_extract.py,sha256=Sf4H3TKdK6BnKRv0sw5JnfKSQH9l6u5XUwLTd78KB94,6619
32
+ datamax/utils/qa_generator.py,sha256=q7pzZ3DWItRQLBQH1jab2TBkjJvcKfkBuzlN0wxZ5Rs,24353
33
+ datamax/utils/tokenizer.py,sha256=j93Uky4bYDKZKT-MOtenZb36MoRPNnYk8sP9t_FSQqk,860
34
+ datamax/utils/uno_handler.py,sha256=xITU8FGeeBtHRc-Aj4lbKHGvKVslWEwWZOIUZiP_ghY,15447
35
+ pydatamax-0.1.16.post2.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
36
+ pydatamax-0.1.16.post2.dist-info/METADATA,sha256=pTi_avX8RBNYxHcPS6CmZnESFqGlX-TwvqMzF6Ilx0Q,11145
37
+ pydatamax-0.1.16.post2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
38
+ pydatamax-0.1.16.post2.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
39
+ pydatamax-0.1.16.post2.dist-info/RECORD,,
@@ -1,38 +0,0 @@
1
- datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
2
- datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- datamax/loader/core.py,sha256=NGnK2m59GRBauYxZST0kyX5f4zhvAOk4Z5bVoF0CjGo,5218
4
- datamax/loader/minio_handler.py,sha256=e7ZUlwoStVe5iQfAVfNgEwRLxen4NbxwokpJZl6AR0U,6557
5
- datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
6
- datamax/parser/__init__.py,sha256=sIB1N4B_fVguSBN-Uw9tGxAO6s0oi4Tq4kRZ59YlUKo,52
7
- datamax/parser/base.py,sha256=FamDV6csc3aXVbobMR1lPNtVpvYMO19koRJW9poj_gE,2590
8
- datamax/parser/core.py,sha256=pySissrF6kVVAzT5abIlQ-4cUliFu1HBWjcD6psNkYA,16845
9
- datamax/parser/csv_parser.py,sha256=lHQs1MHK9WM4Vl0p9nsE3fFhewF0EoXZUhtk8ixznRw,1028
10
- datamax/parser/doc_parser.py,sha256=qPKpZy_p1veV2AodqEQU6LzqmT7y1PANlPtt0CYoHeg,30837
11
- datamax/parser/docx_parser.py,sha256=wdDGgeYIDg1Se493XZhlduxKjtYZ58Uqxltm2vt9Dy4,36691
12
- datamax/parser/epub_parser.py,sha256=K4eCS4wIXJzDicvtVAfQT8yt1gFHeibZN5-EdQZfJe8,1621
13
- datamax/parser/html_parser.py,sha256=5ACrVc03Q9pJqWI_b0EtRgOYy0eMYJq4podgHGD68Z8,1453
14
- datamax/parser/image_parser.py,sha256=UH3duPvB7Xu6CFlEeAukX5uJ8VlqnMR89hcLsW2O-aU,1281
15
- datamax/parser/json_parser.py,sha256=2Ns2Lm6sei9TnDaFFYvl-xhyhse12sMJBwjKNACw4po,1072
16
- datamax/parser/md_parser.py,sha256=rHJqtRV78XgQuKtDdwn1LcgRSUEuhGBqN5uaHG6oPT4,2251
17
- datamax/parser/pdf_parser.py,sha256=YOJFOEC8DxWLAE6yNd2x6qMvYJl3sWVcM1eA8R4uA30,4116
18
- datamax/parser/ppt_parser.py,sha256=0OlsIrzZZJnYZqLmQkUz4J_Hiv-rQHLHJnHIsw345c8,4631
19
- datamax/parser/pptx_parser.py,sha256=yWajVd8kpyTdFavR8XcbwxOj94uNBswDoMHfSOycn0o,1870
20
- datamax/parser/txt_parser.py,sha256=NXs7aNpm1PUwiUSlN1RU23ittuuQSBaBNI3KeQjJFXs,1750
21
- datamax/parser/xls_parser.py,sha256=iNMx8iPakjE984dkaFL-oUBYWpQwxbWoDnQdwfAeeGM,980
22
- datamax/parser/xlsx_parser.py,sha256=hUOFqkqkI0XPcwwrp2cs4PFKbChpZtb8orGsZc9kxJ0,9089
23
- datamax/utils/__init__.py,sha256=75D4WFE_FVG9MyT8qWtvtlgzuuRelTC7ObSqqfjDKIY,1476
24
- datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
25
- datamax/utils/data_cleaner.py,sha256=TrrxC1r0__wuOhrQSJZcJKoEIyB4eNKWZkA1IoBYhyQ,9937
26
- datamax/utils/env_setup.py,sha256=p_7sqHwyXroeOI_yFZpUOK6wOGmPVmf-gBa6M3351O4,3539
27
- datamax/utils/gotocr_pdf.py,sha256=A7sn77EQBDbAe-4edCBUlYXKuE2mY7JcsFGm8U3-xbE,8744
28
- datamax/utils/mineru_operator.py,sha256=4i4FtDkDE61FWPyRoDjPujHYJq_kDUAkwlowmFWdEOA,2303
29
- datamax/utils/paddleocr_pdf_operator.py,sha256=SW06dts1SxDnUvyf5zWYpAN_6t9PLtJXUSsYhSS317I,3645
30
- datamax/utils/ppt_extract.py,sha256=jBVGYEsBGPjHqyq7drHTOM8MnFOwqKyHhbkKmEAryAk,6307
31
- datamax/utils/qa_generator.py,sha256=pXxdFm_EnWgMuilfmLKgy2c6NDexQZN8nWxT-bYBt74,12548
32
- datamax/utils/tokenizer.py,sha256=o78GPmeJ3vs3-SF0b2nMm35XtbrCKbrhDW0gI9gqGl4,880
33
- datamax/utils/uno_handler.py,sha256=ehUyk3I8dxMzjK8IzNO5nKcmc-t97ERMUqmSbYPeABc,15435
34
- pydatamax-0.1.16.post1.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
35
- pydatamax-0.1.16.post1.dist-info/METADATA,sha256=6I4bYRn8noQbBVURScRDut0fFksMDiU3wAXSNgpavDg,9801
36
- pydatamax-0.1.16.post1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
37
- pydatamax-0.1.16.post1.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
38
- pydatamax-0.1.16.post1.dist-info/RECORD,,