pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +91 -68
  31. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,3 @@
1
- from loguru import logger
2
1
  import os
3
2
  import subprocess
4
3
  import threading
@@ -7,7 +6,9 @@ from contextlib import contextmanager
7
6
  from pathlib import Path
8
7
  from typing import Optional
9
8
 
10
- # 延迟导入标志和锁
9
+ from loguru import logger
10
+
11
+ # lazy-import state flag and lock
11
12
  _uno_imported = False
12
13
  _import_error = None
13
14
  _import_lock = threading.Lock()
@@ -16,23 +17,23 @@ _import_lock = threading.Lock()
16
17
  def _lazy_import_uno():
17
18
  """延迟导入 UNO 模块,避免与其他库冲突(线程安全)"""
18
19
  global _uno_imported, _import_error
19
-
20
- # 快速检查,避免不必要的锁获取
20
+
21
+ # quick check, avoiding unnecessary lock acquisition
21
22
  if _uno_imported:
22
23
  return True
23
-
24
+
24
25
  with _import_lock:
25
- # 双重检查锁定模式
26
+ # double-checked locking pattern
26
27
  if _uno_imported:
27
28
  return True
28
-
29
+
29
30
  try:
30
- # 在这里导入所有 UNO 相关的模块
31
+ # import all UNO-related modules here
31
32
  global uno, PropertyValue, NoConnectException
32
33
  import uno
33
34
  from com.sun.star.beans import PropertyValue
34
35
  from com.sun.star.connection import NoConnectException
35
-
36
+
36
37
  _uno_imported = True
37
38
  logger.info("✅ UNO模块导入成功")
38
39
  return True
@@ -53,11 +54,12 @@ def ensure_uno_imported():
53
54
  )
54
55
 
55
56
 
56
- # 检查 UNO 是否可用(但不立即导入)
57
+ # check whether UNO is available (without importing it yet)
57
58
  def check_uno_available():
58
59
  """检查 UNO 是否可用(不会真正导入)"""
59
60
  try:
60
61
  import importlib.util
62
+
61
63
  spec = importlib.util.find_spec("uno")
62
64
  return spec is not None
63
65
  except:
@@ -72,7 +74,7 @@ class UnoManager:
72
74
  UNO管理器,用于管理LibreOffice服务实例和文档转换
73
75
  单线程版本,适合稳定高效的文档处理
74
76
  """
75
-
77
+
76
78
  def __init__(self, host: str = "localhost", port: int = 2002, timeout: int = 30):
77
79
  """
78
80
  初始化UNO管理器
@@ -82,9 +84,9 @@ class UnoManager:
82
84
  port: LibreOffice服务端口
83
85
  timeout: 连接超时时间(秒)
84
86
  """
85
- # 确保UNO已导入(使用线程安全的方式)
87
+ # Ensure that UNO has been imported (in a thread-safe manner)
86
88
  ensure_uno_imported()
87
-
89
+
88
90
  self.host = host
89
91
  self.port = port
90
92
  self.timeout = timeout
@@ -102,12 +104,12 @@ class UnoManager:
102
104
  """启动LibreOffice服务"""
103
105
  logger.info(f"🌟 启动LibreOffice服务,监听端口 {self.port}...")
104
106
 
105
- # 检查是否已有服务在运行
107
+ # check whether a soffice service is already running
106
108
  if self._check_soffice_running():
107
109
  logger.info("✅ LibreOffice服务已在运行")
108
110
  return
109
111
 
110
- # 启动新的服务实例
112
+ # start a new soffice service instance
111
113
  cmd = [
112
114
  "soffice",
113
115
  "--headless",
@@ -125,12 +127,25 @@ class UnoManager:
125
127
  cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
126
128
  )
127
129
  logger.info(f"⏳ 等待LibreOffice服务启动...")
128
- time.sleep(5) # 给服务一些启动时间
129
130
 
130
- if self._check_soffice_running():
131
- logger.info("✅ LibreOffice服务启动成功")
132
- else:
133
- raise Exception("LibreOffice服务启动失败")
131
+ # Smart wait: poll the service status instead of sleeping a fixed time, so slower machines get enough time to start up.
132
+ start_time = time.time()
133
+ check_interval = 1 # checking every sec
134
+ max_wait_time = 30 # wait for max 30 sec
135
+
136
+ while time.time() - start_time < max_wait_time:
137
+ if self._check_soffice_running():
138
+ elapsed = time.time() - start_time
139
+ logger.info(f"✅ LibreOffice服务启动成功 (耗时 {elapsed:.1f}秒)")
140
+ return
141
+
142
+ logger.debug(
143
+ f"🔄 服务未就绪,继续等待... (已等待 {time.time() - start_time:.1f}秒)"
144
+ )
145
+ time.sleep(check_interval)
146
+
147
+ # timed out
148
+ raise Exception(f"LibreOffice服务启动超时 (等待了{max_wait_time}秒)")
134
149
 
135
150
  except Exception as e:
136
151
  logger.error(f"❌ 启动LibreOffice服务失败: {str(e)}")
@@ -157,38 +172,38 @@ class UnoManager:
157
172
  """连接到LibreOffice服务"""
158
173
  with self._lock:
159
174
  if self._connected and self._desktop is not None:
160
- return # 已连接
161
-
175
+ return # connected
176
+
162
177
  self._start_soffice_service()
163
-
178
+
164
179
  logger.info(f"🔌 连接到LibreOffice服务...")
165
180
  start_time = time.time()
166
-
181
+
167
182
  while time.time() - start_time < self.timeout:
168
183
  try:
169
- # 获取组件上下文
184
+ # get context
170
185
  local_ctx = uno.getComponentContext()
171
186
  resolver = local_ctx.ServiceManager.createInstanceWithContext(
172
187
  "com.sun.star.bridge.UnoUrlResolver", local_ctx
173
188
  )
174
-
175
- # 连接到LibreOffice
189
+
190
+ # connect to LibreOffice
176
191
  self._ctx = resolver.resolve(f"uno:{self.connection_string}")
177
192
  self._desktop = self._ctx.ServiceManager.createInstanceWithContext(
178
193
  "com.sun.star.frame.Desktop", self._ctx
179
194
  )
180
-
195
+
181
196
  self._connected = True
182
197
  logger.info("✅ 成功连接到LibreOffice服务")
183
198
  return
184
-
199
+
185
200
  except NoConnectException:
186
201
  logger.debug("⏳ 等待LibreOffice服务就绪...")
187
202
  time.sleep(1)
188
203
  except Exception as e:
189
204
  logger.error(f"❌ 连接失败: {str(e)}")
190
205
  time.sleep(1)
191
-
206
+
192
207
  raise TimeoutError(f"连接LibreOffice服务超时({self.timeout}秒)")
193
208
 
194
209
  def disconnect(self):
@@ -229,10 +244,10 @@ class UnoManager:
229
244
  """
230
245
  self.connect()
231
246
 
232
- # 将路径转换为URL格式
247
+ # convert the path to URL format
233
248
  file_url = uno.systemPathToFileUrl(os.path.abspath(file_path))
234
249
 
235
- # 打开文档
250
+ # open file
236
251
  properties = []
237
252
  properties.append(self._make_property("Hidden", True))
238
253
  properties.append(self._make_property("ReadOnly", True))
@@ -274,53 +289,63 @@ class UnoManager:
274
289
  if document is None:
275
290
  raise Exception(f"无法打开文档: {input_path}")
276
291
 
277
- # 准备输出属性
292
+ # prepare the output properties
278
293
  properties = []
279
294
 
280
- # 设置过滤器
295
+ # set filter
281
296
  if filter_name:
282
297
  properties.append(self._make_property("FilterName", filter_name))
283
298
  else:
284
- # 根据格式自动选择过滤器
299
+ # choose filter by format
285
300
  if output_format == "txt":
286
- # 对于文本格式,尝试多个过滤器
301
+ # for text output, try several filters in turn
287
302
  filter_options = [
288
303
  ("Text (encoded)", "UTF8"),
289
304
  ("Text", None),
290
- ("HTML (StarWriter)", None)
305
+ ("HTML (StarWriter)", None),
291
306
  ]
292
-
307
+
293
308
  success = False
294
309
  for filter_name, filter_option in filter_options:
295
310
  try:
296
311
  properties = []
297
- properties.append(self._make_property("FilterName", filter_name))
312
+ properties.append(
313
+ self._make_property("FilterName", filter_name)
314
+ )
298
315
  if filter_option:
299
- properties.append(self._make_property("FilterOptions", filter_option))
300
-
301
- # 确保输出目录存在
316
+ properties.append(
317
+ self._make_property("FilterOptions", filter_option)
318
+ )
319
+
320
+ # ensuring that the output directory exists.
302
321
  output_dir = os.path.dirname(output_path)
303
322
  if output_dir and not os.path.exists(output_dir):
304
323
  os.makedirs(output_dir)
305
324
 
306
- # 转换为URL格式
307
- output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
325
+ # convert to URL format
326
+ output_url = uno.systemPathToFileUrl(
327
+ os.path.abspath(output_path)
328
+ )
308
329
 
309
- # 执行转换
330
+ # conversing
310
331
  document.storeToURL(output_url, properties)
311
- logger.info(f"✅ 文档转换成功 (使用过滤器: {filter_name}): {output_path}")
332
+ logger.info(
333
+ f"✅ 文档转换成功 (使用过滤器: {filter_name}): {output_path}"
334
+ )
312
335
  success = True
313
336
  break
314
337
  except Exception as e:
315
338
  logger.debug(f"🔄 过滤器 {filter_name} 失败: {str(e)}")
316
339
  continue
317
-
340
+
318
341
  if not success:
319
- raise Exception(f"所有文本过滤器都失败,无法转换文档: {input_path}")
320
-
321
- return # 已经完成转换,直接返回
342
+ raise Exception(
343
+ f"所有文本过滤器都失败,无法转换文档: {input_path}"
344
+ )
345
+
346
+ return # converted,return
322
347
  else:
323
- # 其他格式使用默认过滤器
348
+ # Other formats use the default filter
324
349
  filter_map = {
325
350
  "pdf": "writer_pdf_Export",
326
351
  "docx": "MS Word 2007 XML",
@@ -332,15 +357,15 @@ class UnoManager:
332
357
  self._make_property("FilterName", filter_map[output_format])
333
358
  )
334
359
 
335
- # 确保输出目录存在
360
+ # ensuring that the output directory exists
336
361
  output_dir = os.path.dirname(output_path)
337
362
  if output_dir and not os.path.exists(output_dir):
338
363
  os.makedirs(output_dir)
339
364
 
340
- # 转换为URL格式
365
+ # convert to URL format
341
366
  output_url = uno.systemPathToFileUrl(os.path.abspath(output_path))
342
367
 
343
- # 执行转换
368
+ # perform the conversion
344
369
  document.storeToURL(output_url, properties)
345
370
  logger.info(f"✅ 文档转换成功: {output_path}")
346
371
 
@@ -352,7 +377,7 @@ class UnoManager:
352
377
  return prop
353
378
 
354
379
 
355
- # 全局单例UnoManager
380
+ # global Singleton UnoManager
356
381
  _global_uno_manager: Optional[UnoManager] = None
357
382
  _manager_lock = threading.Lock()
358
383
 
@@ -360,20 +385,20 @@ _manager_lock = threading.Lock()
360
385
  def get_uno_manager() -> UnoManager:
361
386
  """获取全局单例UNO管理器"""
362
387
  global _global_uno_manager
363
-
388
+
364
389
  if _global_uno_manager is None:
365
390
  with _manager_lock:
366
391
  if _global_uno_manager is None:
367
392
  _global_uno_manager = UnoManager()
368
393
  logger.info("🎯 创建全局单例UnoManager (单线程模式)")
369
-
394
+
370
395
  return _global_uno_manager
371
396
 
372
397
 
373
398
  def cleanup_uno_manager():
374
399
  """清理全局UNO管理器"""
375
400
  global _global_uno_manager
376
-
401
+
377
402
  with _manager_lock:
378
403
  if _global_uno_manager is not None:
379
404
  try:
@@ -391,36 +416,34 @@ def uno_manager_context():
391
416
  try:
392
417
  yield manager
393
418
  finally:
394
- # 在单线程模式下,保持连接以提高效率
419
+ # Maintain connections to improve efficiency in single-threaded mode
395
420
  pass
396
421
 
397
422
 
398
423
  def convert_with_uno(
399
- input_path: str,
400
- output_format: str,
401
- output_dir: Optional[str] = None
424
+ input_path: str, output_format: str, output_dir: Optional[str] = None
402
425
  ) -> str:
403
426
  """
404
427
  使用UNO转换文档格式(便捷函数)
405
-
428
+
406
429
  Args:
407
430
  input_path: 输入文件路径
408
431
  output_format: 输出格式
409
432
  output_dir: 输出目录(可选,默认为输入文件所在目录)
410
-
433
+
411
434
  Returns:
412
435
  输出文件路径
413
436
  """
414
437
  input_path = Path(input_path)
415
-
438
+
416
439
  if output_dir is None:
417
440
  output_dir = input_path.parent
418
441
  else:
419
442
  output_dir = Path(output_dir)
420
-
443
+
421
444
  output_path = output_dir / f"{input_path.stem}.{output_format}"
422
-
445
+
423
446
  with uno_manager_context() as manager:
424
447
  manager.convert_document(str(input_path), str(output_path), output_format)
425
-
448
+
426
449
  return str(output_path)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pydatamax
3
- Version: 0.1.16
3
+ Version: 0.1.16.post2
4
4
  Summary: A library for parsing and converting various file formats.
5
5
  Home-page: https://github.com/Hi-Dolphin/datamax
6
6
  Author: ccy
@@ -113,7 +113,7 @@ qa_data = dm.get_pre_label(
113
113
  question_number=5, # 每块生成问题数
114
114
  max_workers=5 # 并发数
115
115
  )
116
- dm.save_label_data(res)
116
+ dm.save_label_data(qa_data)
117
117
  ```
118
118
 
119
119
  ## 📖 Detailed Documentation
@@ -316,6 +316,58 @@ pip install -r requirements.txt
316
316
  python setup.py install
317
317
  ```
318
318
 
319
+ ### Developer Mode
320
+
321
+ For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
322
+
323
+ #### Setup Developer Mode
324
+
325
+ ```bash
326
+ # Clone the repository
327
+ git clone https://github.com/Hi-Dolphin/datamax.git
328
+ cd datamax
329
+
330
+ # Create virtual environment (recommended)
331
+ python -m venv venv
332
+ source venv/bin/activate # On Windows: venv\Scripts\activate
333
+
334
+ # Install in developer mode
335
+ pip install -e .
336
+ ```
337
+
338
+ #### Benefits of Developer Mode
339
+
340
+ - **Live Updates**: Changes to source code are immediately reflected without reinstallation
341
+ - **Easy Testing**: Test your modifications instantly
342
+ - **Debugging**: Better debugging experience with direct access to source code
343
+ - **Development Workflow**: Seamless integration with your development environment
344
+
345
+ #### Development Commands
346
+
347
+ ```bash
348
+ # Run tests
349
+ pytest
350
+
351
+ # Install development dependencies
352
+ pip install -r requirements-dev.txt # if available
353
+
354
+ # Check code style
355
+ flake8 datamax/
356
+ black datamax/
357
+
358
+ # Build package
359
+ python setup.py sdist bdist_wheel
360
+ ```
361
+
362
+ #### Making Changes
363
+
364
+ After installing in developer mode, you can:
365
+
366
+ 1. Edit source code in the `datamax/` directory
367
+ 2. Changes are automatically available when you import the module
368
+ 3. Test your changes immediately without reinstalling
369
+ 4. Submit pull requests with your improvements
370
+
319
371
  ## 📋 System Requirements
320
372
 
321
373
  - Python >= 3.10
@@ -0,0 +1,39 @@
1
+ datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
2
+ datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ datamax/loader/core.py,sha256=Ld4PmMcbKbsFyU_ynqXxpB9x3IJ34c3hfJBcUiSthrA,5370
4
+ datamax/loader/minio_handler.py,sha256=VpQ5EHZfLaw0e2JXflAbgPK_plmM_VkPXEiKtIZlQL0,6876
5
+ datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
6
+ datamax/parser/__init__.py,sha256=3tCt1bmTjJ0sroivt60AoQuZyHH8AtCvn664Qtoh-60,56
7
+ datamax/parser/base.py,sha256=yEPdk3K-vTf2JnIcTczxEDoMQtVKva9tp2nSACeXOB0,3153
8
+ datamax/parser/core.py,sha256=eglNe4Vk6U3XEUYT2oLovWfuL1XeFDc0KnRahYN24Mk,19208
9
+ datamax/parser/csv_parser.py,sha256=PPRqL4MKDymBgFYo0xrgrO8HB3jFcrXTizG27fXVEag,1698
10
+ datamax/parser/doc_parser.py,sha256=x6aJMQmNCUR2WA0hMvsro1atz6kWK3pRLPzlLfwCUw0,32273
11
+ datamax/parser/docx_parser.py,sha256=40xq86jLI3nayg0dthBnWSMN2qYQeyUNWh5wWJ8Lar8,37658
12
+ datamax/parser/epub_parser.py,sha256=zjLp1ha_oQBGIxvgzAyHzO0ZZJQt1JfoAZ9TM8liZ0o,2708
13
+ datamax/parser/html_parser.py,sha256=b5Rvonj0cScYI2gYfKAfBplp6C8kT2ataf3J0lO40Ok,2017
14
+ datamax/parser/image_parser.py,sha256=HYNN0oqA1LjI8XBQN09nnWfnVrLcDHcXHOytf8z6NAk,2536
15
+ datamax/parser/json_parser.py,sha256=wzUKv1lH35PtM8uXunxNZ6ykHEdI_m02SIW1Y7y-wxc,1826
16
+ datamax/parser/md_parser.py,sha256=62vBHAsHotCC1bNquh7jt8EZuoPyANrbkRTF7YlEOMs,3019
17
+ datamax/parser/pdf_parser.py,sha256=pm6WVNe2nP5K-XHwNfa3BoS_oNDDdJ5kOo5QERICfkI,5245
18
+ datamax/parser/ppt_parser.py,sha256=7KmSrxyPRYlYqclMcsxQmLTD3eR1mbED4qgm1mjF4Mg,5808
19
+ datamax/parser/pptx_parser.py,sha256=huMbv9JMGI7Nvs3v-TGn7MOpicb8-z3L5G24PxBVDzw,2546
20
+ datamax/parser/txt_parser.py,sha256=GyPFuYQ00mI1shmMFi_gtDJ8B-C2rMI9rI0sjay32Hs,2630
21
+ datamax/parser/xls_parser.py,sha256=J-Eumrh3oxwr06YHHfUplJlskwBlsEoj3sQ9OVXBFCo,1819
22
+ datamax/parser/xlsx_parser.py,sha256=Uj1OisEVAzO8mMRcTHpitBstS0M7aSS4UehnE78pvxU,9468
23
+ datamax/utils/__init__.py,sha256=elPbB7MSk5VfcmKmhaXCTUsVXP9vxd8C-DAMG3JqbDU,1491
24
+ datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
25
+ datamax/utils/data_cleaner.py,sha256=2sfjXkDaEXavr98Ezj1BWG4uJQPUzeR99172tH43-Yk,10454
26
+ datamax/utils/env_setup.py,sha256=lXPAL6WGkjOBgqTa2A0li5YS2TQ96cvAR4OhJjQP4pA,3638
27
+ datamax/utils/gotocr_pdf.py,sha256=LHQ4nIFNC47b7hLWzMGkk_UsLmIxMLnUhRa48iwJo48,8796
28
+ datamax/utils/lifecycle_types.py,sha256=rvHB4zwzS_nlWKUtWA37L9dJNvx6ol5F2-x2eEf6zJk,625
29
+ datamax/utils/mineru_operator.py,sha256=mBw9xuCwJZmmOLaUFhw2c3JPDB7KMjWqSlEzbKCRXc8,2276
30
+ datamax/utils/paddleocr_pdf_operator.py,sha256=5l7P7wCGd4-Qph3NMTDdHR6nStjafDMNpX4sSCFv5qQ,3637
31
+ datamax/utils/ppt_extract.py,sha256=Sf4H3TKdK6BnKRv0sw5JnfKSQH9l6u5XUwLTd78KB94,6619
32
+ datamax/utils/qa_generator.py,sha256=q7pzZ3DWItRQLBQH1jab2TBkjJvcKfkBuzlN0wxZ5Rs,24353
33
+ datamax/utils/tokenizer.py,sha256=j93Uky4bYDKZKT-MOtenZb36MoRPNnYk8sP9t_FSQqk,860
34
+ datamax/utils/uno_handler.py,sha256=xITU8FGeeBtHRc-Aj4lbKHGvKVslWEwWZOIUZiP_ghY,15447
35
+ pydatamax-0.1.16.post2.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
36
+ pydatamax-0.1.16.post2.dist-info/METADATA,sha256=pTi_avX8RBNYxHcPS6CmZnESFqGlX-TwvqMzF6Ilx0Q,11145
37
+ pydatamax-0.1.16.post2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
38
+ pydatamax-0.1.16.post2.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
39
+ pydatamax-0.1.16.post2.dist-info/RECORD,,
@@ -1,38 +0,0 @@
1
- datamax/__init__.py,sha256=IGJxWkFpUj1xuHfwtPTrNqsRdLB4jBZIweAVHzDKrvU,29
2
- datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- datamax/loader/core.py,sha256=NGnK2m59GRBauYxZST0kyX5f4zhvAOk4Z5bVoF0CjGo,5218
4
- datamax/loader/minio_handler.py,sha256=e7ZUlwoStVe5iQfAVfNgEwRLxen4NbxwokpJZl6AR0U,6557
5
- datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
6
- datamax/parser/__init__.py,sha256=sIB1N4B_fVguSBN-Uw9tGxAO6s0oi4Tq4kRZ59YlUKo,52
7
- datamax/parser/base.py,sha256=FamDV6csc3aXVbobMR1lPNtVpvYMO19koRJW9poj_gE,2590
8
- datamax/parser/core.py,sha256=pySissrF6kVVAzT5abIlQ-4cUliFu1HBWjcD6psNkYA,16845
9
- datamax/parser/csv_parser.py,sha256=lHQs1MHK9WM4Vl0p9nsE3fFhewF0EoXZUhtk8ixznRw,1028
10
- datamax/parser/doc_parser.py,sha256=qPKpZy_p1veV2AodqEQU6LzqmT7y1PANlPtt0CYoHeg,30837
11
- datamax/parser/docx_parser.py,sha256=wdDGgeYIDg1Se493XZhlduxKjtYZ58Uqxltm2vt9Dy4,36691
12
- datamax/parser/epub_parser.py,sha256=K4eCS4wIXJzDicvtVAfQT8yt1gFHeibZN5-EdQZfJe8,1621
13
- datamax/parser/html_parser.py,sha256=5ACrVc03Q9pJqWI_b0EtRgOYy0eMYJq4podgHGD68Z8,1453
14
- datamax/parser/image_parser.py,sha256=UH3duPvB7Xu6CFlEeAukX5uJ8VlqnMR89hcLsW2O-aU,1281
15
- datamax/parser/json_parser.py,sha256=2Ns2Lm6sei9TnDaFFYvl-xhyhse12sMJBwjKNACw4po,1072
16
- datamax/parser/md_parser.py,sha256=rHJqtRV78XgQuKtDdwn1LcgRSUEuhGBqN5uaHG6oPT4,2251
17
- datamax/parser/pdf_parser.py,sha256=YOJFOEC8DxWLAE6yNd2x6qMvYJl3sWVcM1eA8R4uA30,4116
18
- datamax/parser/ppt_parser.py,sha256=0OlsIrzZZJnYZqLmQkUz4J_Hiv-rQHLHJnHIsw345c8,4631
19
- datamax/parser/pptx_parser.py,sha256=yWajVd8kpyTdFavR8XcbwxOj94uNBswDoMHfSOycn0o,1870
20
- datamax/parser/txt_parser.py,sha256=NXs7aNpm1PUwiUSlN1RU23ittuuQSBaBNI3KeQjJFXs,1750
21
- datamax/parser/xls_parser.py,sha256=iNMx8iPakjE984dkaFL-oUBYWpQwxbWoDnQdwfAeeGM,980
22
- datamax/parser/xlsx_parser.py,sha256=hUOFqkqkI0XPcwwrp2cs4PFKbChpZtb8orGsZc9kxJ0,9089
23
- datamax/utils/__init__.py,sha256=75D4WFE_FVG9MyT8qWtvtlgzuuRelTC7ObSqqfjDKIY,1476
24
- datamax/utils/constants.py,sha256=1hzHnYsm43Q36Czc7OnC-zJVTunThx82d_ZZAZBErHw,4565
25
- datamax/utils/data_cleaner.py,sha256=TrrxC1r0__wuOhrQSJZcJKoEIyB4eNKWZkA1IoBYhyQ,9937
26
- datamax/utils/env_setup.py,sha256=p_7sqHwyXroeOI_yFZpUOK6wOGmPVmf-gBa6M3351O4,3539
27
- datamax/utils/gotocr_pdf.py,sha256=A7sn77EQBDbAe-4edCBUlYXKuE2mY7JcsFGm8U3-xbE,8744
28
- datamax/utils/mineru_operator.py,sha256=4i4FtDkDE61FWPyRoDjPujHYJq_kDUAkwlowmFWdEOA,2303
29
- datamax/utils/paddleocr_pdf_operator.py,sha256=SW06dts1SxDnUvyf5zWYpAN_6t9PLtJXUSsYhSS317I,3645
30
- datamax/utils/ppt_extract.py,sha256=jBVGYEsBGPjHqyq7drHTOM8MnFOwqKyHhbkKmEAryAk,6307
31
- datamax/utils/qa_generator.py,sha256=pXxdFm_EnWgMuilfmLKgy2c6NDexQZN8nWxT-bYBt74,12548
32
- datamax/utils/tokenizer.py,sha256=o78GPmeJ3vs3-SF0b2nMm35XtbrCKbrhDW0gI9gqGl4,880
33
- datamax/utils/uno_handler.py,sha256=gDm42OQQQoCiOP0SB7xZ9TRF6A_XBHNavwG5ycj6kEQ,14807
34
- pydatamax-0.1.16.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
35
- pydatamax-0.1.16.dist-info/METADATA,sha256=nwHHLzsxwm_Za3aVbLMFfL-3JIZSZclp9KI0XL6rOHE,9795
36
- pydatamax-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
37
- pydatamax-0.1.16.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
38
- pydatamax-0.1.16.dist-info/RECORD,,