pydatamax 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
datamax/loader/core.py CHANGED
@@ -1,7 +1,7 @@
  import os
  from typing import List
- from datamax.loader.MinioHandler import MinIOClient
- from datamax.loader.OssHandler import OssClient
+ from datamax.loader.minio_handler import MinIOClient
+ from datamax.loader.oss_handler import OssClient


  class DataLoader:
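The loader handlers were renamed from CamelCase modules (`MinioHandler.py`, `OssHandler.py`) to snake_case (`minio_handler.py`, `oss_handler.py`), a breaking change for code that imported them directly. A minimal sketch of the updated import paths (only the module names move; the class names are unchanged):

```python
# New snake_case module paths introduced by this release;
# MinIOClient and OssClient keep their old names and behavior.
from datamax.loader.minio_handler import MinIOClient
from datamax.loader.oss_handler import OssClient
```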
datamax/parser/doc_parser.py CHANGED
@@ -1,14 +1,17 @@
+ import logging
  import os
  import shutil
  import subprocess
  import tempfile
- import chardet
- import docx2markdown
  from pathlib import Path
  from typing import Union
- from docx import Document
- from datamax.parser.base import BaseLife
- from datamax.parser.base import MarkdownOutputVo
+
+ import chardet
+
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
+
+ # Configure logging
+ logger = logging.getLogger(__name__)


  class DocParser(BaseLife):
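The parsers now route diagnostics through a module-level `logging.getLogger(__name__)` rather than printing, so the consuming application controls what is shown. A minimal opt-in sketch (standard-library `logging` only; the format string is an illustrative choice, not part of the package):

```python
import logging

# Show the parsers' INFO-level progress messages; switching the datamax
# loggers to DEBUG also surfaces commands run, detected encodings, and previews.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
logging.getLogger("datamax.parser").setLevel(logging.DEBUG)
```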
@@ -16,62 +19,185 @@ class DocParser(BaseLife):
          super().__init__()
          self.file_path = file_path
          self.to_markdown = to_markdown
+         logger.info(f"🚀 DocParser initialized - file path: {file_path}, to_markdown: {to_markdown}")
+
+     def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
+         """Convert a .doc file to a .txt file."""
+         logger.info(f"🔄 Converting DOC to TXT - source: {doc_path}, output dir: {dir_path}")
+
+         try:
+             cmd = f'soffice --headless --convert-to txt "{doc_path}" --outdir "{dir_path}"'
+             logger.debug(f"⚡ Running conversion command: {cmd}")
+
+             process = subprocess.Popen(
+                 cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+             )
+             stdout, stderr = process.communicate()
+             exit_code = process.returncode
+
+             if exit_code == 0:
+                 logger.info(f"✅ DOC to TXT conversion succeeded - exit code: {exit_code}")
+                 if stdout:
+                     logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
+             else:
+                 encoding = chardet.detect(stderr)["encoding"]
+                 if encoding is None:
+                     encoding = "utf-8"
+                 error_msg = stderr.decode(encoding, errors="replace")
+                 logger.error(f"❌ DOC to TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
+                 raise Exception(
+                     f"Error Output (detected encoding: {encoding}): {error_msg}"
+                 )
+
+             fname = str(Path(doc_path).stem)
+             txt_path = os.path.join(dir_path, f"{fname}.txt")
+
+             if not os.path.exists(txt_path):
+                 logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
+                 raise Exception(f"File conversion failed {doc_path} ==> {txt_path}")
+             else:
+                 logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
+                 return txt_path
+
+         except subprocess.SubprocessError as e:
+             logger.error(f"💥 subprocess execution failed: {str(e)}")
+             raise Exception(f"Error while running the conversion command: {str(e)}")
+         except Exception as e:
+             logger.error(f"💥 Unexpected error during DOC to TXT conversion: {str(e)}")
+             raise
+
+     def read_txt_file(self, txt_path: str) -> str:
+         """Read the contents of a txt file."""
+         logger.info(f"📖 Reading TXT file: {txt_path}")
+
+         try:
+             # Detect the file encoding
+             with open(txt_path, "rb") as f:
+                 raw_data = f.read()
+                 encoding = chardet.detect(raw_data)["encoding"]
+                 if encoding is None:
+                     encoding = "utf-8"
+                 logger.debug(f"🔍 Detected file encoding: {encoding}")
+
+             # Read the file contents
+             with open(txt_path, "r", encoding=encoding, errors="replace") as f:
+                 content = f.read()
+
+             logger.info(f"📄 TXT file read - content length: {len(content)} characters")
+             logger.debug(f"👀 First 100 characters: {content[:100]}...")
+
+             return content
+
+         except FileNotFoundError as e:
+             logger.error(f"🚫 TXT file not found: {str(e)}")
+             raise Exception(f"File not found: {txt_path}")
+         except Exception as e:
+             logger.error(f"💥 Error while reading TXT file: {str(e)}")
+             raise
+
+     def read_doc_file(self, doc_path: str) -> str:
+         """Read a doc file and convert it to text."""
+         logger.info(f"📖 Reading DOC file - file: {doc_path}")

-     def doc_to_docx(self, doc_path: str, dir_path: str) -> str:
-         cmd = f'soffice --headless --convert-to docx "{doc_path}" --outdir "{dir_path}"'
-         process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-         stdout, stderr = process.communicate()
-         exit_code = process.returncode
-         if exit_code == 0:
-             pass
-         else:
-             encoding = chardet.detect(stderr)['encoding']
-             if encoding is None:
-                 encoding = 'utf-8'
-             raise Exception(f"Error Output (detected encoding: {encoding}):", stderr.decode(encoding, errors='replace'))
-         fname = str(Path(doc_path).stem)
-         docx_path = os.path.join(os.path.dirname(doc_path), f'{fname}.docx')
-         if not os.path.exists(docx_path):
-             raise Exception(f"> !!! File conversion failed {doc_path} ==> {docx_path}")
-         else:
-             return docx_path
-
-     def read_docx_file(self, doc_path: str, to_mk: bool) -> str:
          try:
              with tempfile.TemporaryDirectory() as temp_path:
+                 logger.debug(f"📁 Created temporary directory: {temp_path}")
+
                  temp_dir = Path(temp_path)
-                 media_dir = temp_dir / "media"
-                 media_dir.mkdir()
+
                  file_path = temp_dir / "tmp.doc"
                  shutil.copy(doc_path, file_path)
-                 docx_file_path = self.doc_to_docx(str(file_path), str(temp_path))
-                 doc = Document(docx_file_path)
-                 full_text = [para.text for para in doc.paragraphs]
-                 if to_mk:
-                     output_md_dir = f'./output/{os.path.basename(docx_file_path).replace(".docx", ".md")}'
-                     output_dir = os.path.dirname(output_md_dir)
-                     if output_dir and not os.path.exists(output_dir):
-                         os.makedirs(output_dir)
-                     docx2markdown.docx_to_markdown(docx_file_path, output_md_dir)
-                     mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
-                     return mk_content
-                 else:
-                     return '\n'.join(full_text)
+                 logger.debug(f"📋 Copied file to temporary directory: {doc_path} -> {file_path}")
+
+                 # Convert DOC to TXT
+                 txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
+                 logger.info(f"🎯 DOC to TXT conversion complete: {txt_file_path}")
+
+                 # Read the TXT file contents
+                 content = self.read_txt_file(txt_file_path)
+                 logger.info(f"✨ TXT content read, content length: {len(content)} characters")
+
+                 return content
+
+         except FileNotFoundError as e:
+             logger.error(f"🚫 File not found: {str(e)}")
+             raise Exception(f"File not found: {doc_path}")
+         except PermissionError as e:
+             logger.error(f"🔒 File permission error: {str(e)}")
+             raise Exception(f"No permission to access file: {doc_path}")
          except Exception as e:
-             raise e
+             logger.error(f"💥 Error while reading DOC file: {str(e)}")
+             raise

      def parse(self, file_path: str):
+         """Parse a DOC file."""
+         logger.info(f"🎬 Parsing DOC file: {file_path}")
+
          try:
+             # Verify the file exists
+             if not os.path.exists(file_path):
+                 logger.error(f"🚫 File does not exist: {file_path}")
+                 raise FileNotFoundError(f"File does not exist: {file_path}")
+
+             # Check the file size
+             file_size = os.path.getsize(file_path)
+             logger.info(f"📏 File size: {file_size} bytes")
+
              title = self.get_file_extension(file_path)
+             logger.debug(f"🏷️ Extracted file title: {title}")
+
+             # Convert to txt via soffice, then read the content
+             logger.info("📝 Converting DOC to TXT via soffice and reading the content")
+             content = self.read_doc_file(doc_path=file_path)
+
+             # to_markdown decides whether to keep the original format or produce markdown
              if self.to_markdown:
-                 mk_content = self.read_docx_file(doc_path=file_path, to_mk=True)
+                 # Simple text-to-markdown conversion (preserves paragraph structure)
+                 mk_content = self.format_as_markdown(content)
+                 logger.info("🎨 Content formatted as markdown")
              else:
-                 content = self.read_docx_file(doc_path=file_path, to_mk=False)
                  mk_content = content
-             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
+                 logger.info("📝 Keeping the original text format")
+
+             logger.info(f"🎊 File content parsed, final content length: {len(mk_content)} characters")
+
+             lifecycle = self.generate_lifecycle(
+                 source_file=file_path,
+                 domain="Technology",
+                 usage_purpose="Documentation",
+                 life_type="LLM_ORIGIN",
+             )
+             logger.debug("⚙️ Lifecycle info generated")
+
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
-             return output_vo.to_dict()
+
+             result = output_vo.to_dict()
+             logger.info(f"🏆 DOC file parsed: {file_path}")
+             logger.debug(f"🔑 Result keys: {list(result.keys())}")
+
+             return result
+
          except Exception as e:
-             raise e
+             logger.error(f"💀 Failed to parse DOC file: {file_path}, error: {str(e)}")
+             raise
+
+     def format_as_markdown(self, content: str) -> str:
+         """Format plain text as simple markdown."""
+         if not content.strip():
+             return content
+
+         lines = content.split("\n")
+         formatted_lines = []
+
+         for line in lines:
+             line = line.strip()
+             if not line:
+                 formatted_lines.append("")
+                 continue
+
+             # Simple markdown formatting rules
+             # More rules can be added here as needed
+             formatted_lines.append(line)
+
+         return "\n".join(formatted_lines)
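DocParser drops the old doc -> docx -> `python-docx`/`docx2markdown` chain in favor of a single LibreOffice doc -> txt conversion plus an encoding-aware read. Distilled, the pattern looks roughly like this (a sketch under the same assumptions as the diff; `convert_doc_to_text` is a hypothetical standalone helper, not the package API):

```python
import os
import subprocess
import tempfile
from pathlib import Path

import chardet  # the diff uses chardet for encoding detection


def convert_doc_to_text(doc_path: str) -> str:
    """Hypothetical helper: LibreOffice .doc -> .txt, then an encoding-aware read."""
    with tempfile.TemporaryDirectory() as tmp:
        # soffice writes <stem>.txt into the output directory
        subprocess.run(
            ["soffice", "--headless", "--convert-to", "txt",
             doc_path, "--outdir", tmp],
            check=True, capture_output=True,
        )
        txt_path = os.path.join(tmp, f"{Path(doc_path).stem}.txt")
        raw = Path(txt_path).read_bytes()
        encoding = chardet.detect(raw)["encoding"] or "utf-8"
        return raw.decode(encoding, errors="replace")
```

Passing the command as an argument list avoids the `shell=True` quoting pitfalls that the diff's string-built command has to work around.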
datamax/parser/docx_parser.py CHANGED
@@ -1,9 +1,17 @@
+ import logging
  import os
- import docx2markdown
- from docx import Document
+ import shutil
+ import subprocess
+ import tempfile
+ from pathlib import Path
  from typing import Union
- from datamax.parser.base import BaseLife
- from datamax.parser.base import MarkdownOutputVo
+
+ import chardet
+
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
+
+ # Configure logging
+ logger = logging.getLogger(__name__)


  class DocxParser(BaseLife):
@@ -11,33 +19,206 @@ class DocxParser(BaseLife):
          super().__init__()
          self.file_path = file_path
          self.to_markdown = to_markdown
+         logger.info(
+             f"🚀 DocxParser initialized - file path: {file_path}, to_markdown: {to_markdown}"
+         )
+
+     def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
+         """Convert a .docx file to a .txt file."""
+         logger.info(f"🔄 Converting DOCX to TXT - source: {docx_path}, output dir: {dir_path}")
+
+         try:
+             cmd = f'soffice --headless --convert-to txt "{docx_path}" --outdir "{dir_path}"'
+             logger.debug(f"⚡ Running conversion command: {cmd}")
+
+             process = subprocess.Popen(
+                 cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+             )
+             stdout, stderr = process.communicate()
+             exit_code = process.returncode
+
+             if exit_code == 0:
+                 logger.info(f"✅ DOCX to TXT conversion succeeded - exit code: {exit_code}")
+                 if stdout:
+                     logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
+             else:
+                 encoding = chardet.detect(stderr)["encoding"]
+                 if encoding is None:
+                     encoding = "utf-8"
+                 error_msg = stderr.decode(encoding, errors="replace")
+                 logger.error(f"❌ DOCX to TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
+                 raise Exception(
+                     f"Error Output (detected encoding: {encoding}): {error_msg}"
+                 )
+
+             fname = str(Path(docx_path).stem)
+             txt_path = os.path.join(dir_path, f"{fname}.txt")
+
+             if not os.path.exists(txt_path):
+                 logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
+                 raise Exception(f"File conversion failed {docx_path} ==> {txt_path}")
+             else:
+                 logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
+                 return txt_path
+
+         except subprocess.SubprocessError as e:
+             logger.error(f"💥 subprocess execution failed: {str(e)}")
+             raise Exception(f"Error while running the conversion command: {str(e)}")
+         except Exception as e:
+             logger.error(f"💥 Unexpected error during DOCX to TXT conversion: {str(e)}")
+             raise
+
+     def read_txt_file(self, txt_path: str) -> str:
+         """Read the contents of a txt file."""
+         logger.info(f"📖 Reading TXT file: {txt_path}")
+
+         try:
+             # Detect the file encoding
+             with open(txt_path, "rb") as f:
+                 raw_data = f.read()
+                 encoding = chardet.detect(raw_data)["encoding"]
+                 if encoding is None:
+                     encoding = "utf-8"
+                 logger.debug(f"🔍 Detected file encoding: {encoding}")
+
+             # Read the file contents
+             with open(txt_path, "r", encoding=encoding, errors="replace") as f:
+                 content = f.read()
+
+             logger.info(f"📄 TXT file read - content length: {len(content)} characters")
+             logger.debug(f"👀 First 100 characters: {content[:100]}...")
+
+             return content
+
+         except FileNotFoundError as e:
+             logger.error(f"🚫 TXT file not found: {str(e)}")
+             raise Exception(f"File not found: {txt_path}")
+         except Exception as e:
+             logger.error(f"💥 Error while reading TXT file: {str(e)}")
+             raise
+
+     def read_docx_file(self, docx_path: str) -> str:
+         """Read a docx file and convert it to text."""
+         logger.info(f"📖 Reading DOCX file - file: {docx_path}")

-     @staticmethod
-     def read_docx_file(file_path: str) -> str:
          try:
-             doc = Document(file_path)
-             full_text = [para.text for para in doc.paragraphs]
-             return '\n'.join(full_text)
+             with tempfile.TemporaryDirectory() as temp_path:
+                 logger.debug(f"📁 Created temporary directory: {temp_path}")
+
+                 temp_dir = Path(temp_path)
+
+                 file_path = temp_dir / "tmp.docx"
+                 shutil.copy(docx_path, file_path)
+                 logger.debug(f"📋 Copied file to temporary directory: {docx_path} -> {file_path}")
+
+                 # Convert DOCX to TXT
+                 txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
+                 logger.info(f"🎯 DOCX to TXT conversion complete: {txt_file_path}")
+
+                 # Read the TXT file contents
+                 content = self.read_txt_file(txt_file_path)
+                 logger.info(f"✨ TXT content read, content length: {len(content)} characters")
+
+                 return content
+
+         except FileNotFoundError as e:
+             logger.error(f"🚫 File not found: {str(e)}")
+             raise Exception(f"File not found: {docx_path}")
+         except PermissionError as e:
+             logger.error(f"🔒 File permission error: {str(e)}")
+             raise Exception(f"No permission to access file: {docx_path}")
          except Exception as e:
-             raise e
+             logger.error(f"💥 Error while reading DOCX file: {str(e)}")
+             raise

      def parse(self, file_path: str):
+         """Parse a DOCX file."""
+         logger.info(f"🎬 Parsing DOCX file: {file_path}")
+
          try:
+             # Verify the file exists
+             if not os.path.exists(file_path):
+                 logger.error(f"🚫 File does not exist: {file_path}")
+                 raise FileNotFoundError(f"File does not exist: {file_path}")
+
+             # Check the file extension
+             if not file_path.lower().endswith(".docx"):
+                 logger.warning(f"⚠️ File extension is not .docx: {file_path}")
+
+             # Check the file size
+             file_size = os.path.getsize(file_path)
+             logger.info(f"📏 File size: {file_size} bytes")
+
+             if file_size == 0:
+                 logger.warning(f"⚠️ File size is 0 bytes: {file_path}")
+
              title = self.get_file_extension(file_path)
+             logger.debug(f"🏷️ Extracted file title: {title}")
+
+             # Convert to txt via soffice, then read the content
+             logger.info("📝 Converting DOCX to TXT via soffice and reading the content")
+             content = self.read_docx_file(docx_path=file_path)
+
+             # to_markdown decides whether to keep the original format or produce markdown
              if self.to_markdown:
-                 output_md_dir = f'./output/{os.path.basename(file_path).replace(".docx", ".md")}'
-                 output_dir = os.path.dirname(output_md_dir)
-                 if output_dir and not os.path.exists(output_dir):
-                     os.makedirs(output_dir)
-                 docx2markdown.docx_to_markdown(file_path, output_md_dir)
-                 mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
+                 # Simple text-to-markdown conversion (preserves paragraph structure)
+                 mk_content = self.format_as_markdown(content)
+                 logger.info("🎨 Content formatted as markdown")
              else:
-                 content = self.read_docx_file(file_path=file_path)
                  mk_content = content
-             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
+                 logger.info("📝 Keeping the original text format")
+
+             logger.info(f"🎊 File content parsed, final content length: {len(mk_content)} characters")
+
+             # Check whether the content is empty
+             if not mk_content.strip():
+                 logger.warning(f"⚠️ Parsed content is empty: {file_path}")
+
+             lifecycle = self.generate_lifecycle(
+                 source_file=file_path,
+                 domain="Technology",
+                 usage_purpose="Documentation",
+                 life_type="LLM_ORIGIN",
+             )
+             logger.debug("⚙️ Lifecycle info generated")
+
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
-             return output_vo.to_dict()
+
+             result = output_vo.to_dict()
+             logger.info(f"🏆 DOCX file parsed: {file_path}")
+             logger.debug(f"🔑 Result keys: {list(result.keys())}")
+
+             return result
+
+         except FileNotFoundError as e:
+             logger.error(f"🚫 File-not-found error: {str(e)}")
+             raise
+         except PermissionError as e:
+             logger.error(f"🔒 File permission error: {str(e)}")
+             raise Exception(f"No permission to access file: {file_path}")
          except Exception as e:
-             raise e
+             logger.error(
+                 f"💀 Failed to parse DOCX file: {file_path}, error type: {type(e).__name__}, error: {str(e)}"
+             )
+             raise
+
+     def format_as_markdown(self, content: str) -> str:
+         """Format plain text as simple markdown."""
+         if not content.strip():
+             return content
+
+         lines = content.split("\n")
+         formatted_lines = []
+
+         for line in lines:
+             line = line.strip()
+             if not line:
+                 formatted_lines.append("")
+                 continue
+
+             # Simple markdown formatting rules
+             # More rules can be added here as needed
+             formatted_lines.append(line)
+
+         return "\n".join(formatted_lines)
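DocxParser gets the same soffice-based pipeline, so `python-docx` and `docx2markdown` are no longer needed at parse time. A usage sketch (the module path comes from the RECORD below; the constructor signature is inferred from this diff, so treat it as an assumption):

```python
from datamax.parser.docx_parser import DocxParser

# to_markdown=True runs the parsed text through format_as_markdown()
parser = DocxParser(file_path="report.docx", to_markdown=True)
result = parser.parse("report.docx")  # dict built by MarkdownOutputVo.to_dict()
```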
datamax/parser/xlsx_parser.py CHANGED
@@ -1,71 +1,215 @@
+ import logging
  import multiprocessing
+ import os
  import time
- from multiprocessing import Queue
- from datamax.parser.base import MarkdownOutputVo
- from datamax.parser.base import BaseLife
- from openpyxl import load_workbook
  import warnings
- from markitdown import MarkItDown
+ from multiprocessing import Queue
+
+ import pandas as pd
+
+ from datamax.parser.base import BaseLife, MarkdownOutputVo

  warnings.filterwarnings("ignore")

- class XlsxParser(BaseLife):
-     # single ton
+ # Configure logging
+ logger = logging.getLogger(__name__)

-     _markitdown_instance = None

-     @classmethod
-     def get_markitdown(cls):
-         if cls._markitdown_instance is None:
-             cls._markitdown_instance = MarkItDown()
-         return cls._markitdown_instance
+ class XlsxParser(BaseLife):
+     """XLSX parser - reads with pandas, converts to markdown, supports multiprocess execution"""

      def __init__(self, file_path, timeout):
          super().__init__()
          self.file_path = file_path
          self.timeout = timeout
-         self.markitdown = self.get_markitdown()
+         logger.info(f"🚀 XlsxParser initialized - file path: {file_path}, timeout: {timeout}s")
+
+     def _parse_with_pandas(self, file_path: str) -> str:
+         """Read the Excel file with pandas and convert it to markdown."""
+         logger.info(f"🐼 Reading Excel file with pandas: {file_path}")
+
+         try:
+             # Verify the file exists
+             if not os.path.exists(file_path):
+                 logger.error(f"🚫 Excel file does not exist: {file_path}")
+                 raise FileNotFoundError(f"File does not exist: {file_path}")
+
+             # Check the file size
+             file_size = os.path.getsize(file_path)
+             logger.info(f"📏 File size: {file_size} bytes")
+
+             if file_size == 0:
+                 logger.warning(f"⚠️ File size is 0 bytes: {file_path}")
+                 return "*File is empty*"
+
+             # Read the Excel file with pandas
+             logger.debug("📊 Reading Excel data...")
+             df = pd.read_excel(file_path, sheet_name=None)  # read all sheets
+
+             markdown_content = ""
+
+             if isinstance(df, dict):
+                 # Multiple sheets
+                 logger.info(f"📑 Multiple sheets detected: {len(df)} in total")
+                 for sheet_name, sheet_df in df.items():
+                     logger.debug(f"📋 Processing sheet: {sheet_name}, shape: {sheet_df.shape}")
+                     markdown_content += f"## Sheet: {sheet_name}\n\n"
+
+                     if not sheet_df.empty:
+                         # Clean the data: drop fully empty rows and columns
+                         sheet_df = sheet_df.dropna(how="all").dropna(axis=1, how="all")
+
+                         if not sheet_df.empty:
+                             sheet_markdown = sheet_df.to_markdown(index=False)
+                             markdown_content += sheet_markdown + "\n\n"
+                             logger.debug(
+                                 f"✅ Sheet {sheet_name} converted, effective data shape: {sheet_df.shape}"
+                             )
+                         else:
+                             markdown_content += "*No valid data in this sheet*\n\n"
+                             logger.warning(f"⚠️ Sheet {sheet_name} has no valid data after cleaning")
+                     else:
+                         markdown_content += "*This sheet is empty*\n\n"
+                         logger.warning(f"⚠️ Sheet {sheet_name} is empty")
+             else:
+                 # Single sheet
+                 logger.info(f"📄 Single sheet, shape: {df.shape}")
+                 if not df.empty:
+                     # Clean the data: drop fully empty rows and columns
+                     df = df.dropna(how="all").dropna(axis=1, how="all")
+
+                     if not df.empty:
+                         markdown_content = df.to_markdown(index=False)
+                         logger.info(f"✅ Sheet converted, effective data shape: {df.shape}")
+                     else:
+                         markdown_content = "*No valid data in the sheet*"
+                         logger.warning("⚠️ Sheet has no valid data after cleaning")
+                 else:
+                     markdown_content = "*The sheet is empty*"
+                     logger.warning("⚠️ The sheet is empty")
+
+             logger.info(f"🎊 pandas conversion complete, markdown length: {len(markdown_content)} characters")
+             logger.debug(f"👀 First 200 characters: {markdown_content[:200]}...")
+
+             return markdown_content
+
+         except FileNotFoundError as e:
+             logger.error(f"🚫 File not found: {str(e)}")
+             raise
+         except PermissionError as e:
+             logger.error(f"🔒 File permission error: {str(e)}")
+             raise Exception(f"No permission to access file: {file_path}")
+         except pd.errors.EmptyDataError as e:
+             logger.error(f"📭 Excel file is empty: {str(e)}")
+             raise Exception(f"Excel file is empty or unreadable: {file_path}")
+         except Exception as e:
+             logger.error(f"💥 pandas failed to read the Excel file: {str(e)}")
+             raise

      def _parse(self, file_path: str, result_queue: Queue) -> dict:
+         """Core Excel parsing routine."""
+         logger.info(f"🎬 Parsing Excel file: {file_path}")
+
          try:
-             wb = load_workbook(
-                 filename=file_path,
-                 data_only=True,
-                 read_only=True
+             # Parse the Excel file with pandas
+             logger.info("🐼 Parsing Excel in pandas mode")
+             mk_content = self._parse_with_pandas(file_path)
+
+             # Check whether the content is empty
+             if not mk_content.strip():
+                 logger.warning(f"⚠️ Parsed content is empty: {file_path}")
+                 mk_content = "*Unable to parse file content*"
+
+             logger.info(f"🎊 File content parsed, final content length: {len(mk_content)} characters")
+
+             # Generate the lifecycle info
+             lifecycle = self.generate_lifecycle(
+                 source_file=file_path,
+                 domain="Technology",
+                 usage_purpose="Documentation",
+                 life_type="LLM_ORIGIN",
              )
-             wb.close()
+             logger.debug("⚙️ Lifecycle info generated")
+
+             # Build the output object
+             title = self.get_file_extension(file_path)
+             output_vo = MarkdownOutputVo(title, mk_content)
+             output_vo.add_lifecycle(lifecycle)
+
+             result = output_vo.to_dict()
+             result_queue.put(result)
+             logger.info(f"🏆 Excel file parsed: {file_path}")
+             logger.debug(f"🔑 Result keys: {list(result.keys())}")
+
+             time.sleep(0.5)  # give the queue a moment
+             return result
+
          except Exception as e:
-             raise e
-
-         mk_content = self.markitdown.convert(file_path).text_content
-         lifecycle = self.generate_lifecycle(
-             source_file=file_path,
-             domain="Technology",
-             usage_purpose="Documentation",
-             life_type="LLM_ORIGIN"
-         )
-         output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
-         output_vo.add_lifecycle(lifecycle)
-         result_queue.put(output_vo.to_dict())
-         time.sleep(0.5)
-         return output_vo.to_dict()
+             logger.error(f"💀 Failed to parse Excel file: {file_path}, error: {str(e)}")
+             # Put the error on the queue as well
+             error_result = {"error": str(e), "file_path": file_path}
+             result_queue.put(error_result)
+             raise

      def parse(self, file_path: str) -> dict:
-         import time
-         result_queue = Queue()
-         process = multiprocessing.Process(target=self._parse, args=(file_path, result_queue))
-         process.start()
-         start_time = time.time()
-
-         # ttl
-         while time.time() - start_time < self.timeout:
-             print(f"plz waiting...: {int(time.time() - start_time)}")
-             if not process.is_alive():
-                 break
-             if not result_queue.empty():
-                 return result_queue.get()
-             time.sleep(1)
-         else:
-             # killed
-             process.terminate()
-             process.join()
+         """Parse an Excel file with multiprocess execution and timeout control."""
+         logger.info(f"🚀 Starting Excel parsing process - file: {file_path}, timeout: {self.timeout}s")
+
+         try:
+             # Verify the file exists
+             if not os.path.exists(file_path):
+                 logger.error(f"🚫 File does not exist: {file_path}")
+                 raise FileNotFoundError(f"File does not exist: {file_path}")
+
+             # Check the file extension
+             if not file_path.lower().endswith((".xlsx", ".xls")):
+                 logger.warning(f"⚠️ File extension is not an Excel format: {file_path}")
+
+             result_queue = Queue()
+             process = multiprocessing.Process(
+                 target=self._parse, args=(file_path, result_queue)
+             )
+             process.start()
+             logger.debug(f"⚡ Started subprocess, PID: {process.pid}")
+
+             start_time = time.time()
+
+             # Wait for parsing to finish, or time out
+             while time.time() - start_time < self.timeout:
+                 elapsed_time = int(time.time() - start_time)
+                 logger.debug(f"⏱️ Waiting for parsing to finish... {elapsed_time}s")
+
+                 if not process.is_alive():
+                     logger.debug("✅ Subprocess has finished")
+                     break
+
+                 if not result_queue.empty():
+                     result = result_queue.get()
+                     process.join()  # wait for the process to exit normally
+
+                     # Check whether this is an error result
+                     if "error" in result:
+                         logger.error(f"💥 Subprocess returned an error: {result['error']}")
+                         raise Exception(result["error"])
+
+                     logger.info(f"🎉 Excel parsed successfully in {elapsed_time}s")
+                     return result
+
+                 time.sleep(1)
+             else:
+                 # Timeout handling
+                 logger.error(f"⏰ Parsing timed out ({self.timeout}s), terminating the process")
+                 process.terminate()
+                 process.join(timeout=5)  # give the process 5 seconds to exit gracefully
+
+                 if process.is_alive():
+                     logger.error("💀 Force-killing the process")
+                     process.kill()
+
+                 raise TimeoutError(f"Excel parsing timed out: {file_path}")
+
+         except Exception as e:
+             logger.error(
+                 f"💀 Excel parsing failed: {file_path}, error type: {type(e).__name__}, error: {str(e)}"
+             )
+             raise
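Two pieces of this rewrite generalize well: `pd.read_excel(path, sheet_name=None)` returns a dict of DataFrames (one per sheet) that `DataFrame.to_markdown(index=False)` renders directly (hence the `tabulate` dependency in the new METADATA), and the actual parse runs in a child process so a hung conversion can be killed after a timeout. A condensed sketch of that timeout pattern, under the diff's assumptions (picklable target, result handed back through a queue; `run_with_timeout` is a hypothetical helper, not the package API):

```python
import multiprocessing
import time


def run_with_timeout(target, args: tuple, timeout: float):
    """Run `target(*args, queue)` in a child process; kill it after `timeout`."""
    queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=target, args=(*args, queue))
    proc.start()
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not queue.empty():
            proc.join()          # child finished normally
            return queue.get()
        if not proc.is_alive():  # child died without reporting a result
            raise RuntimeError("worker exited without a result")
        time.sleep(0.1)
    proc.terminate()             # timeout: ask politely first
    proc.join(timeout=5)
    if proc.is_alive():
        proc.kill()              # then force-kill, as the diff does
    raise TimeoutError(f"worker exceeded {timeout}s")
```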
pydatamax-0.1.14.dist-info/METADATA ADDED
@@ -0,0 +1,228 @@
+ Metadata-Version: 2.4
+ Name: pydatamax
+ Version: 0.1.14
+ Summary: A library for parsing and converting various file formats.
+ Home-page: https://github.com/Hi-Dolphin/datamax
+ Author: ccy
+ Author-email: cy.kron@foxmail.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: oss2<3.0.0,>=2.19.1
+ Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
+ Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
+ Requires-Dist: crcmod<2.0.0,>=1.7
+ Requires-Dist: langdetect<2.0.0,>=1.0.9
+ Requires-Dist: loguru<1.0.0,>=0.7.3
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
+ Requires-Dist: python-dotenv<2.0.0,>=1.1.0
+ Requires-Dist: pymupdf<2.0.0,>=1.26.0
+ Requires-Dist: pypdf<6.0.0,>=5.5.0
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
+ Requires-Dist: pandas<3.0.0,>=2.2.3
+ Requires-Dist: numpy<3.0.0,>=2.2.6
+ Requires-Dist: requests<3.0.0,>=2.32.3
+ Requires-Dist: tqdm<5.0.0,>=4.67.1
+ Requires-Dist: pydantic<3.0.0,>=2.11.5
+ Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
+ Requires-Dist: python-magic<1.0.0,>=0.4.27
+ Requires-Dist: PyYAML<7.0.0,>=6.0.2
+ Requires-Dist: Pillow<12.0.0,>=11.2.1
+ Requires-Dist: packaging<25.0,>=24.2
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
+ Requires-Dist: minio<8.0.0,>=7.2.15
+ Requires-Dist: openai<2.0.0,>=1.82.0
+ Requires-Dist: jionlp<2.0.0,>=1.5.23
+ Requires-Dist: chardet<6.0.0,>=5.2.0
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
+ Requires-Dist: tiktoken<1.0.0,>=0.9.0
+ Requires-Dist: markitdown<1.0.0,>=0.1.1
+ Requires-Dist: xlrd<3.0.0,>=2.0.1
+ Requires-Dist: tabulate<1.0.0,>=0.9.0
+ Requires-Dist: unstructured<1.0.0,>=0.17.2
+ Requires-Dist: markdown<4.0.0,>=3.8
+ Requires-Dist: langchain<1.0.0,>=0.3.0
+ Requires-Dist: langchain-community<1.0.0,>=0.3.0
+ Requires-Dist: ebooklib==0.19
+ Requires-Dist: setuptools
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # DataMax
+
+ <div align="center">
+
+ [中文](README_zh.md) | **English**
+
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ </div>
+
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
+
+ ## ✨ Core Features
+
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
+
+ ## 🚀 Quick Start
+
+ ### Installation
+
+ ```bash
+ pip install pydatamax
+ ```
+
+ ### Basic Usage
+
+ ```python
+ from datamax import DataMax
+
+ # Parse a single file
+ dm = DataMax(file_path="document.pdf")
+ data = dm.get_data()
+
+ # Batch processing
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
+ data = dm.get_data()
+
+ # Data cleaning
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
+
+ # AI annotation
+ qa_data = dm.get_pre_label(
+     api_key="your-api-key",
+     base_url="https://api.openai.com/v1",
+     model_name="gpt-3.5-turbo"
+ )
+ ```
+
+ ## 📖 Detailed Documentation
+
+ ### File Parsing
+
+ #### Supported Formats
+
+ | Format | Extensions | Special Features |
+ |--------|------------|------------------|
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
+ | Web | `.html`, `.epub` | Tag parsing |
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
+ | Text | `.txt` | Automatic encoding detection |
+
+ #### Advanced Features
+
+ ```python
+ # Advanced PDF parsing (requires MinerU)
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
+
+ # Word to Markdown conversion
+ dm = DataMax(file_path="document.docx", to_markdown=True)
+
+ # Image OCR
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
+ ```
+
+ ### Data Cleaning
+
+ ```python
+ # Three cleaning modes
+ dm.clean_data(method_list=[
+     "abnormal",  # Anomaly data processing
+     "private",   # Privacy information masking
+     "filter"     # Text filtering and normalization
+ ])
+ ```
+
+ ### AI Annotation
+
+ ```python
+ # Custom annotation tasks
+ qa_data = dm.get_pre_label(
+     api_key="sk-xxx",
+     base_url="https://api.provider.com/v1",
+     model_name="model-name",
+     chunk_size=500,      # Text chunk size
+     chunk_overlap=100,   # Overlap length
+     question_number=5,   # Questions per chunk
+     max_workers=5        # Concurrency
+ )
+ ```
+
+ ## ⚙️ Environment Setup
+
+ ### Optional Dependencies
+
+ #### LibreOffice (DOC file support)
+
+ **Ubuntu/Debian:**
+ ```bash
+ sudo apt-get install libreoffice
+ ```
+
+ **Windows:**
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
+
+ #### MinerU (Advanced PDF parsing)
+
+ ```bash
+ # Create virtual environment
+ conda create -n mineru python=3.10
+ conda activate mineru
+
+ # Install MinerU
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
+ ```
+
+ For detailed configuration, please refer to the [MinerU Documentation](https://github.com/opendatalab/MinerU)
+
+ ## 🛠️ Development
+
+ ### Local Installation
+
+ ```bash
+ git clone https://github.com/Hi-Dolphin/datamax.git
+ cd datamax
+ pip install -r requirements.txt
+ python setup.py install
+ ```
+
+ ## 📋 System Requirements
+
+ - Python >= 3.10
+ - Supports Windows, macOS, Linux
+
+ ## 🤝 Contributing
+
+ Issues and Pull Requests are welcome!
+
+ ## 📄 License
+
+ This project is licensed under the [MIT License](LICENSE).
+
+ ## 📞 Contact Us
+
+ - 📧 Email: cy.kron@foxmail.com
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
+
+ ---
+
+ ⭐ If this project helps you, please give us a star!
pydatamax-0.1.14.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
  datamax/__init__.py,sha256=Kbs8ITE6suPy0VL8WzKH8A_iAGqukC0jIHcFGLgoBw8,28
- datamax/loader/MinioHandler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
- datamax/loader/OssHandler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
  datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamax/loader/core.py,sha256=tSIkOw5D3EVFYme1b7joFt0e_LxJdf-mdUzxpyVt0VI,5098
+ datamax/loader/core.py,sha256=C5VhDXkv0HTRcF6WWtztatUZHURZBD-KoZpKyqXfD7U,5100
+ datamax/loader/minio_handler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
+ datamax/loader/oss_handler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
  datamax/parser/__init__.py,sha256=Jilq2PLBNonmoXKATzsIHWWvFuBdlcV2dbSP1cOZ6zg,111
  datamax/parser/base.py,sha256=riGcMn4m295_qf9O0-NbHU2BcHGBXvoF4T3fWj9vgUQ,2514
  datamax/parser/core.py,sha256=9rzIjsVTRacPTUTAVa5gm5fx0h95LxYnw0lEGqjIIB4,11437
  datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
- datamax/parser/doc_parser.py,sha256=VwTOdq5pGPbOI-98SoTwwTcXIjD1BrZDfFGEhTi3T44,3348
- datamax/parser/docx_parser.py,sha256=OhqcMeZ8JkwDJtvrMirM15j-EDnGNUj6U1-nX3gisKA,1727
+ datamax/parser/doc_parser.py,sha256=wVNYn7dkPI12pW_YImDgoceLFWS3RvFpzbFQwVlrnNo,7936
+ datamax/parser/docx_parser.py,sha256=o9_1VBDc8nBCmsEMv0sKsVcPMxiAxY5pl0mUvOcoOJc,8796
  datamax/parser/epub_parser.py,sha256=ljCGxLBPwE5gXVKARJec93VpP4dE9R2GspzuSZBkqPQ,1557
  datamax/parser/html_parser.py,sha256=xQaaK8674QbQwE-Up9X0DJIH0Gg0mR2KoI7fJ6iw2m0,1393
  datamax/parser/image_parser.py,sha256=qGCndc_21PwsfuxFG03wHSsV0uc-XMBaW3VDbsJQd90,1233
@@ -19,7 +19,7 @@ datamax/parser/ppt_parser.py,sha256=Niu3Ina6I6m6lAMS1Z-A7rUbR_iFGmNTaASBoNH_vZ0,
  datamax/parser/pptx_parser.py,sha256=sFWyOa3QNIs4BgtpmSzFQgsgPmunfGqCqi6fulbLFW0,1811
  datamax/parser/txt_parser.py,sha256=4DIP1LVOw21NDdtqG2RTD_hMcHufkvC8kr048AkuLFs,1682
  datamax/parser/xls_parser.py,sha256=pRlqgg96f76H8UqXQfheQT9O0ThdP7958hKUCEyQfPM,954
- datamax/parser/xlsx_parser.py,sha256=9ZqwCSF01thjEb_RleWGCiNOSuA8KZ3QFqzUKldb3wE,2183
+ datamax/parser/xlsx_parser.py,sha256=Vw6XfoQyu6aQUSIueR-krByMA_WOb5fasf4VmKxjVio,8905
  datamax/utils/__init__.py,sha256=d69SJvqOXzItyg9rEcLc4z67Lw9vACispOe3x7NvZLA,1051
  datamax/utils/constants.py,sha256=A0S56mkIfeT6oQmOd-VGTChzLOSBUqsG4skMmLt6uNk,4507
  datamax/utils/data_cleaner.py,sha256=zlk2dXmhU-_9KVfqmqMGr967v-nc7Iv8ZKRdMkIJsGM,7784
@@ -30,10 +30,10 @@ datamax/utils/paddleocr_pdf_operator.py,sha256=Tnb-5SzUd6OXM-XeaL8vdPnsOhgG_GKz-
  datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
  datamax/utils/qa_generator.py,sha256=d75an9JEyT6sxlSjdmWYveQshfyTb0v4aGSuTpTJa0A,12561
  datamax/utils/tokenizer.py,sha256=Y8XB06XQVsNuG8IPl_4iBZj2yu1xzXldVbmZtXFMQM4,859
- pydatamax-0.1.12.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
+ pydatamax-0.1.14.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_basic.py,sha256=4AByx25-MIt6_zmzxpFRoSCBqLtIjyfTwFLb1UCJz6k,303
- pydatamax-0.1.12.dist-info/METADATA,sha256=6D9bCVBg8aNF1kg4gnXrPe7ybvMEWFp7MKxl2_Ivb4s,9774
- pydatamax-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- pydatamax-0.1.12.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
- pydatamax-0.1.12.dist-info/RECORD,,
+ pydatamax-0.1.14.dist-info/METADATA,sha256=P-wz8Log3gcUMftKTd2qrcmNuzpp-HOn8gVVe8cTceM,6314
+ pydatamax-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ pydatamax-0.1.14.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
+ pydatamax-0.1.14.dist-info/RECORD,,
pydatamax-0.1.12.dist-info/METADATA DELETED
@@ -1,281 +0,0 @@
- Metadata-Version: 2.4
- Name: pydatamax
- Version: 0.1.12
- Summary: A library for parsing and converting various file formats.
- Home-page: https://github.com/Hi-Dolphin/datamax
- Author: ccy
- Author-email: cy.kron@foxmail.com
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: oss2<3.0.0,>=2.19.1
- Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
- Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
- Requires-Dist: crcmod<2.0.0,>=1.7
- Requires-Dist: langdetect<2.0.0,>=1.0.9
- Requires-Dist: loguru<1.0.0,>=0.7.3
- Requires-Dist: python-docx<2.0.0,>=1.1.2
- Requires-Dist: python-dotenv<2.0.0,>=1.1.0
- Requires-Dist: pymupdf<2.0.0,>=1.26.0
- Requires-Dist: pypdf<6.0.0,>=5.5.0
- Requires-Dist: openpyxl<4.0.0,>=3.1.5
- Requires-Dist: pandas<3.0.0,>=2.2.3
- Requires-Dist: numpy<3.0.0,>=2.2.6
- Requires-Dist: requests<3.0.0,>=2.32.3
- Requires-Dist: tqdm<5.0.0,>=4.67.1
- Requires-Dist: pydantic<3.0.0,>=2.11.5
- Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
- Requires-Dist: python-magic<1.0.0,>=0.4.27
- Requires-Dist: PyYAML<7.0.0,>=6.0.2
- Requires-Dist: Pillow<12.0.0,>=11.2.1
- Requires-Dist: packaging<25.0,>=24.2
- Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
- Requires-Dist: minio<8.0.0,>=7.2.15
- Requires-Dist: openai<2.0.0,>=1.82.0
- Requires-Dist: jionlp<2.0.0,>=1.5.23
- Requires-Dist: chardet<6.0.0,>=5.2.0
- Requires-Dist: python-pptx<2.0.0,>=1.0.2
- Requires-Dist: docx2markdown<1.0.0,>=0.1.1
- Requires-Dist: tiktoken<1.0.0,>=0.9.0
- Requires-Dist: markitdown<1.0.0,>=0.1.1
- Requires-Dist: xlrd<3.0.0,>=2.0.1
- Requires-Dist: tabulate<1.0.0,>=0.9.0
- Requires-Dist: unstructured<1.0.0,>=0.17.2
- Requires-Dist: markdown<4.0.0,>=3.8
- Requires-Dist: langchain<1.0.0,>=0.3.0
- Requires-Dist: langchain-community<1.0.0,>=0.3.0
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license-file
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
-
- # DataMax
-
- ## Overview
- DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
-
- ## Key Features
-
- ### File Processing Capabilities
- Currently supports reading, conversion, and extraction from:
- - PDF, HTML
- - DOCX/DOC, PPT/PPTX
- - EPUB
- - Images
- - XLS/XLSX spreadsheets
- - Plain text (TXT)
-
- ### Data Cleaning Pipeline
- Three-tiered cleaning process:
- 1. Anomaly detection and handling
- 2. Privacy protection processing
- 3. Text filtering and normalization
-
- ### AI-Powered Data Annotation
- Implements an LLM+Prompt to:
- - Continuously generate pre-labeled datasets
- - Provide optimized training data for model fine-tuning
-
-
- ## Installation Guide (Key Dependencies)
- Dependencies include libreoffice, datamax, and MinerU.
-
- ### 1. Installing libreoffice Dependency
- **Note:** Without datamax, .doc files will not be supported.
-
- #### Linux (Debian/Ubuntu)
- ```bash
- sudo apt-get update
- sudo apt-get install libreoffice
- ```
- ### Windows
- ```text
- Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
- Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
- ```
- ### Checking LibreOffice Installation
- ```bash
- soffice --version
- ```
-
- ## 2. Installing MinerU Dependency
- Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
- ### Create a Virtual Environment and Install Basic Dependencies
- ```bash
- conda create -n mineru python=3.10
- conda activate mineru
- pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
- ```
- ### Installing Model Weight Files
- https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
- ```bash
- pip install modelscope
- wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
- python download_models.py
- ```
-
- ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
- ```json
- {
-     "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
-     "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
-     "device-mode": "cpu",
-     ...
- }
- ```
-
- ## 3. Installing Basic Dependencies for datamax
- 1. Clone the repository to your local machine:
- ```bash
- git clone <repository-url>
- ```
- 2. Install dependencies into conda:
- ```bash
- cd datamax
- pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
- ```
-
-
- ## Features
- - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
- - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
- - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
- - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
- - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
- - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
-
-
- ## Technology Stack
-
- - **Programming Language**: Python >= 3.10
- - **Dependency Libraries**:
-   - PyMuPDF: For PDF file parsing.
-   - BeautifulSoup: For HTML file parsing.
-   - python-docx: For DOCX file parsing.
-   - pandas: For data processing and conversion.
-   - paddleocr: For parsing scanned PDFs, tables, and images.
- - **Development Environment**: Visual Studio Code or PyCharm
- - **Version Control**: Git
-
- ## Usage Instructions
- ### Installing the SDK
- - **Installation Commands**:
- ```bash
- ## Local Installation
- python setup.py sdist bdist_wheel
- pip install dist/datamax-0.1.3-py3-none-any.whl
-
- ## Pip Installation
- pip install pydatamax
- ```
-
-
- - **Importing the Code**:
- ```python
- # File Parsing
- from datamax import DataMax
-
- ## Handling a Single File in Two Ways
- # 1. Using a List of Length 1
- data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
- data = data.get_data()
-
- # 2. Using a String
- data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
- data = data.get_data()
-
- ## Handling Multiple Files
- # 1. Using a List of Length n
- data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
- data = data.get_data()
-
- # 2. Passing a Folder Path as a String
- data = DataMax(file_path=r"docx_files_example/")
- data = data.get_data()
-
- # Data Cleaning
- """
- Cleaning rules can be found in datamax/utils/data_cleaner.py
- abnormal: Abnormal cleaning
- private: Privacy processing
- filter: Text filtering
- """
- # Direct Use: Clean the text parameter directly and return a string
- dm = DataMax()
- data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
-
- # Process Use: Use after get_data() to return the complete data structure
- dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
- data2 = dm.get_data()
- cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
-
- # Large Model Pre-annotation Supporting any model that can be called via OpenAI SDK
- data = DataMax(file_path=r"path\to\xxx.docx")
- parsed_data = data.get_data()
- # If no custom messages are passed, the default messages in the SDK will be used
- messages = [
-     {'role': 'system', 'content': 'You are a helpful assistant.'},
-     {'role': 'user', 'content': 'Who are you?'}
- ]
- qa_datas = data.get_pre_label(
-     api_key="sk-xxx",
-     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
-     model_name="qwen-max",
-     chunk_size=500,
-     chunk_overlap=100,
-     question_number=5,
-     max_workers=5,
-     # message=[]
- )
- print(f'Annotated result:{qa_datas}')
- ```
-
-
- ## Examples
- ```python
- ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
- from datamax import DataMax
- data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
- """
- Parameters:
- file_path: Relative file path / Absolute file path
- to_markdown: Whether to convert to markdown (default value False, directly returns text) This parameter only supports word files (doc | docx)
- """
-
- ## jpg | jpeg | png | ...(image types)
- data = DataMax(file_path=r"image.jpg", use_mineru=True)
- """
- Parameters:
- file_path: Relative file path / Absolute file path
- use_mineru: Whether to use MinerU enhancement
- """
-
- ## pdf
- from datamax import DataMax
- data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
- """
- Parameters:
- file_path: Relative file path / Absolute file path
- use_mineru: Whether to use MinerU enhancement
- """
- ```
-
- ## Contribution Guide
- We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
- ## License
- This project is licensed under the MIT License. For more details, see the LICENSE file.
-
- ## Contact Information
- If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
- - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
- - Project Homepage: GitHub Project Link
-
File without changes
File without changes