pydatamax 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamax/loader/core.py CHANGED
@@ -1,7 +1,7 @@
  import os
  from typing import List
- from datamax.loader.MinioHandler import MinIOClient
- from datamax.loader.OssHandler import OssClient
+ from datamax.loader.minio_handler import MinIOClient
+ from datamax.loader.oss_handler import OssClient
 
 
  class DataLoader:
datamax/parser/doc_parser.py CHANGED
@@ -1,14 +1,14 @@
+ import logging
  import os
  import shutil
  import subprocess
  import tempfile
- import chardet
- import logging
  from pathlib import Path
  from typing import Union
- from datamax.parser.base import BaseLife
- from datamax.parser.base import MarkdownOutputVo
 
+ import chardet
+
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
 
  # Configure logging
  logger = logging.getLogger(__name__)
@@ -24,37 +24,41 @@ class DocParser(BaseLife):
  def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
  """Convert a .doc file to a .txt file."""
  logger.info(f"🔄 Starting DOC-to-TXT conversion - source: {doc_path}, output dir: {dir_path}")
-
+
  try:
  cmd = f'soffice --headless --convert-to txt "{doc_path}" --outdir "{dir_path}"'
  logger.debug(f"⚡ Running conversion command: {cmd}")
-
- process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ process = subprocess.Popen(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
  stdout, stderr = process.communicate()
  exit_code = process.returncode
-
+
  if exit_code == 0:
  logger.info(f"✅ DOC-to-TXT conversion succeeded - exit code: {exit_code}")
  if stdout:
  logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
  else:
- encoding = chardet.detect(stderr)['encoding']
+ encoding = chardet.detect(stderr)["encoding"]
  if encoding is None:
- encoding = 'utf-8'
- error_msg = stderr.decode(encoding, errors='replace')
+ encoding = "utf-8"
+ error_msg = stderr.decode(encoding, errors="replace")
  logger.error(f"❌ DOC-to-TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
- raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
-
+ raise Exception(
+ f"Error Output (detected encoding: {encoding}): {error_msg}"
+ )
+
  fname = str(Path(doc_path).stem)
- txt_path = os.path.join(dir_path, f'{fname}.txt')
-
+ txt_path = os.path.join(dir_path, f"{fname}.txt")
+
  if not os.path.exists(txt_path):
  logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
  raise Exception(f"File conversion failed {doc_path} ==> {txt_path}")
  else:
  logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
  return txt_path
-
+
  except subprocess.SubprocessError as e:
  logger.error(f"💥 subprocess execution failed: {str(e)}")
  raise Exception(f"Error while running the conversion command: {str(e)}")
@@ -65,25 +69,25 @@ class DocParser(BaseLife):
  def read_txt_file(self, txt_path: str) -> str:
  """Read the contents of a txt file."""
  logger.info(f"📖 Starting to read TXT file: {txt_path}")
-
+
  try:
  # Detect the file encoding
- with open(txt_path, 'rb') as f:
+ with open(txt_path, "rb") as f:
  raw_data = f.read()
- encoding = chardet.detect(raw_data)['encoding']
+ encoding = chardet.detect(raw_data)["encoding"]
  if encoding is None:
- encoding = 'utf-8'
+ encoding = "utf-8"
  logger.debug(f"🔍 Detected file encoding: {encoding}")
-
+
  # Read the file contents
- with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+ with open(txt_path, "r", encoding=encoding, errors="replace") as f:
  content = f.read()
-
+
  logger.info(f"📄 TXT file read complete - content length: {len(content)} characters")
  logger.debug(f"👀 Preview of first 100 characters: {content[:100]}...")
-
+
  return content
-
+
  except FileNotFoundError as e:
  logger.error(f"🚫 TXT file not found: {str(e)}")
  raise Exception(f"File not found: {txt_path}")
@@ -94,27 +98,27 @@ class DocParser(BaseLife):
  def read_doc_file(self, doc_path: str) -> str:
  """Read a doc file and convert it to text."""
  logger.info(f"📖 Starting to read DOC file - file: {doc_path}")
-
+
  try:
  with tempfile.TemporaryDirectory() as temp_path:
  logger.debug(f"📁 Created temporary directory: {temp_path}")
-
+
  temp_dir = Path(temp_path)
-
+
  file_path = temp_dir / "tmp.doc"
  shutil.copy(doc_path, file_path)
  logger.debug(f"📋 Copied file to temporary directory: {doc_path} -> {file_path}")
-
+
  # Convert DOC to TXT
  txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
  logger.info(f"🎯 DOC-to-TXT conversion finished: {txt_file_path}")
-
+
  # Read the TXT file contents
  content = self.read_txt_file(txt_file_path)
  logger.info(f"✨ TXT content read complete, content length: {len(content)} characters")
-
+
  return content
-
+
  except FileNotFoundError as e:
  logger.error(f"🚫 File not found: {str(e)}")
  raise Exception(f"File not found: {doc_path}")
@@ -128,24 +132,24 @@ class DocParser(BaseLife):
  def parse(self, file_path: str):
  """Parse a DOC file."""
  logger.info(f"🎬 Starting to parse DOC file: {file_path}")
-
+
  try:
  # Verify the file exists
  if not os.path.exists(file_path):
  logger.error(f"🚫 File does not exist: {file_path}")
  raise FileNotFoundError(f"File does not exist: {file_path}")
-
+
  # Check the file size
  file_size = os.path.getsize(file_path)
  logger.info(f"📏 File size: {file_size} bytes")
-
+
  title = self.get_file_extension(file_path)
  logger.debug(f"🏷️ Extracted file title: {title}")
-
+
  # Convert to txt with soffice, then read the contents
  logger.info("📝 Converting DOC to TXT with soffice and reading contents")
  content = self.read_doc_file(doc_path=file_path)
-
+
  # Decide, based on to_markdown, whether to keep the original format or convert to markdown
  if self.to_markdown:
  # Simple text-to-markdown conversion (preserves paragraph structure)
@@ -154,22 +158,26 @@ class DocParser(BaseLife):
  else:
  mk_content = content
  logger.info("📝 Keeping original text format")
-
+
  logger.info(f"🎊 File content parsing complete, final content length: {len(mk_content)} characters")
-
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
+
+ lifecycle = self.generate_lifecycle(
+ source_file=file_path,
+ domain="Technology",
+ usage_purpose="Documentation",
+ life_type="LLM_ORIGIN",
+ )
  logger.debug("⚙️ Lifecycle info generated")
-
+
  output_vo = MarkdownOutputVo(title, mk_content)
  output_vo.add_lifecycle(lifecycle)
-
+
  result = output_vo.to_dict()
  logger.info(f"🏆 DOC file parsing complete: {file_path}")
  logger.debug(f"🔑 Returned result keys: {list(result.keys())}")
-
+
  return result
-
+
  except Exception as e:
  logger.error(f"💀 Failed to parse DOC file: {file_path}, error: {str(e)}")
  raise
@@ -178,18 +186,18 @@ class DocParser(BaseLife):
  """Format plain text as simple markdown."""
  if not content.strip():
  return content
-
- lines = content.split('\n')
+
+ lines = content.split("\n")
  formatted_lines = []
-
+
  for line in lines:
  line = line.strip()
  if not line:
- formatted_lines.append('')
+ formatted_lines.append("")
  continue
-
+
  # Simple markdown formatting rules
  # More rules can be added as needed
  formatted_lines.append(line)
-
- return '\n'.join(formatted_lines)
+
+ return "\n".join(formatted_lines)
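
Both `DocParser` and `DocxParser` now share the same conversion approach: shell out to LibreOffice in headless mode, then, on failure, decode stderr with a chardet-detected encoding. A minimal standalone sketch of that pattern (hypothetical `convert_to_txt` helper, assuming `soffice` is on `PATH`; not the package's public API):

```python
import os
import subprocess
from pathlib import Path

import chardet


def convert_to_txt(src_path: str, out_dir: str) -> str:
    """Sketch of the soffice-based conversion used by the parsers above."""
    cmd = f'soffice --headless --convert-to txt "{src_path}" --outdir "{out_dir}"'
    process = subprocess.Popen(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    _stdout, stderr = process.communicate()
    if process.returncode != 0:
        # stderr's encoding is not guaranteed, so detect it before decoding
        encoding = chardet.detect(stderr)["encoding"] or "utf-8"
        raise RuntimeError(stderr.decode(encoding, errors="replace"))
    txt_path = os.path.join(out_dir, f"{Path(src_path).stem}.txt")
    if not os.path.exists(txt_path):
        raise RuntimeError(f"conversion produced no output: {txt_path}")
    return txt_path
```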
datamax/parser/docx_parser.py CHANGED
@@ -1,14 +1,14 @@
+ import logging
  import os
  import shutil
  import subprocess
  import tempfile
- import chardet
- import logging
  from pathlib import Path
  from typing import Union
- from datamax.parser.base import BaseLife
- from datamax.parser.base import MarkdownOutputVo
 
+ import chardet
+
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
 
  # Configure logging
  logger = logging.getLogger(__name__)
@@ -19,42 +19,48 @@ class DocxParser(BaseLife):
  super().__init__()
  self.file_path = file_path
  self.to_markdown = to_markdown
- logger.info(f"🚀 DocxParser initialized - file path: {file_path}, to markdown: {to_markdown}")
+ logger.info(
+ f"🚀 DocxParser initialized - file path: {file_path}, to markdown: {to_markdown}"
+ )
 
  def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
  """Convert a .docx file to a .txt file."""
  logger.info(f"🔄 Starting DOCX-to-TXT conversion - source: {docx_path}, output dir: {dir_path}")
-
+
  try:
  cmd = f'soffice --headless --convert-to txt "{docx_path}" --outdir "{dir_path}"'
  logger.debug(f"⚡ Running conversion command: {cmd}")
-
- process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ process = subprocess.Popen(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
  stdout, stderr = process.communicate()
  exit_code = process.returncode
-
+
  if exit_code == 0:
  logger.info(f"✅ DOCX-to-TXT conversion succeeded - exit code: {exit_code}")
  if stdout:
  logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
  else:
- encoding = chardet.detect(stderr)['encoding']
+ encoding = chardet.detect(stderr)["encoding"]
  if encoding is None:
- encoding = 'utf-8'
- error_msg = stderr.decode(encoding, errors='replace')
+ encoding = "utf-8"
+ error_msg = stderr.decode(encoding, errors="replace")
  logger.error(f"❌ DOCX-to-TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
- raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
-
+ raise Exception(
+ f"Error Output (detected encoding: {encoding}): {error_msg}"
+ )
+
  fname = str(Path(docx_path).stem)
- txt_path = os.path.join(dir_path, f'{fname}.txt')
-
+ txt_path = os.path.join(dir_path, f"{fname}.txt")
+
  if not os.path.exists(txt_path):
  logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
  raise Exception(f"File conversion failed {docx_path} ==> {txt_path}")
  else:
  logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
  return txt_path
-
+
  except subprocess.SubprocessError as e:
  logger.error(f"💥 subprocess execution failed: {str(e)}")
  raise Exception(f"Error while running the conversion command: {str(e)}")
@@ -65,25 +71,25 @@ class DocxParser(BaseLife):
  def read_txt_file(self, txt_path: str) -> str:
  """Read the contents of a txt file."""
  logger.info(f"📖 Starting to read TXT file: {txt_path}")
-
+
  try:
  # Detect the file encoding
- with open(txt_path, 'rb') as f:
+ with open(txt_path, "rb") as f:
  raw_data = f.read()
- encoding = chardet.detect(raw_data)['encoding']
+ encoding = chardet.detect(raw_data)["encoding"]
  if encoding is None:
- encoding = 'utf-8'
+ encoding = "utf-8"
  logger.debug(f"🔍 Detected file encoding: {encoding}")
-
+
  # Read the file contents
- with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+ with open(txt_path, "r", encoding=encoding, errors="replace") as f:
  content = f.read()
-
+
  logger.info(f"📄 TXT file read complete - content length: {len(content)} characters")
  logger.debug(f"👀 Preview of first 100 characters: {content[:100]}...")
-
+
  return content
-
+
  except FileNotFoundError as e:
  logger.error(f"🚫 TXT file not found: {str(e)}")
  raise Exception(f"File not found: {txt_path}")
@@ -94,27 +100,27 @@ class DocxParser(BaseLife):
  def read_docx_file(self, docx_path: str) -> str:
  """Read a docx file and convert it to text."""
  logger.info(f"📖 Starting to read DOCX file - file: {docx_path}")
-
+
  try:
  with tempfile.TemporaryDirectory() as temp_path:
  logger.debug(f"📁 Created temporary directory: {temp_path}")
-
+
  temp_dir = Path(temp_path)
-
+
  file_path = temp_dir / "tmp.docx"
  shutil.copy(docx_path, file_path)
  logger.debug(f"📋 Copied file to temporary directory: {docx_path} -> {file_path}")
-
+
  # Convert DOCX to TXT
  txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
  logger.info(f"🎯 DOCX-to-TXT conversion finished: {txt_file_path}")
-
+
  # Read the TXT file contents
  content = self.read_txt_file(txt_file_path)
  logger.info(f"✨ TXT content read complete, content length: {len(content)} characters")
-
+
  return content
-
+
  except FileNotFoundError as e:
  logger.error(f"🚫 File not found: {str(e)}")
  raise Exception(f"File not found: {docx_path}")
@@ -128,31 +134,31 @@ class DocxParser(BaseLife):
  def parse(self, file_path: str):
  """Parse a DOCX file."""
  logger.info(f"🎬 Starting to parse DOCX file: {file_path}")
-
+
  try:
  # Verify the file exists
  if not os.path.exists(file_path):
  logger.error(f"🚫 File does not exist: {file_path}")
  raise FileNotFoundError(f"File does not exist: {file_path}")
-
+
  # Check the file extension
- if not file_path.lower().endswith('.docx'):
+ if not file_path.lower().endswith(".docx"):
  logger.warning(f"⚠️ File extension is not .docx: {file_path}")
-
+
  # Check the file size
  file_size = os.path.getsize(file_path)
  logger.info(f"📏 File size: {file_size} bytes")
-
+
  if file_size == 0:
  logger.warning(f"⚠️ File size is 0 bytes: {file_path}")
-
+
  title = self.get_file_extension(file_path)
  logger.debug(f"🏷️ Extracted file title: {title}")
-
+
  # Convert to txt with soffice, then read the contents
  logger.info("📝 Converting DOCX to TXT with soffice and reading contents")
  content = self.read_docx_file(docx_path=file_path)
-
+
  # Decide, based on to_markdown, whether to keep the original format or convert to markdown
  if self.to_markdown:
  # Simple text-to-markdown conversion (preserves paragraph structure)
@@ -161,26 +167,30 @@ class DocxParser(BaseLife):
  else:
  mk_content = content
  logger.info("📝 Keeping original text format")
-
+
  logger.info(f"🎊 File content parsing complete, final content length: {len(mk_content)} characters")
-
+
  # Check whether the content is empty
  if not mk_content.strip():
  logger.warning(f"⚠️ Parsed content is empty: {file_path}")
-
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
+
+ lifecycle = self.generate_lifecycle(
+ source_file=file_path,
+ domain="Technology",
+ usage_purpose="Documentation",
+ life_type="LLM_ORIGIN",
+ )
  logger.debug("⚙️ Lifecycle info generated")
-
+
  output_vo = MarkdownOutputVo(title, mk_content)
  output_vo.add_lifecycle(lifecycle)
-
+
  result = output_vo.to_dict()
  logger.info(f"🏆 DOCX file parsing complete: {file_path}")
  logger.debug(f"🔑 Returned result keys: {list(result.keys())}")
-
+
  return result
-
+
  except FileNotFoundError as e:
  logger.error(f"🚫 File-not-found error: {str(e)}")
  raise
@@ -188,25 +198,27 @@ class DocxParser(BaseLife):
  logger.error(f"🔒 File permission error: {str(e)}")
  raise Exception(f"No permission to access file: {file_path}")
  except Exception as e:
- logger.error(f"💀 Failed to parse DOCX file: {file_path}, error type: {type(e).__name__}, error: {str(e)}")
+ logger.error(
+ f"💀 Failed to parse DOCX file: {file_path}, error type: {type(e).__name__}, error: {str(e)}"
+ )
  raise
 
  def format_as_markdown(self, content: str) -> str:
  """Format plain text as simple markdown."""
  if not content.strip():
  return content
-
- lines = content.split('\n')
+
+ lines = content.split("\n")
  formatted_lines = []
-
+
  for line in lines:
  line = line.strip()
  if not line:
- formatted_lines.append('')
+ formatted_lines.append("")
  continue
-
+
  # Simple markdown formatting rules
  # More rules can be added as needed
  formatted_lines.append(line)
-
- return '\n'.join(formatted_lines)
+
+ return "\n".join(formatted_lines)
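
`read_txt_file` in both parsers reads the raw bytes first and lets chardet pick the encoding, because LibreOffice's txt output is not guaranteed to be UTF-8. The detection step in isolation (a sketch with a hypothetical helper name, not the package API):

```python
import chardet


def read_text_with_detected_encoding(path: str) -> str:
    """Sketch of the chardet-first read used by read_txt_file above."""
    with open(path, "rb") as f:
        raw = f.read()
    # chardet returns None for undecidable input, so fall back to UTF-8
    encoding = chardet.detect(raw)["encoding"] or "utf-8"
    return raw.decode(encoding, errors="replace")
```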
datamax/parser/xlsx_parser.py CHANGED
@@ -1,12 +1,13 @@
- import multiprocessing
- import time
  import logging
+ import multiprocessing
  import os
+ import time
+ import warnings
  from multiprocessing import Queue
- from datamax.parser.base import MarkdownOutputVo
- from datamax.parser.base import BaseLife
+
  import pandas as pd
- import warnings
+
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
 
  warnings.filterwarnings("ignore")
 
@@ -26,42 +27,44 @@ class XlsxParser(BaseLife):
  def _parse_with_pandas(self, file_path: str) -> str:
  """Read Excel with pandas and convert it to markdown."""
  logger.info(f"🐼 Starting to read Excel file with pandas: {file_path}")
-
+
  try:
  # Verify the file exists
  if not os.path.exists(file_path):
  logger.error(f"🚫 Excel file does not exist: {file_path}")
  raise FileNotFoundError(f"File does not exist: {file_path}")
-
+
  # Check the file size
  file_size = os.path.getsize(file_path)
  logger.info(f"📏 File size: {file_size} bytes")
-
+
  if file_size == 0:
  logger.warning(f"⚠️ File size is 0 bytes: {file_path}")
  return "*File is empty*"
-
+
  # Read the Excel file with pandas
  logger.debug("📊 Reading Excel data...")
  df = pd.read_excel(file_path, sheet_name=None)  # read all sheets
-
+
  markdown_content = ""
-
+
  if isinstance(df, dict):
  # Multiple sheets
  logger.info(f"📑 Multiple sheets detected, {len(df)} in total")
  for sheet_name, sheet_df in df.items():
  logger.debug(f"📋 Processing sheet: {sheet_name}, shape: {sheet_df.shape}")
  markdown_content += f"## Sheet: {sheet_name}\n\n"
-
+
  if not sheet_df.empty:
  # Clean the data: drop fully empty rows and columns
- sheet_df = sheet_df.dropna(how='all').dropna(axis=1, how='all')
-
+ sheet_df = sheet_df.dropna(how="all").dropna(axis=1, how="all")
+
  if not sheet_df.empty:
  sheet_markdown = sheet_df.to_markdown(index=False)
  markdown_content += sheet_markdown + "\n\n"
- logger.debug(f"✅ Sheet {sheet_name} converted, effective data shape: {sheet_df.shape}")
+ logger.debug(
+ f"✅ Sheet {sheet_name} converted, effective data shape: {sheet_df.shape}"
+ )
  else:
  markdown_content += "*This sheet has no valid data*\n\n"
  logger.warning(f"⚠️ Sheet {sheet_name} has no valid data after cleaning")
@@ -73,8 +76,8 @@ class XlsxParser(BaseLife):
  logger.info(f"📄 Single sheet, shape: {df.shape}")
  if not df.empty:
  # Clean the data: drop fully empty rows and columns
- df = df.dropna(how='all').dropna(axis=1, how='all')
-
+ df = df.dropna(how="all").dropna(axis=1, how="all")
+
  if not df.empty:
  markdown_content = df.to_markdown(index=False)
  logger.info(f"✅ Sheet converted, effective data shape: {df.shape}")
@@ -84,12 +87,12 @@ class XlsxParser(BaseLife):
  else:
  markdown_content = "*Sheet is empty*"
  logger.warning("⚠️ Sheet is empty")
-
+
  logger.info(f"🎊 pandas conversion complete, markdown content length: {len(markdown_content)} characters")
  logger.debug(f"👀 Preview of first 200 characters: {markdown_content[:200]}...")
-
+
  return markdown_content
-
+
  except FileNotFoundError as e:
  logger.error(f"🚫 File not found: {str(e)}")
  raise
@@ -106,41 +109,41 @@ class XlsxParser(BaseLife):
  def _parse(self, file_path: str, result_queue: Queue) -> dict:
  """Core method for parsing an Excel file."""
  logger.info(f"🎬 Starting to parse Excel file: {file_path}")
-
+
  try:
  # Parse Excel with pandas
  logger.info("🐼 Parsing Excel in pandas mode")
  mk_content = self._parse_with_pandas(file_path)
-
+
  # Check whether the content is empty
  if not mk_content.strip():
  logger.warning(f"⚠️ Parsed content is empty: {file_path}")
  mk_content = "*Unable to parse file content*"
-
+
  logger.info(f"🎊 File content parsing complete, final content length: {len(mk_content)} characters")
-
+
  # Generate lifecycle info
  lifecycle = self.generate_lifecycle(
  source_file=file_path,
  domain="Technology",
  usage_purpose="Documentation",
- life_type="LLM_ORIGIN"
+ life_type="LLM_ORIGIN",
  )
  logger.debug("⚙️ Lifecycle info generated")
-
+
  # Create the output object
  title = self.get_file_extension(file_path)
  output_vo = MarkdownOutputVo(title, mk_content)
  output_vo.add_lifecycle(lifecycle)
-
+
  result = output_vo.to_dict()
  result_queue.put(result)
  logger.info(f"🏆 Excel file parsing complete: {file_path}")
  logger.debug(f"🔑 Returned result keys: {list(result.keys())}")
-
+
  time.sleep(0.5)  # give the queue a moment
  return result
-
+
  except Exception as e:
  logger.error(f"💀 Failed to parse Excel file: {file_path}, error: {str(e)}")
  # Put the error on the queue as well
@@ -151,58 +154,62 @@ class XlsxParser(BaseLife):
  def parse(self, file_path: str) -> dict:
  """Parse an Excel file - with multiprocessing and timeout control."""
  logger.info(f"🚀 Launching Excel parsing process - file: {file_path}, timeout: {self.timeout}s")
-
+
  try:
  # Verify the file exists
  if not os.path.exists(file_path):
  logger.error(f"🚫 File does not exist: {file_path}")
  raise FileNotFoundError(f"File does not exist: {file_path}")
-
+
  # Check the file extension
- if not file_path.lower().endswith(('.xlsx', '.xls')):
+ if not file_path.lower().endswith((".xlsx", ".xls")):
  logger.warning(f"⚠️ File extension is not an Excel format: {file_path}")
-
+
  result_queue = Queue()
- process = multiprocessing.Process(target=self._parse, args=(file_path, result_queue))
+ process = multiprocessing.Process(
+ target=self._parse, args=(file_path, result_queue)
+ )
  process.start()
  logger.debug(f"⚡ Child process started, PID: {process.pid}")
-
+
  start_time = time.time()
-
+
  # Wait for parsing to finish or time out
  while time.time() - start_time < self.timeout:
  elapsed_time = int(time.time() - start_time)
  logger.debug(f"⏱️ Waiting for parsing to finish... {elapsed_time}s")
-
+
  if not process.is_alive():
  logger.debug("✅ Child process finished")
  break
-
+
  if not result_queue.empty():
  result = result_queue.get()
  process.join()  # wait for the process to exit normally
-
+
  # Check whether the result is an error
  if "error" in result:
  logger.error(f"💥 Child process returned an error: {result['error']}")
  raise Exception(result["error"])
-
+
  logger.info(f"🎉 Excel parsing completed successfully in {elapsed_time}s")
  return result
-
+
  time.sleep(1)
  else:
  # Timeout handling
  logger.error(f"⏰ Parsing timed out ({self.timeout}s), terminating process")
  process.terminate()
  process.join(timeout=5)  # give the process 5 seconds to exit gracefully
-
+
  if process.is_alive():
  logger.error("💀 Force-killing process")
  process.kill()
-
+
  raise TimeoutError(f"Excel parsing timed out: {file_path}")
-
+
  except Exception as e:
- logger.error(f"💀 Excel parsing failed: {file_path}, error type: {type(e).__name__}, error: {str(e)}")
- raise
+ logger.error(
+ f"💀 Excel parsing failed: {file_path}, error type: {type(e).__name__}, error: {str(e)}"
+ )
+ raise
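
`XlsxParser.parse` runs `_parse` in a child process and polls a `multiprocessing.Queue` so a hung Excel read cannot block the caller past `self.timeout`. The control flow reduced to its essentials (a sketch under the same assumptions; the hypothetical `work` stands in for `_parse`):

```python
import multiprocessing
import time
from multiprocessing import Queue


def work(queue: Queue) -> None:
    # Stand-in for XlsxParser._parse: do the job, then enqueue the result
    queue.put({"status": "ok"})


def run_with_timeout(timeout: float = 10.0) -> dict:
    queue: Queue = Queue()
    process = multiprocessing.Process(target=work, args=(queue,))
    process.start()
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not queue.empty():
            result = queue.get()
            process.join()  # let the worker exit normally
            return result
        if not process.is_alive():
            # The worker may have enqueued a result just before exiting
            if queue.empty():
                raise RuntimeError("worker exited without a result")
            continue
        time.sleep(0.1)
    process.terminate()
    process.join(timeout=5)  # give the process time to exit gracefully
    if process.is_alive():
        process.kill()
    raise TimeoutError(f"parsing timed out after {timeout}s")


if __name__ == "__main__":
    print(run_with_timeout(timeout=5.0))
```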
pydatamax-0.1.14.dist-info/METADATA ADDED
@@ -0,0 +1,228 @@
+ Metadata-Version: 2.4
+ Name: pydatamax
+ Version: 0.1.14
+ Summary: A library for parsing and converting various file formats.
+ Home-page: https://github.com/Hi-Dolphin/datamax
+ Author: ccy
+ Author-email: cy.kron@foxmail.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: oss2<3.0.0,>=2.19.1
+ Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
+ Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
+ Requires-Dist: crcmod<2.0.0,>=1.7
+ Requires-Dist: langdetect<2.0.0,>=1.0.9
+ Requires-Dist: loguru<1.0.0,>=0.7.3
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
+ Requires-Dist: python-dotenv<2.0.0,>=1.1.0
+ Requires-Dist: pymupdf<2.0.0,>=1.26.0
+ Requires-Dist: pypdf<6.0.0,>=5.5.0
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
+ Requires-Dist: pandas<3.0.0,>=2.2.3
+ Requires-Dist: numpy<3.0.0,>=2.2.6
+ Requires-Dist: requests<3.0.0,>=2.32.3
+ Requires-Dist: tqdm<5.0.0,>=4.67.1
+ Requires-Dist: pydantic<3.0.0,>=2.11.5
+ Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
+ Requires-Dist: python-magic<1.0.0,>=0.4.27
+ Requires-Dist: PyYAML<7.0.0,>=6.0.2
+ Requires-Dist: Pillow<12.0.0,>=11.2.1
+ Requires-Dist: packaging<25.0,>=24.2
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
+ Requires-Dist: minio<8.0.0,>=7.2.15
+ Requires-Dist: openai<2.0.0,>=1.82.0
+ Requires-Dist: jionlp<2.0.0,>=1.5.23
+ Requires-Dist: chardet<6.0.0,>=5.2.0
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
+ Requires-Dist: tiktoken<1.0.0,>=0.9.0
+ Requires-Dist: markitdown<1.0.0,>=0.1.1
+ Requires-Dist: xlrd<3.0.0,>=2.0.1
+ Requires-Dist: tabulate<1.0.0,>=0.9.0
+ Requires-Dist: unstructured<1.0.0,>=0.17.2
+ Requires-Dist: markdown<4.0.0,>=3.8
+ Requires-Dist: langchain<1.0.0,>=0.3.0
+ Requires-Dist: langchain-community<1.0.0,>=0.3.0
+ Requires-Dist: ebooklib==0.19
+ Requires-Dist: setuptools
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # DataMax
+
+ <div align="center">
+
+ [中文](README_zh.md) | **English**
+
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ </div>
+
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
+
+ ## ✨ Core Features
+
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
+
+ ## 🚀 Quick Start
+
+ ### Installation
+
+ ```bash
+ pip install pydatamax
+ ```
+
+ ### Basic Usage
+
+ ```python
+ from datamax import DataMax
+
+ # Parse a single file
+ dm = DataMax(file_path="document.pdf")
+ data = dm.get_data()
+
+ # Batch processing
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
+ data = dm.get_data()
+
+ # Data cleaning
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
+
+ # AI annotation
+ qa_data = dm.get_pre_label(
+ api_key="your-api-key",
+ base_url="https://api.openai.com/v1",
+ model_name="gpt-3.5-turbo"
+ )
+ ```
+
+ ## 📖 Detailed Documentation
+
+ ### File Parsing
+
+ #### Supported Formats
+
+ | Format | Extensions | Special Features |
+ |--------|------------|------------------|
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
+ | Web | `.html`, `.epub` | Tag parsing |
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
+ | Text | `.txt` | Automatic encoding detection |
+
+ #### Advanced Features
+
+ ```python
+ # Advanced PDF parsing (requires MinerU)
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
+
+ # Word to Markdown conversion
+ dm = DataMax(file_path="document.docx", to_markdown=True)
+
+ # Image OCR
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
+ ```
+
+ ### Data Cleaning
+
+ ```python
+ # Three cleaning modes
+ dm.clean_data(method_list=[
+ "abnormal",  # Anomaly data processing
+ "private",  # Privacy information masking
+ "filter"  # Text filtering and normalization
+ ])
+ ```
+
+ ### AI Annotation
+
+ ```python
+ # Custom annotation tasks
+ qa_data = dm.get_pre_label(
+ api_key="sk-xxx",
+ base_url="https://api.provider.com/v1",
+ model_name="model-name",
+ chunk_size=500,  # Text chunk size
+ chunk_overlap=100,  # Overlap length
+ question_number=5,  # Questions per chunk
+ max_workers=5  # Concurrency
+ )
+ ```
+
+ ## ⚙️ Environment Setup
+
+ ### Optional Dependencies
+
+ #### LibreOffice (DOC file support)
+
+ **Ubuntu/Debian:**
+ ```bash
+ sudo apt-get install libreoffice
+ ```
+
+ **Windows:**
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
+
+ #### MinerU (Advanced PDF parsing)
+
+ ```bash
+ # Create virtual environment
+ conda create -n mineru python=3.10
+ conda activate mineru
+
+ # Install MinerU
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
+ ```
+
+ For detailed configuration, please refer to the [MinerU Documentation](https://github.com/opendatalab/MinerU)
+
+ ## 🛠️ Development
+
+ ### Local Installation
+
+ ```bash
+ git clone https://github.com/Hi-Dolphin/datamax.git
+ cd datamax
+ pip install -r requirements.txt
+ python setup.py install
+ ```
+
+ ## 📋 System Requirements
+
+ - Python >= 3.10
+ - Supports Windows, macOS, Linux
+
+ ## 🤝 Contributing
+
+ Issues and Pull Requests are welcome!
+
+ ## 📄 License
+
+ This project is licensed under the [MIT License](LICENSE).
+
+ ## 📞 Contact Us
+
+ - 📧 Email: cy.kron@foxmail.com
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
+
+ ---
+
+ ⭐ If this project helps you, please give us a star!
pydatamax-0.1.13.dist-info/RECORD → pydatamax-0.1.14.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
  datamax/__init__.py,sha256=Kbs8ITE6suPy0VL8WzKH8A_iAGqukC0jIHcFGLgoBw8,28
- datamax/loader/MinioHandler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
- datamax/loader/OssHandler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
  datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamax/loader/core.py,sha256=tSIkOw5D3EVFYme1b7joFt0e_LxJdf-mdUzxpyVt0VI,5098
+ datamax/loader/core.py,sha256=C5VhDXkv0HTRcF6WWtztatUZHURZBD-KoZpKyqXfD7U,5100
+ datamax/loader/minio_handler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
+ datamax/loader/oss_handler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
  datamax/parser/__init__.py,sha256=Jilq2PLBNonmoXKATzsIHWWvFuBdlcV2dbSP1cOZ6zg,111
  datamax/parser/base.py,sha256=riGcMn4m295_qf9O0-NbHU2BcHGBXvoF4T3fWj9vgUQ,2514
  datamax/parser/core.py,sha256=9rzIjsVTRacPTUTAVa5gm5fx0h95LxYnw0lEGqjIIB4,11437
  datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
- datamax/parser/doc_parser.py,sha256=WIWZqvWT4bbquMn1t5Y4P3rEFG6YZ6z3b-f-5yCEtwU,8266
- datamax/parser/docx_parser.py,sha256=Ipk9ea281N8Edj74tnqUpc_MGZgD4qn780MX_QA9SiU,9111
+ datamax/parser/doc_parser.py,sha256=wVNYn7dkPI12pW_YImDgoceLFWS3RvFpzbFQwVlrnNo,7936
+ datamax/parser/docx_parser.py,sha256=o9_1VBDc8nBCmsEMv0sKsVcPMxiAxY5pl0mUvOcoOJc,8796
  datamax/parser/epub_parser.py,sha256=ljCGxLBPwE5gXVKARJec93VpP4dE9R2GspzuSZBkqPQ,1557
  datamax/parser/html_parser.py,sha256=xQaaK8674QbQwE-Up9X0DJIH0Gg0mR2KoI7fJ6iw2m0,1393
  datamax/parser/image_parser.py,sha256=qGCndc_21PwsfuxFG03wHSsV0uc-XMBaW3VDbsJQd90,1233
@@ -19,7 +19,7 @@ datamax/parser/ppt_parser.py,sha256=Niu3Ina6I6m6lAMS1Z-A7rUbR_iFGmNTaASBoNH_vZ0,
  datamax/parser/pptx_parser.py,sha256=sFWyOa3QNIs4BgtpmSzFQgsgPmunfGqCqi6fulbLFW0,1811
  datamax/parser/txt_parser.py,sha256=4DIP1LVOw21NDdtqG2RTD_hMcHufkvC8kr048AkuLFs,1682
  datamax/parser/xls_parser.py,sha256=pRlqgg96f76H8UqXQfheQT9O0ThdP7958hKUCEyQfPM,954
- datamax/parser/xlsx_parser.py,sha256=tyLU6wa3F31p7JaoCpML6TJyzYd2Lpeuhzs4036en2U,9274
+ datamax/parser/xlsx_parser.py,sha256=Vw6XfoQyu6aQUSIueR-krByMA_WOb5fasf4VmKxjVio,8905
  datamax/utils/__init__.py,sha256=d69SJvqOXzItyg9rEcLc4z67Lw9vACispOe3x7NvZLA,1051
  datamax/utils/constants.py,sha256=A0S56mkIfeT6oQmOd-VGTChzLOSBUqsG4skMmLt6uNk,4507
  datamax/utils/data_cleaner.py,sha256=zlk2dXmhU-_9KVfqmqMGr967v-nc7Iv8ZKRdMkIJsGM,7784
@@ -30,10 +30,10 @@ datamax/utils/paddleocr_pdf_operator.py,sha256=Tnb-5SzUd6OXM-XeaL8vdPnsOhgG_GKz-
  datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
  datamax/utils/qa_generator.py,sha256=d75an9JEyT6sxlSjdmWYveQshfyTb0v4aGSuTpTJa0A,12561
  datamax/utils/tokenizer.py,sha256=Y8XB06XQVsNuG8IPl_4iBZj2yu1xzXldVbmZtXFMQM4,859
- pydatamax-0.1.13.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
+ pydatamax-0.1.14.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_basic.py,sha256=4AByx25-MIt6_zmzxpFRoSCBqLtIjyfTwFLb1UCJz6k,303
- pydatamax-0.1.13.dist-info/METADATA,sha256=knte2YZ9jdSGxmO0fzBVtMFAcq1exCKyEdfBde4aCjA,9731
- pydatamax-0.1.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- pydatamax-0.1.13.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
- pydatamax-0.1.13.dist-info/RECORD,,
+ pydatamax-0.1.14.dist-info/METADATA,sha256=P-wz8Log3gcUMftKTd2qrcmNuzpp-HOn8gVVe8cTceM,6314
+ pydatamax-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ pydatamax-0.1.14.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
+ pydatamax-0.1.14.dist-info/RECORD,,
pydatamax-0.1.13.dist-info/METADATA DELETED
@@ -1,280 +0,0 @@
- Metadata-Version: 2.4
- Name: pydatamax
- Version: 0.1.13
- Summary: A library for parsing and converting various file formats.
- Home-page: https://github.com/Hi-Dolphin/datamax
- Author: ccy
- Author-email: cy.kron@foxmail.com
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: oss2<3.0.0,>=2.19.1
- Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
- Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
- Requires-Dist: crcmod<2.0.0,>=1.7
- Requires-Dist: langdetect<2.0.0,>=1.0.9
- Requires-Dist: loguru<1.0.0,>=0.7.3
- Requires-Dist: python-docx<2.0.0,>=1.1.2
- Requires-Dist: python-dotenv<2.0.0,>=1.1.0
- Requires-Dist: pymupdf<2.0.0,>=1.26.0
- Requires-Dist: pypdf<6.0.0,>=5.5.0
- Requires-Dist: openpyxl<4.0.0,>=3.1.5
- Requires-Dist: pandas<3.0.0,>=2.2.3
- Requires-Dist: numpy<3.0.0,>=2.2.6
- Requires-Dist: requests<3.0.0,>=2.32.3
- Requires-Dist: tqdm<5.0.0,>=4.67.1
- Requires-Dist: pydantic<3.0.0,>=2.11.5
- Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
- Requires-Dist: python-magic<1.0.0,>=0.4.27
- Requires-Dist: PyYAML<7.0.0,>=6.0.2
- Requires-Dist: Pillow<12.0.0,>=11.2.1
- Requires-Dist: packaging<25.0,>=24.2
- Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
- Requires-Dist: minio<8.0.0,>=7.2.15
- Requires-Dist: openai<2.0.0,>=1.82.0
- Requires-Dist: jionlp<2.0.0,>=1.5.23
- Requires-Dist: chardet<6.0.0,>=5.2.0
- Requires-Dist: python-pptx<2.0.0,>=1.0.2
- Requires-Dist: tiktoken<1.0.0,>=0.9.0
- Requires-Dist: markitdown<1.0.0,>=0.1.1
- Requires-Dist: xlrd<3.0.0,>=2.0.1
- Requires-Dist: tabulate<1.0.0,>=0.9.0
- Requires-Dist: unstructured<1.0.0,>=0.17.2
- Requires-Dist: markdown<4.0.0,>=3.8
- Requires-Dist: langchain<1.0.0,>=0.3.0
- Requires-Dist: langchain-community<1.0.0,>=0.3.0
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license-file
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
-
- # DataMax
-
- ## Overview
- DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
-
- ## Key Features
-
- ### File Processing Capabilities
- Currently supports reading, conversion, and extraction from:
- - PDF, HTML
- - DOCX/DOC, PPT/PPTX
- - EPUB
- - Images
- - XLS/XLSX spreadsheets
- - Plain text (TXT)
-
- ### Data Cleaning Pipeline
- Three-tiered cleaning process:
- 1. Anomaly detection and handling
- 2. Privacy protection processing
- 3. Text filtering and normalization
-
- ### AI-Powered Data Annotation
- Implements an LLM+Prompt to:
- - Continuously generate pre-labeled datasets
- - Provide optimized training data for model fine-tuning
-
-
- ## Installation Guide (Key Dependencies)
- Dependencies include libreoffice, datamax, and MinerU.
-
- ### 1. Installing libreoffice Dependency
- **Note:** Without LibreOffice, .doc files will not be supported.
-
- #### Linux (Debian/Ubuntu)
- ```bash
- sudo apt-get update
- sudo apt-get install libreoffice
- ```
- ### Windows
- ```text
- Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
- Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
- ```
- ### Checking LibreOffice Installation
- ```bash
- soffice --version
- ```
-
- ## 2. Installing MinerU Dependency
- Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
- ### Create a Virtual Environment and Install Basic Dependencies
- ```bash
- conda create -n mineru python=3.10
- conda activate mineru
- pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
- ```
- ### Installing Model Weight Files
- https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
- ```bash
- pip install modelscope
- wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
- python download_models.py
- ```
-
- ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
- ```json
- {
- "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
- "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
- "device-mode": "cpu",
- ...
- }
- ```
-
- ## 3. Installing Basic Dependencies for datamax
- 1. Clone the repository to your local machine:
- ```bash
- git clone <repository-url>
- ```
- 2. Install dependencies into conda:
- ```bash
- cd datamax
- pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
- ```
-
-
- ## Features
- - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
- - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
- - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
- - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
- - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
- - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
-
-
- ## Technology Stack
-
- - **Programming Language**: Python >= 3.10
- - **Dependency Libraries**:
- - PyMuPDF: For PDF file parsing.
- - BeautifulSoup: For HTML file parsing.
- - python-docx: For DOCX file parsing.
- - pandas: For data processing and conversion.
- - paddleocr: For parsing scanned PDFs, tables, and images.
- - **Development Environment**: Visual Studio Code or PyCharm
- - **Version Control**: Git
-
- ## Usage Instructions
- ### Installing the SDK
- - **Installation Commands**:
- ```bash
- ## Local Installation
- python setup.py sdist bdist_wheel
- pip install dist/datamax-0.1.3-py3-none-any.whl
-
- ## Pip Installation
- pip install pydatamax
- ```
-
-
- - **Importing the Code**:
- ```python
- # File Parsing
- from datamax import DataMax
-
- ## Handling a Single File in Two Ways
- # 1. Using a List of Length 1
- data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
- data = data.get_data()
-
- # 2. Using a String
- data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
- data = data.get_data()
-
- ## Handling Multiple Files
- # 1. Using a List of Length n
- data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
- data = data.get_data()
-
- # 2. Passing a Folder Path as a String
- data = DataMax(file_path=r"docx_files_example/")
- data = data.get_data()
-
- # Data Cleaning
- """
- Cleaning rules can be found in datamax/utils/data_cleaner.py
- abnormal: Abnormal cleaning
- private: Privacy processing
- filter: Text filtering
- """
- # Direct Use: Clean the text parameter directly and return a string
- dm = DataMax()
- data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
-
- # Process Use: Use after get_data() to return the complete data structure
- dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
- data2 = dm.get_data()
- cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
-
- # Large Model Pre-annotation Supporting any model that can be called via OpenAI SDK
- data = DataMax(file_path=r"path\to\xxx.docx")
- parsed_data = data.get_data()
- # If no custom messages are passed, the default messages in the SDK will be used
- messages = [
- {'role': 'system', 'content': 'You are a helpful assistant.'},
- {'role': 'user', 'content': 'Who are you?'}
- ]
- qa_datas = data.get_pre_label(
- api_key="sk-xxx",
- base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
- model_name="qwen-max",
- chunk_size=500,
- chunk_overlap=100,
- question_number=5,
- max_workers=5,
- # message=[]
- )
- print(f'Annotated result:{qa_datas}')
- ```
-
-
- ## Examples
- ```python
- ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
- from datamax import DataMax
- data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
- """
- Parameters:
- file_path: Relative file path / Absolute file path
- to_markdown: Whether to convert to markdown (default value False, directly returns text) This parameter only supports word files (doc | docx)
- """
-
- ## jpg | jpeg | png | ...(image types)
- data = DataMax(file_path=r"image.jpg", use_mineru=True)
- """
- Parameters:
- file_path: Relative file path / Absolute file path
- use_mineru: Whether to use MinerU enhancement
- """
-
- ## pdf
- from datamax import DataMax
- data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
- """
- Parameters:
- file_path: Relative file path / Absolute file path
- use_mineru: Whether to use MinerU enhancement
- """
- ```
-
- ## Contribution Guide
- We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
- ## License
- This project is licensed under the MIT License. For more details, see the LICENSE file.
-
- ## Contact Information
- If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
- - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
- - Project Homepage: GitHub Project Link
-
pydatamax-0.1.13.dist-info/WHEEL → pydatamax-0.1.14.dist-info/WHEEL RENAMED
File without changes
pydatamax-0.1.13.dist-info/top_level.txt → pydatamax-0.1.14.dist-info/top_level.txt RENAMED
File without changes