jarvis-ai-assistant 0.1.138__py3-none-any.whl → 0.1.141__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of jarvis-ai-assistant might be problematic. Click here for more details.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +62 -14
- jarvis/jarvis_agent/builtin_input_handler.py +4 -14
- jarvis/jarvis_agent/main.py +1 -1
- jarvis/jarvis_agent/patch.py +37 -40
- jarvis/jarvis_agent/shell_input_handler.py +2 -3
- jarvis/jarvis_code_agent/code_agent.py +23 -30
- jarvis/jarvis_code_analysis/checklists/__init__.py +3 -0
- jarvis/jarvis_code_analysis/checklists/c_cpp.py +50 -0
- jarvis/jarvis_code_analysis/checklists/csharp.py +75 -0
- jarvis/jarvis_code_analysis/checklists/data_format.py +82 -0
- jarvis/jarvis_code_analysis/checklists/devops.py +107 -0
- jarvis/jarvis_code_analysis/checklists/docs.py +87 -0
- jarvis/jarvis_code_analysis/checklists/go.py +52 -0
- jarvis/jarvis_code_analysis/checklists/infrastructure.py +98 -0
- jarvis/jarvis_code_analysis/checklists/java.py +66 -0
- jarvis/jarvis_code_analysis/checklists/javascript.py +73 -0
- jarvis/jarvis_code_analysis/checklists/kotlin.py +107 -0
- jarvis/jarvis_code_analysis/checklists/loader.py +76 -0
- jarvis/jarvis_code_analysis/checklists/php.py +77 -0
- jarvis/jarvis_code_analysis/checklists/python.py +56 -0
- jarvis/jarvis_code_analysis/checklists/ruby.py +107 -0
- jarvis/jarvis_code_analysis/checklists/rust.py +58 -0
- jarvis/jarvis_code_analysis/checklists/shell.py +75 -0
- jarvis/jarvis_code_analysis/checklists/sql.py +72 -0
- jarvis/jarvis_code_analysis/checklists/swift.py +77 -0
- jarvis/jarvis_code_analysis/checklists/web.py +97 -0
- jarvis/jarvis_code_analysis/code_review.py +660 -0
- jarvis/jarvis_dev/main.py +61 -88
- jarvis/jarvis_git_squash/main.py +3 -3
- jarvis/jarvis_git_utils/git_commiter.py +242 -0
- jarvis/jarvis_init/main.py +62 -0
- jarvis/jarvis_platform/base.py +4 -0
- jarvis/jarvis_platform/kimi.py +173 -5
- jarvis/jarvis_platform/openai.py +3 -0
- jarvis/jarvis_platform/registry.py +1 -0
- jarvis/jarvis_platform/yuanbao.py +275 -5
- jarvis/jarvis_tools/ask_codebase.py +6 -9
- jarvis/jarvis_tools/ask_user.py +17 -5
- jarvis/jarvis_tools/base.py +3 -1
- jarvis/jarvis_tools/chdir.py +1 -0
- jarvis/jarvis_tools/create_code_agent.py +4 -3
- jarvis/jarvis_tools/create_sub_agent.py +1 -0
- jarvis/jarvis_tools/execute_script.py +170 -0
- jarvis/jarvis_tools/file_analyzer.py +90 -239
- jarvis/jarvis_tools/file_operation.py +99 -31
- jarvis/jarvis_tools/{find_methodolopy.py → find_methodology.py} +2 -1
- jarvis/jarvis_tools/lsp_get_diagnostics.py +2 -0
- jarvis/jarvis_tools/methodology.py +11 -11
- jarvis/jarvis_tools/read_code.py +2 -0
- jarvis/jarvis_tools/read_webpage.py +33 -196
- jarvis/jarvis_tools/registry.py +68 -131
- jarvis/jarvis_tools/search_web.py +14 -6
- jarvis/jarvis_tools/virtual_tty.py +399 -0
- jarvis/jarvis_utils/config.py +29 -3
- jarvis/jarvis_utils/embedding.py +0 -317
- jarvis/jarvis_utils/file_processors.py +343 -0
- jarvis/jarvis_utils/input.py +0 -1
- jarvis/jarvis_utils/methodology.py +94 -435
- jarvis/jarvis_utils/utils.py +207 -9
- {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/METADATA +4 -4
- jarvis_ai_assistant-0.1.141.dist-info/RECORD +94 -0
- {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/entry_points.txt +4 -4
- jarvis/jarvis_code_agent/file_select.py +0 -202
- jarvis/jarvis_platform/ai8.py +0 -268
- jarvis/jarvis_platform/ollama.py +0 -137
- jarvis/jarvis_platform/oyi.py +0 -307
- jarvis/jarvis_rag/file_processors.py +0 -138
- jarvis/jarvis_rag/main.py +0 -1734
- jarvis/jarvis_tools/code_review.py +0 -333
- jarvis/jarvis_tools/execute_python_script.py +0 -58
- jarvis/jarvis_tools/execute_shell.py +0 -97
- jarvis/jarvis_tools/execute_shell_script.py +0 -58
- jarvis/jarvis_tools/find_caller.py +0 -278
- jarvis/jarvis_tools/find_symbol.py +0 -295
- jarvis/jarvis_tools/function_analyzer.py +0 -331
- jarvis/jarvis_tools/git_commiter.py +0 -167
- jarvis/jarvis_tools/project_analyzer.py +0 -304
- jarvis/jarvis_tools/rag.py +0 -143
- jarvis/jarvis_tools/tool_generator.py +0 -221
- jarvis_ai_assistant-0.1.138.dist-info/RECORD +0 -85
- /jarvis/{jarvis_rag → jarvis_init}/__init__.py +0 -0
- {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/top_level.txt +0 -0
jarvis/jarvis_platform/oyi.py
DELETED
|
@@ -1,307 +0,0 @@
|
|
|
1
|
-
import mimetypes
|
|
2
|
-
import os
|
|
3
|
-
from typing import Dict, List, Tuple
|
|
4
|
-
from jarvis.jarvis_platform.base import BasePlatform
|
|
5
|
-
import requests
|
|
6
|
-
import json
|
|
7
|
-
|
|
8
|
-
from jarvis.jarvis_utils.output import OutputType, PrettyOutput
|
|
9
|
-
|
|
10
|
-
class OyiModel(BasePlatform):
    """Chat platform implementation that talks to the Oyi HTTP API.

    The instance keeps at most one server-side conversation (``self.conversation``)
    together with a local copy of the exchanged messages (``self.messages``).
    The model catalogue is fetched once and cached in ``self.models``.
    """

    platform_name = "oyi"
    BASE_URL = "https://api-10086.rcouyi.com"

    # Values shared by every request; kept in one place so the four request
    # sites cannot drift apart.
    _WEB_ORIGIN = 'https://ai.rcouyi.com'
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'

    def __init__(self):
        """Read configuration from the environment and warn about obvious problems.

        Reads ``OYI_API_KEY`` (token) and ``JARVIS_MODEL`` (model name, default
        ``deepseek-chat``). Fetches the model catalogue once to check that the
        selected model is known; a missing token or unknown model only produces
        a warning.
        """
        super().__init__()
        self.models = {}
        self.messages = []
        self.system_message = ""
        self.conversation = None
        self.first_chat = True

        self.token = os.getenv("OYI_API_KEY")
        if not self.token:
            PrettyOutput.print("OYI_API_KEY 未设置", OutputType.WARNING)

        self.model_name = os.getenv("JARVIS_MODEL") or "deepseek-chat"
        # Compare against the first whitespace-separated token of each model
        # name; blank names are skipped instead of raising IndexError.
        known_names = {m.split()[0] for m in self.get_available_models() if m.split()}
        if self.model_name not in known_names:
            PrettyOutput.print(f"警告: 选择的模型 {self.model_name} 不在可用列表中", OutputType.WARNING)

    def _build_headers(self, *, authorized: bool = True, web_client: bool = True) -> Dict[str, str]:
        """Build the HTTP headers for one request.

        Args:
            authorized: Include the ``Authorization: Bearer <token>`` header.
            web_client: Send the wider ``Accept`` value plus ``Origin`` and
                ``Referer`` headers pointing at the web front end.

        Returns:
            Dict[str, str]: A fresh header dictionary.
        """
        headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json, text/plain, */*' if web_client else 'application/json',
            'User-Agent': self._USER_AGENT,
        }
        if authorized:
            headers['Authorization'] = f'Bearer {self.token}'
        if web_client:
            headers['Origin'] = self._WEB_ORIGIN
            headers['Referer'] = f'{self._WEB_ORIGIN}/'
        return headers

    def get_model_list(self) -> List[Tuple[str, str]]:
        """Return ``(model name, display description)`` pairs for enabled models."""
        # Populates self.models (and each entry's 'desc') on first use.
        self.get_available_models()
        return [(name, info['desc']) for name, info in self.models.items()]

    def set_model_name(self, model_name: str):
        """Select the model used for conversations created from now on."""
        self.model_name = model_name

    def set_system_message(self, message: str):
        """Store the system message that is prepended to the first chat turn."""
        self.system_message = message

    def name(self) -> str:
        """Return the currently selected model name."""
        return self.model_name

    def create_conversation(self) -> bool:
        """Create a new server-side conversation and remember it.

        Returns:
            bool: True when the service reports success (the full response is
            stored in ``self.conversation``); False on any failure, which is
            reported through ``PrettyOutput`` rather than raised.
        """
        try:
            payload = {
                "id": 0,
                "roleId": 0,
                "title": "New conversation",
                "isLock": False,
                "systemMessage": "",
                # The API expects the parameters as a JSON-encoded string.
                "params": json.dumps({
                    "model": self.model_name,
                    "is_webSearch": True,
                    "message": [],
                    "systemMessage": None,
                    "requestMsgCount": 65536,
                    "temperature": 0.8,
                    "speechVoice": "Alloy",
                    "max_tokens": 8192,
                    "chatPluginIds": []
                })
            }

            response = requests.post(
                f"{self.BASE_URL}/chatapi/chat/save",
                headers=self._build_headers(web_client=False),
                json=payload
            )

            if response.status_code != 200:
                PrettyOutput.print(f"创建会话失败: {response.status_code}", OutputType.WARNING)
                return False

            data = response.json()
            if data['code'] == 200 and data['type'] == 'success':
                self.conversation = data
                return True

            PrettyOutput.print(f"创建会话失败: {data['message']}", OutputType.WARNING)
            return False

        except Exception as e:
            PrettyOutput.print(f"创建会话失败: {str(e)}", OutputType.ERROR)
            return False

    def chat(self, message: str) -> str:
        """Send one user message and stream back the model's reply.

        Args:
            message: User input message.

        Returns:
            str: The complete model response.

        Raises:
            Exception: If the conversation cannot be created, a request returns
                a non-200 status, or the service reports a failure. The error is
                printed before being re-raised.
        """
        # Local import keeps this block self-contained; codecs is standard library.
        import codecs

        try:
            # Make sure a conversation exists before sending anything.
            if not self.conversation:
                if not self.create_conversation():
                    raise Exception("Failed to create conversation")

            headers = self._build_headers()

            payload = {
                "topicId": self.conversation['result']['id'] if self.conversation else None,
                "messages": self.messages,
                "content": message,
                "contentFiles": []
            }

            # On the first turn the system message is prepended to the user message.
            if self.first_chat:
                message = self.system_message + "\n" + message
                payload["content"] = message
                self.first_chat = False

            # payload["messages"] is the same list object, so the request body
            # also contains this new entry.
            self.messages.append({"role": "user", "content": message})

            # Step 1: submit the message.
            response = requests.post(
                f"{self.BASE_URL}/chatapi/chat/message",
                headers=headers,
                json=payload
            )

            if response.status_code != 200:
                error_msg = f"聊天请求失败: {response.status_code}"
                PrettyOutput.print(error_msg, OutputType.WARNING)
                raise Exception(error_msg)

            data = response.json()
            if data['code'] != 200 or data['type'] != 'success':
                error_msg = f"聊天失败: {data.get('message', '未知错误')}"
                PrettyOutput.print(error_msg, OutputType.WARNING)
                raise Exception(error_msg)

            message_id = data['result'][-1]

            # Step 2: stream the reply for that message id.
            response = requests.post(
                f"{self.BASE_URL}/chatapi/chat/message/{message_id}",
                headers=headers,
                stream=True
            )

            if response.status_code != 200:
                error_msg = f"获取响应失败: {response.status_code}"
                PrettyOutput.print(error_msg, OutputType.WARNING)
                raise Exception(error_msg)

            # Always read raw bytes and decode incrementally: a multi-byte
            # UTF-8 character may be split across chunks, and asking requests
            # to decode would yield str objects whenever the response declares
            # an encoding.
            decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
            pieces = []

            def emit(text: str) -> None:
                if not text:
                    return
                if not self.suppress_output:
                    PrettyOutput.print_stream(text)
                pieces.append(text)

            for chunk in response.iter_content(chunk_size=None):
                if chunk:
                    emit(decoder.decode(chunk))
            # Flush whatever is still buffered (an incomplete trailing sequence).
            emit(decoder.decode(b"", final=True))

            PrettyOutput.print_stream_end()

            full_response = "".join(pieces)
            self.messages.append({"role": "assistant", "content": full_response})
            return full_response

        except Exception as e:
            PrettyOutput.print(f"聊天失败: {str(e)}", OutputType.ERROR)
            # Bare raise keeps the original traceback intact.
            raise

    def delete_chat(self) -> bool:
        """Delete the current conversation on the server and reset local state.

        Returns:
            bool: True when there is nothing to delete or the service confirms
            the deletion; False on any failure (reported via ``PrettyOutput``).
        """
        try:
            if not self.conversation:
                return True

            response = requests.post(
                f"{self.BASE_URL}/chatapi/chat/{self.conversation['result']['id']}",
                headers=self._build_headers(),
                json={}
            )

            if response.status_code != 200:
                error_msg = f"删除会话请求失败: {response.status_code}"
                PrettyOutput.print(error_msg, OutputType.WARNING)
                return False

            data = response.json()
            if data['code'] == 200 and data['type'] == 'success':
                self.messages = []
                self.conversation = None
                self.first_chat = True
                return True

            error_msg = f"删除会话失败: {data.get('message', '未知错误')}"
            PrettyOutput.print(error_msg, OutputType.WARNING)
            return False

        except Exception as e:
            PrettyOutput.print(f"删除会话失败: {str(e)}", OutputType.ERROR)
            return False

    def get_available_models(self) -> List[str]:
        """Return the sorted names of all enabled models.

        The catalogue is downloaded on first use and cached in ``self.models``;
        each cached entry gains a human-readable ``'desc'`` field.

        Returns:
            List[str]: Sorted model names, or an empty list if the catalogue
            could not be fetched (a warning is printed in that case).
        """
        try:
            if self.models:
                # Same ordering as the freshly fetched path below.
                return sorted(self.models)

            response = requests.get(
                f"{self._WEB_ORIGIN}/config/system.json",
                headers=self._build_headers(authorized=False)
            )

            if response.status_code != 200:
                PrettyOutput.print(f"获取模型列表失败: {response.status_code}", OutputType.WARNING)
                return []

            data = response.json()

            # Keep only the models that are enabled.
            self.models = {
                model['value']: model
                for model in data.get('model', [])
                if model.get('enable', False)
            }

            # Build the display description for every model.
            names = []
            for model in self.models.values():
                description = model['label']

                # Append the suffix tags, if any.
                suffix = model.get('suffix', [])
                if suffix:
                    if isinstance(suffix[0], dict):
                        # Newer format: list of dicts carrying a 'tag' key.
                        suffix_str = ', '.join(s.get('tag', '') for s in suffix)
                    else:
                        # Older format: list of plain values.
                        suffix_str = ', '.join(str(s) for s in suffix)
                    description += f" ({suffix_str})"

                # Append the tooltip, falling back to the description.
                info = model.get('tooltip') or model.get('description', '')
                if info:
                    description += f" - {info}"

                model['desc'] = description
                names.append(model['value'])

            return sorted(names)

        except Exception as e:
            PrettyOutput.print(f"获取模型列表失败: {str(e)}", OutputType.WARNING)
            return []
|
|
jarvis/jarvis_rag/file_processors.py
DELETED
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
import fitz # PyMuPDF for PDF files
|
|
3
|
-
from docx import Document as DocxDocument # python-docx for DOCX files
|
|
4
|
-
from pptx import Presentation
|
|
5
|
-
import pandas as pd
|
|
6
|
-
import unicodedata
|
|
7
|
-
|
|
8
|
-
class FileProcessor:
    """Interface shared by all file processors.

    Both operations are static and must be overridden; the versions here
    only raise ``NotImplementedError``.
    """

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Tell whether this processor is able to process ``file_path``."""
        raise NotImplementedError()

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the textual content of ``file_path``."""
        raise NotImplementedError()
|
|
19
|
-
|
|
20
|
-
class TextFileProcessor(FileProcessor):
    """Processor for plain-text files stored in one of a fixed set of encodings."""

    # Tried in this order; the first encoding that decodes without error wins.
    ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin1']
    SAMPLE_SIZE = 8192  # Read the first 8KB to detect encoding

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Decide whether ``file_path`` looks like a text file.

        A sample of at most ``SAMPLE_SIZE`` bytes is inspected. The file is
        rejected when it cannot be read, is empty, contains a NUL byte, has
        more than 30% control bytes (other than tab, newline and carriage
        return), or cannot be decoded with any entry of ``ENCODINGS``.

        Returns:
            bool: True if the sample passes every check, otherwise False.
        """
        try:
            with open(file_path, 'rb') as f:
                sample = f.read(TextFileProcessor.SAMPLE_SIZE)
        except (OSError, ValueError, TypeError):
            # Unreadable file or unusable path: not something we can handle.
            return False

        # An empty file is rejected explicitly (this also avoids dividing by
        # zero in the ratio check below).
        if not sample:
            return False

        # A NUL byte usually indicates a binary file.
        if b'\x00' in sample:
            return False

        # Too many control bytes also indicates a binary file.
        non_printable = sum(1 for byte in sample if byte < 32 and byte not in (9, 10, 13))
        if non_printable / len(sample) > 0.3:
            return False

        for encoding in TextFileProcessor.ENCODINGS:
            try:
                sample.decode(encoding)
            except UnicodeDecodeError:
                continue
            return True

        return False

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Read ``file_path`` and return its text, NFKC-normalised.

        The raw bytes are decoded with the first entry of ``ENCODINGS`` that
        succeeds. Line endings are normalised to ``\\n``.

        Returns:
            str: The decoded, normalised file content.

        Raises:
            Exception: If the file cannot be read or decoded; the underlying
                error is attached as the cause.
        """
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read()

            content = None
            for encoding in TextFileProcessor.ENCODINGS:
                try:
                    content = raw_data.decode(encoding)
                except UnicodeDecodeError:
                    continue
                break

            if content is None:
                # UnicodeDecodeError cannot be built from a single message
                # (its constructor takes five arguments), so a plain
                # ValueError carries the message instead.
                raise ValueError(f"Failed to decode file with supported encodings: {file_path}")

            # Same line-ending translation a text-mode read would apply.
            content = content.replace('\r\n', '\n').replace('\r', '\n')

            # Normalize Unicode characters
            return unicodedata.normalize('NFKC', content)

        except Exception as e:
            raise Exception(f"Failed to read file: {str(e)}") from e
|
|
87
|
-
|
|
88
|
-
class PDFProcessor(FileProcessor):
    """Processor that extracts text from PDF files via PyMuPDF (``fitz``)."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths whose extension is ``.pdf`` (case-insensitive)."""
        extension = Path(file_path).suffix
        return extension.lower() == '.pdf'

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of every page, joined with newlines."""
        with fitz.open(file_path) as document:  # type: ignore
            page_texts = [page.get_text() for page in document]  # type: ignore
        return "\n".join(page_texts)
|
|
101
|
-
|
|
102
|
-
class DocxProcessor(FileProcessor):
    """Processor that extracts paragraph text from DOCX files via python-docx."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths whose extension is ``.docx`` (case-insensitive)."""
        extension = Path(file_path).suffix
        return extension.lower() == '.docx'

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of all paragraphs, one per line."""
        document = DocxDocument(file_path)
        lines = []
        for paragraph in document.paragraphs:
            lines.append(paragraph.text)
        return "\n".join(lines)
|
|
112
|
-
|
|
113
|
-
class PPTProcessor(FileProcessor):
    """Processor that extracts shape text from presentations via python-pptx."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths ending in ``.ppt`` or ``.pptx`` (case-insensitive)."""
        # NOTE(review): legacy .ppt is accepted here; confirm that the
        # Presentation loader used below can actually open that format.
        extension = Path(file_path).suffix.lower()
        return extension in ['.ppt', '.pptx']

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of every shape that exposes one, joined with newlines."""
        presentation = Presentation(file_path)
        fragments = [
            shape.text  # type: ignore
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n".join(fragments)
|
|
128
|
-
|
|
129
|
-
class ExcelProcessor(FileProcessor):
    """Processor that renders an Excel workbook as text via pandas."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths ending in ``.xls`` or ``.xlsx`` (case-insensitive)."""
        extension = Path(file_path).suffix.lower()
        return extension in ['.xls', '.xlsx']

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the string rendering of the loaded DataFrame."""
        # read_excel is called with its defaults, so only the default sheet
        # selection is loaded.
        return pd.read_excel(file_path).to_string()
|