jarvis-ai-assistant 0.1.130__py3-none-any.whl → 0.1.132__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +71 -38
- jarvis/jarvis_agent/builtin_input_handler.py +73 -0
- jarvis/{jarvis_code_agent → jarvis_agent}/file_input_handler.py +1 -1
- jarvis/jarvis_agent/main.py +1 -1
- jarvis/{jarvis_code_agent → jarvis_agent}/patch.py +77 -55
- jarvis/{jarvis_code_agent → jarvis_agent}/shell_input_handler.py +1 -2
- jarvis/jarvis_code_agent/code_agent.py +93 -88
- jarvis/jarvis_dev/main.py +335 -626
- jarvis/jarvis_git_squash/main.py +11 -32
- jarvis/jarvis_lsp/base.py +2 -26
- jarvis/jarvis_lsp/cpp.py +2 -14
- jarvis/jarvis_lsp/go.py +0 -13
- jarvis/jarvis_lsp/python.py +1 -30
- jarvis/jarvis_lsp/registry.py +10 -14
- jarvis/jarvis_lsp/rust.py +0 -12
- jarvis/jarvis_multi_agent/__init__.py +20 -29
- jarvis/jarvis_platform/ai8.py +7 -32
- jarvis/jarvis_platform/base.py +2 -7
- jarvis/jarvis_platform/kimi.py +3 -144
- jarvis/jarvis_platform/ollama.py +54 -68
- jarvis/jarvis_platform/openai.py +0 -4
- jarvis/jarvis_platform/oyi.py +0 -75
- jarvis/jarvis_platform/registry.py +1 -1
- jarvis/jarvis_platform/yuanbao.py +264 -0
- jarvis/jarvis_platform_manager/main.py +3 -3
- jarvis/jarvis_rag/file_processors.py +138 -0
- jarvis/jarvis_rag/main.py +1305 -425
- jarvis/jarvis_tools/ask_codebase.py +227 -41
- jarvis/jarvis_tools/code_review.py +229 -166
- jarvis/jarvis_tools/create_code_agent.py +76 -72
- jarvis/jarvis_tools/create_sub_agent.py +32 -15
- jarvis/jarvis_tools/execute_python_script.py +58 -0
- jarvis/jarvis_tools/execute_shell.py +15 -28
- jarvis/jarvis_tools/execute_shell_script.py +2 -2
- jarvis/jarvis_tools/file_analyzer.py +271 -0
- jarvis/jarvis_tools/file_operation.py +3 -3
- jarvis/jarvis_tools/find_caller.py +213 -0
- jarvis/jarvis_tools/find_symbol.py +211 -0
- jarvis/jarvis_tools/function_analyzer.py +248 -0
- jarvis/jarvis_tools/git_commiter.py +89 -70
- jarvis/jarvis_tools/lsp_find_definition.py +83 -67
- jarvis/jarvis_tools/lsp_find_references.py +62 -46
- jarvis/jarvis_tools/lsp_get_diagnostics.py +90 -74
- jarvis/jarvis_tools/methodology.py +89 -48
- jarvis/jarvis_tools/project_analyzer.py +220 -0
- jarvis/jarvis_tools/read_code.py +24 -3
- jarvis/jarvis_tools/read_webpage.py +195 -81
- jarvis/jarvis_tools/registry.py +132 -11
- jarvis/jarvis_tools/search_web.py +73 -30
- jarvis/jarvis_tools/tool_generator.py +7 -9
- jarvis/jarvis_utils/__init__.py +1 -0
- jarvis/jarvis_utils/config.py +67 -3
- jarvis/jarvis_utils/embedding.py +344 -45
- jarvis/jarvis_utils/git_utils.py +18 -2
- jarvis/jarvis_utils/input.py +7 -4
- jarvis/jarvis_utils/methodology.py +379 -7
- jarvis/jarvis_utils/output.py +5 -3
- jarvis/jarvis_utils/utils.py +62 -10
- {jarvis_ai_assistant-0.1.130.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/METADATA +3 -4
- jarvis_ai_assistant-0.1.132.dist-info/RECORD +82 -0
- {jarvis_ai_assistant-0.1.130.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/entry_points.txt +2 -0
- jarvis/jarvis_c2rust/c2rust.yaml +0 -734
- jarvis/jarvis_code_agent/builtin_input_handler.py +0 -43
- jarvis/jarvis_codebase/__init__.py +0 -0
- jarvis/jarvis_codebase/main.py +0 -1011
- jarvis/jarvis_tools/lsp_get_document_symbols.py +0 -87
- jarvis/jarvis_tools/lsp_prepare_rename.py +0 -130
- jarvis_ai_assistant-0.1.130.dist-info/RECORD +0 -79
- {jarvis_ai_assistant-0.1.130.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.130.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.130.dist-info → jarvis_ai_assistant-0.1.132.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
from typing import List, Tuple
|
|
2
|
+
import requests
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from jarvis.jarvis_platform.base import BasePlatform
|
|
6
|
+
from jarvis.jarvis_utils.output import OutputType, PrettyOutput
|
|
7
|
+
from jarvis.jarvis_utils.utils import while_success
|
|
8
|
+
|
|
9
|
+
class HunyuanModel(BasePlatform):
    """Platform adapter that talks to the Tencent Yuanbao (Hunyuan) web chat API.

    Configuration is read from environment variables:

    * ``YUANBAO_COOKIES``  - cookie string sent with every request.
    * ``YUANBAO_AGENT_ID`` - agent id used in URLs, headers and payloads.
    * ``YUANBAO_WEB``      - ``"true"`` enables the internet-search function.
    """

    platform_name = "yuanbao"

    def get_model_list(self) -> List[Tuple[str, str]]:
        """Return the supported models as ``(model id, display name)`` pairs."""
        return [
            ("deep_seek", "DeepSeek-R1"),
            ("deep_seek_v3", "DeepSeek-v3"),
            ("hunyuan_gpt_175B_0404", "Tencent Hunyuan"),
            ("hunyuan_t1", "Tencent Hunyuan-T1"),
        ]

    def __init__(self):
        """Initialise the model from the environment.

        Prints setup guidance plus one warning per missing variable when
        ``YUANBAO_COOKIES`` or ``YUANBAO_AGENT_ID`` is not set; construction
        still succeeds in that case.
        """
        super().__init__()
        self.conversation_id = ""  # id of the current conversation; "" means none yet
        # Required parameters come from the environment.
        self.cookies = os.getenv("YUANBAO_COOKIES")  # authentication cookies
        self.agent_id = os.getenv("YUANBAO_AGENT_ID")  # agent id
        self.web = os.getenv("YUANBAO_WEB", "false") == "true"  # enable web search

        # The guidance text states that both variables are required, so report
        # each one that is missing (the original only checked the cookies).
        missing = [
            name
            for name, value in (
                ("YUANBAO_COOKIES", self.cookies),
                ("YUANBAO_AGENT_ID", self.agent_id),
            )
            if not value
        ]
        if missing:
            message = (
                "需要设置 YUANBAO_COOKIES 和 YUANBAO_AGENT_ID 才能使用 Jarvis 的元宝功能。请按照以下步骤操作:\n"
                "1. 获取元宝 API 参数:\n"
                " • 访问元宝平台: https://yuanbao.tencent.com\n"
                " • 登录您的账户\n"
                " • 打开浏览器开发者工具 (F12 或右键 -> 检查)\n"
                " • 切换到网络标签\n"
                " • 发送任意消息\n"
                " • 在请求中找到 X-Uskey 和 T-UserID 头部值\n"
                "2. 设置环境变量:\n"
                " • 方法 1: 创建或编辑 ~/.jarvis/env 文件:\n"
                " echo 'YUANBAO_COOKIES=your_cookies_here' >> ~/.jarvis/env\n"
                " echo 'YUANBAO_AGENT_ID=your_agent_id_here' >> ~/.jarvis/env\n"
                " • 方法 2: 直接设置环境变量:\n"
                " export YUANBAO_COOKIES=your_cookies_here\n"
                " export YUANBAO_AGENT_ID=your_agent_id_here\n"
                "设置后,重新运行 Jarvis。"
            )
            PrettyOutput.print(message, OutputType.INFO)
            for variable in missing:
                PrettyOutput.print(f"{variable} 未设置", OutputType.WARNING)

        self.system_message = ""  # system message prepended to the first prompt
        self.first_chat = True  # True until the system message has been sent
        self.model_name = "deep_seek_v3"  # default model id

    def set_system_message(self, message: str):
        """Store the system message that is prepended to the first prompt."""
        self.system_message = message

    def set_model_name(self, model_name: str):
        """Select the chat model by id and reset the conversation state.

        An id that is not listed by ``get_model_list`` is reported as an error
        and the current model is kept.
        """
        supported_ids = [entry[0] for entry in self.get_model_list()]

        if model_name in supported_ids:
            self.model_name = model_name
        else:
            PrettyOutput.print(f"错误:不支持的模型: {model_name}", OutputType.ERROR)
        # NOTE(review): the original indentation is not recoverable from this
        # view; reset() is assumed to run on both branches - confirm.
        self.reset()

    def _get_base_headers(self):
        """Build the HTTP headers shared by every API request.

        The original literal listed ``X-Requested-With`` and ``Accept`` twice;
        each key is kept once here with the value that used to win (the last).
        """
        return {
            'Host': 'yuanbao.tencent.com',
            'X-Language': 'zh-CN',
            'X-Requested-With': 'XMLHttpRequest',
            'chat_version': 'v1',
            'X-Instance-ID': '5',
            'Accept': '*/*',
            'Content-Type': 'application/json',
            'sec-ch-ua-mobile': '?0',
            'Origin': 'https://yuanbao.tencent.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0',
            'Referer': f'https://yuanbao.tencent.com/chat/{self.agent_id}',
            'X-Source': 'web',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Dest': 'empty',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cookie': self.cookies
        }

    def _create_conversation(self) -> bool:
        """Create a new conversation and remember its id.

        Returns:
            True when the response JSON contains an ``id`` (stored in
            ``self.conversation_id``); False on any failure, which is also
            reported through ``PrettyOutput``.
        """
        url = "https://yuanbao.tencent.com/api/user/agent/conversation/create"
        headers = self._get_base_headers()
        payload = json.dumps({
            "agentId": self.agent_id
        })

        try:
            # presumably while_success retries the call until it stops raising - confirm
            response = while_success(lambda: requests.post(url, headers=headers, data=payload), sleep_time=5)
            response_json = response.json()

            if "id" in response_json:
                self.conversation_id = response_json["id"]
                return True

            PrettyOutput.print(f"错误:创建会话失败,响应: {response_json}", OutputType.ERROR)
            return False
        except Exception as e:
            PrettyOutput.print(f"错误:创建会话失败:{e}", OutputType.ERROR)
            return False

    def chat(self, message: str) -> str:
        """Send ``message`` and return the concatenated text of the reply.

        A conversation is created on demand. The reply arrives as a
        server-sent-event stream: ``text`` events are appended to the result
        (and echoed unless ``self.suppress_output`` is set), ``think`` events
        are only echoed.

        Raises:
            Exception: if the conversation cannot be created, or (wrapped with
                a "对话失败" prefix) if the request or stream handling fails.
        """
        if not self.conversation_id:
            if not self._create_conversation():
                raise Exception("Failed to create conversation session")

        url = f"https://yuanbao.tencent.com/api/chat/{self.conversation_id}"
        headers = self._get_base_headers()

        # Request body.
        payload = {
            "model": "gpt_175B_0404",
            "prompt": message,
            "plugin": "Adaptive",
            "displayPrompt": message,
            "displayPromptType": 1,
            "options": {
                "imageIntention": {
                    "needIntentionModel": True,
                    "backendUpdateFlag": 2,
                    "intentionStatus": True
                }
            },
            "multimedia": [],
            "agentId": self.agent_id,
            "supportHint": 1,
            "version": "v2",
            "supportFunctions": [],
            "chatModelId": self.model_name,
        }

        if self.web:
            payload["supportFunctions"] = ["supportInternetSearch"]

        # Prepend the system message on the first exchange only.
        sends_system_message = bool(self.first_chat and self.system_message)
        if sends_system_message:
            payload["prompt"] = f"{self.system_message}\n\n{message}"
            payload["displayPrompt"] = payload["prompt"]

        try:
            # Send the message; the reply is streamed.
            response = while_success(
                lambda: requests.post(url, headers=headers, json=payload, stream=True),
                sleep_time=5
            )
            try:
                if response.status_code != 200:
                    error_msg = f"发送消息失败,状态码: {response.status_code}"
                    if hasattr(response, 'text'):
                        error_msg += f", 响应: {response.text}"
                    raise Exception(error_msg)

                # Mark the system message as delivered only once the server has
                # accepted the request, so a failed first attempt sends it again.
                if sends_system_message:
                    self.first_chat = False

                full_response = ""

                # Consume the SSE stream line by line.
                for line in response.iter_lines():
                    if not line:
                        continue

                    line_str = line.decode('utf-8')
                    if not line_str.startswith("data: "):
                        continue

                    data_str = line_str[6:]  # strip the "data: " prefix

                    # The end marker must be tested before JSON parsing: it also
                    # starts with "data: " and is not valid JSON.
                    if data_str == "[DONE]":
                        break

                    try:
                        data = json.loads(data_str)
                    except json.JSONDecodeError:
                        continue  # ignore payloads that are not JSON
                    if not isinstance(data, dict):
                        continue  # only JSON objects carry "type"/"msg"/"content"

                    event_type = data.get("type")
                    if event_type == "text":
                        msg = data.get("msg", "")
                        if msg:
                            if not self.suppress_output:
                                PrettyOutput.print_stream(msg)
                            full_response += msg
                    elif event_type == "think" and not self.suppress_output:
                        # Thinking output is displayed but never added to the result.
                        think_content = data.get("content", "")
                        PrettyOutput.print_stream(f"{think_content}", is_thinking=True)

                if not self.suppress_output:
                    PrettyOutput.print_stream_end()

                return full_response
            finally:
                # Release the streamed connection, including after an early break.
                response.close()

        except Exception as e:
            raise Exception(f"对话失败: {str(e)}") from e

    def reset(self):
        """Forget the current conversation so the next chat starts a new one."""
        self.conversation_id = ""
        self.first_chat = True

    def delete_chat(self) -> bool:
        """Delete the current conversation on the server.

        Returns:
            True when there is no conversation or the clear request returns
            HTTP 200 (local state is then reset); False otherwise.
        """
        if not self.conversation_id:
            return True  # nothing to delete counts as success

        # Conversations are removed through the dedicated "clear" endpoint.
        url = "https://yuanbao.tencent.com/api/user/agent/conversation/v1/clear"

        headers = self._get_base_headers()
        # This request additionally carries X-AgentID as "<agent id>/<conversation id>".
        headers.update({
            'X-AgentID': f"{self.agent_id}/{self.conversation_id}"
        })

        # Request body listing the conversation to delete.
        payload = {
            "conversationIds": [self.conversation_id]
        }

        try:
            response = while_success(lambda: requests.post(url, headers=headers, json=payload), sleep_time=5)

            if response.status_code == 200:
                self.reset()
                return True

            PrettyOutput.print(f"删除会话失败: HTTP {response.status_code}", OutputType.WARNING)
            if hasattr(response, 'text'):
                PrettyOutput.print(f"响应: {response.text}", OutputType.WARNING)
            return False
        except Exception as e:
            PrettyOutput.print(f"删除会话时发生错误: {str(e)}", OutputType.ERROR)
            return False

    def name(self) -> str:
        """Return the platform identifier, ``"yuanbao"``."""
        return "yuanbao"
|
|
@@ -219,7 +219,7 @@ def service_command(args):
|
|
|
219
219
|
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
|
220
220
|
log_file = os.path.join(logs_dir, f"conversation_{conversation_id}_{timestamp}.txt")
|
|
221
221
|
|
|
222
|
-
with open(log_file, "w", encoding="utf-8") as f:
|
|
222
|
+
with open(log_file, "w", encoding="utf-8", errors="ignore") as f:
|
|
223
223
|
f.write(f"Conversation ID: {conversation_id}\n")
|
|
224
224
|
f.write(f"Timestamp: {timestamp}\n")
|
|
225
225
|
f.write(f"Model: {model}\n\n")
|
|
@@ -464,7 +464,7 @@ def service_command(args):
|
|
|
464
464
|
"response": full_response
|
|
465
465
|
}
|
|
466
466
|
|
|
467
|
-
with open(log_file, "w", encoding="utf-8") as f:
|
|
467
|
+
with open(log_file, "w", encoding="utf-8", errors="ignore") as f:
|
|
468
468
|
json.dump(log_data, f, ensure_ascii=False, indent=2)
|
|
469
469
|
|
|
470
470
|
PrettyOutput.print(f"Stream conversation logged to {log_file}", OutputType.INFO)
|
|
@@ -501,7 +501,7 @@ def service_command(args):
|
|
|
501
501
|
"error": error_msg
|
|
502
502
|
}
|
|
503
503
|
|
|
504
|
-
with open(log_file, "w", encoding="utf-8") as f:
|
|
504
|
+
with open(log_file, "w", encoding="utf-8", errors="ignore") as f:
|
|
505
505
|
json.dump(log_data, f, ensure_ascii=False, indent=2)
|
|
506
506
|
|
|
507
507
|
PrettyOutput.print(f"Stream error logged to {log_file}", OutputType.ERROR)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import fitz # PyMuPDF for PDF files
|
|
3
|
+
from docx import Document as DocxDocument # python-docx for DOCX files
|
|
4
|
+
from pptx import Presentation
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import unicodedata
|
|
7
|
+
|
|
8
|
+
class FileProcessor:
    """Common interface for file processors.

    Both hooks raise ``NotImplementedError`` here; concrete processors are
    expected to override them.
    """

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Tell whether this processor accepts the file at ``file_path``."""
        raise NotImplementedError()

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text content of the file at ``file_path``."""
        raise NotImplementedError()
|
|
19
|
+
|
|
20
|
+
class TextFileProcessor(FileProcessor):
    """Processor for plain-text files stored in one of ``ENCODINGS``."""

    # Candidate encodings, tried in order. latin1 can decode any byte
    # sequence, so it acts as the final catch-all.
    ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin1']
    SAMPLE_SIZE = 8192  # number of leading bytes inspected by can_handle

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Decide whether the file looks like text by inspecting its first bytes.

        Returns False when the file cannot be read, is empty, contains a NUL
        byte, has more than 30% control bytes (other than tab, LF, CR), or
        cannot be decoded with any of ``ENCODINGS``.
        """
        try:
            with open(file_path, 'rb') as f:
                sample = f.read(TextFileProcessor.SAMPLE_SIZE)

            # An empty file is rejected explicitly; previously this relied on
            # the ratio below raising ZeroDivisionError.
            if not sample:
                return False

            # NUL bytes usually indicate a binary file.
            if b'\x00' in sample:
                return False

            # Too many control bytes also indicates binary content.
            non_printable = sum(1 for byte in sample if byte < 32 and byte not in (9, 10, 13))
            if non_printable / len(sample) > 0.3:
                return False

            for encoding in TextFileProcessor.ENCODINGS:
                try:
                    sample.decode(encoding)
                    return True
                except UnicodeDecodeError:
                    continue

            return False

        except Exception:
            # Best effort: any failure to inspect the file means "not handled".
            return False

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Read the file, decode it with the first matching encoding and normalise it.

        Line endings are converted to ``\\n`` and the text is NFKC-normalised.

        Raises:
            Exception: with a "Failed to read file" prefix if the file cannot
                be read or decoded; the underlying error is chained as the cause.
        """
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read()

            # Decode once and keep the result instead of re-reading the file.
            text = None
            for encoding in TextFileProcessor.ENCODINGS:
                try:
                    text = raw_data.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue

            if text is None:
                # UnicodeDecodeError needs five constructor arguments; building
                # it from a single message raised TypeError instead.
                raise ValueError(f"Failed to decode file with supported encodings: {file_path}")

            # Match what text-mode reading did: '\r\n' and '\r' become '\n'.
            text = text.replace('\r\n', '\n').replace('\r', '\n')

            # Normalize Unicode characters.
            return unicodedata.normalize('NFKC', text)

        except Exception as e:
            raise Exception(f"Failed to read file: {str(e)}") from e
|
|
87
|
+
|
|
88
|
+
class PDFProcessor(FileProcessor):
    """Processor for ``.pdf`` files, read through PyMuPDF (``fitz``)."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths whose extension is ``.pdf``, ignoring case."""
        extension = Path(file_path).suffix
        return extension.lower() == '.pdf'

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of every page, joined with newlines."""
        with fitz.open(file_path) as pdf:  # type: ignore
            page_texts = [page.get_text() for page in pdf]  # type: ignore
        return "\n".join(page_texts)
|
|
101
|
+
|
|
102
|
+
class DocxProcessor(FileProcessor):
    """Processor for ``.docx`` files, read through python-docx."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths whose extension is ``.docx``, ignoring case."""
        extension = Path(file_path).suffix
        return extension.lower() == '.docx'

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of all document paragraphs, joined with newlines."""
        document = DocxDocument(file_path)
        return "\n".join(para.text for para in document.paragraphs)
|
|
112
|
+
|
|
113
|
+
class PPTProcessor(FileProcessor):
    """Processor for presentation files, read through python-pptx."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths whose extension is ``.ppt`` or ``.pptx``, ignoring case."""
        # NOTE(review): extract_text opens the file with Presentation(); whether
        # legacy .ppt files load that way is not shown here - confirm.
        extension = Path(file_path).suffix.lower()
        return extension in ['.ppt', '.pptx']

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of every shape that exposes ``text``, joined with newlines."""
        presentation = Presentation(file_path)
        shape_texts = [
            shape.text  # type: ignore
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n".join(shape_texts)
|
|
128
|
+
|
|
129
|
+
class ExcelProcessor(FileProcessor):
    """Processor for Excel workbooks, read through pandas."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Accept paths whose extension is ``.xls`` or ``.xlsx``, ignoring case."""
        extension = Path(file_path).suffix.lower()
        return extension in ['.xls', '.xlsx']

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Load the workbook with ``pd.read_excel`` and return its string rendering."""
        frame = pd.read_excel(file_path)
        return frame.to_string()
|