jarvis-ai-assistant: jarvis_ai_assistant-0.1.149-py3-none-any.whl → jarvis_ai_assistant-0.1.150-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of jarvis-ai-assistant might be problematic. Click here for more details.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/jarvis.py +7 -5
- jarvis/jarvis_lsp/registry.py +2 -1
- jarvis/jarvis_mcp/__init__.py +36 -0
- jarvis/jarvis_mcp/local_mcp_client.py +241 -0
- jarvis/jarvis_mcp/remote_mcp_client.py +230 -0
- jarvis/jarvis_platform/kimi.py +3 -2
- jarvis/jarvis_platform/registry.py +2 -4
- jarvis/jarvis_platform/yuanbao.py +4 -4
- jarvis/jarvis_tools/file_operation.py +7 -26
- jarvis/jarvis_tools/methodology.py +2 -1
- jarvis/jarvis_tools/registry.py +108 -3
- jarvis/jarvis_utils/config.py +14 -8
- jarvis/jarvis_utils/embedding.py +2 -2
- jarvis/jarvis_utils/file_processors.py +0 -262
- jarvis/jarvis_utils/input.py +7 -1
- jarvis/jarvis_utils/methodology.py +2 -2
- jarvis/jarvis_utils/utils.py +5 -5
- {jarvis_ai_assistant-0.1.149.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/METADATA +12 -16
- {jarvis_ai_assistant-0.1.149.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/RECORD +24 -21
- {jarvis_ai_assistant-0.1.149.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.149.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.149.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/entry_points.txt +0 -0
- {jarvis_ai_assistant-0.1.149.dist-info → jarvis_ai_assistant-0.1.150.dist-info}/top_level.txt +0 -0
jarvis/jarvis_tools/file_operation.py
CHANGED
|
@@ -8,15 +8,14 @@ from jarvis.jarvis_utils.globals import add_read_file_record
|
|
|
8
8
|
from jarvis.jarvis_utils.output import OutputType, PrettyOutput
|
|
9
9
|
# 导入文件处理器
|
|
10
10
|
from jarvis.jarvis_utils.file_processors import (
|
|
11
|
-
TextFileProcessor
|
|
12
|
-
PPTProcessor, ExcelProcessor
|
|
11
|
+
TextFileProcessor
|
|
13
12
|
)
|
|
14
13
|
|
|
15
14
|
|
|
16
15
|
|
|
17
16
|
class FileOperationTool:
|
|
18
17
|
name = "file_operation"
|
|
19
|
-
description = "
|
|
18
|
+
description = "文件批量操作工具,可批量读写多个文件,仅支持文本文件,适用于需要同时处理多个文件的场景(读取配置文件、保存生成内容等)"
|
|
20
19
|
parameters = {
|
|
21
20
|
"type": "object",
|
|
22
21
|
"properties": {
|
|
@@ -44,10 +43,6 @@ class FileOperationTool:
|
|
|
44
43
|
def _get_file_processor(self, file_path: str):
|
|
45
44
|
"""获取适合处理指定文件的处理器"""
|
|
46
45
|
processors = [
|
|
47
|
-
PDFProcessor, # PDF文件处理器
|
|
48
|
-
DocxProcessor, # Word文档处理器
|
|
49
|
-
PPTProcessor, # PowerPoint演示文稿处理器
|
|
50
|
-
ExcelProcessor, # Excel表格处理器
|
|
51
46
|
TextFileProcessor # 文本文件处理器(放在最后作为兜底)
|
|
52
47
|
]
|
|
53
48
|
|
|
@@ -126,25 +121,11 @@ class FileOperationTool:
|
|
|
126
121
|
"stderr": f"读取文本文件失败: {str(e)}"
|
|
127
122
|
}
|
|
128
123
|
else:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
file_type_names = {
|
|
135
|
-
PDFProcessor: "PDF文档",
|
|
136
|
-
DocxProcessor: "Word文档",
|
|
137
|
-
PPTProcessor: "PowerPoint演示文稿",
|
|
138
|
-
ExcelProcessor: "Excel表格"
|
|
139
|
-
}
|
|
140
|
-
file_type = file_type_names.get(processor, file_extension)
|
|
141
|
-
file_info = f"\n文件: {abs_path} ({file_type})"
|
|
142
|
-
except Exception as e:
|
|
143
|
-
return {
|
|
144
|
-
"success": False,
|
|
145
|
-
"stdout": "",
|
|
146
|
-
"stderr": f"提取 {file_extension} 文件内容失败: {str(e)}"
|
|
147
|
-
}
|
|
124
|
+
return {
|
|
125
|
+
"success": False,
|
|
126
|
+
"stdout": "",
|
|
127
|
+
"stderr": f"不支持的文件类型: {file_extension}"
|
|
128
|
+
}
|
|
148
129
|
|
|
149
130
|
# 构建输出信息
|
|
150
131
|
output = f"{file_info}\n{content}" + "\n\n"
|
jarvis/jarvis_tools/methodology.py
CHANGED
|
@@ -3,6 +3,7 @@ import json
|
|
|
3
3
|
import hashlib
|
|
4
4
|
from typing import Dict, Any
|
|
5
5
|
|
|
6
|
+
from jarvis.jarvis_utils.config import get_data_dir
|
|
6
7
|
from jarvis.jarvis_utils.output import OutputType, PrettyOutput
|
|
7
8
|
|
|
8
9
|
|
|
@@ -35,7 +36,7 @@ class MethodologyTool:
|
|
|
35
36
|
|
|
36
37
|
def __init__(self):
|
|
37
38
|
"""初始化经验管理工具"""
|
|
38
|
-
self.methodology_dir = os.path.
|
|
39
|
+
self.methodology_dir = os.path.join(get_data_dir(), "methodologies")
|
|
39
40
|
self._ensure_dir_exists()
|
|
40
41
|
|
|
41
42
|
def _ensure_dir_exists(self):
|
jarvis/jarvis_tools/registry.py
CHANGED
|
@@ -10,10 +10,12 @@ import yaml
|
|
|
10
10
|
from jarvis.jarvis_agent.output_handler import OutputHandler
|
|
11
11
|
from jarvis.jarvis_platform.registry import PlatformRegistry
|
|
12
12
|
from jarvis.jarvis_tools.base import Tool
|
|
13
|
-
from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count
|
|
13
|
+
from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count, get_data_dir
|
|
14
14
|
from jarvis.jarvis_utils.embedding import get_context_token_count
|
|
15
15
|
from jarvis.jarvis_utils.output import OutputType, PrettyOutput
|
|
16
16
|
from jarvis.jarvis_utils.utils import ct, ot, init_env
|
|
17
|
+
from jarvis.jarvis_mcp.local_mcp_client import LocalMcpClient
|
|
18
|
+
from jarvis.jarvis_mcp.remote_mcp_client import RemoteMcpClient
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
|
|
@@ -131,6 +133,7 @@ class ToolRegistry(OutputHandler):
|
|
|
131
133
|
# 加载内置工具和外部工具
|
|
132
134
|
self._load_builtin_tools()
|
|
133
135
|
self._load_external_tools()
|
|
136
|
+
self._load_mcp_tools()
|
|
134
137
|
self.max_input_token_count = get_max_input_token_count() - INPUT_WINDOW_REVERSE_SIZE
|
|
135
138
|
|
|
136
139
|
def use_tools(self, name: List[str]) -> None:
|
|
@@ -152,6 +155,16 @@ class ToolRegistry(OutputHandler):
|
|
|
152
155
|
"""
|
|
153
156
|
self.tools = {name: tool for name, tool in self.tools.items() if name not in names}
|
|
154
157
|
|
|
158
|
+
def _load_mcp_tools(self) -> None:
|
|
159
|
+
"""从jarvis_data/tools/mcp加载工具"""
|
|
160
|
+
mcp_tools_dir = Path(get_data_dir()) / 'mcp'
|
|
161
|
+
if not mcp_tools_dir.exists():
|
|
162
|
+
return
|
|
163
|
+
|
|
164
|
+
# 遍历目录中的所有.yaml文件
|
|
165
|
+
for file_path in mcp_tools_dir.glob("*.yaml"):
|
|
166
|
+
self.register_mcp_tool_by_file(str(file_path))
|
|
167
|
+
|
|
155
168
|
def _load_builtin_tools(self) -> None:
|
|
156
169
|
"""从内置工具目录加载工具"""
|
|
157
170
|
tools_dir = Path(__file__).parent
|
|
@@ -165,8 +178,8 @@ class ToolRegistry(OutputHandler):
|
|
|
165
178
|
self.register_tool_by_file(str(file_path))
|
|
166
179
|
|
|
167
180
|
def _load_external_tools(self) -> None:
|
|
168
|
-
"""
|
|
169
|
-
external_tools_dir = Path
|
|
181
|
+
"""从jarvis_data/tools加载外部工具"""
|
|
182
|
+
external_tools_dir = Path(get_data_dir()) / 'tools'
|
|
170
183
|
if not external_tools_dir.exists():
|
|
171
184
|
return
|
|
172
185
|
|
|
@@ -178,6 +191,98 @@ class ToolRegistry(OutputHandler):
|
|
|
178
191
|
|
|
179
192
|
self.register_tool_by_file(str(file_path))
|
|
180
193
|
|
|
194
|
+
def register_mcp_tool_by_file(self, file_path: str) -> bool:
|
|
195
|
+
"""从指定文件加载并注册工具
|
|
196
|
+
|
|
197
|
+
参数:
|
|
198
|
+
file_path: 工具文件的路径
|
|
199
|
+
|
|
200
|
+
返回:
|
|
201
|
+
bool: 工具是否加载成功
|
|
202
|
+
"""
|
|
203
|
+
try:
|
|
204
|
+
config = yaml.safe_load(open(file_path, 'r', encoding='utf-8'))
|
|
205
|
+
if 'type' not in config:
|
|
206
|
+
PrettyOutput.print(f"文件 {file_path} 缺少type字段", OutputType.WARNING)
|
|
207
|
+
return False
|
|
208
|
+
if config['type'] == 'local':
|
|
209
|
+
if 'command' not in config:
|
|
210
|
+
PrettyOutput.print(f"文件 {file_path} 缺少command字段", OutputType.WARNING)
|
|
211
|
+
return False
|
|
212
|
+
|
|
213
|
+
# 创建本地MCP客户端
|
|
214
|
+
mcp_client = LocalMcpClient(config)
|
|
215
|
+
|
|
216
|
+
# 获取工具信息
|
|
217
|
+
tools = mcp_client.get_tool_list()
|
|
218
|
+
if not tools:
|
|
219
|
+
PrettyOutput.print(f"从 {file_path} 获取工具列表失败", OutputType.WARNING)
|
|
220
|
+
return False
|
|
221
|
+
|
|
222
|
+
# 注册每个工具
|
|
223
|
+
for tool in tools:
|
|
224
|
+
def create_local_execute_func(tool_name: str, client: LocalMcpClient):
|
|
225
|
+
def execute(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
|
226
|
+
args = arguments.copy()
|
|
227
|
+
args.pop('agent', None)
|
|
228
|
+
args.pop('want', None)
|
|
229
|
+
ret = client.execute(tool_name, args)
|
|
230
|
+
PrettyOutput.print(f"MCP {tool_name} 执行结果:\n{yaml.safe_dump(ret)}", OutputType.TOOL)
|
|
231
|
+
return ret
|
|
232
|
+
return execute
|
|
233
|
+
|
|
234
|
+
# 注册工具
|
|
235
|
+
self.register_tool(
|
|
236
|
+
name=tool['name'],
|
|
237
|
+
description=tool['description'],
|
|
238
|
+
parameters=tool['parameters'],
|
|
239
|
+
func=create_local_execute_func(tool['name'], mcp_client)
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
return True
|
|
243
|
+
|
|
244
|
+
elif config['type'] == 'remote':
|
|
245
|
+
if 'base_url' not in config:
|
|
246
|
+
PrettyOutput.print(f"文件 {file_path} 缺少base_url字段", OutputType.WARNING)
|
|
247
|
+
return False
|
|
248
|
+
|
|
249
|
+
# 创建远程MCP客户端
|
|
250
|
+
mcp_client = RemoteMcpClient(config)
|
|
251
|
+
|
|
252
|
+
# 获取工具信息
|
|
253
|
+
tools = mcp_client.get_tool_list()
|
|
254
|
+
if not tools:
|
|
255
|
+
PrettyOutput.print(f"从 {file_path} 获取工具列表失败", OutputType.WARNING)
|
|
256
|
+
return False
|
|
257
|
+
|
|
258
|
+
# 注册每个工具
|
|
259
|
+
for tool in tools:
|
|
260
|
+
def create_remote_execute_func(tool_name: str, client: RemoteMcpClient):
|
|
261
|
+
def execute(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
|
262
|
+
args = arguments.copy()
|
|
263
|
+
args.pop('agent', None)
|
|
264
|
+
args.pop('want', None)
|
|
265
|
+
ret = client.execute(tool_name, args)
|
|
266
|
+
PrettyOutput.print(f"MCP {tool_name} 执行结果:\n{yaml.safe_dump(ret)}", OutputType.TOOL)
|
|
267
|
+
return ret
|
|
268
|
+
return execute
|
|
269
|
+
|
|
270
|
+
# 注册工具
|
|
271
|
+
self.register_tool(
|
|
272
|
+
name=tool['name'],
|
|
273
|
+
description=tool['description'],
|
|
274
|
+
parameters=tool['parameters'],
|
|
275
|
+
func=create_remote_execute_func(tool['name'], mcp_client)
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
return True
|
|
279
|
+
else:
|
|
280
|
+
PrettyOutput.print(f"文件 {file_path} 类型错误: {config['type']}", OutputType.WARNING)
|
|
281
|
+
return False
|
|
282
|
+
except Exception as e:
|
|
283
|
+
PrettyOutput.print(f"文件 {file_path} 加载失败: {str(e)}", OutputType.WARNING)
|
|
284
|
+
return False
|
|
285
|
+
|
|
181
286
|
def register_tool_by_file(self, file_path: str) -> bool:
|
|
182
287
|
"""从指定文件加载并注册工具
|
|
183
288
|
|
jarvis/jarvis_utils/config.py
CHANGED
|
@@ -31,14 +31,6 @@ def get_max_input_token_count() -> int:
|
|
|
31
31
|
"""
|
|
32
32
|
return int(os.getenv('JARVIS_MAX_INPUT_TOKEN_COUNT', '32000'))
|
|
33
33
|
|
|
34
|
-
def get_thread_count() -> int:
|
|
35
|
-
"""
|
|
36
|
-
获取用于并行处理的线程数。
|
|
37
|
-
|
|
38
|
-
返回:
|
|
39
|
-
int: 线程数,默认为1
|
|
40
|
-
"""
|
|
41
|
-
return int(os.getenv('JARVIS_THREAD_COUNT', '1'))
|
|
42
34
|
|
|
43
35
|
def is_auto_complete() -> bool:
|
|
44
36
|
"""
|
|
@@ -118,3 +110,17 @@ def get_max_tool_call_count() -> int:
|
|
|
118
110
|
int: 最大连续工具调用次数,默认为20
|
|
119
111
|
"""
|
|
120
112
|
return int(os.getenv('JARVIS_MAX_TOOL_CALL_COUNT', '20'))
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_data_dir() -> str:
|
|
116
|
+
"""
|
|
117
|
+
获取Jarvis数据存储目录路径。
|
|
118
|
+
|
|
119
|
+
返回:
|
|
120
|
+
str: 数据目录路径,优先从JARVIS_DATA_PATH环境变量获取,
|
|
121
|
+
如果未设置或为空,则使用~/.jarvis作为默认值
|
|
122
|
+
"""
|
|
123
|
+
data_path = os.getenv('JARVIS_DATA_PATH', '').strip()
|
|
124
|
+
if not data_path:
|
|
125
|
+
return os.path.expanduser('~/.jarvis')
|
|
126
|
+
return data_path
|
jarvis/jarvis_utils/embedding.py
CHANGED
|
@@ -4,9 +4,9 @@ from typing import List
|
|
|
4
4
|
import functools
|
|
5
5
|
|
|
6
6
|
from jarvis.jarvis_utils.output import PrettyOutput, OutputType
|
|
7
|
+
from jarvis.jarvis_utils.config import get_data_dir
|
|
7
8
|
|
|
8
9
|
# 全局缓存,避免重复加载模型
|
|
9
|
-
_global_models = {}
|
|
10
10
|
_global_tokenizers = {}
|
|
11
11
|
|
|
12
12
|
def get_context_token_count(text: str) -> int:
|
|
@@ -155,7 +155,7 @@ def load_tokenizer() -> AutoTokenizer:
|
|
|
155
155
|
AutoTokenizer: 加载的分词器
|
|
156
156
|
"""
|
|
157
157
|
model_name = "gpt2"
|
|
158
|
-
cache_dir = os.path.
|
|
158
|
+
cache_dir = os.path.join(get_data_dir(), "huggingface", "hub")
|
|
159
159
|
|
|
160
160
|
# 检查全局缓存
|
|
161
161
|
if model_name in _global_tokenizers:
|
jarvis/jarvis_utils/file_processors.py
CHANGED
|
@@ -1,8 +1,3 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
import fitz # PyMuPDF for PDF files
|
|
3
|
-
from docx import Document as DocxDocument # python-docx for DOCX files
|
|
4
|
-
from pptx import Presentation
|
|
5
|
-
import pandas as pd
|
|
6
1
|
import unicodedata
|
|
7
2
|
|
|
8
3
|
class FileProcessor:
|
|
@@ -84,260 +79,3 @@ class TextFileProcessor(FileProcessor):
|
|
|
84
79
|
|
|
85
80
|
except Exception as e:
|
|
86
81
|
raise Exception(f"Failed to read file: {str(e)}")
|
|
87
|
-
|
|
88
|
-
class PDFProcessor(FileProcessor):
|
|
89
|
-
"""PDF file processor"""
|
|
90
|
-
@staticmethod
|
|
91
|
-
def can_handle(file_path: str) -> bool:
|
|
92
|
-
return Path(file_path).suffix.lower() == '.pdf'
|
|
93
|
-
|
|
94
|
-
@staticmethod
|
|
95
|
-
def extract_text(file_path: str) -> str:
|
|
96
|
-
"""提取PDF文件中的所有文本内容,包括页码、图片描述等"""
|
|
97
|
-
try:
|
|
98
|
-
text_parts = []
|
|
99
|
-
with fitz.open(file_path) as doc: # type: ignore
|
|
100
|
-
# 添加文档信息
|
|
101
|
-
info = doc.metadata
|
|
102
|
-
if info:
|
|
103
|
-
meta_text = []
|
|
104
|
-
if info.get("title"):
|
|
105
|
-
meta_text.append(f"标题: {info['title']}")
|
|
106
|
-
if info.get("author"):
|
|
107
|
-
meta_text.append(f"作者: {info['author']}")
|
|
108
|
-
if info.get("subject"):
|
|
109
|
-
meta_text.append(f"主题: {info['subject']}")
|
|
110
|
-
if info.get("keywords"):
|
|
111
|
-
meta_text.append(f"关键词: {info['keywords']}")
|
|
112
|
-
|
|
113
|
-
if meta_text:
|
|
114
|
-
text_parts.append("=== 文档信息 ===")
|
|
115
|
-
text_parts.append("\n".join(meta_text))
|
|
116
|
-
|
|
117
|
-
# 提取目录结构(如果有)
|
|
118
|
-
toc = doc.get_toc() # type: ignore
|
|
119
|
-
if toc:
|
|
120
|
-
text_parts.append("\n=== 目录结构 ===")
|
|
121
|
-
for level, title, page in toc:
|
|
122
|
-
indent = " " * (level - 1)
|
|
123
|
-
text_parts.append(f"{indent}- {title} (第{page}页)")
|
|
124
|
-
|
|
125
|
-
# 处理各页内容
|
|
126
|
-
text_parts.append("\n=== 页面内容 ===")
|
|
127
|
-
for page_index in range(len(doc)): # 使用范围遍历而不是直接枚举文档对象
|
|
128
|
-
# 添加页码标记
|
|
129
|
-
text_parts.append(f"\n--- 第{page_index+1}页 ---")
|
|
130
|
-
|
|
131
|
-
# 获取页面
|
|
132
|
-
page = doc[page_index]
|
|
133
|
-
|
|
134
|
-
# 提取页面文本(包括结构信息)
|
|
135
|
-
try:
|
|
136
|
-
# 尝试使用结构化提取(保留段落和块结构)
|
|
137
|
-
text = page.get_text("text") # type: ignore
|
|
138
|
-
text = text.strip()
|
|
139
|
-
if text:
|
|
140
|
-
text_parts.append(text)
|
|
141
|
-
except Exception:
|
|
142
|
-
# 如果结构化提取失败,回退到简单文本提取
|
|
143
|
-
text = page.get_text() # type: ignore
|
|
144
|
-
if text.strip():
|
|
145
|
-
text_parts.append(text.strip())
|
|
146
|
-
|
|
147
|
-
# 提取图像信息(如果需要)
|
|
148
|
-
# 注意:这可能会增加处理时间,可根据需要启用
|
|
149
|
-
"""
|
|
150
|
-
image_list = page.get_images()
|
|
151
|
-
if image_list:
|
|
152
|
-
text_parts.append(f"本页包含 {len(image_list)} 个图像")
|
|
153
|
-
"""
|
|
154
|
-
|
|
155
|
-
# 合并所有文本
|
|
156
|
-
return "\n".join(text_parts)
|
|
157
|
-
|
|
158
|
-
except Exception as e:
|
|
159
|
-
# 处理可能的异常
|
|
160
|
-
return f"PDF处理错误: {str(e)}"
|
|
161
|
-
|
|
162
|
-
class DocxProcessor(FileProcessor):
|
|
163
|
-
"""DOCX file processor"""
|
|
164
|
-
@staticmethod
|
|
165
|
-
def can_handle(file_path: str) -> bool:
|
|
166
|
-
return Path(file_path).suffix.lower() == '.docx'
|
|
167
|
-
|
|
168
|
-
@staticmethod
|
|
169
|
-
def extract_text(file_path: str) -> str:
|
|
170
|
-
"""提取 DOCX 文件中的所有文本内容,包括段落、表格、页眉页脚等"""
|
|
171
|
-
doc = DocxDocument(file_path)
|
|
172
|
-
full_text = []
|
|
173
|
-
|
|
174
|
-
# 提取段落文本
|
|
175
|
-
for para in doc.paragraphs:
|
|
176
|
-
if para.text.strip(): # 跳过空段落
|
|
177
|
-
full_text.append(para.text)
|
|
178
|
-
|
|
179
|
-
# 提取表格文本
|
|
180
|
-
for table in doc.tables:
|
|
181
|
-
for row in table.rows:
|
|
182
|
-
row_texts = []
|
|
183
|
-
for cell in row.cells:
|
|
184
|
-
# 每个单元格可能包含多个段落
|
|
185
|
-
cell_text = "\n".join([p.text for p in cell.paragraphs if p.text.strip()])
|
|
186
|
-
if cell_text:
|
|
187
|
-
row_texts.append(cell_text)
|
|
188
|
-
if row_texts:
|
|
189
|
-
full_text.append(" | ".join(row_texts))
|
|
190
|
-
|
|
191
|
-
# 提取页眉页脚(如果有节)
|
|
192
|
-
try:
|
|
193
|
-
for section in doc.sections:
|
|
194
|
-
# 提取页眉
|
|
195
|
-
if section.header:
|
|
196
|
-
header_text = "\n".join([p.text for p in section.header.paragraphs if p.text.strip()])
|
|
197
|
-
if header_text:
|
|
198
|
-
full_text.append(f"页眉: {header_text}")
|
|
199
|
-
|
|
200
|
-
# 提取页脚
|
|
201
|
-
if section.footer:
|
|
202
|
-
footer_text = "\n".join([p.text for p in section.footer.paragraphs if p.text.strip()])
|
|
203
|
-
if footer_text:
|
|
204
|
-
full_text.append(f"页脚: {footer_text}")
|
|
205
|
-
except:
|
|
206
|
-
# 如果提取页眉页脚失败,忽略错误继续
|
|
207
|
-
pass
|
|
208
|
-
|
|
209
|
-
# 合并所有文本
|
|
210
|
-
return "\n\n".join(full_text)
|
|
211
|
-
|
|
212
|
-
class PPTProcessor(FileProcessor):
|
|
213
|
-
"""PPT file processor"""
|
|
214
|
-
@staticmethod
|
|
215
|
-
def can_handle(file_path: str) -> bool:
|
|
216
|
-
return Path(file_path).suffix.lower() in ['.ppt', '.pptx']
|
|
217
|
-
|
|
218
|
-
@staticmethod
|
|
219
|
-
def extract_text(file_path: str) -> str:
|
|
220
|
-
"""提取PPT文件中的所有文本内容,包括标题、文本框、备注等"""
|
|
221
|
-
prs = Presentation(file_path)
|
|
222
|
-
all_text = []
|
|
223
|
-
|
|
224
|
-
# 遍历所有幻灯片
|
|
225
|
-
for slide_index, slide in enumerate(prs.slides, 1):
|
|
226
|
-
slide_text = []
|
|
227
|
-
|
|
228
|
-
# 添加幻灯片编号
|
|
229
|
-
slide_text.append(f"=== 幻灯片 {slide_index} ===")
|
|
230
|
-
|
|
231
|
-
# 提取幻灯片中所有形状的文本
|
|
232
|
-
for shape in slide.shapes:
|
|
233
|
-
# 提取带有文本的形状
|
|
234
|
-
try:
|
|
235
|
-
if hasattr(shape, "text_frame") and shape.text_frame: # type: ignore
|
|
236
|
-
for paragraph in shape.text_frame.paragraphs: # type: ignore
|
|
237
|
-
text = paragraph.text.strip()
|
|
238
|
-
if text:
|
|
239
|
-
slide_text.append(text)
|
|
240
|
-
except AttributeError:
|
|
241
|
-
pass
|
|
242
|
-
|
|
243
|
-
# 提取表格内容
|
|
244
|
-
try:
|
|
245
|
-
if hasattr(shape, "table") and shape.table: # type: ignore
|
|
246
|
-
for row in shape.table.rows: # type: ignore
|
|
247
|
-
row_texts = []
|
|
248
|
-
for cell in row.cells:
|
|
249
|
-
if hasattr(cell, "text_frame") and cell.text_frame:
|
|
250
|
-
cell_paragraphs = cell.text_frame.paragraphs # type: ignore
|
|
251
|
-
cell_text = " ".join([p.text.strip() for p in cell_paragraphs if p.text.strip()])
|
|
252
|
-
if cell_text:
|
|
253
|
-
row_texts.append(cell_text)
|
|
254
|
-
if row_texts:
|
|
255
|
-
slide_text.append(" | ".join(row_texts))
|
|
256
|
-
except AttributeError:
|
|
257
|
-
pass
|
|
258
|
-
|
|
259
|
-
# 提取幻灯片备注
|
|
260
|
-
try:
|
|
261
|
-
if hasattr(slide, "has_notes_slide") and slide.has_notes_slide:
|
|
262
|
-
notes_slide = slide.notes_slide
|
|
263
|
-
if notes_slide and hasattr(notes_slide, "notes_text_frame") and notes_slide.notes_text_frame:
|
|
264
|
-
notes_text = notes_slide.notes_text_frame.text.strip() # type: ignore
|
|
265
|
-
if notes_text:
|
|
266
|
-
slide_text.append(f"备注: {notes_text}")
|
|
267
|
-
except AttributeError:
|
|
268
|
-
pass
|
|
269
|
-
|
|
270
|
-
# 合并当前幻灯片的所有文本
|
|
271
|
-
if len(slide_text) > 1: # 如果除了幻灯片编号外还有其他内容
|
|
272
|
-
all_text.append("\n".join(slide_text))
|
|
273
|
-
|
|
274
|
-
# 返回所有幻灯片的文本内容
|
|
275
|
-
return "\n\n".join(all_text)
|
|
276
|
-
|
|
277
|
-
class ExcelProcessor(FileProcessor):
|
|
278
|
-
"""Excel file processor"""
|
|
279
|
-
@staticmethod
|
|
280
|
-
def can_handle(file_path: str) -> bool:
|
|
281
|
-
return Path(file_path).suffix.lower() in ['.xls', '.xlsx']
|
|
282
|
-
|
|
283
|
-
@staticmethod
|
|
284
|
-
def extract_text(file_path: str) -> str:
|
|
285
|
-
"""提取 Excel 文件中的所有文本内容,包括多个工作表及格式化内容"""
|
|
286
|
-
try:
|
|
287
|
-
# 读取所有工作表
|
|
288
|
-
excel_file = pd.ExcelFile(file_path)
|
|
289
|
-
sheets_text = []
|
|
290
|
-
|
|
291
|
-
# 处理每个工作表
|
|
292
|
-
for sheet_name in excel_file.sheet_names:
|
|
293
|
-
# 读取当前工作表
|
|
294
|
-
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
|
295
|
-
|
|
296
|
-
# 如果是空表格,跳过
|
|
297
|
-
if df.empty:
|
|
298
|
-
continue
|
|
299
|
-
|
|
300
|
-
# 添加工作表标题
|
|
301
|
-
sheet_text = [f"=== 工作表: {sheet_name} ==="]
|
|
302
|
-
|
|
303
|
-
# 填充空单元格,避免NaN显示
|
|
304
|
-
df = df.fillna("")
|
|
305
|
-
|
|
306
|
-
# 提取表格头信息
|
|
307
|
-
if not df.columns.empty:
|
|
308
|
-
headers = [str(col) for col in df.columns]
|
|
309
|
-
sheet_text.append("列标题: " + " | ".join(headers))
|
|
310
|
-
|
|
311
|
-
# 尝试提取表格中可能的关键信息
|
|
312
|
-
# 1. 表格内容概述
|
|
313
|
-
row_count, col_count = df.shape
|
|
314
|
-
sheet_text.append(f"表格大小: {row_count}行 x {col_count}列")
|
|
315
|
-
|
|
316
|
-
# 2. 表格数据,使用更友好的格式
|
|
317
|
-
try:
|
|
318
|
-
# 转换数据框为字符串表示
|
|
319
|
-
# 设置最大行数和列数,避免过大的表格
|
|
320
|
-
max_rows = min(500, row_count) # 最多显示500行
|
|
321
|
-
if row_count > max_rows:
|
|
322
|
-
sheet_text.append(f"注意: 表格太大,仅显示前{max_rows}行")
|
|
323
|
-
|
|
324
|
-
# 将DataFrame转换为字符串表格
|
|
325
|
-
table_str = df.head(max_rows).to_string(index=True, max_rows=max_rows, max_cols=None)
|
|
326
|
-
sheet_text.append(table_str)
|
|
327
|
-
|
|
328
|
-
except Exception as e:
|
|
329
|
-
sheet_text.append(f"表格数据提取错误: {str(e)}")
|
|
330
|
-
|
|
331
|
-
# 合并当前工作表的文本
|
|
332
|
-
sheets_text.append("\n".join(sheet_text))
|
|
333
|
-
|
|
334
|
-
# 如果没有提取到任何内容,返回一个提示信息
|
|
335
|
-
if not sheets_text:
|
|
336
|
-
return "Excel文件为空或无法提取内容"
|
|
337
|
-
|
|
338
|
-
# 合并所有工作表的文本
|
|
339
|
-
return "\n\n".join(sheets_text)
|
|
340
|
-
|
|
341
|
-
except Exception as e:
|
|
342
|
-
# 处理可能的异常,返回错误信息
|
|
343
|
-
return f"Excel文件处理错误: {str(e)}"
|
jarvis/jarvis_utils/input.py
CHANGED
|
@@ -158,8 +158,14 @@ def get_multiline_input(tip: str) -> str:
|
|
|
158
158
|
'prompt': 'ansicyan',
|
|
159
159
|
})
|
|
160
160
|
try:
|
|
161
|
+
from prompt_toolkit.history import FileHistory
|
|
162
|
+
from jarvis.jarvis_utils.config import get_data_dir
|
|
163
|
+
import os
|
|
164
|
+
# 获取数据目录路径
|
|
165
|
+
history_dir = get_data_dir()
|
|
166
|
+
# 初始化带历史记录的会话
|
|
161
167
|
session = PromptSession(
|
|
162
|
-
history=
|
|
168
|
+
history=FileHistory(os.path.join(history_dir, 'multiline_input_history')),
|
|
163
169
|
completer=FileCompleter(),
|
|
164
170
|
key_bindings=bindings,
|
|
165
171
|
complete_while_typing=True,
|
jarvis/jarvis_utils/methodology.py
CHANGED
|
@@ -11,7 +11,7 @@ import json
|
|
|
11
11
|
import tempfile
|
|
12
12
|
from typing import Dict, Optional
|
|
13
13
|
|
|
14
|
-
from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count
|
|
14
|
+
from jarvis.jarvis_utils.config import INPUT_WINDOW_REVERSE_SIZE, get_max_input_token_count, get_data_dir
|
|
15
15
|
from jarvis.jarvis_utils.embedding import get_context_token_count
|
|
16
16
|
from jarvis.jarvis_utils.output import PrettyOutput, OutputType
|
|
17
17
|
from jarvis.jarvis_platform.registry import PlatformRegistry
|
|
@@ -23,7 +23,7 @@ def _get_methodology_directory() -> str:
|
|
|
23
23
|
返回:
|
|
24
24
|
str: 方法论目录的路径
|
|
25
25
|
"""
|
|
26
|
-
methodology_dir = os.path.
|
|
26
|
+
methodology_dir = os.path.join(get_data_dir(), "methodologies")
|
|
27
27
|
if not os.path.exists(methodology_dir):
|
|
28
28
|
try:
|
|
29
29
|
os.makedirs(methodology_dir, exist_ok=True)
|
jarvis/jarvis_utils/utils.py
CHANGED
|
@@ -3,22 +3,22 @@ import time
|
|
|
3
3
|
import hashlib
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import List, Any, Callable
|
|
6
|
-
from jarvis.jarvis_utils.config import get_max_input_token_count
|
|
6
|
+
from jarvis.jarvis_utils.config import get_max_input_token_count, get_data_dir
|
|
7
7
|
from jarvis.jarvis_utils.embedding import get_context_token_count
|
|
8
8
|
from jarvis.jarvis_utils.input import get_single_line_input
|
|
9
9
|
from jarvis.jarvis_utils.output import PrettyOutput, OutputType
|
|
10
10
|
def init_env() -> None:
|
|
11
|
-
"""
|
|
11
|
+
"""初始化环境变量从jarvis_data/env文件
|
|
12
12
|
|
|
13
13
|
功能:
|
|
14
|
-
1.
|
|
14
|
+
1. 创建不存在的jarvis_data目录
|
|
15
15
|
2. 加载环境变量到os.environ
|
|
16
16
|
3. 处理文件读取异常
|
|
17
17
|
"""
|
|
18
|
-
jarvis_dir = Path
|
|
18
|
+
jarvis_dir = Path(get_data_dir())
|
|
19
19
|
env_file = jarvis_dir / "env"
|
|
20
20
|
|
|
21
|
-
#
|
|
21
|
+
# 检查jarvis_data目录是否存在
|
|
22
22
|
if not jarvis_dir.exists():
|
|
23
23
|
jarvis_dir.mkdir(parents=True)
|
|
24
24
|
if env_file.exists():
|