jarvis-ai-assistant 0.1.75__tar.gz → 0.1.77__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of jarvis-ai-assistant might be problematic. Click here for more details.
- {jarvis_ai_assistant-0.1.75/src/jarvis_ai_assistant.egg-info → jarvis_ai_assistant-0.1.77}/PKG-INFO +33 -16
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/README.md +27 -15
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/pyproject.toml +7 -1
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/setup.py +7 -2
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/__init__.py +1 -1
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/agent.py +23 -15
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/jarvis_codebase/main.py +2 -4
- jarvis_ai_assistant-0.1.77/src/jarvis/rag/__init__.py +0 -0
- jarvis_ai_assistant-0.1.77/src/jarvis/rag/main.py +483 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/coder.py +4 -4
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/search.py +87 -20
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/utils.py +25 -1
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77/src/jarvis_ai_assistant.egg-info}/PKG-INFO +33 -16
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis_ai_assistant.egg-info/SOURCES.txt +2 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis_ai_assistant.egg-info/entry_points.txt +1 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis_ai_assistant.egg-info/requires.txt +4 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/MANIFEST.in +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/setup.cfg +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/jarvis_codebase/__init__.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/jarvis_coder/__init__.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/jarvis_coder/main.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/main.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/models/__init__.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/models/ai8.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/models/base.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/models/kimi.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/models/openai.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/models/oyi.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/models/registry.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/__init__.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/base.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/codebase_qa.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/file_ops.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/generator.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/methodology.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/registry.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/shell.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/sub_agent.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/tools/webpage.py +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis_ai_assistant.egg-info/dependency_links.txt +0 -0
- {jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis_ai_assistant.egg-info/top_level.txt +0 -0
{jarvis_ai_assistant-0.1.75/src/jarvis_ai_assistant.egg-info → jarvis_ai_assistant-0.1.77}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: jarvis-ai-assistant
|
|
3
|
-
Version: 0.1.75
|
|
3
|
+
Version: 0.1.77
|
|
4
4
|
Summary: Jarvis: An AI assistant that uses tools to interact with the system
|
|
5
5
|
Home-page: https://github.com/skyfireitdiy/Jarvis
|
|
6
6
|
Author: skyfire
|
|
@@ -35,6 +35,7 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
35
35
|
Classifier: Programming Language :: Python :: 3.9
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.10
|
|
37
37
|
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
38
39
|
Requires-Python: >=3.8
|
|
39
40
|
Description-Content-Type: text/markdown
|
|
40
41
|
License-File: LICENSE
|
|
@@ -48,6 +49,10 @@ Requires-Dist: numpy>=1.24.0
|
|
|
48
49
|
Requires-Dist: faiss-cpu>=1.8.0
|
|
49
50
|
Requires-Dist: sentence-transformers>=2.2.2
|
|
50
51
|
Requires-Dist: bs4>=0.0.1
|
|
52
|
+
Requires-Dist: PyMuPDF>=1.21.0
|
|
53
|
+
Requires-Dist: python-docx>=0.8.11
|
|
54
|
+
Requires-Dist: tiktoken>=0.3.0
|
|
55
|
+
Requires-Dist: tqdm>=4.65.0
|
|
51
56
|
Provides-Extra: dev
|
|
52
57
|
Requires-Dist: pytest; extra == "dev"
|
|
53
58
|
Requires-Dist: black; extra == "dev"
|
|
@@ -124,6 +129,7 @@ Jarvis supports configuration through environment variables that can be set in t
|
|
|
124
129
|
|---------|------|--------|------|
|
|
125
130
|
| JARVIS_PLATFORM | AI platform to use, supports kimi/openai/ai8 etc | kimi | Yes |
|
|
126
131
|
| JARVIS_MODEL | Model name to use | - | No |
|
|
132
|
+
| JARVIS_THREAD_COUNT | Number of threads for parallel processing | 10 | No |
|
|
127
133
|
| JARVIS_CODEGEN_PLATFORM | AI platform for code generation | Same as JARVIS_PLATFORM | No |
|
|
128
134
|
| JARVIS_CODEGEN_MODEL | Model name for code generation | Same as JARVIS_MODEL | No |
|
|
129
135
|
| JARVIS_CHEAP_PLATFORM | AI platform for cheap operations | Same as JARVIS_PLATFORM | No |
|
|
@@ -141,36 +147,47 @@ Jarvis supports configuration through environment variables that can be set in t
|
|
|
141
147
|
|
|
142
148
|
## 🎯 Usage
|
|
143
149
|
|
|
144
|
-
###
|
|
150
|
+
### Main Assistant
|
|
145
151
|
```bash
|
|
146
152
|
jarvis
|
|
147
153
|
```
|
|
148
154
|
|
|
149
|
-
|
|
150
|
-
### With Specific Model
|
|
155
|
+
### Code Generation
|
|
151
156
|
```bash
|
|
152
|
-
jarvis
|
|
153
|
-
jarvis -p openai # Use OpenAI platform
|
|
154
|
-
```
|
|
155
|
-
|
|
156
|
-
### Code Modification
|
|
157
|
-
```bash
|
|
158
|
-
jarvis-coder --feature "Add new feature" # Modify code to add new feature
|
|
157
|
+
jarvis-coder
|
|
159
158
|
```
|
|
160
159
|
|
|
161
160
|
### Codebase Search
|
|
162
161
|
```bash
|
|
163
|
-
|
|
162
|
+
# Generate codebase index
|
|
163
|
+
jarvis-codebase --generate
|
|
164
|
+
|
|
165
|
+
# Search similar code
|
|
166
|
+
jarvis-codebase --search "your search query"
|
|
167
|
+
|
|
168
|
+
# Ask questions about codebase
|
|
169
|
+
jarvis-codebase --ask "your question"
|
|
164
170
|
```
|
|
165
171
|
|
|
166
|
-
###
|
|
172
|
+
### Document Analysis (RAG)
|
|
167
173
|
```bash
|
|
168
|
-
|
|
174
|
+
# Build document index
|
|
175
|
+
jarvis-rag --dir /path/to/documents --build
|
|
176
|
+
|
|
177
|
+
# Search documents
|
|
178
|
+
jarvis-rag --query "your search query"
|
|
169
179
|
```
|
|
170
180
|
|
|
171
|
-
###
|
|
181
|
+
### Search Tool
|
|
172
182
|
```bash
|
|
173
|
-
|
|
183
|
+
# Basic search
|
|
184
|
+
jarvis-search "your query"
|
|
185
|
+
|
|
186
|
+
# Show only URLs
|
|
187
|
+
jarvis-search "your query" --url-only
|
|
188
|
+
|
|
189
|
+
# Limit results
|
|
190
|
+
jarvis-search "your query" --max 3
|
|
174
191
|
```
|
|
175
192
|
|
|
176
193
|
## 🛠️ Tools
|
|
@@ -65,6 +65,7 @@ Jarvis supports configuration through environment variables that can be set in t
|
|
|
65
65
|
|---------|------|--------|------|
|
|
66
66
|
| JARVIS_PLATFORM | AI platform to use, supports kimi/openai/ai8 etc | kimi | Yes |
|
|
67
67
|
| JARVIS_MODEL | Model name to use | - | No |
|
|
68
|
+
| JARVIS_THREAD_COUNT | Number of threads for parallel processing | 10 | No |
|
|
68
69
|
| JARVIS_CODEGEN_PLATFORM | AI platform for code generation | Same as JARVIS_PLATFORM | No |
|
|
69
70
|
| JARVIS_CODEGEN_MODEL | Model name for code generation | Same as JARVIS_MODEL | No |
|
|
70
71
|
| JARVIS_CHEAP_PLATFORM | AI platform for cheap operations | Same as JARVIS_PLATFORM | No |
|
|
@@ -82,36 +83,47 @@ Jarvis supports configuration through environment variables that can be set in t
|
|
|
82
83
|
|
|
83
84
|
## 🎯 Usage
|
|
84
85
|
|
|
85
|
-
###
|
|
86
|
+
### Main Assistant
|
|
86
87
|
```bash
|
|
87
88
|
jarvis
|
|
88
89
|
```
|
|
89
90
|
|
|
90
|
-
|
|
91
|
-
### With Specific Model
|
|
91
|
+
### Code Generation
|
|
92
92
|
```bash
|
|
93
|
-
jarvis
|
|
94
|
-
jarvis -p openai # Use OpenAI platform
|
|
95
|
-
```
|
|
96
|
-
|
|
97
|
-
### Code Modification
|
|
98
|
-
```bash
|
|
99
|
-
jarvis-coder --feature "Add new feature" # Modify code to add new feature
|
|
93
|
+
jarvis-coder
|
|
100
94
|
```
|
|
101
95
|
|
|
102
96
|
### Codebase Search
|
|
103
97
|
```bash
|
|
104
|
-
|
|
98
|
+
# Generate codebase index
|
|
99
|
+
jarvis-codebase --generate
|
|
100
|
+
|
|
101
|
+
# Search similar code
|
|
102
|
+
jarvis-codebase --search "your search query"
|
|
103
|
+
|
|
104
|
+
# Ask questions about codebase
|
|
105
|
+
jarvis-codebase --ask "your question"
|
|
105
106
|
```
|
|
106
107
|
|
|
107
|
-
###
|
|
108
|
+
### Document Analysis (RAG)
|
|
108
109
|
```bash
|
|
109
|
-
|
|
110
|
+
# Build document index
|
|
111
|
+
jarvis-rag --dir /path/to/documents --build
|
|
112
|
+
|
|
113
|
+
# Search documents
|
|
114
|
+
jarvis-rag --query "your search query"
|
|
110
115
|
```
|
|
111
116
|
|
|
112
|
-
###
|
|
117
|
+
### Search Tool
|
|
113
118
|
```bash
|
|
114
|
-
|
|
119
|
+
# Basic search
|
|
120
|
+
jarvis-search "your query"
|
|
121
|
+
|
|
122
|
+
# Show only URLs
|
|
123
|
+
jarvis-search "your query" --url-only
|
|
124
|
+
|
|
125
|
+
# Limit results
|
|
126
|
+
jarvis-search "your query" --max 3
|
|
115
127
|
```
|
|
116
128
|
|
|
117
129
|
## 🛠️ Tools
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "jarvis-ai-assistant"
|
|
7
|
-
version = "0.1.75"
|
|
7
|
+
version = "0.1.77"
|
|
8
8
|
description = "Jarvis: An AI assistant that uses tools to interact with the system"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "Your Name", email = "your.email@example.com" }]
|
|
@@ -17,6 +17,7 @@ classifiers = [
|
|
|
17
17
|
"Programming Language :: Python :: 3.9",
|
|
18
18
|
"Programming Language :: Python :: 3.10",
|
|
19
19
|
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Operating System :: POSIX :: Linux",
|
|
20
21
|
]
|
|
21
22
|
keywords = ["jarvis", "ai", "assistant", "tools", "automation"]
|
|
22
23
|
dependencies = [
|
|
@@ -30,6 +31,10 @@ dependencies = [
|
|
|
30
31
|
"faiss-cpu>=1.8.0",
|
|
31
32
|
"sentence-transformers>=2.2.2",
|
|
32
33
|
"bs4>=0.0.1",
|
|
34
|
+
"PyMuPDF>=1.21.0",
|
|
35
|
+
"python-docx>=0.8.11",
|
|
36
|
+
"tiktoken>=0.3.0",
|
|
37
|
+
"tqdm>=4.65.0",
|
|
33
38
|
]
|
|
34
39
|
requires-python = ">=3.8"
|
|
35
40
|
|
|
@@ -43,3 +48,4 @@ Homepage = "https://github.com/skyfireitdiy/Jarvis"
|
|
|
43
48
|
jarvis = "jarvis.main:main"
|
|
44
49
|
jarvis-coder = "jarvis.jarvis_coder.main:main"
|
|
45
50
|
jarvis-codebase = "jarvis.jarvis_codebase.main:main"
|
|
51
|
+
jarvis-rag = "jarvis.rag.main:main"
|
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="jarvis-ai-assistant",
|
|
5
|
-
version="0.1.75",
|
|
5
|
+
version="0.1.77",
|
|
6
6
|
author="skyfire",
|
|
7
7
|
author_email="skyfireitdiy@hotmail.com",
|
|
8
8
|
description="An AI assistant that uses various tools to interact with the system",
|
|
@@ -23,12 +23,17 @@ setup(
|
|
|
23
23
|
"faiss-cpu>=1.8.0",
|
|
24
24
|
"sentence-transformers>=2.2.2",
|
|
25
25
|
"bs4>=0.0.1",
|
|
26
|
+
"PyMuPDF>=1.21.0",
|
|
27
|
+
"python-docx>=0.8.11",
|
|
28
|
+
"tiktoken>=0.3.0",
|
|
29
|
+
"tqdm>=4.65.0",
|
|
26
30
|
],
|
|
27
31
|
entry_points={
|
|
28
32
|
"console_scripts": [
|
|
29
33
|
"jarvis=jarvis.main:main",
|
|
30
34
|
"jarvis-coder=jarvis.jarvis_coder.main:main",
|
|
31
35
|
"jarvis-codebase=jarvis.jarvis_codebase.main:main",
|
|
36
|
+
"jarvis-rag=jarvis.rag.main:main",
|
|
32
37
|
],
|
|
33
38
|
},
|
|
34
39
|
python_requires=">=3.8",
|
|
@@ -36,7 +41,7 @@ setup(
|
|
|
36
41
|
"Development Status :: 3 - Alpha",
|
|
37
42
|
"Intended Audience :: Developers",
|
|
38
43
|
"License :: OSI Approved :: MIT License",
|
|
39
|
-
"Operating System ::
|
|
44
|
+
"Operating System :: POSIX :: Linux",
|
|
40
45
|
"Programming Language :: Python :: 3",
|
|
41
46
|
"Programming Language :: Python :: 3.8",
|
|
42
47
|
"Programming Language :: Python :: 3.9",
|
|
@@ -8,7 +8,7 @@ import json
|
|
|
8
8
|
|
|
9
9
|
from .models.registry import PlatformRegistry
|
|
10
10
|
from .tools import ToolRegistry
|
|
11
|
-
from .utils import PrettyOutput, OutputType, get_multiline_input, while_success
|
|
11
|
+
from .utils import PrettyOutput, OutputType, get_multiline_input, load_embedding_model, while_success
|
|
12
12
|
import os
|
|
13
13
|
from datetime import datetime
|
|
14
14
|
from prompt_toolkit import prompt
|
|
@@ -37,15 +37,15 @@ class Agent:
|
|
|
37
37
|
|
|
38
38
|
# 初始化嵌入模型
|
|
39
39
|
try:
|
|
40
|
-
|
|
41
|
-
PrettyOutput.print(f"正在加载嵌入模型: {self.embedding_model_name}...", OutputType.INFO)
|
|
42
|
-
self.embedding_model = SentenceTransformer(self.embedding_model_name)
|
|
40
|
+
self.embedding_model = load_embedding_model(self.embedding_model_name)
|
|
43
41
|
|
|
44
42
|
# 预热模型并获取正确的维度
|
|
45
43
|
test_text = "这是一段测试文本,用于确保模型完全加载。"
|
|
46
|
-
test_embedding = self.embedding_model.encode(
|
|
47
|
-
|
|
48
|
-
|
|
44
|
+
test_embedding = self.embedding_model.encode(
|
|
45
|
+
test_text,
|
|
46
|
+
convert_to_tensor=True,
|
|
47
|
+
normalize_embeddings=True
|
|
48
|
+
)
|
|
49
49
|
self.embedding_dimension = len(test_embedding)
|
|
50
50
|
PrettyOutput.print("嵌入模型加载完成", OutputType.SUCCESS)
|
|
51
51
|
|
|
@@ -140,6 +140,7 @@ class Agent:
|
|
|
140
140
|
|
|
141
141
|
def _load_methodology(self, user_input: str) -> Dict[str, str]:
|
|
142
142
|
"""加载方法论并构建向量索引"""
|
|
143
|
+
PrettyOutput.print("加载方法论...", OutputType.PLANNING)
|
|
143
144
|
user_jarvis_methodology = os.path.expanduser("~/.jarvis_methodology")
|
|
144
145
|
if not os.path.exists(user_jarvis_methodology):
|
|
145
146
|
return {}
|
|
@@ -290,6 +291,7 @@ class Agent:
|
|
|
290
291
|
self.prompt = summary_prompt
|
|
291
292
|
return self._call_model(self.prompt)
|
|
292
293
|
|
|
294
|
+
|
|
293
295
|
def run(self, user_input: str, file_list: Optional[List[str]] = None, keep_history: bool = False) -> str:
|
|
294
296
|
"""处理用户输入并返回响应,返回任务总结报告
|
|
295
297
|
|
|
@@ -302,6 +304,7 @@ class Agent:
|
|
|
302
304
|
str: 任务总结报告
|
|
303
305
|
"""
|
|
304
306
|
try:
|
|
307
|
+
PrettyOutput.section("准备环境", OutputType.PLANNING)
|
|
305
308
|
if file_list:
|
|
306
309
|
self.model.upload_files(file_list)
|
|
307
310
|
|
|
@@ -313,18 +316,23 @@ class Agent:
|
|
|
313
316
|
{methodology}
|
|
314
317
|
|
|
315
318
|
"""
|
|
319
|
+
tools_prompt = ""
|
|
320
|
+
|
|
321
|
+
# 选择工具
|
|
322
|
+
tools = self.tool_registry.get_all_tools()
|
|
323
|
+
if tools:
|
|
324
|
+
tools_prompt += "可用工具:\n"
|
|
325
|
+
for tool in tools:
|
|
326
|
+
PrettyOutput.print(f"选择工具: {tool['name']}", OutputType.INFO)
|
|
327
|
+
tools_prompt += f"- 名称: {tool['name']}\n"
|
|
328
|
+
tools_prompt += f" 描述: {tool['description']}\n"
|
|
329
|
+
tools_prompt += f" 参数: {tool['parameters']}\n"
|
|
316
330
|
|
|
317
|
-
self.clear_history()
|
|
318
|
-
self.conversation_turns = 0
|
|
319
|
-
|
|
320
331
|
# 显示任务开始
|
|
321
332
|
PrettyOutput.section(f"开始新任务: {self.name}", OutputType.PLANNING)
|
|
322
333
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
tools_prompt += f"- 名称: {tool['name']}\n"
|
|
326
|
-
tools_prompt += f" 描述: {tool['description']}\n"
|
|
327
|
-
tools_prompt += f" 参数: {tool['parameters']}\n"
|
|
334
|
+
self.clear_history()
|
|
335
|
+
self.conversation_turns = 0
|
|
328
336
|
|
|
329
337
|
self.model.set_system_message(f"""你是 {self.name},一个问题处理能力强大的 AI 助手。
|
|
330
338
|
|
{jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/jarvis_codebase/main.py
RENAMED
|
@@ -7,7 +7,7 @@ from jarvis.models.registry import PlatformRegistry
|
|
|
7
7
|
import concurrent.futures
|
|
8
8
|
from threading import Lock
|
|
9
9
|
from concurrent.futures import ThreadPoolExecutor
|
|
10
|
-
from jarvis.utils import OutputType, PrettyOutput, find_git_root
|
|
10
|
+
from jarvis.utils import OutputType, PrettyOutput, find_git_root, load_embedding_model
|
|
11
11
|
from jarvis.utils import load_env_from_file
|
|
12
12
|
import argparse
|
|
13
13
|
from sentence_transformers import SentenceTransformer
|
|
@@ -43,9 +43,7 @@ class CodeBase:
|
|
|
43
43
|
|
|
44
44
|
# 初始化嵌入模型,使用系统默认缓存目录
|
|
45
45
|
try:
|
|
46
|
-
|
|
47
|
-
PrettyOutput.print("正在加载/下载模型,请稍候...", output_type=OutputType.INFO)
|
|
48
|
-
self.embedding_model = SentenceTransformer(self.embedding_model_name)
|
|
46
|
+
self.embedding_model = load_embedding_model(self.embedding_model_name)
|
|
49
47
|
|
|
50
48
|
# 强制完全加载所有模型组件
|
|
51
49
|
test_text = """
|
|
File without changes
|
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import hashlib
|
|
3
|
+
import numpy as np
|
|
4
|
+
import faiss
|
|
5
|
+
from typing import List, Tuple, Optional, Dict
|
|
6
|
+
from sentence_transformers import SentenceTransformer
|
|
7
|
+
import pickle
|
|
8
|
+
from jarvis.utils import OutputType, PrettyOutput, find_git_root, load_embedding_model
|
|
9
|
+
from jarvis.utils import load_env_from_file
|
|
10
|
+
import tiktoken
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
import fitz # PyMuPDF for PDF files
|
|
14
|
+
from docx import Document as DocxDocument # python-docx for DOCX files
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
@dataclass
class Document:
    """One piece of extracted text together with information about its origin."""

    # The text of this piece.
    content: str
    # Descriptive information about the piece (file path, position, and so on).
    metadata: Dict
|
|
22
|
+
|
|
23
|
+
class FileProcessor:
    """Base class for file processors.

    Subclasses override both static methods; the versions defined here
    always raise ``NotImplementedError``.
    """

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Report whether this processor is able to process ``file_path``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text content of ``file_path``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
|
|
34
|
+
|
|
35
|
+
class TextFileProcessor(FileProcessor):
    """Processor for plain-text files stored in one of a fixed list of encodings."""

    # Candidate encodings, tried in this order.
    ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin1']
    # Number of leading bytes inspected when sniffing the file type (8 KB).
    SAMPLE_SIZE = 8192

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Decide whether ``file_path`` looks like a text file.

        Only the first ``SAMPLE_SIZE`` bytes are inspected.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            True when the sample contains no NUL byte, at most 30% control
            bytes (tab, newline and carriage return excluded) and decodes with
            one of ``ENCODINGS``. False otherwise, including for empty or
            unreadable files.
        """
        try:
            with open(file_path, 'rb') as f:
                sample = f.read(TextFileProcessor.SAMPLE_SIZE)
        except (OSError, ValueError):
            # Unreadable file or invalid path: treat as "not handled".
            return False

        # An empty file is rejected explicitly; the ratio below would
        # otherwise divide by zero.
        if not sample:
            return False

        # A NUL byte is taken as a sign of binary content.
        if b'\x00' in sample:
            return False

        # Too many control bytes is also taken as a sign of binary content.
        control_bytes = sum(1 for byte in sample if byte < 32 and byte not in (9, 10, 13))
        if control_bytes / len(sample) > 0.3:
            return False

        for encoding in TextFileProcessor.ENCODINGS:
            try:
                sample.decode(encoding)
            except UnicodeDecodeError:
                continue
            return True

        return False

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Read ``file_path`` as text using the first encoding that decodes it.

        The file is read once as bytes and decoded with the first entry of
        ``ENCODINGS`` that succeeds. Line endings are converted to ``\\n`` and
        the text is NFKC-normalized.

        Args:
            file_path: Path of the file to read.

        Returns:
            The decoded, normalized text.

        Raises:
            Exception: if the file cannot be read or decoded; the underlying
                error is attached as ``__cause__``.
        """
        import unicodedata

        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read()

            content = None
            for encoding in TextFileProcessor.ENCODINGS:
                try:
                    content = raw_data.decode(encoding)
                except UnicodeDecodeError:
                    continue
                break

            if content is None:
                # UnicodeDecodeError cannot be built from a message alone (its
                # constructor takes five arguments), so signal with ValueError.
                raise ValueError(f"无法用支持的编码解码文件: {file_path}")

            # Same newline translation that reading in text mode would apply.
            content = content.replace('\r\n', '\n').replace('\r', '\n')

            return unicodedata.normalize('NFKC', content)

        except Exception as e:
            raise Exception(f"读取文件失败: {str(e)}") from e
|
|
103
|
+
|
|
104
|
+
class PDFProcessor(FileProcessor):
    """Processor for PDF files, read through ``fitz`` (PyMuPDF)."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Return True when the file name ends in ``.pdf`` (case-insensitive)."""
        extension = Path(file_path).suffix
        return extension.lower() == '.pdf'

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of every page, joined with newlines."""
        with fitz.open(file_path) as pdf:
            page_texts = [page.get_text() for page in pdf]
        return "\n".join(page_texts)
|
|
117
|
+
|
|
118
|
+
class DocxProcessor(FileProcessor):
    """Processor for DOCX files, read through ``python-docx``."""

    @staticmethod
    def can_handle(file_path: str) -> bool:
        """Return True when the file name ends in ``.docx`` (case-insensitive)."""
        extension = Path(file_path).suffix
        return extension.lower() == '.docx'

    @staticmethod
    def extract_text(file_path: str) -> str:
        """Return the text of every paragraph, joined with newlines."""
        paragraphs = DocxDocument(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
|
|
128
|
+
|
|
129
|
+
class RAGTool:
|
|
130
|
+
def __init__(self, root_dir: str):
|
|
131
|
+
"""初始化RAG工具
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
root_dir: 项目根目录
|
|
135
|
+
"""
|
|
136
|
+
load_env_from_file()
|
|
137
|
+
self.root_dir = root_dir
|
|
138
|
+
os.chdir(self.root_dir)
|
|
139
|
+
|
|
140
|
+
# 初始化配置
|
|
141
|
+
self.min_paragraph_length = int(os.environ.get("JARVIS_MIN_PARAGRAPH_LENGTH", "50")) # 最小段落长度
|
|
142
|
+
self.max_paragraph_length = int(os.environ.get("JARVIS_MAX_PARAGRAPH_LENGTH", "1000")) # 最大段落长度
|
|
143
|
+
self.embedding_model_name = os.environ.get("JARVIS_EMBEDDING_MODEL", "BAAI/bge-large-zh-v1.5")
|
|
144
|
+
|
|
145
|
+
# 初始化数据目录
|
|
146
|
+
self.data_dir = os.path.join(self.root_dir, ".jarvis-rag")
|
|
147
|
+
if not os.path.exists(self.data_dir):
|
|
148
|
+
os.makedirs(self.data_dir)
|
|
149
|
+
|
|
150
|
+
# 初始化嵌入模型
|
|
151
|
+
try:
|
|
152
|
+
self.embedding_model = load_embedding_model(self.embedding_model_name)
|
|
153
|
+
self.vector_dim = self.embedding_model.get_sentence_embedding_dimension()
|
|
154
|
+
PrettyOutput.print("模型加载完成", output_type=OutputType.SUCCESS)
|
|
155
|
+
except Exception as e:
|
|
156
|
+
PrettyOutput.print(f"加载模型失败: {str(e)}", output_type=OutputType.ERROR)
|
|
157
|
+
raise
|
|
158
|
+
|
|
159
|
+
# 初始化缓存和索引
|
|
160
|
+
self.cache_path = os.path.join(self.data_dir, "cache.pkl")
|
|
161
|
+
self.documents: List[Document] = []
|
|
162
|
+
self.index = None
|
|
163
|
+
|
|
164
|
+
# 加载缓存
|
|
165
|
+
self._load_cache()
|
|
166
|
+
|
|
167
|
+
# 注册文件处理器
|
|
168
|
+
self.file_processors = [
|
|
169
|
+
TextFileProcessor(),
|
|
170
|
+
PDFProcessor(),
|
|
171
|
+
DocxProcessor()
|
|
172
|
+
]
|
|
173
|
+
|
|
174
|
+
def _load_cache(self):
|
|
175
|
+
"""加载缓存数据"""
|
|
176
|
+
if os.path.exists(self.cache_path):
|
|
177
|
+
try:
|
|
178
|
+
with open(self.cache_path, 'rb') as f:
|
|
179
|
+
cache_data = pickle.load(f)
|
|
180
|
+
self.documents = cache_data["documents"]
|
|
181
|
+
vectors = cache_data["vectors"]
|
|
182
|
+
|
|
183
|
+
# 重建索引
|
|
184
|
+
self._build_index(vectors)
|
|
185
|
+
PrettyOutput.print(f"加载了 {len(self.documents)} 个文档片段",
|
|
186
|
+
output_type=OutputType.INFO)
|
|
187
|
+
except Exception as e:
|
|
188
|
+
PrettyOutput.print(f"加载缓存失败: {str(e)}",
|
|
189
|
+
output_type=OutputType.WARNING)
|
|
190
|
+
self.documents = []
|
|
191
|
+
self.index = None
|
|
192
|
+
|
|
193
|
+
def _save_cache(self, vectors: np.ndarray):
|
|
194
|
+
"""保存缓存数据"""
|
|
195
|
+
try:
|
|
196
|
+
cache_data = {
|
|
197
|
+
"documents": self.documents,
|
|
198
|
+
"vectors": vectors
|
|
199
|
+
}
|
|
200
|
+
with open(self.cache_path, 'wb') as f:
|
|
201
|
+
pickle.dump(cache_data, f)
|
|
202
|
+
PrettyOutput.print(f"保存了 {len(self.documents)} 个文档片段",
|
|
203
|
+
output_type=OutputType.INFO)
|
|
204
|
+
except Exception as e:
|
|
205
|
+
PrettyOutput.print(f"保存缓存失败: {str(e)}",
|
|
206
|
+
output_type=OutputType.ERROR)
|
|
207
|
+
|
|
208
|
+
def _build_index(self, vectors: np.ndarray):
|
|
209
|
+
"""构建FAISS索引"""
|
|
210
|
+
# 创建HNSW索引
|
|
211
|
+
hnsw_index = faiss.IndexHNSWFlat(self.vector_dim, 16)
|
|
212
|
+
hnsw_index.hnsw.efConstruction = 40
|
|
213
|
+
hnsw_index.hnsw.efSearch = 16
|
|
214
|
+
|
|
215
|
+
# 用IndexIDMap包装HNSW索引
|
|
216
|
+
self.index = faiss.IndexIDMap(hnsw_index)
|
|
217
|
+
|
|
218
|
+
# 添加向量到索引
|
|
219
|
+
if vectors.shape[0] > 0:
|
|
220
|
+
self.index.add_with_ids(vectors, np.arange(vectors.shape[0]))
|
|
221
|
+
else:
|
|
222
|
+
self.index = None
|
|
223
|
+
|
|
224
|
+
def _split_text(self, text: str) -> List[str]:
|
|
225
|
+
"""将文本分割成段落
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
text: 要分割的文本
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
分割后的段落列表
|
|
232
|
+
"""
|
|
233
|
+
# 首先按空行分割
|
|
234
|
+
paragraphs = []
|
|
235
|
+
current_paragraph = []
|
|
236
|
+
|
|
237
|
+
for line in text.split('\n'):
|
|
238
|
+
line = line.strip()
|
|
239
|
+
if not line: # 空行表示段落结束
|
|
240
|
+
if current_paragraph:
|
|
241
|
+
paragraph_text = ' '.join(current_paragraph)
|
|
242
|
+
if len(paragraph_text) >= self.min_paragraph_length:
|
|
243
|
+
paragraphs.append(paragraph_text)
|
|
244
|
+
current_paragraph = []
|
|
245
|
+
else:
|
|
246
|
+
current_paragraph.append(line)
|
|
247
|
+
|
|
248
|
+
# 处理最后一个段落
|
|
249
|
+
if current_paragraph:
|
|
250
|
+
paragraph_text = ' '.join(current_paragraph)
|
|
251
|
+
if len(paragraph_text) >= self.min_paragraph_length:
|
|
252
|
+
paragraphs.append(paragraph_text)
|
|
253
|
+
|
|
254
|
+
# 处理过长的段落
|
|
255
|
+
final_paragraphs = []
|
|
256
|
+
for paragraph in paragraphs:
|
|
257
|
+
if len(paragraph) <= self.max_paragraph_length:
|
|
258
|
+
final_paragraphs.append(paragraph)
|
|
259
|
+
else:
|
|
260
|
+
# 按句子分割过长的段落
|
|
261
|
+
sentences = []
|
|
262
|
+
current_sentence = []
|
|
263
|
+
|
|
264
|
+
# 中文句子结束标记
|
|
265
|
+
sentence_ends = {'。', '!', '?', '…', '.', '!', '?'}
|
|
266
|
+
|
|
267
|
+
for char in paragraph:
|
|
268
|
+
current_sentence.append(char)
|
|
269
|
+
if char in sentence_ends:
|
|
270
|
+
sentence = ''.join(current_sentence)
|
|
271
|
+
if sentence.strip():
|
|
272
|
+
sentences.append(sentence)
|
|
273
|
+
current_sentence = []
|
|
274
|
+
|
|
275
|
+
# 处理最后一个句子
|
|
276
|
+
if current_sentence:
|
|
277
|
+
sentence = ''.join(current_sentence)
|
|
278
|
+
if sentence.strip():
|
|
279
|
+
sentences.append(sentence)
|
|
280
|
+
|
|
281
|
+
# 组合句子成适当长度的段落
|
|
282
|
+
current_chunk = []
|
|
283
|
+
current_length = 0
|
|
284
|
+
|
|
285
|
+
for sentence in sentences:
|
|
286
|
+
sentence_length = len(sentence)
|
|
287
|
+
if current_length + sentence_length > self.max_paragraph_length:
|
|
288
|
+
if current_chunk:
|
|
289
|
+
final_paragraphs.append(''.join(current_chunk))
|
|
290
|
+
current_chunk = [sentence]
|
|
291
|
+
current_length = sentence_length
|
|
292
|
+
else:
|
|
293
|
+
current_chunk.append(sentence)
|
|
294
|
+
current_length += sentence_length
|
|
295
|
+
|
|
296
|
+
# 处理最后一个chunk
|
|
297
|
+
if current_chunk:
|
|
298
|
+
final_paragraphs.append(''.join(current_chunk))
|
|
299
|
+
|
|
300
|
+
# 过滤掉太短的段落
|
|
301
|
+
final_paragraphs = [p for p in final_paragraphs if len(p) >= self.min_paragraph_length]
|
|
302
|
+
|
|
303
|
+
return final_paragraphs
|
|
304
|
+
|
|
305
|
+
def _get_embedding(self, text: str) -> np.ndarray:
|
|
306
|
+
"""获取文本的向量表示"""
|
|
307
|
+
embedding = self.embedding_model.encode(text,
|
|
308
|
+
normalize_embeddings=True,
|
|
309
|
+
show_progress_bar=False)
|
|
310
|
+
return np.array(embedding, dtype=np.float32)
|
|
311
|
+
|
|
312
|
+
def _process_file(self, file_path: str) -> List[Document]:
|
|
313
|
+
"""处理单个文件
|
|
314
|
+
|
|
315
|
+
Args:
|
|
316
|
+
file_path: 文件路径
|
|
317
|
+
|
|
318
|
+
Returns:
|
|
319
|
+
文档对象列表
|
|
320
|
+
"""
|
|
321
|
+
try:
|
|
322
|
+
# 查找合适的处理器
|
|
323
|
+
processor = None
|
|
324
|
+
for p in self.file_processors:
|
|
325
|
+
if p.can_handle(file_path):
|
|
326
|
+
processor = p
|
|
327
|
+
break
|
|
328
|
+
|
|
329
|
+
if not processor:
|
|
330
|
+
PrettyOutput.print(f"跳过不支持的文件: {file_path}",
|
|
331
|
+
output_type=OutputType.WARNING)
|
|
332
|
+
return []
|
|
333
|
+
|
|
334
|
+
# 提取文本内容
|
|
335
|
+
content = processor.extract_text(file_path)
|
|
336
|
+
if not content.strip():
|
|
337
|
+
PrettyOutput.print(f"文件内容为空: {file_path}",
|
|
338
|
+
output_type=OutputType.WARNING)
|
|
339
|
+
return []
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# 分割文本
|
|
343
|
+
chunks = self._split_text(content)
|
|
344
|
+
|
|
345
|
+
# 创建文档对象
|
|
346
|
+
documents = []
|
|
347
|
+
for i, chunk in enumerate(chunks):
|
|
348
|
+
doc = Document(
|
|
349
|
+
content=chunk,
|
|
350
|
+
metadata={
|
|
351
|
+
"file_path": file_path,
|
|
352
|
+
"file_type": Path(file_path).suffix.lower(),
|
|
353
|
+
"chunk_index": i,
|
|
354
|
+
"total_chunks": len(chunks)
|
|
355
|
+
}
|
|
356
|
+
)
|
|
357
|
+
documents.append(doc)
|
|
358
|
+
|
|
359
|
+
return documents
|
|
360
|
+
|
|
361
|
+
except Exception as e:
|
|
362
|
+
PrettyOutput.print(f"处理文件失败 {file_path}: {str(e)}",
|
|
363
|
+
output_type=OutputType.ERROR)
|
|
364
|
+
return []
|
|
365
|
+
|
|
366
|
+
def build_index(self):
|
|
367
|
+
"""构建文档索引"""
|
|
368
|
+
# 获取所有文件
|
|
369
|
+
all_files = []
|
|
370
|
+
for root, _, files in os.walk(self.root_dir):
|
|
371
|
+
if any(ignored in root for ignored in ['.jarvis-rag', '.git', '__pycache__', 'node_modules']):
|
|
372
|
+
continue
|
|
373
|
+
for file in files:
|
|
374
|
+
file_path = os.path.join(root, file)
|
|
375
|
+
# 跳过大文件
|
|
376
|
+
if os.path.getsize(file_path) > 10 * 1024 * 1024: # 10MB
|
|
377
|
+
PrettyOutput.print(f"跳过大文件: {file_path}",
|
|
378
|
+
output_type=OutputType.WARNING)
|
|
379
|
+
continue
|
|
380
|
+
all_files.append(file_path)
|
|
381
|
+
|
|
382
|
+
# 处理所有文件
|
|
383
|
+
self.documents = []
|
|
384
|
+
for file_path in tqdm(all_files, desc="处理文件"):
|
|
385
|
+
docs = self._process_file(file_path)
|
|
386
|
+
self.documents.extend(docs)
|
|
387
|
+
|
|
388
|
+
# 获取所有文档的向量表示
|
|
389
|
+
vectors = []
|
|
390
|
+
for doc in tqdm(self.documents, desc="生成向量"):
|
|
391
|
+
vector = self._get_embedding(doc.content)
|
|
392
|
+
vectors.append(vector)
|
|
393
|
+
|
|
394
|
+
if vectors:
|
|
395
|
+
vectors = np.vstack(vectors)
|
|
396
|
+
# 构建索引
|
|
397
|
+
self._build_index(vectors)
|
|
398
|
+
# 保存缓存
|
|
399
|
+
self._save_cache(vectors)
|
|
400
|
+
|
|
401
|
+
PrettyOutput.print(f"成功索引了 {len(self.documents)} 个文档片段",
|
|
402
|
+
output_type=OutputType.SUCCESS)
|
|
403
|
+
|
|
404
|
+
def search(self, query: str, top_k: int = 5) -> List[Tuple[Document, float]]:
    """Search the index for the chunks closest to ``query``.

    Args:
        query: Query text.
        top_k: Maximum number of results to return. Values below 1 yield an
            empty list without touching the index.

    Returns:
        A list of ``(document, similarity)`` tuples in the order the index
        returned them, where ``similarity = 1 / (1 + distance)``.

    Raises:
        ValueError: If no index is available yet.
    """
    if not self.index:
        raise ValueError("索引未构建,请先调用build_index()")

    # A non-positive k is rejected by the vector index; there is nothing to
    # return in that case anyway.
    if top_k < 1:
        return []

    # Embed the query and shape it as a single-row batch.
    query_vector = self._get_embedding(query)
    query_vector = query_vector.reshape(1, -1)

    # Look up the nearest vectors.
    distances, indices = self.index.search(query_vector, top_k)

    results = []
    for idx, distance in zip(indices[0], distances[0]):
        # FAISS pads missing neighbours with -1; treat any negative slot as
        # invalid so it can never wrap around to the end of the list.
        if idx < 0:
            continue
        similarity = 1.0 / (1.0 + float(distance))
        results.append((self.documents[int(idx)], similarity))

    return results
|
|
433
|
+
|
|
434
|
+
def main():
    """Command-line entry point for the RAG tool.

    Options:
        --dir    Project root directory (defaults to the current directory).
        --build  Build the index.
        --query  Search query.
        --top-k  Number of results to return (default 5).

    Returns:
        0 on success (including "nothing found" and "no action requested"),
        1 if any step raised an exception.
    """
    import argparse
    import sys

    # Force UTF-8 on stdout/stderr. The encoding name is normalised first so
    # that spellings such as "UTF-8" or "utf8" do not trigger a needless
    # re-wrap, and a missing encoding does not crash the comparison.
    stdout_encoding = (getattr(sys.stdout, 'encoding', None) or '').lower().replace('-', '')
    if stdout_encoding != 'utf8':
        import codecs
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
        sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')

    parser = argparse.ArgumentParser(description='RAG工具')
    parser.add_argument('--dir', type=str, default=os.getcwd(), help='项目根目录')
    parser.add_argument('--build', action='store_true', help='构建索引')
    parser.add_argument('--query', type=str, help='搜索查询')
    parser.add_argument('--top-k', type=int, default=5, help='返回结果数量')

    args = parser.parse_args()

    # Without --build or --query there is nothing to do; show the usage
    # instead of constructing the tool and exiting silently.
    if not args.build and not args.query:
        parser.print_help()
        return 0

    try:
        rag = RAGTool(args.dir)

        if args.build:
            rag.build_index()

        if args.query:
            results = rag.search(args.query, args.top_k)

            if not results:
                PrettyOutput.print("未找到相关内容", output_type=OutputType.WARNING)
                return 0

            PrettyOutput.print("\n搜索结果:", output_type=OutputType.INFO)
            for doc, score in results:
                PrettyOutput.print("\n" + "="*50, output_type=OutputType.INFO)
                PrettyOutput.print(f"文件: {doc.metadata['file_path']}", output_type=OutputType.INFO)
                PrettyOutput.print(f"相似度: {score:.3f}", output_type=OutputType.INFO)
                PrettyOutput.print(f"片段 {doc.metadata['chunk_index'] + 1}/{doc.metadata['total_chunks']}",
                                   output_type=OutputType.INFO)
                PrettyOutput.print("\n内容:", output_type=OutputType.INFO)
                # Round-trip through UTF-8 with replacement so characters
                # that cannot be encoded do not break the output stream.
                content = doc.content.encode('utf-8', errors='replace').decode('utf-8')
                PrettyOutput.print(content, output_type=OutputType.INFO)

    except Exception as e:
        PrettyOutput.print(f"执行失败: {str(e)}", output_type=OutputType.ERROR)
        return 1

    return 0
|
|
481
|
+
|
|
482
|
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status; a bare
    # main() call would discard the failure code and always exit with 0.
    raise SystemExit(main())
|
|
@@ -7,21 +7,21 @@ class CoderTool:
|
|
|
7
7
|
"""代码修改工具"""
|
|
8
8
|
|
|
9
9
|
name = "coder"
|
|
10
|
-
description = "
|
|
10
|
+
description = "分析并修改现有代码,用于实现新功能、修复bug、重构代码等。能理解代码上下文并进行精确的代码编辑。"
|
|
11
11
|
parameters = {
|
|
12
12
|
"feature": {
|
|
13
13
|
"type": "string",
|
|
14
|
-
"description": "
|
|
14
|
+
"description": "要实现的功能描述或需要修改的内容,例如:'添加日志功能'、'修复内存泄漏'、'优化性能'等",
|
|
15
15
|
"required": True
|
|
16
16
|
},
|
|
17
17
|
"dir": {
|
|
18
18
|
"type": "string",
|
|
19
|
-
"description": "
|
|
19
|
+
"description": "项目根目录,默认为当前目录",
|
|
20
20
|
"required": False
|
|
21
21
|
},
|
|
22
22
|
"language": {
|
|
23
23
|
"type": "string",
|
|
24
|
-
"description": "
|
|
24
|
+
"description": "项目的主要编程语言,默认为python",
|
|
25
25
|
"required": False
|
|
26
26
|
}
|
|
27
27
|
}
|
|
@@ -8,33 +8,58 @@ from urllib.parse import quote
|
|
|
8
8
|
def bing_search(query):
    """Run a Bing search in headless Chromium and scrape the result list.

    Args:
        query: Search text; it is URL-quoted before being put in the URL.

    Returns:
        A list of dicts with the keys ``title``, ``href`` and ``abstract``
        (one per ``#b_results > .b_algo`` element that has a link), or
        ``None`` if any step raised an exception.
    """
    try:
        with sync_playwright() as p:
            # Launch the browser with explicit flags.
            browser = p.chromium.launch(
                headless=True,  # headless mode
                args=['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage']
            )

            try:
                # Create the page with a desktop user agent and viewport.
                page = browser.new_page(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080}
                )

                # Default timeout for page operations (milliseconds).
                page.set_default_timeout(60000)

                # Open the search page.
                url = f"https://www.bing.com/search?q={quote(query)}&form=QBLH&sp=-1"
                page.goto(url, wait_until="networkidle")

                # Wait for the result container to become visible.
                page.wait_for_selector("#b_results", state="visible", timeout=30000)

                # Short extra pause so the results can finish rendering.
                page.wait_for_timeout(1000)

                # Extract title, link and abstract of every result.
                summaries = page.evaluate("""() => {
                    const results = [];
                    const elements = document.querySelectorAll("#b_results > .b_algo");

                    for (const el of elements) {
                        const titleEl = el.querySelector("h2");
                        const linkEl = titleEl ? titleEl.querySelector("a") : null;
                        const abstractEl = el.querySelector(".b_caption p");

                        if (linkEl) {
                            results.push({
                                title: titleEl.innerText.trim(),
                                href: linkEl.href,
                                abstract: abstractEl ? abstractEl.innerText.trim() : ""
                            });
                        }
                    }
                    return results;
                }""")
            finally:
                # Close the browser on the failure paths too (timeouts,
                # navigation errors), not only after a successful scrape.
                browser.close()

            return summaries

    except Exception as error:
        PrettyOutput.print(f"搜索出错: {str(error)}", OutputType.ERROR)
        return None
|
|
38
63
|
|
|
39
64
|
class SearchTool:
|
|
40
65
|
name = "search"
|
|
@@ -158,4 +183,46 @@ class SearchTool:
|
|
|
158
183
|
return {
|
|
159
184
|
"success": False,
|
|
160
185
|
"error": f"搜索失败: {str(e)}"
|
|
161
|
-
}
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
def main():
    """Command-line entry point for the Bing search tool.

    Arguments:
        query       Search keywords (positional).
        --max       Maximum number of results to show (default 5, must be >= 1).
        --url-only  Print only the result URLs.

    Exits with status 1 when nothing is found, the search is interrupted or
    an error occurs; an invalid ``--max`` is rejected by argparse.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Bing搜索工具')
    parser.add_argument('query', help='搜索关键词')
    parser.add_argument('--max', type=int, default=5, help='最大结果数量(默认5)')
    parser.add_argument('--url-only', action='store_true', help='只显示URL')
    args = parser.parse_args()

    # A zero or negative limit would make the slice below drop results from
    # the tail (or show nothing) without telling the user why.
    if args.max < 1:
        parser.error("--max 必须大于等于 1")

    try:
        PrettyOutput.print(f"正在搜索: {args.query}", OutputType.INFO)

        results = bing_search(args.query)

        if not results:
            PrettyOutput.print("未找到搜索结果", OutputType.WARNING)
            sys.exit(1)

        # Apply the limit first so the reported count matches what is listed.
        shown = results[:args.max]
        PrettyOutput.print(f"\n找到 {len(shown)} 条结果:", OutputType.INFO)

        for i, result in enumerate(shown, 1):
            PrettyOutput.print(f"\n{'-'*50}", OutputType.INFO)
            if args.url_only:
                PrettyOutput.print(f"{i}. {result['href']}", OutputType.INFO)
            else:
                PrettyOutput.print(f"{i}. {result['title']}", OutputType.INFO)
                PrettyOutput.print(f"链接: {result['href']}", OutputType.INFO)
                if result['abstract']:
                    PrettyOutput.print(f"摘要: {result['abstract']}", OutputType.INFO)

    except KeyboardInterrupt:
        PrettyOutput.print("\n搜索已取消", OutputType.WARNING)
        sys.exit(1)
    except Exception as e:
        PrettyOutput.print(f"执行出错: {str(e)}", OutputType.ERROR)
        sys.exit(1)
|
|
226
|
+
|
|
227
|
+
if __name__ == "__main__":
|
|
228
|
+
main()
|
|
@@ -9,6 +9,7 @@ from colorama import Fore, Style as ColoramaStyle
|
|
|
9
9
|
from prompt_toolkit import PromptSession
|
|
10
10
|
from prompt_toolkit.styles import Style as PromptStyle
|
|
11
11
|
from prompt_toolkit.formatted_text import FormattedText
|
|
12
|
+
from sentence_transformers import SentenceTransformer
|
|
12
13
|
|
|
13
14
|
# 初始化colorama
|
|
14
15
|
colorama.init()
|
|
@@ -206,4 +207,27 @@ def find_git_root(dir="."):
|
|
|
206
207
|
os.chdir(dir)
|
|
207
208
|
ret = os.popen("git rev-parse --show-toplevel").read().strip()
|
|
208
209
|
os.chdir(curr_dir)
|
|
209
|
-
return ret
|
|
210
|
+
return ret
|
|
211
|
+
|
|
212
|
+
def load_embedding_model(model_name: str):
    """Load a SentenceTransformer model on the CPU, preferring local files.

    Sets ``TOKENIZERS_PARALLELISM=false`` in the environment, then tries to
    construct the model with ``local_files_only=True``. If that raises, the
    error is reported as a warning and the model is constructed again
    without that restriction.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    PrettyOutput.print(f"正在加载嵌入模型: {model_name}...", OutputType.INFO)

    # Settings shared by the offline and the online attempt.
    shared_options = {
        "device": "cpu",
        "cache_folder": os.path.expanduser("~/.cache/huggingface/hub"),
    }

    try:
        # First attempt: local files only.
        loaded = SentenceTransformer(model_name, local_files_only=True, **shared_options)
        PrettyOutput.print("使用本地缓存加载模型成功", OutputType.SUCCESS)
    except Exception as offline_failure:
        PrettyOutput.print(f"本地加载失败,尝试在线下载: {str(offline_failure)}", OutputType.WARNING)
        # Second attempt: no local-only restriction.
        loaded = SentenceTransformer(model_name, **shared_options)
        PrettyOutput.print("模型下载并加载成功", OutputType.SUCCESS)

    return loaded
|
{jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77/src/jarvis_ai_assistant.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: jarvis-ai-assistant
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.77
|
|
4
4
|
Summary: Jarvis: An AI assistant that uses tools to interact with the system
|
|
5
5
|
Home-page: https://github.com/skyfireitdiy/Jarvis
|
|
6
6
|
Author: skyfire
|
|
@@ -35,6 +35,7 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
35
35
|
Classifier: Programming Language :: Python :: 3.9
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.10
|
|
37
37
|
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
38
39
|
Requires-Python: >=3.8
|
|
39
40
|
Description-Content-Type: text/markdown
|
|
40
41
|
License-File: LICENSE
|
|
@@ -48,6 +49,10 @@ Requires-Dist: numpy>=1.24.0
|
|
|
48
49
|
Requires-Dist: faiss-cpu>=1.8.0
|
|
49
50
|
Requires-Dist: sentence-transformers>=2.2.2
|
|
50
51
|
Requires-Dist: bs4>=0.0.1
|
|
52
|
+
Requires-Dist: PyMuPDF>=1.21.0
|
|
53
|
+
Requires-Dist: python-docx>=0.8.11
|
|
54
|
+
Requires-Dist: tiktoken>=0.3.0
|
|
55
|
+
Requires-Dist: tqdm>=4.65.0
|
|
51
56
|
Provides-Extra: dev
|
|
52
57
|
Requires-Dist: pytest; extra == "dev"
|
|
53
58
|
Requires-Dist: black; extra == "dev"
|
|
@@ -124,6 +129,7 @@ Jarvis supports configuration through environment variables that can be set in t
|
|
|
124
129
|
|---------|------|--------|------|
|
|
125
130
|
| JARVIS_PLATFORM | AI platform to use, supports kimi/openai/ai8 etc | kimi | Yes |
|
|
126
131
|
| JARVIS_MODEL | Model name to use | - | No |
|
|
132
|
+
| JARVIS_THREAD_COUNT | Number of threads for parallel processing | 10 | No |
|
|
127
133
|
| JARVIS_CODEGEN_PLATFORM | AI platform for code generation | Same as JARVIS_PLATFORM | No |
|
|
128
134
|
| JARVIS_CODEGEN_MODEL | Model name for code generation | Same as JARVIS_MODEL | No |
|
|
129
135
|
| JARVIS_CHEAP_PLATFORM | AI platform for cheap operations | Same as JARVIS_PLATFORM | No |
|
|
@@ -141,36 +147,47 @@ Jarvis supports configuration through environment variables that can be set in t
|
|
|
141
147
|
|
|
142
148
|
## 🎯 Usage
|
|
143
149
|
|
|
144
|
-
###
|
|
150
|
+
### Main Assistant
|
|
145
151
|
```bash
|
|
146
152
|
jarvis
|
|
147
153
|
```
|
|
148
154
|
|
|
149
|
-
|
|
150
|
-
### With Specific Model
|
|
155
|
+
### Code Generation
|
|
151
156
|
```bash
|
|
152
|
-
jarvis
|
|
153
|
-
jarvis -p openai # Use OpenAI platform
|
|
154
|
-
```
|
|
155
|
-
|
|
156
|
-
### Code Modification
|
|
157
|
-
```bash
|
|
158
|
-
jarvis-coder --feature "Add new feature" # Modify code to add new feature
|
|
157
|
+
jarvis-coder
|
|
159
158
|
```
|
|
160
159
|
|
|
161
160
|
### Codebase Search
|
|
162
161
|
```bash
|
|
163
|
-
|
|
162
|
+
# Generate codebase index
|
|
163
|
+
jarvis-codebase --generate
|
|
164
|
+
|
|
165
|
+
# Search similar code
|
|
166
|
+
jarvis-codebase --search "your search query"
|
|
167
|
+
|
|
168
|
+
# Ask questions about codebase
|
|
169
|
+
jarvis-codebase --ask "your question"
|
|
164
170
|
```
|
|
165
171
|
|
|
166
|
-
###
|
|
172
|
+
### Document Analysis (RAG)
|
|
167
173
|
```bash
|
|
168
|
-
|
|
174
|
+
# Build document index
|
|
175
|
+
jarvis-rag --dir /path/to/documents --build
|
|
176
|
+
|
|
177
|
+
# Search documents
|
|
178
|
+
jarvis-rag --query "your search query"
|
|
169
179
|
```
|
|
170
180
|
|
|
171
|
-
###
|
|
181
|
+
### Search Tool
|
|
172
182
|
```bash
|
|
173
|
-
|
|
183
|
+
# Basic search
|
|
184
|
+
jarvis-search "your query"
|
|
185
|
+
|
|
186
|
+
# Show only URLs
|
|
187
|
+
jarvis-search "your query" --url-only
|
|
188
|
+
|
|
189
|
+
# Limit results
|
|
190
|
+
jarvis-search "your query" --max 3
|
|
174
191
|
```
|
|
175
192
|
|
|
176
193
|
## 🛠️ Tools
|
|
@@ -18,6 +18,8 @@ src/jarvis/models/kimi.py
|
|
|
18
18
|
src/jarvis/models/openai.py
|
|
19
19
|
src/jarvis/models/oyi.py
|
|
20
20
|
src/jarvis/models/registry.py
|
|
21
|
+
src/jarvis/rag/__init__.py
|
|
22
|
+
src/jarvis/rag/main.py
|
|
21
23
|
src/jarvis/tools/__init__.py
|
|
22
24
|
src/jarvis/tools/base.py
|
|
23
25
|
src/jarvis/tools/codebase_qa.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/jarvis_codebase/__init__.py
RENAMED
|
File without changes
|
{jarvis_ai_assistant-0.1.75 → jarvis_ai_assistant-0.1.77}/src/jarvis/jarvis_coder/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|