jarvis-ai-assistant 0.1.125__py3-none-any.whl → 0.1.128__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of jarvis-ai-assistant might be problematic. Click here for more details.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +205 -187
- jarvis/jarvis_code_agent/code_agent.py +116 -109
- jarvis/jarvis_code_agent/patch.py +157 -138
- jarvis/jarvis_code_agent/shell_input_handler.py +22 -0
- jarvis/jarvis_codebase/main.py +314 -288
- jarvis/jarvis_dev/main.py +695 -716
- jarvis/jarvis_lsp/base.py +0 -12
- jarvis/jarvis_lsp/cpp.py +0 -9
- jarvis/jarvis_lsp/go.py +0 -9
- jarvis/jarvis_lsp/python.py +0 -28
- jarvis/jarvis_lsp/registry.py +0 -1
- jarvis/jarvis_lsp/rust.py +0 -9
- jarvis/jarvis_multi_agent/__init__.py +52 -52
- jarvis/jarvis_platform/base.py +6 -5
- jarvis/jarvis_platform_manager/main.py +1 -1
- jarvis/jarvis_rag/main.py +250 -186
- jarvis/jarvis_smart_shell/main.py +0 -1
- jarvis/jarvis_tools/ask_codebase.py +10 -9
- jarvis/jarvis_tools/ask_user.py +2 -2
- jarvis/jarvis_tools/base.py +4 -4
- jarvis/jarvis_tools/chdir.py +28 -28
- jarvis/jarvis_tools/code_review.py +44 -39
- jarvis/jarvis_tools/create_code_agent.py +4 -4
- jarvis/jarvis_tools/create_sub_agent.py +7 -7
- jarvis/jarvis_tools/execute_shell.py +53 -23
- jarvis/jarvis_tools/execute_shell_script.py +3 -3
- jarvis/jarvis_tools/file_operation.py +70 -41
- jarvis/jarvis_tools/git_commiter.py +61 -51
- jarvis/jarvis_tools/lsp_find_definition.py +7 -7
- jarvis/jarvis_tools/lsp_prepare_rename.py +7 -7
- jarvis/jarvis_tools/methodology.py +6 -6
- jarvis/jarvis_tools/rag.py +5 -5
- jarvis/jarvis_tools/read_webpage.py +52 -32
- jarvis/jarvis_tools/registry.py +167 -180
- jarvis/jarvis_tools/search_web.py +66 -41
- jarvis/jarvis_tools/select_code_files.py +3 -3
- jarvis/jarvis_tools/tool_generator.py +68 -55
- jarvis/jarvis_utils/methodology.py +77 -59
- jarvis/jarvis_utils/output.py +1 -0
- {jarvis_ai_assistant-0.1.125.dist-info → jarvis_ai_assistant-0.1.128.dist-info}/METADATA +31 -17
- jarvis_ai_assistant-0.1.128.dist-info/RECORD +74 -0
- {jarvis_ai_assistant-0.1.125.dist-info → jarvis_ai_assistant-0.1.128.dist-info}/WHEEL +1 -1
- jarvis/jarvis_tools/lsp_validate_edit.py +0 -141
- jarvis/jarvis_tools/read_code.py +0 -192
- jarvis_ai_assistant-0.1.125.dist-info/RECORD +0 -75
- {jarvis_ai_assistant-0.1.125.dist-info → jarvis_ai_assistant-0.1.128.dist-info}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.125.dist-info → jarvis_ai_assistant-0.1.128.dist-info}/entry_points.txt +0 -0
- {jarvis_ai_assistant-0.1.125.dist-info → jarvis_ai_assistant-0.1.128.dist-info}/top_level.txt +0 -0
jarvis/jarvis_codebase/main.py
CHANGED
|
@@ -4,6 +4,8 @@ import numpy as np
|
|
|
4
4
|
import faiss
|
|
5
5
|
from typing import List, Tuple, Optional, Dict
|
|
6
6
|
|
|
7
|
+
from yaspin import yaspin
|
|
8
|
+
|
|
7
9
|
from jarvis.jarvis_platform.registry import PlatformRegistry
|
|
8
10
|
import concurrent.futures
|
|
9
11
|
from concurrent.futures import ThreadPoolExecutor
|
|
@@ -21,7 +23,11 @@ from jarvis.jarvis_utils.utils import get_file_md5, init_env, user_confirm
|
|
|
21
23
|
|
|
22
24
|
class CodeBase:
|
|
23
25
|
def __init__(self, root_dir: str):
|
|
24
|
-
|
|
26
|
+
with yaspin(text="正在初始化环境...", color="cyan") as spinner:
|
|
27
|
+
init_env()
|
|
28
|
+
spinner.text = "环境初始化完成"
|
|
29
|
+
spinner.ok("✅")
|
|
30
|
+
|
|
25
31
|
self.root_dir = root_dir
|
|
26
32
|
os.chdir(self.root_dir)
|
|
27
33
|
self.thread_count = get_thread_count()
|
|
@@ -29,22 +35,28 @@ class CodeBase:
|
|
|
29
35
|
self.index = None
|
|
30
36
|
|
|
31
37
|
# 初始化数据目录
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
os.
|
|
36
|
-
|
|
38
|
+
with yaspin(text="正在初始化数据目录...", color="cyan") as spinner:
|
|
39
|
+
self.data_dir = os.path.join(self.root_dir, ".jarvis/codebase")
|
|
40
|
+
self.cache_dir = os.path.join(self.data_dir, "cache")
|
|
41
|
+
if not os.path.exists(self.cache_dir):
|
|
42
|
+
os.makedirs(self.cache_dir)
|
|
43
|
+
spinner.text = "数据目录初始化完成"
|
|
44
|
+
spinner.ok("✅")
|
|
45
|
+
|
|
46
|
+
with yaspin("正在初始化嵌入模型...", color="cyan") as spinner:
|
|
37
47
|
# 初始化嵌入模型
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
+
try:
|
|
49
|
+
self.embedding_model = load_embedding_model()
|
|
50
|
+
test_text = """This is a test text"""
|
|
51
|
+
self.embedding_model.encode([test_text],
|
|
52
|
+
convert_to_tensor=True,
|
|
53
|
+
normalize_embeddings=True)
|
|
54
|
+
spinner.text = "嵌入模型初始化完成"
|
|
55
|
+
spinner.ok("✅")
|
|
56
|
+
except Exception as e:
|
|
57
|
+
spinner.text = "嵌入模型初始化失败"
|
|
58
|
+
spinner.fail("❌")
|
|
59
|
+
raise
|
|
48
60
|
|
|
49
61
|
self.vector_dim = self.embedding_model.get_sentence_embedding_dimension()
|
|
50
62
|
self.git_file_list = self.get_git_file_list()
|
|
@@ -55,7 +67,8 @@ class CodeBase:
|
|
|
55
67
|
self.file_paths = []
|
|
56
68
|
|
|
57
69
|
# 加载所有缓存文件
|
|
58
|
-
|
|
70
|
+
with spinner.hidden():
|
|
71
|
+
self._load_all_cache()
|
|
59
72
|
|
|
60
73
|
def get_git_file_list(self):
|
|
61
74
|
"""Get the list of files in the git repository, excluding the .jarvis-codebase directory"""
|
|
@@ -72,17 +85,13 @@ class CodeBase:
|
|
|
72
85
|
|
|
73
86
|
def make_description(self, file_path: str, content: str) -> str:
|
|
74
87
|
model = PlatformRegistry.get_global_platform_registry().get_cheap_platform()
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
Please use concise and professional language, emphasizing technical functionality to facilitate subsequent code retrieval.
|
|
84
|
-
File path: {file_path}
|
|
85
|
-
Code content:
|
|
88
|
+
prompt = f"""请分析以下代码文件并生成详细描述。描述应包含:
|
|
89
|
+
1. 文件整体功能描述
|
|
90
|
+
2. 对每个全局变量、函数、类型定义、类、方法和其他代码元素的描述
|
|
91
|
+
|
|
92
|
+
请使用简洁专业的语言,强调技术功能,以便于后续代码检索。
|
|
93
|
+
文件路径: {file_path}
|
|
94
|
+
代码内容:
|
|
86
95
|
{content}
|
|
87
96
|
"""
|
|
88
97
|
response = model.chat_until_success(prompt)
|
|
@@ -114,50 +123,52 @@ Code content:
|
|
|
114
123
|
|
|
115
124
|
def _load_all_cache(self):
|
|
116
125
|
"""Load all cache files"""
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
for cache_file in os.listdir(self.cache_dir):
|
|
124
|
-
if not cache_file.endswith('.cache'):
|
|
125
|
-
continue
|
|
126
|
-
|
|
127
|
-
cache_path = os.path.join(self.cache_dir, cache_file)
|
|
128
|
-
try:
|
|
129
|
-
with lzma.open(cache_path, 'rb') as f:
|
|
130
|
-
cache_data = pickle.load(f)
|
|
131
|
-
file_path = cache_data["path"]
|
|
132
|
-
self.vector_cache[file_path] = cache_data
|
|
133
|
-
self.file_paths.append(file_path)
|
|
134
|
-
vectors.append(cache_data["vector"])
|
|
135
|
-
except Exception as e:
|
|
136
|
-
PrettyOutput.print(f"加载缓存文件 {cache_file} 失败: {str(e)}",
|
|
137
|
-
output_type=OutputType.WARNING)
|
|
138
|
-
continue
|
|
139
|
-
|
|
140
|
-
if vectors:
|
|
141
|
-
# 重建索引
|
|
142
|
-
vectors_array = np.vstack(vectors)
|
|
143
|
-
hnsw_index = faiss.IndexHNSWFlat(self.vector_dim, 16)
|
|
144
|
-
hnsw_index.hnsw.efConstruction = 40
|
|
145
|
-
hnsw_index.hnsw.efSearch = 16
|
|
146
|
-
self.index = faiss.IndexIDMap(hnsw_index)
|
|
147
|
-
self.index.add_with_ids(vectors_array, np.array(range(len(vectors)))) # type: ignore
|
|
126
|
+
with yaspin(text="正在加载缓存文件...", color="cyan") as spinner:
|
|
127
|
+
try:
|
|
128
|
+
# 清空现有缓存和文件路径
|
|
129
|
+
self.vector_cache = {}
|
|
130
|
+
self.file_paths = []
|
|
131
|
+
vectors = []
|
|
148
132
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
133
|
+
for cache_file in os.listdir(self.cache_dir):
|
|
134
|
+
if not cache_file.endswith('.cache'):
|
|
135
|
+
continue
|
|
136
|
+
|
|
137
|
+
cache_path = os.path.join(self.cache_dir, cache_file)
|
|
138
|
+
try:
|
|
139
|
+
with lzma.open(cache_path, 'rb') as f:
|
|
140
|
+
cache_data = pickle.load(f)
|
|
141
|
+
file_path = cache_data["path"]
|
|
142
|
+
self.vector_cache[file_path] = cache_data
|
|
143
|
+
self.file_paths.append(file_path)
|
|
144
|
+
vectors.append(cache_data["vector"])
|
|
145
|
+
spinner.write(f"✅ 加载缓存文件成功 {file_path}")
|
|
146
|
+
except Exception as e:
|
|
147
|
+
spinner.write(f"❌ 加载缓存文件失败 {cache_file} {str(e)}")
|
|
148
|
+
continue
|
|
154
149
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
150
|
+
if vectors:
|
|
151
|
+
# 重建索引
|
|
152
|
+
vectors_array = np.vstack(vectors)
|
|
153
|
+
hnsw_index = faiss.IndexHNSWFlat(self.vector_dim, 16)
|
|
154
|
+
hnsw_index.hnsw.efConstruction = 40
|
|
155
|
+
hnsw_index.hnsw.efSearch = 16
|
|
156
|
+
self.index = faiss.IndexIDMap(hnsw_index)
|
|
157
|
+
self.index.add_with_ids(vectors_array, np.array(range(len(vectors)))) # type: ignore
|
|
158
|
+
|
|
159
|
+
spinner.text = f"加载 {len(self.vector_cache)} 个向量缓存并重建索引"
|
|
160
|
+
spinner.ok("✅")
|
|
161
|
+
else:
|
|
162
|
+
self.index = None
|
|
163
|
+
spinner.text = "没有找到有效的缓存文件"
|
|
164
|
+
spinner.ok("✅")
|
|
165
|
+
|
|
166
|
+
except Exception as e:
|
|
167
|
+
spinner.text = f"加载缓存目录失败: {str(e)}"
|
|
168
|
+
spinner.fail("❌")
|
|
169
|
+
self.vector_cache = {}
|
|
170
|
+
self.file_paths = []
|
|
171
|
+
self.index = None
|
|
161
172
|
|
|
162
173
|
def cache_vector(self, file_path: str, vector: np.ndarray, description: str):
|
|
163
174
|
"""Cache the vector representation of a file"""
|
|
@@ -320,7 +331,7 @@ Content: {content}
|
|
|
320
331
|
ids = []
|
|
321
332
|
self.file_paths = [] # Reset the file path list
|
|
322
333
|
|
|
323
|
-
for i, (file_path, data) in enumerate(self.vector_cache.items()):
|
|
334
|
+
for i, ( file_path, data) in enumerate(self.vector_cache.items()):
|
|
324
335
|
if "vector" not in data:
|
|
325
336
|
PrettyOutput.print(f"无效的缓存数据 {file_path}: 缺少向量",
|
|
326
337
|
output_type=OutputType.WARNING)
|
|
@@ -450,7 +461,6 @@ Content: {content}
|
|
|
450
461
|
# If force is True, continue directly
|
|
451
462
|
if not force:
|
|
452
463
|
if not user_confirm("重建索引?", False):
|
|
453
|
-
PrettyOutput.print("取消重建索引", output_type=OutputType.INFO)
|
|
454
464
|
return
|
|
455
465
|
|
|
456
466
|
# Clean deleted files
|
|
@@ -464,7 +474,7 @@ Content: {content}
|
|
|
464
474
|
files_to_process = new_files + modified_files
|
|
465
475
|
processed_files = []
|
|
466
476
|
|
|
467
|
-
with
|
|
477
|
+
with yaspin(text="正在处理文件...", color="cyan") as spinner:
|
|
468
478
|
# Use a thread pool to process files
|
|
469
479
|
with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
|
|
470
480
|
# Submit all tasks
|
|
@@ -480,16 +490,18 @@ Content: {content}
|
|
|
480
490
|
result = future.result()
|
|
481
491
|
if result:
|
|
482
492
|
processed_files.append(result)
|
|
493
|
+
spinner.write(f"✅ 处理文件成功 {file}")
|
|
483
494
|
except Exception as e:
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
495
|
+
spinner.write(f"❌ 处理文件失败 {file}: {str(e)}")
|
|
496
|
+
|
|
497
|
+
spinner.text = f"处理完成"
|
|
498
|
+
spinner.ok("✅")
|
|
487
499
|
|
|
488
500
|
if processed_files:
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
501
|
+
with yaspin(text="重建向量数据库...", color="cyan") as spinner:
|
|
502
|
+
self.gen_vector_db_from_cache()
|
|
503
|
+
spinner.text = f"成功生成了 {len(processed_files)} 个文件的索引"
|
|
504
|
+
spinner.ok("✅")
|
|
493
505
|
else:
|
|
494
506
|
PrettyOutput.print("没有检测到文件变化, 不需要重建索引", output_type=OutputType.INFO)
|
|
495
507
|
|
|
@@ -540,79 +552,79 @@ Content: {content}
|
|
|
540
552
|
"""
|
|
541
553
|
if not initial_results:
|
|
542
554
|
return []
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
PrettyOutput.print(f"Truncating large file: {path}", OutputType.WARNING)
|
|
562
|
-
content = content[:max_file_length] + "\n... (content truncated)"
|
|
563
|
-
|
|
564
|
-
file_info = f"File: {path}\nContent: {content}\n\n"
|
|
565
|
-
tokens_count = get_context_token_count(file_info)
|
|
566
|
-
|
|
567
|
-
# If adding this file would exceed batch limit
|
|
568
|
-
if current_token_count + tokens_count > max_batch_length:
|
|
569
|
-
# Process current batch
|
|
570
|
-
if current_batch:
|
|
571
|
-
selected = self._process_batch('\n'.join(query), current_batch)
|
|
572
|
-
all_selected_files.extend(selected)
|
|
573
|
-
# Start new batch
|
|
574
|
-
current_batch = [file_info]
|
|
575
|
-
current_token_count = tokens_count
|
|
576
|
-
else:
|
|
577
|
-
current_batch.append(file_info)
|
|
578
|
-
current_token_count += tokens_count
|
|
555
|
+
with yaspin(text="正在筛选结果...", color="cyan") as spinner:
|
|
556
|
+
try:
|
|
557
|
+
# Maximum content length per batch
|
|
558
|
+
max_batch_length = self.max_token_count - 1000 # Reserve space for prompt
|
|
559
|
+
max_file_length = max_batch_length // 3 # Limit individual file size
|
|
560
|
+
|
|
561
|
+
# Process files in batches
|
|
562
|
+
all_selected_files = []
|
|
563
|
+
current_batch = []
|
|
564
|
+
current_token_count = 0
|
|
565
|
+
|
|
566
|
+
for path in initial_results:
|
|
567
|
+
try:
|
|
568
|
+
content = open(path, "r", encoding="utf-8").read()
|
|
569
|
+
# Truncate large files
|
|
570
|
+
if get_context_token_count(content) > max_file_length:
|
|
571
|
+
spinner.write(f"❌ 截断大文件: {path}")
|
|
572
|
+
content = content[:max_file_length] + "\n... (content truncated)"
|
|
579
573
|
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
574
|
+
file_info = f"File: {path}\nContent: {content}\n\n"
|
|
575
|
+
tokens_count = get_context_token_count(file_info)
|
|
576
|
+
|
|
577
|
+
# If adding this file would exceed batch limit
|
|
578
|
+
if current_token_count + tokens_count > max_batch_length:
|
|
579
|
+
# Process current batch
|
|
580
|
+
if current_batch:
|
|
581
|
+
selected = self._process_batch('\n'.join(query), current_batch)
|
|
582
|
+
all_selected_files.extend(selected)
|
|
583
|
+
# Start new batch
|
|
584
|
+
current_batch = [file_info]
|
|
585
|
+
current_token_count = tokens_count
|
|
586
|
+
else:
|
|
587
|
+
current_batch.append(file_info)
|
|
588
|
+
current_token_count += tokens_count
|
|
589
|
+
|
|
590
|
+
except Exception as e:
|
|
591
|
+
spinner.write(f"❌ 读取 {path} 失败: {str(e)}")
|
|
592
|
+
continue
|
|
593
|
+
|
|
594
|
+
# Process final batch
|
|
595
|
+
if current_batch:
|
|
596
|
+
selected = self._process_batch('\n'.join(query), current_batch)
|
|
597
|
+
all_selected_files.extend(selected)
|
|
598
|
+
|
|
599
|
+
spinner.write("✅ 结果筛选完成")
|
|
600
|
+
# Return the selections as accumulated (already a list, in batch order)
|
|
601
|
+
return all_selected_files
|
|
591
602
|
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
603
|
+
except Exception as e:
|
|
604
|
+
spinner.text = f"选择失败: {str(e)}"
|
|
605
|
+
spinner.fail("❌")
|
|
606
|
+
return [{"file": f, "reason": "" } for f in initial_results]
|
|
595
607
|
|
|
596
608
|
def _process_batch(self, query: str, files_info: List[str]) -> List[Dict[str, str]]:
|
|
597
609
|
"""Process a batch of files"""
|
|
598
|
-
prompt = f"""
|
|
610
|
+
prompt = f"""作为一名代码分析专家,请使用链式思维推理帮助识别与给定查询最相关的文件。
|
|
599
611
|
|
|
600
|
-
|
|
612
|
+
查询: {query}
|
|
601
613
|
|
|
602
|
-
|
|
614
|
+
可用文件:
|
|
603
615
|
{''.join(files_info)}
|
|
604
616
|
|
|
605
|
-
|
|
606
|
-
1.
|
|
607
|
-
2.
|
|
608
|
-
-
|
|
609
|
-
-
|
|
610
|
-
-
|
|
611
|
-
-
|
|
612
|
-
3.
|
|
613
|
-
4.
|
|
614
|
-
|
|
615
|
-
|
|
617
|
+
请按以下步骤思考:
|
|
618
|
+
1. 首先,分析查询以识别关键需求和技术概念
|
|
619
|
+
2. 对于每个文件:
|
|
620
|
+
- 检查其路径和内容
|
|
621
|
+
- 评估其与查询需求的关系
|
|
622
|
+
- 考虑直接和间接关系
|
|
623
|
+
- 评估其相关性(高/中/低)
|
|
624
|
+
3. 仅选择与查询明确相关的文件
|
|
625
|
+
4. 按相关性排序,最相关的文件在前
|
|
626
|
+
|
|
627
|
+
请以YAML格式输出您的选择:
|
|
616
628
|
<FILES>
|
|
617
629
|
- file: path/to/most/relevant.py
|
|
618
630
|
reason: xxxxxxxxxx
|
|
@@ -620,17 +632,16 @@ Please output your selection in YAML format:
|
|
|
620
632
|
reason: yyyyyyyyyy
|
|
621
633
|
</FILES>
|
|
622
634
|
|
|
623
|
-
|
|
624
|
-
-
|
|
625
|
-
-
|
|
626
|
-
-
|
|
627
|
-
-
|
|
628
|
-
-
|
|
635
|
+
重要提示:
|
|
636
|
+
- 仅包含真正相关的文件
|
|
637
|
+
- 排除连接不明确或较弱的文件
|
|
638
|
+
- 重点关注实现文件而非测试文件
|
|
639
|
+
- 同时考虑文件路径和内容
|
|
640
|
+
- 仅输出文件路径,不要包含其他文本
|
|
629
641
|
"""
|
|
630
642
|
|
|
631
643
|
# Use a large model to evaluate
|
|
632
644
|
model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
|
|
633
|
-
model.set_suppress_output(True)
|
|
634
645
|
response = model.chat_until_success(prompt)
|
|
635
646
|
|
|
636
647
|
# Parse the response
|
|
@@ -656,30 +667,28 @@ Important:
|
|
|
656
667
|
List[str]: The query variants list
|
|
657
668
|
"""
|
|
658
669
|
model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
Original query:
|
|
670
|
+
prompt = f"""请基于以下查询生成10个针对向量搜索优化的不同表达。每个表达应满足:
|
|
671
|
+
1. 聚焦关键技术概念和术语
|
|
672
|
+
2. 使用清晰明确的语言
|
|
673
|
+
3. 包含重要的上下文术语
|
|
674
|
+
4. 避免使用通用或模糊的词语
|
|
675
|
+
5. 保持与原始查询的语义相似性
|
|
676
|
+
6. 适合基于嵌入的搜索
|
|
677
|
+
|
|
678
|
+
原始查询:
|
|
670
679
|
{query}
|
|
671
680
|
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
681
|
+
示例转换:
|
|
682
|
+
查询: "如何处理用户登录?"
|
|
683
|
+
输出格式:
|
|
675
684
|
<QUESTION>
|
|
676
|
-
-
|
|
677
|
-
-
|
|
678
|
-
-
|
|
685
|
+
- 用户认证的实现与流程
|
|
686
|
+
- 登录系统架构与组件
|
|
687
|
+
- 凭证验证与会话管理
|
|
679
688
|
- ...
|
|
680
689
|
</QUESTION>
|
|
681
690
|
|
|
682
|
-
|
|
691
|
+
请以指定格式提供10个搜索优化的表达。
|
|
683
692
|
"""
|
|
684
693
|
response = model.chat_until_success(prompt)
|
|
685
694
|
|
|
@@ -733,57 +742,70 @@ Please provide 10 search-optimized expressions in the specified format.
|
|
|
733
742
|
|
|
734
743
|
def search_similar(self, query: str, top_k: int = 30) -> List[Dict[str, str]]:
|
|
735
744
|
"""Search related files with optimized retrieval"""
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
# Get vector for each variant
|
|
750
|
-
query_vector = get_embedding(self.embedding_model, variant)
|
|
751
|
-
query_vector = query_vector.reshape(1, -1)
|
|
745
|
+
with yaspin(text="正在搜索相关文件...", color="cyan") as spinner:
|
|
746
|
+
try:
|
|
747
|
+
with spinner.hidden():
|
|
748
|
+
self.generate_codebase()
|
|
749
|
+
if self.index is None:
|
|
750
|
+
spinner.text = "没有找到有效的缓存文件"
|
|
751
|
+
spinner.ok("✅")
|
|
752
|
+
return []
|
|
753
|
+
|
|
754
|
+
# Generate query variants for better coverage
|
|
755
|
+
spinner.text = "生成查询变体..."
|
|
756
|
+
query_variants = self._generate_query_variants(query)
|
|
757
|
+
spinner.write("✅ 查询变体生成完成")
|
|
752
758
|
|
|
753
|
-
#
|
|
754
|
-
|
|
755
|
-
|
|
759
|
+
# Collect results from all variants
|
|
760
|
+
spinner.text = "收集结果..."
|
|
761
|
+
all_results = []
|
|
762
|
+
seen_files = set()
|
|
756
763
|
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
764
|
+
for variant in query_variants:
|
|
765
|
+
# Get vector for each variant
|
|
766
|
+
query_vector = get_embedding(self.embedding_model, variant)
|
|
767
|
+
query_vector = query_vector.reshape(1, -1)
|
|
768
|
+
|
|
769
|
+
# Search with current variant
|
|
770
|
+
initial_k = min(top_k * 2, len(self.file_paths))
|
|
771
|
+
distances, indices = self.index.search(query_vector, initial_k) # type: ignore
|
|
772
|
+
|
|
773
|
+
# Process results
|
|
774
|
+
for idx, dist in zip(indices[0], distances[0]):
|
|
775
|
+
if idx != -1:
|
|
776
|
+
file_path = self.file_paths[idx]
|
|
777
|
+
if file_path not in seen_files:
|
|
778
|
+
similarity = 1.0 / (1.0 + float(dist))
|
|
779
|
+
if similarity > 0.3: # Lower threshold for better recall
|
|
780
|
+
seen_files.add(file_path)
|
|
781
|
+
all_results.append((file_path, similarity, self.vector_cache[file_path]["description"]))
|
|
782
|
+
spinner.write("✅ 结果收集完成")
|
|
783
|
+
if not all_results:
|
|
784
|
+
spinner.text = "没有找到相关文件"
|
|
785
|
+
spinner.ok("✅")
|
|
786
|
+
return []
|
|
787
|
+
|
|
788
|
+
spinner.text = "排序..."
|
|
789
|
+
# Sort by similarity and take top_k
|
|
790
|
+
all_results.sort(key=lambda x: x[1], reverse=True)
|
|
791
|
+
results = all_results[:top_k]
|
|
792
|
+
spinner.write("✅ 排序完成")
|
|
769
793
|
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
results = all_results[:top_k]
|
|
773
|
-
|
|
774
|
-
results = self.pick_results(query_variants, [path for path, _, _ in results])
|
|
794
|
+
with spinner.hidden():
|
|
795
|
+
results = self.pick_results(query_variants, [path for path, _, _ in results])
|
|
775
796
|
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
PrettyOutput.print(output, output_type=OutputType.INFO, lang="markdown")
|
|
797
|
+
output = "Found related files:\n"
|
|
798
|
+
for file in results:
|
|
799
|
+
output += f'''- {file['file']} ({file['reason']})\n'''
|
|
780
800
|
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
801
|
+
spinner.text="结果输出完成"
|
|
802
|
+
spinner.ok("✅")
|
|
803
|
+
return results
|
|
804
|
+
|
|
805
|
+
except Exception as e:
|
|
806
|
+
spinner.text = f"搜索失败: {str(e)}"
|
|
807
|
+
spinner.fail("❌")
|
|
808
|
+
return []
|
|
787
809
|
|
|
788
810
|
def ask_codebase(self, query: str, top_k: int=20) -> Tuple[List[Dict[str, str]], str]:
|
|
789
811
|
"""Query the codebase with enhanced context building"""
|
|
@@ -794,85 +816,86 @@ Please provide 10 search-optimized expressions in the specified format.
|
|
|
794
816
|
return [], ""
|
|
795
817
|
|
|
796
818
|
prompt = f"""
|
|
797
|
-
# 🤖
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
# 🎯
|
|
801
|
-
-
|
|
802
|
-
-
|
|
803
|
-
-
|
|
804
|
-
-
|
|
805
|
-
-
|
|
806
|
-
|
|
807
|
-
# 📋
|
|
808
|
-
##
|
|
809
|
-
-
|
|
810
|
-
-
|
|
811
|
-
-
|
|
812
|
-
-
|
|
813
|
-
-
|
|
814
|
-
|
|
815
|
-
##
|
|
816
|
-
- question: [
|
|
819
|
+
# 🤖 角色定义
|
|
820
|
+
您是一位代码分析专家,能够提供关于代码库的全面且准确的回答。
|
|
821
|
+
|
|
822
|
+
# 🎯 核心职责
|
|
823
|
+
- 深入分析代码文件
|
|
824
|
+
- 清晰解释技术概念
|
|
825
|
+
- 提供相关代码示例
|
|
826
|
+
- 识别缺失的信息
|
|
827
|
+
- 使用用户的语言进行回答
|
|
828
|
+
|
|
829
|
+
# 📋 回答要求
|
|
830
|
+
## 内容质量
|
|
831
|
+
- 关注实现细节
|
|
832
|
+
- 保持技术准确性
|
|
833
|
+
- 包含相关代码片段
|
|
834
|
+
- 指出任何缺失的信息
|
|
835
|
+
- 使用专业术语
|
|
836
|
+
|
|
837
|
+
## 回答格式
|
|
838
|
+
- question: [重述问题]
|
|
817
839
|
answer: |
|
|
818
|
-
[
|
|
819
|
-
-
|
|
820
|
-
-
|
|
821
|
-
-
|
|
822
|
-
-
|
|
840
|
+
[详细的技术回答,包含:
|
|
841
|
+
- 实现细节
|
|
842
|
+
- 代码示例(如果相关)
|
|
843
|
+
- 缺失的信息(如果有)
|
|
844
|
+
- 相关技术概念]
|
|
823
845
|
|
|
824
|
-
- question: [
|
|
846
|
+
- question: [如果需要,提出后续问题]
|
|
825
847
|
answer: |
|
|
826
|
-
[
|
|
848
|
+
[额外的技术细节]
|
|
827
849
|
|
|
828
|
-
# 🔍
|
|
829
|
-
|
|
850
|
+
# 🔍 分析上下文
|
|
851
|
+
问题: {query}
|
|
830
852
|
|
|
831
|
-
|
|
853
|
+
相关代码文件(按相关性排序):
|
|
832
854
|
"""
|
|
833
855
|
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
{
|
|
845
|
-
```
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
856
|
+
with yaspin(text="正在生成回答...", color="cyan") as spinner:
|
|
857
|
+
# 添加上下文,控制长度
|
|
858
|
+
spinner.text = "添加上下文..."
|
|
859
|
+
available_count = self.max_token_count - get_context_token_count(prompt) - 1000 # 为回答预留空间
|
|
860
|
+
current_count = 0
|
|
861
|
+
|
|
862
|
+
for path in files_from_codebase:
|
|
863
|
+
try:
|
|
864
|
+
content = open(path["file"], "r", encoding="utf-8").read()
|
|
865
|
+
file_content = f"""
|
|
866
|
+
## 文件: {path["file"]}
|
|
867
|
+
```
|
|
868
|
+
{content}
|
|
869
|
+
```
|
|
870
|
+
---
|
|
871
|
+
"""
|
|
872
|
+
if current_count + get_context_token_count(file_content) > available_count:
|
|
873
|
+
spinner.write("⚠️ 由于上下文长度限制, 一些文件被省略")
|
|
874
|
+
break
|
|
875
|
+
|
|
876
|
+
prompt += file_content
|
|
877
|
+
current_count += get_context_token_count(file_content)
|
|
854
878
|
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
except Exception as e:
|
|
859
|
-
PrettyOutput.print(f"读取 {path} 失败: {str(e)}",
|
|
860
|
-
output_type=OutputType.ERROR)
|
|
861
|
-
continue
|
|
862
|
-
|
|
863
|
-
prompt += """
|
|
864
|
-
# ❗ Important Rules
|
|
865
|
-
1. Always base answers on provided code
|
|
866
|
-
2. Use technical precision
|
|
867
|
-
3. Include code examples when relevant
|
|
868
|
-
4. Indicate any missing information
|
|
869
|
-
5. Maintain professional language
|
|
870
|
-
6. Answer in user's language
|
|
871
|
-
"""
|
|
872
|
-
|
|
873
|
-
model = PlatformRegistry.get_global_platform_registry().get_thinking_platform()
|
|
879
|
+
except Exception as e:
|
|
880
|
+
spinner.write(f"❌ 读取 {path} 失败: {str(e)}")
|
|
881
|
+
continue
|
|
874
882
|
|
|
875
|
-
|
|
883
|
+
prompt += """
|
|
884
|
+
# ❗ 重要规则
|
|
885
|
+
1. 始终基于提供的代码进行回答
|
|
886
|
+
2. 保持技术准确性
|
|
887
|
+
3. 在相关时包含代码示例
|
|
888
|
+
4. 指出任何缺失的信息
|
|
889
|
+
5. 保持专业语言
|
|
890
|
+
6. 使用用户的语言进行回答
|
|
891
|
+
"""
|
|
892
|
+
|
|
893
|
+
model = PlatformRegistry.get_global_platform_registry().get_thinking_platform()
|
|
894
|
+
spinner.text = "生成回答..."
|
|
895
|
+
ret = files_from_codebase, model.chat_until_success(prompt)
|
|
896
|
+
spinner.text = "回答生成完成"
|
|
897
|
+
spinner.ok("✅")
|
|
898
|
+
return ret
|
|
876
899
|
|
|
877
900
|
def is_index_generated(self) -> bool:
|
|
878
901
|
"""Check if the index has been generated"""
|
|
@@ -973,9 +996,12 @@ def main():
|
|
|
973
996
|
PrettyOutput.print(output, output_type=OutputType.INFO, lang="markdown")
|
|
974
997
|
|
|
975
998
|
elif args.command == 'ask':
|
|
976
|
-
|
|
977
|
-
output = f""
|
|
978
|
-
|
|
999
|
+
files, answer = codebase.ask_codebase(args.question, args.top_k)
|
|
1000
|
+
output = f"# 相关文件:\n"
|
|
1001
|
+
for file in files:
|
|
1002
|
+
output += f"""- {file['file']} ({file['reason']})\n"""
|
|
1003
|
+
output += f"# 回答:\n{answer}"
|
|
1004
|
+
PrettyOutput.print(output, output_type=OutputType.SYSTEM, lang="markdown")
|
|
979
1005
|
|
|
980
1006
|
else:
|
|
981
1007
|
parser.print_help()
|