PyPI - jarvis-ai-assistant - Versions diffs - 0.1.97__py3-none-any.whl → 0.1.99__py3-none-any.whl - Mend

jarvis-ai-assistant 0.1.97__py3-none-any.whl → 0.1.99__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of jarvis-ai-assistant might be problematic. Click here for more details.

Files changed (41)

jarvis/__init__.py +1 -1
jarvis/agent.py +199 -157
jarvis/jarvis_code_agent/__init__.py +0 -0
jarvis/jarvis_code_agent/main.py +203 -0
jarvis/jarvis_codebase/main.py +412 -284
jarvis/jarvis_coder/file_select.py +209 -0
jarvis/jarvis_coder/git_utils.py +81 -19
jarvis/jarvis_coder/main.py +68 -446
jarvis/jarvis_coder/patch_handler.py +117 -47
jarvis/jarvis_coder/plan_generator.py +69 -27
jarvis/jarvis_platform/main.py +38 -38
jarvis/jarvis_rag/main.py +189 -189
jarvis/jarvis_smart_shell/main.py +22 -24
jarvis/models/base.py +6 -1
jarvis/models/ollama.py +2 -2
jarvis/models/registry.py +3 -6
jarvis/tools/ask_user.py +6 -6
jarvis/tools/codebase_qa.py +5 -7
jarvis/tools/create_code_sub_agent.py +55 -0
jarvis/tools/{sub_agent.py → create_sub_agent.py} +4 -1
jarvis/tools/execute_code_modification.py +72 -0
jarvis/tools/{file_ops.py → file_operation.py} +13 -14
jarvis/tools/find_related_files.py +86 -0
jarvis/tools/methodology.py +25 -25
jarvis/tools/rag.py +32 -32
jarvis/tools/registry.py +72 -36
jarvis/tools/search.py +1 -1
jarvis/tools/select_code_files.py +64 -0
jarvis/utils.py +153 -49
{jarvis_ai_assistant-0.1.97.dist-info → jarvis_ai_assistant-0.1.99.dist-info}/METADATA +1 -1
jarvis_ai_assistant-0.1.99.dist-info/RECORD +52 -0
{jarvis_ai_assistant-0.1.97.dist-info → jarvis_ai_assistant-0.1.99.dist-info}/entry_points.txt +2 -1
jarvis/main.py +0 -155
jarvis/tools/coder.py +0 -69
jarvis_ai_assistant-0.1.97.dist-info/RECORD +0 -47
/jarvis/tools/{shell.py → execute_shell.py} +0 -0
/jarvis/tools/{generator.py → generate_tool.py} +0 -0
/jarvis/tools/{webpage.py → read_webpage.py} +0 -0
{jarvis_ai_assistant-0.1.97.dist-info → jarvis_ai_assistant-0.1.99.dist-info}/LICENSE +0 -0
{jarvis_ai_assistant-0.1.97.dist-info → jarvis_ai_assistant-0.1.99.dist-info}/WHEEL +0 -0
{jarvis_ai_assistant-0.1.97.dist-info → jarvis_ai_assistant-0.1.99.dist-info}/top_level.txt +0 -0

jarvis/jarvis_codebase/main.py CHANGED Viewed

@@ -9,7 +9,7 @@ from jarvis.models.registry import PlatformRegistry
 import concurrent.futures
 from threading import Lock
 from concurrent.futures import ThreadPoolExecutor
-from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_file_md5, get_max_context_length, get_thread_count, load_embedding_model, load_rerank_model
+from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_file_md5, get_max_context_length, get_single_line_input, get_thread_count, load_embedding_model, load_rerank_model
 from jarvis.utils import load_env_from_file
 import argparse
 import pickle
@@ -28,59 +28,37 @@ class CodeBase:
         # 初始化数据目录
         self.data_dir = os.path.join(self.root_dir, ".jarvis-codebase")
-        if not os.path.exists(self.data_dir):
-            os.makedirs(self.data_dir)
+        self.cache_dir = os.path.join(self.data_dir, "cache")
+        if not os.path.exists(self.cache_dir):
+            os.makedirs(self.cache_dir)
-        # 初始化嵌入模型，使用系统默认缓存目录
+        # 初始化嵌入模型
         try:
             self.embedding_model = load_embedding_model()
-            # 强制完全加载所有模型组件
-            test_text = """
-这是一段测试文本，用于确保模型完全加载。
-包含多行内容，以模拟实际使用场景。
-"""
-            # 预热模型，确保所有组件都被加载
+            test_text = """This is a test text"""
             self.embedding_model.encode([test_text],
                                      convert_to_tensor=True,
                                      normalize_embeddings=True)
-            PrettyOutput.print("模型加载完成", output_type=OutputType.SUCCESS)
+            PrettyOutput.print("Model loaded successfully", output_type=OutputType.SUCCESS)
         except Exception as e:
-            PrettyOutput.print(f"加载模型失败: {str(e)}", output_type=OutputType.ERROR)
+            PrettyOutput.print(f"Failed to load model: {str(e)}", output_type=OutputType.ERROR)
             raise
         self.vector_dim = self.embedding_model.get_sentence_embedding_dimension()
         self.git_file_list = self.get_git_file_list()
         self.platform_registry = PlatformRegistry.get_global_platform_registry()
         # 初始化缓存和索引
-        self.cache_path = os.path.join(self.data_dir, "cache.pkl")
         self.vector_cache = {}
         self.file_paths = []
-        # 加载缓存
-        if os.path.exists(self.cache_path):
-            try:
-                with lzma.open(self.cache_path, 'rb') as f:
-                    cache_data = pickle.load(f)
-                    self.vector_cache = cache_data["vectors"]
-                    self.file_paths = cache_data["file_paths"]
-                PrettyOutput.print(f"加载了 {len(self.vector_cache)} 个向量缓存",
-                                 output_type=OutputType.INFO)
-                # 从缓存重建索引
-                self.build_index()
-            except Exception as e:
-                PrettyOutput.print(f"加载缓存失败: {str(e)}",
-                                 output_type=OutputType.WARNING)
-                self.vector_cache = {}
-                self.file_paths = []
-                self.index = None
+        # 加载所有缓存文件
+        self._load_all_cache()
     def get_git_file_list(self):
-        """获取 git 仓库中的文件列表，排除 .jarvis-codebase 目录"""
+        """Get the list of files in the git repository, excluding the .jarvis-codebase directory"""
         files = os.popen("git ls-files").read().splitlines()
-        # 过滤掉 .jarvis-codebase 目录下的文件
+        # Filter out files in the .jarvis-codebase directory
         return [f for f in files if not f.startswith(".jarvis-")]
     def is_text_file(self, file_path: str):
@@ -95,10 +73,11 @@ class CodeBase:
         model = PlatformRegistry.get_global_platform_registry().get_cheap_platform()
         if self.thread_count > 1:
             model.set_suppress_output(True)
+        else:
+            PrettyOutput.print(f"Make description for {file_path} ...", output_type=OutputType.PROGRESS)
         prompt = f"""Please analyze the following code file and generate a detailed description. The description should include:
-1. Overall file functionality description, no more than 100 characters
-2. One-sentence description (max 50 characters) for each global variable, function, type definition, class, method, and other code elements
-3. 5 potential questions users might ask about this file
+1. Overall file functionality description
+2. description for each global variable, function, type definition, class, method, and other code elements
 Please use concise and professional language, emphasizing technical functionality to facilitate subsequent code retrieval.
 File path: {file_path}
@@ -109,42 +88,117 @@ Code content:
         return response
     def export(self):
-        """导出当前索引数据到标准输出"""
+        """Export the current index data to standard output"""
         for file_path, data in self.vector_cache.items():
             print(f"## {file_path}")
             print(f"- path: {file_path}")
             print(f"- description: {data['description']}")
-    def _save_cache(self):
-        """保存缓存数据"""
+    def _get_cache_path(self, file_path: str) -> str:
+        """Get cache file path for a source file
+        Args:
+            file_path: Source file path
+        Returns:
+            str: Cache file path
+        """
+        # 处理文件路径：
+        # 1. 移除开头的 ./ 或 /
+        # 2. 将 / 替换为 --
+        # 3. 添加 .cache 后缀
+        clean_path = file_path.lstrip('./').lstrip('/')
+        cache_name = clean_path.replace('/', '--') + '.cache'
+        return os.path.join(self.cache_dir, cache_name)
+    def _load_all_cache(self):
+        """Load all cache files"""
         try:
-            # 创建缓存数据的副本
-            cache_data = {
-                "vectors": dict(self.vector_cache),  # 创建字典的副本
-                "file_paths": list(self.file_paths)  # 创建列表的副本
-            }
+            # 清空现有缓存和文件路径
+            self.vector_cache = {}
+            self.file_paths = []
+            vectors = []
+            for cache_file in os.listdir(self.cache_dir):
+                if not cache_file.endswith('.cache'):
+                    continue
+                cache_path = os.path.join(self.cache_dir, cache_file)
+                try:
+                    with lzma.open(cache_path, 'rb') as f:
+                        cache_data = pickle.load(f)
+                        file_path = cache_data["path"]
+                        self.vector_cache[file_path] = cache_data
+                        self.file_paths.append(file_path)
+                        vectors.append(cache_data["vector"])
+                except Exception as e:
+                    PrettyOutput.print(f"Failed to load cache file {cache_file}: {str(e)}",
+                                     output_type=OutputType.WARNING)
+                    continue
-            # 使用 lzma 压缩存储
-            with lzma.open(self.cache_path, 'wb') as f:
+            if vectors:
+                # 重建索引
+                vectors_array = np.vstack(vectors)
+                hnsw_index = faiss.IndexHNSWFlat(self.vector_dim, 16)
+                hnsw_index.hnsw.efConstruction = 40
+                hnsw_index.hnsw.efSearch = 16
+                self.index = faiss.IndexIDMap(hnsw_index)
+                self.index.add_with_ids(vectors_array, np.array(range(len(vectors)))) # type: ignore
+                PrettyOutput.print(f"Loaded {len(self.vector_cache)} vector cache and rebuilt index",
+                                 output_type=OutputType.INFO)
+            else:
+                self.index = None
+                PrettyOutput.print("No valid cache files found", output_type=OutputType.WARNING)
+        except Exception as e:
+            PrettyOutput.print(f"Failed to load cache directory: {str(e)}",
+                             output_type=OutputType.WARNING)
+            self.vector_cache = {}
+            self.file_paths = []
+            self.index = None
+    def cache_vector(self, file_path: str, vector: np.ndarray, description: str):
+        """Cache the vector representation of a file"""
+        try:
+            with open(file_path, "rb") as f:
+                file_md5 = hashlib.md5(f.read()).hexdigest()
+        except Exception as e:
+            PrettyOutput.print(f"Failed to calculate MD5 for {file_path}: {str(e)}",
+                              output_type=OutputType.ERROR)
+            file_md5 = ""
+        # 准备缓存数据
+        cache_data = {
+            "path": file_path,  # 保存文件路径
+            "md5": file_md5,    # 保存文件MD5
+            "description": description,  # 保存文件描述
+            "vector": vector    # 保存向量
+        }
+        # 更新内存缓存
+        self.vector_cache[file_path] = cache_data
+        # 保存到单独的缓存文件
+        cache_path = self._get_cache_path(file_path)
+        try:
+            with lzma.open(cache_path, 'wb') as f:
                 pickle.dump(cache_data, f, protocol=pickle.HIGHEST_PROTOCOL)
-            PrettyOutput.print(f"保存了 {len(self.vector_cache)} 个向量缓存",
-                             output_type=OutputType.INFO)
         except Exception as e:
-            PrettyOutput.print(f"保存缓存失败: {str(e)}",
+            PrettyOutput.print(f"Failed to save cache for {file_path}: {str(e)}",
                              output_type=OutputType.ERROR)
-            raise  # 抛出异常以便上层处理
     def get_cached_vector(self, file_path: str, description: str) -> Optional[np.ndarray]:
-        """从缓存获取文件的向量表示"""
+        """Get the vector representation of a file from the cache"""
         if file_path not in self.vector_cache:
             return None
-        # 检查文件是否被修改
+        # Check if the file has been modified
         try:
             with open(file_path, "rb") as f:
                 current_md5 = hashlib.md5(f.read()).hexdigest()
         except Exception as e:
-            PrettyOutput.print(f"计算文件MD5失败 {file_path}: {str(e)}",
+            PrettyOutput.print(f"Failed to calculate MD5 for {file_path}: {str(e)}",
                               output_type=OutputType.ERROR)
             return None
@@ -152,63 +206,45 @@ Code content:
         if cached_data["md5"] != current_md5:
             return None
-        # 检查描述是否变化
+        # Check if the description has changed
         if cached_data["description"] != description:
             return None
         return cached_data["vector"]
-    def cache_vector(self, file_path: str, vector: np.ndarray, description: str):
-        """缓存文件的向量表示"""
-        try:
-            with open(file_path, "rb") as f:
-                file_md5 = hashlib.md5(f.read()).hexdigest()
-        except Exception as e:
-            PrettyOutput.print(f"计算文件MD5失败 {file_path}: {str(e)}",
-                              output_type=OutputType.ERROR)
-            file_md5 = ""
-        # 只更新内存中的缓存
-        self.vector_cache[file_path] = {
-            "path": file_path,  # 保存文件路径
-            "md5": file_md5,    # 保存文件MD5
-            "description": description,  # 保存文件描述
-            "vector": vector    # 保存向量
-        }
     def get_embedding(self, text: str) -> np.ndarray:
-        """使用 transformers 模型获取文本的向量表示"""
-        # 对长文本进行截断
-        max_length = 512  # 或其他合适的长度
+        """Use the transformers model to get the vector representation of text"""
+        # Truncate long text
+        max_length = 512  # Or other suitable length
         text = ' '.join(text.split()[:max_length])
-        # 获取嵌入向量
+        # Get the embedding vector
         embedding = self.embedding_model.encode(text,
-                                                 normalize_embeddings=True,  # L2归一化
+                                                 normalize_embeddings=True,  # L2 normalization
                                                  show_progress_bar=False)
         vector = np.array(embedding, dtype=np.float32)
         return vector
     def vectorize_file(self, file_path: str, description: str) -> np.ndarray:
-        """将文件内容和描述向量化"""
+        """Vectorize the file content and description"""
         try:
-            # 先尝试从缓存获取
+            # Try to get the vector from the cache first
             cached_vector = self.get_cached_vector(file_path, description)
             if cached_vector is not None:
                 return cached_vector
-            # 读取文件内容并组合信息
-            content = open(file_path, "r", encoding="utf-8").read()[:self.max_context_length]  # 限制文件内容长度
+            # Read the file content and combine information
+            content = open(file_path, "r", encoding="utf-8").read()[:self.max_context_length]  # Limit the file content length
-            # 组合文件信息，包含文件内容
+            # Combine file information, including file content
             combined_text = f"""
-{file_path}
-{description}
-{content}
+File path: {file_path}
+Description: {description}
+Content: {content}
 """
             vector = self.get_embedding(combined_text)
-            # 保存到缓存
+            # Save to cache
             self.cache_vector(file_path, vector, description)
             return vector
         except Exception as e:
@@ -217,36 +253,34 @@ Code content:
             return np.zeros(self.vector_dim, dtype=np.float32) # type: ignore
     def clean_cache(self) -> bool:
-        """清理过期的缓存记录，返回是否有文件被删除"""
+        """Clean expired cache records"""
         try:
             files_to_delete = []
             for file_path in list(self.vector_cache.keys()):
-                if file_path not in self.git_file_list:
-                    del self.vector_cache[file_path]
+                if not os.path.exists(file_path):
                     files_to_delete.append(file_path)
-            if files_to_delete:
-                # 只在有文件被删除时保存缓存
-                self._save_cache()
-                PrettyOutput.print(f"清理了 {len(files_to_delete)} 个文件的缓存",
-                                output_type=OutputType.INFO)
-                return True
-            return False
+                    cache_path = self._get_cache_path(file_path)
+                    try:
+                        os.remove(cache_path)
+                    except Exception:
+                        pass
+            for file_path in files_to_delete:
+                del self.vector_cache[file_path]
+                if file_path in self.file_paths:
+                    self.file_paths.remove(file_path)
+            return bool(files_to_delete)
         except Exception as e:
-            PrettyOutput.print(f"清理缓存失败: {str(e)}",
-                            output_type=OutputType.ERROR)
-            # 发生异常时尝试保存当前状态
-            try:
-                self._save_cache()
-            except:
-                pass
+            PrettyOutput.print(f"Failed to clean cache: {str(e)}",
+                             output_type=OutputType.ERROR)
             return False
     def process_file(self, file_path: str):
-        """处理单个文件"""
+        """Process a single file"""
         try:
-            # 跳过不存在的文件
+            # Skip non-existent files
             if not os.path.exists(file_path):
                 return None
@@ -257,15 +291,15 @@ Code content:
             content = open(file_path, "r", encoding="utf-8").read()
-            # 检查文件是否已经处理过且内容未变
+            # Check if the file has already been processed and the content has not changed
             if file_path in self.vector_cache:
                 if self.vector_cache[file_path].get("md5") == md5:
                     return None
-            description = self.make_description(file_path, content)  # 传入截取后的内容
+            description = self.make_description(file_path, content)  # Pass the truncated content
             vector = self.vectorize_file(file_path, description)
-            # 保存到缓存，使用实际文件路径作为键
+            # Save to cache, using the actual file path as the key
             self.vector_cache[file_path] = {
                 "vector": vector,
                 "description": description,
@@ -275,58 +309,94 @@ Code content:
             return file_path
         except Exception as e:
-            PrettyOutput.print(f"处理文件失败 {file_path}: {str(e)}",
+            PrettyOutput.print(f"Failed to process file {file_path}: {str(e)}",
                              output_type=OutputType.ERROR)
             return None
     def build_index(self):
-        """从向量缓存构建 faiss 索引"""
-        # 创建底层 HNSW 索引
-        hnsw_index = faiss.IndexHNSWFlat(self.vector_dim, 16)
-        hnsw_index.hnsw.efConstruction = 40
-        hnsw_index.hnsw.efSearch = 16
-        # 用 IndexIDMap 包装 HNSW 索引
-        self.index = faiss.IndexIDMap(hnsw_index)
-        vectors = []
-        ids = []
-        self.file_paths = []  # 重置文件路径列表
-        for i, (file_path, data) in enumerate(self.vector_cache.items()):
-            vectors.append(data["vector"].reshape(1, -1))
-            ids.append(i)
-            self.file_paths.append(file_path)
-        if vectors:
-            vectors = np.vstack(vectors)
-            self.index.add_with_ids(vectors, np.array(ids)) # type: ignore
-        else:
+        """Build a faiss index from the vector cache"""
+        try:
+            if not self.vector_cache:
+                self.index = None
+                return
+            # Create the underlying HNSW index
+            hnsw_index = faiss.IndexHNSWFlat(self.vector_dim, 16)
+            hnsw_index.hnsw.efConstruction = 40
+            hnsw_index.hnsw.efSearch = 16
+            # Wrap the HNSW index with IndexIDMap
+            self.index = faiss.IndexIDMap(hnsw_index)
+            vectors = []
+            ids = []
+            self.file_paths = []  # Reset the file path list
+            for i, (file_path, data) in enumerate(self.vector_cache.items()):
+                if "vector" not in data:
+                    PrettyOutput.print(f"Invalid cache data for {file_path}: missing vector",
+                                     output_type=OutputType.WARNING)
+                    continue
+                vector = data["vector"]
+                if not isinstance(vector, np.ndarray):
+                    PrettyOutput.print(f"Invalid vector type for {file_path}: {type(vector)}",
+                                     output_type=OutputType.WARNING)
+                    continue
+                vectors.append(vector.reshape(1, -1))
+                ids.append(i)
+                self.file_paths.append(file_path)
+            if vectors:
+                vectors = np.vstack(vectors)
+                if len(vectors) != len(ids):
+                    PrettyOutput.print(f"Vector count mismatch: {len(vectors)} vectors vs {len(ids)} ids",
+                                     output_type=OutputType.ERROR)
+                    self.index = None
+                    return
+                try:
+                    self.index.add_with_ids(vectors, np.array(ids)) # type: ignore
+                    PrettyOutput.print(f"Successfully built index with {len(vectors)} vectors",
+                                     output_type=OutputType.SUCCESS)
+                except Exception as e:
+                    PrettyOutput.print(f"Failed to add vectors to index: {str(e)}",
+                                     output_type=OutputType.ERROR)
+                    self.index = None
+            else:
+                PrettyOutput.print("No valid vectors found, index not built",
+                                 output_type=OutputType.WARNING)
+                self.index = None
+        except Exception as e:
+            PrettyOutput.print(f"Failed to build index: {str(e)}",
+                             output_type=OutputType.ERROR)
             self.index = None
     def gen_vector_db_from_cache(self):
-        """从缓存生成向量数据库"""
+        """Generate a vector database from the cache"""
         self.build_index()
-        self._save_cache()
+        self._load_all_cache()
     def generate_codebase(self, force: bool = False):
-        """生成代码库索引
+        """Generate the codebase index
         Args:
-            force: 是否强制重建索引，不询问用户
+            force: Whether to force rebuild the index, without asking the user
         """
         try:
-            # 更新 git 文件列表
+            # Update the git file list
             self.git_file_list = self.get_git_file_list()
-            # 检查文件变化
-            PrettyOutput.print("\n检查文件变化...", output_type=OutputType.INFO)
+            # Check file changes
+            PrettyOutput.print("\nCheck file changes...", output_type=OutputType.INFO)
             changes_detected = False
             new_files = []
             modified_files = []
             deleted_files = []
-            # 检查删除的文件
+            # Check deleted files
             files_to_delete = []
             for file_path in list(self.vector_cache.keys()):
                 if file_path not in self.git_file_list:
@@ -334,8 +404,8 @@ Code content:
                     files_to_delete.append(file_path)
                     changes_detected = True
-            # 检查新增和修改的文件
-            with tqdm(total=len(self.git_file_list), desc="检查文件状态") as pbar:
+            # Check new and modified files
+            with tqdm(total=len(self.git_file_list), desc="Check file status") as pbar:
                 for file_path in self.git_file_list:
                     if not os.path.exists(file_path) or not self.is_text_file(file_path):
                         pbar.update(1)
@@ -351,60 +421,60 @@ Code content:
                             modified_files.append(file_path)
                             changes_detected = True
                     except Exception as e:
-                        PrettyOutput.print(f"检查文件失败 {file_path}: {str(e)}",
+                        PrettyOutput.print(f"Failed to check file {file_path}: {str(e)}",
                                          output_type=OutputType.ERROR)
                     pbar.update(1)
-            # 如果检测到变化，显示变化并询问用户
+            # If changes are detected, display changes and ask the user
             if changes_detected:
-                PrettyOutput.print("\n检测到以下变化:", output_type=OutputType.WARNING)
+                PrettyOutput.print("\nDetected the following changes:", output_type=OutputType.WARNING)
                 if new_files:
-                    PrettyOutput.print("\n新增文件:", output_type=OutputType.INFO)
+                    PrettyOutput.print("\nNew files:", output_type=OutputType.INFO)
                     for f in new_files:
                         PrettyOutput.print(f"  {f}", output_type=OutputType.INFO)
                 if modified_files:
-                    PrettyOutput.print("\n修改的文件:", output_type=OutputType.INFO)
+                    PrettyOutput.print("\nModified files:", output_type=OutputType.INFO)
                     for f in modified_files:
                         PrettyOutput.print(f"  {f}", output_type=OutputType.INFO)
                 if deleted_files:
-                    PrettyOutput.print("\n删除的文件:", output_type=OutputType.INFO)
+                    PrettyOutput.print("\nDeleted files:", output_type=OutputType.INFO)
                     for f in deleted_files:
                         PrettyOutput.print(f"  {f}", output_type=OutputType.INFO)
-                # 如果force为True，直接继续
+                # If force is True, continue directly
                 if not force:
-                    # 询问用户是否继续
+                    # Ask the user whether to continue
                     while True:
-                        response = input("\n是否重建索引？[y/N] ").lower().strip()
+                        response = get_single_line_input("\nRebuild the index? [y/N]").lower().strip()
                         if response in ['y', 'yes']:
                             break
                         elif response in ['', 'n', 'no']:
-                            PrettyOutput.print("取消重建索引", output_type=OutputType.INFO)
+                            PrettyOutput.print("Cancel rebuilding the index", output_type=OutputType.INFO)
                             return
                         else:
-                            PrettyOutput.print("请输入 y 或 n", output_type=OutputType.WARNING)
+                            PrettyOutput.print("Please input y or n", output_type=OutputType.WARNING)
-                # 清理已删除的文件
+                # Clean deleted files
                 for file_path in files_to_delete:
                     del self.vector_cache[file_path]
                 if files_to_delete:
-                    PrettyOutput.print(f"清理了 {len(files_to_delete)} 个文件的缓存",
+                    PrettyOutput.print(f"Cleaned the cache of {len(files_to_delete)} files",
                                      output_type=OutputType.INFO)
-                # 处理新文件和修改的文件
+                # Process new and modified files
                 files_to_process = new_files + modified_files
                 processed_files = []
-                with tqdm(total=len(files_to_process), desc="处理文件") as pbar:
-                    # 使用线程池处理文件
+                with tqdm(total=len(files_to_process), desc="Processing files") as pbar:
+                    # Use a thread pool to process files
                     with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
-                        # 提交所有任务
+                        # Submit all tasks
                         future_to_file = {
                             executor.submit(self.process_file, file): file
                             for file in files_to_process
                         }
-                        # 处理完成的任务
+                        # Process completed tasks
                         for future in concurrent.futures.as_completed(future_to_file):
                             file = future_to_file[future]
                             try:
@@ -412,37 +482,37 @@ Code content:
                                 if result:
                                     processed_files.append(result)
                             except Exception as e:
-                                PrettyOutput.print(f"处理文件失败 {file}: {str(e)}",
+                                PrettyOutput.print(f"Failed to process file {file}: {str(e)}",
                                                 output_type=OutputType.ERROR)
                             pbar.update(1)
                 if processed_files:
-                    PrettyOutput.print("\n重新生成向量数据库...", output_type=OutputType.INFO)
+                    PrettyOutput.print("\nRebuilding the vector database...", output_type=OutputType.INFO)
                     self.gen_vector_db_from_cache()
-                    PrettyOutput.print(f"成功为 {len(processed_files)} 个文件生成索引",
+                    PrettyOutput.print(f"Successfully generated the index for {len(processed_files)} files",
                                     output_type=OutputType.SUCCESS)
             else:
-                PrettyOutput.print("没有检测到文件变更，无需重建索引", output_type=OutputType.INFO)
+                PrettyOutput.print("No file changes detected, no need to rebuild the index", output_type=OutputType.INFO)
         except Exception as e:
-            # 发生异常时尝试保存缓存
+            # Try to save the cache when an exception occurs
             try:
-                self._save_cache()
+                self._load_all_cache()
             except Exception as save_error:
-                PrettyOutput.print(f"保存缓存失败: {str(save_error)}",
+                PrettyOutput.print(f"Failed to save cache: {str(save_error)}",
                                 output_type=OutputType.ERROR)
-            raise e  # 重新抛出原始异常
+            raise e  # Re-raise the original exception
     def _text_search_score(self, content: str, keywords: List[str]) -> float:
-        """计算文本内容与关键词的匹配分数
+        """Calculate the matching score between the text content and the keywords
         Args:
-            content: 文本内容
-            keywords: 关键词列表
+            content: Text content
+            keywords: List of keywords
         Returns:
-            float: 匹配分数 (0-1)
+            float: Matching score (0-1)
         """
         if not keywords:
             return 0.0
@@ -455,89 +525,128 @@ Code content:
             if keyword in content:
                 matched_keywords.add(keyword)
-        # 计算匹配分数
+        # Calculate the matching score
         score = len(matched_keywords) / len(keywords)
         return score
-    def rerank_results(self, query: str, initial_results: List[Tuple[str, float, str]]) -> List[Tuple[str, float]]:
-        """使用多种策略对搜索结果重新排序"""
+    def pick_results(self, query: str, initial_results: List[str]) -> List[str]:
+        """Use a large model to pick the search results
+        Args:
+            query: Search query
+            initial_results: Initial results list of file paths
+        Returns:
+            List[str]: The picked results list, each item is a file path
+        """
         if not initial_results:
             return []
         try:
-            import torch
+            PrettyOutput.print(f"Picking results for query: {query}", output_type=OutputType.INFO)
-            # 加载模型和分词器
-            model, tokenizer = load_rerank_model()
+            # Maximum content length per batch
+            max_batch_length = self.max_context_length - 1000  # Reserve space for prompt
+            max_file_length = max_batch_length // 3  # Limit individual file size
-            # 准备数据
-            pairs = []
+            # Process files in batches
+            all_selected_files = set()
+            current_batch = []
+            current_length = 0
-            for path, _, desc in initial_results:
+            for path in initial_results:
                 try:
-                    content = open(path, "r", encoding="utf-8").read()[:512]  # 限制内容长度
+                    content = open(path, "r", encoding="utf-8").read()
+                    # Truncate large files
+                    if len(content) > max_file_length:
+                        PrettyOutput.print(f"Truncating large file: {path}", OutputType.WARNING)
+                        content = content[:max_file_length] + "\n... (content truncated)"
-                    # 组合文件信息
-                    doc_content = f"File path: {path}\nDescription: {desc}\nContent: {content}"
-                    pairs.append([query, doc_content])
-                except Exception as e:
-                    PrettyOutput.print(f"读取文件失败 {path}: {str(e)}",
-                                    output_type=OutputType.ERROR)
-                    doc_content = f"File path: {path}\nDescription: {desc}"
-                    pairs.append([query, doc_content])
-            # 使用更大的batch size提高处理速度
-            batch_size = 16  # 根据GPU显存调整
-            batch_scores = []
-            with torch.no_grad():
-                for i in range(0, len(pairs), batch_size):
-                    batch_pairs = pairs[i:i + batch_size]
-                    encoded = tokenizer(
-                        batch_pairs,
-                        padding=True,
-                        truncation=True,
-                        max_length=512,
-                        return_tensors='pt'
-                    )
+                    file_info = f"File: {path}\nContent: {content}\n\n"
+                    file_length = len(file_info)
-                    if torch.cuda.is_available():
-                        encoded = {k: v.cuda() for k, v in encoded.items()}
+                    # If adding this file would exceed batch limit
+                    if current_length + file_length > max_batch_length:
+                        # Process current batch
+                        if current_batch:
+                            selected = self._process_batch(query, current_batch)
+                            all_selected_files.update(selected)
+                        # Start new batch
+                        current_batch = [file_info]
+                        current_length = file_length
+                    else:
+                        current_batch.append(file_info)
+                        current_length += file_length
-                    outputs = model(**encoded)
-                    batch_scores.extend(outputs.logits.squeeze(-1).cpu().numpy())
-            # 归一化分数到 0-1 范围
-            if batch_scores:
-                min_score = min(batch_scores)
-                max_score = max(batch_scores)
-                if max_score > min_score:
-                    batch_scores = [(s - min_score) / (max_score - min_score) for s in batch_scores]
-            # 将重排序分数与原始分数结合
-            scored_results = []
-            for (path,_, desc), rerank_score in zip(initial_results, batch_scores):
-                if rerank_score >= 0.5:  # 只保留相关度较高的结果
-                    scored_results.append((path, rerank_score))
-            # 按综合分数降序排序
-            scored_results.sort(key=lambda x: x[1], reverse=True)
+                except Exception as e:
+                    PrettyOutput.print(f"Failed to read file {path}: {str(e)}", OutputType.ERROR)
+                    continue
-            return scored_results
+            # Process final batch
+            if current_batch:
+                selected = self._process_batch(query, current_batch)
+                all_selected_files.update(selected)
+            # Convert set to list and maintain original order
+            final_results = [path for path in initial_results if path in all_selected_files]
+            return final_results
         except Exception as e:
-            PrettyOutput.print(f"重排序失败: {str(e)}",
-                            output_type=OutputType.ERROR)
-            return [(path, score) for path, score, _ in initial_results]  # 发生错误时返回原始结果
+            PrettyOutput.print(f"Failed to pick: {str(e)}", OutputType.ERROR)
+            return initial_results
+    def _process_batch(self, query: str, files_info: List[str]) -> List[str]:
+        """Process a batch of files
+        Args:
+            query: Search query
+            files_info: List of file information strings
+        Returns:
+            List[str]: Selected file paths from this batch
+        """
+        prompt = f"""Please analyze the following code files and determine which files are most relevant to the given query. Consider file paths and code content to make your judgment.
+Query: {query}
+Available files:
+{''.join(files_info)}
+Please output a YAML list of relevant file paths, ordered by relevance (most relevant first). Only include files that are truly relevant to the query.
+Output format:
+<FILES>
+- path/to/file1.py
+- path/to/file2.py
+</FILES>
+Note: Only include files that have a strong connection to the query."""
+        # Use a large model to evaluate
+        model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
+        response = model.chat_until_success(prompt)
+        # Parse the response
+        import yaml
+        files_match = re.search(r'<FILES>\n(.*?)</FILES>', response, re.DOTALL)
+        if not files_match:
+            return []
+        # Extract the file list
+        try:
+            selected_files = yaml.safe_load(files_match.group(1))
+            return selected_files if selected_files else []
+        except Exception as e:
+            PrettyOutput.print(f"Failed to parse response: {str(e)}", OutputType.ERROR)
+            return []
     def _generate_query_variants(self, query: str) -> List[str]:
-        """生成查询的不同表述变体
+        """Generate different expressions of the query
         Args:
-            query: 原始查询
+            query: Original query
         Returns:
-            List[str]: 查询变体列表
+            List[str]: The query variants list
         """
         model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
         prompt = f"""Please generate 3 different expressions based on the following query, each expression should fully convey the meaning of the original query. These expressions will be used for code search, maintain professionalism and accuracy.
@@ -546,18 +655,18 @@ Original query: {query}
 Please output 3 expressions directly, separated by two line breaks, without numbering or other markers.
 """
         variants = model.chat_until_success(prompt).strip().split('\n\n')
-        variants.append(query)  # 添加原始查询
+        variants.append(query)  # Add the original query
         return variants
     def _vector_search(self, query_variants: List[str], top_k: int) -> Dict[str, Tuple[str, float, str]]:
-        """使用向量搜索查找相关文件
+        """Use vector search to find related files
         Args:
-            query_variants: 查询变体列表
-            top_k: 返回结果数量
+            query_variants: The query variants list
+            top_k: The number of results to return
         Returns:
-            Dict[str, Tuple[str, float, str]]: 文件路径到(路径,分数,描述)的映射
+            Dict[str, Tuple[str, float, str]]: The mapping from file path to (file path, score, description)
         """
         results = {}
         for query in query_variants:
@@ -571,75 +680,78 @@ Please output 3 expressions directly, separated by two line breaks, without numb
                     continue
                 similarity = 1.0 / (1.0 + float(distance))
-                if similarity >= 0.5:
-                    file_path = self.file_paths[i]
-                    # 使用最高的相似度分数
-                    if file_path not in results or similarity > results[file_path][1]:
+                file_path = self.file_paths[i]
+                # Use the highest similarity score
+                if file_path not in results:
+                    if similarity > 0.5:
                         data = self.vector_cache[file_path]
                         results[file_path] = (file_path, similarity, data["description"])
         return results
-    def search_similar(self, query: str, top_k: int = 30) -> List[Tuple[str, float]]:
-        """搜索关联文件"""
+    def search_similar(self, query: str, top_k: int = 30) -> List[str]:
+        """Search related files"""
         try:
             if self.index is None:
                 return []
-            # 生成查询变体
+            # Generate the query variants
             query_variants = self._generate_query_variants(query)
-            # 进行向量搜索
+            # Perform vector search
             vector_results = self._vector_search(query_variants, top_k)
             results = list(vector_results.values())
             results.sort(key=lambda x: x[1], reverse=True)
-            # 取前 top_k 个结果进行重排序
+            # Take the top top_k results for reordering
             initial_results = results[:top_k]
-            # 如果没有找到结果，直接返回
+            # If no results are found, return directly
             if not initial_results:
                 return []
-            # 过滤低分结果
+            # Filter low-scoring results
             initial_results = [(path, score, desc) for path, score, desc in initial_results if score >= 0.5]
+            for path, score, desc in initial_results:
+                PrettyOutput.print(f"File: {path} Similarity: {score:.3f}", output_type=OutputType.INFO)
-            # 对初步结果进行重排序
-            return self.rerank_results(query, initial_results)
+            # Reorder the preliminary results
+            return self.pick_results(query, [path for path, _, _ in initial_results])
         except Exception as e:
-            PrettyOutput.print(f"搜索失败: {str(e)}", output_type=OutputType.ERROR)
+            PrettyOutput.print(f"Failed to search: {str(e)}", output_type=OutputType.ERROR)
             return []
     def ask_codebase(self, query: str, top_k: int=20) -> str:
-        """查询代码库"""
+        """Query the codebase"""
         results = self.search_similar(query, top_k)
         if not results:
-            PrettyOutput.print("没有找到关联的文件", output_type=OutputType.WARNING)
+            PrettyOutput.print("No related files found", output_type=OutputType.WARNING)
             return ""
-        PrettyOutput.print(f"找到的关联文件: ", output_type=OutputType.SUCCESS)
-        for path, score in results:
-            PrettyOutput.print(f"文件: {path} 关联度: {score:.3f}",
+        PrettyOutput.print(f"Found related files: ", output_type=OutputType.SUCCESS)
+        for path in results:
+            PrettyOutput.print(f"File: {path}",
                              output_type=OutputType.INFO)
-        prompt = f"""你是一个代码专家，请根据以下文件信息回答用户的问题：
+        prompt = f"""You are a code expert, please answer the user's question based on the following file information:
 """
-        for path, _ in results:
+        for path in results:
             try:
                 if len(prompt) > self.max_context_length:
-                    PrettyOutput.print(f"避免上下文超限，丢弃低相关度文件：{path}", OutputType.WARNING)
+                    PrettyOutput.print(f"Avoid context overflow, discard low-related file: {path}", OutputType.WARNING)
                     continue
                 content = open(path, "r", encoding="utf-8").read()
                 prompt += f"""
-File path: {path}prompt
+File path: {path}
 File content:
 {content}
 ========================================
 """
             except Exception as e:
-                PrettyOutput.print(f"读取文件失败 {path}: {str(e)}",
+                PrettyOutput.print(f"Failed to read file {path}: {str(e)}",
                                  output_type=OutputType.ERROR)
                 continue
@@ -653,29 +765,46 @@ Please answer the user's question in Chinese using professional language. If the
         return response
     def is_index_generated(self) -> bool:
-        """检查索引是否已经生成"""
-        # 检查缓存文件是否存在
-        if not os.path.exists(self.cache_path):
-            return False
-        # 检查缓存是否有效
+        """Check if the index has been generated"""
         try:
-            with lzma.open(self.cache_path, 'rb') as f:
-                cache_data = pickle.load(f)
-                if not cache_data.get("vectors") or not cache_data.get("file_paths"):
+            # 1. 检查基本条件
+            if not self.vector_cache or not self.file_paths:
+                return False
+            if not hasattr(self, 'index') or self.index is None:
+                return False
+            # 2. 检查索引是否可用
+            # 创建测试向量
+            test_vector = np.zeros((1, self.vector_dim), dtype=np.float32) # type: ignore
+            try:
+                self.index.search(test_vector, 1) # type: ignore
+            except Exception:
+                return False
+            # 3. 验证向量缓存和文件路径的一致性
+            if len(self.vector_cache) != len(self.file_paths):
+                return False
+            # 4. 验证所有缓存文件
+            for file_path in self.file_paths:
+                if file_path not in self.vector_cache:
                     return False
-        except Exception:
-            return False
-        # 检查索引是否已构建
-        if not hasattr(self, 'index') or self.index is None:
-            return False
-        # 检查向量缓存和文件路径列表是否非空
-        if not self.vector_cache or not self.file_paths:
+                cache_path = self._get_cache_path(file_path)
+                if not os.path.exists(cache_path):
+                    return False
+                cache_data = self.vector_cache[file_path]
+                if not isinstance(cache_data.get("vector"), np.ndarray):
+                    return False
+            return True
+        except Exception as e:
+            PrettyOutput.print(f"Error checking index status: {str(e)}",
+                             output_type=OutputType.ERROR)
             return False
-        return True
@@ -729,10 +858,9 @@ def main():
             return
         PrettyOutput.print("\nSearch Results:", output_type=OutputType.INFO)
-        for path, score in results:
+        for path in results:
             PrettyOutput.print("\n" + "="*50, output_type=OutputType.INFO)
             PrettyOutput.print(f"File: {path}", output_type=OutputType.INFO)
-            PrettyOutput.print(f"Similarity: {score:.3f}", output_type=OutputType.INFO)
     elif args.command == 'ask':
         response = codebase.ask_codebase(args.question, args.top_k)

jarvis-ai-assistant 0.1.97__py3-none-any.whl → 0.1.99__py3-none-any.whl

Potentially problematic release.

jarvis-ai-assistant 0.1.97__py3-none-any.whl → 0.1.99__py3-none-any.whl