jarvis-ai-assistant 0.1.108__py3-none-any.whl → 0.1.110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of jarvis-ai-assistant might be problematic.

@@ -7,7 +7,7 @@ from typing import List, Tuple, Optional, Dict
  from jarvis.jarvis_platform.registry import PlatformRegistry
  import concurrent.futures
  from concurrent.futures import ThreadPoolExecutor
- from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_file_md5, get_max_context_length, get_thread_count, load_embedding_model, user_confirm
+ from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_context_token_count, get_embedding, get_file_md5, get_max_token_count, get_thread_count, load_embedding_model, user_confirm
  from jarvis.utils import init_env
  import argparse
  import pickle
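The swapped imports replace the character-based get_max_context_length with token-aware helpers (get_context_token_count, get_max_token_count) and a shared get_embedding utility. Their implementations live in jarvis.utils and are not part of this diff; a minimal sketch of the two token helpers, assuming a tiktoken-style tokenizer and an illustrative environment variable (both assumptions, not confirmed by the diff):

# Hypothetical sketch only -- the real helpers live in jarvis.utils and are not shown here.
import os
import tiktoken

_encoding = tiktoken.get_encoding("cl100k_base")  # assumed tokenizer choice

def get_context_token_count(text: str) -> int:
    """Count tokens in text instead of characters."""
    return len(_encoding.encode(text))

def get_max_token_count() -> int:
    """Upper bound on tokens allowed in one model context."""
    # The environment variable name is illustrative, not taken from the package.
    return int(os.environ.get("JARVIS_MAX_TOKEN_COUNT", "16384"))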
@@ -21,7 +21,7 @@ class CodeBase:
  self.root_dir = root_dir
  os.chdir(self.root_dir)
  self.thread_count = get_thread_count()
- self.max_context_length = get_max_context_length()
+ self.max_token_count = get_max_token_count()
  self.index = None

  # Initialize the data directory
@@ -209,19 +209,6 @@ Code content:

  return cached_data["vector"]

- def get_embedding(self, text: str) -> np.ndarray:
- """Use the transformers model to get the vector representation of text"""
- # Truncate long text
- max_length = 512 # Or other suitable length
- text = ' '.join(text.split()[:max_length])
-
- # Get the embedding vector
- embedding = self.embedding_model.encode(text,
- normalize_embeddings=True, # L2 normalization
- show_progress_bar=False)
- vector = np.array(embedding, dtype=np.float32)
- return vector
-
  def vectorize_file(self, file_path: str, description: str) -> np.ndarray:
  """Vectorize the file content and description"""
  try:
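The get_embedding method removed above now exists as a shared helper in jarvis.utils, taking the model explicitly (the call sites below use get_embedding(self.embedding_model, ...)). Its exact implementation is not shown in this diff; reconstructed from the removed method, it would look roughly like:

import numpy as np

def get_embedding(embedding_model, text: str) -> np.ndarray:
    """Sketch of the module-level helper, based on the method removed above."""
    # Truncate long text to 512 whitespace-separated tokens, as in the removed method
    max_length = 512
    text = ' '.join(text.split()[:max_length])

    # L2-normalized sentence embedding, no progress bar
    embedding = embedding_model.encode(text,
                                       normalize_embeddings=True,
                                       show_progress_bar=False)
    return np.array(embedding, dtype=np.float32)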
@@ -231,7 +218,7 @@ Code content:
  return cached_vector

  # Read the file content and combine information
- content = open(file_path, "r", encoding="utf-8").read()[:self.max_context_length] # Limit the file content length
+ content = open(file_path, "r", encoding="utf-8").read()[:self.max_token_count] # Limit the file content length

  # Combine file information, including file content
  combined_text = f"""
@@ -239,7 +226,7 @@ File path: {file_path}
  Description: {description}
  Content: {content}
  """
- vector = self.get_embedding(combined_text)
+ vector = get_embedding(self.embedding_model, combined_text)

  # Save to cache
  self.cache_vector(file_path, vector, description)
@@ -537,7 +524,7 @@ Content: {content}
  score = len(matched_keywords) / len(keywords)
  return score

- def pick_results(self, query: str, initial_results: List[str]) -> List[str]:
+ def pick_results(self, query: List[str], initial_results: List[str]) -> List[str]:
  """Use a large model to pick the search results

  Args:
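Because pick_results now expects the full list of query variants rather than a single string, callers (see the search path further down) pass the variant list directly. An illustrative call, with hypothetical variable names:

# Hypothetical usage -- names are illustrative only.
query_variants = ["how is user login handled", "authentication flow implementation"]
selected_paths = codebase.pick_results(query_variants, candidate_paths)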
@@ -551,40 +538,40 @@ Content: {content}
  return []

  try:
- PrettyOutput.print(f"Picking results for query: {query}", output_type=OutputType.INFO)
+ PrettyOutput.print(f"Picking results for query: \n" + "\n".join(query), output_type=OutputType.INFO)

  # Maximum content length per batch
- max_batch_length = self.max_context_length - 1000 # Reserve space for prompt
+ max_batch_length = self.max_token_count - 1000 # Reserve space for prompt
  max_file_length = max_batch_length // 3 # Limit individual file size

  # Process files in batches
  all_selected_files = set()
  current_batch = []
- current_length = 0
+ current_token_count = 0

  for path in initial_results:
  try:
  content = open(path, "r", encoding="utf-8").read()
  # Truncate large files
- if len(content) > max_file_length:
+ if get_context_token_count(content) > max_file_length:
  PrettyOutput.print(f"Truncating large file: {path}", OutputType.WARNING)
  content = content[:max_file_length] + "\n... (content truncated)"

  file_info = f"File: {path}\nContent: {content}\n\n"
- file_length = len(file_info)
+ tokens_count = get_context_token_count(file_info)

  # If adding this file would exceed batch limit
- if current_length + file_length > max_batch_length:
+ if current_token_count + tokens_count > max_batch_length:
  # Process current batch
  if current_batch:
- selected = self._process_batch(query, current_batch)
+ selected = self._process_batch('\n'.join(query), current_batch)
  all_selected_files.update(selected)
  # Start new batch
  current_batch = [file_info]
- current_length = file_length
+ current_token_count = tokens_count
  else:
  current_batch.append(file_info)
- current_length += file_length
+ current_token_count += tokens_count

  except Exception as e:
  PrettyOutput.print(f"Failed to read file {path}: {str(e)}", OutputType.ERROR)
@@ -592,7 +579,7 @@ Content: {content}

  # Process final batch
  if current_batch:
- selected = self._process_batch(query, current_batch)
+ selected = self._process_batch('\n'.join(query), current_batch)
  all_selected_files.update(selected)

  # Convert set to list and maintain original order
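The batching above now accumulates file blurbs by token count rather than character count: blurbs are added until the per-prompt token budget would be exceeded, the batch is handed to the model, and a new batch is started, with the final partial batch flushed at the end. The same pattern as a standalone sketch (the helper names follow the diff; the function itself is illustrative):

from typing import Callable, List

def batch_by_token_budget(file_infos: List[str],
                          max_batch_tokens: int,
                          count_tokens: Callable[[str], int]) -> List[List[str]]:
    """Group file blurbs into batches that each fit a token budget."""
    batches: List[List[str]] = []
    current_batch: List[str] = []
    current_tokens = 0
    for info in file_infos:
        tokens = count_tokens(info)
        if current_batch and current_tokens + tokens > max_batch_tokens:
            batches.append(current_batch)      # flush the full batch
            current_batch, current_tokens = [], 0
        current_batch.append(info)
        current_tokens += tokens
    if current_batch:
        batches.append(current_batch)          # final partial batch
    return batches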
@@ -604,33 +591,41 @@ Content: {content}
  return initial_results

  def _process_batch(self, query: str, files_info: List[str]) -> List[str]:
- """Process a batch of files
-
- Args:
- query: Search query
- files_info: List of file information strings
-
- Returns:
- List[str]: Selected file paths from this batch
- """
- prompt = f"""Please analyze the following code files and determine which files are most relevant to the given query. Consider file paths and code content to make your judgment.
+ """Process a batch of files"""
+ prompt = f"""As a code analysis expert, please help identify the most relevant files for the given query using chain-of-thought reasoning.

  Query: {query}

  Available files:
  {''.join(files_info)}

- Please output a YAML list of relevant file paths, ordered by relevance (most relevant first). Only include files that are truly relevant to the query.
- Output format:
+ Think through this step by step:
+ 1. First, analyze the query to identify key requirements and technical concepts
+ 2. For each file:
+ - Examine its path and content
+ - Assess how it relates to the query's requirements
+ - Consider both direct and indirect relationships
+ - Rate its relevance (high/medium/low)
+ 3. Select only files with clear relevance to the query
+ 4. Order files by relevance, with most relevant first
+
+ Please output your selection in YAML format:
  <FILES>
- - path/to/file1.py
- - path/to/file2.py
+ - path/to/most/relevant.py
+ - path/to/next/relevant.py
  </FILES>

- Note: Only include files that have a strong connection to the query."""
+ Important:
+ - Only include files that are truly relevant
+ - Exclude files with weak or unclear connections
+ - Focus on implementation rather than test files
+ - Consider both file paths and content
+ - Only output the file paths, no other text
+ """

  # Use a large model to evaluate
  model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
+ model.set_suppress_output(True)
  response = model.chat_until_success(prompt)

  # Parse the response
@@ -639,7 +634,6 @@ Note: Only include files that have a strong connection to the query."""
  if not files_match:
  return []

- # Extract the file list
  try:
  selected_files = yaml.safe_load(files_match.group(1))
  return selected_files if selected_files else []
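The parsing above extracts the YAML list between <FILES> tags (files_match) and passes it to yaml.safe_load. The extraction regex itself is outside this diff; a plausible version, for illustration:

import re
from typing import List

import yaml

def parse_selected_files(response: str) -> List[str]:
    """Pull the YAML list out of the <FILES>...</FILES> block, if present."""
    # The exact regex used by the package is not shown in the diff; this one is illustrative.
    files_match = re.search(r"<FILES>(.*?)</FILES>", response, re.DOTALL)
    if not files_match:
        return []
    selected_files = yaml.safe_load(files_match.group(1))
    return selected_files if selected_files else []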
@@ -657,7 +651,8 @@ Note: Only include files that have a strong connection to the query."""
  List[str]: The query variants list
  """
  model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
- prompt = f"""Please generate 3 different expressions optimized for vector search based on the following query. Each expression should:
+ model.set_suppress_output(True)
+ prompt = f"""Please generate 10 different expressions optimized for vector search based on the following query. Each expression should:

  1. Focus on key technical concepts and terminology
  2. Use clear and specific language
@@ -666,7 +661,8 @@ Note: Only include files that have a strong connection to the query."""
  5. Maintain semantic similarity with original query
  6. Be suitable for embedding-based search

- Original query: {query}
+ Original query:
+ {query}

  Example transformations:
  Query: "How to handle user login?"
@@ -708,7 +704,7 @@ Please provide 10 search-optimized expressions in the specified format.
  """
  results = {}
  for query in query_variants:
- query_vector = self.get_embedding(query)
+ query_vector = get_embedding(self.embedding_model, query)
  query_vector = query_vector.reshape(1, -1)

  distances, indices = self.index.search(query_vector, top_k) # type: ignore
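Each query variant is embedded, reshaped to a (1, dim) matrix, and searched against the FAISS index. Assuming the index is an inner-product index over the L2-normalized embeddings (the index construction is not part of this diff), the returned distances behave like cosine similarities. A minimal sketch under that assumption:

import faiss
import numpy as np

dim = 384                                   # illustrative embedding dimension
index = faiss.IndexFlatIP(dim)              # inner product over normalized vectors ~ cosine

vectors = np.random.rand(100, dim).astype(np.float32)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)    # normalize like the file embeddings
index.add(vectors)

query_vector = np.random.rand(dim).astype(np.float32)
query_vector /= np.linalg.norm(query_vector)
distances, indices = index.search(query_vector.reshape(1, -1), 5)   # top-5 most similar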
@@ -744,7 +740,7 @@ Please provide 10 search-optimized expressions in the specified format.

  for variant in query_variants:
  # Get vector for each variant
- query_vector = self.get_embedding(variant)
+ query_vector = get_embedding(self.embedding_model, variant)
  query_vector = query_vector.reshape(1, -1)

  # Search with current variant
@@ -767,14 +763,16 @@ Please provide 10 search-optimized expressions in the specified format.
  # Sort by similarity and take top_k
  all_results.sort(key=lambda x: x[1], reverse=True)
  results = all_results[:top_k]
-
+
  # Display results with scores
  message = "Found related files:\n"
  for path, score, _ in results:
  message += f"File: {path} (Score: {score:.3f})\n"
  PrettyOutput.print(message.rstrip(), output_type=OutputType.INFO, lang="markdown")
+
+ results = self.pick_results(query_variants, [path for path, _, _ in results])

- return [path for path, _, _ in results]
+ return results

  except Exception as e:
  PrettyOutput.print(f"Failed to search: {str(e)}", output_type=OutputType.ERROR)
@@ -784,15 +782,12 @@ Please provide 10 search-optimized expressions in the specified format.
  """Query the codebase with enhanced context building"""
  files_from_codebase = self.search_similar(query, top_k)

- from jarvis.jarvis_code_agent.relevant_files import find_relevant_files_from_agent
- files_from_agent = find_relevant_files_from_agent(query, files_from_codebase)
-
- if not files_from_agent:
+ if not files_from_codebase:
  PrettyOutput.print("No related files found", output_type=OutputType.WARNING)
  return ""

  output = "Found related files:\n"
- for path in files_from_agent:
+ for path in files_from_codebase:
  output += f"- {path}\n"
  PrettyOutput.print(output, output_type=OutputType.INFO, lang="markdown")

@@ -810,10 +805,10 @@ Question: {query}
  Relevant code files (ordered by relevance):
  """
  # Add context with length control
- available_length = self.max_context_length - len(prompt) - 1000 # Reserve space for answer
- current_length = 0
+ available_count = self.max_token_count - get_context_token_count(prompt) - 1000 # Reserve space for answer
+ current_count = 0

- for path in files_from_agent:
+ for path in files_from_codebase:
  try:
  content = open(path, "r", encoding="utf-8").read()
  file_content = f"""
@@ -822,7 +817,7 @@ Content:
  {content}
  ----------------------------------------
  """
- if current_length + len(file_content) > available_length:
+ if current_count + get_context_token_count(file_content) > available_count:
  PrettyOutput.print(
  "Due to context length limit, some files were omitted",
  output_type=OutputType.WARNING
@@ -830,7 +825,7 @@ Content:
  break

  prompt += file_content
- current_length += len(file_content)
+ current_count += get_context_token_count(file_content)

  except Exception as e:
  PrettyOutput.print(f"Failed to read file {path}: {str(e)}",
@@ -2,7 +2,7 @@ import mimetypes
  import os
  from typing import Dict, List, Tuple
  from jarvis.jarvis_platform.base import BasePlatform
- from jarvis.utils import PrettyOutput, OutputType, get_max_context_length
+ from jarvis.utils import PrettyOutput, OutputType
  import requests
  import json