jarvis-ai-assistant 0.1.108-py3-none-any.whl → 0.1.110-py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of jarvis-ai-assistant might be problematic.
- jarvis/__init__.py +1 -1
- jarvis/agent.py +5 -5
- jarvis/jarvis_code_agent/code_agent.py +69 -217
- jarvis/jarvis_code_agent/file_select.py +11 -10
- jarvis/jarvis_code_agent/patch.py +19 -9
- jarvis/jarvis_code_agent/relevant_files.py +1 -162
- jarvis/jarvis_codebase/main.py +55 -60
- jarvis/jarvis_platform/oyi.py +1 -1
- jarvis/jarvis_rag/main.py +194 -268
- jarvis/jarvis_tools/registry.py +10 -9
- jarvis/utils.py +155 -16
- {jarvis_ai_assistant-0.1.108.dist-info → jarvis_ai_assistant-0.1.110.dist-info}/METADATA +12 -3
- {jarvis_ai_assistant-0.1.108.dist-info → jarvis_ai_assistant-0.1.110.dist-info}/RECORD +17 -17
- {jarvis_ai_assistant-0.1.108.dist-info → jarvis_ai_assistant-0.1.110.dist-info}/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.108.dist-info → jarvis_ai_assistant-0.1.110.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.108.dist-info → jarvis_ai_assistant-0.1.110.dist-info}/entry_points.txt +0 -0
- {jarvis_ai_assistant-0.1.108.dist-info → jarvis_ai_assistant-0.1.110.dist-info}/top_level.txt +0 -0
jarvis/jarvis_codebase/main.py
CHANGED
@@ -7,7 +7,7 @@ from typing import List, Tuple, Optional, Dict
 from jarvis.jarvis_platform.registry import PlatformRegistry
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
-from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_file_md5,
+from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_context_token_count, get_embedding, get_file_md5, get_max_token_count, get_thread_count, load_embedding_model, user_confirm
 from jarvis.utils import init_env
 import argparse
 import pickle
@@ -21,7 +21,7 @@ class CodeBase:
         self.root_dir = root_dir
         os.chdir(self.root_dir)
         self.thread_count = get_thread_count()
-        self.
+        self.max_token_count = get_max_token_count()
         self.index = None

         # Initialize the data directory
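The new max_token_count comes from get_max_token_count, a jarvis.utils helper whose body is not part of this diff. A hypothetical stand-in (the environment variable name and default below are assumptions, not confirmed):

import os

def get_max_token_count() -> int:
    # Hypothetical stand-in; the real helper presumably reads configuration.
    # Variable name and default are illustrative only.
    return int(os.environ.get("JARVIS_MAX_TOKEN_COUNT", "16384"))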
@@ -209,19 +209,6 @@ Code content:

             return cached_data["vector"]

-    def get_embedding(self, text: str) -> np.ndarray:
-        """Use the transformers model to get the vector representation of text"""
-        # Truncate long text
-        max_length = 512  # Or other suitable length
-        text = ' '.join(text.split()[:max_length])
-
-        # Get the embedding vector
-        embedding = self.embedding_model.encode(text,
-                                                normalize_embeddings=True,  # L2 normalization
-                                                show_progress_bar=False)
-        vector = np.array(embedding, dtype=np.float32)
-        return vector
-
     def vectorize_file(self, file_path: str, description: str) -> np.ndarray:
         """Vectorize the file content and description"""
         try:
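The removed get_embedding method corresponds to the module-level helper now imported from jarvis.utils and called as get_embedding(self.embedding_model, ...). Its body in jarvis/utils.py is not shown in this diff; a sketch reconstructed from the removed method and the new call sites would be:

import numpy as np

def get_embedding(embedding_model, text: str) -> np.ndarray:
    """Get the normalized vector for text (sketch of the relocated helper)."""
    # Truncate long text, as the removed method did
    max_length = 512
    text = ' '.join(text.split()[:max_length])
    embedding = embedding_model.encode(text,
                                       normalize_embeddings=True,  # L2 normalization
                                       show_progress_bar=False)
    return np.array(embedding, dtype=np.float32)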
@@ -231,7 +218,7 @@ Code content:
                 return cached_vector

             # Read the file content and combine information
-            content = open(file_path, "r", encoding="utf-8").read()[:self.
+            content = open(file_path, "r", encoding="utf-8").read()[:self.max_token_count]  # Limit the file content length

             # Combine file information, including file content
             combined_text = f"""
@@ -239,7 +226,7 @@ File path: {file_path}
 Description: {description}
 Content: {content}
 """
-            vector = self.
+            vector = get_embedding(self.embedding_model, combined_text)

             # Save to cache
             self.cache_vector(file_path, vector, description)
@@ -537,7 +524,7 @@ Content: {content}
         score = len(matched_keywords) / len(keywords)
         return score

-    def pick_results(self, query: str, initial_results: List[str]) -> List[str]:
+    def pick_results(self, query: List[str], initial_results: List[str]) -> List[str]:
         """Use a large model to pick the search results

         Args:
@@ -551,40 +538,40 @@ Content: {content}
             return []

         try:
-            PrettyOutput.print(f"Picking results for query:
+            PrettyOutput.print(f"Picking results for query: \n" + "\n".join(query), output_type=OutputType.INFO)

             # Maximum content length per batch
-            max_batch_length = self.
+            max_batch_length = self.max_token_count - 1000  # Reserve space for prompt
             max_file_length = max_batch_length // 3  # Limit individual file size

             # Process files in batches
             all_selected_files = set()
             current_batch = []
-
+            current_token_count = 0

             for path in initial_results:
                 try:
                     content = open(path, "r", encoding="utf-8").read()
                     # Truncate large files
-                    if
+                    if get_context_token_count(content) > max_file_length:
                         PrettyOutput.print(f"Truncating large file: {path}", OutputType.WARNING)
                         content = content[:max_file_length] + "\n... (content truncated)"

                     file_info = f"File: {path}\nContent: {content}\n\n"
-
+                    tokens_count = get_context_token_count(file_info)

                     # If adding this file would exceed batch limit
-                    if
+                    if current_token_count + tokens_count > max_batch_length:
                         # Process current batch
                         if current_batch:
-                            selected = self._process_batch(query, current_batch)
+                            selected = self._process_batch('\n'.join(query), current_batch)
                             all_selected_files.update(selected)
                         # Start new batch
                         current_batch = [file_info]
-
+                        current_token_count = tokens_count
                     else:
                         current_batch.append(file_info)
-
+                        current_token_count += tokens_count

                 except Exception as e:
                     PrettyOutput.print(f"Failed to read file {path}: {str(e)}", OutputType.ERROR)
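The batching above relies on get_context_token_count from jarvis.utils, whose body is also outside this diff. A minimal stand-in, assuming a rough whitespace-token approximation (the real helper may well delegate to a model tokenizer), could look like:

def get_context_token_count(text: str) -> int:
    # Hypothetical approximation; the actual jarvis.utils helper may use
    # a proper tokenizer rather than splitting on whitespace.
    return len(text.split())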
@@ -592,7 +579,7 @@ Content: {content}

             # Process final batch
             if current_batch:
-                selected = self._process_batch(query, current_batch)
+                selected = self._process_batch('\n'.join(query), current_batch)
                 all_selected_files.update(selected)

             # Convert set to list and maintain original order
@@ -604,33 +591,41 @@ Content: {content}
             return initial_results

     def _process_batch(self, query: str, files_info: List[str]) -> List[str]:
-        """Process a batch of files
-
-        Args:
-            query: Search query
-            files_info: List of file information strings
-
-        Returns:
-            List[str]: Selected file paths from this batch
-        """
-        prompt = f"""Please analyze the following code files and determine which files are most relevant to the given query. Consider file paths and code content to make your judgment.
+        """Process a batch of files"""
+        prompt = f"""As a code analysis expert, please help identify the most relevant files for the given query using chain-of-thought reasoning.

 Query: {query}

 Available files:
 {''.join(files_info)}

-
-
+Think through this step by step:
+1. First, analyze the query to identify key requirements and technical concepts
+2. For each file:
+   - Examine its path and content
+   - Assess how it relates to the query's requirements
+   - Consider both direct and indirect relationships
+   - Rate its relevance (high/medium/low)
+3. Select only files with clear relevance to the query
+4. Order files by relevance, with most relevant first
+
+Please output your selection in YAML format:
 <FILES>
-- path/to/
-- path/to/
+- path/to/most/relevant.py
+- path/to/next/relevant.py
 </FILES>

-
+Important:
+- Only include files that are truly relevant
+- Exclude files with weak or unclear connections
+- Focus on implementation rather than test files
+- Consider both file paths and content
+- Only output the file paths, no other text
+"""

         # Use a large model to evaluate
         model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
+        model.set_suppress_output(True)
         response = model.chat_until_success(prompt)

         # Parse the response
@@ -639,7 +634,6 @@ Note: Only include files that have a strong connection to the query."""
         if not files_match:
             return []

-        # Extract the file list
        try:
             selected_files = yaml.safe_load(files_match.group(1))
             return selected_files if selected_files else []
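For context, files_match is produced earlier in _process_batch by matching the <FILES> block in the model response. The exact pattern is not shown in this diff; based on the tags used in the prompt it is presumably something like:

import re
import yaml

# Presumed shape of the extraction step (the actual regex is outside this hunk)
files_match = re.search(r"<FILES>(.*?)</FILES>", response, re.DOTALL)
selected_files = yaml.safe_load(files_match.group(1)) if files_match else []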
@@ -657,7 +651,8 @@ Note: Only include files that have a strong connection to the query."""
             List[str]: The query variants list
         """
         model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
-
+        model.set_suppress_output(True)
+        prompt = f"""Please generate 10 different expressions optimized for vector search based on the following query. Each expression should:

 1. Focus on key technical concepts and terminology
 2. Use clear and specific language
@@ -666,7 +661,8 @@ Note: Only include files that have a strong connection to the query."""
 5. Maintain semantic similarity with original query
 6. Be suitable for embedding-based search

-Original query:
+Original query:
+{query}

 Example transformations:
 Query: "How to handle user login?"
@@ -708,7 +704,7 @@ Please provide 10 search-optimized expressions in the specified format.
 """
         results = {}
         for query in query_variants:
-            query_vector = self.
+            query_vector = get_embedding(self.embedding_model, query)
             query_vector = query_vector.reshape(1, -1)

             distances, indices = self.index.search(query_vector, top_k)  # type: ignore
@@ -744,7 +740,7 @@ Please provide 10 search-optimized expressions in the specified format.

             for variant in query_variants:
                 # Get vector for each variant
-                query_vector = self.
+                query_vector = get_embedding(self.embedding_model, variant)
                 query_vector = query_vector.reshape(1, -1)

                 # Search with current variant
@@ -767,14 +763,16 @@ Please provide 10 search-optimized expressions in the specified format.
             # Sort by similarity and take top_k
             all_results.sort(key=lambda x: x[1], reverse=True)
             results = all_results[:top_k]
-
+
             # Display results with scores
             message = "Found related files:\n"
             for path, score, _ in results:
                 message += f"File: {path} (Score: {score:.3f})\n"
             PrettyOutput.print(message.rstrip(), output_type=OutputType.INFO, lang="markdown")
+
+            results = self.pick_results(query_variants, [path for path, _, _ in results])

-            return
+            return results

         except Exception as e:
             PrettyOutput.print(f"Failed to search: {str(e)}", output_type=OutputType.ERROR)
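Taken together, search_similar now embeds each query variant, merges and ranks the vector hits, and passes the ranked paths through the LLM-based pick_results filter. A hypothetical usage sketch (the driver code and query string are illustrative, not from the package):

from jarvis.utils import find_git_root

# Hypothetical caller: build the index over the current git repo and search it
codebase = CodeBase(find_git_root())
files = codebase.search_similar("where are file embeddings cached?", top_k=5)
for path in files:
    print(path)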
@@ -784,15 +782,12 @@ Please provide 10 search-optimized expressions in the specified format.
         """Query the codebase with enhanced context building"""
         files_from_codebase = self.search_similar(query, top_k)

-
-        files_from_agent = find_relevant_files_from_agent(query, files_from_codebase)
-
-        if not files_from_agent:
+        if not files_from_codebase:
             PrettyOutput.print("No related files found", output_type=OutputType.WARNING)
             return ""

         output = "Found related files:\n"
-        for path in
+        for path in files_from_codebase:
             output += f"- {path}\n"
         PrettyOutput.print(output, output_type=OutputType.INFO, lang="markdown")

@@ -810,10 +805,10 @@ Question: {query}
 Relevant code files (ordered by relevance):
 """
         # Add context with length control
-
-
+        available_count = self.max_token_count - get_context_token_count(prompt) - 1000  # Reserve space for answer
+        current_count = 0

-        for path in
+        for path in files_from_codebase:
             try:
                 content = open(path, "r", encoding="utf-8").read()
                 file_content = f"""
@@ -822,7 +817,7 @@ Content:
 {content}
 ----------------------------------------
 """
-                if
+                if current_count + get_context_token_count(file_content) > available_count:
                     PrettyOutput.print(
                         "Due to context length limit, some files were omitted",
                         output_type=OutputType.WARNING
@@ -830,7 +825,7 @@ Content:
                     break

                 prompt += file_content
-
+                current_count += get_context_token_count(file_content)

             except Exception as e:
                 PrettyOutput.print(f"Failed to read file {path}: {str(e)}",
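pick_results and the query path above share the same budgeting pattern: reserve fixed headroom, then accumulate per-item token counts and stop before crossing the limit. A condensed sketch of that pattern, with hypothetical names:

def fill_within_budget(items: list, budget: int) -> list:
    """Keep prepared text chunks until the next one would exceed the token budget."""
    kept, used = [], 0
    for text in items:
        cost = get_context_token_count(text)
        if used + cost > budget:
            break  # the diff emits a warning and stops here
        kept.append(text)
        used += cost
    return kept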
jarvis/jarvis_platform/oyi.py
CHANGED
@@ -2,7 +2,7 @@ import mimetypes
 import os
 from typing import Dict, List, Tuple
 from jarvis.jarvis_platform.base import BasePlatform
-from jarvis.utils import PrettyOutput, OutputType
+from jarvis.utils import PrettyOutput, OutputType
 import requests
 import json
