jarvis-ai-assistant 0.1.108__py3-none-any.whl → 0.1.110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of jarvis-ai-assistant might be problematic.

@@ -7,7 +7,7 @@ from typing import List, Tuple, Optional, Dict
  from jarvis.jarvis_platform.registry import PlatformRegistry
  import concurrent.futures
  from concurrent.futures import ThreadPoolExecutor
- from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_file_md5, get_max_context_length, get_thread_count, load_embedding_model, user_confirm
+ from jarvis.utils import OutputType, PrettyOutput, find_git_root, get_context_token_count, get_embedding, get_file_md5, get_max_token_count, get_thread_count, load_embedding_model, user_confirm
  from jarvis.utils import init_env
  import argparse
  import pickle
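The swapped imports replace the character-based get_max_context_length with token-aware helpers (get_context_token_count, get_max_token_count) and a shared get_embedding utility. Their implementations live in jarvis.utils and are not part of this diff; a minimal sketch of the two token helpers, assuming a tiktoken-style tokenizer and an illustrative environment variable (both assumptions, not confirmed by the diff):

# Hypothetical sketch only -- the real helpers live in jarvis.utils and are not shown here.
import os
import tiktoken

_encoding = tiktoken.get_encoding("cl100k_base")  # assumed tokenizer choice

def get_context_token_count(text: str) -> int:
    """Count tokens in text instead of characters."""
    return len(_encoding.encode(text))

def get_max_token_count() -> int:
    """Upper bound on tokens allowed in one model context."""
    # The environment variable name is illustrative, not taken from the package.
    return int(os.environ.get("JARVIS_MAX_TOKEN_COUNT", "16384"))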
@@ -21,7 +21,7 @@ class CodeBase:
  self.root_dir = root_dir
  os.chdir(self.root_dir)
  self.thread_count = get_thread_count()
- self.max_context_length = get_max_context_length()
+ self.max_token_count = get_max_token_count()
  self.index = None

  # Initialize the data directory
@@ -209,19 +209,6 @@ Code content:

  return cached_data["vector"]

- def get_embedding(self, text: str) -> np.ndarray:
- """Use the transformers model to get the vector representation of text"""
- # Truncate long text
- max_length = 512 # Or other suitable length
- text = ' '.join(text.split()[:max_length])
-
- # Get the embedding vector
- embedding = self.embedding_model.encode(text,
- normalize_embeddings=True, # L2 normalization
- show_progress_bar=False)
- vector = np.array(embedding, dtype=np.float32)
- return vector
-
  def vectorize_file(self, file_path: str, description: str) -> np.ndarray:
  """Vectorize the file content and description"""
  try:
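The get_embedding method removed above now exists as a shared helper in jarvis.utils, taking the model explicitly (the call sites below use get_embedding(self.embedding_model, ...)). Its exact implementation is not shown in this diff; reconstructed from the removed method, it would look roughly like:

import numpy as np

def get_embedding(embedding_model, text: str) -> np.ndarray:
    """Sketch of the module-level helper, based on the method removed above."""
    # Truncate long text to 512 whitespace-separated tokens, as in the removed method
    max_length = 512
    text = ' '.join(text.split()[:max_length])

    # L2-normalized sentence embedding, no progress bar
    embedding = embedding_model.encode(text,
                                       normalize_embeddings=True,
                                       show_progress_bar=False)
    return np.array(embedding, dtype=np.float32)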
@@ -231,7 +218,7 @@ Code content:
  return cached_vector

  # Read the file content and combine information
- content = open(file_path, "r", encoding="utf-8").read()[:self.max_context_length] # Limit the file content length
+ content = open(file_path, "r", encoding="utf-8").read()[:self.max_token_count] # Limit the file content length

  # Combine file information, including file content
  combined_text = f"""
@@ -239,7 +226,7 @@ File path: {file_path}
  Description: {description}
  Content: {content}
  """
- vector = self.get_embedding(combined_text)
+ vector = get_embedding(self.embedding_model, combined_text)

  # Save to cache
  self.cache_vector(file_path, vector, description)
@@ -537,7 +524,7 @@ Content: {content}
  score = len(matched_keywords) / len(keywords)
  return score

- def pick_results(self, query: str, initial_results: List[str]) -> List[str]:
+ def pick_results(self, query: List[str], initial_results: List[str]) -> List[str]:
  """Use a large model to pick the search results

  Args:
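Because pick_results now expects the full list of query variants rather than a single string, callers (see the search path further down) pass the variant list directly. An illustrative call, with hypothetical variable names:

# Hypothetical usage -- names are illustrative only.
query_variants = ["how is user login handled", "authentication flow implementation"]
selected_paths = codebase.pick_results(query_variants, candidate_paths)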
@@ -551,40 +538,40 @@ Content: {content}
  return []

  try:
- PrettyOutput.print(f"Picking results for query: {query}", output_type=OutputType.INFO)
+ PrettyOutput.print(f"Picking results for query: \n" + "\n".join(query), output_type=OutputType.INFO)

  # Maximum content length per batch
- max_batch_length = self.max_context_length - 1000 # Reserve space for prompt
+ max_batch_length = self.max_token_count - 1000 # Reserve space for prompt
  max_file_length = max_batch_length // 3 # Limit individual file size

  # Process files in batches
  all_selected_files = set()
  current_batch = []
- current_length = 0
+ current_token_count = 0

  for path in initial_results:
  try:
  content = open(path, "r", encoding="utf-8").read()
  # Truncate large files
- if len(content) > max_file_length:
+ if get_context_token_count(content) > max_file_length:
  PrettyOutput.print(f"Truncating large file: {path}", OutputType.WARNING)
  content = content[:max_file_length] + "\n... (content truncated)"

  file_info = f"File: {path}\nContent: {content}\n\n"
- file_length = len(file_info)
+ tokens_count = get_context_token_count(file_info)

  # If adding this file would exceed batch limit
- if current_length + file_length > max_batch_length:
+ if current_token_count + tokens_count > max_batch_length:
  # Process current batch
  if current_batch:
- selected = self._process_batch(query, current_batch)
+ selected = self._process_batch('\n'.join(query), current_batch)
  all_selected_files.update(selected)
  # Start new batch
  current_batch = [file_info]
- current_length = file_length
+ current_token_count = tokens_count
  else:
  current_batch.append(file_info)
- current_length += file_length
+ current_token_count += tokens_count

  except Exception as e:
  PrettyOutput.print(f"Failed to read file {path}: {str(e)}", OutputType.ERROR)
@@ -592,7 +579,7 @@ Content: {content}

  # Process final batch
  if current_batch:
- selected = self._process_batch(query, current_batch)
+ selected = self._process_batch('\n'.join(query), current_batch)
  all_selected_files.update(selected)

  # Convert set to list and maintain original order
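The batching above now accumulates file blurbs by token count rather than character count: blurbs are added until the per-prompt token budget would be exceeded, the batch is handed to the model, and a new batch is started, with the final partial batch flushed at the end. The same pattern as a standalone sketch (the helper names follow the diff; the function itself is illustrative):

from typing import Callable, List

def batch_by_token_budget(file_infos: List[str],
                          max_batch_tokens: int,
                          count_tokens: Callable[[str], int]) -> List[List[str]]:
    """Group file blurbs into batches that each fit a token budget."""
    batches: List[List[str]] = []
    current_batch: List[str] = []
    current_tokens = 0
    for info in file_infos:
        tokens = count_tokens(info)
        if current_batch and current_tokens + tokens > max_batch_tokens:
            batches.append(current_batch)      # flush the full batch
            current_batch, current_tokens = [], 0
        current_batch.append(info)
        current_tokens += tokens
    if current_batch:
        batches.append(current_batch)          # final partial batch
    return batches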
@@ -604,33 +591,41 @@ Content: {content}
  return initial_results

  def _process_batch(self, query: str, files_info: List[str]) -> List[str]:
- """Process a batch of files
-
- Args:
- query: Search query
- files_info: List of file information strings
-
- Returns:
- List[str]: Selected file paths from this batch
- """
- prompt = f"""Please analyze the following code files and determine which files are most relevant to the given query. Consider file paths and code content to make your judgment.
+ """Process a batch of files"""
+ prompt = f"""As a code analysis expert, please help identify the most relevant files for the given query using chain-of-thought reasoning.

  Query: {query}

  Available files:
  {''.join(files_info)}

- Please output a YAML list of relevant file paths, ordered by relevance (most relevant first). Only include files that are truly relevant to the query.
- Output format:
+ Think through this step by step:
+ 1. First, analyze the query to identify key requirements and technical concepts
+ 2. For each file:
+ - Examine its path and content
+ - Assess how it relates to the query's requirements
+ - Consider both direct and indirect relationships
+ - Rate its relevance (high/medium/low)
+ 3. Select only files with clear relevance to the query
+ 4. Order files by relevance, with most relevant first
+
+ Please output your selection in YAML format:
  <FILES>
- - path/to/file1.py
- - path/to/file2.py
+ - path/to/most/relevant.py
+ - path/to/next/relevant.py
  </FILES>

- Note: Only include files that have a strong connection to the query."""
+ Important:
+ - Only include files that are truly relevant
+ - Exclude files with weak or unclear connections
+ - Focus on implementation rather than test files
+ - Consider both file paths and content
+ - Only output the file paths, no other text
+ """

  # Use a large model to evaluate
  model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
+ model.set_suppress_output(True)
  response = model.chat_until_success(prompt)

  # Parse the response
@@ -639,7 +634,6 @@ Note: Only include files that have a strong connection to the query."""
  if not files_match:
  return []

- # Extract the file list
  try:
  selected_files = yaml.safe_load(files_match.group(1))
  return selected_files if selected_files else []
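The parsing above extracts the YAML list between <FILES> tags (files_match) and passes it to yaml.safe_load. The extraction regex itself is outside this diff; a plausible version, for illustration:

import re
from typing import List

import yaml

def parse_selected_files(response: str) -> List[str]:
    """Pull the YAML list out of the <FILES>...</FILES> block, if present."""
    # The exact regex used by the package is not shown in the diff; this one is illustrative.
    files_match = re.search(r"<FILES>(.*?)</FILES>", response, re.DOTALL)
    if not files_match:
        return []
    selected_files = yaml.safe_load(files_match.group(1))
    return selected_files if selected_files else []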
@@ -657,7 +651,8 @@ Note: Only include files that have a strong connection to the query."""
  List[str]: The query variants list
  """
  model = PlatformRegistry.get_global_platform_registry().get_normal_platform()
- prompt = f"""Please generate 3 different expressions optimized for vector search based on the following query. Each expression should:
+ model.set_suppress_output(True)
+ prompt = f"""Please generate 10 different expressions optimized for vector search based on the following query. Each expression should:

  1. Focus on key technical concepts and terminology
  2. Use clear and specific language
@@ -666,7 +661,8 @@ Note: Only include files that have a strong connection to the query."""
  5. Maintain semantic similarity with original query
  6. Be suitable for embedding-based search

- Original query: {query}
+ Original query:
+ {query}

  Example transformations:
  Query: "How to handle user login?"
@@ -708,7 +704,7 @@ Please provide 10 search-optimized expressions in the specified format.
  """
  results = {}
  for query in query_variants:
- query_vector = self.get_embedding(query)
+ query_vector = get_embedding(self.embedding_model, query)
  query_vector = query_vector.reshape(1, -1)

  distances, indices = self.index.search(query_vector, top_k) # type: ignore
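Each query variant is embedded, reshaped to a (1, dim) matrix, and searched against the FAISS index. Assuming the index is an inner-product index over the L2-normalized embeddings (the index construction is not part of this diff), the returned distances behave like cosine similarities. A minimal sketch under that assumption:

import faiss
import numpy as np

dim = 384                                   # illustrative embedding dimension
index = faiss.IndexFlatIP(dim)              # inner product over normalized vectors ~ cosine

vectors = np.random.rand(100, dim).astype(np.float32)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)    # normalize like the file embeddings
index.add(vectors)

query_vector = np.random.rand(dim).astype(np.float32)
query_vector /= np.linalg.norm(query_vector)
distances, indices = index.search(query_vector.reshape(1, -1), 5)   # top-5 most similar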
@@ -744,7 +740,7 @@ Please provide 10 search-optimized expressions in the specified format.

  for variant in query_variants:
  # Get vector for each variant
- query_vector = self.get_embedding(variant)
+ query_vector = get_embedding(self.embedding_model, variant)
  query_vector = query_vector.reshape(1, -1)

  # Search with current variant
@@ -767,14 +763,16 @@ Please provide 10 search-optimized expressions in the specified format.
  # Sort by similarity and take top_k
  all_results.sort(key=lambda x: x[1], reverse=True)
  results = all_results[:top_k]
-
+
  # Display results with scores
  message = "Found related files:\n"
  for path, score, _ in results:
  message += f"File: {path} (Score: {score:.3f})\n"
  PrettyOutput.print(message.rstrip(), output_type=OutputType.INFO, lang="markdown")
+
+ results = self.pick_results(query_variants, [path for path, _, _ in results])

- return [path for path, _, _ in results]
+ return results

  except Exception as e:
  PrettyOutput.print(f"Failed to search: {str(e)}", output_type=OutputType.ERROR)
@@ -784,15 +782,12 @@ Please provide 10 search-optimized expressions in the specified format.
  """Query the codebase with enhanced context building"""
  files_from_codebase = self.search_similar(query, top_k)

- from jarvis.jarvis_code_agent.relevant_files import find_relevant_files_from_agent
- files_from_agent = find_relevant_files_from_agent(query, files_from_codebase)
-
- if not files_from_agent:
+ if not files_from_codebase:
  PrettyOutput.print("No related files found", output_type=OutputType.WARNING)
  return ""

  output = "Found related files:\n"
- for path in files_from_agent:
+ for path in files_from_codebase:
  output += f"- {path}\n"
  PrettyOutput.print(output, output_type=OutputType.INFO, lang="markdown")

@@ -810,10 +805,10 @@ Question: {query}
  Relevant code files (ordered by relevance):
  """
  # Add context with length control
- available_length = self.max_context_length - len(prompt) - 1000 # Reserve space for answer
- current_length = 0
+ available_count = self.max_token_count - get_context_token_count(prompt) - 1000 # Reserve space for answer
+ current_count = 0

- for path in files_from_agent:
+ for path in files_from_codebase:
  try:
  content = open(path, "r", encoding="utf-8").read()
  file_content = f"""
@@ -822,7 +817,7 @@ Content:
  {content}
  ----------------------------------------
  """
- if current_length + len(file_content) > available_length:
+ if current_count + get_context_token_count(file_content) > available_count:
  PrettyOutput.print(
  "Due to context length limit, some files were omitted",
  output_type=OutputType.WARNING
@@ -830,7 +825,7 @@ Content:
  break

  prompt += file_content
- current_length += len(file_content)
+ current_count += get_context_token_count(file_content)

  except Exception as e:
  PrettyOutput.print(f"Failed to read file {path}: {str(e)}",
@@ -2,7 +2,7 @@ import mimetypes
  import os
  from typing import Dict, List, Tuple
  from jarvis.jarvis_platform.base import BasePlatform
- from jarvis.utils import PrettyOutput, OutputType, get_max_context_length
+ from jarvis.utils import PrettyOutput, OutputType
  import requests
  import json