auto-coder 0.1.288__py3-none-any.whl → 0.1.289__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder might be problematic.
- {auto_coder-0.1.288.dist-info → auto_coder-0.1.289.dist-info}/METADATA +1 -1
- {auto_coder-0.1.288.dist-info → auto_coder-0.1.289.dist-info}/RECORD +14 -13
- autocoder/chat_auto_coder_lang.py +16 -16
- autocoder/common/auto_coder_lang.py +16 -4
- autocoder/common/mcp_hub.py +99 -77
- autocoder/common/mcp_server.py +162 -61
- autocoder/index/filter/quick_filter.py +373 -3
- autocoder/rag/long_context_rag.py +22 -9
- autocoder/rag/searchable.py +58 -0
- autocoder/version.py +1 -1
- {auto_coder-0.1.288.dist-info → auto_coder-0.1.289.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.288.dist-info → auto_coder-0.1.289.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.288.dist-info → auto_coder-0.1.289.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.288.dist-info → auto_coder-0.1.289.dist-info}/top_level.txt +0 -0
autocoder/index/filter/quick_filter.py
CHANGED

```diff
@@ -22,6 +22,8 @@ from autocoder.utils.llms import get_llm_names, get_model_info
 from loguru import logger
 from byzerllm.utils.client.code_utils import extract_code
 import json
+from autocoder.index.symbols_utils import extract_symbols
+import os.path
 
 
 def get_file_path(file_path):
```
```diff
@@ -389,15 +391,45 @@ class QuickFilter():
 
         tokens_len = count_tokens(prompt_str)
 
-        #
+        # Print the current index size
         self.printer.print_in_terminal(
             "quick_filter_tokens_len",
             style="blue",
             tokens_len=tokens_len
         )
-
-        if tokens_len > self.max_tokens:
+
+        if tokens_len > self.max_tokens and tokens_len < 4*self.max_tokens:
+            # Report that big_filter mode was selected
+            self.printer.print_in_terminal(
+                "filter_mode_big",
+                style="yellow",
+                tokens_len=tokens_len
+            )
             return self.big_filter(index_items)
+        elif tokens_len > 4*self.max_tokens:
+            # Report that super_big_filter mode was selected
+            self.printer.print_in_terminal(
+                "filter_mode_super_big",
+                style="yellow",
+                tokens_len=tokens_len
+            )
+            round1 = self.super_big_filter(index_items)
+            round1_index_items = []
+            for file_path in round1.files.keys():
+                for index_item in index_items:
+                    if index_item.module_name == file_path:
+                        round1_index_items.append(index_item)
+
+            if round1_index_items:
+                round2 = self.big_filter(round1_index_items)
+                return round2
+            return round1
+        else:
+            # Report that the normal filter mode was selected
+            self.printer.print_in_terminal(
+                "filter_mode_normal",
+                style="blue"
+            )
 
         try:
             # Get the model names
```
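Taken together, the new branches give QuickFilter a three-tier routing scheme keyed to the prompt's token count. A minimal sketch of just the decision logic, with the printer calls and class plumbing stripped out (the thresholds come from the diff above; `route_filter` is a hypothetical standalone helper, not part of auto-coder's API):

```python
# Hypothetical helper mirroring the routing added above; not part of auto-coder.
def route_filter(tokens_len: int, max_tokens: int) -> str:
    if tokens_len > max_tokens and tokens_len < 4 * max_tokens:
        return "big_filter"                      # one pass over the full index
    elif tokens_len > 4 * max_tokens:
        # round 1: super_big_filter narrows on filename + usage only;
        # round 2: big_filter re-ranks the survivors with the full index items
        return "super_big_filter -> big_filter"
    else:
        return "normal"                          # index fits in a single prompt

assert route_filter(500, 1000) == "normal"
assert route_filter(1500, 1000) == "big_filter"
assert route_filter(5000, 1000) == "super_big_filter -> big_filter"
```

Note that `tokens_len == 4*self.max_tokens` falls through to the normal branch, since both comparisons in the new code are strict.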
````diff
@@ -520,3 +552,341 @@ class QuickFilter():
             has_error=False,
             file_positions=final_file_positions
         )
+
+    def super_big_filter(self, index_items: List[IndexItem]) -> QuickFilterResult:
+        """
+        Filter method for extremely large indexes: reduces the token count by extracting each file's core information (filename and usage).
+        Handles very large index files by splitting them into multiple chunks that are processed in parallel.
+        """
+        compact_items = []
+
+        # Convert each index item into a more compact form: keep only the filename and usage
+        for index, item in enumerate(index_items):
+            # Extract the filename from module_name
+            filename = os.path.basename(item.module_name)
+
+            # Extract the usage description from symbols
+            symbols_info = extract_symbols(item.symbols)
+            usage = symbols_info.usage if symbols_info.usage else "无用途描述"
+
+            # Build the compact representation
+            compact_item = {
+                "index": index,
+                "filename": filename,
+                "full_path": item.module_name,
+                "usage": usage
+            }
+            compact_items.append(compact_item)
+
+        # Split compact_items into multiple chunks
+        chunks = []
+        current_chunk = []
+        batch_size = 100  # Check the token count every 100 records
+
+        # Compute the total token length
+        full_prompt = self.super_big_quick_filter_files.prompt(compact_items, self.args.query)
+        tokens_len = count_tokens(full_prompt)
+
+        # If the token length does not exceed max_tokens, process the whole list at once
+        if tokens_len <= self.max_tokens:
+            return self._process_compact_items(compact_items, index_items)
+
+        # Otherwise split compact_items into chunks, checking every 100 records
+        for i, item in enumerate(compact_items):
+            current_chunk.append(item)
+
+            # Check once every batch_size records, or when the end is reached
+            if (i + 1) % batch_size == 0 or i == len(compact_items) - 1:
+                temp_prompt = self.super_big_quick_filter_files.prompt(current_chunk, self.args.query)
+                temp_size = count_tokens(temp_prompt)
+
+                # If the current chunk exceeds the token limit, split at the current position
+                if temp_size > self.max_tokens:
+                    # If the current chunk is small, still add at least one item
+                    if len(current_chunk) <= batch_size:
+                        # This is the first batch but it already exceeds the limit; keep at least half of the records
+                        split_index = max(1, len(current_chunk) // 2)
+                        chunks.append(current_chunk[:split_index])
+                        current_chunk = current_chunk[split_index:]
+                    else:
+                        # Treat the items up to the previous batch as one chunk
+                        prev_batch_end = len(current_chunk) - (i % batch_size + 1)
+                        if prev_batch_end > 0:
+                            chunks.append(current_chunk[:prev_batch_end])
+                            current_chunk = current_chunk[prev_batch_end:]
+                        else:
+                            # Extreme case: even a single record exceeds the limit; add it on its own
+                            chunks.append([current_chunk[0]])
+                            current_chunk = current_chunk[1:]
+
+        # Make sure the final chunk is added as well
+        if current_chunk:
+            chunks.append(current_chunk)
+
+        # Print the split information
+        self.printer.print_in_terminal(
+            "super_big_filter_splitting",
+            style="yellow",
+            tokens_len=tokens_len,
+            max_tokens=self.max_tokens,
+            split_size=len(chunks)
+        )
+
+        # Define the function that processes a single chunk
+        def process_chunk(chunk_index: int, chunk: List[dict]) -> QuickFilterResult:
+            # To avoid showing the UI for every chunk, only show it for the first one
+            if chunk_index == 0:
+                # Processing path with UI
+                return self._process_compact_items(chunk, index_items, show_ui=True, chunk_index=chunk_index)
+            else:
+                # Processing path without UI
+                return self._process_compact_items(chunk, index_items, show_ui=False, chunk_index=chunk_index)
+
+        # Process all chunks in parallel with a ThreadPoolExecutor
+        results: List[QuickFilterResult] = []
+        if chunks:
+            with ThreadPoolExecutor() as executor:
+                futures = [executor.submit(process_chunk, i, chunk) for i, chunk in enumerate(chunks)]
+                for future in futures:
+                    results.append(future.result())
+
+        # Merge all results
+        final_files: Dict[str, TargetFile] = {}
+        final_file_positions: Dict[str, int] = {}
+        has_error = False
+        error_messages: List[str] = []
+
+        # Collect all files and error messages
+        for result in results:
+            if result.has_error:
+                has_error = True
+                if result.error_message:
+                    error_messages.append(result.error_message)
+            final_files.update(result.files)
+
+        # Interleave the file_positions from all chunk results
+        max_position = max([max(pos.values()) for pos in [result.file_positions for result in results if result.file_positions]] + [0])
+
+        # Build the position map
+        position_map = {}
+        for result in results:
+            if result.file_positions:
+                for file_path, position in result.file_positions.items():
+                    if position not in position_map:
+                        position_map[position] = []
+                    position_map[position].append(file_path)
+
+        # Re-rank the file paths
+        current_index = 0
+        for position in range(max_position + 1):
+            if position in position_map:
+                for file_path in position_map[position]:
+                    final_file_positions[file_path] = current_index
+                    current_index += 1
+
+        return QuickFilterResult(
+            files=final_files,
+            has_error=has_error,
+            error_message="\n".join(error_messages) if error_messages else None,
+            file_positions=final_file_positions
+        )
+
+    def _process_compact_items(self, compact_items: List[dict], index_items: List[IndexItem], show_ui: bool = True, chunk_index: int = 0) -> QuickFilterResult:
+        """
+        Process one batch of compact_items and return a QuickFilterResult.
+        """
+        # Streamed output handling
+        model_names = get_llm_names(self.index_manager.index_filter_llm)
+        model_name = ",".join(model_names)
+
+        # Fetch the models' pricing information
+        model_info_map = {}
+        for name in model_names:
+            info = get_model_info(name, self.args.product_mode)
+            if info:
+                model_info_map[name] = {
+                    "input_price": info.get("input_price", 0.0),
+                    "output_price": info.get("output_price", 0.0)
+                }
+
+        try:
+            start_time = time.monotonic()
+            # Render the prompt template
+            prompt = self.super_big_quick_filter_files.prompt(compact_items, self.args.query)
+
+            if show_ui:
+                # Streamed output handling
+                stream_generator = stream_chat_with_continue(
+                    self.index_manager.index_filter_llm,
+                    [{"role": "user", "content": prompt}],
+                    {}
+                )
+
+                def extract_file_number_list(content: str) -> str:
+                    try:
+                        v = to_model(content, FileNumberList)
+                        return "\n".join([index_items[compact_items[file_number]["index"]].module_name for file_number in v.file_list])
+                    except Exception as e:
+                        logger.error(f"Error extracting file number list: {e}")
+                        return content
+
+                # Collect the full response
+                full_response, last_meta = stream_out(
+                    stream_generator,
+                    model_name=model_name,
+                    title=self.printer.get_message_from_key_with_format(
+                        "super_big_filter_title", model_name=model_name),
+                    args=self.args,
+                    display_func=extract_file_number_list
+                )
+
+                # Parse the result
+                file_number_list = to_model(full_response, FileNumberList)
+                end_time = time.monotonic()
+
+                # Compute the total cost
+                total_input_cost = 0.0
+                total_output_cost = 0.0
+
+                for name in model_names:
+                    info = model_info_map.get(name, {})
+                    total_input_cost += (last_meta.input_tokens_count *
+                                         info.get("input_price", 0.0)) / 1000000
+                    total_output_cost += (last_meta.generated_tokens_count *
+                                          info.get("output_price", 0.0)) / 1000000
+
+                # Round to 4 decimal places
+                total_input_cost = round(total_input_cost, 4)
+                total_output_cost = round(total_output_cost, 4)
+                speed = last_meta.generated_tokens_count / (end_time - start_time)
+
+                # Print token statistics and cost
+                self.printer.print_in_terminal(
+                    "super_big_filter_stats",
+                    style="blue",
+                    elapsed_time=f"{end_time - start_time:.2f}",
+                    input_tokens=last_meta.input_tokens_count,
+                    output_tokens=last_meta.generated_tokens_count,
+                    input_cost=total_input_cost,
+                    output_cost=total_output_cost,
+                    model_names=model_name,
+                    speed=f"{speed:.2f}",
+                    chunk_index=chunk_index
+                )
+            else:
+                # Non-UI mode: call the LLM directly
+                meta_holder = MetaHolder()
+                file_number_list = self.super_big_quick_filter_files.with_llm(self.index_manager.index_filter_llm).with_meta(
+                    meta_holder).with_return_type(FileNumberList).run(compact_items, self.args.query)
+                end_time = time.monotonic()
+
+                # Print processing information
+                if meta_holder.get_meta():
+                    meta_dict = meta_holder.get_meta()
+                    total_input_cost = meta_dict.get("input_tokens_count", 0) * model_info_map.get(model_name, {}).get("input_price", 0.0) / 1000000
+                    total_output_cost = meta_dict.get("generated_tokens_count", 0) * model_info_map.get(model_name, {}).get("output_price", 0.0) / 1000000
+
+                    self.printer.print_in_terminal(
+                        "super_big_filter_stats",
+                        style="blue",
+                        input_tokens=meta_dict.get("input_tokens_count", 0),
+                        output_tokens=meta_dict.get("generated_tokens_count", 0),
+                        input_cost=total_input_cost,
+                        output_cost=total_output_cost,
+                        model_names=model_name,
+                        elapsed_time=f"{end_time - start_time:.2f}",
+                        chunk_index=chunk_index
+                    )
+
+            # Build the return value
+            files = {}
+            file_positions = {}
+
+            if file_number_list:
+                validated_file_numbers = []
+                for file_number in file_number_list.file_list:
+                    if file_number < 0 or file_number >= len(compact_items):
+                        self.printer.print_in_terminal(
+                            "invalid_file_number",
+                            style="yellow",
+                            file_number=file_number,
+                            total_files=len(compact_items)
+                        )
+                        continue
+
+                    # Map back to the actual index_item index
+                    original_index = compact_items[file_number]["index"]
+                    validated_file_numbers.append(original_index)
+
+                # Add the finally selected files to files
+                for index, file_number in enumerate(validated_file_numbers):
+                    file_path = get_file_path(index_items[file_number].module_name)
+                    files[file_path] = TargetFile(
+                        file_path=index_items[file_number].module_name,
+                        reason=self.printer.get_message_from_key("quick_filter_reason")
+                    )
+                    file_positions[file_path] = index
+
+            return QuickFilterResult(
+                files=files,
+                has_error=False,
+                file_positions=file_positions
+            )
+
+        except Exception as e:
+            self.printer.print_in_terminal(
+                "super_big_filter_failed",
+                style="red",
+                error=str(e)
+            )
+            return QuickFilterResult(
+                files={},
+                has_error=True,
+                error_message=str(e)
+            )
+
+    @byzerllm.prompt()
+    def super_big_quick_filter_files(self, compact_items: List[dict], query: str) -> str:
+        '''
+        When the user raises a requirement, we need to find the relevant source files.
+
+        Below is the simplified index file list; each entry contains a file index (index, the part wrapped in ##[]), a filename, and a usage description (usage):
+
+        <index>
+        {{ file_meta_str }}
+        </index>
+
+        Below is the user's query:
+
+        <query>
+        {{ query }}
+        </query>
+
+        Based on the user's requirement, find the relevant files and give the list of file indexes. Return JSON in the following format:
+
+        ```json
+        {
+            "file_list": [
+                file_index1,
+                file_index2,
+                ...
+            ]
+        }
+        ```
+
+        Special notes:
+        1. If the user's query contains @file or @@ symbols, match the corresponding filenames and return those files first.
+        2. From the requirement, identify the files that need to be modified (edited_files) and the files that may serve as references (reference_files).
+        3. Order the file indexes in file_list as: files mentioned with @, then edited_files, then reference_files.
+        4. If the query is a piece of conversation history, the files mentioned in it must be returned.
+        5. If the user's requirement is empty, simply return an empty list.
+        6. The returned JSON data must not contain comments.
+        '''
+        file_meta_str = "\n".join(
+            [f"##[{index}]{item['filename']}\n{item['usage']}" for index, item in enumerate(compact_items)])
+
+        context = {
+            "file_meta_str": file_meta_str,
+            "query": query
+        }
+        return context
````
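For reference, here is roughly what `super_big_quick_filter_files` renders and what the model is expected to return. The entries are invented for illustration; the `##[index]` layout and the `file_list` JSON schema come straight from the template above:

```python
# Invented example data; the rendering mirrors super_big_quick_filter_files.
compact_items = [
    {"index": 0, "filename": "quick_filter.py",
     "full_path": "autocoder/index/filter/quick_filter.py",
     "usage": "Filters index items relevant to a query"},
    {"index": 1, "filename": "searchable.py",
     "full_path": "autocoder/rag/searchable.py",
     "usage": "Ranks files by how often they occur in search results"},
]
file_meta_str = "\n".join(
    f"##[{index}]{item['filename']}\n{item['usage']}"
    for index, item in enumerate(compact_items))
print(file_meta_str)
# ##[0]quick_filter.py
# Filters index items relevant to a query
# ##[1]searchable.py
# Ranks files by how often they occur in search results

# The model answers with positions in this compact list, which
# _process_compact_items maps back through each entry's "index" field:
expected_reply = '{"file_list": [1, 0]}'
```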
autocoder/rag/long_context_rag.py
CHANGED

```diff
@@ -38,7 +38,7 @@ from pydantic import BaseModel
 from byzerllm.utils.types import SingleOutputMeta
 from autocoder.rag.lang import get_message_with_format_and_newline
 from autocoder.rag.qa_conversation_strategy import get_qa_strategy
-
+from autocoder.rag.searchable import SearchableResults
 try:
     from autocoder_pro.rag.llm_compute import LLMComputeEngine
     pro_version = version("auto-coder-pro")
```
```diff
@@ -257,7 +257,7 @@ class LongContextRAG:
 Based on the provided document content, the user's conversation history, and the final question, extract and summarize the important information in the documents that is relevant to the question.
 If the documents contain no relevant information, reply "该文档中没有与问题相关的信息".
 Keep the extracted information as close to the original text as possible, and output only that information.
-"""
+        """
```
```diff
@@ -500,6 +500,9 @@ class LongContextRAG:
         except json.JSONDecodeError:
             pass
 
+        if not only_contexts and extra_request_params.get("only_contexts", False):
+            only_contexts = True
+
         logger.info(f"Query: {query} only_contexts: {only_contexts}")
         start_time = time.time()
 
```
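A small sketch of what the added override enables: callers can now force contexts-only mode through `extra_request_params` even when the explicit flag is off. The surrounding method's signature is not visible in this hunk, so the wrapper below is hypothetical; only the two-line override itself comes from the diff:

```python
# Hypothetical wrapper; only the override logic appears in the diff above.
def resolve_only_contexts(only_contexts: bool, extra_request_params: dict) -> bool:
    if not only_contexts and extra_request_params.get("only_contexts", False):
        only_contexts = True
    return only_contexts

assert resolve_only_contexts(False, {"only_contexts": True}) is True
assert resolve_only_contexts(False, {}) is False
assert resolve_only_contexts(True, {}) is True
```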
```diff
@@ -593,10 +596,19 @@ class LongContextRAG:
         )
 
         if only_contexts:
-
-
-
-
+            try:
+                searcher = SearchableResults()
+                result = searcher.reorder(docs=relevant_docs)
+                yield (json.dumps(result.model_dump(), ensure_ascii=False), SingleOutputMeta(input_tokens_count=rag_stat.recall_stat.total_input_tokens + rag_stat.chunk_stat.total_input_tokens,
+                                                                                             generated_tokens_count=rag_stat.recall_stat.total_generated_tokens +
+                                                                                             rag_stat.chunk_stat.total_generated_tokens,
+                                                                                             ))
+            except Exception as e:
+                yield (str(e), SingleOutputMeta(input_tokens_count=rag_stat.recall_stat.total_input_tokens + rag_stat.chunk_stat.total_input_tokens,
+                                                generated_tokens_count=rag_stat.recall_stat.total_generated_tokens +
+                                                rag_stat.chunk_stat.total_generated_tokens,
+                                                ))
+            return
 
         if not relevant_docs:
             yield ("没有找到可以回答你问题的相关文档", SingleOutputMeta(input_tokens_count=rag_stat.recall_stat.total_input_tokens + rag_stat.chunk_stat.total_input_tokens,
```
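With `only_contexts` set, the stream now yields one JSON payload, the `model_dump()` of a `FileResult` from the new `searchable.py` below, instead of a generated answer. The paths here are invented; the field names come from the models:

```python
# Shape of result.model_dump() as serialized above; the paths are invented.
payload = {
    "files": [
        {"file_path": "docs/guide.md", "count": 3, "score": 0.0},
        {"file_path": "docs/api.md", "count": 1, "score": 0.0},
    ]
}
```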
```diff
@@ -816,12 +828,13 @@ class LongContextRAG:
 
             self._print_rag_stats(rag_stat)
         else:
-
-            qa_strategy = get_qa_strategy(
+
+            qa_strategy = get_qa_strategy(
+                self.args.rag_qa_conversation_strategy)
             new_conversations = qa_strategy.create_conversation(
                 documents=[doc.source_code for doc in relevant_docs],
                 conversations=conversations
-)
+            )
 
             chunks = target_llm.stream_chat_oai(
                 conversations=new_conversations,
```
autocoder/rag/searchable.py
ADDED

```diff
@@ -0,0 +1,58 @@
+import json
+from collections import Counter
+from typing import Dict, List, Any, Optional, Tuple, Set
+from pydantic import BaseModel
+from autocoder.rag.relevant_utils import FilterDoc
+
+
+class FileOccurrence(BaseModel):
+    """Represents a file and its occurrence count in search results"""
+    file_path: str
+    count: int
+    score: float = 0.0  # Optional relevance score
+
+class FileResult(BaseModel):
+    files: List[FileOccurrence]
+
+class SearchableResults:
+    """Class to process and organize search results by file frequency"""
+
+    def __init__(self):
+        """Initialize the SearchableResults instance"""
+        pass
+
+    def extract_original_docs(self, docs: List[FilterDoc]) -> List[str]:
+        """Extract all original_docs from a list of document metadata"""
+        all_files = []
+
+        for doc in docs:
+            # Extract from metadata if available
+            metadata = doc.source_code.metadata
+            if "original_docs" in metadata:
+                all_files.extend(metadata["original_docs"])
+            # Also include the module_name from source_code as a fallback
+            else:
+                all_files.append(doc.source_code.module_name)
+
+        return all_files
+
+    def count_file_occurrences(self, files: List[str]) -> List[FileOccurrence]:
+        """Count occurrences of each file and return sorted list"""
+        # Count occurrences
+        counter = Counter(files)
+
+        # Convert to FileOccurrence objects
+        occurrences = [
+            FileOccurrence(file_path=file_path, count=count)
+            for file_path, count in counter.items()
+        ]
+
+        # Sort by count (descending)
+        return sorted(occurrences, key=lambda x: x.count, reverse=True)
+
+    def reorder(self, docs: List[FilterDoc]) -> FileResult:
+        """Process search results to extract and rank files by occurrence (main entry point)"""
+        all_files = self.extract_original_docs(docs)
+        return FileResult(files=self.count_file_occurrences(all_files))
+
+
```
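A quick sketch of the ranking this class performs. Rather than building real `FilterDoc` objects (their constructor is not shown in this diff), it feeds an invented path list straight into `count_file_occurrences`:

```python
from autocoder.rag.searchable import SearchableResults

searcher = SearchableResults()
occurrences = searcher.count_file_occurrences(
    ["a.md", "b.md", "a.md", "c.md", "a.md", "b.md"])  # invented paths
for occ in occurrences:
    print(occ.file_path, occ.count)
# a.md 3
# b.md 2
# c.md 1
```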
autocoder/version.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "0.1.288"
+__version__ = "0.1.289"
```