auto-coder 0.1.345__py3-none-any.whl → 0.1.347__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of auto-coder might be problematic.

@@ -48,7 +48,7 @@ class AutoCoderRAGDocListener(BaseCacheManager):
         r"^test.*$",
     ]

-    def __init__(self, path: str, ignore_spec, required_exts: List) -> None:
+    def __init__(self, path: str, ignore_spec, required_exts: List, args=None, llm=None) -> None:
         """
         Initialize the file-monitoring cache manager.

@@ -89,6 +89,8 @@ class AutoCoderRAGDocListener(BaseCacheManager):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.stop_event = threading.Event()

         # connect list
@@ -30,6 +30,7 @@ from typing import Union
 from byzerllm import SimpleByzerLLM, ByzerLLM
 from autocoder.rag.cache.cache_result_merge import CacheResultMerger, MergeStrategy
 import time
+from .failed_files_utils import save_failed_files, load_failed_files

 if platform.system() != "Windows":
     import fcntl
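The `failed_files_utils` module is new in this release, but its source is not part of this diff. From the call sites visible here (`load_failed_files(path)` returning a set, and `save_failed_files(path, failed_files)`), a minimal compatible implementation might look like the sketch below; the on-disk JSON layout is an assumption, not the package's actual format.

```python
# Hypothetical sketch of failed_files_utils, inferred from the call sites in
# this diff. The JSON layout is an assumption.
import json
import os
from typing import Set


def load_failed_files(path: str) -> Set[str]:
    """Load the set of previously failed file paths; a missing file means an empty set."""
    if not os.path.exists(path):
        return set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except (json.JSONDecodeError, OSError):
        # A corrupt record should not break startup; start fresh instead.
        return set()


def save_failed_files(path: str, failed_files: Set[str]) -> None:
    """Persist the failed-file set as a JSON list (sets are not JSON-serializable)."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(sorted(failed_files), f, ensure_ascii=False, indent=2)
```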
@@ -70,71 +71,17 @@ class LocalByzerStorageCache(BaseCacheManager):
         emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
         host: str = "127.0.0.1",
         port: int = 33333,
+        args=None,
+        llm=None,
     ):
         """
         Initialize the Byzer Storage based RAG cache manager.
-
-        Parameters:
-            path: root directory of the codebase to index
-            ignore_spec: rules for which files/directories should be ignored
-            required_exts: list of file extensions to process
-            extra_params: extra configuration, including vector-index settings
-            emb_llm: ByzerLLM instance used to generate text embeddings
-            host: host address of the Byzer Storage service
-            port: port of the Byzer Storage service
-
-        Cache structure (self.cache):
-            self.cache is a dict keyed by file path with CacheItem values:
-            {
-                "file_path1": CacheItem(
-                    file_path: str,       # absolute path of the file
-                    relative_path: str,   # path relative to the project root
-                    content: List[Dict],  # structured file content; each element is a serialized SourceCode object
-                    modify_time: float,   # timestamp of the file's last modification
-                    md5: str              # MD5 hash of the file content, used to detect changes
-                ),
-                "file_path2": CacheItem(...),
-                ...
-            }
-
-        The cache has two storage layers:
-        1. Local file cache: stored in .cache/byzer_storage_speedup.jsonl under the project root
-           - used to track file changes and enable fast loading
-           - JSONL format: each line is the JSON representation of one CacheItem
-
-        2. Byzer Storage vector database:
-           - stores content chunks and vector embeddings for each file
-           - each file is split into text chunks of size chunk_size
-           - each chunk gets a vector embedding for semantic search
-           - stored fields: file path, content chunk, raw content, embedding, modify time
-
-        Source-code processing flow:
-        Two key functions are used during cache updates:
-
-        1. process_file_in_multi_process: processes files in a multi-process pool
-           - argument: file_info (a file-info tuple)
-           - returns: List[SourceCode] or None
-           - used to process many files in parallel during the initial cache build
-
-        2. process_file_local: processes a single file in the current process
-           - argument: file_path (the file path)
-           - returns: List[SourceCode] or None
-           - used to process a single file when an update is detected
-
-        After a file is processed:
-        1. the in-memory cache (self.cache) is updated
-        2. the cache is persisted to the local file
-        3. the content is chunked and written to the Byzer Storage vector database
-
-        Update mechanism:
-        - file changes are processed asynchronously in a dedicated thread
-        - MD5 hashes are used to detect whether a file has changed
-        - file add, update, and delete events are supported
-        - the vector database enables semantic retrieval with similarity search
         """
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.rag_build_name = extra_params.rag_build_name
         self.storage = LocalByzerStorage("byzerai_store",
                                          "rag_test", self.rag_build_name, host=host, port=port, emb_llm=emb_llm)
@@ -153,16 +100,20 @@ class LocalByzerStorageCache(BaseCacheManager):
             self.cache_dir, "byzer_storage_speedup.jsonl")
         self.cache: Dict[str, CacheItem] = {}

+        # create the cache directory
+        if not os.path.exists(self.cache_dir):
+            os.makedirs(self.cache_dir)
+
+        # failed files support
+        self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
         self.thread = threading.Thread(target=self.process_queue)
         self.thread.daemon = True
         self.thread.start()

-        # create the cache directory
-        if not os.path.exists(self.cache_dir):
-            os.makedirs(self.cache_dir)
-
         # load the cache
         self.cache = self._load_cache()

@@ -485,6 +436,10 @@ class LocalByzerStorageCache(BaseCacheManager):
                 for item in file_list.file_paths:
                     logger.info(f"[QUEUE PROCESSING] Processing file deletion: {item}")
                     del self.cache[item]
+                    # remove from failed files if present
+                    if item in self.failed_files:
+                        self.failed_files.remove(item)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                     # Create a temporary FileInfo object
                     file_info = FileInfo(
                         file_path=item, relative_path="", modify_time=0, file_md5="")
@@ -495,17 +450,30 @@ class LocalByzerStorageCache(BaseCacheManager):
                 for file_info in file_list.file_infos:
                     logger.info(
                         f"[QUEUE PROCESSING] Processing file update: {file_info.file_path}")
-                    # Process file and create CacheItem
-                    content = process_file_local(
-                        self.fileinfo_to_tuple(file_info))
-                    self.cache[file_info.file_path] = CacheItem(
-                        file_path=file_info.file_path,
-                        relative_path=file_info.relative_path,
-                        content=[c.model_dump() for c in content],
-                        modify_time=file_info.modify_time,
-                        md5=file_info.file_md5,
-                    )
-                    self.update_storage(file_info, is_delete=False)
+                    try:
+                        content = process_file_local(
+                            self.fileinfo_to_tuple(file_info))
+                        if content:
+                            self.cache[file_info.file_path] = CacheItem(
+                                file_path=file_info.file_path,
+                                relative_path=file_info.relative_path,
+                                content=[c.model_dump() for c in content],
+                                modify_time=file_info.modify_time,
+                                md5=file_info.file_md5,
+                            )
+                            self.update_storage(file_info, is_delete=False)
+                            # remove from failed files if present
+                            if file_info.file_path in self.failed_files:
+                                self.failed_files.remove(file_info.file_path)
+                                save_failed_files(self.failed_files_path, self.failed_files)
+                        else:
+                            logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                            self.failed_files.add(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
+                    except Exception as e:
+                        logger.error(f"Error in process_queue: {e}")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                 self.write_cache()

             elapsed = time.time() - start_time
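The new `try`/`except` block gives every updated file a simple lifecycle: a parse exception or an empty parse result adds the path to `failed_files` (persisted immediately), and a later successful parse clears the record. Distilled into a standalone sketch; `parse` stands in for `process_file_local`, and `save_failed_files` is the hypothetical helper sketched earlier.

```python
# Standalone distillation of the failure-tracking flow added in this release.
# `parse` stands in for process_file_local; save_failed_files is the
# hypothetical helper sketched above.
def handle_update(file_path, failed_files, failed_files_path, parse):
    try:
        content = parse(file_path)
        if content:
            # success: clear any stale failure record before returning
            if file_path in failed_files:
                failed_files.discard(file_path)
                save_failed_files(failed_files_path, failed_files)
            return content
        # empty output is treated the same as a parse failure
        failed_files.add(file_path)
    except Exception:
        failed_files.add(file_path)
    save_failed_files(failed_files_path, failed_files)
    return None
```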
@@ -519,6 +487,10 @@ class LocalByzerStorageCache(BaseCacheManager):
         current_files = set()
         for file_info in self.get_all_files():
             current_files.add(file_info.file_path)
+            # skip failed files
+            if file_info.file_path in self.failed_files:
+                logger.info(f"文件 {file_info.file_path} 之前解析失败,跳过此次更新")
+                continue
             if (
                 file_info.file_path not in self.cache
                 or self.cache[file_info.file_path].md5 != file_info.file_md5
@@ -28,6 +28,7 @@ from autocoder.rag.cache.base_cache import (
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 from byzerllm import SimpleByzerLLM, ByzerLLM
+from .failed_files_utils import save_failed_files, load_failed_files

 if platform.system() != "Windows":
     import fcntl
@@ -300,12 +301,16 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         ignore_spec,
         required_exts,
         extra_params: Optional[AutoCoderArgs] = None,
-        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        args=None,
+        llm=None
     ):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
         self.extra_params = extra_params
+        self.args = args
+        self.llm = llm

         self.storage = LocalDuckdbStorage(
             llm=emb_llm,
@@ -325,6 +330,11 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         if not os.path.exists(self.cache_dir):
             os.makedirs(self.cache_dir)

+        # failed files support
+        from .failed_files_utils import load_failed_files
+        self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
         self.thread = threading.Thread(target=self.process_queue)
@@ -569,6 +579,10 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 for item in file_list.file_paths:
                     logger.info(f"{item} is detected to be removed")
                     del self.cache[item]
+                    # remove from failed files if present
+                    if item in self.failed_files:
+                        self.failed_files.remove(item)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                     # create a temporary FileInfo object
                     file_info = FileInfo(
                         file_path=item, relative_path="", modify_time=0, file_md5="")
@@ -578,18 +592,30 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 for file_info in file_list.file_infos:
                     logger.info(
                         f"{file_info.file_path} is detected to be updated")
-                    # process the file and create a CacheItem
-                    # content = process_file_local(
-                    #     self.fileinfo_to_tuple(file_info))
-                    content = process_file_local(file_info.file_path)
-                    self.cache[file_info.file_path] = CacheItem(
-                        file_path=file_info.file_path,
-                        relative_path=file_info.relative_path,
-                        content=[c.model_dump() for c in content],
-                        modify_time=file_info.modify_time,
-                        md5=file_info.file_md5,
-                    )
-                    self.update_storage(file_info, is_delete=False)
+                    try:
+                        content = process_file_local(file_info.file_path)
+                        if content:
+                            self.cache[file_info.file_path] = CacheItem(
+                                file_path=file_info.file_path,
+                                relative_path=file_info.relative_path,
+                                content=[c.model_dump() for c in content],
+                                modify_time=file_info.modify_time,
+                                md5=file_info.file_md5,
+                            )
+                            self.update_storage(file_info, is_delete=False)
+                            # remove from failed files if present
+                            if file_info.file_path in self.failed_files:
+                                self.failed_files.remove(file_info.file_path)
+                                save_failed_files(self.failed_files_path, self.failed_files)
+                        else:
+                            logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                            self.failed_files.add(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
+                    except Exception as e:
+                        logger.error(f"Error in process_queue: {e}")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
+
                 self.write_cache()

     def trigger_update(self):
@@ -598,6 +624,10 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         current_files = set()
         for file_info in self.get_all_files():
             current_files.add(file_info.file_path)
+            # skip failed files
+            if file_info.file_path in self.failed_files:
+                logger.info(f"文件 {file_info.file_path} 之前解析失败,跳过此次更新")
+                continue
             if (
                 file_info.file_path not in self.cache
                 or self.cache[file_info.file_path].md5 != file_info.file_md5
@@ -19,6 +19,7 @@ from loguru import logger
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 import hashlib
+from .failed_files_utils import load_failed_files, save_failed_files


 default_ignore_dirs = [
@@ -45,7 +46,7 @@ def generate_content_md5(content: Union[str, bytes]) -> str:


 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5):
+    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args=None, llm=None):
         """
         Initialize the async update queue that manages the code-file cache.

@@ -91,24 +92,31 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # starts as an empty dict; populated later via read_cache()
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
-
+
+        # set of file paths that previously failed to parse
+        self.failed_files_path = os.path.join(self.path, ".cache", "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         # start the queue-processing thread
         self.queue_thread = threading.Thread(target=self._process_queue)
         self.queue_thread.daemon = True
         self.queue_thread.start()
-
+
         # start the periodic-update thread
         self.update_thread = threading.Thread(target=self._periodic_update)
         self.update_thread.daemon = True
         self.update_thread.start()
-
+
         self.cache = self.read_cache()

+
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -183,13 +191,18 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         files_to_process = []
         current_files = set()
         for file_info in self.get_all_files():
-            file_path, _, _, file_md5 = file_info
+            file_path, relative_path, modify_time, file_md5 = file_info
             current_files.add(file_path)
+            # if the file failed to parse before, skip it in this incremental update
+            if file_path in self.failed_files:
+                logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
+                continue
+            # change detection
             if (
                 file_path not in self.cache
-                or self.cache[file_path].get("md5","") != file_md5
+                or self.cache[file_path].get("md5", "") != file_md5
             ):
-                files_to_process.append(file_info)
+                files_to_process.append((file_path, relative_path, modify_time, file_md5))

         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -213,19 +226,34 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         if isinstance(file_list, DeleteEvent):
             for item in file_list.file_paths:
                 logger.info(f"{item} is detected to be removed")
-                del self.cache[item]
+                if item in self.cache:
+                    del self.cache[item]
+                # on delete, also remove from the failed list (the file may have been fixed)
+                if item in self.failed_files:
+                    self.failed_files.remove(item)
+                    save_failed_files(self.failed_files_path, self.failed_files)
         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
                 logger.info(f"{file_info.file_path} is detected to be updated")
                 try:
                     result = process_file_local(file_info.file_path)
-                    if result:  # only update the cache when result is non-empty
+                    if result:
+                        # parsed successfully with non-empty output
                         self.update_cache(self.fileinfo_to_tuple(file_info), result)
+                        # if it failed before and succeeded this time, clear the failure record
+                        if file_info.file_path in self.failed_files:
+                            self.failed_files.remove(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
                     else:
-                        logger.warning(f"Empty result for file: {file_info.file_path}, skipping cache update")
+                        # an empty result also counts as a parse failure; add it to the failed list
+                        logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                 except Exception as e:
-                    logger.error(
-                        f"SimpleCache Error in process_queue: {e}")
+                    logger.error(f"SimpleCache Error in process_queue: {e}")
+                    # on parse failure, add to the failed list
+                    self.failed_files.add(file_info.file_path)
+                    save_failed_files(self.failed_files_path, self.failed_files)

         self.write_cache()

@@ -2,7 +2,6 @@ import threading
 from typing import Dict, Generator, List, Tuple, Any, Optional,Union

 from byzerllm import ByzerLLM, SimpleByzerLLM
-
 from loguru import logger
 from autocoder.common import SourceCode
 from uuid import uuid4
@@ -37,6 +36,8 @@ class LocalDocumentRetriever(BaseDocumentRetriever):

     def __init__(
         self,
+        args: AutoCoderArgs,
+        llm: Union[ByzerLLM,SimpleByzerLLM],
         path: str,
         ignore_spec,
         required_exts: list,
@@ -45,9 +46,12 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         single_file_token_limit: int = 60000,
         disable_auto_window: bool = False,
         enable_hybrid_index: bool = False,
-        extra_params: Optional[AutoCoderArgs] = None,
-        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        extra_params: Optional['AutoCoderArgs'] = None,
+        emb_llm: Union['ByzerLLM', 'SimpleByzerLLM'] = None,
     ) -> None:
+        self.args = args
+        self.llm = llm
+
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
@@ -65,27 +69,32 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         if self.enable_hybrid_index:
             if self.on_ray:
                 self.cacher = ByzerStorageCache(
-                    path, ignore_spec, required_exts, extra_params
+                    path, ignore_spec, required_exts, extra_params,
+                    args=self.args, llm=self.llm
                 )
             else:
                 if extra_params.rag_storage_type == "duckdb":
                     self.cacher = LocalDuckDBStorageCache(
                         path, ignore_spec, required_exts, extra_params,
-                        emb_llm=emb_llm
+                        emb_llm=emb_llm,
+                        args=self.args, llm=self.llm
                     )
                 elif extra_params.rag_storage_type in ["byzer-storage", "byzer_storage"]:
                     self.cacher = LocalByzerStorageCache(
                         path, ignore_spec, required_exts, extra_params,
-                        emb_llm=emb_llm
+                        emb_llm=emb_llm,
+                        args=self.args, llm=self.llm
                     )
         else:
             if self.monitor_mode:
                 self.cacher = AutoCoderRAGDocListener(
-                    path, ignore_spec, required_exts
+                    path, ignore_spec, required_exts,
+                    args=self.args, llm=self.llm
                 )
             else:
                 self.cacher = AutoCoderRAGAsyncUpdateQueue(
-                    path, ignore_spec, required_exts
+                    path, ignore_spec, required_exts,
+                    args=self.args, llm=self.llm
                )

         logger.info(f"DocumentRetriever initialized with:")
@@ -183,6 +183,8 @@ class LongContextRAG:
                 "emb_llm is required for local byzer storage cache")

         self.document_retriever = retriever_class(
+            self.args,
+            self.llm,
             self.path,
             self.ignore_spec,
             self.required_exts,
@@ -841,8 +843,7 @@ class LongContextRAG:
                 self._print_rag_stats(rag_stat)
             else:

-                qa_strategy = get_qa_strategy(
-                    self.args.rag_qa_conversation_strategy)
+                qa_strategy = get_qa_strategy(self.args)
                 new_conversations = qa_strategy.create_conversation(
                     documents=[doc.source_code for doc in relevant_docs],
                     conversations=conversations, local_image_host=self.args.local_image_host
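The call site now hands `get_qa_strategy` the whole `args` object instead of just `args.rag_qa_conversation_strategy`, which lets the factory consult other settings. The factory itself is not part of this diff; the sketch below shows one shape consistent with the new call site, and all class and registry names in it are placeholders.

```python
# Hypothetical factory shape consistent with the call-site change above: it
# receives the full args object and reads the strategy name itself. The
# strategy class and registry contents are placeholders.
class SingleRoundStrategy:
    def create_conversation(self, documents, conversations, local_image_host):
        # Placeholder behavior: prepend document context to the conversation.
        context = "\n".join(documents)
        return [{"role": "user", "content": context}] + conversations


def get_qa_strategy(args):
    strategy_name = getattr(args, "rag_qa_conversation_strategy", "single_round")
    registry = {"single_round": SingleRoundStrategy}
    return registry.get(strategy_name, SingleRoundStrategy)()
```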