auto-coder 0.1.348__py3-none-any.whl → 0.1.349__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of auto-coder might be problematic.
Files changed (35)
  1. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/RECORD +35 -26
  3. autocoder/auto_coder_runner.py +14 -10
  4. autocoder/chat_auto_coder_lang.py +5 -3
  5. autocoder/common/model_speed_tester.py +392 -0
  6. autocoder/common/printer.py +7 -8
  7. autocoder/common/run_cmd.py +247 -0
  8. autocoder/common/test_run_cmd.py +110 -0
  9. autocoder/common/v2/agent/agentic_edit.py +61 -11
  10. autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
  11. autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
  12. autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
  13. autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
  14. autocoder/helper/rag_doc_creator.py +141 -0
  15. autocoder/ignorefiles/__init__.py +4 -0
  16. autocoder/ignorefiles/ignore_file_utils.py +63 -0
  17. autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
  18. autocoder/models.py +49 -9
  19. autocoder/rag/cache/byzer_storage_cache.py +10 -4
  20. autocoder/rag/cache/file_monitor_cache.py +27 -24
  21. autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
  22. autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
  23. autocoder/rag/cache/simple_cache.py +56 -37
  24. autocoder/rag/loaders/filter_utils.py +106 -0
  25. autocoder/rag/loaders/image_loader.py +45 -23
  26. autocoder/rag/loaders/pdf_loader.py +3 -3
  27. autocoder/rag/loaders/test_image_loader.py +209 -0
  28. autocoder/rag/qa_conversation_strategy.py +3 -5
  29. autocoder/rag/utils.py +20 -9
  30. autocoder/utils/_markitdown.py +35 -0
  31. autocoder/version.py +1 -1
  32. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/LICENSE +0 -0
  33. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/WHEEL +0 -0
  34. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/entry_points.txt +0 -0
  35. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/top_level.txt +0 -0

autocoder/rag/cache/simple_cache.py

@@ -1,4 +1,5 @@
 from multiprocessing import Pool
+import functools
 from autocoder.common import SourceCode
 from autocoder.rag.cache.base_cache import (
     BaseCacheManager, DeleteEvent, AddOrUpdateEvent,
@@ -20,6 +21,9 @@ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 import hashlib
 from .failed_files_utils import load_failed_files, save_failed_files
+from autocoder.common import AutoCoderArgs
+from byzerllm import SimpleByzerLLM, ByzerLLM
+from autocoder.utils.llms import get_llm_names


 default_ignore_dirs = [
@@ -46,16 +50,16 @@ def generate_content_md5(content: Union[str, bytes]) -> str:


 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args=None, llm=None):
+    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args: Optional[AutoCoderArgs] = None, llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None):
         """
         Initialize the async update queue used to manage the cache of code files.
-
+
         Args:
             path: root directory of the code base to index
             ignore_spec: rules describing which files/directories should be ignored
             required_exts: list of file extensions that need to be processed
             update_interval: interval in seconds between automatic update checks, default 5
-
+
         Cache structure (self.cache):
         self.cache is a dict with the following structure:
         {
@@ -69,23 +73,23 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             "file_path2": { ... },
             ...
         }
-
+
         The cache is persisted under the project root in .cache/cache.jsonl, stored in JSONL format.
         It is loaded from disk on every startup and updated asynchronously when files change.
-
+
         Source-code processing functions:
         Two key functions are used while the cache is being updated:
-
+
         1. process_file_in_multi_process: processes files in a multi-process environment
            - Args: file_info (tuple of file information)
            - Returns: List[SourceCode] or None
            - Purpose: process many files in parallel during the initial load
-
+
         2. process_file_local: processes a single file in the current process
            - Args: file_path (path of the file)
            - Returns: List[SourceCode] or None
            - Purpose: process a single file when an update to it is detected
-
+
         The lists of SourceCode objects returned by these functions are serialized to dicts via model_dump()
         and stored in the cache's "content" field. If the result is empty, the cache update is skipped.
         """
@@ -94,6 +98,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.required_exts = required_exts
         self.args = args
         self.llm = llm
+        self.product_mode = args.product_mode or "lite"
         self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # initialized as an empty dict, filled later via read_cache()
@@ -101,7 +106,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.stop_event = threading.Event()

         # set of paths of files that failed to parse
-        self.failed_files_path = os.path.join(self.path, ".cache", "failed_files.json")
+        self.failed_files_path = os.path.join(
+            self.path, ".cache", "failed_files.json")
         self.failed_files = load_failed_files(self.failed_files_path)

         # start the thread that processes the queue
@@ -116,7 +122,6 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):

         self.cache = self.read_cache()

-
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -124,12 +129,13 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             except Exception as e:
                 logger.error(f"Error in process_queue: {e}")
             time.sleep(1)  # avoid checking too frequently
-
+
     def _periodic_update(self):
         """Periodically trigger the file update check."""
-        while not self.stop_event.is_set():
+        while not self.stop_event.is_set():
             try:
-                logger.debug(f"Periodic update triggered (every {self.update_interval}s)")
+                logger.debug(
+                    f"Periodic update triggered (every {self.update_interval}s)")
                 # if the cache has never been initialized, do not trigger incremental updates
                 if not self.cache:
                     time.sleep(self.update_interval)
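
For context, _periodic_update and _process_queue both follow the usual stop_event-driven worker loop. A minimal standalone sketch of that pattern, with invented names and assuming nothing about the package beyond what the hunk above shows:

import threading

class PeriodicWorker:
    """Sketch of a stop_event-driven periodic worker: wake up every `interval` seconds until stop()."""

    def __init__(self, interval: float = 5.0):
        self.interval = interval
        self.stop_event = threading.Event()
        self.thread = threading.Thread(target=self._run, daemon=True)

    def _run(self):
        while not self.stop_event.is_set():
            print("periodic update triggered")   # placeholder for the real update check
            self.stop_event.wait(self.interval)  # returns early once stop() is called

    def start(self):
        self.thread.start()

    def stop(self):
        self.stop_event.set()
        self.thread.join()
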
@@ -145,7 +151,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.update_thread.join()

     def fileinfo_to_tuple(self, file_info: FileInfo) -> Tuple[str, str, float, str]:
-        return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
+        return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)

     def __del__(self):
         self.stop()
@@ -159,7 +165,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             file_path, _, modify_time, file_md5 = file_info
             if (
                 file_path not in self.cache
-                or self.cache[file_path].get("md5","") != file_md5
+                or self.cache[file_path].get("md5", "") != file_md5
             ):
                 files_to_process.append(file_info)
         if not files_to_process:
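
The change detection here compares the cached md5 against the file's current file_md5. The body of generate_content_md5 (referenced in an earlier hunk header) is not part of this diff; a plausible sketch of such a content hash, assuming the usual hashlib approach:

import hashlib
from typing import Union

def content_md5(content: Union[str, bytes]) -> str:
    # Hash the raw bytes; text is encoded as UTF-8 first.
    if isinstance(content, str):
        content = content.encode("utf-8")
    return hashlib.md5(content).hexdigest()
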
@@ -169,20 +175,23 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         #     [process_file.remote(file_info) for file_info in files_to_process]
         # )
         from autocoder.rag.token_counter import initialize_tokenizer
-
+        llm_name = get_llm_names(self.llm)[0] if self.llm else None
         with Pool(
             processes=os.cpu_count(),
             initializer=initialize_tokenizer,
             initargs=(VariableHolder.TOKENIZER_PATH,),
         ) as pool:
-            results = pool.map(
-                process_file_in_multi_process, files_to_process)
+
+            worker_func = functools.partial(
+                process_file_in_multi_process, llm=llm_name, product_mode=self.product_mode)
+            results = pool.map(worker_func, files_to_process)

         for file_info, result in zip(files_to_process, results):
             if result:  # only update the cache when the result is not empty
                 self.update_cache(file_info, result)
             else:
-                logger.warning(f"Empty result for file: {file_info[0]}, skipping cache update")
+                logger.warning(
+                    f"Empty result for file: {file_info[0]}, skipping cache update")

         self.write_cache()
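
The hunk above replaces the bare pool.map callable with a functools.partial so that the extra llm and product_mode keyword arguments reach every worker process. A self-contained sketch of that pattern; the worker function and its arguments are invented for illustration:

import functools
from multiprocessing import Pool

def process_item(item, llm=None, product_mode="lite"):
    # pool.map supplies `item`; functools.partial pre-binds the keyword arguments.
    return f"{item}:{llm}:{product_mode}"

if __name__ == "__main__":
    worker = functools.partial(process_item, llm="model-a", product_mode="lite")
    with Pool(processes=2) as pool:
        results = pool.map(worker, ["a.py", "b.py"])
    print(results)  # ['a.py:model-a:lite', 'b.py:model-a:lite']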
 
@@ -195,14 +204,15 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             current_files.add(file_path)
             # if this file failed to parse before, skip it in this incremental update
             if file_path in self.failed_files:
-                logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
+                # logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
                 continue
             # change detection
             if (
                 file_path not in self.cache
                 or self.cache[file_path].get("md5", "") != file_md5
             ):
-                files_to_process.append((file_path, relative_path, modify_time, file_md5))
+                files_to_process.append(
+                    (file_path, relative_path, modify_time, file_md5))

         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -231,29 +241,38 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                     # also drop it from the failed list on delete (in case the file has been fixed)
                     if item in self.failed_files:
                         self.failed_files.remove(item)
-                        save_failed_files(self.failed_files_path, self.failed_files)
+                        save_failed_files(
+                            self.failed_files_path, self.failed_files)
             elif isinstance(file_list, AddOrUpdateEvent):
                 for file_info in file_list.file_infos:
-                    logger.info(f"{file_info.file_path} is detected to be updated")
+                    logger.info(
+                        f"{file_info.file_path} is detected to be updated")
                     try:
-                        result = process_file_local(file_info.file_path)
+                        result = process_file_local(
+                            file_info.file_path, llm=self.llm, product_mode=self.product_mode)
                         if result:
                             # parsed successfully and non-empty
-                            self.update_cache(self.fileinfo_to_tuple(file_info), result)
+                            self.update_cache(
+                                self.fileinfo_to_tuple(file_info), result)
                             # if it failed before and succeeded this time, remove the failure record
                             if file_info.file_path in self.failed_files:
                                 self.failed_files.remove(file_info.file_path)
-                                save_failed_files(self.failed_files_path, self.failed_files)
+                                save_failed_files(
+                                    self.failed_files_path, self.failed_files)
                         else:
                             # an empty result is also treated as a parse failure; add it to the failed list
-                            logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                            logger.warning(
+                                f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
                             self.failed_files.add(file_info.file_path)
-                            save_failed_files(self.failed_files_path, self.failed_files)
+                            save_failed_files(
+                                self.failed_files_path, self.failed_files)
                     except Exception as e:
-                        logger.error(f"SimpleCache Error in process_queue: {e}")
+                        logger.error(
+                            f"SimpleCache Error in process_queue: {e}")
                         # on parse failure, add the file to the failed list
                         self.failed_files.add(file_info.file_path)
-                        save_failed_files(self.failed_files_path, self.failed_files)
+                        save_failed_files(
+                            self.failed_files_path, self.failed_files)

         self.write_cache()

@@ -266,7 +285,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):

         cache = {}
         if os.path.exists(cache_file):
-            with open(cache_file, "r",encoding="utf-8") as f:
+            with open(cache_file, "r", encoding="utf-8") as f:
                 for line in f:
                     data = json.loads(line)
                     cache[data["file_path"]] = data
@@ -277,7 +296,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         cache_file = os.path.join(cache_dir, "cache.jsonl")

         if not fcntl:
-            with open(cache_file, "w",encoding="utf-8") as f:
+            with open(cache_file, "w", encoding="utf-8") as f:
                 for data in self.cache.values():
                     try:
                         json.dump(data, f, ensure_ascii=False)
@@ -287,12 +306,12 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                             f"Failed to write {data['file_path']} to .cache/cache.jsonl: {e}")
         else:
             lock_file = cache_file + ".lock"
-            with open(lock_file, "w",encoding="utf-8") as lockf:
+            with open(lock_file, "w", encoding="utf-8") as lockf:
                 try:
                     # acquire the file lock
                     fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
                     # write the cache file
-                    with open(cache_file, "w",encoding="utf-8") as f:
+                    with open(cache_file, "w", encoding="utf-8") as f:
                         for data in self.cache.values():
                             try:
                                 json.dump(data, f, ensure_ascii=False)
@@ -310,11 +329,11 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
     ):
         """
         Update a file's information in the cache.
-
+
         Args:
             file_info: tuple with the file information (file_path, relative_path, modify_time, file_md5)
             content: parsed file content, a list of SourceCode objects
-
+
         Notes:
             This method writes the file's latest content into the cache. A cache entry has the structure:
             {
@@ -324,7 +343,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 "modify_time": float,  # timestamp of the file's last modification
                 "md5": str             # MD5 hash of the file content, used to detect changes
            }
-
+
            It does not write to disk immediately; call write_cache() to persist the updated cache.
        """
        file_path, relative_path, modify_time, file_md5 = file_info
autocoder/rag/loaders/filter_utils.py (new file)

@@ -0,0 +1,106 @@
+
+import os
+import json
+from typing import Dict, Optional
+from loguru import logger
+
+class FilterRuleManager:
+    '''
+    {
+        "whitelist": [
+            "glob:*.png",
+            "regex:^/tmp/.*hidden.*"
+        ],
+        "blacklist": [
+            "glob:*/private/*",
+            "regex:.*/secret/.*\\.jpg$"
+        ]
+    }
+    '''
+    _cache_rules: Optional[Dict] = None
+    _cache_mtime: Optional[float] = None
+
+    def __init__(self, llm, source_dir: str):
+        """
+        Initialize the filter rule manager.
+
+        Args:
+            llm: LLM object; currently unused, reserved for later use
+            source_dir: path of the project root directory
+        """
+        self.llm = llm
+        self.source_dir = source_dir
+        self.filter_rules_path = os.path.join(self.source_dir, ".cache", "filterrules")
+
+    def load_filter_rules(self) -> Dict:
+        try:
+            current_mtime = os.path.getmtime(self.filter_rules_path) if os.path.exists(self.filter_rules_path) else None
+        except Exception:
+            current_mtime = None
+
+        need_reload = False
+
+        # reload when the cache is empty or the file has been updated
+        if FilterRuleManager._cache_rules is None:
+            need_reload = True
+        elif current_mtime is not None and FilterRuleManager._cache_mtime != current_mtime:
+            need_reload = True
+
+        if need_reload:
+            FilterRuleManager._cache_rules = {"whitelist": [], "blacklist": []}
+            try:
+                if os.path.exists(self.filter_rules_path):
+                    with open(self.filter_rules_path, "r", encoding="utf-8") as f:
+                        FilterRuleManager._cache_rules = json.load(f)
+                    FilterRuleManager._cache_mtime = current_mtime
+            except Exception as e:
+                logger.warning(f"Failed to load filterrules: {e}")
+
+        return FilterRuleManager._cache_rules or {"whitelist": [], "blacklist": []}
+
+    def should_parse_image(self, file_path: str) -> bool:
+        """
+        Decide whether the image in the given file should be parsed.
+
+        Supported rule formats:
+        - glob patterns, e.g. "glob:*.png" or "*.png"
+        - regular expressions, e.g. "regex:^/tmp/.*hidden.*"
+
+        Returns:
+            True if the file should be parsed
+            False otherwise
+        """
+        import fnmatch
+        import re
+
+        rules = self.load_filter_rules()
+        whitelist = rules.get("whitelist", [])
+        blacklist = rules.get("blacklist", [])
+
+        def match_pattern(pattern: str, path: str) -> bool:
+            if pattern.startswith("glob:"):
+                pat = pattern[len("glob:"):]
+                return fnmatch.fnmatch(path, pat)
+            elif pattern.startswith("regex:"):
+                pat = pattern[len("regex:"):]
+                try:
+                    return re.search(pat, path) is not None
+                except re.error:
+                    logger.warning(f"Invalid regex pattern: {pat}")
+                    return False
+            else:
+                # treat patterns without a prefix as globs
+                return fnmatch.fnmatch(path, pattern)
+
+        # the blacklist is matched first
+        for pattern in blacklist:
+            if match_pattern(pattern, file_path):
+                return False
+
+        # then the whitelist
+        for pattern in whitelist:
+            if match_pattern(pattern, file_path):
+                return True
+
+        # by default, do not parse
+        return False
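
To make the new module concrete, here is a hedged usage sketch: it writes a sample .cache/filterrules file in the format shown in the class docstring and queries should_parse_image. The project path and rules are invented; the import path follows the file list above.

import json
import os

from autocoder.rag.loaders.filter_utils import FilterRuleManager

project_root = "/tmp/demo_project"
os.makedirs(os.path.join(project_root, ".cache"), exist_ok=True)

# Blacklist is checked first, then whitelist; anything unmatched is not parsed.
rules = {
    "whitelist": ["glob:*.png"],
    "blacklist": ["glob:*/private/*"],
}
with open(os.path.join(project_root, ".cache", "filterrules"), "w", encoding="utf-8") as f:
    json.dump(rules, f)

manager = FilterRuleManager(llm=None, source_dir=project_root)
print(manager.should_parse_image("/docs/diagram.png"))    # True: whitelist glob matches
print(manager.should_parse_image("/data/private/x.png"))  # False: blacklist wins
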
autocoder/rag/loaders/image_loader.py

@@ -18,6 +18,7 @@ from byzerllm.utils.client import code_utils
 from autocoder.utils.llms import get_single_llm
 from loguru import logger
 from typing import List, Tuple, Optional
+from autocoder.common.text import TextSimilarity
 from pydantic import BaseModel


@@ -280,6 +281,20 @@ class ImageLoader:
         except Exception:
             traceback.print_exc()
             return ""
+
+    @staticmethod
+    def extract_replace_in_file_tools(response)->List[ReplaceInFileTool]:
+        tools = []
+        # Pattern to match replace_in_file tool blocks
+        pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
+        matches = re.finditer(pattern, response, re.DOTALL)
+
+        for match in matches:
+            path = match.group(1).strip()
+            diff = match.group(2).strip()
+            tools.append(ReplaceInFileTool(path=path, diff=diff))
+
+        return tools

     @staticmethod
     def format_table_in_content(content: str, llm=None) -> str:
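
The new extract_replace_in_file_tools static method pulls replace_in_file blocks out of an LLM response. An illustrative call with an invented response string; the SEARCH/REPLACE markers inside the diff payload are an assumption about what parse_diff expects and are not shown in this diff:

from autocoder.rag.loaders.image_loader import ImageLoader

response = """<replace_in_file>
<path>content</path>
<diff>
<<<<<<< SEARCH
| a | b |
=======
| a | b |
|---|---|
>>>>>>> REPLACE
</diff>
</replace_in_file>"""

tools = ImageLoader.extract_replace_in_file_tools(response)
print(tools[0].path)  # "content"
print(tools[0].diff)  # the block between the <diff> tags, stripped
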
@@ -406,35 +421,42 @@ class ImageLoader:
         '''

         # Run the prompt with the provided content
-        tool_response = _format_table.with_llm(llm).run(content)
-
-        # Parse the tool response to extract replace_in_file tool calls
-        def extract_replace_in_file_tools(response):
-            tools = []
-            # Pattern to match replace_in_file tool blocks
-            pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
-            matches = re.finditer(pattern, response, re.DOTALL)
-
-            for match in matches:
-                path = match.group(1).strip()
-                diff = match.group(2).strip()
-                tools.append(ReplaceInFileTool(path=path, diff=diff))
-
-            return tools
+        tool_response = _format_table.with_llm(llm).run(content)

         # Extract tools from the response
-        tools = extract_replace_in_file_tools(tool_response)
+        tools = ImageLoader.extract_replace_in_file_tools(tool_response)

         # Process each tool to apply the replacements
         formatted_content = content
         for tool in tools:
-            # For in-memory content replacement (not actual file modification)
-            if tool.path == "content":
-                # Parse the diff to get search/replace blocks
-                blocks = ImageLoader.parse_diff(tool.diff)
-                # Apply each replacement to the content
-                for search_block, replace_block in blocks:
-                    formatted_content = formatted_content.replace(search_block, replace_block)
+            # For in-memory content replacement (not actual file modification)
+            # Parse the diff to get search/replace blocks
+            blocks = ImageLoader.parse_diff(tool.diff)
+            # Apply each replacement to the content
+            for search_block, replace_block in blocks:
+                # Check if the search_block exists in the content
+                if search_block in formatted_content:
+                    # Replace and verify the replacement occurred
+                    new_content = formatted_content.replace(search_block, replace_block)
+                    if new_content == formatted_content:
+                        logger.warning(f"Replacement failed despite search block found. Search block length: {len(search_block)}")
+                        print(f"\n=== FAILED SEARCH BLOCK ===\n{search_block}\n=== END FAILED SEARCH BLOCK ===\n")
+                    formatted_content = new_content
+                else:
+                    # Fallback to similarity matching when exact match fails
+                    logger.warning(f"Search block not found in content. Trying similarity matching. Search block length: {len(search_block)}")
+                    print(f"\n=== NOT FOUND SEARCH BLOCK (trying similarity) ===\n{search_block}\n=== END NOT FOUND SEARCH BLOCK ===\n")
+
+                    # Use TextSimilarity to find the best matching window
+                    similarity, best_window = TextSimilarity(search_block, formatted_content).get_best_matching_window()
+                    similarity_threshold = 0.8  # Can be adjusted based on needs
+
+                    if similarity > similarity_threshold:
+                        logger.info(f"Found similar block with similarity {similarity:.2f}")
+                        print(f"\n=== SIMILAR BLOCK FOUND (similarity: {similarity:.2f}) ===\n{best_window}\n=== END SIMILAR BLOCK ===\n")
+                        formatted_content = formatted_content.replace(best_window, replace_block, 1)
+                    else:
+                        logger.warning(f"No similar block found. Best similarity: {similarity:.2f}")

         return formatted_content
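
The replacement loop above tries an exact search-block match first and only then falls back to TextSimilarity.get_best_matching_window with a 0.8 threshold. TextSimilarity itself is not part of this diff; a rough standalone sketch of the same exact-match-then-fuzzy-fallback idea, using difflib instead:

import difflib

def replace_with_fallback(text: str, search: str, replacement: str, threshold: float = 0.8) -> str:
    # Exact match first, mirroring the loop above.
    if search in text:
        return text.replace(search, replacement)
    # Otherwise scan same-sized line windows for the most similar block.
    lines = text.splitlines(keepends=True)
    window = max(len(search.splitlines()), 1)
    best_ratio, best_span = 0.0, None
    for i in range(max(len(lines) - window + 1, 1)):
        candidate = "".join(lines[i:i + window])
        ratio = difflib.SequenceMatcher(None, search, candidate).ratio()
        if ratio > best_ratio:
            best_ratio, best_span = ratio, candidate
    if best_span is not None and best_ratio > threshold:
        return text.replace(best_span, replacement, 1)
    return text  # no sufficiently similar block; leave the text unchanged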
 
autocoder/rag/loaders/pdf_loader.py

@@ -14,9 +14,9 @@ def extract_text_from_pdf_old(file_path):
         text += page.extract_text()
     return text

-def extract_text_from_pdf(file_path):
-    try:
-        md_converter = MarkItDown()
+def extract_text_from_pdf(file_path, llm=None, product_mode="lite"):
+    try:
+        md_converter = MarkItDown(llm=llm, product_mode=product_mode)
         result = md_converter.convert(file_path)
         return result.text_content
     except (BaseException, Exception) as e:
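
For completeness, the updated extract_text_from_pdf keeps its old single-argument behaviour when no LLM is supplied; a minimal hedged usage sketch with a placeholder path:

from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf

# llm and product_mode default to None / "lite", so existing callers keep working.
text = extract_text_from_pdf("docs/example.pdf")
print(text[:200])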