PyPI - auto-coder - Versions diffs - 0.1.348__py3-none-any.whl → 0.1.349__py3-none-any.whl - Mend

auto-coder 0.1.348__py3-none-any.whl → 0.1.349__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of auto-coder might be problematic. Click here for more details.

Files changed (35) hide show

{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/METADATA +1 -1
{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/RECORD +35 -26
autocoder/auto_coder_runner.py +14 -10
autocoder/chat_auto_coder_lang.py +5 -3
autocoder/common/model_speed_tester.py +392 -0
autocoder/common/printer.py +7 -8
autocoder/common/run_cmd.py +247 -0
autocoder/common/test_run_cmd.py +110 -0
autocoder/common/v2/agent/agentic_edit.py +61 -11
autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
autocoder/helper/rag_doc_creator.py +141 -0
autocoder/ignorefiles/__init__.py +4 -0
autocoder/ignorefiles/ignore_file_utils.py +63 -0
autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
autocoder/models.py +49 -9
autocoder/rag/cache/byzer_storage_cache.py +10 -4
autocoder/rag/cache/file_monitor_cache.py +27 -24
autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
autocoder/rag/cache/simple_cache.py +56 -37
autocoder/rag/loaders/filter_utils.py +106 -0
autocoder/rag/loaders/image_loader.py +45 -23
autocoder/rag/loaders/pdf_loader.py +3 -3
autocoder/rag/loaders/test_image_loader.py +209 -0
autocoder/rag/qa_conversation_strategy.py +3 -5
autocoder/rag/utils.py +20 -9
autocoder/utils/_markitdown.py +35 -0
autocoder/version.py +1 -1
{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/LICENSE +0 -0
{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/WHEEL +0 -0
{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/entry_points.txt +0 -0
{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/top_level.txt +0 -0

autocoder/rag/loaders/test_image_loader.py ADDED Viewed

@@ -0,0 +1,209 @@
+import os
+import re
+import tempfile
+import pytest
+from autocoder.rag.loaders.image_loader import ImageLoader, ReplaceInFileTool
+from autocoder.utils.llms import get_single_llm
+# 模拟一个简单的llm对象（避免测试中真实调用LLM）
+class DummyLLM:
+    def get_sub_client(self, name):
+        return None
+    def run(self, *args, **kwargs):
+        return "dummy response"
+@pytest.fixture(scope="module")
+def dummy_llm():
+    # 这里可以替换为真实llm，或Mock
+    return DummyLLM()
+def test_parse_diff_basic():
+    diff = """
+<<<<<<< SEARCH
+foo
+bar
+=======
+hello
+world
+>>>>>>> REPLACE
+"""
+    blocks = ImageLoader.parse_diff(diff)
+    assert len(blocks) == 1
+    search, replace = blocks[0]
+    assert "foo" in search
+    assert "hello" in replace
+def test_extract_replace_in_file_tools():
+    text = """
+<replace_in_file>
+<path>file1.py</path>
+<diff>
+<<<<<<< SEARCH
+old content
+=======
+new content
+>>>>>>> REPLACE
+</diff>
+</replace_in_file>
+<replace_in_file>
+<path>file2.py</path>
+<diff>
+<<<<<<< SEARCH
+x=1
+=======
+x=2
+>>>>>>> REPLACE
+</diff>
+</replace_in_file>
+"""
+    tools = ImageLoader.extract_replace_in_file_tools(text)
+    assert len(tools) == 2
+    assert tools[0].path == "file1.py"
+    assert "old content" in tools[0].diff
+    assert tools[1].path == "file2.py"
+    assert "x=1" in tools[1].diff
+def test_format_table_in_content_apply_diff(dummy_llm):
+    # 模拟一个OCR文本和对应diff
+    original = """这里是介绍
+产品 价格 数量
+苹果 5 10
+香蕉 3 20
+结束"""
+    # 构造符合replace_in_file格式的llm返回
+    llm_response = """
+<replace_in_file>
+<path>content</path>
+<diff>
+<<<<<<< SEARCH
+产品 价格 数量
+苹果 5 10
+香蕉 3 20
+=======
+| 产品 | 价格 | 数量 |
+| --- | --- | --- |
+| 苹果 | 5 | 10 |
+| 香蕉 | 3 | 20 |
+>>>>>>> REPLACE
+</diff>
+</replace_in_file>
+"""
+    # 模拟调用llm时返回llm_response
+    class FakeLLM:
+        def get_sub_client(self, name):
+            return None
+        def run(self, *args, **kwargs):
+            return llm_response
+    fake_llm = FakeLLM()
+    # patch _format_table 方法，让它直接返回llm_response
+    import byzerllm
+    class DummyPrompt:
+        def __call__(self, *args, **kwargs):
+            # 使其可装饰函数
+            def decorator(func):
+                class FakePromptWrapper:
+                    def with_llm(self_inner, llm_obj):
+                        class Runner:
+                            def run(self_inner_inner, content):
+                                return llm_response
+                        return Runner()
+                return FakePromptWrapper()
+            return decorator
+    orig_prompt = byzerllm.prompt
+    byzerllm.prompt = DummyPrompt()
+    try:
+        formatted = ImageLoader.format_table_in_content(original, llm=fake_llm)
+        assert "| 产品 | 价格 | 数量 |" in formatted
+        assert "这里是介绍" in formatted
+        assert "结束" in formatted
+    finally:
+        byzerllm.prompt = orig_prompt
+def test_paddleocr_extract_text_type_error_fix(monkeypatch):
+    """
+    测试paddleocr_extract_text对异常结构的兼容性，模拟paddleocr.ocr返回非字符串结构
+    """
+    # 模拟PaddleOCR类
+    class FakeOCR:
+        def __init__(self, **kwargs):
+            pass
+        def ocr(self, file_path, **kwargs):
+            # 模拟返回嵌套list，第二个元素是list而非str，之前会报错
+            return [
+                [
+                    # page 1
+                    [[ [0,0],[1,1] ], (["text_in_list"], 0.9)],
+                    [[ [0,0],[1,1] ], ("normal text", 0.95)],
+                ]
+            ]
+    # patch PaddleOCR
+    import autocoder.rag.loaders.image_loader as ilmod
+    monkeypatch.setattr(ilmod, "PaddleOCR", FakeOCR)
+    # 创建临时文件模拟图片
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpf:
+        tmp_path = tmpf.name
+    try:
+        text = ImageLoader.paddleocr_extract_text(tmp_path)
+        # 应该不会抛异常，且返回内容包含normal text和text_in_list
+        assert "normal text" in text
+        assert "text_in_list" in text
+    finally:
+        os.remove(tmp_path)
+def test_paddlex_table_extract_markdown_no_paddlex(monkeypatch):
+    # paddlex_module为None时应返回""
+    import autocoder.rag.loaders.image_loader as ilmod
+    monkeypatch.setattr(ilmod, "paddlex_module", None)
+    md = ImageLoader.paddlex_table_extract_markdown("dummy_path.png")
+    assert md == ""
+def test_html_table_to_markdown_simple():
+    html = """
+<table>
+<tr><th>头1</th><th>头2</th></tr>
+<tr><td>数据1</td><td>数据2</td></tr>
+<tr><td>数据3</td><td>数据4</td></tr>
+</table>
+"""
+    md = ImageLoader.html_table_to_markdown(html)
+    assert "| 头1 | 头2 |" in md
+    assert "| 数据1 | 数据2 |" in md
+    assert "| 数据3 | 数据4 |" in md
+def test_extract_text_from_image_unknown_engine(dummy_llm):
+    res = ImageLoader.extract_text_from_image("non_exist.png", dummy_llm, engine="xxx")
+    assert res == ""
+def test_image_to_markdown_creates_file(tmp_path, dummy_llm, monkeypatch):
+    # 准备一个假图片文件
+    imgfile = tmp_path / "testimg.png"
+    imgfile.write_bytes(b"fake image content")
+    # monkeypatch extract_text_from_image返回固定内容
+    monkeypatch.setattr(ImageLoader, "extract_text_from_image", staticmethod(lambda *args, **kwargs: "# hello world"))
+    md_content = ImageLoader.image_to_markdown(str(imgfile), dummy_llm, engine="vl")
+    assert "# hello world" in md_content
+    md_file = imgfile.with_suffix(".md")
+    assert md_file.exists()
+    assert "# hello world" in md_file.read_text()
+if __name__ == "__main__":
+    # 手动运行全部测试
+    pytest.main([__file__])

autocoder/rag/qa_conversation_strategy.py CHANGED Viewed

@@ -92,8 +92,7 @@ class MultiRoundStrategy(QAConversationStrategy):
         {% endfor %}
         </documents>
-        ====
-        {% endif %}
+        ====
         INSTRUCTIONS
@@ -214,8 +213,7 @@ class SingleRoundStrategy(QAConversationStrategy):
         {% endfor %}
         </documents>
-        ====
-        {% endif %}
+        ====
         USER CONVERSATION HISTORY
@@ -295,4 +293,4 @@ def get_qa_strategy(args: AutoCoderArgs) -> QAConversationStrategy:
     if strategy_name not in strategies:
         raise ValueError(f"Unknown strategy: {strategy_name}. Available strategies: {list(strategies.keys())}")
-    return strategies[strategy_name]()
+    return strategies[strategy_name](args)

autocoder/rag/utils.py CHANGED Viewed

@@ -4,20 +4,27 @@ from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf
 from autocoder.rag.loaders.docx_loader import extract_text_from_docx
 from autocoder.rag.loaders.excel_loader import extract_text_from_excel
 from autocoder.rag.loaders.ppt_loader import extract_text_from_ppt
-from typing import List, Tuple
+from typing import List, Tuple, Optional, Union
 import time
 from loguru import logger
 import traceback
+from byzerllm import SimpleByzerLLM, ByzerLLM
+from autocoder.utils.llms import get_single_llm
 def process_file_in_multi_process(
-    file_info: Tuple[str, str, float, str]
+    file_info: Tuple[str, str, float, str],
+    llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
+    product_mode="lite",
 ) -> List[SourceCode]:
+    if llm and isinstance(llm, str):
+        llm = get_single_llm(llm,product_mode)
     start_time = time.time()
     file_path, relative_path, _, _ = file_info
     try:
         if file_path.endswith(".pdf"):
-            content = extract_text_from_pdf(file_path)
+            content = extract_text_from_pdf(file_path, llm, product_mode)
             v = [
                 SourceCode(
                     module_name=file_path,
@@ -46,8 +53,7 @@ def process_file_in_multi_process(
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
-            content = "".join(
-                f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
+            content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}",
@@ -73,11 +79,17 @@ def process_file_in_multi_process(
         return []
-def process_file_local(file_path: str) -> List[SourceCode]:
+def process_file_local(
+    file_path: str,
+    llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
+    product_mode="lite",
+) -> List[SourceCode]:
     start_time = time.time()
+    if llm and isinstance(llm, str):
+        llm = get_single_llm(llm,product_mode)
     try:
         if file_path.endswith(".pdf"):
-            content = extract_text_from_pdf(file_path)
+            content = extract_text_from_pdf(file_path, llm, product_mode)
             v = [
                 SourceCode(
                     module_name=file_path,
@@ -106,8 +118,7 @@ def process_file_local(file_path: str) -> List[SourceCode]:
             ]
         elif file_path.endswith(".pptx"):
             slides = extract_text_from_ppt(file_path)
-            content = "".join(
-                f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
+            content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
             v = [
                 SourceCode(
                     module_name=f"##File: {file_path}",

autocoder/utils/_markitdown.py CHANGED Viewed

@@ -34,6 +34,10 @@ from pdfminer.image import ImageWriter
 import numpy as np
 from PIL import Image
+# 新增导入
+from autocoder.rag.loaders import filter_utils
+from autocoder.rag.loaders.image_loader import ImageLoader
 # File-format detection
 import puremagic
 import requests
@@ -578,6 +582,14 @@ class PdfConverter(DocumentConverter):
                             image_output_dir, f"image_{local_image_count}{suffix}")
                         os.rename(temp_path, image_path)
                         content.append(f"![Image {local_image_count}]({image_path})")
+                        # ===== 新增：根据filter_utils判断是否需要解析图片
+                        if filter_utils.should_parse_image(image_path):
+                            try:
+                                _ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
+                                # image_to_markdown会自动生成md文件
+                            except Exception:
+                                import traceback; traceback.print_exc()
+                        # =====
                         local_image_count += 1
                         continue
                     try:
@@ -606,6 +618,7 @@ class PdfConverter(DocumentConverter):
                                 content.append(
                                     f"![Image {local_image_count}]({image_path})\n"
                                 )
+                                try_parse_image(image_path)
                                 local_image_count += 1
                                 continue
                             elif colorspace == "DeviceGray":
@@ -616,6 +629,7 @@ class PdfConverter(DocumentConverter):
                                 content.append(
                                     f"![Image {local_image_count}]({image_path})\n"
                                 )
+                                try_parse_image(image_path)
                                 local_image_count += 1
                                 continue
                     except Exception as e:
@@ -627,6 +641,8 @@ class PdfConverter(DocumentConverter):
                         img_file.write(image_data)
                     content.append(f"![Image {local_image_count}]({image_path})\n")
+                    # ===== 新增：根据filter_utils判断是否需要解析图片
+                    try_parse_image(image_path)
                     local_image_count += 1
             # Handle text
@@ -1070,6 +1086,8 @@ class MarkItDown:
         requests_session: Optional[requests.Session] = None,
         mlm_client: Optional[Any] = None,
         mlm_model: Optional[Any] = None,
+        llm: Optional[Any] = None,
+        product_mode: Optional[str] = None,
     ):
         if requests_session is None:
             self._requests_session = requests.Session()
@@ -1079,6 +1097,10 @@ class MarkItDown:
         self._mlm_client = mlm_client
         self._mlm_model = mlm_model
+        # 新增：保存llm和product_mode
+        self._llm = llm
+        self._product_mode = product_mode
         self._page_converters: List[DocumentConverter] = []
         # Register converters for successful browsing operations
@@ -1319,3 +1341,16 @@ class MarkItDown:
     def register_page_converter(self, converter: DocumentConverter) -> None:
         """Register a page text converter."""
         self._page_converters.insert(0, converter)
+def try_parse_image(image_path: str):
+    """
+    根据filter_utils判断是否需要解析图片，如果需要则调用ImageLoader.image_to_markdown。
+    解析失败会自动捕获异常。
+    """
+    if filter_utils.should_parse_image(image_path):
+        try:
+            _ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
+        except Exception:
+            import traceback; traceback.print_exc()

autocoder/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.1.348"
1	+ __version__ = "0.1.349"

{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/LICENSE RENAMED Viewed

File without changes

{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/WHEEL RENAMED Viewed

File without changes

{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/top_level.txt RENAMED Viewed

File without changes

auto-coder 0.1.348__py3-none-any.whl → 0.1.349__py3-none-any.whl

Potentially problematic release.

auto-coder 0.1.348__py3-none-any.whl → 0.1.349__py3-none-any.whl