auto-coder 0.1.348__py3-none-any.whl → 0.1.350__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic.

Files changed (35)
  1. {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/RECORD +35 -26
  3. autocoder/auto_coder_runner.py +14 -10
  4. autocoder/chat_auto_coder_lang.py +5 -3
  5. autocoder/common/model_speed_tester.py +392 -0
  6. autocoder/common/printer.py +7 -8
  7. autocoder/common/run_cmd.py +247 -0
  8. autocoder/common/test_run_cmd.py +110 -0
  9. autocoder/common/v2/agent/agentic_edit.py +61 -11
  10. autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
  11. autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
  12. autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
  13. autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
  14. autocoder/helper/rag_doc_creator.py +141 -0
  15. autocoder/ignorefiles/__init__.py +4 -0
  16. autocoder/ignorefiles/ignore_file_utils.py +63 -0
  17. autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
  18. autocoder/models.py +48 -8
  19. autocoder/rag/cache/byzer_storage_cache.py +10 -4
  20. autocoder/rag/cache/file_monitor_cache.py +27 -24
  21. autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
  22. autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
  23. autocoder/rag/cache/simple_cache.py +56 -37
  24. autocoder/rag/loaders/filter_utils.py +106 -0
  25. autocoder/rag/loaders/image_loader.py +45 -23
  26. autocoder/rag/loaders/pdf_loader.py +3 -3
  27. autocoder/rag/loaders/test_image_loader.py +209 -0
  28. autocoder/rag/qa_conversation_strategy.py +3 -5
  29. autocoder/rag/utils.py +20 -9
  30. autocoder/utils/_markitdown.py +35 -0
  31. autocoder/version.py +1 -1
  32. {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/LICENSE +0 -0
  33. {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/WHEEL +0 -0
  34. {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/entry_points.txt +0 -0
  35. {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/top_level.txt +0 -0
autocoder/rag/loaders/test_image_loader.py ADDED
@@ -0,0 +1,209 @@
1
+ import os
2
+ import re
3
+ import tempfile
4
+
5
+ import pytest
6
+
7
+ from autocoder.rag.loaders.image_loader import ImageLoader, ReplaceInFileTool
8
+ from autocoder.utils.llms import get_single_llm
9
+
10
+ # 模拟一个简单的llm对象(避免测试中真实调用LLM)
11
+ class DummyLLM:
12
+ def get_sub_client(self, name):
13
+ return None
14
+
15
+ def run(self, *args, **kwargs):
16
+ return "dummy response"
17
+
18
+ @pytest.fixture(scope="module")
19
+ def dummy_llm():
20
+ # 这里可以替换为真实llm,或Mock
21
+ return DummyLLM()
22
+
23
+ def test_parse_diff_basic():
24
+ diff = """
25
+ <<<<<<< SEARCH
26
+ foo
27
+ bar
28
+ =======
29
+ hello
30
+ world
31
+ >>>>>>> REPLACE
32
+ """
33
+ blocks = ImageLoader.parse_diff(diff)
34
+ assert len(blocks) == 1
35
+ search, replace = blocks[0]
36
+ assert "foo" in search
37
+ assert "hello" in replace
38
+
39
+ def test_extract_replace_in_file_tools():
40
+ text = """
41
+ <replace_in_file>
42
+ <path>file1.py</path>
43
+ <diff>
44
+ <<<<<<< SEARCH
45
+ old content
46
+ =======
47
+ new content
48
+ >>>>>>> REPLACE
49
+ </diff>
50
+ </replace_in_file>
51
+
52
+ <replace_in_file>
53
+ <path>file2.py</path>
54
+ <diff>
55
+ <<<<<<< SEARCH
56
+ x=1
57
+ =======
58
+ x=2
59
+ >>>>>>> REPLACE
60
+ </diff>
61
+ </replace_in_file>
62
+ """
63
+ tools = ImageLoader.extract_replace_in_file_tools(text)
64
+ assert len(tools) == 2
65
+ assert tools[0].path == "file1.py"
66
+ assert "old content" in tools[0].diff
67
+ assert tools[1].path == "file2.py"
68
+ assert "x=1" in tools[1].diff
69
+
70
+ def test_format_table_in_content_apply_diff(dummy_llm):
71
+ # 模拟一个OCR文本和对应diff
72
+ original = """这里是介绍
73
+ 产品 价格 数量
74
+ 苹果 5 10
75
+ 香蕉 3 20
76
+ 结束"""
77
+
78
+ # 构造符合replace_in_file格式的llm返回
79
+ llm_response = """
80
+ <replace_in_file>
81
+ <path>content</path>
82
+ <diff>
83
+ <<<<<<< SEARCH
84
+ 产品 价格 数量
85
+ 苹果 5 10
86
+ 香蕉 3 20
87
+ =======
88
+ | 产品 | 价格 | 数量 |
89
+ | --- | --- | --- |
90
+ | 苹果 | 5 | 10 |
91
+ | 香蕉 | 3 | 20 |
92
+ >>>>>>> REPLACE
93
+ </diff>
94
+ </replace_in_file>
95
+ """
96
+
97
+ # 模拟调用llm时返回llm_response
98
+ class FakeLLM:
99
+ def get_sub_client(self, name):
100
+ return None
101
+
102
+ def run(self, *args, **kwargs):
103
+ return llm_response
104
+
105
+ fake_llm = FakeLLM()
106
+
107
+ # patch _format_table 方法,让它直接返回llm_response
108
+ import byzerllm
109
+
110
+ class DummyPrompt:
111
+ def __call__(self, *args, **kwargs):
112
+ # 使其可装饰函数
113
+ def decorator(func):
114
+ class FakePromptWrapper:
115
+ def with_llm(self_inner, llm_obj):
116
+ class Runner:
117
+ def run(self_inner_inner, content):
118
+ return llm_response
119
+ return Runner()
120
+ return FakePromptWrapper()
121
+ return decorator
122
+
123
+ orig_prompt = byzerllm.prompt
124
+ byzerllm.prompt = DummyPrompt()
125
+
126
+ try:
127
+ formatted = ImageLoader.format_table_in_content(original, llm=fake_llm)
128
+ assert "| 产品 | 价格 | 数量 |" in formatted
129
+ assert "这里是介绍" in formatted
130
+ assert "结束" in formatted
131
+ finally:
132
+ byzerllm.prompt = orig_prompt
133
+
134
+ def test_paddleocr_extract_text_type_error_fix(monkeypatch):
135
+ """
136
+ 测试paddleocr_extract_text对异常结构的兼容性,模拟paddleocr.ocr返回非字符串结构
137
+ """
138
+ # 模拟PaddleOCR类
139
+ class FakeOCR:
140
+ def __init__(self, **kwargs):
141
+ pass
142
+
143
+ def ocr(self, file_path, **kwargs):
144
+ # 模拟返回嵌套list,第二个元素是list而非str,之前会报错
145
+ return [
146
+ [
147
+ # page 1
148
+ [[ [0,0],[1,1] ], (["text_in_list"], 0.9)],
149
+ [[ [0,0],[1,1] ], ("normal text", 0.95)],
150
+ ]
151
+ ]
152
+ # patch PaddleOCR
153
+ import autocoder.rag.loaders.image_loader as ilmod
154
+ monkeypatch.setattr(ilmod, "PaddleOCR", FakeOCR)
155
+
156
+ # 创建临时文件模拟图片
157
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpf:
158
+ tmp_path = tmpf.name
159
+
160
+ try:
161
+ text = ImageLoader.paddleocr_extract_text(tmp_path)
162
+ # 应该不会抛异常,且返回内容包含normal text和text_in_list
163
+ assert "normal text" in text
164
+ assert "text_in_list" in text
165
+ finally:
166
+ os.remove(tmp_path)
167
+
168
+ def test_paddlex_table_extract_markdown_no_paddlex(monkeypatch):
169
+ # paddlex_module为None时应返回""
170
+ import autocoder.rag.loaders.image_loader as ilmod
171
+ monkeypatch.setattr(ilmod, "paddlex_module", None)
172
+ md = ImageLoader.paddlex_table_extract_markdown("dummy_path.png")
173
+ assert md == ""
174
+
175
+ def test_html_table_to_markdown_simple():
176
+ html = """
177
+ <table>
178
+ <tr><th>头1</th><th>头2</th></tr>
179
+ <tr><td>数据1</td><td>数据2</td></tr>
180
+ <tr><td>数据3</td><td>数据4</td></tr>
181
+ </table>
182
+ """
183
+ md = ImageLoader.html_table_to_markdown(html)
184
+ assert "| 头1 | 头2 |" in md
185
+ assert "| 数据1 | 数据2 |" in md
186
+ assert "| 数据3 | 数据4 |" in md
187
+
188
+ def test_extract_text_from_image_unknown_engine(dummy_llm):
189
+ res = ImageLoader.extract_text_from_image("non_exist.png", dummy_llm, engine="xxx")
190
+ assert res == ""
191
+
192
+ def test_image_to_markdown_creates_file(tmp_path, dummy_llm, monkeypatch):
193
+ # 准备一个假图片文件
194
+ imgfile = tmp_path / "testimg.png"
195
+ imgfile.write_bytes(b"fake image content")
196
+
197
+ # monkeypatch extract_text_from_image返回固定内容
198
+ monkeypatch.setattr(ImageLoader, "extract_text_from_image", staticmethod(lambda *args, **kwargs: "# hello world"))
199
+
200
+ md_content = ImageLoader.image_to_markdown(str(imgfile), dummy_llm, engine="vl")
201
+ assert "# hello world" in md_content
202
+
203
+ md_file = imgfile.with_suffix(".md")
204
+ assert md_file.exists()
205
+ assert "# hello world" in md_file.read_text()
206
+
207
+ if __name__ == "__main__":
208
+ # 手动运行全部测试
209
+ pytest.main([__file__])
autocoder/rag/qa_conversation_strategy.py CHANGED
@@ -92,8 +92,7 @@ class MultiRoundStrategy(QAConversationStrategy):
92
92
  {% endfor %}
93
93
  </documents>
94
94
 
95
- ====
96
- {% endif %}
95
+ ====
97
96
 
98
97
  INSTRUCTIONS
99
98
 
@@ -214,8 +213,7 @@ class SingleRoundStrategy(QAConversationStrategy):
214
213
  {% endfor %}
215
214
  </documents>
216
215
 
217
- ====
218
- {% endif %}
216
+ ====
219
217
 
220
218
  USER CONVERSATION HISTORY
221
219
 
@@ -295,4 +293,4 @@ def get_qa_strategy(args: AutoCoderArgs) -> QAConversationStrategy:
295
293
  if strategy_name not in strategies:
296
294
  raise ValueError(f"Unknown strategy: {strategy_name}. Available strategies: {list(strategies.keys())}")
297
295
 
298
- return strategies[strategy_name]()
296
+ return strategies[strategy_name](args)
autocoder/rag/utils.py CHANGED
@@ -4,20 +4,27 @@ from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf
4
4
  from autocoder.rag.loaders.docx_loader import extract_text_from_docx
5
5
  from autocoder.rag.loaders.excel_loader import extract_text_from_excel
6
6
  from autocoder.rag.loaders.ppt_loader import extract_text_from_ppt
7
- from typing import List, Tuple
7
+ from typing import List, Tuple, Optional, Union
8
8
  import time
9
9
  from loguru import logger
10
10
  import traceback
11
+ from byzerllm import SimpleByzerLLM, ByzerLLM
12
+ from autocoder.utils.llms import get_single_llm
11
13
 
12
14
 
13
15
  def process_file_in_multi_process(
14
- file_info: Tuple[str, str, float, str]
16
+ file_info: Tuple[str, str, float, str],
17
+ llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
18
+ product_mode="lite",
15
19
  ) -> List[SourceCode]:
20
+ if llm and isinstance(llm, str):
21
+ llm = get_single_llm(llm,product_mode)
22
+
16
23
  start_time = time.time()
17
24
  file_path, relative_path, _, _ = file_info
18
25
  try:
19
26
  if file_path.endswith(".pdf"):
20
- content = extract_text_from_pdf(file_path)
27
+ content = extract_text_from_pdf(file_path, llm, product_mode)
21
28
  v = [
22
29
  SourceCode(
23
30
  module_name=file_path,
@@ -46,8 +53,7 @@ def process_file_in_multi_process(
46
53
  ]
47
54
  elif file_path.endswith(".pptx"):
48
55
  slides = extract_text_from_ppt(file_path)
49
- content = "".join(
50
- f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
56
+ content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
51
57
  v = [
52
58
  SourceCode(
53
59
  module_name=f"##File: {file_path}",
@@ -73,11 +79,17 @@ def process_file_in_multi_process(
73
79
  return []
74
80
 
75
81
 
76
- def process_file_local(file_path: str) -> List[SourceCode]:
82
+ def process_file_local(
83
+ file_path: str,
84
+ llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
85
+ product_mode="lite",
86
+ ) -> List[SourceCode]:
77
87
  start_time = time.time()
88
+ if llm and isinstance(llm, str):
89
+ llm = get_single_llm(llm,product_mode)
78
90
  try:
79
91
  if file_path.endswith(".pdf"):
80
- content = extract_text_from_pdf(file_path)
92
+ content = extract_text_from_pdf(file_path, llm, product_mode)
81
93
  v = [
82
94
  SourceCode(
83
95
  module_name=file_path,
@@ -106,8 +118,7 @@ def process_file_local(file_path: str) -> List[SourceCode]:
106
118
  ]
107
119
  elif file_path.endswith(".pptx"):
108
120
  slides = extract_text_from_ppt(file_path)
109
- content = "".join(
110
- f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
121
+ content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
111
122
  v = [
112
123
  SourceCode(
113
124
  module_name=f"##File: {file_path}",
autocoder/utils/_markitdown.py CHANGED
@@ -34,6 +34,10 @@ from pdfminer.image import ImageWriter
34
34
  import numpy as np
35
35
  from PIL import Image
36
36
 
37
+ # 新增导入
38
+ from autocoder.rag.loaders import filter_utils
39
+ from autocoder.rag.loaders.image_loader import ImageLoader
40
+
37
41
  # File-format detection
38
42
  import puremagic
39
43
  import requests
@@ -578,6 +582,14 @@ class PdfConverter(DocumentConverter):
578
582
  image_output_dir, f"image_{local_image_count}{suffix}")
579
583
  os.rename(temp_path, image_path)
580
584
  content.append(f"![Image {local_image_count}]({image_path})")
585
+ # ===== 新增:根据filter_utils判断是否需要解析图片
586
+ if filter_utils.should_parse_image(image_path):
587
+ try:
588
+ _ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
589
+ # image_to_markdown会自动生成md文件
590
+ except Exception:
591
+ import traceback; traceback.print_exc()
592
+ # =====
581
593
  local_image_count += 1
582
594
  continue
583
595
  try:
@@ -606,6 +618,7 @@ class PdfConverter(DocumentConverter):
606
618
  content.append(
607
619
  f"![Image {local_image_count}]({image_path})\n"
608
620
  )
621
+ try_parse_image(image_path)
609
622
  local_image_count += 1
610
623
  continue
611
624
  elif colorspace == "DeviceGray":
@@ -616,6 +629,7 @@ class PdfConverter(DocumentConverter):
616
629
  content.append(
617
630
  f"![Image {local_image_count}]({image_path})\n"
618
631
  )
632
+ try_parse_image(image_path)
619
633
  local_image_count += 1
620
634
  continue
621
635
  except Exception as e:
@@ -627,6 +641,8 @@ class PdfConverter(DocumentConverter):
627
641
  img_file.write(image_data)
628
642
 
629
643
  content.append(f"![Image {local_image_count}]({image_path})\n")
644
+ # ===== 新增:根据filter_utils判断是否需要解析图片
645
+ try_parse_image(image_path)
630
646
  local_image_count += 1
631
647
 
632
648
  # Handle text
@@ -1070,6 +1086,8 @@ class MarkItDown:
1070
1086
  requests_session: Optional[requests.Session] = None,
1071
1087
  mlm_client: Optional[Any] = None,
1072
1088
  mlm_model: Optional[Any] = None,
1089
+ llm: Optional[Any] = None,
1090
+ product_mode: Optional[str] = None,
1073
1091
  ):
1074
1092
  if requests_session is None:
1075
1093
  self._requests_session = requests.Session()
@@ -1079,6 +1097,10 @@ class MarkItDown:
1079
1097
  self._mlm_client = mlm_client
1080
1098
  self._mlm_model = mlm_model
1081
1099
 
1100
+ # 新增:保存llm和product_mode
1101
+ self._llm = llm
1102
+ self._product_mode = product_mode
1103
+
1082
1104
  self._page_converters: List[DocumentConverter] = []
1083
1105
 
1084
1106
  # Register converters for successful browsing operations
@@ -1319,3 +1341,16 @@ class MarkItDown:
1319
1341
  def register_page_converter(self, converter: DocumentConverter) -> None:
1320
1342
  """Register a page text converter."""
1321
1343
  self._page_converters.insert(0, converter)
1344
+
1345
+
1346
+ def try_parse_image(image_path: str):
1347
+ """
1348
+ 根据filter_utils判断是否需要解析图片,如果需要则调用ImageLoader.image_to_markdown。
1349
+ 解析失败会自动捕获异常。
1350
+ """
1351
+ if filter_utils.should_parse_image(image_path):
1352
+ try:
1353
+ _ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
1354
+ except Exception:
1355
+ import traceback; traceback.print_exc()
1356
+
autocoder/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.348"
1
+ __version__ = "0.1.350"