auto-coder 0.1.348__py3-none-any.whl → 0.1.349__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic; more details are linked from the original registry diff page.
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/METADATA +1 -1
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/RECORD +35 -26
- autocoder/auto_coder_runner.py +14 -10
- autocoder/chat_auto_coder_lang.py +5 -3
- autocoder/common/model_speed_tester.py +392 -0
- autocoder/common/printer.py +7 -8
- autocoder/common/run_cmd.py +247 -0
- autocoder/common/test_run_cmd.py +110 -0
- autocoder/common/v2/agent/agentic_edit.py +61 -11
- autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
- autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
- autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
- autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
- autocoder/helper/rag_doc_creator.py +141 -0
- autocoder/ignorefiles/__init__.py +4 -0
- autocoder/ignorefiles/ignore_file_utils.py +63 -0
- autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
- autocoder/models.py +49 -9
- autocoder/rag/cache/byzer_storage_cache.py +10 -4
- autocoder/rag/cache/file_monitor_cache.py +27 -24
- autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
- autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
- autocoder/rag/cache/simple_cache.py +56 -37
- autocoder/rag/loaders/filter_utils.py +106 -0
- autocoder/rag/loaders/image_loader.py +45 -23
- autocoder/rag/loaders/pdf_loader.py +3 -3
- autocoder/rag/loaders/test_image_loader.py +209 -0
- autocoder/rag/qa_conversation_strategy.py +3 -5
- autocoder/rag/utils.py +20 -9
- autocoder/utils/_markitdown.py +35 -0
- autocoder/version.py +1 -1
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/top_level.txt +0 -0
|
autocoder/rag/loaders/test_image_loader.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import tempfile
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from autocoder.rag.loaders.image_loader import ImageLoader, ReplaceInFileTool
|
|
8
|
+
from autocoder.utils.llms import get_single_llm
|
|
9
|
+
|
|
10
|
+
# 模拟一个简单的llm对象(避免测试中真实调用LLM)
|
|
11
|
+
class DummyLLM:
|
|
12
|
+
def get_sub_client(self, name):
|
|
13
|
+
return None
|
|
14
|
+
|
|
15
|
+
def run(self, *args, **kwargs):
|
|
16
|
+
return "dummy response"
|
|
17
|
+
|
|
18
|
+
@pytest.fixture(scope="module")
|
|
19
|
+
def dummy_llm():
|
|
20
|
+
# 这里可以替换为真实llm,或Mock
|
|
21
|
+
return DummyLLM()
|
|
22
|
+
|
|
23
|
+
def test_parse_diff_basic():
|
|
24
|
+
diff = """
|
|
25
|
+
<<<<<<< SEARCH
|
|
26
|
+
foo
|
|
27
|
+
bar
|
|
28
|
+
=======
|
|
29
|
+
hello
|
|
30
|
+
world
|
|
31
|
+
>>>>>>> REPLACE
|
|
32
|
+
"""
|
|
33
|
+
blocks = ImageLoader.parse_diff(diff)
|
|
34
|
+
assert len(blocks) == 1
|
|
35
|
+
search, replace = blocks[0]
|
|
36
|
+
assert "foo" in search
|
|
37
|
+
assert "hello" in replace
|
|
38
|
+
|
|
39
|
+
def test_extract_replace_in_file_tools():
|
|
40
|
+
text = """
|
|
41
|
+
<replace_in_file>
|
|
42
|
+
<path>file1.py</path>
|
|
43
|
+
<diff>
|
|
44
|
+
<<<<<<< SEARCH
|
|
45
|
+
old content
|
|
46
|
+
=======
|
|
47
|
+
new content
|
|
48
|
+
>>>>>>> REPLACE
|
|
49
|
+
</diff>
|
|
50
|
+
</replace_in_file>
|
|
51
|
+
|
|
52
|
+
<replace_in_file>
|
|
53
|
+
<path>file2.py</path>
|
|
54
|
+
<diff>
|
|
55
|
+
<<<<<<< SEARCH
|
|
56
|
+
x=1
|
|
57
|
+
=======
|
|
58
|
+
x=2
|
|
59
|
+
>>>>>>> REPLACE
|
|
60
|
+
</diff>
|
|
61
|
+
</replace_in_file>
|
|
62
|
+
"""
|
|
63
|
+
tools = ImageLoader.extract_replace_in_file_tools(text)
|
|
64
|
+
assert len(tools) == 2
|
|
65
|
+
assert tools[0].path == "file1.py"
|
|
66
|
+
assert "old content" in tools[0].diff
|
|
67
|
+
assert tools[1].path == "file2.py"
|
|
68
|
+
assert "x=1" in tools[1].diff
|
|
69
|
+
|
|
70
|
+
def test_format_table_in_content_apply_diff(dummy_llm):
|
|
71
|
+
# 模拟一个OCR文本和对应diff
|
|
72
|
+
original = """这里是介绍
|
|
73
|
+
产品 价格 数量
|
|
74
|
+
苹果 5 10
|
|
75
|
+
香蕉 3 20
|
|
76
|
+
结束"""
|
|
77
|
+
|
|
78
|
+
# 构造符合replace_in_file格式的llm返回
|
|
79
|
+
llm_response = """
|
|
80
|
+
<replace_in_file>
|
|
81
|
+
<path>content</path>
|
|
82
|
+
<diff>
|
|
83
|
+
<<<<<<< SEARCH
|
|
84
|
+
产品 价格 数量
|
|
85
|
+
苹果 5 10
|
|
86
|
+
香蕉 3 20
|
|
87
|
+
=======
|
|
88
|
+
| 产品 | 价格 | 数量 |
|
|
89
|
+
| --- | --- | --- |
|
|
90
|
+
| 苹果 | 5 | 10 |
|
|
91
|
+
| 香蕉 | 3 | 20 |
|
|
92
|
+
>>>>>>> REPLACE
|
|
93
|
+
</diff>
|
|
94
|
+
</replace_in_file>
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
# 模拟调用llm时返回llm_response
|
|
98
|
+
class FakeLLM:
|
|
99
|
+
def get_sub_client(self, name):
|
|
100
|
+
return None
|
|
101
|
+
|
|
102
|
+
def run(self, *args, **kwargs):
|
|
103
|
+
return llm_response
|
|
104
|
+
|
|
105
|
+
fake_llm = FakeLLM()
|
|
106
|
+
|
|
107
|
+
# patch _format_table 方法,让它直接返回llm_response
|
|
108
|
+
import byzerllm
|
|
109
|
+
|
|
110
|
+
class DummyPrompt:
|
|
111
|
+
def __call__(self, *args, **kwargs):
|
|
112
|
+
# 使其可装饰函数
|
|
113
|
+
def decorator(func):
|
|
114
|
+
class FakePromptWrapper:
|
|
115
|
+
def with_llm(self_inner, llm_obj):
|
|
116
|
+
class Runner:
|
|
117
|
+
def run(self_inner_inner, content):
|
|
118
|
+
return llm_response
|
|
119
|
+
return Runner()
|
|
120
|
+
return FakePromptWrapper()
|
|
121
|
+
return decorator
|
|
122
|
+
|
|
123
|
+
orig_prompt = byzerllm.prompt
|
|
124
|
+
byzerllm.prompt = DummyPrompt()
|
|
125
|
+
|
|
126
|
+
try:
|
|
127
|
+
formatted = ImageLoader.format_table_in_content(original, llm=fake_llm)
|
|
128
|
+
assert "| 产品 | 价格 | 数量 |" in formatted
|
|
129
|
+
assert "这里是介绍" in formatted
|
|
130
|
+
assert "结束" in formatted
|
|
131
|
+
finally:
|
|
132
|
+
byzerllm.prompt = orig_prompt
|
|
133
|
+
|
|
134
|
+
def test_paddleocr_extract_text_type_error_fix(monkeypatch):
|
|
135
|
+
"""
|
|
136
|
+
测试paddleocr_extract_text对异常结构的兼容性,模拟paddleocr.ocr返回非字符串结构
|
|
137
|
+
"""
|
|
138
|
+
# 模拟PaddleOCR类
|
|
139
|
+
class FakeOCR:
|
|
140
|
+
def __init__(self, **kwargs):
|
|
141
|
+
pass
|
|
142
|
+
|
|
143
|
+
def ocr(self, file_path, **kwargs):
|
|
144
|
+
# 模拟返回嵌套list,第二个元素是list而非str,之前会报错
|
|
145
|
+
return [
|
|
146
|
+
[
|
|
147
|
+
# page 1
|
|
148
|
+
[[ [0,0],[1,1] ], (["text_in_list"], 0.9)],
|
|
149
|
+
[[ [0,0],[1,1] ], ("normal text", 0.95)],
|
|
150
|
+
]
|
|
151
|
+
]
|
|
152
|
+
# patch PaddleOCR
|
|
153
|
+
import autocoder.rag.loaders.image_loader as ilmod
|
|
154
|
+
monkeypatch.setattr(ilmod, "PaddleOCR", FakeOCR)
|
|
155
|
+
|
|
156
|
+
# 创建临时文件模拟图片
|
|
157
|
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpf:
|
|
158
|
+
tmp_path = tmpf.name
|
|
159
|
+
|
|
160
|
+
try:
|
|
161
|
+
text = ImageLoader.paddleocr_extract_text(tmp_path)
|
|
162
|
+
# 应该不会抛异常,且返回内容包含normal text和text_in_list
|
|
163
|
+
assert "normal text" in text
|
|
164
|
+
assert "text_in_list" in text
|
|
165
|
+
finally:
|
|
166
|
+
os.remove(tmp_path)
|
|
167
|
+
|
|
168
|
+
def test_paddlex_table_extract_markdown_no_paddlex(monkeypatch):
|
|
169
|
+
# paddlex_module为None时应返回""
|
|
170
|
+
import autocoder.rag.loaders.image_loader as ilmod
|
|
171
|
+
monkeypatch.setattr(ilmod, "paddlex_module", None)
|
|
172
|
+
md = ImageLoader.paddlex_table_extract_markdown("dummy_path.png")
|
|
173
|
+
assert md == ""
|
|
174
|
+
|
|
175
|
+
def test_html_table_to_markdown_simple():
|
|
176
|
+
html = """
|
|
177
|
+
<table>
|
|
178
|
+
<tr><th>头1</th><th>头2</th></tr>
|
|
179
|
+
<tr><td>数据1</td><td>数据2</td></tr>
|
|
180
|
+
<tr><td>数据3</td><td>数据4</td></tr>
|
|
181
|
+
</table>
|
|
182
|
+
"""
|
|
183
|
+
md = ImageLoader.html_table_to_markdown(html)
|
|
184
|
+
assert "| 头1 | 头2 |" in md
|
|
185
|
+
assert "| 数据1 | 数据2 |" in md
|
|
186
|
+
assert "| 数据3 | 数据4 |" in md
|
|
187
|
+
|
|
188
|
+
def test_extract_text_from_image_unknown_engine(dummy_llm):
|
|
189
|
+
res = ImageLoader.extract_text_from_image("non_exist.png", dummy_llm, engine="xxx")
|
|
190
|
+
assert res == ""
|
|
191
|
+
|
|
192
|
+
def test_image_to_markdown_creates_file(tmp_path, dummy_llm, monkeypatch):
|
|
193
|
+
# 准备一个假图片文件
|
|
194
|
+
imgfile = tmp_path / "testimg.png"
|
|
195
|
+
imgfile.write_bytes(b"fake image content")
|
|
196
|
+
|
|
197
|
+
# monkeypatch extract_text_from_image返回固定内容
|
|
198
|
+
monkeypatch.setattr(ImageLoader, "extract_text_from_image", staticmethod(lambda *args, **kwargs: "# hello world"))
|
|
199
|
+
|
|
200
|
+
md_content = ImageLoader.image_to_markdown(str(imgfile), dummy_llm, engine="vl")
|
|
201
|
+
assert "# hello world" in md_content
|
|
202
|
+
|
|
203
|
+
md_file = imgfile.with_suffix(".md")
|
|
204
|
+
assert md_file.exists()
|
|
205
|
+
assert "# hello world" in md_file.read_text()
|
|
206
|
+
|
|
207
|
+
if __name__ == "__main__":
|
|
208
|
+
# 手动运行全部测试
|
|
209
|
+
pytest.main([__file__])
|
autocoder/rag/qa_conversation_strategy.py
CHANGED
|
@@ -92,8 +92,7 @@ class MultiRoundStrategy(QAConversationStrategy):
|
|
|
92
92
|
{% endfor %}
|
|
93
93
|
</documents>
|
|
94
94
|
|
|
95
|
-
====
|
|
96
|
-
{% endif %}
|
|
95
|
+
====
|
|
97
96
|
|
|
98
97
|
INSTRUCTIONS
|
|
99
98
|
|
|
@@ -214,8 +213,7 @@ class SingleRoundStrategy(QAConversationStrategy):
|
|
|
214
213
|
{% endfor %}
|
|
215
214
|
</documents>
|
|
216
215
|
|
|
217
|
-
====
|
|
218
|
-
{% endif %}
|
|
216
|
+
====
|
|
219
217
|
|
|
220
218
|
USER CONVERSATION HISTORY
|
|
221
219
|
|
|
@@ -295,4 +293,4 @@ def get_qa_strategy(args: AutoCoderArgs) -> QAConversationStrategy:
|
|
|
295
293
|
if strategy_name not in strategies:
|
|
296
294
|
raise ValueError(f"Unknown strategy: {strategy_name}. Available strategies: {list(strategies.keys())}")
|
|
297
295
|
|
|
298
|
-
return strategies[strategy_name]()
|
|
296
|
+
return strategies[strategy_name](args)
|
autocoder/rag/utils.py
CHANGED
|
@@ -4,20 +4,27 @@ from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf
|
|
|
4
4
|
from autocoder.rag.loaders.docx_loader import extract_text_from_docx
|
|
5
5
|
from autocoder.rag.loaders.excel_loader import extract_text_from_excel
|
|
6
6
|
from autocoder.rag.loaders.ppt_loader import extract_text_from_ppt
|
|
7
|
-
from typing import List, Tuple
|
|
7
|
+
from typing import List, Tuple, Optional, Union
|
|
8
8
|
import time
|
|
9
9
|
from loguru import logger
|
|
10
10
|
import traceback
|
|
11
|
+
from byzerllm import SimpleByzerLLM, ByzerLLM
|
|
12
|
+
from autocoder.utils.llms import get_single_llm
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
def process_file_in_multi_process(
|
|
14
|
-
file_info: Tuple[str, str, float, str]
|
|
16
|
+
file_info: Tuple[str, str, float, str],
|
|
17
|
+
llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
|
|
18
|
+
product_mode="lite",
|
|
15
19
|
) -> List[SourceCode]:
|
|
20
|
+
if llm and isinstance(llm, str):
|
|
21
|
+
llm = get_single_llm(llm,product_mode)
|
|
22
|
+
|
|
16
23
|
start_time = time.time()
|
|
17
24
|
file_path, relative_path, _, _ = file_info
|
|
18
25
|
try:
|
|
19
26
|
if file_path.endswith(".pdf"):
|
|
20
|
-
content = extract_text_from_pdf(file_path)
|
|
27
|
+
content = extract_text_from_pdf(file_path, llm, product_mode)
|
|
21
28
|
v = [
|
|
22
29
|
SourceCode(
|
|
23
30
|
module_name=file_path,
|
|
@@ -46,8 +53,7 @@ def process_file_in_multi_process(
|
|
|
46
53
|
]
|
|
47
54
|
elif file_path.endswith(".pptx"):
|
|
48
55
|
slides = extract_text_from_ppt(file_path)
|
|
49
|
-
content = "".join(
|
|
50
|
-
f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
|
|
56
|
+
content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
|
|
51
57
|
v = [
|
|
52
58
|
SourceCode(
|
|
53
59
|
module_name=f"##File: {file_path}",
|
|
@@ -73,11 +79,17 @@ def process_file_in_multi_process(
|
|
|
73
79
|
return []
|
|
74
80
|
|
|
75
81
|
|
|
76
|
-
def process_file_local(
|
|
82
|
+
def process_file_local(
|
|
83
|
+
file_path: str,
|
|
84
|
+
llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
|
|
85
|
+
product_mode="lite",
|
|
86
|
+
) -> List[SourceCode]:
|
|
77
87
|
start_time = time.time()
|
|
88
|
+
if llm and isinstance(llm, str):
|
|
89
|
+
llm = get_single_llm(llm,product_mode)
|
|
78
90
|
try:
|
|
79
91
|
if file_path.endswith(".pdf"):
|
|
80
|
-
content = extract_text_from_pdf(file_path)
|
|
92
|
+
content = extract_text_from_pdf(file_path, llm, product_mode)
|
|
81
93
|
v = [
|
|
82
94
|
SourceCode(
|
|
83
95
|
module_name=file_path,
|
|
@@ -106,8 +118,7 @@ def process_file_local(file_path: str) -> List[SourceCode]:
|
|
|
106
118
|
]
|
|
107
119
|
elif file_path.endswith(".pptx"):
|
|
108
120
|
slides = extract_text_from_ppt(file_path)
|
|
109
|
-
content = "".join(
|
|
110
|
-
f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
|
|
121
|
+
content = "".join(f"#{slide[0]}\n{slide[1]}\n\n" for slide in slides)
|
|
111
122
|
v = [
|
|
112
123
|
SourceCode(
|
|
113
124
|
module_name=f"##File: {file_path}",
|
autocoder/utils/_markitdown.py
CHANGED
|
@@ -34,6 +34,10 @@ from pdfminer.image import ImageWriter
|
|
|
34
34
|
import numpy as np
|
|
35
35
|
from PIL import Image
|
|
36
36
|
|
|
37
|
+
# 新增导入
|
|
38
|
+
from autocoder.rag.loaders import filter_utils
|
|
39
|
+
from autocoder.rag.loaders.image_loader import ImageLoader
|
|
40
|
+
|
|
37
41
|
# File-format detection
|
|
38
42
|
import puremagic
|
|
39
43
|
import requests
|
|
@@ -578,6 +582,14 @@ class PdfConverter(DocumentConverter):
|
|
|
578
582
|
image_output_dir, f"image_{local_image_count}{suffix}")
|
|
579
583
|
os.rename(temp_path, image_path)
|
|
580
584
|
content.append(f"")
|
|
585
|
+
# ===== 新增:根据filter_utils判断是否需要解析图片
|
|
586
|
+
if filter_utils.should_parse_image(image_path):
|
|
587
|
+
try:
|
|
588
|
+
_ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
|
|
589
|
+
# image_to_markdown会自动生成md文件
|
|
590
|
+
except Exception:
|
|
591
|
+
import traceback; traceback.print_exc()
|
|
592
|
+
# =====
|
|
581
593
|
local_image_count += 1
|
|
582
594
|
continue
|
|
583
595
|
try:
|
|
@@ -606,6 +618,7 @@ class PdfConverter(DocumentConverter):
|
|
|
606
618
|
content.append(
|
|
607
619
|
f"\n"
|
|
608
620
|
)
|
|
621
|
+
try_parse_image(image_path)
|
|
609
622
|
local_image_count += 1
|
|
610
623
|
continue
|
|
611
624
|
elif colorspace == "DeviceGray":
|
|
@@ -616,6 +629,7 @@ class PdfConverter(DocumentConverter):
|
|
|
616
629
|
content.append(
|
|
617
630
|
f"\n"
|
|
618
631
|
)
|
|
632
|
+
try_parse_image(image_path)
|
|
619
633
|
local_image_count += 1
|
|
620
634
|
continue
|
|
621
635
|
except Exception as e:
|
|
@@ -627,6 +641,8 @@ class PdfConverter(DocumentConverter):
|
|
|
627
641
|
img_file.write(image_data)
|
|
628
642
|
|
|
629
643
|
content.append(f"\n")
|
|
644
|
+
# ===== 新增:根据filter_utils判断是否需要解析图片
|
|
645
|
+
try_parse_image(image_path)
|
|
630
646
|
local_image_count += 1
|
|
631
647
|
|
|
632
648
|
# Handle text
|
|
@@ -1070,6 +1086,8 @@ class MarkItDown:
|
|
|
1070
1086
|
requests_session: Optional[requests.Session] = None,
|
|
1071
1087
|
mlm_client: Optional[Any] = None,
|
|
1072
1088
|
mlm_model: Optional[Any] = None,
|
|
1089
|
+
llm: Optional[Any] = None,
|
|
1090
|
+
product_mode: Optional[str] = None,
|
|
1073
1091
|
):
|
|
1074
1092
|
if requests_session is None:
|
|
1075
1093
|
self._requests_session = requests.Session()
|
|
@@ -1079,6 +1097,10 @@ class MarkItDown:
|
|
|
1079
1097
|
self._mlm_client = mlm_client
|
|
1080
1098
|
self._mlm_model = mlm_model
|
|
1081
1099
|
|
|
1100
|
+
# 新增:保存llm和product_mode
|
|
1101
|
+
self._llm = llm
|
|
1102
|
+
self._product_mode = product_mode
|
|
1103
|
+
|
|
1082
1104
|
self._page_converters: List[DocumentConverter] = []
|
|
1083
1105
|
|
|
1084
1106
|
# Register converters for successful browsing operations
|
|
@@ -1319,3 +1341,16 @@ class MarkItDown:
|
|
|
1319
1341
|
def register_page_converter(self, converter: DocumentConverter) -> None:
|
|
1320
1342
|
"""Register a page text converter."""
|
|
1321
1343
|
self._page_converters.insert(0, converter)
|
|
1344
|
+
|
|
1345
|
+
|
|
1346
|
+
def try_parse_image(image_path: str):
|
|
1347
|
+
"""
|
|
1348
|
+
根据filter_utils判断是否需要解析图片,如果需要则调用ImageLoader.image_to_markdown。
|
|
1349
|
+
解析失败会自动捕获异常。
|
|
1350
|
+
"""
|
|
1351
|
+
if filter_utils.should_parse_image(image_path):
|
|
1352
|
+
try:
|
|
1353
|
+
_ = ImageLoader.image_to_markdown(image_path, llm=None, engine="paddle")
|
|
1354
|
+
except Exception:
|
|
1355
|
+
import traceback; traceback.print_exc()
|
|
1356
|
+
|
autocoder/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.348"
|
|
1
|
+
__version__ = "0.1.349"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|