auto-coder 0.1.205__tar.gz → 0.1.206__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic. Click here for more details.
- {auto_coder-0.1.205 → auto_coder-0.1.206}/PKG-INFO +1 -1
- {auto_coder-0.1.205 → auto_coder-0.1.206}/setup.py +2 -1
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/auto_coder.egg-info/PKG-INFO +1 -1
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/auto_coder.egg-info/SOURCES.txt +6 -0
- auto_coder-0.1.206/src/autocoder/agent/auto_filegroup.py +202 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/auto_coder_rag.py +165 -33
- auto_coder-0.1.206/src/autocoder/benchmark.py +135 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/chat_auto_coder.py +1 -0
- auto_coder-0.1.206/src/autocoder/common/chunk_validation.py +91 -0
- auto_coder-0.1.206/src/autocoder/common/recall_validation.py +58 -0
- auto_coder-0.1.206/src/autocoder/data/tokenizer.json +199865 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/token_counter.py +3 -3
- auto_coder-0.1.206/src/autocoder/utils/operate_config_api.py +148 -0
- auto_coder-0.1.206/src/autocoder/version.py +1 -0
- auto_coder-0.1.205/src/autocoder/version.py +0 -1
- {auto_coder-0.1.205 → auto_coder-0.1.206}/LICENSE +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/README.md +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/setup.cfg +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/auto_coder.egg-info/dependency_links.txt +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/auto_coder.egg-info/entry_points.txt +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/auto_coder.egg-info/requires.txt +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/auto_coder.egg-info/top_level.txt +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/agent/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/agent/auto_tool.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/agent/coder.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/agent/designer.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/agent/planner.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/agent/project_reader.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/auto_coder.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/auto_coder_lang.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/auto_coder_server.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/chat/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/chat_auto_coder_lang.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/command_args.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/JupyterClient.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/ShellClient.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/anything2images.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/audio.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/cleaner.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_execute.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_generate.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_generate_diff.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_generate_editblock.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_generate_strict_diff.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_merge.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_merge_diff.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_merge_editblock.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/code_auto_merge_strict_diff.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/command_completer.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/command_generator.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/command_templates.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/const.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/git_utils.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/image_to_page.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/interpreter.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/llm_rerank.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/screenshots.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/search.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/search_replace.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/sys_prompt.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/text.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/common/types.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/db/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/db/store.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/dispacher/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/dispacher/actions/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/dispacher/actions/action.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/dispacher/actions/copilot.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/dispacher/actions/plugins/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/dispacher/actions/plugins/action_regex_project.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/dispacher/actions/plugins/action_translate.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/index/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/index/for_command.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/index/index.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/index/symbols_utils.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/lang.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/pyproject/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/api_server.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/cache/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/cache/base_cache.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/cache/byzer_storage_cache.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/cache/file_monitor_cache.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/cache/simple_cache.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/doc_filter.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/document_retriever.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/llm_wrapper.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/loaders/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/loaders/docx_loader.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/loaders/excel_loader.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/loaders/pdf_loader.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/loaders/ppt_loader.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/long_context_rag.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/rag_config.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/rag_entry.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/raw_rag.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/relevant_utils.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/simple_directory_reader.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/simple_rag.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/stream_event/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/stream_event/event_writer.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/stream_event/types.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/token_checker.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/token_limiter.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/types.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/utils.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/rag/variable_holder.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/regexproject/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/suffixproject/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/tsproject/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/__init__.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/_markitdown.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/conversation_store.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/llm_client_interceptors.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/log_capture.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/multi_turn.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/print_table.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/queue_communicate.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/request_event_queue.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/request_queue.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/rest.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/src/autocoder/utils/tests.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/tests/test_action_regex_project.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/tests/test_chat_auto_coder.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/tests/test_code_auto_merge_editblock.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/tests/test_command_completer.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/tests/test_planner.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/tests/test_queue_communicate.py +0 -0
- {auto_coder-0.1.205 → auto_coder-0.1.206}/tests/test_symbols_utils.py +0 -0
|
@@ -12,12 +12,14 @@ src/autocoder/auto_coder.py
|
|
|
12
12
|
src/autocoder/auto_coder_lang.py
|
|
13
13
|
src/autocoder/auto_coder_rag.py
|
|
14
14
|
src/autocoder/auto_coder_server.py
|
|
15
|
+
src/autocoder/benchmark.py
|
|
15
16
|
src/autocoder/chat_auto_coder.py
|
|
16
17
|
src/autocoder/chat_auto_coder_lang.py
|
|
17
18
|
src/autocoder/command_args.py
|
|
18
19
|
src/autocoder/lang.py
|
|
19
20
|
src/autocoder/version.py
|
|
20
21
|
src/autocoder/agent/__init__.py
|
|
22
|
+
src/autocoder/agent/auto_filegroup.py
|
|
21
23
|
src/autocoder/agent/auto_tool.py
|
|
22
24
|
src/autocoder/agent/coder.py
|
|
23
25
|
src/autocoder/agent/designer.py
|
|
@@ -29,6 +31,7 @@ src/autocoder/common/ShellClient.py
|
|
|
29
31
|
src/autocoder/common/__init__.py
|
|
30
32
|
src/autocoder/common/anything2images.py
|
|
31
33
|
src/autocoder/common/audio.py
|
|
34
|
+
src/autocoder/common/chunk_validation.py
|
|
32
35
|
src/autocoder/common/cleaner.py
|
|
33
36
|
src/autocoder/common/code_auto_execute.py
|
|
34
37
|
src/autocoder/common/code_auto_generate.py
|
|
@@ -47,12 +50,14 @@ src/autocoder/common/git_utils.py
|
|
|
47
50
|
src/autocoder/common/image_to_page.py
|
|
48
51
|
src/autocoder/common/interpreter.py
|
|
49
52
|
src/autocoder/common/llm_rerank.py
|
|
53
|
+
src/autocoder/common/recall_validation.py
|
|
50
54
|
src/autocoder/common/screenshots.py
|
|
51
55
|
src/autocoder/common/search.py
|
|
52
56
|
src/autocoder/common/search_replace.py
|
|
53
57
|
src/autocoder/common/sys_prompt.py
|
|
54
58
|
src/autocoder/common/text.py
|
|
55
59
|
src/autocoder/common/types.py
|
|
60
|
+
src/autocoder/data/tokenizer.json
|
|
56
61
|
src/autocoder/db/__init__.py
|
|
57
62
|
src/autocoder/db/store.py
|
|
58
63
|
src/autocoder/dispacher/__init__.py
|
|
@@ -107,6 +112,7 @@ src/autocoder/utils/conversation_store.py
|
|
|
107
112
|
src/autocoder/utils/llm_client_interceptors.py
|
|
108
113
|
src/autocoder/utils/log_capture.py
|
|
109
114
|
src/autocoder/utils/multi_turn.py
|
|
115
|
+
src/autocoder/utils/operate_config_api.py
|
|
110
116
|
src/autocoder/utils/print_table.py
|
|
111
117
|
src/autocoder/utils/queue_communicate.py
|
|
112
118
|
src/autocoder/utils/request_event_queue.py
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from typing import List, Dict, Optional, Any, Tuple
|
|
2
|
+
import os
|
|
3
|
+
import yaml
|
|
4
|
+
from loguru import logger
|
|
5
|
+
import byzerllm
|
|
6
|
+
import pydantic
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FileGroup(pydantic.BaseModel):
    """One group of related development tasks.

    The fields mirror the JSON object that the grouping prompt in this file
    asks the model to return for each group.
    """

    # Short, descriptive label of the group.
    name: str
    # Summary of the goal the grouped tasks have in common.
    description: str
    # The task queries that were assigned to this group.
    queries: List[str]
    # File paths involved in the tasks of this group.
    urls: List[str]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class FileGroups(pydantic.BaseModel):
    """Container for every group produced by one grouping run.

    Used as the structured return type when the grouping prompt is executed.
    """

    # All groups returned for the analysed tasks.
    groups: List[FileGroup]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_yaml_config(yaml_file: str) -> Dict:
    """Load a YAML configuration file and return its top-level mapping.

    Args:
        yaml_file: Path of the YAML file to read (decoded as UTF-8).

    Returns:
        The parsed mapping. An empty dict is returned when the file cannot be
        read or parsed, when it is empty, or when its top-level value is not a
        mapping, so callers can always use ``.get(...)`` on the result.
    """
    try:
        with open(yaml_file, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except Exception as e:
        # Deliberately best-effort: a broken action file is logged and skipped
        # by the caller instead of aborting the whole run.
        logger.error(f"Error loading yaml file {yaml_file}: {str(e)}")
        return {}

    # safe_load yields None for an empty document.
    if config is None:
        return {}

    # A list or scalar at the top level would break callers that expect a
    # mapping; treat it like an unusable file.
    if not isinstance(config, dict):
        logger.error(
            f"Error loading yaml file {yaml_file}: "
            f"expected a mapping at the top level, got {type(config).__name__}"
        )
        return {}

    return config
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AutoFileGroup:
    """Group recent action files of a project into related task groups.

    Action files are the ``NNN_*.yml`` files in ``<project_dir>/actions``.
    Their ``query`` and ``urls`` entries (and, unless ``skip_diff`` is set,
    the diff of the matching git commit) are rendered into a prompt and sent
    to the LLM, which returns the grouping.
    """

    def __init__(self, llm: byzerllm.ByzerLLM,
                 project_dir: str,
                 skip_diff: bool = False,
                 group_num_limit: int = 10,
                 file_size_limit: int = 100):
        """Initialise the grouper.

        Args:
            llm: LLM client used to run the grouping prompt.
            project_dir: Project root; action files are read from its
                ``actions`` sub-directory and git history from the root.
            skip_diff: When True, no commit diffs are collected.
            group_num_limit: Maximum number of groups requested in the prompt.
            file_size_limit: Maximum number of action files to consider
                (those with the highest sequence numbers are kept).
        """
        self.project_dir = project_dir
        self.actions_dir = os.path.join(project_dir, "actions")
        self.llm = llm
        self.file_size_limit = file_size_limit
        self.skip_diff = skip_diff
        self.group_num_limit = group_num_limit

    # NOTE: the docstring of this method is the prompt template itself (Jinja
    # syntax, consumed through @byzerllm.prompt()), not documentation. It is
    # runtime text and must not be translated or reworded casually. The dict
    # returned by the method supplies the extra template variable
    # ``group_num_limit``.
    @byzerllm.prompt()
    def group_by_similarity(self, querie_with_urls: List[Tuple[str, List[str], str]]) -> str:
        """
        分析多个开发任务的关联性,将相互关联的任务进行分组。

        输入说明:
        querie_with_urls 包含多个开发任务信息,每个任务由以下部分组成:
        1. query: 任务需求描述
        2. urls: 需要修改的文件路径列表
        3. diff: Git diff信息,展示具体的代码修改

        示例数据:
        <queries>
        {% for query,urls,diff in querie_with_urls %}
        ## {{ query }}

        修改的文件:
        {% for url in urls %}
        - {{ url }}
        {% endfor %}
        {% if diff %}

        代码变更:
        ```diff
        {{ diff }}
        ```
        {% endif %}
        {% endfor %}
        </queries>

        分组规则:
        1. 每个分组至少包含2个query
        2. 根据以下维度判断任务的关联性:
           - 功能相似性:任务是否属于同一个功能模块
           - 文件关联:修改的文件是否有重叠或紧密关联
           - 代码依赖:代码修改是否存在依赖关系
           - 业务目的:任务的最终业务目标是否一致
        3. 输出的分组数量最多不超过 {{ group_num_limit }}

        期望输出:
        返回符合以下格式的JSON:
        {
            "groups": [
                {
                    "name": "分组名称",
                    "description": "分组的功能概述,描述该组任务的共同目标",
                    "queries": ["相关的query1", "相关的query2"],
                    "urls": ["相关的文件1", "相关的文件2"]
                }
            ]
        }

        特别说明:
        1. 分组名称应该简洁且具有描述性,能反映该组任务的主要特征
        2. 分组描述应突出任务间的共同点和关联性
        3. 返回的urls应该是该组任务涉及的所有相关文件的并集
        """
        return {
            "group_num_limit": self.group_num_limit
        }

    @staticmethod
    def _action_sequence(file_name: str) -> Optional[int]:
        """Return the numeric prefix of an action file name, or None.

        A name qualifies when its first three characters are digits, it
        contains an underscore and ends with ``.yml``. Names whose prefix
        before the first underscore is not an integer (e.g. ``001abc_x.yml``)
        are rejected instead of raising.
        """
        if not (file_name[:3].isdigit() and "_" in file_name and file_name.endswith('.yml')):
            return None
        try:
            return int(file_name.split("_")[0])
        except ValueError:
            return None

    @staticmethod
    def _find_commit_diff(repo, response_id: str) -> str:
        """Return the diff of the first iterated commit mentioning ``response_id``.

        Commits are scanned in the order produced by ``repo.iter_commits()``.
        For a commit with parents the diff against its first parent is
        returned; for a root commit the output of ``git show`` is used.
        Errors are logged and result in an empty string.
        """
        import git

        try:
            for commit in repo.iter_commits():
                if response_id in commit.message:
                    if commit.parents:
                        return repo.git.diff(commit.parents[0].hexsha, commit.hexsha)
                    return repo.git.show(commit.hexsha)
        except git.exc.GitCommandError as e:
            logger.error(f"Git命令执行错误: {str(e)}")
        except Exception as e:
            logger.error(f"获取commit diff时出错: {str(e)}")
        return ""

    def group_files(self) -> List["FileGroup"]:
        """Collect recent action files and let the LLM group them.

        Returns:
            The ``groups`` attribute of the prompt result (requested with
            return type ``FileGroups``), or an empty list when there is
            nothing to group or the LLM call fails.
        """
        import hashlib

        if not os.path.isdir(self.actions_dir):
            logger.warning(f"Actions directory not found: {self.actions_dir}")
            return []

        # Keep only well-formed action files, newest (highest sequence) first,
        # limited to file_size_limit entries.
        action_files = [
            f for f in os.listdir(self.actions_dir)
            if self._action_sequence(f) is not None
        ]
        action_files.sort(key=self._action_sequence)
        action_files.reverse()
        action_files = action_files[:self.file_size_limit]

        querie_with_urls_and_diffs = []

        # The repository is only needed for diffs; opening it lazily lets
        # skip_diff=True work in directories that are not git repositories.
        repo = None
        if not self.skip_diff:
            import git
            repo = git.Repo(self.project_dir)

        # Gather query, urls and the matching commit diff for every file.
        for yaml_file in action_files:
            yaml_path = os.path.join(self.actions_dir, yaml_file)
            config = load_yaml_config(yaml_path)

            if not config:
                continue

            query = config.get('query', '')
            urls = config.get('urls', [])

            if not (query and urls):
                continue

            commit_diff = ""
            if repo is not None:
                # The commit is located through an id built from the file
                # name and the MD5 of the file's current content.
                with open(yaml_path, 'rb') as f:
                    file_md5 = hashlib.md5(f.read()).hexdigest()
                response_id = f"auto_coder_{yaml_file}_{file_md5}"
                commit_diff = self._find_commit_diff(repo, response_id)

            querie_with_urls_and_diffs.append((query, urls, commit_diff))

        if not querie_with_urls_and_diffs:
            return []

        # Ask the LLM for the grouping.
        try:
            result = self.group_by_similarity.with_llm(self.llm).with_return_type(FileGroups).run(
                querie_with_urls=querie_with_urls_and_diffs
            )
            return result.groups
        except Exception as e:
            import traceback
            traceback.print_exc()
            logger.error(f"Error during grouping: {str(e)}")
            return []
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def create_file_groups(actions_dir: str,
                       llm: Optional[byzerllm.ByzerLLM] = None) -> List[Dict]:
    """Convenience wrapper that groups the action files in ``actions_dir``.

    Args:
        actions_dir: Directory that contains the action YAML files.
        llm: LLM client used for grouping. It is required; the parameter is
            optional in the signature only to keep the original one-argument
            call shape importable.

    Returns:
        Whatever ``AutoFileGroup.group_files()`` returns: the groups produced
        by the LLM, or an empty list.

    Raises:
        ValueError: If no ``llm`` is supplied.
    """
    if llm is None:
        # AutoFileGroup needs an LLM client; fail with a clear message rather
        # than the TypeError the old AutoFileGroup(actions_dir) call produced.
        raise ValueError("create_file_groups requires an llm to group action files")

    # AutoFileGroup takes the project root and derives "<root>/actions".
    # Assumes the git repository root is the parent of actions_dir; verify
    # against callers.
    project_dir = os.path.dirname(os.path.abspath(actions_dir))
    grouper = AutoFileGroup(llm, project_dir)
    # Honour the directory given by the caller even if it is not literally
    # named "actions".
    grouper.actions_dir = actions_dir
    return grouper.group_files()
|
|
@@ -18,8 +18,10 @@ from rich.console import Console
|
|
|
18
18
|
from rich.table import Table
|
|
19
19
|
import os
|
|
20
20
|
from loguru import logger
|
|
21
|
+
import asyncio
|
|
21
22
|
|
|
22
23
|
from autocoder.rag.document_retriever import process_file_local
|
|
24
|
+
import pkg_resources
|
|
23
25
|
from autocoder.rag.token_counter import TokenCounter
|
|
24
26
|
|
|
25
27
|
if platform.system() == "Windows":
|
|
@@ -139,6 +141,13 @@ def initialize_system():
|
|
|
139
141
|
|
|
140
142
|
def main(input_args: Optional[List[str]] = None):
|
|
141
143
|
|
|
144
|
+
try:
|
|
145
|
+
tokenizer_path = pkg_resources.resource_filename(
|
|
146
|
+
"autocoder", "data/tokenizer.json"
|
|
147
|
+
)
|
|
148
|
+
except FileNotFoundError:
|
|
149
|
+
tokenizer_path = None
|
|
150
|
+
|
|
142
151
|
system_lang, _ = locale.getdefaultlocale()
|
|
143
152
|
lang = "zh" if system_lang and system_lang.startswith("zh") else "en"
|
|
144
153
|
desc = lang_desc[lang]
|
|
@@ -146,18 +155,38 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
146
155
|
subparsers = parser.add_subparsers(dest="command", help="Available commands")
|
|
147
156
|
|
|
148
157
|
# Build hybrid index command
|
|
149
|
-
build_index_parser = subparsers.add_parser(
|
|
150
|
-
|
|
158
|
+
build_index_parser = subparsers.add_parser(
|
|
159
|
+
"build_hybrid_index", help="Build hybrid index for RAG"
|
|
160
|
+
)
|
|
161
|
+
build_index_parser.add_argument(
|
|
162
|
+
"--quick", action="store_true", help="Skip system initialization"
|
|
163
|
+
)
|
|
151
164
|
build_index_parser.add_argument("--file", default="", help=desc["file"])
|
|
152
|
-
build_index_parser.add_argument(
|
|
153
|
-
|
|
165
|
+
build_index_parser.add_argument(
|
|
166
|
+
"--model", default="deepseek_chat", help=desc["model"]
|
|
167
|
+
)
|
|
168
|
+
build_index_parser.add_argument(
|
|
169
|
+
"--index_model", default="", help=desc["index_model"]
|
|
170
|
+
)
|
|
154
171
|
build_index_parser.add_argument("--emb_model", default="", help=desc["emb_model"])
|
|
155
|
-
build_index_parser.add_argument(
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
build_index_parser.add_argument(
|
|
159
|
-
|
|
160
|
-
|
|
172
|
+
build_index_parser.add_argument(
|
|
173
|
+
"--ray_address", default="auto", help=desc["ray_address"]
|
|
174
|
+
)
|
|
175
|
+
build_index_parser.add_argument(
|
|
176
|
+
"--required_exts", default="", help=desc["doc_build_parse_required_exts"]
|
|
177
|
+
)
|
|
178
|
+
build_index_parser.add_argument(
|
|
179
|
+
"--source_dir", default=".", help="Source directory path"
|
|
180
|
+
)
|
|
181
|
+
build_index_parser.add_argument(
|
|
182
|
+
"--tokenizer_path", default=tokenizer_path, help="Path to tokenizer file"
|
|
183
|
+
)
|
|
184
|
+
build_index_parser.add_argument(
|
|
185
|
+
"--doc_dir", default="", help="Document directory path"
|
|
186
|
+
)
|
|
187
|
+
build_index_parser.add_argument(
|
|
188
|
+
"--enable_hybrid_index", action="store_true", help="Enable hybrid index"
|
|
189
|
+
)
|
|
161
190
|
|
|
162
191
|
# Serve command
|
|
163
192
|
serve_parser = subparsers.add_parser("serve", help="Start the RAG server")
|
|
@@ -220,7 +249,7 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
220
249
|
serve_parser.add_argument("--ssl_certfile", default="", help="")
|
|
221
250
|
serve_parser.add_argument("--response_role", default="assistant", help="")
|
|
222
251
|
serve_parser.add_argument("--doc_dir", default="", help="")
|
|
223
|
-
serve_parser.add_argument("--tokenizer_path", default=
|
|
252
|
+
serve_parser.add_argument("--tokenizer_path", default=tokenizer_path, help="")
|
|
224
253
|
serve_parser.add_argument(
|
|
225
254
|
"--collections", default="", help="Collection name for indexing"
|
|
226
255
|
)
|
|
@@ -282,7 +311,7 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
282
311
|
|
|
283
312
|
serve_parser.add_argument(
|
|
284
313
|
"--without_contexts",
|
|
285
|
-
action="store_true",
|
|
314
|
+
action="store_true",
|
|
286
315
|
help="Whether to return responses without contexts. only works when pro plugin is installed",
|
|
287
316
|
)
|
|
288
317
|
|
|
@@ -304,14 +333,70 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
304
333
|
help="The model used for question answering",
|
|
305
334
|
)
|
|
306
335
|
|
|
336
|
+
# Benchmark command
|
|
337
|
+
benchmark_parser = subparsers.add_parser(
|
|
338
|
+
"benchmark", help="Benchmark LLM client performance"
|
|
339
|
+
)
|
|
340
|
+
benchmark_parser.add_argument(
|
|
341
|
+
"--model", default="deepseek_chat", help="Model to benchmark"
|
|
342
|
+
)
|
|
343
|
+
benchmark_parser.add_argument(
|
|
344
|
+
"--parallel", type=int, default=10, help="Number of parallel requests"
|
|
345
|
+
)
|
|
346
|
+
benchmark_parser.add_argument(
|
|
347
|
+
"--rounds", type=int, default=1, help="Number of rounds to run"
|
|
348
|
+
)
|
|
349
|
+
benchmark_parser.add_argument(
|
|
350
|
+
"--type",
|
|
351
|
+
choices=["openai", "byzerllm"],
|
|
352
|
+
default="byzerllm",
|
|
353
|
+
help="Client type to benchmark",
|
|
354
|
+
)
|
|
355
|
+
benchmark_parser.add_argument(
|
|
356
|
+
"--api_key", default="", help="OpenAI API key for OpenAI client"
|
|
357
|
+
)
|
|
358
|
+
benchmark_parser.add_argument(
|
|
359
|
+
"--base_url", default="", help="Base URL for OpenAI client"
|
|
360
|
+
)
|
|
361
|
+
|
|
307
362
|
# Tools command
|
|
308
363
|
tools_parser = subparsers.add_parser("tools", help="Various tools")
|
|
309
364
|
tools_subparsers = tools_parser.add_subparsers(dest="tool", help="Available tools")
|
|
310
365
|
|
|
311
366
|
# Count tool
|
|
312
367
|
count_parser = tools_subparsers.add_parser("count", help="Count tokens in a file")
|
|
368
|
+
|
|
369
|
+
# Recall validation tool
|
|
370
|
+
recall_parser = tools_subparsers.add_parser(
|
|
371
|
+
"recall", help="Validate recall model performance"
|
|
372
|
+
)
|
|
373
|
+
recall_parser.add_argument(
|
|
374
|
+
"--model", required=True, help="Model to use for recall validation"
|
|
375
|
+
)
|
|
376
|
+
recall_parser.add_argument(
|
|
377
|
+
"--content", default=None, help="Content to validate against"
|
|
378
|
+
)
|
|
379
|
+
recall_parser.add_argument(
|
|
380
|
+
"--query", default=None, help="Query to use for validation"
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
# Add chunk model validation tool
|
|
384
|
+
chunk_parser = tools_subparsers.add_parser(
|
|
385
|
+
"chunk", help="Validate chunk model performance"
|
|
386
|
+
)
|
|
387
|
+
chunk_parser.add_argument(
|
|
388
|
+
"--model", required=True, help="Model to use for chunk validation"
|
|
389
|
+
)
|
|
390
|
+
chunk_parser.add_argument(
|
|
391
|
+
"--content", default=None, help="Content to validate against"
|
|
392
|
+
)
|
|
393
|
+
chunk_parser.add_argument(
|
|
394
|
+
"--query", default=None, help="Query to use for validation"
|
|
395
|
+
)
|
|
313
396
|
count_parser.add_argument(
|
|
314
|
-
"--tokenizer_path",
|
|
397
|
+
"--tokenizer_path",
|
|
398
|
+
default=tokenizer_path,
|
|
399
|
+
help="Path to the tokenizer",
|
|
315
400
|
)
|
|
316
401
|
count_parser.add_argument(
|
|
317
402
|
"--file", required=True, help="Path to the file to count tokens"
|
|
@@ -319,7 +404,22 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
319
404
|
|
|
320
405
|
args = parser.parse_args(input_args)
|
|
321
406
|
|
|
322
|
-
if args.command == "
|
|
407
|
+
if args.command == "benchmark":
|
|
408
|
+
from .benchmark import benchmark_openai, benchmark_byzerllm
|
|
409
|
+
|
|
410
|
+
if args.type == "openai":
|
|
411
|
+
if not args.api_key:
|
|
412
|
+
print("OpenAI API key is required for OpenAI client benchmark")
|
|
413
|
+
return
|
|
414
|
+
asyncio.run(
|
|
415
|
+
benchmark_openai(
|
|
416
|
+
args.model, args.parallel, args.api_key, args.base_url, args.rounds
|
|
417
|
+
)
|
|
418
|
+
)
|
|
419
|
+
else: # byzerllm
|
|
420
|
+
benchmark_byzerllm(args.model, args.parallel, args.rounds)
|
|
421
|
+
|
|
422
|
+
elif args.command == "serve":
|
|
323
423
|
if not args.quick:
|
|
324
424
|
initialize_system()
|
|
325
425
|
server_args = ServerArgs(
|
|
@@ -337,14 +437,17 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
337
437
|
}
|
|
338
438
|
)
|
|
339
439
|
|
|
340
|
-
if auto_coder_args.enable_hybrid_index:
|
|
341
|
-
# 尝试连接storage
|
|
440
|
+
if auto_coder_args.enable_hybrid_index:
|
|
441
|
+
# 尝试连接storage
|
|
342
442
|
try:
|
|
343
443
|
from byzerllm.apps.byzer_storage.simple_api import ByzerStorage
|
|
444
|
+
|
|
344
445
|
storage = ByzerStorage("byzerai_store", "rag", "files")
|
|
345
446
|
storage.retrieval.cluster_info("byzerai_store")
|
|
346
447
|
except Exception as e:
|
|
347
|
-
logger.error(
|
|
448
|
+
logger.error(
|
|
449
|
+
"When enable_hybrid_index is true, ByzerStorage must be started"
|
|
450
|
+
)
|
|
348
451
|
logger.error("Please run 'byzerllm storage start' first")
|
|
349
452
|
return
|
|
350
453
|
else:
|
|
@@ -369,12 +472,14 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
369
472
|
llm.setup_sub_client("qa_model", qa_model)
|
|
370
473
|
|
|
371
474
|
# 当启用hybrid_index时,检查必要的组件
|
|
372
|
-
if auto_coder_args.enable_hybrid_index:
|
|
475
|
+
if auto_coder_args.enable_hybrid_index:
|
|
373
476
|
if not llm.is_model_exist("emb"):
|
|
374
|
-
logger.error(
|
|
477
|
+
logger.error(
|
|
478
|
+
"When enable_hybrid_index is true, an 'emb' model must be deployed"
|
|
479
|
+
)
|
|
375
480
|
return
|
|
376
481
|
llm.setup_default_emb_model_name("emb")
|
|
377
|
-
|
|
482
|
+
|
|
378
483
|
if server_args.doc_dir:
|
|
379
484
|
auto_coder_args.rag_type = "simple"
|
|
380
485
|
rag = RAGFactory.get_rag(
|
|
@@ -391,7 +496,7 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
391
496
|
elif args.command == "build_hybrid_index":
|
|
392
497
|
if not args.quick:
|
|
393
498
|
initialize_system()
|
|
394
|
-
|
|
499
|
+
|
|
395
500
|
auto_coder_args = AutoCoderArgs(
|
|
396
501
|
**{
|
|
397
502
|
arg: getattr(args, arg)
|
|
@@ -402,25 +507,30 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
402
507
|
|
|
403
508
|
auto_coder_args.enable_hybrid_index = True
|
|
404
509
|
auto_coder_args.rag_type = "simple"
|
|
405
|
-
|
|
510
|
+
|
|
406
511
|
try:
|
|
407
512
|
from byzerllm.apps.byzer_storage.simple_api import ByzerStorage
|
|
513
|
+
|
|
408
514
|
storage = ByzerStorage("byzerai_store", "rag", "files")
|
|
409
515
|
storage.retrieval.cluster_info("byzerai_store")
|
|
410
516
|
except Exception as e:
|
|
411
|
-
logger.error(
|
|
517
|
+
logger.error(
|
|
518
|
+
"When enable_hybrid_index is true, ByzerStorage must be started"
|
|
519
|
+
)
|
|
412
520
|
logger.error("Please run 'byzerllm storage start' first")
|
|
413
521
|
return
|
|
414
|
-
|
|
522
|
+
|
|
415
523
|
llm = byzerllm.ByzerLLM()
|
|
416
524
|
llm.setup_default_model_name(args.model)
|
|
417
525
|
|
|
418
526
|
# 当启用hybrid_index时,检查必要的组件
|
|
419
|
-
if auto_coder_args.enable_hybrid_index:
|
|
527
|
+
if auto_coder_args.enable_hybrid_index:
|
|
420
528
|
if not llm.is_model_exist("emb"):
|
|
421
|
-
logger.error(
|
|
529
|
+
logger.error(
|
|
530
|
+
"When enable_hybrid_index is true, an 'emb' model must be deployed"
|
|
531
|
+
)
|
|
422
532
|
return
|
|
423
|
-
llm.setup_default_emb_model_name("emb")
|
|
533
|
+
llm.setup_default_emb_model_name("emb")
|
|
424
534
|
|
|
425
535
|
rag = RAGFactory.get_rag(
|
|
426
536
|
llm=llm,
|
|
@@ -428,19 +538,41 @@ def main(input_args: Optional[List[str]] = None):
|
|
|
428
538
|
path=args.doc_dir,
|
|
429
539
|
tokenizer_path=args.tokenizer_path,
|
|
430
540
|
)
|
|
431
|
-
|
|
541
|
+
|
|
432
542
|
if hasattr(rag.document_retriever, "cacher"):
|
|
433
543
|
rag.document_retriever.cacher.build_cache()
|
|
434
544
|
else:
|
|
435
|
-
logger.error(
|
|
545
|
+
logger.error(
|
|
546
|
+
"The document retriever does not support hybrid index building"
|
|
547
|
+
)
|
|
548
|
+
|
|
549
|
+
elif args.command == "tools":
|
|
550
|
+
if args.tool == "count":
|
|
551
|
+
# auto-coder.rag tools count --tokenizer_path /Users/allwefantasy/Downloads/tokenizer.json --file /Users/allwefantasy/data/yum/schema/schema.xlsx
|
|
552
|
+
count_tokens(args.tokenizer_path, args.file)
|
|
553
|
+
elif args.tool == "recall":
|
|
554
|
+
from .common.recall_validation import validate_recall
|
|
555
|
+
|
|
556
|
+
llm = byzerllm.ByzerLLM.from_default_model(args.model)
|
|
557
|
+
|
|
558
|
+
content = None if not args.content else [args.content]
|
|
559
|
+
result = validate_recall(llm, content=content, query=args.query)
|
|
560
|
+
print(f"Recall Validation Result:\n{result}")
|
|
561
|
+
elif args.tool == "chunk":
|
|
562
|
+
from .common.chunk_validation import validate_chunk
|
|
436
563
|
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
564
|
+
llm = byzerllm.ByzerLLM.from_default_model(args.model)
|
|
565
|
+
content = None if not args.content else [args.content]
|
|
566
|
+
result = validate_chunk(llm, content=content, query=args.query)
|
|
567
|
+
print(f"Chunk Model Validation Result:\n{result}")
|
|
440
568
|
|
|
441
569
|
|
|
442
570
|
def count_tokens(tokenizer_path: str, file_path: str):
|
|
443
|
-
|
|
571
|
+
from autocoder.rag.variable_holder import VariableHolder
|
|
572
|
+
from tokenizers import Tokenizer
|
|
573
|
+
VariableHolder.TOKENIZER_PATH = tokenizer_path
|
|
574
|
+
VariableHolder.TOKENIZER_MODEL = Tokenizer.from_file(tokenizer_path)
|
|
575
|
+
token_counter = TokenCounter(tokenizer_path)
|
|
444
576
|
source_codes = process_file_local(file_path)
|
|
445
577
|
|
|
446
578
|
console = Console()
|