auto-coder 0.1.374__py3-none-any.whl → 0.1.376__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic.
- {auto_coder-0.1.374.dist-info → auto_coder-0.1.376.dist-info}/METADATA +2 -2
- {auto_coder-0.1.374.dist-info → auto_coder-0.1.376.dist-info}/RECORD +27 -57
- autocoder/agent/base_agentic/base_agent.py +202 -52
- autocoder/agent/base_agentic/default_tools.py +38 -6
- autocoder/agent/base_agentic/tools/list_files_tool_resolver.py +83 -43
- autocoder/agent/base_agentic/tools/read_file_tool_resolver.py +88 -25
- autocoder/agent/base_agentic/tools/replace_in_file_tool_resolver.py +171 -62
- autocoder/agent/base_agentic/tools/search_files_tool_resolver.py +101 -56
- autocoder/agent/base_agentic/tools/talk_to_group_tool_resolver.py +5 -0
- autocoder/agent/base_agentic/tools/talk_to_tool_resolver.py +5 -0
- autocoder/agent/base_agentic/tools/write_to_file_tool_resolver.py +145 -32
- autocoder/auto_coder_rag.py +80 -11
- autocoder/models.py +2 -2
- autocoder/rag/agentic_rag.py +217 -0
- autocoder/rag/cache/local_duckdb_storage_cache.py +63 -33
- autocoder/rag/conversation_to_queries.py +37 -5
- autocoder/rag/long_context_rag.py +161 -41
- autocoder/rag/tools/__init__.py +10 -0
- autocoder/rag/tools/recall_tool.py +163 -0
- autocoder/rag/tools/search_tool.py +126 -0
- autocoder/rag/types.py +36 -0
- autocoder/utils/_markitdown.py +59 -13
- autocoder/version.py +1 -1
- autocoder/agent/agentic_edit.py +0 -833
- autocoder/agent/agentic_edit_tools/__init__.py +0 -28
- autocoder/agent/agentic_edit_tools/ask_followup_question_tool_resolver.py +0 -32
- autocoder/agent/agentic_edit_tools/attempt_completion_tool_resolver.py +0 -29
- autocoder/agent/agentic_edit_tools/base_tool_resolver.py +0 -29
- autocoder/agent/agentic_edit_tools/execute_command_tool_resolver.py +0 -84
- autocoder/agent/agentic_edit_tools/list_code_definition_names_tool_resolver.py +0 -75
- autocoder/agent/agentic_edit_tools/list_files_tool_resolver.py +0 -62
- autocoder/agent/agentic_edit_tools/plan_mode_respond_tool_resolver.py +0 -30
- autocoder/agent/agentic_edit_tools/read_file_tool_resolver.py +0 -36
- autocoder/agent/agentic_edit_tools/replace_in_file_tool_resolver.py +0 -95
- autocoder/agent/agentic_edit_tools/search_files_tool_resolver.py +0 -70
- autocoder/agent/agentic_edit_tools/use_mcp_tool_resolver.py +0 -55
- autocoder/agent/agentic_edit_tools/write_to_file_tool_resolver.py +0 -98
- autocoder/agent/agentic_edit_types.py +0 -124
- autocoder/auto_coder_lang.py +0 -60
- autocoder/auto_coder_rag_client_mcp.py +0 -170
- autocoder/auto_coder_rag_mcp.py +0 -193
- autocoder/common/llm_rerank.py +0 -84
- autocoder/common/model_speed_test.py +0 -392
- autocoder/common/v2/agent/agentic_edit_conversation.py +0 -188
- autocoder/common/v2/agent/ignore_utils.py +0 -50
- autocoder/dispacher/actions/plugins/action_translate.py +0 -214
- autocoder/ignorefiles/__init__.py +0 -4
- autocoder/ignorefiles/ignore_file_utils.py +0 -63
- autocoder/ignorefiles/test_ignore_file_utils.py +0 -91
- autocoder/linters/code_linter.py +0 -588
- autocoder/rag/loaders/test_image_loader.py +0 -209
- autocoder/rag/raw_rag.py +0 -96
- autocoder/rag/simple_directory_reader.py +0 -646
- autocoder/rag/simple_rag.py +0 -404
- autocoder/regex_project/__init__.py +0 -162
- autocoder/utils/coder.py +0 -125
- autocoder/utils/tests.py +0 -37
- {auto_coder-0.1.374.dist-info → auto_coder-0.1.376.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.374.dist-info → auto_coder-0.1.376.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.374.dist-info → auto_coder-0.1.376.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.374.dist-info → auto_coder-0.1.376.dist-info}/top_level.txt +0 -0
autocoder/rag/simple_rag.py
DELETED
@@ -1,404 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple
-from autocoder.common import SourceCode, AutoCoderArgs
-from autocoder.common.llm_rerank import LLMRerank
-from autocoder.rag.simple_directory_reader import AutoCoderSimpleDirectoryReader
-import fsspec
-
-from byzerllm.apps.llama_index.simple_retrieval import SimpleRetrieval
-from byzerllm.apps.llama_index.collection_manager import (
-    CollectionManager,
-    CollectionItem,
-)
-from byzerllm.utils.ray_utils import is_ray_in_client_mode
-
-from llama_index.core import QueryBundle, StorageContext
-from llama_index.core.readers.file.base import default_file_metadata_func
-from llama_index.core import VectorStoreIndex
-from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes
-from llama_index.core.base.llms.types import ChatMessage, MessageRole
-from llama_index.core.retrievers import AutoMergingRetriever
-from llama_index.core.query_engine import RetrieverQueryEngine, RouterQueryEngine
-from llama_index.core.tools import QueryEngineTool
-from llama_index.core.selectors import (
-    LLMSingleSelector,
-)
-
-import byzerllm
-from loguru import logger
-import hashlib
-import json
-from openai import OpenAI
-
-
-def file_metadata_func(
-    file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
-) -> Dict:
-    """Get some handy metadata from filesystem.
-
-    Args:
-        file_path: str: file path in str
-    """
-
-    def generate_file_md5(file_path: str) -> str:
-        md5_hash = hashlib.md5()
-        with open(file_path, "rb") as f:
-            for chunk in iter(lambda: f.read(4096), b""):
-                md5_hash.update(chunk)
-        return md5_hash.hexdigest()
-
-    return {
-        **default_file_metadata_func(file_path=file_path, fs=fs),
-        "md5": generate_file_md5(file_path=file_path),
-    }
-
-
-class SimpleRAG:
-    def __init__(self, llm: byzerllm.ByzerLLM, args: AutoCoderArgs, path: str) -> None:
-        from byzerllm.apps.llama_index import get_service_context, get_storage_context
-
-        self.llm = llm
-        self.args = args
-        self.path = path
-        self.collection_manager = CollectionManager(args.base_dir)
-
-        self.collections = self.args.collection
-        if self.args.collections:
-            self.collections = self.args.collections
-
-        self.collections = self.collections.split(",") if self.collections else []
-
-        if not self.collections:
-            logger.warning(
-                """No RAG collection is set, we will use the `default` collection.
-You can set the collection by passing the `--collections`argument in command line or set the `collections` attribute in the config file."""
-            )
-            self.collections = ["default"]
-
-        if not self.llm.default_emb_model_name:
-            raise ValueError("emb_model should be set")
-
-        self.retrieval = byzerllm.ByzerRetrieval()
-
-        if not is_ray_in_client_mode():
-            self.retrieval.launch_gateway()
-            self.service_context = get_service_context(self.llm)
-            self.storage_context_dict: Dict[str, StorageContext] = {}
-            for collection in self.collections:
-                self.storage_context_dict[collection] = get_storage_context(
-                    self.llm,
-                    self.retrieval,
-                    chunk_collection=collection,
-                    namespace=collection,
-                )
-
-            self.simple_retrieval_dict: Dict[str, SimpleRetrieval] = {}
-            for collection in self.collections:
-                self.simple_retrieval_dict[collection] = SimpleRetrieval(
-                    llm=llm, retrieval=self.retrieval, chunk_collection=collection
-                )
-        else:
-            if not args.rag_url:
-                raise ValueError(
-                    "You are in client mode, please provide the RAG URL. e.g. rag_url: http://localhost:8000/v1"
-                )
-
-            if not args.rag_url.startswith("http://"):
-                raise ValueError("The RAG URL should start with http://")
-
-            self.client = OpenAI(api_key=args.rag_token, base_url=args.rag_url)
-
-    def _get_indices(self) -> List[Tuple[CollectionItem, VectorStoreIndex]]:
-        indices = []
-        for collection in self.collections:
-            storage_context = self.storage_context_dict[collection]
-            index = VectorStoreIndex.from_vector_store(
-                vector_store=storage_context.vector_store,
-                service_context=self.service_context,
-            )
-            collection_item = self.collection_manager.get_collection(collection)
-            indices.append((collection_item, index))
-
-        return indices
-
-    def _get_query_engine(self, streaming: bool = False):
-        indices = self._get_indices()
-        retrievers = []
-
-        for collection_item, index in indices:
-            retriever = AutoMergingRetriever(
-                index.as_retriever(),
-                storage_context=self.storage_context_dict[collection_item.name],
-            )
-            retrievers.append(retriever)
-
-        query_engines = [
-            RetrieverQueryEngine.from_args(
-                retriever, service_context=self.service_context, streaming=streaming
-            )
-            for retriever in retrievers
-        ]
-
-        if len(query_engines) == 1:
-            return query_engines[0]
-
-        tools = []
-        for (collection_item, index), query_engine in zip(indices, query_engines):
-            tool = QueryEngineTool.from_defaults(
-                query_engine=query_engine,
-                description=collection_item.description,
-            )
-            tools.append(tool)
-
-        query_engine = RouterQueryEngine(
-            selector=LLMSingleSelector.from_defaults(
-                service_context=self.service_context
-            ),
-            query_engine_tools=tools,
-            llm=self.service_context.llm,
-            service_context=self.service_context,
-            verbose=True,
-        )
-        return query_engine
-
-    def _get_retriever(self):
-        indices = self._get_indices()
-        retrievers = []
-
-        for collection_item, index in indices:
-            retriever = AutoMergingRetriever(
-                index.as_retriever(),
-                storage_context=self.storage_context_dict[collection_item.name],
-            )
-            retrievers.append(retriever)
-
-        return retrievers
-
-    def stream_search(self, query: str):
-        query_bundle = QueryBundle(query_str=query)
-        query_engine = self._get_query_engine(streaming=True)
-        streaming_response = query_engine.query(query_bundle)
-        contexts = []
-        for node in streaming_response.source_nodes:
-            contexts.append(
-                {
-                    "raw_chunk": node.node.text,
-                    "doc_url": node.node.metadata["file_path"],
-                    "_id": node.node.id_,
-                }
-            )
-        return streaming_response.response_gen, contexts
-
-    def retrieve(self, query: str) -> List[Dict[str, Any]]:
-        query_bundle = QueryBundle(query_str=query)
-        retrievers = self._get_retriever()
-
-        result = []
-
-        for retriever in retrievers:
-            nodes = retriever.retrieve(query_bundle)
-
-            reranker = LLMRerank(llm=self.llm)
-            retrieved_nodes = reranker.postprocess_nodes(
-                nodes, query_bundle, choice_batch_size=5, top_n=1
-            )
-            result.extend(
-                [
-                    {
-                        "raw_chunk": node.node.text,
-                        "doc_url": node.node.metadata["file_path"],
-                        "_id": node.node.id_,
-                    }
-                    for node in retrieved_nodes
-                ]
-            )
-        return result
-
-    def stream_chat_oai(
-        self,
-        conversations,
-        model: Optional[str] = None,
-        role_mapping=None,
-        llm_config: Dict[str, Any] = {},
-    ):
-        if len(self.collections) != 1:
-            raise ValueError("When chat mode, only one collection can be set")
-
-        index = VectorStoreIndex.from_vector_store(
-            vector_store=self.storage_context_dict[self.collections[0]].vector_store,
-            service_context=self.service_context,
-        )
-        chat_engine = index.as_chat_engine(
-            chat_mode="condense_plus_context",
-            verbose=False,
-        )
-        history = []
-        for conv in conversations[:-1]:
-            if conv["role"] == "user":
-                role = MessageRole.USER
-            elif conv["role"] == "assistant":
-                role = MessageRole.ASSISTANT
-            else:
-                role = MessageRole.SYSTEM
-            history.append(ChatMessage(role=role, content=conv["content"]))
-        return (
-            chat_engine.stream_chat(
-                conversations[-1]["content"], chat_history=history
-            ).response_gen,
-            [],
-        )
-
-    def stream_chat_repl(self, query: str):
-        if len(self.collections) != 1:
-            raise ValueError("When chat mode, only one collection can be set")
-        from llama_index.core.memory import ChatMemoryBuffer
-
-        memory = ChatMemoryBuffer.from_defaults(token_limit=8092)
-        index = VectorStoreIndex.from_vector_store(
-            vector_store=self.storage_context_dict[self.collections[0]].vector_store,
-            service_context=self.service_context,
-        )
-        chat_engine = index.as_chat_engine(
-            chat_mode="condense_plus_context",
-            memory=memory,
-            verbose=False,
-        )
-        chat_engine.streaming_chat_repl()
-
-    def search(self, query: str) -> List[SourceCode]:
-        if not is_ray_in_client_mode():
-            return self.inner_search(query)
-
-        target_query = query
-
-        if isinstance(self.args.enable_rag_search, str):
-            target_query = self.args.enable_rag_search
-
-        response = self.client.chat.completions.create(
-            messages=[{"role": "user", "content": target_query}],
-            model="xxxx",
-        )
-        return [
-            SourceCode(
-                module_name=f"RAG:{target_query}",
-                source_code=response.choices[0].message.content,
-            )
-        ]
-
-    def inner_search(self, query: str) -> List[SourceCode]:
-        if self.args.enable_rag_search:
-            target_query = query
-            if isinstance(self.args.enable_rag_search, str):
-                target_query = self.args.enable_rag_search
-            texts, contexts = self.stream_search(target_query)
-            s = "".join([text for text in texts])
-            urls = ",".join(set([context["doc_url"] for context in contexts]))
-            ## append RAG: prefix is used to protect avoid the source code is modified by the code auto execute
-            return [SourceCode(module_name=f"RAG:{urls}", source_code=s)]
-        elif self.args.enable_rag_context:
-            target_query = query
-            if isinstance(self.args.enable_rag_context, str):
-                target_query = self.args.enable_rag_context
-            contexts = self.retrieve(target_query)
-            for context in contexts:
-                context["raw_chunk"]
-                try:
-                    with open(context["doc_url"], "r") as f:
-                        context["raw_chunk"] = f.read()
-                except Exception as e:
-                    logger.warning(f"Error reading file {context['doc_url']}")
-                    pass
-
-            return [
-                SourceCode(
-                    module_name=context["doc_url"], source_code=context["raw_chunk"]
-                )
-                for context in contexts
-            ]
-        return []
-
-    def build(self):
-
-        if len(self.collections) != 1:
-            raise ValueError("When build, only one collection should be set")
-
-        if is_ray_in_client_mode():
-            raise ValueError(
-                "You are in client mode, please run the build in the server."
-            )
-
-        collection = self.collections[0]
-
-        collection_exists = self.collection_manager.check_collection_exists(collection)
-        if not collection_exists:
-            logger.warning(
-                f"Collection {collection} not found, creating it automatically"
-            )
-            if not self.args.description:
-                logger.error("Please provide a description for the collection")
-                return
-            item = CollectionItem(
-                name=collection, description=self.args.description or ""
-            )
-            self.collection_manager.add_collection(item)
-
-        retrieval_client = self.simple_retrieval_dict[collection]
-        # retrieval_client.delete_from_doc_collection(collection)
-        # retrieval_client.delete_from_chunk_collection(collection)
-
-        required_exts = self.args.required_exts or None
-
-        if required_exts:
-            required_exts = required_exts.split(",")
-
-        documents = AutoCoderSimpleDirectoryReader(
-            self.path,
-            recursive=True,
-            filename_as_id=True,
-            required_exts=required_exts,
-            file_metadata=file_metadata_func,
-        ).load_data()
-        docs_keep = []
-        for document in documents:
-            doc = retrieval_client.get_doc(
-                f"ref_doc_info/{document.doc_id}", collection
-            )
-            if doc:
-                md5 = json.loads(doc["json_data"])["metadata"].get("md5", "")
-                file_path = document.metadata["file_path"]
-                new__md5 = document.metadata["md5"]
-                if md5 != new__md5:
-                    retrieval_client.delete_doc_and_chunks_by_filename(
-                        collection, file_path
-                    )
-                    docs_keep.append(document)
-            else:
-                docs_keep.append(document)
-
-        retrieval_client.commit_doc()
-        retrieval_client.commit_chunk()
-
-        for document in docs_keep:
-            logger.info(f"\nUpsert {document.doc_id}")
-
-        if docs_keep:
-            hirerachical = HierarchicalNodeParser.from_defaults(
-                chunk_sizes=[6000, 3000, 1024]
-            )
-            # sp = SentenceSplitter(chunk_size=1024, chunk_overlap=0)
-
-            nodes = hirerachical.get_nodes_from_documents(docs_keep, show_progress=True)
-
-            leaf_nodes = get_leaf_nodes(nodes)
-            self.storage_context_dict[collection].docstore.add_documents(nodes)
-
-            _ = VectorStoreIndex(
-                nodes=leaf_nodes,
-                store_nodes_override=True,
-                storage_context=self.storage_context_dict[collection],
-                service_context=self.service_context,
-            )
-
-            retrieval_client.commit_doc()
-            retrieval_client.commit_chunk()
-        else:
-            logger.info("There is no new doc to build")
autocoder/regex_project/__init__.py
DELETED
@@ -1,162 +0,0 @@
-import re
-from autocoder.common import SourceCode,AutoCoderArgs
-from autocoder import common as FileUtils
-from autocoder.utils.rest import HttpDoc
-import os
-from typing import Optional, Generator, List, Dict, Any, Callable
-from git import Repo
-import byzerllm
-from autocoder.common.search import Search,SearchEngine
-from autocoder.rag.simple_rag import SimpleRAG
-from loguru import logger
-from pydantic import BaseModel,Field
-
-class RegPattern(BaseModel):
-    pattern: str = Field(..., title="Pattern", description="The regex pattern can be used by `re.search` in python.")
-
-class RegexProject():
-
-    def __init__(self, args: AutoCoderArgs, llm: Optional[byzerllm.ByzerLLM] = None,file_filter=None):
-        self.args = args
-        self.directory = args.source_dir
-        self.git_url = args.git_url
-        self.target_file = args.target_file
-        self.project_type = args.project_type
-        self.file_filter = file_filter
-        self.sources = []
-        self.llm = llm
-        self.regex_pattern = self.extract_regex_pattern(self.project_type)
-
-    @byzerllm.prompt()
-    def generate_regex_pattern(self,desc:str)->RegPattern:
-        '''
-        Generate a regex pattern based on the following description:
-
-        {{ desc }}
-        '''
-
-
-    def extract_regex_pattern(self, project_type):
-        project_type = project_type.strip()
-        if project_type.startswith("regex://"):
-            return project_type[8:]
-        if project_type.startswith("human://"):
-            desc = project_type[8:]
-            v = self.generate_regex_pattern.with_llm(self.llm).run(desc=desc)
-            if not v:
-                raise ValueError("Fail to generate regex pattern, try again.")
-            logger.info(f"Generated regex pattern: {v.pattern}")
-            return v.pattern
-        else:
-            raise ValueError("Invalid project_type format. Expected 'regex//<pattern>'")
-
-    def output(self):
-        return open(self.target_file, "r").read()
-
-    def is_regex_match(self, file_path):
-        return re.search(self.regex_pattern, file_path) is not None
-
-    def read_file_content(self, file_path):
-        with open(file_path, "r") as file:
-            return file.read()
-
-    def convert_to_source_code(self, file_path):
-        module_name = file_path
-        source_code = self.read_file_content(file_path)
-        return SourceCode(module_name=module_name, source_code=source_code)
-
-    def get_source_codes(self) -> Generator[SourceCode, None, None]:
-        for root, dirs, files in os.walk(self.directory):
-            for file in files:
-                file_path = os.path.join(root, file)
-                if self.is_regex_match(file_path):
-                    if self.file_filter is None or self.file_filter(file_path, [self.regex_pattern]):
-                        logger.info(f"collect file: {file_path}")
-                        source_code = self.convert_to_source_code(file_path)
-                        if source_code is not None:
-                            yield source_code
-
-    def get_rest_source_codes(self) -> Generator[SourceCode, None, None]:
-        if self.args.urls:
-            urls = self.args.urls
-            if isinstance(self.args.urls, str):
-                urls = self.args.urls.split(",")
-            http_doc = HttpDoc(args =self.args, llm=self.llm,urls=urls)
-            sources = http_doc.crawl_urls()
-            for source in sources:
-                source.tag = "REST"
-            return sources
-        return []
-
-    def get_rag_source_codes(self):
-        if not self.args.enable_rag_search and not self.args.enable_rag_context:
-            return []
-        rag = SimpleRAG(self.llm,self.args,self.args.source_dir)
-        docs = rag.search(self.args.query)
-        for doc in docs:
-            doc.tag = "RAG"
-        return docs
-
-    def get_search_source_codes(self):
-        temp = self.get_rag_source_codes()
-        if self.args.search_engine and self.args.search_engine_token:
-            if self.args.search_engine == "bing":
-                search_engine = SearchEngine.BING
-            else:
-                search_engine = SearchEngine.GOOGLE
-
-            searcher=Search(args=self.args,llm=self.llm,search_engine=search_engine,subscription_key=self.args.search_engine_token)
-            search_query = self.args.search or self.args.query
-            search_context = searcher.answer_with_the_most_related_context(search_query)
-            return temp + [SourceCode(module_name="SEARCH_ENGINE", source_code=search_context,tag="SEARCH")]
-        return temp + []
-
-    def run(self):
-        if self.git_url is not None:
-            self.clone_repository()
-
-        if self.target_file is None:
-            for code in self.get_source_codes():
-                self.sources.append(code)
-                print(f"##File: {code.module_name}")
-                print(code.source_code)
-
-            for code in self.get_rest_source_codes():
-                self.sources.append(code)
-                print(f"##File: {code.module_name}")
-                print(code.source_code)
-
-            for code in self.get_search_source_codes():
-                self.sources.append(code)
-                print(f"##File: {code.module_name}")
-                print(code.source_code)
-
-
-        else:
-            with open(self.target_file, "w") as file:
-                for code in self.get_source_codes():
-                    self.sources.append(code)
-                    file.write(f"##File: {code.module_name}\n")
-                    file.write(f"{code.source_code}\n\n")
-
-                for code in self.get_rest_source_codes():
-                    self.sources.append(code)
-                    file.write(f"##File: {code.module_name}\n")
-                    file.write(f"{code.source_code}\n\n")
-
-                for code in self.get_search_source_codes():
-                    self.sources.append(code)
-                    file.write(f"##File: {code.module_name}\n")
-                    file.write(f"{code.source_code}\n\n")
-
-
-
-    def clone_repository(self):
-        if self.git_url is None:
-            raise ValueError("git_url is required to clone the repository")
-
-        if os.path.exists(self.directory):
-            print(f"Directory {self.directory} already exists. Skipping cloning.")
-        else:
-            print(f"Cloning repository {self.git_url} into {self.directory}")
-            Repo.clone_from(self.git_url, self.directory)
autocoder/utils/coder.py
DELETED
@@ -1,125 +0,0 @@
-
-from autocoder.common import AutoCoderArgs
-import byzerllm
-import pydantic
-from enum import Enum
-
-class TaskTypeDef(pydantic.BaseModel):
-    name: str
-    desc: str = ""
-    guidance: str = ""
-
-
-class TaskType(Enum):
-    """By identifying specific types of tasks, we can inject human priors (guidance) to help task solving"""
-
-    @byzerllm.prompt(render="jinja2")
-    def data_preprocess_prompt():
-        '''
-        The current task is about data preprocessing, please note the following:
-        - Monitor data types per column, applying appropriate methods.
-        - Ensure operations are on existing dataset columns.
-        - Avoid writing processed data to files.
-        - Avoid any change to label column, such as standardization, etc.
-        - Prefer alternatives to one-hot encoding for categorical data.
-        - Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (like time_extract, binning, extraction, etc.) later.
-        - Each step do data preprocessing to train, must do same for test separately at the same time.
-        - Always copy the DataFrame before processing it and use the copy to process.
-        '''
-
-    @byzerllm.prompt(render="jinja2")
-    def image2webpage_prompt():
-        '''
-        The current task is about converting image into webpage code. please note the following:
-        - Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
-        - Save webpages: Be sure to use the save method provided.
-        '''
-
-    DATA_PREPROCESS = TaskTypeDef(
-        name="data preprocessing",
-        desc="For preprocessing dataset in a data analysis or machine learning task ONLY,"
-        "general data operation doesn't fall into this type",
-        guidance=data_preprocess_prompt(),
-    )
-
-    IMAGE2WEBPAGE = TaskTypeDef(
-        name="image2webpage",
-        desc="For converting image into webpage code.",
-        guidance=image2webpage_prompt(),
-    )
-    OTHER = TaskTypeDef(name="other", desc="Any tasks not in the defined categories")
-
-    @property
-    def type_name(self):
-        return self.value.name
-
-    @classmethod
-    def get_type(cls, type_name):
-        for member in cls:
-            if member.type_name == type_name:
-                return member.value
-        return None
-
-class Thought(pydantic.BaseModel):
-    thoughts:str = pydantic.Field(...,description="Thoughts on current situation, reflect on how you should proceed to fulfill the user requirement")
-    state:bool = pydantic.Field(...,description="Decide whether you need to take more actions to complete the user requirement. Return true if you think so. Return false if you think the requirement has been completely fulfilled.")
-
-class Plan(pydantic.BaseModel):
-    task_id:str = pydantic.Field(...,description="unique identifier for a task in plan, can be an ordinal")
-    dependent_task_ids:list[str] = pydantic.Field(...,description="ids of tasks prerequisite to this task")
-    instruction:str = pydantic.Field(...,description="what you should do in this task, one short phrase or sentence")
-    task_type:str = pydantic.Field(...,description="type of this task, should be one of Available Task Types")
-
-class Coder:
-    def __init__(self,llm:byzerllm.ByzerLLM,args:AutoCoderArgs) -> None:
-        self.llm = llm
-        self.args = args
-        self.working_memory = []
-
-    def get_task_type_desc(self):
-        task_type_desc = "\n".join([f"- **{tt.type_name}**: {tt.value.desc}" for tt in TaskType])
-        return task_type_desc
-
-    @byzerllm.prompt(llm=lambda self:self.llm,render="jinja2")
-    def write_plan(self,context:str,task_type_desc:str,max_tasks:int=3):
-        '''
-        # Context:
-        {{ context }}
-        # Available Task Types:
-        {{ task_type_desc}}
-        # Task:
-        Based on the context, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to {{ max_tasks }} tasks.
-        If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
-        If you encounter errors on the current task, revise and output the current single task only.
-        Output a list of jsons following the format:
-        ```json
-        [
-            {
-                "task_id": str = "unique identifier for a task in plan, can be an ordinal",
-                "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task",
-                "instruction": "what you should do in this task, one short phrase or sentence",
-                "task_type": "type of this task, should be one of Available Task Types",
-            },
-            ...
-        ]
-        ```
-        '''
-        pass
-
-    @byzerllm.prompt(llm=lambda self:self.llm,render="jinja2")
-    def react_think(self,user_requirement:str,context:str)->str:
-        '''
-        # User Requirement
-        {{ user_requirement }}
-        # Context
-        {{ context }}
-
-        Output a json following the format:
-        ```json
-        {
-            "thoughts": str = "Thoughts on current situation, reflect on how you should proceed to fulfill the user requirement",
-            "state": bool = "Decide whether you need to take more actions to complete the user requirement. Return true if you think so. Return false if you think the requirement has been completely fulfilled."
-        }
-        ```
-        '''
-        pass