auto-coder 0.1.183__py3-none-any.whl → 0.1.185__py3-none-any.whl
This diff shows the content of publicly released versions of this package as published to a supported registry, and is provided for informational purposes only.
Potentially problematic release: this version of auto-coder has been flagged as potentially problematic.
- {auto_coder-0.1.183.dist-info → auto_coder-0.1.185.dist-info}/METADATA +2 -2
- {auto_coder-0.1.183.dist-info → auto_coder-0.1.185.dist-info}/RECORD +20 -14
- autocoder/auto_coder.py +138 -17
- autocoder/auto_coder_lang.py +14 -2
- autocoder/auto_coder_rag.py +92 -1
- autocoder/chat_auto_coder.py +25 -32
- autocoder/common/__init__.py +2 -0
- autocoder/rag/cache/__init__.py +0 -0
- autocoder/rag/cache/base_cache.py +14 -0
- autocoder/rag/cache/byzer_storage_cache.py +394 -0
- autocoder/rag/cache/file_monitor_cache.py +146 -0
- autocoder/rag/cache/simple_cache.py +204 -0
- autocoder/rag/document_retriever.py +56 -475
- autocoder/rag/long_context_rag.py +16 -6
- autocoder/rag/utils.py +133 -0
- autocoder/version.py +1 -1
- {auto_coder-0.1.183.dist-info → auto_coder-0.1.185.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.183.dist-info → auto_coder-0.1.185.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.183.dist-info → auto_coder-0.1.185.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.183.dist-info → auto_coder-0.1.185.dist-info}/top_level.txt +0 -0
autocoder/rag/cache/simple_cache.py
@@ -0,0 +1,204 @@
+
+from multiprocessing import Pool
+from autocoder.common import SourceCode
+from autocoder.rag.cache.base_cache import BaseCacheManager, DeleteEvent, AddOrUpdateEvent
+from typing import Dict, List, Tuple, Any, Optional
+import os
+import threading
+import json
+import platform
+if platform.system() != "Windows":
+    import fcntl
+else:
+    fcntl = None
+import time
+from loguru import logger
+from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+from autocoder.rag.variable_holder import VariableHolder
+
+
+class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
+    def __init__(self, path: str, ignore_spec, required_exts: list):
+        self.path = path
+        self.ignore_spec = ignore_spec
+        self.required_exts = required_exts
+        self.queue = []
+        self.cache = {}
+        self.lock = threading.Lock()
+        self.stop_event = threading.Event()
+        self.thread = threading.Thread(target=self._process_queue)
+        self.thread.daemon = True
+        self.thread.start()
+        self.cache = self.read_cache()
+
+    def _process_queue(self):
+        while not self.stop_event.is_set():
+            try:
+                self.process_queue()
+            except Exception as e:
+                logger.error(f"Error in process_queue: {e}")
+            time.sleep(1)  # avoid checking too frequently
+
+    def stop(self):
+        self.stop_event.set()
+        self.thread.join()
+
+    def __del__(self):
+        self.stop()
+
+    def load_first(self):
+        with self.lock:
+            if self.cache:
+                return
+            files_to_process = []
+            for file_info in self.get_all_files():
+                file_path, _, modify_time = file_info
+                if (
+                    file_path not in self.cache
+                    or self.cache[file_path]["modify_time"] < modify_time
+                ):
+                    files_to_process.append(file_info)
+            if not files_to_process:
+                return
+            # remote_process_file = ray.remote(process_file)
+            # results = ray.get(
+            #     [process_file.remote(file_info) for file_info in files_to_process]
+            # )
+            from autocoder.rag.token_counter import initialize_tokenizer
+
+            with Pool(
+                processes=os.cpu_count(),
+                initializer=initialize_tokenizer,
+                initargs=(VariableHolder.TOKENIZER_PATH,),
+            ) as pool:
+                results = pool.map(process_file_in_multi_process, files_to_process)
+
+            for file_info, result in zip(files_to_process, results):
+                self.update_cache(file_info, result)
+
+            self.write_cache()
+
+    def trigger_update(self):
+        logger.info("检查文件是否有更新.....")  # "Checking whether files have been updated..."
+        files_to_process = []
+        current_files = set()
+        for file_info in self.get_all_files():
+            file_path, _, modify_time = file_info
+            current_files.add(file_path)
+            if (
+                file_path not in self.cache
+                or self.cache[file_path]["modify_time"] < modify_time
+            ):
+                files_to_process.append(file_info)
+
+        deleted_files = set(self.cache.keys()) - current_files
+        logger.info(f"files_to_process: {files_to_process}")
+        logger.info(f"deleted_files: {deleted_files}")
+        if deleted_files:
+            with self.lock:
+                self.queue.append(DeleteEvent(file_paths=deleted_files))
+        if files_to_process:
+            with self.lock:
+                self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
+
+    def process_queue(self):
+        while self.queue:
+            file_list = self.queue.pop(0)
+            if isinstance(file_list, DeleteEvent):
+                for item in file_list.file_paths:
+                    logger.info(f"{item} is detected to be removed")
+                    del self.cache[item]
+            elif isinstance(file_list, AddOrUpdateEvent):
+                for file_info in file_list.file_infos:
+                    logger.info(f"{file_info[0]} is detected to be updated")
+                    result = process_file_local(file_info[0])
+                    self.update_cache(file_info, result)
+
+            self.write_cache()
+
+    def read_cache(self) -> Dict[str, Dict]:
+        cache_dir = os.path.join(self.path, ".cache")
+        cache_file = os.path.join(cache_dir, "cache.jsonl")
+
+        if not os.path.exists(cache_dir):
+            os.makedirs(cache_dir)
+
+        cache = {}
+        if os.path.exists(cache_file):
+            with open(cache_file, "r") as f:
+                for line in f:
+                    data = json.loads(line)
+                    cache[data["file_path"]] = data
+        return cache
+
+    def write_cache(self):
+        cache_dir = os.path.join(self.path, ".cache")
+        cache_file = os.path.join(cache_dir, "cache.jsonl")
+
+        if not fcntl:
+            with open(cache_file, "w") as f:
+                for data in self.cache.values():
+                    json.dump(data, f, ensure_ascii=False)
+                    f.write("\n")
+        else:
+            lock_file = cache_file + ".lock"
+            with open(lock_file, "w") as lockf:
+                try:
+                    # acquire the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    # write the cache file
+                    with open(cache_file, "w") as f:
+                        for data in self.cache.values():
+                            json.dump(data, f, ensure_ascii=False)
+                            f.write("\n")
+
+                finally:
+                    # release the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_UN)
+
+    def update_cache(
+        self, file_info: Tuple[str, str, float], content: List[SourceCode]
+    ):
+        file_path, relative_path, modify_time = file_info
+        self.cache[file_path] = {
+            "file_path": file_path,
+            "relative_path": relative_path,
+            "content": [c.model_dump() for c in content],
+            "modify_time": modify_time,
+        }
+
+    def get_cache(self, options: Optional[Dict[str, Any]] = None):
+        self.load_first()
+        self.trigger_update()
+        return self.cache
+
+    def get_all_files(self) -> List[Tuple[str, str, float]]:
+        all_files = []
+        for root, dirs, files in os.walk(self.path):
+            dirs[:] = [d for d in dirs if not d.startswith(".")]
+
+            if self.ignore_spec:
+                relative_root = os.path.relpath(root, self.path)
+                dirs[:] = [
+                    d
+                    for d in dirs
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, d))
+                ]
+                files = [
+                    f
+                    for f in files
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, f))
+                ]
+
+            for file in files:
+                if self.required_exts and not any(
+                    file.endswith(ext) for ext in self.required_exts
+                ):
+                    continue
+
+                file_path = os.path.join(root, file)
+                relative_path = os.path.relpath(file_path, self.path)
+                modify_time = os.path.getmtime(file_path)
+                all_files.append((file_path, relative_path, modify_time))
+
+        return all_files
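For orientation, the 204-line hunk above is the new autocoder/rag/cache/simple_cache.py cache manager. Below is a minimal usage sketch, not taken from the package itself: the project path, the pathspec-based ignore spec, and the assumption that the host application has already set VariableHolder.TOKENIZER_PATH (needed by the worker pool in load_first()) are all illustrative.

# Hypothetical caller of the cache manager shown in the diff above.
import pathspec  # assumption: ignore_spec is a pathspec.PathSpec, as used elsewhere in auto-coder

from autocoder.rag.cache.simple_cache import AutoCoderRAGAsyncUpdateQueue

# Build a .gitignore-style spec for directories/files to skip (illustrative patterns).
ignore_spec = pathspec.PathSpec.from_lines("gitwildmatch", ["node_modules/", "*.log"])

manager = AutoCoderRAGAsyncUpdateQueue(
    path="/path/to/project",       # root directory to index
    ignore_spec=ignore_spec,       # or None to disable ignore filtering
    required_exts=[".py", ".md"],  # only cache files with these extensions
)

# get_cache() runs load_first() (parallel initial parse plus a cache write),
# then trigger_update() to queue adds/updates/deletes for the background thread.
cache = manager.get_cache()
for file_path, entry in cache.items():
    print(file_path, entry["modify_time"], len(entry["content"]))

manager.stop()  # stop the background _process_queue thread

Each cached entry is persisted as one JSON object per line in <path>/.cache/cache.jsonl, carrying file_path, relative_path, content (the serialized SourceCode results), and modify_time; trigger_update() compares those modify_time values against the files on disk to decide what to re-process.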