auto-coder 0.1.183__py3-none-any.whl → 0.1.184__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release.


This version of auto-coder might be problematic.

@@ -0,0 +1,394 @@
+ from autocoder.rag.cache.base_cache import (
+     BaseCacheManager,
+     DeleteEvent,
+     AddOrUpdateEvent,
+ )
+ from typing import Generator, List, Dict, Any, Optional, Tuple
+ from autocoder.common import SourceCode
+ from loguru import logger
+ import pathspec
+ import os
+ import uuid
+ import json
+ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+ from byzerllm.apps.byzer_storage.simple_api import (
+     ByzerStorage,
+     DataType,
+     FieldOption,
+     SortOption,
+ )
+ from autocoder.common import AutoCoderArgs
+ import threading
+ from multiprocessing import Pool
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from autocoder.rag.variable_holder import VariableHolder
+ import platform
+
+ if platform.system() != "Windows":
+     import fcntl
+ else:
+     fcntl = None
+
+
+ class ByzerStorageCache(BaseCacheManager):
+     def __init__(
+         self,
+         path,
+         ignore_spec,
+         required_exts,
+         extra_params: Optional[AutoCoderArgs] = None,
+     ):
+         self.path = path
+         self.ignore_spec = ignore_spec
+         self.required_exts = required_exts
+         self.storage = ByzerStorage("byzerai_store", "rag", "files")
+         self.queue = []
+         self.chunk_size = 1000
+         self._init_schema()
+
+         if not extra_params:
+             raise ValueError("extra_params is required for ByzerStorageCache")
+
+         self.max_output_tokens = extra_params.hybrid_index_max_output_tokens
+
+         # Set up the cache file path
+         self.cache_dir = os.path.join(self.path, ".cache")
+         self.cache_file = os.path.join(self.cache_dir, "byzer_storage_speedup.jsonl")
+         self.cache = {}
+
+         self.lock = threading.Lock()
+         self.stop_event = threading.Event()
+         self.thread = threading.Thread(target=self.process_queue)
+         self.thread.daemon = True
+         self.thread.start()
+
+         # Create the cache directory
+         if not os.path.exists(self.cache_dir):
+             os.makedirs(self.cache_dir)
+
+         # Load the cache
+         self.cache = self._load_cache()
+
+     def _chunk_text(self, text, max_length=1000):
+         """Split text into chunks"""
+         chunks = []
+         current_chunk = []
+         current_length = 0
+
+         for line in text.split("\n"):
+             if current_length + len(line) > max_length and current_chunk:
+                 chunks.append("\n".join(current_chunk))
+                 current_chunk = []
+                 current_length = 0
+             current_chunk.append(line)
+             current_length += len(line)
+
+         if current_chunk:
+             chunks.append("\n".join(current_chunk))
+
+         return chunks
+
+     def _init_schema(self):
+         """Initialize the Byzer Storage schema"""
+         _ = (
+             self.storage.schema_builder()
+             .add_field("_id", DataType.STRING)
+             .add_field("file_path", DataType.STRING)
+             .add_field("content", DataType.STRING, [FieldOption.ANALYZE])
+             .add_field("raw_content", DataType.STRING, [FieldOption.NO_INDEX])
+             .add_array_field("vector", DataType.FLOAT)
+             .add_field("mtime", DataType.DOUBLE, [FieldOption.SORT])
+             .execute()
+         )
+
+     def _load_cache(self) -> dict:
+         """Load cache from file"""
+         if os.path.exists(self.cache_file):
+             try:
+                 with open(self.cache_file, "r") as f:
+                     lines = f.readlines()
+                     cache = {}
+                     for line in lines:
+                         try:
+                             data = json.loads(line.strip())
+                             if isinstance(data, dict) and "file_path" in data:
+                                 cache[data["file_path"]] = data
+                         except json.JSONDecodeError:
+                             continue
+                     return cache
+             except Exception as e:
+                 logger.error(f"Error loading cache file: {str(e)}")
+                 return {}
+         return {}
+
+     def write_cache(self):
+         cache_file = self.cache_file
+
+         if not fcntl:
+             try:
+                 with open(cache_file, "w") as f:
+                     for data in self.cache.values():
+                         json.dump(data, f, ensure_ascii=False)
+                         f.write("\n")
+             except IOError as e:
+                 logger.error(f"Error writing cache file: {str(e)}")
+         else:
+             lock_file = cache_file + ".lock"
+             with open(lock_file, "w") as lockf:
+                 try:
+                     # Acquire the file lock
+                     fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                     # Write the cache file
+                     with open(cache_file, "w") as f:
+                         for data in self.cache.values():
+                             json.dump(data, f, ensure_ascii=False)
+                             f.write("\n")
+
+                 finally:
+                     # Release the file lock
+                     fcntl.flock(lockf, fcntl.LOCK_UN)
+
+     def build_cache(self):
+         """Build the cache by reading files and storing in Byzer Storage"""
+         logger.info(f"Building cache for path: {self.path}")
+
+         files_to_process = []
+         for file_info in self.get_all_files():
+             file_path, _, modify_time = file_info
+             if (
+                 file_path not in self.cache
+                 or self.cache[file_path]["modify_time"] < modify_time
+             ):
+                 files_to_process.append(file_info)
+
+         if not files_to_process:
+             return
+
+         from autocoder.rag.token_counter import initialize_tokenizer
+
+         with Pool(
+             processes=os.cpu_count(),
+             initializer=initialize_tokenizer,
+             initargs=(VariableHolder.TOKENIZER_PATH,),
+         ) as pool:
+             results = pool.map(process_file_in_multi_process, files_to_process)
+
+         items = []
+         for file_info, result in zip(files_to_process, results):
+             file_path, relative_path, modify_time = file_info
+             content: List[SourceCode] = result
+             self.cache[file_path] = {
+                 "file_path": file_path,
+                 "relative_path": relative_path,
+                 "content": [c.model_dump() for c in content],
+                 "modify_time": modify_time,
+             }
+
+             for doc in content:
+                 logger.info(f"Processing file: {doc.module_name}")
+                 chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                 for chunk_idx, chunk in enumerate(chunks):
+                     chunk_item = {
+                         "_id": f"{doc.module_name}_{chunk_idx}",
+                         "file_path": file_path,
+                         "content": chunk,
+                         "raw_content": chunk,
+                         "vector": chunk,
+                         "mtime": modify_time,
+                     }
+                     items.append(chunk_item)
+
+         # Save to local cache
+         logger.info("Saving cache to local file")
+         self.write_cache()
+
+         if items:
+             logger.info("Clear cache from Byzer Storage")
+             self.storage.truncate_table()
+             logger.info("Save new cache to Byzer Storage")
+             max_workers = 5
+             chunk_size = max(1, len(items) // max_workers)
+             item_chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
+
+             total_chunks = len(item_chunks)
+             completed_chunks = 0
+
+             logger.info(f"Progress: {0}/{total_chunks} chunks completed")
+
+             with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                 futures = []
+                 for chunk in item_chunks:
+                     futures.append(
+                         executor.submit(
+                             lambda x: self.storage.write_builder().add_items(
+                                 x, vector_fields=["vector"], search_fields=["content"]
+                             ).execute(),
+                             chunk
+                         )
+                     )
+                 # Wait for all futures to complete
+                 for future in as_completed(futures):
+                     try:
+                         future.result()
+                         completed_chunks += 1
+                         logger.info(f"Progress: {completed_chunks}/{total_chunks} chunks completed")
+                     except Exception as e:
+                         logger.error(f"Error in saving chunk: {str(e)}")
+
+             self.storage.commit()
+
+     def update_storage(self, file_path, is_delete: bool):
+         query = self.storage.query_builder()
+         query.and_filter().add_condition("file_path", file_path).build()
+         results = query.execute()
+         if results:
+             for result in results:
+                 self.storage.delete_by_ids([result["_id"]])
+         items = []
+
+         if not is_delete:
+             content = [
+                 SourceCode.model_validate(doc)
+                 for doc in self.cache[file_path]["content"]
+             ]
+             modify_time = self.cache[file_path]["modify_time"]
+             for doc in content:
+                 logger.info(f"Processing file: {doc.module_name}")
+                 chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                 for chunk_idx, chunk in enumerate(chunks):
+                     chunk_item = {
+                         "_id": f"{doc.module_name}_{chunk_idx}",
+                         "file_path": file_path,
+                         "content": chunk,
+                         "raw_content": chunk,
+                         "vector": chunk,
+                         "mtime": modify_time,
+                     }
+                     items.append(chunk_item)
+         if items:
+             self.storage.write_builder().add_items(
+                 items, vector_fields=["vector"], search_fields=["content"]
+             ).execute()
+             self.storage.commit()
+
+     def process_queue(self):
+         while self.queue:
+             file_list = self.queue.pop(0)
+             if isinstance(file_list, DeleteEvent):
+                 for item in file_list.file_paths:
+                     logger.info(f"{item} is detected to be removed")
+                     del self.cache[item]
+                     self.update_storage(item, is_delete=True)
+
+             elif isinstance(file_list, AddOrUpdateEvent):
+                 for file_info in file_list.file_infos:
+                     logger.info(f"{file_info[0]} is detected to be updated")
+                     result = process_file_local(file_info[0])
+                     self.cache[file_info[0]] = result
+                     self.update_storage(file_info[0], is_delete=False)
+             self.write_cache()
+
+     def trigger_update(self):
+         logger.info("Checking whether any files have been updated...")
+         files_to_process = []
+         current_files = set()
+         for file_info in self.get_all_files():
+             file_path, _, modify_time = file_info
+             current_files.add(file_path)
+             if (
+                 file_path not in self.cache
+                 or self.cache[file_path]["modify_time"] < modify_time
+             ):
+                 files_to_process.append(file_info)
+
+         deleted_files = set(self.cache.keys()) - current_files
+         logger.info(f"files_to_process: {files_to_process}")
+         logger.info(f"deleted_files: {deleted_files}")
+         if deleted_files:
+             with self.lock:
+                 self.queue.append(DeleteEvent(file_paths=deleted_files))
+         if files_to_process:
+             with self.lock:
+                 self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
+
+     def get_cache(self, options: Dict[str, Any]) -> Dict[str, Dict]:
+         """Search cached documents using query"""
+
+         self.trigger_update()
+
+         if options is None or "query" not in options:
+             return self.cache
+
+         query = options.get("query", "")
+         total_tokens = 0
+
+         # Build query with both vector search and text search
+         query_builder = self.storage.query_builder()
+         query_builder.set_limit(100000)
+
+         # Add vector search if enabled
+         if options.get("enable_vector_search", True):
+             query_builder.set_vector_query(query, fields=["vector"])
+
+         # Add text search
+         if options.get("enable_text_search", True):
+             query_builder.set_search_query(query, fields=["content"])
+
+         results = query_builder.execute()
+
+         # Group results by file_path and reconstruct documents while preserving order
+         file_paths = []
+         seen = set()
+         for result in results:
+             file_path = result["file_path"]
+             if file_path not in seen:
+                 seen.add(file_path)
+                 file_paths.append(file_path)
+
+         # Fetch file contents from the local cache
+         result = {}
+         for file_path in file_paths:
+             if file_path in self.cache:
+                 cached_data = self.cache[file_path]
+                 for doc in cached_data["content"]:
+                     if total_tokens + doc["tokens"] > self.max_output_tokens:
+                         return result
+                     total_tokens += doc["tokens"]
+                 result[file_path] = cached_data
+
+         return result
+
+     def get_all_files(self) -> List[Tuple[str, str, float]]:
+         all_files = []
+         for root, dirs, files in os.walk(self.path):
+             dirs[:] = [d for d in dirs if not d.startswith(".")]
+
+             if self.ignore_spec:
+                 relative_root = os.path.relpath(root, self.path)
+                 dirs[:] = [
+                     d
+                     for d in dirs
+                     if not self.ignore_spec.match_file(os.path.join(relative_root, d))
+                 ]
+                 files = [
+                     f
+                     for f in files
+                     if not self.ignore_spec.match_file(os.path.join(relative_root, f))
+                 ]
+
+             for file in files:
+                 if self.required_exts and not any(
+                     file.endswith(ext) for ext in self.required_exts
+                 ):
+                     continue
+
+                 file_path = os.path.join(root, file)
+                 relative_path = os.path.relpath(file_path, self.path)
+                 modify_time = os.path.getmtime(file_path)
+                 all_files.append((file_path, relative_path, modify_time))
+
+         return all_files
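For orientation, here is a minimal usage sketch of the ByzerStorageCache added above. The import path, the directory, and the AutoCoderArgs construction are illustrative assumptions, not taken from the diff; the constructor arguments, build_cache(), and get_cache() match the hunk.

# Hypothetical example: build the hybrid index for a docs folder and query it.
import pathspec
from autocoder.common import AutoCoderArgs
# Assumed module path for the class shown in this hunk.
from autocoder.rag.cache.byzer_storage_cache import ByzerStorageCache

ignore_spec = pathspec.PathSpec.from_lines("gitwildmatch", [".git/", "*.log"])
# hybrid_index_max_output_tokens is the field the cache reads from extra_params.
args = AutoCoderArgs(hybrid_index_max_output_tokens=10000)

cache = ByzerStorageCache(
    path="/path/to/docs",
    ignore_spec=ignore_spec,
    required_exts=[".md", ".py"],
    extra_params=args,
)
cache.build_cache()  # chunk files and write chunks (with vectors) into Byzer Storage
hits = cache.get_cache({"query": "how is the cache rebuilt?"})
for file_path, data in hits.items():
    print(file_path, len(data["content"]))

Note that get_cache() returns whole cached files grouped from chunk-level hits and stops adding files once hybrid_index_max_output_tokens would be exceeded.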
@@ -0,0 +1,146 @@
+ from autocoder.rag.cache.base_cache import BaseCacheManager
+ from typing import Dict, List, Any, Optional
+ import os
+ import threading
+ from loguru import logger
+ from autocoder.rag.variable_holder import VariableHolder
+ from autocoder.common import SourceCode
+ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+ from watchfiles import Change, DefaultFilter, awatch, watch
+
+
+ class AutoCoderRAGDocListener(BaseCacheManager):
+     cache: Dict[str, Dict] = {}
+     ignore_dirs = [
+         "__pycache__",
+         ".git",
+         ".hg",
+         ".svn",
+         ".tox",
+         ".venv",
+         ".cache",
+         ".idea",
+         "node_modules",
+         ".mypy_cache",
+         ".pytest_cache",
+         ".hypothesis",
+     ]
+     ignore_entity_patterns = [
+         r"\.py[cod]$",
+         r"\.___jb_...___$",
+         r"\.sw.$",
+         "~$",
+         r"^\.\#",
+         r"^\.DS_Store$",
+         r"^flycheck_",
+         r"^test.*$",
+     ]
+
+     def __init__(self, path: str, ignore_spec, required_exts: List) -> None:
+         self.path = path
+         self.ignore_spec = ignore_spec
+         self.required_exts = required_exts
+         self.stop_event = threading.Event()
+
+         # connect list
+         self.ignore_entity_patterns.extend(self._load_ignore_file())
+         self.file_filter = DefaultFilter(
+             ignore_dirs=self.ignore_dirs,
+             ignore_paths=[],
+             ignore_entity_patterns=self.ignore_entity_patterns,
+         )
+         self.load_first()
+         # Create a new thread to run open_watch
+         self.watch_thread = threading.Thread(target=self.open_watch)
+         # Make it a daemon thread so it exits automatically when the main program exits
+         self.watch_thread.daemon = True
+         # Start the thread
+         self.watch_thread.start()
+
+     def stop(self):
+         self.stop_event.set()
+         self.watch_thread.join()
+
+     def __del__(self):
+         self.stop()
+
+     def load_first(self):
+         files_to_process = self.get_all_files()
+         if not files_to_process:
+             return
+         for item in files_to_process:
+             self.update_cache(item)
+
+     def update_cache(self, file_path):
+         source_code = process_file_local(file_path)
+         self.cache[file_path] = {
+             "file_path": file_path,
+             "content": [c.model_dump() for c in source_code],
+         }
+         logger.info(f"update cache: {file_path}")
+         logger.info(f"current cache: {self.cache.keys()}")
+
+     def remove_cache(self, file_path):
+         del self.cache[file_path]
+         logger.info(f"remove cache: {file_path}")
+         logger.info(f"current cache: {self.cache.keys()}")
+
+     def open_watch(self):
+         logger.info(f"start monitor: {self.path}...")
+         for changes in watch(
+             self.path, watch_filter=self.file_filter, stop_event=self.stop_event
+         ):
+             for change in changes:
+                 (action, path) = change
+                 if action == Change.added or action == Change.modified:
+                     self.update_cache(path)
+                 elif action == Change.deleted:
+                     self.remove_cache(path)
+
+     def get_cache(self, options: Optional[Dict[str, Any]] = None):
+         return self.cache
+
+     def _load_ignore_file(self):
+         serveignore_path = os.path.join(self.path, ".serveignore")
+         gitignore_path = os.path.join(self.path, ".gitignore")
+
+         if os.path.exists(serveignore_path):
+             with open(serveignore_path, "r") as ignore_file:
+                 patterns = ignore_file.readlines()
+                 return [pattern.strip() for pattern in patterns]
+         elif os.path.exists(gitignore_path):
+             with open(gitignore_path, "r") as ignore_file:
+                 patterns = ignore_file.readlines()
+                 return [pattern.strip() for pattern in patterns]
+         return []
+
+     def get_all_files(self) -> List[str]:
+         all_files = []
+         for root, dirs, files in os.walk(self.path):
+             dirs[:] = [d for d in dirs if not d.startswith(".")]
+
+             if self.ignore_spec:
+                 relative_root = os.path.relpath(root, self.path)
+                 dirs[:] = [
+                     d
+                     for d in dirs
+                     if not self.ignore_spec.match_file(os.path.join(relative_root, d))
+                 ]
+                 files = [
+                     f
+                     for f in files
+                     if not self.ignore_spec.match_file(os.path.join(relative_root, f))
+                 ]
+
+             for file in files:
+                 if self.required_exts and not any(
+                     file.endswith(ext) for ext in self.required_exts
+                 ):
+                     continue
+
+                 file_path = os.path.join(root, file)
+                 absolute_path = os.path.abspath(file_path)
+                 all_files.append(absolute_path)
+
+         return all_files
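And a similarly hedged sketch for the AutoCoderRAGDocListener added above. The import path and directory are assumptions; the constructor arguments, get_cache(), and stop() match the hunk.

# Hypothetical example: keep an in-memory document cache in sync with a folder.
import time
# Assumed module path for the class shown in this hunk.
from autocoder.rag.cache.file_monitor_cache import AutoCoderRAGDocListener

listener = AutoCoderRAGDocListener(
    path="/path/to/docs",
    ignore_spec=None,        # or a pathspec.PathSpec, as used in get_all_files()
    required_exts=[".md"],
)
try:
    while True:
        docs = listener.get_cache()  # {file_path: {"file_path": ..., "content": [...]}}
        print(f"{len(docs)} documents cached")
        time.sleep(10)
finally:
    listener.stop()  # sets stop_event to unblock watch() and joins the watcher thread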