auto-coder 0.1.183-py3-none-any.whl → 0.1.185-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of auto-coder might be problematic.

@@ -0,0 +1,204 @@
+
+from multiprocessing import Pool
+from autocoder.common import SourceCode
+from autocoder.rag.cache.base_cache import BaseCacheManager, DeleteEvent, AddOrUpdateEvent
+from typing import Dict, List, Tuple, Any, Optional
+import os
+import threading
+import json
+import platform
+if platform.system() != "Windows":
+    import fcntl
+else:
+    fcntl = None
+import time
+from loguru import logger
+from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+from autocoder.rag.variable_holder import VariableHolder
+
+
+class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
+    def __init__(self, path: str, ignore_spec, required_exts: list):
+        self.path = path
+        self.ignore_spec = ignore_spec
+        self.required_exts = required_exts
+        self.queue = []
+        self.cache = {}
+        self.lock = threading.Lock()
+        self.stop_event = threading.Event()
+        self.thread = threading.Thread(target=self._process_queue)
+        self.thread.daemon = True
+        self.thread.start()
+        self.cache = self.read_cache()
+
+    def _process_queue(self):
+        while not self.stop_event.is_set():
+            try:
+                self.process_queue()
+            except Exception as e:
+                logger.error(f"Error in process_queue: {e}")
+            time.sleep(1)  # avoid polling too frequently
+
+    def stop(self):
+        self.stop_event.set()
+        self.thread.join()
+
+    def __del__(self):
+        self.stop()
+
+    def load_first(self):
+        with self.lock:
+            if self.cache:
+                return
+            files_to_process = []
+            for file_info in self.get_all_files():
+                file_path, _, modify_time = file_info
+                if (
+                    file_path not in self.cache
+                    or self.cache[file_path]["modify_time"] < modify_time
+                ):
+                    files_to_process.append(file_info)
+            if not files_to_process:
+                return
+            # remote_process_file = ray.remote(process_file)
+            # results = ray.get(
+            #     [process_file.remote(file_info) for file_info in files_to_process]
+            # )
+            from autocoder.rag.token_counter import initialize_tokenizer
+
+            with Pool(
+                processes=os.cpu_count(),
+                initializer=initialize_tokenizer,
+                initargs=(VariableHolder.TOKENIZER_PATH,),
+            ) as pool:
+                results = pool.map(process_file_in_multi_process, files_to_process)
+
+            for file_info, result in zip(files_to_process, results):
+                self.update_cache(file_info, result)
+
+            self.write_cache()
+
+    def trigger_update(self):
+        logger.info("Checking files for updates...")
+        files_to_process = []
+        current_files = set()
+        for file_info in self.get_all_files():
+            file_path, _, modify_time = file_info
+            current_files.add(file_path)
+            if (
+                file_path not in self.cache
+                or self.cache[file_path]["modify_time"] < modify_time
+            ):
+                files_to_process.append(file_info)
+
+        deleted_files = set(self.cache.keys()) - current_files
+        logger.info(f"files_to_process: {files_to_process}")
+        logger.info(f"deleted_files: {deleted_files}")
+        if deleted_files:
+            with self.lock:
+                self.queue.append(DeleteEvent(file_paths=deleted_files))
+        if files_to_process:
+            with self.lock:
+                self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
+
+    def process_queue(self):
+        while self.queue:
+            file_list = self.queue.pop(0)
+            if isinstance(file_list, DeleteEvent):
+                for item in file_list.file_paths:
+                    logger.info(f"{item} is detected to be removed")
+                    del self.cache[item]
+            elif isinstance(file_list, AddOrUpdateEvent):
+                for file_info in file_list.file_infos:
+                    logger.info(f"{file_info[0]} is detected to be updated")
+                    result = process_file_local(file_info[0])
+                    self.update_cache(file_info, result)
+
+            self.write_cache()
+
+    def read_cache(self) -> Dict[str, Dict]:
+        cache_dir = os.path.join(self.path, ".cache")
+        cache_file = os.path.join(cache_dir, "cache.jsonl")
+
+        if not os.path.exists(cache_dir):
+            os.makedirs(cache_dir)
+
+        cache = {}
+        if os.path.exists(cache_file):
+            with open(cache_file, "r") as f:
+                for line in f:
+                    data = json.loads(line)
+                    cache[data["file_path"]] = data
+        return cache
+
+    def write_cache(self):
+        cache_dir = os.path.join(self.path, ".cache")
+        cache_file = os.path.join(cache_dir, "cache.jsonl")
+
+        if not fcntl:
+            with open(cache_file, "w") as f:
+                for data in self.cache.values():
+                    json.dump(data, f, ensure_ascii=False)
+                    f.write("\n")
+        else:
+            lock_file = cache_file + ".lock"
+            with open(lock_file, "w") as lockf:
+                try:
+                    # acquire the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    # write the cache file
+                    with open(cache_file, "w") as f:
+                        for data in self.cache.values():
+                            json.dump(data, f, ensure_ascii=False)
+                            f.write("\n")
+
+                finally:
+                    # release the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_UN)
+
+    def update_cache(
+        self, file_info: Tuple[str, str, float], content: List[SourceCode]
+    ):
+        file_path, relative_path, modify_time = file_info
+        self.cache[file_path] = {
+            "file_path": file_path,
+            "relative_path": relative_path,
+            "content": [c.model_dump() for c in content],
+            "modify_time": modify_time,
+        }
+
+    def get_cache(self, options: Optional[Dict[str, Any]] = None):
+        self.load_first()
+        self.trigger_update()
+        return self.cache
+
+    def get_all_files(self) -> List[Tuple[str, str, float]]:
+        all_files = []
+        for root, dirs, files in os.walk(self.path):
+            dirs[:] = [d for d in dirs if not d.startswith(".")]
+
+            if self.ignore_spec:
+                relative_root = os.path.relpath(root, self.path)
+                dirs[:] = [
+                    d
+                    for d in dirs
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, d))
+                ]
+                files = [
+                    f
+                    for f in files
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, f))
+                ]
+
+            for file in files:
+                if self.required_exts and not any(
+                    file.endswith(ext) for ext in self.required_exts
+                ):
+                    continue
+
+                file_path = os.path.join(root, file)
+                relative_path = os.path.relpath(file_path, self.path)
+                modify_time = os.path.getmtime(file_path)
+                all_files.append((file_path, relative_path, modify_time))
+
+        return all_files
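For orientation, here is a minimal usage sketch of the AutoCoderRAGAsyncUpdateQueue class added in this release. It is an illustration, not code from the package: the project path, ignore patterns, and extensions are made up, and it assumes the class is importable and that VariableHolder.TOKENIZER_PATH has already been set to a tokenizer file that autocoder.rag.token_counter.initialize_tokenizer can load. Since get_all_files() only calls ignore_spec.match_file(), a pathspec.PathSpec is used as the ignore spec here.

import pathspec

# Hypothetical ignore rules; any object exposing .match_file() would do.
ignore_spec = pathspec.PathSpec.from_lines("gitwildmatch", ["*.log", "build/"])

# Hypothetical project root and extension filter.
manager = AutoCoderRAGAsyncUpdateQueue(
    path="/path/to/project",
    ignore_spec=ignore_spec,
    required_exts=[".py", ".md"],
)

# get_cache() first builds the cache with a multiprocessing Pool (load_first),
# then queues an incremental rescan (trigger_update) for the background thread;
# it returns the in-memory dict keyed by absolute file path.
docs = manager.get_cache()
for file_path, entry in docs.items():
    print(file_path, entry["modify_time"], len(entry["content"]))

# Stop the background queue-processing thread when done.
manager.stop()

Note that only the initial load_first() fans work out to one worker per CPU; later additions and deletions are applied serially by the background thread and persisted to .cache/cache.jsonl.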