auto-coder 0.1.345__py3-none-any.whl → 0.1.347__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of auto-coder might be problematic.

@@ -48,7 +48,7 @@ class AutoCoderRAGDocListener(BaseCacheManager):
         r"^test.*$",
     ]

-    def __init__(self, path: str, ignore_spec, required_exts: List) -> None:
+    def __init__(self, path: str, ignore_spec, required_exts: List, args=None, llm=None) -> None:
         """
         Initialize the file-monitoring cache manager.

@@ -89,6 +89,8 @@ class AutoCoderRAGDocListener(BaseCacheManager):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.stop_event = threading.Event()

         # connect list
@@ -30,6 +30,7 @@ from typing import Union
 from byzerllm import SimpleByzerLLM, ByzerLLM
 from autocoder.rag.cache.cache_result_merge import CacheResultMerger, MergeStrategy
 import time
+from .failed_files_utils import save_failed_files, load_failed_files

 if platform.system() != "Windows":
     import fcntl
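The `failed_files_utils` module is new in this release, but its source is not part of this diff. From the call sites visible here (`load_failed_files(path)` returning a set, and `save_failed_files(path, failed_files)`), a minimal compatible implementation might look like the sketch below; the on-disk JSON layout is an assumption, not the package's actual format.

```python
# Hypothetical sketch of failed_files_utils, inferred from the call sites in
# this diff. The JSON layout is an assumption.
import json
import os
from typing import Set


def load_failed_files(path: str) -> Set[str]:
    """Load the set of previously failed file paths; a missing file means an empty set."""
    if not os.path.exists(path):
        return set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except (json.JSONDecodeError, OSError):
        # A corrupt record should not break startup; start fresh instead.
        return set()


def save_failed_files(path: str, failed_files: Set[str]) -> None:
    """Persist the failed-file set as a JSON list (sets are not JSON-serializable)."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(sorted(failed_files), f, ensure_ascii=False, indent=2)
```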
@@ -70,71 +71,17 @@ class LocalByzerStorageCache(BaseCacheManager):
         emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
         host: str = "127.0.0.1",
         port: int = 33333,
+        args=None,
+        llm=None,
     ):
         """
         Initialize the Byzer Storage based RAG cache manager.
-
-        Parameters:
-            path: root directory of the codebase to index
-            ignore_spec: rules for which files/directories should be ignored
-            required_exts: list of file extensions to process
-            extra_params: extra configuration, including vector-index settings
-            emb_llm: ByzerLLM instance used to generate text embeddings
-            host: host address of the Byzer Storage service
-            port: port of the Byzer Storage service
-
-        Cache structure (self.cache):
-            self.cache is a dict keyed by file path with CacheItem values:
-            {
-                "file_path1": CacheItem(
-                    file_path: str,       # absolute path of the file
-                    relative_path: str,   # path relative to the project root
-                    content: List[Dict],  # structured file content; each element is a serialized SourceCode object
-                    modify_time: float,   # timestamp of the file's last modification
-                    md5: str              # MD5 hash of the file content, used to detect changes
-                ),
-                "file_path2": CacheItem(...),
-                ...
-            }
-
-        The cache has two storage layers:
-        1. Local file cache: stored in .cache/byzer_storage_speedup.jsonl under the project root
-           - used to track file changes and enable fast loading
-           - JSONL format: each line is the JSON representation of one CacheItem
-
-        2. Byzer Storage vector database:
-           - stores content chunks and vector embeddings for each file
-           - each file is split into text chunks of size chunk_size
-           - each chunk gets a vector embedding for semantic search
-           - stored fields: file path, content chunk, raw content, embedding, modify time
-
-        Source-code processing flow:
-        Two key functions are used during cache updates:
-
-        1. process_file_in_multi_process: processes files in a multi-process pool
-           - argument: file_info (a file-info tuple)
-           - returns: List[SourceCode] or None
-           - used to process many files in parallel during the initial cache build
-
-        2. process_file_local: processes a single file in the current process
-           - argument: file_path (the file path)
-           - returns: List[SourceCode] or None
-           - used to process a single file when an update is detected
-
-        After a file is processed:
-        1. the in-memory cache (self.cache) is updated
-        2. the cache is persisted to the local file
-        3. the content is chunked and written to the Byzer Storage vector database
-
-        Update mechanism:
-        - file changes are processed asynchronously in a dedicated thread
-        - MD5 hashes are used to detect whether a file has changed
-        - file add, update, and delete events are supported
-        - the vector database enables semantic retrieval with similarity search
         """
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.rag_build_name = extra_params.rag_build_name
         self.storage = LocalByzerStorage("byzerai_store",
                                          "rag_test", self.rag_build_name, host=host, port=port, emb_llm=emb_llm)
@@ -153,16 +100,20 @@ class LocalByzerStorageCache(BaseCacheManager):
             self.cache_dir, "byzer_storage_speedup.jsonl")
         self.cache: Dict[str, CacheItem] = {}

+        # create the cache directory
+        if not os.path.exists(self.cache_dir):
+            os.makedirs(self.cache_dir)
+
+        # failed files support
+        self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
         self.thread = threading.Thread(target=self.process_queue)
         self.thread.daemon = True
         self.thread.start()

-        # create the cache directory
-        if not os.path.exists(self.cache_dir):
-            os.makedirs(self.cache_dir)
-
         # load the cache
         self.cache = self._load_cache()

@@ -485,6 +436,10 @@ class LocalByzerStorageCache(BaseCacheManager):
                 for item in file_list.file_paths:
                     logger.info(f"[QUEUE PROCESSING] Processing file deletion: {item}")
                     del self.cache[item]
+                    # remove from failed files if present
+                    if item in self.failed_files:
+                        self.failed_files.remove(item)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                     # Create a temporary FileInfo object
                     file_info = FileInfo(
                         file_path=item, relative_path="", modify_time=0, file_md5="")
@@ -495,17 +450,30 @@ class LocalByzerStorageCache(BaseCacheManager):
                 for file_info in file_list.file_infos:
                     logger.info(
                         f"[QUEUE PROCESSING] Processing file update: {file_info.file_path}")
-                    # Process file and create CacheItem
-                    content = process_file_local(
-                        self.fileinfo_to_tuple(file_info))
-                    self.cache[file_info.file_path] = CacheItem(
-                        file_path=file_info.file_path,
-                        relative_path=file_info.relative_path,
-                        content=[c.model_dump() for c in content],
-                        modify_time=file_info.modify_time,
-                        md5=file_info.file_md5,
-                    )
-                    self.update_storage(file_info, is_delete=False)
+                    try:
+                        content = process_file_local(
+                            self.fileinfo_to_tuple(file_info))
+                        if content:
+                            self.cache[file_info.file_path] = CacheItem(
+                                file_path=file_info.file_path,
+                                relative_path=file_info.relative_path,
+                                content=[c.model_dump() for c in content],
+                                modify_time=file_info.modify_time,
+                                md5=file_info.file_md5,
+                            )
+                            self.update_storage(file_info, is_delete=False)
+                            # remove from failed files if present
+                            if file_info.file_path in self.failed_files:
+                                self.failed_files.remove(file_info.file_path)
+                                save_failed_files(self.failed_files_path, self.failed_files)
+                        else:
+                            logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                            self.failed_files.add(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
+                    except Exception as e:
+                        logger.error(f"Error in process_queue: {e}")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                 self.write_cache()

             elapsed = time.time() - start_time
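The new `try`/`except` block gives every updated file a simple lifecycle: a parse exception or an empty parse result adds the path to `failed_files` (persisted immediately), and a later successful parse clears the record. Distilled into a standalone sketch; `parse` stands in for `process_file_local`, and `save_failed_files` is the hypothetical helper sketched earlier.

```python
# Standalone distillation of the failure-tracking flow added in this release.
# `parse` stands in for process_file_local; save_failed_files is the
# hypothetical helper sketched above.
def handle_update(file_path, failed_files, failed_files_path, parse):
    try:
        content = parse(file_path)
        if content:
            # success: clear any stale failure record before returning
            if file_path in failed_files:
                failed_files.discard(file_path)
                save_failed_files(failed_files_path, failed_files)
            return content
        # empty output is treated the same as a parse failure
        failed_files.add(file_path)
    except Exception:
        failed_files.add(file_path)
    save_failed_files(failed_files_path, failed_files)
    return None
```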
@@ -519,6 +487,10 @@ class LocalByzerStorageCache(BaseCacheManager):
         current_files = set()
         for file_info in self.get_all_files():
             current_files.add(file_info.file_path)
+            # skip failed files
+            if file_info.file_path in self.failed_files:
+                logger.info(f"文件 {file_info.file_path} 之前解析失败,跳过此次更新")
+                continue
             if (
                 file_info.file_path not in self.cache
                 or self.cache[file_info.file_path].md5 != file_info.file_md5
@@ -28,6 +28,7 @@ from autocoder.rag.cache.base_cache import (
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 from byzerllm import SimpleByzerLLM, ByzerLLM
+from .failed_files_utils import save_failed_files, load_failed_files

 if platform.system() != "Windows":
     import fcntl
@@ -300,12 +301,16 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         ignore_spec,
         required_exts,
         extra_params: Optional[AutoCoderArgs] = None,
-        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        args=None,
+        llm=None
     ):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
         self.extra_params = extra_params
+        self.args = args
+        self.llm = llm

         self.storage = LocalDuckdbStorage(
             llm=emb_llm,
@@ -325,6 +330,11 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         if not os.path.exists(self.cache_dir):
             os.makedirs(self.cache_dir)

+        # failed files support
+        from .failed_files_utils import load_failed_files
+        self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
         self.thread = threading.Thread(target=self.process_queue)
@@ -569,6 +579,10 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 for item in file_list.file_paths:
                     logger.info(f"{item} is detected to be removed")
                     del self.cache[item]
+                    # remove from failed files if present
+                    if item in self.failed_files:
+                        self.failed_files.remove(item)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                     # create a temporary FileInfo object
                     file_info = FileInfo(
                         file_path=item, relative_path="", modify_time=0, file_md5="")
@@ -578,18 +592,30 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 for file_info in file_list.file_infos:
                     logger.info(
                         f"{file_info.file_path} is detected to be updated")
-                    # process the file and create a CacheItem
-                    # content = process_file_local(
-                    #     self.fileinfo_to_tuple(file_info))
-                    content = process_file_local(file_info.file_path)
-                    self.cache[file_info.file_path] = CacheItem(
-                        file_path=file_info.file_path,
-                        relative_path=file_info.relative_path,
-                        content=[c.model_dump() for c in content],
-                        modify_time=file_info.modify_time,
-                        md5=file_info.file_md5,
-                    )
-                    self.update_storage(file_info, is_delete=False)
+                    try:
+                        content = process_file_local(file_info.file_path)
+                        if content:
+                            self.cache[file_info.file_path] = CacheItem(
+                                file_path=file_info.file_path,
+                                relative_path=file_info.relative_path,
+                                content=[c.model_dump() for c in content],
+                                modify_time=file_info.modify_time,
+                                md5=file_info.file_md5,
+                            )
+                            self.update_storage(file_info, is_delete=False)
+                            # remove from failed files if present
+                            if file_info.file_path in self.failed_files:
+                                self.failed_files.remove(file_info.file_path)
+                                save_failed_files(self.failed_files_path, self.failed_files)
+                        else:
+                            logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                            self.failed_files.add(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
+                    except Exception as e:
+                        logger.error(f"Error in process_queue: {e}")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
+
                 self.write_cache()

     def trigger_update(self):
@@ -598,6 +624,10 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         current_files = set()
         for file_info in self.get_all_files():
             current_files.add(file_info.file_path)
+            # skip failed files
+            if file_info.file_path in self.failed_files:
+                logger.info(f"文件 {file_info.file_path} 之前解析失败,跳过此次更新")
+                continue
             if (
                 file_info.file_path not in self.cache
                 or self.cache[file_info.file_path].md5 != file_info.file_md5
@@ -19,6 +19,7 @@ from loguru import logger
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 import hashlib
+from .failed_files_utils import load_failed_files, save_failed_files


 default_ignore_dirs = [
@@ -45,7 +46,7 @@ def generate_content_md5(content: Union[str, bytes]) -> str:


 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5):
+    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args=None, llm=None):
         """
         Initialize the async update queue that manages the code-file cache.

@@ -91,24 +92,31 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # starts as an empty dict; populated later via read_cache()
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
-
+
+        # set of file paths that previously failed to parse
+        self.failed_files_path = os.path.join(self.path, ".cache", "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         # start the queue-processing thread
         self.queue_thread = threading.Thread(target=self._process_queue)
         self.queue_thread.daemon = True
         self.queue_thread.start()
-
+
         # start the periodic-update thread
         self.update_thread = threading.Thread(target=self._periodic_update)
         self.update_thread.daemon = True
         self.update_thread.start()
-
+
         self.cache = self.read_cache()

+
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -183,13 +191,18 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         files_to_process = []
         current_files = set()
         for file_info in self.get_all_files():
-            file_path, _, _, file_md5 = file_info
+            file_path, relative_path, modify_time, file_md5 = file_info
             current_files.add(file_path)
+            # if the file failed to parse before, skip it in this incremental update
+            if file_path in self.failed_files:
+                logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
+                continue
+            # change detection
             if (
                 file_path not in self.cache
-                or self.cache[file_path].get("md5","") != file_md5
+                or self.cache[file_path].get("md5", "") != file_md5
             ):
-                files_to_process.append(file_info)
+                files_to_process.append((file_path, relative_path, modify_time, file_md5))

         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -213,19 +226,34 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         if isinstance(file_list, DeleteEvent):
             for item in file_list.file_paths:
                 logger.info(f"{item} is detected to be removed")
-                del self.cache[item]
+                if item in self.cache:
+                    del self.cache[item]
+                # on delete, also remove from the failed list (the file may have been fixed)
+                if item in self.failed_files:
+                    self.failed_files.remove(item)
+                    save_failed_files(self.failed_files_path, self.failed_files)
         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
                 logger.info(f"{file_info.file_path} is detected to be updated")
                 try:
                     result = process_file_local(file_info.file_path)
-                    if result:  # only update the cache when result is non-empty
+                    if result:
+                        # parsed successfully with non-empty output
                         self.update_cache(self.fileinfo_to_tuple(file_info), result)
+                        # if it failed before and succeeded this time, clear the failure record
+                        if file_info.file_path in self.failed_files:
+                            self.failed_files.remove(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
                     else:
-                        logger.warning(f"Empty result for file: {file_info.file_path}, skipping cache update")
+                        # an empty result also counts as a parse failure; add it to the failed list
+                        logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
                 except Exception as e:
-                    logger.error(
-                        f"SimpleCache Error in process_queue: {e}")
+                    logger.error(f"SimpleCache Error in process_queue: {e}")
+                    # on parse failure, add to the failed list
+                    self.failed_files.add(file_info.file_path)
+                    save_failed_files(self.failed_files_path, self.failed_files)

         self.write_cache()

@@ -2,7 +2,6 @@ import threading
 from typing import Dict, Generator, List, Tuple, Any, Optional,Union

 from byzerllm import ByzerLLM, SimpleByzerLLM
-
 from loguru import logger
 from autocoder.common import SourceCode
 from uuid import uuid4
@@ -37,6 +36,8 @@ class LocalDocumentRetriever(BaseDocumentRetriever):

     def __init__(
         self,
+        args: AutoCoderArgs,
+        llm: Union[ByzerLLM,SimpleByzerLLM],
         path: str,
         ignore_spec,
         required_exts: list,
@@ -45,9 +46,12 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         single_file_token_limit: int = 60000,
         disable_auto_window: bool = False,
         enable_hybrid_index: bool = False,
-        extra_params: Optional[AutoCoderArgs] = None,
-        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        extra_params: Optional['AutoCoderArgs'] = None,
+        emb_llm: Union['ByzerLLM', 'SimpleByzerLLM'] = None,
     ) -> None:
+        self.args = args
+        self.llm = llm
+
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
@@ -65,27 +69,32 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         if self.enable_hybrid_index:
             if self.on_ray:
                 self.cacher = ByzerStorageCache(
-                    path, ignore_spec, required_exts, extra_params
+                    path, ignore_spec, required_exts, extra_params,
+                    args=self.args, llm=self.llm
                 )
             else:
                 if extra_params.rag_storage_type == "duckdb":
                     self.cacher = LocalDuckDBStorageCache(
                         path, ignore_spec, required_exts, extra_params,
-                        emb_llm=emb_llm
+                        emb_llm=emb_llm,
+                        args=self.args, llm=self.llm
                     )
                 elif extra_params.rag_storage_type in ["byzer-storage", "byzer_storage"]:
                     self.cacher = LocalByzerStorageCache(
                         path, ignore_spec, required_exts, extra_params,
-                        emb_llm=emb_llm
+                        emb_llm=emb_llm,
+                        args=self.args, llm=self.llm
                     )
         else:
             if self.monitor_mode:
                 self.cacher = AutoCoderRAGDocListener(
-                    path, ignore_spec, required_exts
+                    path, ignore_spec, required_exts,
+                    args=self.args, llm=self.llm
                 )
             else:
                 self.cacher = AutoCoderRAGAsyncUpdateQueue(
-                    path, ignore_spec, required_exts
+                    path, ignore_spec, required_exts,
+                    args=self.args, llm=self.llm
                )

         logger.info(f"DocumentRetriever initialized with:")
@@ -183,6 +183,8 @@ class LongContextRAG:
                 "emb_llm is required for local byzer storage cache")

         self.document_retriever = retriever_class(
+            self.args,
+            self.llm,
             self.path,
             self.ignore_spec,
             self.required_exts,
@@ -841,8 +843,7 @@ class LongContextRAG:
                 self._print_rag_stats(rag_stat)
             else:

-                qa_strategy = get_qa_strategy(
-                    self.args.rag_qa_conversation_strategy)
+                qa_strategy = get_qa_strategy(self.args)
                 new_conversations = qa_strategy.create_conversation(
                     documents=[doc.source_code for doc in relevant_docs],
                     conversations=conversations, local_image_host=self.args.local_image_host
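The call site now hands `get_qa_strategy` the whole `args` object instead of just `args.rag_qa_conversation_strategy`, which lets the factory consult other settings. The factory itself is not part of this diff; the sketch below shows one shape consistent with the new call site, and all class and registry names in it are placeholders.

```python
# Hypothetical factory shape consistent with the call-site change above: it
# receives the full args object and reads the strategy name itself. The
# strategy class and registry contents are placeholders.
class SingleRoundStrategy:
    def create_conversation(self, documents, conversations, local_image_host):
        # Placeholder behavior: prepend document context to the conversation.
        context = "\n".join(documents)
        return [{"role": "user", "content": context}] + conversations


def get_qa_strategy(args):
    strategy_name = getattr(args, "rag_qa_conversation_strategy", "single_round")
    registry = {"single_round": SingleRoundStrategy}
    return registry.get(strategy_name, SingleRoundStrategy)()
```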