auto-coder 0.1.346__py3-none-any.whl → 0.1.348__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry; it is provided for informational purposes only.
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/METADATA +1 -1
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/RECORD +24 -21
- autocoder/auto_coder_runner.py +5 -4
- autocoder/common/auto_coder_lang.py +8 -0
- autocoder/common/v2/agent/agentic_edit.py +68 -22
- autocoder/common/v2/agent/agentic_edit_tools/__init__.py +2 -0
- autocoder/common/v2/agent/agentic_edit_tools/list_package_info_tool_resolver.py +42 -0
- autocoder/common/v2/agent/agentic_edit_types.py +4 -0
- autocoder/plugins/__init__.py +20 -0
- autocoder/rag/cache/byzer_storage_cache.py +44 -74
- autocoder/rag/cache/failed_files_utils.py +39 -0
- autocoder/rag/cache/file_monitor_cache.py +3 -1
- autocoder/rag/cache/local_byzer_storage_cache.py +45 -73
- autocoder/rag/cache/local_duckdb_storage_cache.py +43 -13
- autocoder/rag/cache/simple_cache.py +40 -12
- autocoder/rag/document_retriever.py +17 -8
- autocoder/rag/loaders/image_loader.py +551 -0
- autocoder/rag/long_context_rag.py +2 -0
- autocoder/rag/qa_conversation_strategy.py +26 -23
- autocoder/version.py +1 -1
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/top_level.txt +0 -0

autocoder/rag/cache/failed_files_utils.py (new file):

@@ -0,0 +1,39 @@
+
+import os
+import json
+from loguru import logger
+
+def load_failed_files(failed_files_path: str) -> set:
+    """
+    Load the set of failed file paths from a JSON file.
+
+    Args:
+        failed_files_path: Path to the JSON file storing failed files.
+
+    Returns:
+        A set of failed file paths.
+    """
+    directory = os.path.dirname(failed_files_path)
+    if not os.path.exists(directory):
+        os.makedirs(directory, exist_ok=True)
+    if os.path.exists(failed_files_path):
+        try:
+            with open(failed_files_path, "r", encoding="utf-8") as f:
+                return set(json.load(f))
+        except Exception:
+            return set()
+    return set()
+
+def save_failed_files(failed_files_path: str, failed_files: set) -> None:
+    """
+    Save the set of failed file paths to a JSON file.
+
+    Args:
+        failed_files_path: Path to the JSON file.
+        failed_files: A set of failed file paths.
+    """
+    try:
+        with open(failed_files_path, "w", encoding="utf-8") as f:
+            json.dump(list(failed_files), f, ensure_ascii=False, indent=2)
+    except Exception as e:
+        logger.error(f"Error saving failed files list: {e}")
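
These two helpers round-trip a plain JSON list on disk, and load_failed_files deliberately swallows read errors by returning an empty set. A minimal usage sketch (the temporary path and file name in it are illustrative only; real callers pass <cache_dir>/failed_files.json):

```python
from autocoder.rag.cache.failed_files_utils import load_failed_files, save_failed_files

path = "/tmp/demo_cache/failed_files.json"  # hypothetical location for this demo

failed = load_failed_files(path)            # empty set on first run or unreadable JSON
failed.add("docs/broken_scan.pdf")          # hypothetical file that failed to parse
save_failed_files(path, failed)

assert "docs/broken_scan.pdf" in load_failed_files(path)
```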

autocoder/rag/cache/file_monitor_cache.py:

@@ -48,7 +48,7 @@ class AutoCoderRAGDocListener(BaseCacheManager):
         r"^test.*$",
     ]
 
-    def __init__(self, path: str, ignore_spec, required_exts: List) -> None:
+    def __init__(self, path: str, ignore_spec, required_exts: List, args=None, llm=None) -> None:
        """
        Initialize the file-monitor cache manager.
 
@@ -89,6 +89,8 @@ class AutoCoderRAGDocListener(BaseCacheManager):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.stop_event = threading.Event()
 
         # connect list

autocoder/rag/cache/local_byzer_storage_cache.py:

@@ -30,6 +30,7 @@ from typing import Union
 from byzerllm import SimpleByzerLLM, ByzerLLM
 from autocoder.rag.cache.cache_result_merge import CacheResultMerger, MergeStrategy
 import time
+from .failed_files_utils import save_failed_files, load_failed_files
 
 if platform.system() != "Windows":
     import fcntl
@@ -70,71 +71,17 @@ class LocalByzerStorageCache(BaseCacheManager):
         emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
         host: str = "127.0.0.1",
         port: int = 33333,
+        args=None,
+        llm=None,
     ):
         """
         Initialize the Byzer Storage based RAG cache manager.
-
-        Parameters:
-            path: root directory of the codebase to index
-            ignore_spec: rules for which files/directories to ignore
-            required_exts: list of file extensions to process
-            extra_params: extra configuration, including vector-index settings
-            emb_llm: ByzerLLM instance used to generate text embeddings
-            host: host address of the Byzer Storage service
-            port: port of the Byzer Storage service
-
-        Cache structure (self.cache):
-        self.cache is a dict keyed by file path with CacheItem values:
-        {
-            "file_path1": CacheItem(
-                file_path: str,       # absolute path of the file
-                relative_path: str,   # path relative to the project root
-                content: List[Dict],  # structured file content; each element is a serialized SourceCode object
-                modify_time: float,   # timestamp of the file's last modification
-                md5: str              # MD5 hash of the file content, used to detect changes
-            ),
-            "file_path2": CacheItem(...),
-            ...
-        }
-
-        The cache has two storage layers:
-        1. Local file cache: .cache/byzer_storage_speedup.jsonl under the project root
-           - used to track file changes and to load quickly
-           - stored as JSONL, one CacheItem JSON per line
-
-        2. Byzer Storage vector database:
-           - stores content chunks and their vector embeddings
-           - each file is split into chunks of size chunk_size
-           - every chunk gets an embedding for semantic search
-           - records contain: file path, content chunk, raw content, embedding, modify time
-
-        Source-code processing flow:
-        Two key functions are used during cache updates:
-
-        1. process_file_in_multi_process: processes files in a multi-process pool
-           - argument: file_info (file-info tuple)
-           - returns: List[SourceCode] or None
-           - used to process many files in parallel during the initial cache build
-
-        2. process_file_local: processes a single file in the current process
-           - argument: file_path (file path)
-           - returns: List[SourceCode] or None
-           - used when a single file update is detected
-
-        After a file is processed, the code:
-        1. updates the in-memory cache (self.cache)
-        2. persists the cache to the local file
-        3. chunks the content and updates the Byzer Storage vector database
-
-        Update mechanism:
-        - file changes are processed asynchronously in a dedicated thread
-        - MD5 hashes detect whether a file has changed
-        - add, update, and delete events are supported
-        - the vector database enables semantic retrieval and similarity search
         """
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.rag_build_name = extra_params.rag_build_name
         self.storage = LocalByzerStorage("byzerai_store",
                                          "rag_test", self.rag_build_name, host=host, port=port,emb_llm=emb_llm)
@@ -153,16 +100,20 @@ class LocalByzerStorageCache(BaseCacheManager):
             self.cache_dir, "byzer_storage_speedup.jsonl")
         self.cache: Dict[str, CacheItem] = {}
 
+        # Create the cache directory
+        if not os.path.exists(self.cache_dir):
+            os.makedirs(self.cache_dir)
+
+        # failed files support
+        self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
         self.thread = threading.Thread(target=self.process_queue)
         self.thread.daemon = True
         self.thread.start()
 
-        # Create the cache directory
-        if not os.path.exists(self.cache_dir):
-            os.makedirs(self.cache_dir)
-
         # Load the cache
         self.cache = self._load_cache()
@@ -485,6 +436,10 @@ class LocalByzerStorageCache(BaseCacheManager):
             for item in file_list.file_paths:
                 logger.info(f"[QUEUE PROCESSING] Processing file deletion: {item}")
                 del self.cache[item]
+                # remove from failed files if present
+                if item in self.failed_files:
+                    self.failed_files.remove(item)
+                    save_failed_files(self.failed_files_path, self.failed_files)
                 # Create a temporary FileInfo object
                 file_info = FileInfo(
                     file_path=item, relative_path="", modify_time=0, file_md5="")
@@ -495,17 +450,30 @@ class LocalByzerStorageCache(BaseCacheManager):
             for file_info in file_list.file_infos:
                 logger.info(
                     f"[QUEUE PROCESSING] Processing file update: {file_info.file_path}")
-                [11 removed lines; their content is not preserved in this extract]
+                try:
+                    content = process_file_local(
+                        self.fileinfo_to_tuple(file_info))
+                    if content:
+                        self.cache[file_info.file_path] = CacheItem(
+                            file_path=file_info.file_path,
+                            relative_path=file_info.relative_path,
+                            content=[c.model_dump() for c in content],
+                            modify_time=file_info.modify_time,
+                            md5=file_info.file_md5,
+                        )
+                        self.update_storage(file_info, is_delete=False)
+                        # remove from failed files if present
+                        if file_info.file_path in self.failed_files:
+                            self.failed_files.remove(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
+                    else:
+                        logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
+                except Exception as e:
+                    logger.error(f"Error in process_queue: {e}")
+                    self.failed_files.add(file_info.file_path)
+                    save_failed_files(self.failed_files_path, self.failed_files)
             self.write_cache()
 
         elapsed = time.time() - start_time
@@ -519,6 +487,10 @@ class LocalByzerStorageCache(BaseCacheManager):
         current_files = set()
         for file_info in self.get_all_files():
             current_files.add(file_info.file_path)
+            # skip failed files
+            if file_info.file_path in self.failed_files:
+                logger.info(f"File {file_info.file_path} previously failed to parse, skipping this update")
+                continue
             if (
                 file_info.file_path not in self.cache
                 or self.cache[file_info.file_path].md5 != file_info.file_md5
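
Across the hunks above, the failure-tracking logic follows one pattern: record a file when its parse returns an empty result or raises, clear the record when the file parses successfully or is deleted, and skip recorded files on later scans. A minimal standalone sketch of that pattern (parse_file stands in for process_file_local; threading, CacheItem, and storage updates are omitted):

```python
from typing import Callable, Dict, List, Optional, Set

def process_update(file_path: str,
                   parse_file: Callable[[str], Optional[List[dict]]],
                   cache: Dict[str, List[dict]],
                   failed_files: Set[str]) -> None:
    # Mirrors the new try/except block in process_queue: an empty result
    # and an exception are both treated as a parse failure.
    try:
        content = parse_file(file_path)
        if content:
            cache[file_path] = content
            failed_files.discard(file_path)  # success clears any old failure record
        else:
            failed_files.add(file_path)      # empty result counts as a failure
    except Exception:
        failed_files.add(file_path)
```

In the real classes every mutation of failed_files is immediately persisted with save_failed_files, so the skip list survives restarts.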

autocoder/rag/cache/local_duckdb_storage_cache.py:

@@ -28,6 +28,7 @@ from autocoder.rag.cache.base_cache import (
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 from byzerllm import SimpleByzerLLM, ByzerLLM
+from .failed_files_utils import save_failed_files, load_failed_files
 
 if platform.system() != "Windows":
     import fcntl
@@ -300,12 +301,16 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         ignore_spec,
         required_exts,
         extra_params: Optional[AutoCoderArgs] = None,
-        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        args=None,
+        llm=None
     ):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
         self.extra_params = extra_params
+        self.args = args
+        self.llm = llm
 
         self.storage = LocalDuckdbStorage(
             llm=emb_llm,
@@ -325,6 +330,11 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         if not os.path.exists(self.cache_dir):
             os.makedirs(self.cache_dir)
 
+        # failed files support
+        from .failed_files_utils import load_failed_files
+        self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
         self.thread = threading.Thread(target=self.process_queue)
@@ -569,6 +579,10 @@ class LocalDuckDBStorageCache(BaseCacheManager):
             for item in file_list.file_paths:
                 logger.info(f"{item} is detected to be removed")
                 del self.cache[item]
+                # remove from failed files if present
+                if item in self.failed_files:
+                    self.failed_files.remove(item)
+                    save_failed_files(self.failed_files_path, self.failed_files)
                 # Create a temporary FileInfo object
                 file_info = FileInfo(
                     file_path=item, relative_path="", modify_time=0, file_md5="")
@@ -578,18 +592,30 @@ class LocalDuckDBStorageCache(BaseCacheManager):
             for file_info in file_list.file_infos:
                 logger.info(
                     f"{file_info.file_path} is detected to be updated")
-                [12 removed lines; their content is not preserved in this extract]
+                try:
+                    content = process_file_local(file_info.file_path)
+                    if content:
+                        self.cache[file_info.file_path] = CacheItem(
+                            file_path=file_info.file_path,
+                            relative_path=file_info.relative_path,
+                            content=[c.model_dump() for c in content],
+                            modify_time=file_info.modify_time,
+                            md5=file_info.file_md5,
+                        )
+                        self.update_storage(file_info, is_delete=False)
+                        # remove from failed files if present
+                        if file_info.file_path in self.failed_files:
+                            self.failed_files.remove(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
+                    else:
+                        logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
+                except Exception as e:
+                    logger.error(f"Error in process_queue: {e}")
+                    self.failed_files.add(file_info.file_path)
+                    save_failed_files(self.failed_files_path, self.failed_files)
+
             self.write_cache()
 
     def trigger_update(self):
@@ -598,6 +624,10 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         current_files = set()
         for file_info in self.get_all_files():
             current_files.add(file_info.file_path)
+            # skip failed files
+            if file_info.file_path in self.failed_files:
+                logger.info(f"File {file_info.file_path} previously failed to parse, skipping this update")
+                continue
             if (
                 file_info.file_path not in self.cache
                 or self.cache[file_info.file_path].md5 != file_info.file_md5
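
The trigger_update hunks in both storage backends gate change detection the same way: known-failed files are skipped before the MD5 comparison, so a file that keeps failing is not reparsed on every scan. A minimal sketch of that scan loop, with FileInfo reduced to a (path, md5) tuple and the cache to a path-to-md5 dict:

```python
from typing import Dict, Iterable, List, Set, Tuple

def select_files_to_process(all_files: Iterable[Tuple[str, str]],
                            cache: Dict[str, str],
                            failed_files: Set[str]) -> List[str]:
    to_process = []
    for file_path, file_md5 in all_files:
        if file_path in failed_files:
            continue  # previously failed files are excluded from incremental updates
        # A file is (re)processed when it is new or its MD5 has changed.
        if file_path not in cache or cache[file_path] != file_md5:
            to_process.append(file_path)
    return to_process
```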

autocoder/rag/cache/simple_cache.py:

@@ -19,6 +19,7 @@ from loguru import logger
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 import hashlib
+from .failed_files_utils import load_failed_files, save_failed_files
 
 
 default_ignore_dirs = [
@@ -45,7 +46,7 @@ def generate_content_md5(content: Union[str, bytes]) -> str:
 
 
 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5):
+    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args=None, llm=None):
        """
        Initialize the async update queue that manages the code-file cache.
 
@@ -91,24 +92,31 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
+        self.args = args
+        self.llm = llm
         self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # initialized as an empty dict; populated later via read_cache()
         self.lock = threading.Lock()
         self.stop_event = threading.Event()
-
+
+        # Set of file paths that previously failed to parse
+        self.failed_files_path = os.path.join(self.path, ".cache", "failed_files.json")
+        self.failed_files = load_failed_files(self.failed_files_path)
+
         # Start the queue-processing thread
         self.queue_thread = threading.Thread(target=self._process_queue)
         self.queue_thread.daemon = True
         self.queue_thread.start()
-
+
         # Start the periodic-update thread
         self.update_thread = threading.Thread(target=self._periodic_update)
         self.update_thread.daemon = True
         self.update_thread.start()
-
+
         self.cache = self.read_cache()
 
+
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -183,13 +191,18 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         files_to_process = []
         current_files = set()
         for file_info in self.get_all_files():
-            file_path,
+            file_path, relative_path, modify_time, file_md5 = file_info
             current_files.add(file_path)
+            # If the file failed to parse before, skip this incremental update
+            if file_path in self.failed_files:
+                logger.info(f"File {file_path} previously failed to parse, skipping this update")
+                continue
+            # Change detection
             if (
                 file_path not in self.cache
-                or self.cache[file_path].get("md5","") != file_md5
+                or self.cache[file_path].get("md5", "") != file_md5
             ):
-                files_to_process.append(
+                files_to_process.append((file_path, relative_path, modify_time, file_md5))
 
         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -213,19 +226,34 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             if isinstance(file_list, DeleteEvent):
                 for item in file_list.file_paths:
                     logger.info(f"{item} is detected to be removed")
-                    [line removed; content not preserved in this extract]
+                    if item in self.cache:
+                        del self.cache[item]
+                    # On delete, also remove from the failed list (the file may have been fixed)
+                    if item in self.failed_files:
+                        self.failed_files.remove(item)
+                        save_failed_files(self.failed_files_path, self.failed_files)
             elif isinstance(file_list, AddOrUpdateEvent):
                 for file_info in file_list.file_infos:
                     logger.info(f"{file_info.file_path} is detected to be updated")
                     try:
                         result = process_file_local(file_info.file_path)
-                        if result:
+                        if result:
+                            # Parsed successfully and non-empty
                             self.update_cache(self.fileinfo_to_tuple(file_info), result)
+                            # If it failed before and succeeded now, clear the failure record
+                            if file_info.file_path in self.failed_files:
+                                self.failed_files.remove(file_info.file_path)
+                                save_failed_files(self.failed_files_path, self.failed_files)
                         else:
-                            [line removed; content not preserved in this extract]
+                            # An empty result is also treated as a parse failure
+                            logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                            self.failed_files.add(file_info.file_path)
+                            save_failed_files(self.failed_files_path, self.failed_files)
                     except Exception as e:
-                        logger.error(
-                            [line removed; content not preserved in this extract]
+                        logger.error(f"SimpleCache Error in process_queue: {e}")
+                        # Parse failed: add to the failed list
+                        self.failed_files.add(file_info.file_path)
+                        save_failed_files(self.failed_files_path, self.failed_files)
 
             self.write_cache()
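
Note the delete path shared by all three caches: a removed file is dropped from the failed list as well as from the cache, so if it later reappears in fixed form it gets a fresh parse attempt instead of being skipped forever. A reduced sketch of that handler, with the containers simplified to plain dict and set:

```python
from typing import Dict, Set

def handle_delete(item: str, cache: Dict[str, object], failed_files: Set[str]) -> bool:
    """Drop a deleted file from the cache and the failed-files set.

    Returns True when the failed set changed; the real code persists it
    with save_failed_files at that point.
    """
    cache.pop(item, None)
    if item in failed_files:
        failed_files.remove(item)
        return True
    return False
```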

autocoder/rag/document_retriever.py:

@@ -2,7 +2,6 @@ import threading
 from typing import Dict, Generator, List, Tuple, Any, Optional,Union
 
 from byzerllm import ByzerLLM, SimpleByzerLLM
-
 from loguru import logger
 from autocoder.common import SourceCode
 from uuid import uuid4
@@ -37,6 +36,8 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
 
     def __init__(
         self,
+        args: AutoCoderArgs,
+        llm: Union[ByzerLLM,SimpleByzerLLM],
         path: str,
         ignore_spec,
         required_exts: list,
@@ -45,9 +46,12 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         single_file_token_limit: int = 60000,
         disable_auto_window: bool = False,
         enable_hybrid_index: bool = False,
-        extra_params: Optional[AutoCoderArgs] = None,
-        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        extra_params: Optional['AutoCoderArgs'] = None,
+        emb_llm: Union['ByzerLLM', 'SimpleByzerLLM'] = None,
     ) -> None:
+        self.args = args
+        self.llm = llm
+
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
@@ -65,27 +69,32 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         if self.enable_hybrid_index:
             if self.on_ray:
                 self.cacher = ByzerStorageCache(
-                    path, ignore_spec, required_exts, extra_params
+                    path, ignore_spec, required_exts, extra_params,
+                    args=self.args, llm=self.llm
                 )
             else:
                 if extra_params.rag_storage_type == "duckdb":
                     self.cacher = LocalDuckDBStorageCache(
                         path, ignore_spec, required_exts, extra_params,
-                        emb_llm=emb_llm
+                        emb_llm=emb_llm,
+                        args=self.args, llm=self.llm
                     )
                 elif extra_params.rag_storage_type in ["byzer-storage", "byzer_storage"]:
                     self.cacher = LocalByzerStorageCache(
                         path, ignore_spec, required_exts, extra_params,
-                        emb_llm=emb_llm
+                        emb_llm=emb_llm,
+                        args=self.args, llm=self.llm
                     )
         else:
             if self.monitor_mode:
                 self.cacher = AutoCoderRAGDocListener(
-                    path, ignore_spec, required_exts
+                    path, ignore_spec, required_exts,
+                    args=self.args, llm=self.llm
                 )
             else:
                 self.cacher = AutoCoderRAGAsyncUpdateQueue(
-                    path, ignore_spec, required_exts
+                    path, ignore_spec, required_exts,
+                    args=self.args, llm=self.llm
                )
 
         logger.info(f"DocumentRetriever initialized with:")
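
Since args and llm are now the first two positional parameters of LocalDocumentRetriever.__init__ and are forwarded to every cache backend, positional call sites need updating. A hedged sketch of the new calling convention (the argument values are placeholders, not taken from the package, and any intermediate keyword defaults are assumed):

```python
from autocoder.common import AutoCoderArgs
from autocoder.rag.document_retriever import LocalDocumentRetriever

args = AutoCoderArgs()  # placeholder; real callers pass their configured args
llm = None              # placeholder for a ByzerLLM / SimpleByzerLLM instance

retriever = LocalDocumentRetriever(
    args,                  # new in 0.1.348: first parameter
    llm,                   # new in 0.1.348: forwarded to the cache backends
    path="/path/to/docs",
    ignore_spec=None,
    required_exts=[".md"],
    extra_params=args,
    emb_llm=None,          # only needed when enable_hybrid_index=True
)
```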