auto-coder 0.1.348__py3-none-any.whl → 0.1.350__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder was flagged as possibly problematic by the registry diff service.
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/METADATA +1 -1
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/RECORD +35 -26
- autocoder/auto_coder_runner.py +14 -10
- autocoder/chat_auto_coder_lang.py +5 -3
- autocoder/common/model_speed_tester.py +392 -0
- autocoder/common/printer.py +7 -8
- autocoder/common/run_cmd.py +247 -0
- autocoder/common/test_run_cmd.py +110 -0
- autocoder/common/v2/agent/agentic_edit.py +61 -11
- autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
- autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
- autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
- autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
- autocoder/helper/rag_doc_creator.py +141 -0
- autocoder/ignorefiles/__init__.py +4 -0
- autocoder/ignorefiles/ignore_file_utils.py +63 -0
- autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
- autocoder/models.py +48 -8
- autocoder/rag/cache/byzer_storage_cache.py +10 -4
- autocoder/rag/cache/file_monitor_cache.py +27 -24
- autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
- autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
- autocoder/rag/cache/simple_cache.py +56 -37
- autocoder/rag/loaders/filter_utils.py +106 -0
- autocoder/rag/loaders/image_loader.py +45 -23
- autocoder/rag/loaders/pdf_loader.py +3 -3
- autocoder/rag/loaders/test_image_loader.py +209 -0
- autocoder/rag/qa_conversation_strategy.py +3 -5
- autocoder/rag/utils.py +20 -9
- autocoder/utils/_markitdown.py +35 -0
- autocoder/version.py +1 -1
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.348.dist-info → auto_coder-0.1.350.dist-info}/top_level.txt +0 -0
autocoder/rag/cache/local_duckdb_storage_cache.py

@@ -5,15 +5,21 @@ import time
 import platform
 import threading
 from multiprocessing import Pool
+import functools
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Dict, Any, Optional, Tuple, Union
 import numpy as np
 from loguru import logger
+from typing import Union
+from byzerllm import SimpleByzerLLM, ByzerLLM
+from autocoder.utils.llms import get_llm_names
 
 try:
     import duckdb
 except ImportError:
-    logger.error(
+    logger.error(
+        "DuckDB is not installed, please install it using 'pip install duckdb'"
+    )
     raise
 
 from autocoder.common import AutoCoderArgs
@@ -23,7 +29,7 @@ from autocoder.rag.cache.base_cache import (
     DeleteEvent,
     AddOrUpdateEvent,
     FileInfo,
-    CacheItem
+    CacheItem,
 )
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
@@ -36,11 +42,7 @@ else:
     fcntl = None
 
 
-default_ignore_dirs = [
-    "__pycache__",
-    "node_modules",
-    "_images"
-]
+default_ignore_dirs = ["__pycache__", "node_modules", "_images"]
 
 
 def generate_file_md5(file_path: str) -> str:
@@ -80,16 +82,19 @@ class DuckDBLocalContext:
 class LocalDuckdbStorage:
 
     def __init__(
-
-
-
+        self,
+        llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        database_name: str = ":memory:",
+        table_name: str = "documents",
+        embed_dim: Optional[int] = None,
+        persist_dir: str = "./storage",
     ) -> None:
         self.llm = llm
         self.database_name = database_name
         self.table_name = table_name
         self.embed_dim = embed_dim
         self.persist_dir = persist_dir
-        self.cache_dir = os.path.join(self.persist_dir,
+        self.cache_dir = os.path.join(self.persist_dir, ".cache")
         logger.info(f"正在启动 DuckDBVectorStore.")
 
         if self.database_name != ":memory:":
@@ -105,8 +110,10 @@ class LocalDuckdbStorage:
             os.makedirs(self.cache_dir)
         self._initialize()
         self._conn = None
-        logger.info(
-
+        logger.info(
+            f"DuckDBVectorStore 初始化完成, 存储目录: {self.cache_dir}, "
+            f"数据库名称: {self.database_name}, 数据表名称: {self.table_name}"
+        )
 
     @classmethod
     def class_name(cls) -> str:
@@ -127,37 +134,47 @@ class LocalDuckdbStorage:
         # 生成固定随机投影矩阵(避免每次调用重新生成)
         np.random.seed(42)  # 固定随机种子保证一致性
         source_dim = len(embedding)
-        projection_matrix = np.random.randn(source_dim, target_dim) / np.sqrt(
+        projection_matrix = np.random.randn(source_dim, target_dim) / np.sqrt(
+            source_dim
+        )
 
         # 执行投影
         reduced = np.dot(embedding, projection_matrix)
         return reduced
 
-    def _embedding(
+    def _embedding(
+        self, context: str, norm: bool = True, dim: int | None = None
+    ) -> List[float]:
         max_retries = 3
         retry_count = 0
-
+
         while retry_count < max_retries:
             try:
                 embedding = self.llm.emb_query(context)[0].output
-
+
                 if dim:
-                    embedding = self._apply_pca(
-
+                    embedding = self._apply_pca(
+                        embedding, target_dim=dim
+                    )  # 降维后形状 (1024,)
+
                 if norm:
                     embedding = embedding / np.linalg.norm(embedding)
-
+
                 return embedding.tolist()
             except Exception as e:
                 retry_count += 1
                 if retry_count >= max_retries:
-                    logger.error(
+                    logger.error(
+                        f"Failed to get embedding after {max_retries} attempts: {str(e)}"
+                    )
                     raise
-
+
                 # Sleep between 1-5 seconds before retrying
                 sleep_time = 1 + (retry_count * 1.5)
-                logger.warning(
-
+                logger.warning(
+                    f"Embedding API call failed (attempt {retry_count}/{max_retries}). "
+                    f"Error: {str(e)}. Retrying in {sleep_time:.1f} seconds..."
+                )
                 time.sleep(sleep_time)
 
     def _initialize(self) -> None:
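
Despite its name, the reformatted _apply_pca shown above builds a fixed-seed random projection matrix (np.random.seed(42), np.random.randn(source_dim, target_dim) / sqrt(source_dim)) and projects the embedding with a dot product. The snippet below is a minimal, self-contained sketch of that idea, not code from the package; the function name and dimensions are illustrative only.

import numpy as np

def apply_random_projection(embedding, target_dim=1024):
    # Fixed seed so every call rebuilds the same projection matrix,
    # keeping stored vectors and query vectors in the same reduced space.
    np.random.seed(42)
    source_dim = len(embedding)
    projection_matrix = np.random.randn(source_dim, target_dim) / np.sqrt(source_dim)
    reduced = np.dot(embedding, projection_matrix)
    # Normalize so cosine similarity reduces to a dot product downstream.
    return reduced / np.linalg.norm(reduced)

# Example: project a 1536-dim embedding down to 256 dims.
vec = np.random.rand(1536)
print(apply_random_projection(vec, target_dim=256).shape)  # (256,)

The fixed seed matters: if the projection matrix changed between indexing and querying, the stored vectors and the query vector would live in different subspaces and similarity scores would be meaningless.
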
@@ -200,9 +217,7 @@ class LocalDuckdbStorage:
 
     def query_by_path(self, file_path: str):
         _exists_query = f"""SELECT _id FROM {self.table_name} WHERE file_path = ?"""
-        query_params = [
-            file_path
-        ]
+        query_params = [file_path]
         _final_results = []
         if self.database_name == ":memory:":
             _final_results = self._conn.execute(_exists_query, query_params).fetchall()
@@ -213,9 +228,7 @@ class LocalDuckdbStorage:
 
     def delete_by_ids(self, _ids: List[str]):
         _delete_query = f"""DELETE FROM {self.table_name} WHERE _id IN (?);"""
-        query_params = [
-            ','.join(_ids)
-        ]
+        query_params = [",".join(_ids)]
         if self.database_name == ":memory:":
             _final_results = self._conn.execute(_delete_query, query_params).fetchall()
         elif self.database_path is not None:
@@ -223,14 +236,16 @@ class LocalDuckdbStorage:
                 _final_results = _conn.execute(_delete_query, query_params).fetchall()
         return _final_results
 
-    def _node_to_table_row(
+    def _node_to_table_row(
+        self, context_chunk: Dict[str, str | float], dim: int | None = None
+    ) -> Any:
         return (
             context_chunk["_id"],
             context_chunk["file_path"],
             context_chunk["content"],
             context_chunk["raw_content"],
             self._embedding(context_chunk["raw_content"], norm=True, dim=dim),
-            context_chunk["mtime"]
+            context_chunk["mtime"],
         )
 
     def add_doc(self, context_chunk: Dict[str, str | float], dim: int | None = None):
@@ -255,7 +270,11 @@ class LocalDuckdbStorage:
             _table.insert(_row)
 
     def vector_search(
-
+        self,
+        query: str,
+        similarity_value: float = 0.7,
+        similarity_top_k: int = 10,
+        query_dim: int | None = None,
     ):
         """
         list_cosine_similarity: 计算两个列表之间的余弦相似度
@@ -287,23 +306,19 @@ class LocalDuckdbStorage:
         return _final_results
 
 
-efault_ignore_dirs = [
-    "__pycache__",
-    "node_modules",
-    "_images"
-]
+efault_ignore_dirs = ["__pycache__", "node_modules", "_images"]
 
 
 class LocalDuckDBStorageCache(BaseCacheManager):
     def __init__(
-
-
-
-
-
-
-
-
+        self,
+        path,
+        ignore_spec,
+        required_exts,
+        extra_params: Optional[AutoCoderArgs] = None,
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        args: Optional[AutoCoderArgs] = None,
+        llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
    ):
         self.path = path
         self.ignore_spec = ignore_spec
@@ -316,7 +331,7 @@ class LocalDuckDBStorageCache(BaseCacheManager):
             llm=emb_llm,
             database_name="byzerai_store_duckdb.db",
             table_name="rag_duckdb",
-            persist_dir=self.path
+            persist_dir=self.path,
         )
         self.queue = []
         self.chunk_size = 1000
@@ -332,6 +347,7 @@ class LocalDuckDBStorageCache(BaseCacheManager):
 
         # failed files support
        from .failed_files_utils import load_failed_files
+
         self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
         self.failed_files = load_failed_files(self.failed_files_path)
 
@@ -417,7 +433,12 @@ class LocalDuckDBStorageCache(BaseCacheManager):
 
     @staticmethod
     def fileinfo_to_tuple(file_info: FileInfo) -> Tuple[str, str, float, str]:
-        return
+        return (
+            file_info.file_path,
+            file_info.relative_path,
+            file_info.modify_time,
+            file_info.file_md5,
+        )
 
     def build_cache(self):
         """Build the cache by reading files and storing in DuckDBVectorStore"""
@@ -426,8 +447,8 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         files_to_process = []
         for file_info in self.get_all_files():
             if (
-
-
+                file_info.file_path not in self.cache
+                or self.cache[file_info.file_path].md5 != file_info.file_md5
             ):
                 files_to_process.append(file_info)
 
@@ -436,17 +457,20 @@ class LocalDuckDBStorageCache(BaseCacheManager):
 
         from autocoder.rag.token_counter import initialize_tokenizer
 
+        llm_name = get_llm_names(self.llm)[0] if self.llm else None
+        product_mode = self.args.product_mode
         with Pool(
-
-
-
+            processes=os.cpu_count(),
+            initializer=initialize_tokenizer,
+            initargs=(VariableHolder.TOKENIZER_PATH,),
         ) as pool:
             target_files_to_process = []
             for file_info in files_to_process:
-                target_files_to_process.append(
-
-
-
+                target_files_to_process.append(self.fileinfo_to_tuple(file_info))
+            worker_func = functools.partial(
+                process_file_in_multi_process, llm=llm_name, product_mode=product_mode
+            )
+            results = pool.map(worker_func, target_files_to_process)
 
             items = []
             for file_info, result in zip(files_to_process, results):
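
The build_cache hunk above binds the fixed keyword arguments with functools.partial so that pool.map can fan one file tuple at a time out to the worker processes. Below is a minimal, self-contained sketch of that pattern; the worker function and its argument values are placeholders, not the package's real process_file_in_multi_process.

import functools
import os
from multiprocessing import Pool

def process_item(item, llm=None, product_mode=None):
    # Stand-in for the real per-file worker: receives one work item plus
    # the keyword arguments bound below.
    return f"{item} processed with llm={llm}, product_mode={product_mode}"

if __name__ == "__main__":
    work_items = ["a.md", "b.md", "c.md"]
    # pool.map passes a single positional argument per item, so the fixed
    # keyword arguments are bound up front with functools.partial.
    worker_func = functools.partial(process_item, llm="some-model", product_mode="lite")
    with Pool(processes=os.cpu_count()) as pool:
        results = pool.map(worker_func, work_items)
    print(results)

Because functools.partial objects are picklable (unlike a closure or lambda defined inside the method), this works with multiprocessing on all start methods.
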
@@ -480,37 +504,43 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         if items:
             logger.info("[BUILD CACHE] Clearing existing cache from DuckDB Storage")
             self.storage.truncate_table()
-            logger.info(
-
+            logger.info(
+                f"[BUILD CACHE] Preparing to write to DuckDB Storage, "
+                f"total chunks: {len(items)}, total files: {len(files_to_process)}"
+            )
 
             # Use a fixed optimal batch size instead of dividing by worker count
             batch_size = 100  # Optimal batch size for Byzer Storage
-            item_batches = [
+            item_batches = [
+                items[i : i + batch_size] for i in range(0, len(items), batch_size)
+            ]
 
             total_batches = len(item_batches)
             completed_batches = 0
 
-            logger.info(
-
+            logger.info(
+                f"[BUILD CACHE] Starting to write to DuckDB Storage using {batch_size} items per batch, "
+                f"total batches: {total_batches}"
+            )
             start_time = time.time()
 
-            # Use more workers to process the smaller batches efficiently
-            max_workers = min(
-
+            # Use more workers to process the smaller batches efficiently
+            max_workers = min(
+                self.extra_params.rag_index_build_workers, total_batches
+            )  # Cap at 10 workers or total batch count
+            logger.info(
+                f"[BUILD CACHE] Using {max_workers} parallel workers for processing"
+            )
 
             def batch_add_doc(_batch):
                 for b in _batch:
                     self.storage.add_doc(b, dim=self.extra_params.rag_duckdb_vector_dim)
 
-            with
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                 futures = []
                 # Submit all batches to the executor upfront (non-blocking)
                 for batch in item_batches:
-                    futures.append(
-                        executor.submit(
-                            batch_add_doc, batch
-                        )
-                    )
+                    futures.append(executor.submit(batch_add_doc, batch))
 
                 # Wait for futures to complete
                 for future in as_completed(futures):
@@ -518,13 +548,19 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                         future.result()
                         completed_batches += 1
                         elapsed = time.time() - start_time
-                        estimated_total =
+                        estimated_total = (
+                            elapsed / completed_batches * total_batches
+                            if completed_batches > 0
+                            else 0
+                        )
                         remaining = estimated_total - elapsed
 
                         # Only log progress at reasonable intervals to reduce log spam
-                        if (
-
-
+                        if (
+                            (completed_batches == 1)
+                            or (completed_batches == total_batches)
+                            or (completed_batches % max(1, total_batches // 10) == 0)
+                        ):
                             logger.info(
                                 f"[BUILD CACHE] Progress: {completed_batches}/{total_batches} batches completed "
                                 f"({(completed_batches / total_batches * 100):.1f}%) "
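
The write path above groups the chunks into fixed-size batches, submits each batch to a thread pool, and only logs progress for the first batch, the last batch, and roughly every 10% in between. A compact sketch of the same pattern with a dummy write function (nothing here comes from the package itself):

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def write_batch(batch):
    # Placeholder for the per-batch storage writes; simulates I/O latency.
    time.sleep(0.01 * len(batch))

items = list(range(1000))
batch_size = 100
item_batches = [items[i : i + batch_size] for i in range(0, len(items), batch_size)]
total_batches = len(item_batches)

completed = 0
start = time.time()
with ThreadPoolExecutor(max_workers=min(4, total_batches)) as executor:
    futures = [executor.submit(write_batch, b) for b in item_batches]
    for future in as_completed(futures):
        future.result()
        completed += 1
        # Log only at ~10% intervals, plus the first and last batch.
        if completed == 1 or completed == total_batches or completed % max(1, total_batches // 10) == 0:
            elapsed = time.time() - start
            eta = elapsed / completed * total_batches - elapsed
            print(f"{completed}/{total_batches} batches done, ETA {eta:.1f}s")
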
@@ -533,11 +569,15 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                     except Exception as e:
                         logger.error(f"[BUILD CACHE] Error saving batch: {str(e)}")
                         # Add more detailed error information
-                        logger.error(
-
+                        logger.error(
+                            f"[BUILD CACHE] Error details: batch size: "
+                            f"{len(batch) if 'batch' in locals() else 'unknown'}"
+                        )
 
             total_time = time.time() - start_time
-            logger.info(
+            logger.info(
+                f"[BUILD CACHE] All chunks written, total time: {total_time:.2f}s"
+            )
 
     def update_storage(self, file_info: FileInfo, is_delete: bool):
         results = self.storage.query_by_path(file_info.file_path)
@@ -548,7 +588,8 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         items = []
         if not is_delete:
             content = [
-                SourceCode.model_validate(doc)
+                SourceCode.model_validate(doc)
+                for doc in self.cache[file_info.file_path].content
             ]
             modify_time = self.cache[file_info.file_path].modify_time
             for doc in content:
@@ -567,7 +608,9 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         if items:
             for _chunk in items:
                 try:
-                    self.storage.add_doc(
+                    self.storage.add_doc(
+                        _chunk, dim=self.extra_params.rag_duckdb_vector_dim
+                    )
                     time.sleep(self.extra_params.anti_quota_limit)
                 except Exception as err:
                     logger.error(f"Error in saving chunk: {str(err)}")
@@ -585,15 +628,19 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 save_failed_files(self.failed_files_path, self.failed_files)
                 # 创建一个临时的 FileInfo 对象
                 file_info = FileInfo(
-                    file_path=item, relative_path="", modify_time=0, file_md5=""
+                    file_path=item, relative_path="", modify_time=0, file_md5=""
+                )
                 self.update_storage(file_info, is_delete=True)
 
         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
-                logger.info(
-                    f"{file_info.file_path} is detected to be updated")
+                logger.info(f"{file_info.file_path} is detected to be updated")
                 try:
-                    content = process_file_local(
+                    content = process_file_local(
+                        file_info.file_path,
+                        llm=self.llm,
+                        product_mode=self.product_mode,
+                    )
                     if content:
                         self.cache[file_info.file_path] = CacheItem(
                             file_path=file_info.file_path,
@@ -606,9 +653,13 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                         # remove from failed files if present
                        if file_info.file_path in self.failed_files:
                            self.failed_files.remove(file_info.file_path)
-                            save_failed_files(
+                            save_failed_files(
+                                self.failed_files_path, self.failed_files
+                            )
                     else:
-                        logger.warning(
+                        logger.warning(
+                            f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update"
+                        )
                         self.failed_files.add(file_info.file_path)
                         save_failed_files(self.failed_files_path, self.failed_files)
                 except Exception as e:
@@ -629,8 +680,8 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                     logger.info(f"文件 {file_info.file_path} 之前解析失败,跳过此次更新")
                     continue
                 if (
-
-
+                    file_info.file_path not in self.cache
+                    or self.cache[file_info.file_path].md5 != file_info.file_md5
                 ):
                     files_to_process.append(file_info)
 
@@ -647,8 +698,11 @@ class LocalDuckDBStorageCache(BaseCacheManager):
     def get_all_files(self) -> List[FileInfo]:
         all_files = []
         for root, dirs, files in os.walk(self.path, followlinks=True):
-            dirs[:] = [
-
+            dirs[:] = [
+                d
+                for d in dirs
+                if not d.startswith(".") and d not in default_ignore_dirs
+            ]
 
             if self.ignore_spec:
                 relative_root = os.path.relpath(root, self.path)
@@ -665,7 +719,7 @@ class LocalDuckDBStorageCache(BaseCacheManager):
 
             for file in files:
                 if self.required_exts and not any(
-
+                    file.endswith(ext) for ext in self.required_exts
                 ):
                     continue
 
@@ -678,18 +732,22 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                         file_path=file_path,
                         relative_path=relative_path,
                         modify_time=modify_time,
-                        file_md5=file_md5
+                        file_md5=file_md5,
+                    )
+                )
 
         return all_files
 
-    def _get_single_cache(
+    def _get_single_cache(
+        self, query: str, options: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
         """
         使用单个查询检索缓存文档
-
+
         参数:
             query: 查询字符串
            options: 包含查询选项的字典
-
+
         返回:
             包含文档信息的字典列表,每个字典包含_id、file_path、mtime和score字段
         """
@@ -703,38 +761,35 @@ class LocalDuckDBStorageCache(BaseCacheManager):
             query,
             similarity_value=self.extra_params.rag_duckdb_query_similarity,
             similarity_top_k=self.extra_params.rag_duckdb_query_top_k,
-            query_dim=self.extra_params.rag_duckdb_vector_dim
+            query_dim=self.extra_params.rag_duckdb_vector_dim,
         )
-
+
         # Convert tuples to dictionaries for the merger
         for _id, file_path, mtime, score in search_results:
-            results.append(
-                "_id": _id,
-
-
-                "score": score
-            })
-
+            results.append(
+                {"_id": _id, "file_path": file_path, "mtime": mtime, "score": score}
+            )
+
         logger.info(f"查询 '{query}' 返回 {len(results)} 条记录")
         return results
-
+
     def _process_search_results(self, results: List[Dict[str, Any]]) -> Dict[str, Dict]:
         """
         处理搜索结果,提取文件路径并构建结果字典
-
+
         参数:
             results: 搜索结果列表,每项包含文档信息的字典
-
+
         返回:
             匹配文档的字典,键为文件路径,值为文件内容
-
+
         说明:
             该方法会根据查询结果从缓存中提取文件内容,并记录累计token数,
            当累计token数超过max_output_tokens时,将停止处理并返回已处理的结果。
         """
         # 记录被处理的总tokens数
         total_tokens = 0
-
+
         # Group results by file_path and reconstruct documents while preserving order
         # 这里还可以有排序优化,综合考虑一篇内容出现的次数以及排序位置
         file_paths = []
@@ -755,64 +810,84 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 logger.info(
                     f"当前检索已超出用户设置 Hybrid Index Max Tokens:{self.max_output_tokens},"
                     f"累计tokens: {total_tokens}, "
-                    f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档"
+                    f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档"
+                )
                 return result
             total_tokens += doc["tokens"]
             result[file_path] = cached_data.model_dump()
         logger.info(
             f"用户Hybrid Index Max Tokens设置为:{self.max_output_tokens},"
             f"累计tokens: {total_tokens}, "
-            f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档"
+            f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档"
+        )
         return result
 
     def get_cache(self, options: Optional[Dict[str, Any]] = None) -> Dict[str, Dict]:
         """
         获取缓存中的文档信息
-
+
         参数:
             options: 包含查询参数的字典,可以包含以下键:
                 - queries: 查询列表,可以是单个查询或多个查询
                 - enable_vector_search: 是否启用向量搜索,默认为True
                 - merge_strategy: 多查询时的合并策略,默认为WEIGHTED_RANK
                 - max_results: 最大结果数,默认为None表示不限制
-
+
         返回:
             匹配文档的字典,键为文件路径,值为文件内容
         """
         self.trigger_update()  # 检查更新
 
         if options is None or "queries" not in options:
-            return {
+            return {
+                file_path: self.cache[file_path].model_dump()
+                for file_path in self.cache
+            }
 
         queries = options.get("queries", [])
-
+
         # 如果没有查询或只有一个查询,使用原来的方法
         if not queries:
-            return {
+            return {
+                file_path: self.cache[file_path].model_dump()
+                for file_path in self.cache
+            }
         elif len(queries) == 1:
             results = self._get_single_cache(queries[0], options)
             return self._process_search_results(results)
-
+
         # 导入合并策略
-        from autocoder.rag.cache.cache_result_merge import
-
+        from autocoder.rag.cache.cache_result_merge import (
+            CacheResultMerger,
+            MergeStrategy,
+        )
+
         # 获取合并策略
-        merge_strategy_name = options.get(
+        merge_strategy_name = options.get(
+            "merge_strategy", MergeStrategy.WEIGHTED_RANK.value
+        )
         try:
             merge_strategy = MergeStrategy(merge_strategy_name)
         except ValueError:
-            logger.warning(
+            logger.warning(
+                f"未知的合并策略: {merge_strategy_name}, 使用默认策略 WEIGHTED_RANK"
+            )
             merge_strategy = MergeStrategy.WEIGHTED_RANK
-
+
         # 限制最大结果数
         max_results = options.get("max_results", None)
         merger = CacheResultMerger(max_results=max_results)
-
+
         # 并发处理多个查询
-        logger.info(
+        logger.info(
+            f"处理多查询请求,查询数量: {len(queries)}, 合并策略: {merge_strategy}"
+        )
         query_results = []
         with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
-            future_to_query = {
+            future_to_query = {
+                executor.submit(self._get_single_cache, query, options): query
+                for query in queries
+            }
             for future in as_completed(future_to_query):
                 query = future_to_query[future]
                 try:
@@ -821,12 +896,12 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                     query_results.append((query, query_result))
                 except Exception as e:
                     logger.error(f"处理查询 '{query}' 时出错: {str(e)}")
-
+
         logger.info(f"所有查询共返回 {sum(len(r) for _, r in query_results)} 条记录")
-
+
         # 使用策略合并结果
         merged_results = merger.merge(query_results, strategy=merge_strategy)
         logger.info(f"合并后的结果共 {len(merged_results)} 条记录")
-
+
         # 处理合并后的结果
        return self._process_search_results(merged_results)
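
In the final hunks, get_cache runs multiple queries concurrently on a thread pool and merges the per-query hits with a configurable strategy (WEIGHTED_RANK by default). The sketch below is a rough, self-contained illustration of that flow only: search_one and merge_by_rank are placeholders standing in for the package's _get_single_cache and CacheResultMerger, and the simple best-score merge is not the actual WEIGHTED_RANK algorithm.

from concurrent.futures import ThreadPoolExecutor, as_completed

def search_one(query):
    # Placeholder retrieval: returns (file_path, score) style hits for one query.
    return [{"file_path": f"docs/{query}.md", "score": 0.9}]

def merge_by_rank(query_results):
    # Toy merge: keep the best score seen for each file_path, then rank by score.
    best = {}
    for _query, hits in query_results:
        for hit in hits:
            path = hit["file_path"]
            if path not in best or hit["score"] > best[path]["score"]:
                best[path] = hit
    return sorted(best.values(), key=lambda h: h["score"], reverse=True)

queries = ["duckdb cache", "vector search", "token budget"]
query_results = []
# Fan the queries out to a small thread pool and collect results as they finish.
with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
    future_to_query = {executor.submit(search_one, q): q for q in queries}
    for future in as_completed(future_to_query):
        query_results.append((future_to_query[future], future.result()))

print(merge_by_rank(query_results))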