auto-coder 0.1.289__py3-none-any.whl → 0.1.290__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/METADATA +2 -2
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/RECORD +19 -16
- autocoder/auto_coder_rag.py +10 -0
- autocoder/common/__init__.py +4 -0
- autocoder/rag/api_server.py +48 -0
- autocoder/rag/cache/byzer_storage_cache.py +254 -44
- autocoder/rag/cache/cache_result_merge.py +265 -0
- autocoder/rag/cache/file_monitor_cache.py +117 -4
- autocoder/rag/cache/local_byzer_storage_cache.py +286 -58
- autocoder/rag/cache/rag_file_meta.py +494 -0
- autocoder/rag/cache/simple_cache.py +67 -3
- autocoder/rag/conversation_to_queries.py +139 -0
- autocoder/rag/long_context_rag.py +9 -3
- autocoder/rag/qa_conversation_strategy.py +21 -10
- autocoder/version.py +1 -1
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/top_level.txt +0 -0
autocoder/rag/cache/byzer_storage_cache.py

@@ -28,6 +28,8 @@ import platform
 import hashlib
 from typing import Union
 from pydantic import BaseModel
+from autocoder.rag.cache.cache_result_merge import CacheResultMerger, MergeStrategy
+import time

 if platform.system() != "Windows":
     import fcntl
@@ -64,6 +66,69 @@ class ByzerStorageCache(BaseCacheManager):
         required_exts,
         extra_params: Optional[AutoCoderArgs] = None,
     ):
+        """
+        Initialize the RAG cache manager backed by cloud Byzer Storage.
+
+        Parameters:
+            path: root directory of the codebase to index
+            ignore_spec: rules describing which files/directories should be ignored
+            required_exts: list of file extensions to process
+            extra_params: extra configuration, including vector-index related settings
+
+        Cache structure (self.cache):
+            self.cache is a dict keyed by file path, with CacheItem values:
+            {
+                "file_path1": CacheItem(
+                    file_path: str,       # absolute path of the file
+                    relative_path: str,   # path relative to the project root
+                    content: List[Dict],  # structured file content; each element is a serialized SourceCode object
+                    modify_time: float,   # timestamp of the file's last modification
+                    md5: str              # MD5 hash of the file content, used to detect changes
+                ),
+                "file_path2": CacheItem(...),
+                ...
+            }
+
+        The cache has two storage layers:
+        1. Local file cache: stored in .cache/byzer_storage_speedup.jsonl under the project root
+           - used to track file changes and to load quickly
+           - JSONL format, one CacheItem JSON object per line
+
+        2. Cloud Byzer Storage vector database:
+           - stores content chunks and vector embeddings for each file
+           - each file is split into text chunks of size chunk_size
+           - every chunk gets a vector embedding for semantic search
+           - stored fields include: file path, content chunk, raw content, vector embedding, modify time
+
+        Source-code processing flow:
+        Two key functions are used while updating the cache:
+
+        1. process_file_in_multi_process: processes files in a multi-process pool
+           - argument: file_info (a tuple of file information)
+           - returns: List[SourceCode] or None
+           - used to process many files in parallel during the initial cache build
+
+        2. process_file_local: processes a single file in the current process
+           - argument: file_path
+           - returns: List[SourceCode] or None
+           - used to process a single file when an update is detected
+
+        After a file is processed, the manager will:
+        1. update the in-memory cache (self.cache)
+        2. persist the cache to the local file
+        3. chunk the content and write it to the Byzer Storage vector database
+
+        Update mechanism:
+        - file changes are handled asynchronously in a dedicated thread
+        - MD5 hashes detect whether a file has changed
+        - add, update, and delete events are supported
+        - the vector database provides semantic retrieval with similarity search
+
+        Differences from LocalByzerStorageCache:
+        - uses cloud Byzer Storage instead of local storage
+        - suited to scenarios that need remote access and shared indexes
+        - supports large-scale distributed retrieval and more advanced queries
+        """
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
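The docstring above spells out the CacheItem layout and the JSONL persistence layer. Below is a minimal sketch of that shape, assuming pydantic v2 (`model_dump`); the `CacheItemSketch` model and `write_jsonl_cache` helper are illustrative names only, not auto-coder APIs.

```python
# Illustrative sketch of the CacheItem layout described in the docstring above.
# CacheItemSketch and write_jsonl_cache are hypothetical names.
import json
from typing import Dict, List
from pydantic import BaseModel


class CacheItemSketch(BaseModel):
    file_path: str       # absolute path of the file
    relative_path: str   # path relative to the project root
    content: List[Dict]  # serialized SourceCode objects
    modify_time: float   # last-modified timestamp
    md5: str             # content hash used for change detection


def write_jsonl_cache(cache: Dict[str, CacheItemSketch], cache_file: str) -> None:
    """Persist the in-memory cache as JSONL, one CacheItem per line."""
    with open(cache_file, "w", encoding="utf-8") as f:
        for item in cache.values():
            f.write(json.dumps(item.model_dump(), ensure_ascii=False) + "\n")
```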
@@ -185,7 +250,7 @@ class ByzerStorageCache(BaseCacheManager):

     def build_cache(self):
         """Build the cache by reading files and storing in Byzer Storage"""
-        logger.info(f"
+        logger.info(f"[BUILD CACHE] Starting cache build for path: {self.path}")

         files_to_process = []
         for file_info in self.get_all_files():
@@ -195,11 +260,15 @@ class ByzerStorageCache(BaseCacheManager):
             ):
                 files_to_process.append(file_info)

+        logger.info(f"[BUILD CACHE] Found {len(files_to_process)} files to process")
         if not files_to_process:
+            logger.info("[BUILD CACHE] No files to process, cache build completed")
             return

         from autocoder.rag.token_counter import initialize_tokenizer

+        logger.info("[BUILD CACHE] Starting parallel file processing...")
+        start_time = time.time()
         with Pool(
             processes=os.cpu_count(),
             initializer=initialize_tokenizer,
@@ -209,6 +278,8 @@ class ByzerStorageCache(BaseCacheManager):
             for file_info in files_to_process:
                 target_files_to_process.append(self.fileinfo_to_tuple(file_info))
             results = pool.map(process_file_in_multi_process, target_files_to_process)
+            processing_time = time.time() - start_time
+            logger.info(f"[BUILD CACHE] File processing completed, time elapsed: {processing_time:.2f}s")

         items = []
         for file_info, result in zip(files_to_process, results):
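The build_cache hunks above add timing around the multi-process file-processing step. A small sketch of that pattern follows, a `multiprocessing.Pool` with a per-worker initializer plus wall-clock timing; `initialize_worker` and `process_file` are hypothetical stand-ins for `initialize_tokenizer` and `process_file_in_multi_process`.

```python
# Sketch of a process pool with a per-worker initializer plus wall-clock timing,
# mirroring the pattern used in build_cache. Names here are stand-ins.
import os
import time
from multiprocessing import Pool


def initialize_worker():
    """Stand-in for initialize_tokenizer: per-process setup runs once per worker."""
    pass


def process_file(file_tuple):
    """Stand-in for process_file_in_multi_process."""
    return file_tuple


if __name__ == "__main__":
    files = [("a.py", 1.0), ("b.py", 2.0)]
    start_time = time.time()
    with Pool(processes=os.cpu_count(), initializer=initialize_worker) as pool:
        results = pool.map(process_file, files)
    print(f"Processed {len(results)} files in {time.time() - start_time:.2f}s")
```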
@@ -222,109 +293,176 @@ class ByzerStorageCache(BaseCacheManager):
             )

             for doc in content:
-                logger.info(f"Processing file: {doc.module_name}")
+                logger.info(f"[BUILD CACHE] Processing file: {doc.module_name}")
                 doc.module_name
                 chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                logger.info(f"[BUILD CACHE] File {doc.module_name} chunking completed, total chunks: {len(chunks)}")
                 for chunk_idx, chunk in enumerate(chunks):
                     chunk_item = {
                         "_id": f"{doc.module_name}_{chunk_idx}",
                         "file_path": file_info.file_path,
-                        "content": chunk,
-                        "raw_content": chunk,
-                        "vector": chunk,
+                        "content": chunk[0:self.chunk_size*2],
+                        "raw_content": chunk[0:self.chunk_size*2],
+                        "vector": chunk[0:self.chunk_size*2],
                         "mtime": file_info.modify_time,
                     }
                     items.append(chunk_item)

         # Save to local cache
-        logger.info("Saving cache to local file")
+        logger.info("[BUILD CACHE] Saving cache to local file")
         self.write_cache()

         if items:
-            logger.info("
+            logger.info("[BUILD CACHE] Clearing existing cache from Byzer Storage")
             self.storage.truncate_table()
-            logger.info("
-
-
-
+            logger.info(f"[BUILD CACHE] Preparing to write to Byzer Storage, total chunks: {len(items)}, total files: {len(files_to_process)}")
+
+            # Use a fixed optimal batch size instead of dividing by worker count
+            batch_size = 100  # Optimal batch size for Byzer Storage
+            item_batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

-
-
+            total_batches = len(item_batches)
+            completed_batches = 0

-            logger.info(f"
+            logger.info(f"[BUILD CACHE] Starting to write to Byzer Storage using {batch_size} items per batch, "
+                        f"total batches: {total_batches}")
+            start_time = time.time()
+
+            # Use more workers to process the smaller batches efficiently
+            max_workers = min(10, total_batches)  # Cap at 10 workers or total batch count
+            logger.info(f"[BUILD CACHE] Using {max_workers} parallel workers for processing")

             with ThreadPoolExecutor(max_workers=max_workers) as executor:
                 futures = []
-
+                # Submit all batches to the executor upfront (non-blocking)
+                for batch in item_batches:
                     futures.append(
                         executor.submit(
                             lambda x: self.storage.write_builder().add_items(
                                 x, vector_fields=["vector"], search_fields=["content"]
                             ).execute(),
-
+                            batch
                         )
                     )
-                # Wait for
+                # Wait for futures to complete
                 for future in as_completed(futures):
                     try:
                         future.result()
-
-
+                        completed_batches += 1
+                        elapsed = time.time() - start_time
+                        estimated_total = elapsed / completed_batches * total_batches if completed_batches > 0 else 0
+                        remaining = estimated_total - elapsed
+
+                        # Only log progress at reasonable intervals to reduce log spam
+                        if completed_batches == 1 or completed_batches == total_batches or completed_batches % max(1, total_batches // 10) == 0:
+                            logger.info(
+                                f"[BUILD CACHE] Progress: {completed_batches}/{total_batches} batches completed "
+                                f"({(completed_batches/total_batches*100):.1f}%) "
+                                f"Estimated time remaining: {remaining:.1f}s"
+                            )
                     except Exception as e:
-                        logger.error(f"Error
+                        logger.error(f"[BUILD CACHE] Error saving batch: {str(e)}")
+                        # Add more detailed error information
+                        logger.error(f"[BUILD CACHE] Error details: batch size: {len(batch) if 'batch' in locals() else 'unknown'}")

+            total_time = time.time() - start_time
+            logger.info(f"[BUILD CACHE] All chunks written, total time: {total_time:.2f}s")
             self.storage.commit()
+            logger.info("[BUILD CACHE] Changes committed to Byzer Storage")

     def update_storage(self, file_info: FileInfo, is_delete: bool):
+        """
+        Updates file content in the Byzer Storage vector database.
+
+        Parameters:
+            file_info: FileInfo object containing file path, relative path, modify time, and MD5 hash
+            is_delete: Whether this is a delete operation, True means all records for this file will be removed
+        """
+        logger.info(f"[UPDATE STORAGE] Starting update for file: {file_info.file_path}, is delete: {is_delete}")
+
         query = self.storage.query_builder()
         query.and_filter().add_condition("file_path", file_info.file_path).build()
         results = query.execute()
         if results:
+            logger.info(f"[UPDATE STORAGE] Deleting existing records from Byzer Storage: {len(results)} records")
             for result in results:
                 self.storage.delete_by_ids([result["_id"]])
         items = []

         if not is_delete:
+            logger.info(f"[UPDATE STORAGE] Getting file content from cache and preparing update")
             content = [
                 SourceCode.model_validate(doc)
                 for doc in self.cache[file_info.file_path].content
             ]
             modify_time = self.cache[file_info.file_path].modify_time
             for doc in content:
-                logger.info(f"Processing file: {doc.module_name}")
+                logger.info(f"[UPDATE STORAGE] Processing file: {doc.module_name}")
                 doc.module_name
                 chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                logger.info(f"[UPDATE STORAGE] File {doc.module_name} chunking completed, total chunks: {len(chunks)}")
                 for chunk_idx, chunk in enumerate(chunks):
                     chunk_item = {
                         "_id": f"{doc.module_name}_{chunk_idx}",
                         "file_path": file_info.file_path,
-                        "content": chunk,
-                        "raw_content": chunk,
-                        "vector": chunk,
+                        "content": chunk[0:self.chunk_size*2],
+                        "raw_content": chunk[0:self.chunk_size*2],
+                        "vector": chunk[0:self.chunk_size*2],
                         "mtime": modify_time,
                     }
                     items.append(chunk_item)
             if items:
-
-
-
+                logger.info(f"[UPDATE STORAGE] Starting to write {len(items)} chunks to Byzer Storage")
+                start_time = time.time()
+
+                # Use optimal batch size for larger updates
+                batch_size = 100
+                if len(items) > batch_size:
+                    logger.info(f"[UPDATE STORAGE] Using batched writes with {batch_size} items per batch")
+                    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
+                    total_batches = len(batches)
+
+                    for i, batch in enumerate(batches):
+                        self.storage.write_builder().add_items(
+                            batch, vector_fields=["vector"], search_fields=["content"]
+                        ).execute()
+                        logger.info(f"[UPDATE STORAGE] Progress: {i+1}/{total_batches} batches written")
+                else:
+                    # For small item counts, just use a single write operation
+                    self.storage.write_builder().add_items(
+                        items, vector_fields=["vector"], search_fields=["content"]
+                    ).execute()
+
                 self.storage.commit()
+                elapsed = time.time() - start_time
+                logger.info(f"[UPDATE STORAGE] Write completed, time elapsed: {elapsed:.2f}s")
+            else:
+                logger.info(f"[UPDATE STORAGE] No content to write")

     def process_queue(self):
+        if not self.queue:
+            logger.info("[QUEUE PROCESSING] Queue is empty, nothing to process")
+            return
+
+        logger.info(f"[QUEUE PROCESSING] Starting queue processing, queue length: {len(self.queue)}")
+        start_time = time.time()
+
         while self.queue:
             file_list = self.queue.pop(0)
             if isinstance(file_list, DeleteEvent):
+                logger.info(f"[QUEUE PROCESSING] Processing delete event, total files: {len(file_list.file_paths)}")
                 for item in file_list.file_paths:
-                    logger.info(f"
+                    logger.info(f"[QUEUE PROCESSING] Processing file deletion: {item}")
                     del self.cache[item]
-                    #
+                    # Create a temporary FileInfo object
                     file_info = FileInfo(file_path=item, relative_path="", modify_time=0, file_md5="")
                     self.update_storage(file_info, is_delete=True)

             elif isinstance(file_list, AddOrUpdateEvent):
+                logger.info(f"[QUEUE PROCESSING] Processing add/update event, total files: {len(file_list.file_infos)}")
                 for file_info in file_list.file_infos:
-                    logger.info(f"{file_info.file_path}
-                    #
+                    logger.info(f"[QUEUE PROCESSING] Processing file update: {file_info.file_path}")
+                    # Process file and create CacheItem
                     content = process_file_local(self.fileinfo_to_tuple(file_info))
                     self.cache[file_info.file_path] = CacheItem(
                         file_path=file_info.file_path,
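The build_cache changes above switch to a fixed batch size of 100, a thread pool capped at 10 workers, and progress logging driven by `as_completed`. Below is a hedged sketch of that write pattern; `write_batch` stands in for the `storage.write_builder().add_items(...).execute()` call and is not part of auto-coder.

```python
# Sketch of batched, threaded writes with progress/ETA reporting, following the
# pattern introduced in build_cache. write_batch is a placeholder for the real
# Byzer Storage write call.
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List


def write_batch(batch: List[Dict]) -> int:
    # Placeholder for storage.write_builder().add_items(batch, ...).execute()
    return len(batch)


def write_in_batches(items: List[Dict], batch_size: int = 100) -> None:
    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
    if not batches:
        return
    total, done = len(batches), 0
    start = time.time()
    with ThreadPoolExecutor(max_workers=min(10, total)) as executor:
        futures = [executor.submit(write_batch, batch) for batch in batches]
        for future in as_completed(futures):
            future.result()
            done += 1
            elapsed = time.time() - start
            remaining = elapsed / done * total - elapsed
            print(f"{done}/{total} batches written, ~{remaining:.1f}s remaining")
```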
@@ -335,9 +473,14 @@ class ByzerStorageCache(BaseCacheManager):
                     )
                     self.update_storage(file_info, is_delete=False)
                     self.write_cache()
+
+        elapsed = time.time() - start_time
+        logger.info(f"[QUEUE PROCESSING] Queue processing completed, time elapsed: {elapsed:.2f}s")

     def trigger_update(self):
-        logger.info("
+        logger.info("[TRIGGER UPDATE] Starting file update check...")
+        start_time = time.time()
+
         files_to_process = []
         current_files = set()
         for file_info in self.get_all_files():
@@ -349,26 +492,26 @@ class ByzerStorageCache(BaseCacheManager):
                 files_to_process.append(file_info)

         deleted_files = set(self.cache.keys()) - current_files
-
-        logger.info(f"
+
+        logger.info(f"[TRIGGER UPDATE] Files to process: {len(files_to_process)}")
+        logger.info(f"[TRIGGER UPDATE] Files deleted: {len(deleted_files)}")
+
         if deleted_files:
+            logger.info(f"[TRIGGER UPDATE] Adding delete event to queue")
             with self.lock:
                 self.queue.append(DeleteEvent(file_paths=deleted_files))
         if files_to_process:
+            logger.info(f"[TRIGGER UPDATE] Adding update event to queue")
             with self.lock:
                 self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
+
+        elapsed = time.time() - start_time
+        logger.info(f"[TRIGGER UPDATE] Check completed, time elapsed: {elapsed:.2f}s")

-    def
+    def get_single_cache(self, query: str, options: Dict[str, Any]) -> Dict[str, Dict]:
         """Search cached documents using query"""
-
-        self.trigger_update()
-
-        if options is None or "query" not in options:
-            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
-
-        query = options.get("query", "")
         total_tokens = 0
-
+        logger.info(f"Querying cache, query: {query}")
         # Build query with both vector search and text search
         query_builder = self.storage.query_builder()
         query_builder.set_limit(100000)
@@ -383,6 +526,14 @@ class ByzerStorageCache(BaseCacheManager):

         results = query_builder.execute()

+        return results
+
+
+    def _process_search_results(self, results):
+        """Process the search results: extract file paths and build the result dict."""
+        # Track the total number of tokens included in the result
+        total_tokens = 0
+
         # Group results by file_path and reconstruct documents while preserving order
         # Ranking could be further optimized here by combining how often a document appears with its rank positions
         file_paths = []
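The new `_process_search_results` helper walks the hits in ranking order, groups them by file path, and stops adding files once `max_output_tokens` would be exceeded. A simplified sketch of that token-budgeting loop follows; it assumes each hit carries a `file_path` and each cached document a `tokens` count, and the data shapes are stand-ins rather than the real cache objects.

```python
# Sketch of the token budgeting applied when search hits are converted back into
# cached documents. The hit/cache dict shapes here are simplified stand-ins.
from typing import Dict, List


def budget_results(hits: List[Dict], cache: Dict[str, Dict], max_output_tokens: int) -> Dict[str, Dict]:
    result: Dict[str, Dict] = {}
    total_tokens = 0
    ordered_paths: List[str] = []
    for hit in hits:                      # preserve the ranking order of the hits
        path = hit["file_path"]
        if path not in ordered_paths:
            ordered_paths.append(path)
    for path in ordered_paths:
        cached = cache[path]
        for doc in cached["content"]:
            if total_tokens + doc["tokens"] > max_output_tokens:
                return result             # stop once the token budget is exhausted
            total_tokens += doc["tokens"]
        result[path] = cached
    return result
```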
@@ -400,11 +551,70 @@ class ByzerStorageCache(BaseCacheManager):
             cached_data = self.cache[file_path]
             for doc in cached_data.content:
                 if total_tokens + doc["tokens"] > self.max_output_tokens:
+                    logger.info(f"User token limit: {self.max_output_tokens}, accumulated tokens: {total_tokens}, current file: {file_path} tokens: {doc['tokens']}, result count change: {len(results)} -> {len(result)}")
                     return result
                 total_tokens += doc["tokens"]
             result[file_path] = cached_data.model_dump()
-
+
+        logger.info(f"User token limit: {self.max_output_tokens}, accumulated tokens: {total_tokens}, result count change: {len(results)} -> {len(result)}")
         return result
+
+
+    def get_cache(self, options: Dict[str, Any]) -> Dict[str, Dict]:
+        """
+        Get document information from the cache.
+
+        If options contains a query, search by the query; otherwise return the whole cache.
+        """
+        # options is a dict whose keys are search parameters and values are the corresponding settings
+
+        # Trigger an update check
+        self.trigger_update()
+
+        if options is None or "queries" not in options:
+            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+
+        queries = options.get("queries", [])
+
+        # If there are no queries, or only a single one, use the original path
+        if not queries:
+            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+        elif len(queries) == 1:
+            results = self.get_single_cache(queries[0], options)
+            return self._process_search_results(results)
+
+        # Resolve the merge strategy
+        merge_strategy_name = options.get("merge_strategy", MergeStrategy.WEIGHTED_RANK.value)
+        try:
+            merge_strategy = MergeStrategy(merge_strategy_name)
+        except ValueError:
+            logger.warning(f"Unknown merge strategy: {merge_strategy_name}, falling back to the default")
+            merge_strategy = MergeStrategy.WEIGHTED_RANK
+
+        # Limit the maximum number of results
+        max_results = options.get("max_results", None)
+        merger = CacheResultMerger(max_results=max_results)
+
+        # Run multiple queries concurrently
+        query_results = []
+        with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
+            future_to_query = {executor.submit(self.get_single_cache, query, options): query for query in queries}
+            for future in as_completed(future_to_query):
+                query = future_to_query[future]
+                try:
+                    query_result = future.result()
+                    logger.info(f"Query '{query}' returned {len(query_result)} results")
+                    query_results.append((query, query_result))
+                except Exception as e:
+                    logger.error(f"Error while processing query '{query}': {str(e)}")
+
+        logger.info(f"All queries returned {sum(len(r) for _, r in query_results)} results in total")
+        logger.info(f"Using merge strategy: {merge_strategy}")
+
+        # Merge results with the selected strategy
+        merged_results = merger.merge(query_results, strategy=merge_strategy)
+
+        return self._process_search_results(merged_results)



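The new `get_cache` fans several queries out over a thread pool and hands the per-query result lists to a merge strategy from the new `cache_result_merge.py` module. The sketch below shows the fan-out plus one plausible weighted-rank merge; the scoring is an assumption about what a strategy like `MergeStrategy.WEIGHTED_RANK` might do, not the shipped `CacheResultMerger` implementation.

```python
# Illustrative fan-out and merge of multiple cache queries. The weighted-rank
# scoring below is an assumption, not the CacheResultMerger shipped in
# autocoder/rag/cache/cache_result_merge.py.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Dict, List, Optional, Tuple


def run_queries(queries: List[str], search: Callable[[str], List[Dict]]) -> List[Tuple[str, List[Dict]]]:
    """Run each query concurrently and collect (query, hits) pairs."""
    results: List[Tuple[str, List[Dict]]] = []
    with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
        futures = {executor.submit(search, q): q for q in queries}
        for future in as_completed(futures):
            results.append((futures[future], future.result()))
    return results


def weighted_rank_merge(query_results: List[Tuple[str, List[Dict]]],
                        max_results: Optional[int] = None) -> List[Dict]:
    """Score each hit by 1/(rank+1) summed across queries, then sort by score."""
    scores: Dict[str, float] = {}
    by_id: Dict[str, Dict] = {}
    for _, hits in query_results:
        for rank, hit in enumerate(hits):
            scores[hit["_id"]] = scores.get(hit["_id"], 0.0) + 1.0 / (rank + 1)
            by_id[hit["_id"]] = hit
    merged = sorted(by_id.values(), key=lambda h: scores[h["_id"]], reverse=True)
    return merged[:max_results] if max_results else merged
```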