auto-coder 0.1.289__py3-none-any.whl → 0.1.290__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder might be problematic.
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/METADATA +2 -2
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/RECORD +19 -16
- autocoder/auto_coder_rag.py +10 -0
- autocoder/common/__init__.py +4 -0
- autocoder/rag/api_server.py +48 -0
- autocoder/rag/cache/byzer_storage_cache.py +254 -44
- autocoder/rag/cache/cache_result_merge.py +265 -0
- autocoder/rag/cache/file_monitor_cache.py +117 -4
- autocoder/rag/cache/local_byzer_storage_cache.py +286 -58
- autocoder/rag/cache/rag_file_meta.py +494 -0
- autocoder/rag/cache/simple_cache.py +67 -3
- autocoder/rag/conversation_to_queries.py +139 -0
- autocoder/rag/long_context_rag.py +9 -3
- autocoder/rag/qa_conversation_strategy.py +21 -10
- autocoder/version.py +1 -1
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.289.dist-info → auto_coder-0.1.290.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,8 @@ import platform
 import hashlib
 from typing import Union
 from byzerllm import SimpleByzerLLM, ByzerLLM
+from autocoder.rag.cache.cache_result_merge import CacheResultMerger, MergeStrategy
+import time

 if platform.system() != "Windows":
     import fcntl
@@ -69,6 +71,67 @@ class LocalByzerStorageCache(BaseCacheManager):
         host: str = "127.0.0.1",
         port: int = 33333,
     ):
+        """
+        Initialize the Byzer Storage based RAG cache manager.
+
+        Args:
+            path: root directory of the codebase to index
+            ignore_spec: rules specifying which files/directories should be ignored
+            required_exts: list of file extensions to process
+            extra_params: extra configuration parameters, including vector-index settings
+            emb_llm: ByzerLLM instance used to generate text embeddings
+            host: host address of the Byzer Storage service
+            port: port of the Byzer Storage service
+
+        Cache structure (self.cache):
+            self.cache is a dict keyed by file path whose values are CacheItem objects:
+            {
+                "file_path1": CacheItem(
+                    file_path: str,       # absolute path of the file
+                    relative_path: str,   # path relative to the project root
+                    content: List[Dict],  # structured file content; each element is a serialized SourceCode object
+                    modify_time: float,   # timestamp of the file's last modification
+                    md5: str              # MD5 hash of the file content, used to detect changes
+                ),
+                "file_path2": CacheItem(...),
+                ...
+            }
+
+        The cache has two storage layers:
+            1. Local file cache: stored in .cache/byzer_storage_speedup.jsonl under the project root
+               - used to track file changes and to load quickly
+               - stored as JSONL, one CacheItem JSON object per line
+
+            2. Byzer Storage vector database:
+               - stores content chunks and vector embeddings of each file
+               - each file is split into text chunks of size chunk_size
+               - each chunk gets a vector embedding for semantic search
+               - stored fields: file path, content chunk, raw content, vector embedding, modify time
+
+        Source code processing flow:
+            Two key functions are used while updating the cache:
+
+            1. process_file_in_multi_process: processes files in a multiprocess environment
+               - argument: file_info (file info tuple)
+               - returns: List[SourceCode] or None
+               - used to process many files in parallel during the initial cache build
+
+            2. process_file_local: processes a single file in the current process
+               - argument: file_path (file path)
+               - returns: List[SourceCode] or None
+               - used to process a single file when an update is detected
+
+        After a file is processed, the manager will:
+            1. update the in-memory cache (self.cache)
+            2. persist the cache to the local file
+            3. chunk the content and update the Byzer Storage vector database
+
+        Update mechanism:
+            - file changes are processed asynchronously in a separate thread
+            - MD5 hashes are used to detect whether a file has changed
+            - file add, update, and delete events are supported
+            - semantic retrieval via the vector database, with similarity search
+        """
         self.path = path
         self.ignore_spec = ignore_spec
         self.required_exts = required_exts
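To make the two-layer design described in the docstring concrete, here is a minimal sketch (not part of the package) of the local JSONL layer only. The CacheItem fields follow the docstring; the `load_jsonl_cache` / `save_jsonl_cache` helpers and the use of pydantic are illustrative assumptions, not the package's actual API.

```python
import os
from typing import Dict, List

from pydantic import BaseModel


class CacheItem(BaseModel):
    file_path: str       # absolute path of the file
    relative_path: str   # path relative to the project root
    content: List[dict]  # serialized SourceCode objects
    modify_time: float   # last-modification timestamp
    md5: str             # content hash used to detect changes


def load_jsonl_cache(cache_file: str) -> Dict[str, CacheItem]:
    """Load one CacheItem per JSONL line, keyed by file path."""
    cache: Dict[str, CacheItem] = {}
    if not os.path.exists(cache_file):
        return cache
    with open(cache_file, "r", encoding="utf-8") as f:
        for line in f:
            item = CacheItem.model_validate_json(line)
            cache[item.file_path] = item
    return cache


def save_jsonl_cache(cache: Dict[str, CacheItem], cache_file: str) -> None:
    """Persist the in-memory cache, one JSON object per line."""
    directory = os.path.dirname(cache_file)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(cache_file, "w", encoding="utf-8") as f:
        for item in cache.values():
            f.write(item.model_dump_json() + "\n")
```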
@@ -193,7 +256,7 @@ class LocalByzerStorageCache(BaseCacheManager):

     def build_cache(self):
         """Build the cache by reading files and storing in Byzer Storage"""
-        logger.info(f"
+        logger.info(f"[BUILD CACHE] Starting cache build for path: {self.path}")

         files_to_process = []
         for file_info in self.get_all_files():
@@ -203,11 +266,15 @@ class LocalByzerStorageCache(BaseCacheManager):
             ):
                 files_to_process.append(file_info)

+        logger.info(f"[BUILD CACHE] Found {len(files_to_process)} files to process")
         if not files_to_process:
+            logger.info("[BUILD CACHE] No files to process, cache build completed")
             return

         from autocoder.rag.token_counter import initialize_tokenizer

+        logger.info("[BUILD CACHE] Starting parallel file processing...")
+        start_time = time.time()
         with Pool(
             processes=os.cpu_count(),
             initializer=initialize_tokenizer,
@@ -219,6 +286,8 @@ class LocalByzerStorageCache(BaseCacheManager):
                     self.fileinfo_to_tuple(file_info))
             results = pool.map(process_file_in_multi_process,
                                target_files_to_process)
+        processing_time = time.time() - start_time
+        logger.info(f"[BUILD CACHE] File processing completed, time elapsed: {processing_time:.2f}s")

         items = []
         for file_info, result in zip(files_to_process, results):
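For reference, the parallel build step above follows the standard multiprocessing pattern of a worker pool with a per-process initializer. The sketch below is illustrative only: `init_worker` and `process_one_file` are hypothetical stand-ins for initialize_tokenizer and process_file_in_multi_process.

```python
import os
import time
from multiprocessing import Pool


def init_worker():
    # e.g. load a tokenizer once per worker process
    pass


def process_one_file(file_tuple):
    # parse one file and return its structured content
    path, _mtime = file_tuple
    return [{"module_name": path}]


if __name__ == "__main__":
    files = [("a.py", 0.0), ("b.py", 0.0)]
    start = time.time()
    with Pool(processes=os.cpu_count(), initializer=init_worker) as pool:
        results = pool.map(process_one_file, files)
    print(f"processed {len(results)} files in {time.time() - start:.2f}s")
```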
@@ -232,113 +301,201 @@ class LocalByzerStorageCache(BaseCacheManager):
                 )

                 for doc in content:
-                    logger.info(f"Processing file: {doc.module_name}")
+                    logger.info(f"[BUILD CACHE] Processing file: {doc.module_name}")
                     doc.module_name
                     chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                    logger.info(f"[BUILD CACHE] File {doc.module_name} chunking completed, total chunks: {len(chunks)}")
+                    # A chunk may exceed the chunk size; to prevent problems we truncate it
                     for chunk_idx, chunk in enumerate(chunks):
                         chunk_item = {
                             "_id": f"{doc.module_name}_{chunk_idx}",
                             "file_path": file_info.file_path,
-                            "content": chunk,
-                            "raw_content": chunk,
-                            "vector": chunk,
+                            "content": chunk[0:self.chunk_size*2],
+                            "raw_content": chunk[0:self.chunk_size*2],
+                            "vector": chunk[0:self.chunk_size*2],
                             "mtime": file_info.modify_time,
                         }
                         items.append(chunk_item)

         # Save to local cache
-        logger.info("Saving cache to local file")
-        self.write_cache()
-
-        if items:
-            logger.info("
-            self.storage.truncate_table()
-            logger.info("
-
-
-
-
-
-
-
-
-            logger.info(f"
+        logger.info("[BUILD CACHE] Saving cache to local file")
+        self.write_cache()
+
+        if items:
+            logger.info("[BUILD CACHE] Clearing existing cache from Byzer Storage")
+            self.storage.truncate_table()
+            logger.info(f"[BUILD CACHE] Preparing to write to Byzer Storage, total chunks: {len(items)}, total files: {len(files_to_process)}")
+
+            # Use a fixed optimal batch size instead of dividing by worker count
+            batch_size = 100  # Optimal batch size for Byzer Storage
+            item_batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
+
+            total_batches = len(item_batches)
+            completed_batches = 0
+
+            logger.info(f"[BUILD CACHE] Starting to write to Byzer Storage using {batch_size} items per batch, "
+                        f"total batches: {total_batches}")
+            start_time = time.time()
+
+            # Use more workers to process the smaller batches efficiently
+            max_workers = min(10, total_batches)  # Cap at 10 workers or total batch count
+            logger.info(f"[BUILD CACHE] Using {max_workers} parallel workers for processing")

             with ThreadPoolExecutor(max_workers=max_workers) as executor:
                 futures = []
-
+                # Submit all batches to the executor upfront (non-blocking)
+                for batch in item_batches:
                     futures.append(
                         executor.submit(
                             lambda x: self.storage.write_builder().add_items(
                                 x, vector_fields=["vector"], search_fields=["content"]
                             ).execute(),
-
+                            batch
                         )
                     )
-
+
+                # Wait for futures to complete
                 for future in as_completed(futures):
                     try:
                         future.result()
-
-
-
+                        completed_batches += 1
+                        elapsed = time.time() - start_time
+                        estimated_total = elapsed / completed_batches * total_batches if completed_batches > 0 else 0
+                        remaining = estimated_total - elapsed
+
+                        # Only log progress at reasonable intervals to reduce log spam
+                        if completed_batches == 1 or completed_batches == total_batches or completed_batches % max(1, total_batches // 10) == 0:
+                            logger.info(
+                                f"[BUILD CACHE] Progress: {completed_batches}/{total_batches} batches completed "
+                                f"({(completed_batches/total_batches*100):.1f}%) "
+                                f"Estimated time remaining: {remaining:.1f}s"
+                            )
                     except Exception as e:
-                        logger.error(f"Error
+                        logger.error(f"[BUILD CACHE] Error saving batch: {str(e)}")
+                        # Add more detailed error information
+                        logger.error(f"[BUILD CACHE] Error details: batch size: {len(batch) if 'batch' in locals() else 'unknown'}")

+            total_time = time.time() - start_time
+            logger.info(f"[BUILD CACHE] All chunks written, total time: {total_time:.2f}s")
             self.storage.commit()
+            logger.info("[BUILD CACHE] Changes committed to Byzer Storage")

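The write path added above uses a fixed batch size of 100, at most 10 worker threads, and periodic progress logging with a remaining-time estimate. A minimal, self-contained sketch of that pattern follows; `write_batch` is a hypothetical stand-in for the storage write_builder call.

```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List


def write_batch(batch: List[Dict]) -> None:
    """Stand-in for a storage write round trip."""
    time.sleep(0.01)


def write_in_batches(items: List[Dict], batch_size: int = 100) -> None:
    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
    if not batches:
        return
    total = len(batches)
    completed = 0
    start = time.time()
    max_workers = min(10, total)  # cap the worker count, as the diff does
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(write_batch, batch) for batch in batches]
        for future in as_completed(futures):
            future.result()  # surface any write error
            completed += 1
            elapsed = time.time() - start
            remaining = elapsed / completed * total - elapsed
            # log only at ~10% intervals to avoid spamming
            if completed in (1, total) or completed % max(1, total // 10) == 0:
                print(f"{completed}/{total} batches done, ~{remaining:.1f}s remaining")


write_in_batches([{"_id": str(i), "content": "..."} for i in range(1000)])
```

Note that the diff passes each batch as an explicit argument to executor.submit rather than closing over the loop variable, which sidesteps the usual late-binding pitfall with lambdas in loops.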
     def update_storage(self, file_info: FileInfo, is_delete: bool):
+        """
+        Updates file content in the Byzer Storage vector database.
+
+        Parameters:
+            file_info: FileInfo object containing file path, relative path, modify time, and MD5 hash
+            is_delete: Whether this is a delete operation, True means all records for this file will be removed
+
+        Process:
+            1. First query and delete all existing records for this file path from the vector database
+            2. If not a delete operation:
+               a. Get parsed content (SourceCode objects) for the file from local cache
+               b. Iterate through each SourceCode object
+               c. Split its source code into fixed-size (chunk_size) text chunks
+               d. Create items for each chunk containing:
+                  - ID: combination of module name and chunk index
+                  - File path
+                  - Content text
+                  - Raw content (for searching)
+                  - Vector representation (embedding generated by ByzerLLM)
+                  - Modify time
+            3. Write all items to Byzer Storage with vector and search fields specified
+            4. Commit changes to ensure data persistence
+
+        Notes:
+            - This method removes all records for a file before updating to avoid leftovers
+            - File content is processed in chunks, each stored and indexed separately
+            - Vector fields are used for similarity search, content field for full-text search
+        """
+        logger.info(f"[UPDATE STORAGE] Starting update for file: {file_info.file_path}, is delete: {is_delete}")
+
         query = self.storage.query_builder()
         query.and_filter().add_condition("file_path", file_info.file_path).build()
         results = query.execute()
         if results:
+            logger.info(f"[UPDATE STORAGE] Deleting existing records from Byzer Storage: {len(results)} records")
             for result in results:
                 self.storage.delete_by_ids([result["_id"]])
         items = []

         if not is_delete:
+            logger.info(f"[UPDATE STORAGE] Getting file content from cache and preparing update")
             content = [
                 SourceCode.model_validate(doc)
                 for doc in self.cache[file_info.file_path].content
             ]
             modify_time = self.cache[file_info.file_path].modify_time
             for doc in content:
-                logger.info(f"Processing file: {doc.module_name}")
+                logger.info(f"[UPDATE STORAGE] Processing file: {doc.module_name}")
                 doc.module_name
                 chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                logger.info(f"[UPDATE STORAGE] File {doc.module_name} chunking completed, total chunks: {len(chunks)}")
                 for chunk_idx, chunk in enumerate(chunks):
                     chunk_item = {
                         "_id": f"{doc.module_name}_{chunk_idx}",
                         "file_path": file_info.file_path,
-                        "content": chunk,
-                        "raw_content": chunk,
-                        "vector": chunk,
+                        "content": chunk[0:self.chunk_size*2],
+                        "raw_content": chunk[0:self.chunk_size*2],
+                        "vector": chunk[0:self.chunk_size*2],
                         "mtime": modify_time,
                     }
                     items.append(chunk_item)
             if items:
-
-
-
+                logger.info(f"[UPDATE STORAGE] Starting to write {len(items)} chunks to Byzer Storage")
+                start_time = time.time()
+
+                # Use optimal batch size here too
+                batch_size = 100
+                if len(items) > batch_size:
+                    logger.info(f"[UPDATE STORAGE] Using batched writes with {batch_size} items per batch")
+                    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
+                    total_batches = len(batches)
+
+                    for i, batch in enumerate(batches):
+                        self.storage.write_builder().add_items(
+                            batch, vector_fields=["vector"], search_fields=["content"]
+                        ).execute()
+                        logger.info(f"[UPDATE STORAGE] Progress: {i+1}/{total_batches} batches written")
+                else:
+                    # For small item counts, just use a single write operation
+                    self.storage.write_builder().add_items(
+                        items, vector_fields=["vector"], search_fields=["content"]
+                    ).execute()
+
                 self.storage.commit()
+                elapsed = time.time() - start_time
+                logger.info(f"[UPDATE STORAGE] Write completed, time elapsed: {elapsed:.2f}s")
+            else:
+                logger.info(f"[UPDATE STORAGE] No content to write")

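Both build_cache and update_storage now cap every stored field at chunk_size * 2 characters. The sketch below (not part of the package) shows that chunk-item construction; the naive splitter is a hypothetical stand-in for self._chunk_text, which may produce oversized chunks.

```python
from typing import Dict, List


def chunk_text(text: str, chunk_size: int) -> List[str]:
    """Naive fixed-width splitter, standing in for self._chunk_text."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def build_chunk_items(module_name: str, file_path: str, source: str,
                      chunk_size: int, mtime: float) -> List[Dict]:
    items = []
    for idx, chunk in enumerate(chunk_text(source, chunk_size)):
        # A chunk may still exceed chunk_size with smarter splitters, so cap
        # every stored field at chunk_size * 2, mirroring the diff.
        capped = chunk[0:chunk_size * 2]
        items.append({
            "_id": f"{module_name}_{idx}",
            "file_path": file_path,
            "content": capped,
            "raw_content": capped,
            "vector": capped,  # text passed as the vector field; the storage layer produces the embedding
            "mtime": mtime,
        })
    return items


items = build_chunk_items("demo_module", "/tmp/demo.py", "x" * 5000, 1024, 0.0)
print(len(items), len(items[0]["content"]))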
     def process_queue(self):
+        if not self.queue:
+            logger.info("[QUEUE PROCESSING] Queue is empty, nothing to process")
+            return
+
+        logger.info(f"[QUEUE PROCESSING] Starting queue processing, queue length: {len(self.queue)}")
+        start_time = time.time()
+
         while self.queue:
             file_list = self.queue.pop(0)
             if isinstance(file_list, DeleteEvent):
+                logger.info(f"[QUEUE PROCESSING] Processing delete event, total files: {len(file_list.file_paths)}")
                 for item in file_list.file_paths:
-                    logger.info(f"
+                    logger.info(f"[QUEUE PROCESSING] Processing file deletion: {item}")
                     del self.cache[item]
-                    #
+                    # Create a temporary FileInfo object
                     file_info = FileInfo(
                         file_path=item, relative_path="", modify_time=0, file_md5="")
                     self.update_storage(file_info, is_delete=True)

             elif isinstance(file_list, AddOrUpdateEvent):
+                logger.info(f"[QUEUE PROCESSING] Processing add/update event, total files: {len(file_list.file_infos)}")
                 for file_info in file_list.file_infos:
                     logger.info(
-                        f"{file_info.file_path}
-                    #
+                        f"[QUEUE PROCESSING] Processing file update: {file_info.file_path}")
+                    # Process file and create CacheItem
                     content = process_file_local(
                         self.fileinfo_to_tuple(file_info))
                     self.cache[file_info.file_path] = CacheItem(
@@ -350,9 +507,14 @@ class LocalByzerStorageCache(BaseCacheManager):
                     )
                     self.update_storage(file_info, is_delete=False)
                     self.write_cache()
+
+        elapsed = time.time() - start_time
+        logger.info(f"[QUEUE PROCESSING] Queue processing completed, time elapsed: {elapsed:.2f}s")

     def trigger_update(self):
-        logger.info("
+        logger.info("[TRIGGER UPDATE] Starting file update check...")
+        start_time = time.time()
+
         files_to_process = []
         current_files = set()
         for file_info in self.get_all_files():
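trigger_update and process_queue form a small producer/consumer pair: change detection appends DeleteEvent / AddOrUpdateEvent objects under a lock, and a separate pass drains the queue. A stripped-down sketch of that shape (not the package's classes; handler bodies are placeholders):

```python
import threading
from dataclasses import dataclass
from typing import List, Set


@dataclass
class DeleteEvent:
    file_paths: Set[str]


@dataclass
class AddOrUpdateEvent:
    file_infos: List[str]


class UpdateQueue:
    def __init__(self) -> None:
        self.queue: list = []
        self.lock = threading.Lock()

    def trigger_update(self, deleted: Set[str], changed: List[str]) -> None:
        # producer side: enqueue events under the lock
        if deleted:
            with self.lock:
                self.queue.append(DeleteEvent(file_paths=deleted))
        if changed:
            with self.lock:
                self.queue.append(AddOrUpdateEvent(file_infos=changed))

    def process_queue(self) -> None:
        # consumer side: drain events one by one
        while self.queue:
            event = self.queue.pop(0)
            if isinstance(event, DeleteEvent):
                for path in event.file_paths:
                    print(f"delete {path}")
            elif isinstance(event, AddOrUpdateEvent):
                for path in event.file_infos:
                    print(f"update {path}")


q = UpdateQueue()
q.trigger_update(deleted={"old.py"}, changed=["new.py"])
q.process_queue()
```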
@@ -364,27 +526,28 @@ class LocalByzerStorageCache(BaseCacheManager):
                 files_to_process.append(file_info)

         deleted_files = set(self.cache.keys()) - current_files
-
-        logger.info(f"
+
+        logger.info(f"[TRIGGER UPDATE] Files to process: {len(files_to_process)}")
+        logger.info(f"[TRIGGER UPDATE] Files deleted: {len(deleted_files)}")
+
         if deleted_files:
+            logger.info(f"[TRIGGER UPDATE] Adding delete event to queue")
             with self.lock:
                 self.queue.append(DeleteEvent(file_paths=deleted_files))
         if files_to_process:
+            logger.info(f"[TRIGGER UPDATE] Adding update event to queue")
             with self.lock:
                 self.queue.append(AddOrUpdateEvent(
                     file_infos=files_to_process))
+
+        elapsed = time.time() - start_time
+        logger.info(f"[TRIGGER UPDATE] Check completed, time elapsed: {elapsed:.2f}s")

-    def
+    def get_single_cache(self, query: str, options: Dict[str, Any]) -> Dict[str, Dict]:
         """Search cached documents using query"""
-
-        self.trigger_update()
-
-        if options is None or "query" not in options:
-            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
-
-        query = options.get("query", "")
+
         total_tokens = 0
-
+        logger.info(f"Querying cache, query: {query}")
         # Build query with both vector search and text search
         query_builder = self.storage.query_builder()
         query_builder.set_limit(100000)
@@ -399,7 +562,7 @@ class LocalByzerStorageCache(BaseCacheManager):

         results = query_builder.execute()

-        logger.info(f"
+        logger.info(f"From cache retrieved: {len(results)} records")
         # Preview first 5 results with all fields but limited content size
         preview_results = []
         for r in results[:5]:
@@ -409,17 +572,82 @@ class LocalByzerStorageCache(BaseCacheManager):
             if "raw_content" in preview and isinstance(preview["raw_content"], str):
                 preview["raw_content"] = preview["raw_content"][:100] + "..." if len(preview["raw_content"]) > 100 else preview["raw_content"]
             preview_results.append(preview)
-        logger.info(f"
+        logger.info(f"Previewing first 5 records:")

         for r in preview_results:
-            logger.info(f"
-            logger.info(f"
-            #
+            logger.info(f"File path: {r['file_path']}")
+            logger.info(f"Raw content: {r['raw_content']}")
+            # Print other fields
             for k, v in r.items():
                 if k not in ["file_path", "raw_content"]:
                     logger.info(f"{k}: {v}")
             logger.info("-"*100)

+        return results
+
+
+    def get_cache(self, options: Dict[str, Any]) -> Dict[str, Dict]:
+        """
+        Get document information from the cache.
+
+        If options contains a query, search by it; otherwise return all cached entries.
+        """
+        # options is a dict: keys are search parameters, values are the concrete values
+
+        # Trigger an update
+        self.trigger_update()
+
+        # If there is no query parameter, return the whole cache
+        if options is None or "queries" not in options:
+            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+
+        queries = options.get("queries", [])
+
+        # If there are no queries or only one query, use the original method
+        if not queries:
+            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+        elif len(queries) == 1:
+            results = self.get_single_cache(queries[0], options)
+            return self._process_search_results(results)
+
+        # Get the merge strategy
+        merge_strategy_name = options.get("merge_strategy", MergeStrategy.WEIGHTED_RANK.value)
+        try:
+            merge_strategy = MergeStrategy(merge_strategy_name)
+        except ValueError:
+            logger.warning(f"Unknown merge strategy: {merge_strategy_name}, using default strategy")
+            merge_strategy = MergeStrategy.WEIGHTED_RANK
+
+        # Limit the maximum number of results
+        max_results = options.get("max_results", None)
+        merger = CacheResultMerger(max_results=max_results)
+
+        # Process multiple queries concurrently
+        query_results = []
+        with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
+            future_to_query = {executor.submit(self.get_single_cache, query, options): query for query in queries}
+            for future in as_completed(future_to_query):
+                query = future_to_query[future]
+                try:
+                    query_result = future.result()
+                    logger.info(f"Query '{query}' returned {len(query_result)} records")
+                    query_results.append((query, query_result))
+                except Exception as e:
+                    logger.error(f"Error processing query '{query}': {str(e)}")
+
+        logger.info(f"All queries returned {sum(len(r) for _, r in query_results)} records")
+        logger.info(f"Using merge strategy: {merge_strategy}")
+
+        # Merge results using the strategy
+        merged_results = merger.merge(query_results, strategy=merge_strategy)
+
+        return self._process_search_results(merged_results)
+
+    def _process_search_results(self, results):
+        """Process search results: extract file paths and build the result dict"""
+        # Track the total number of tokens processed
+        total_tokens = 0
+
         # Group results by file_path and reconstruct documents while preserving order
         # Further ranking optimization is possible here, considering both how often a document appears and its rank position
         file_paths = []
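get_cache's new multi-query path fans the queries out on a thread pool and merges the per-query results with a CacheResultMerger strategy ("queries", "merge_strategy", and "max_results" are the option keys shown above). The sketch below reproduces only the fan-out/merge shape; `search` is a hypothetical stand-in for get_single_cache, and the naive first-seen de-duplication stands in for the real merge strategies.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Tuple


def search(query: str) -> List[dict]:
    # stand-in: return record dicts for one query
    return [{"_id": f"{query}-0", "file_path": f"{query}.py"}]


def multi_query_search(queries: List[str]) -> List[dict]:
    if not queries:
        return []
    query_results: List[Tuple[str, List[dict]]] = []
    with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
        futures = {executor.submit(search, q): q for q in queries}
        for future in as_completed(futures):
            query_results.append((futures[future], future.result()))

    # Naive merge: keep the first occurrence of each record id across queries.
    merged: Dict[str, dict] = {}
    for _query, records in query_results:
        for record in records:
            merged.setdefault(record["_id"], record)
    return list(merged.values())


print(multi_query_search(["jsonl cache", "vector embeddings"]))
```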
@@ -437,12 +665,12 @@ class LocalByzerStorageCache(BaseCacheManager):
             cached_data = self.cache[file_path]
             for doc in cached_data.content:
                 if total_tokens + doc["tokens"] > self.max_output_tokens:
-                    logger.info(f"
+                    logger.info(f"User tokens set to: {self.max_output_tokens}, cumulative tokens: {total_tokens} current file: {file_path} tokens: {doc['tokens']}, data record count change: {len(results)} -> {len(result)}")
                     return result
                 total_tokens += doc["tokens"]
             result[file_path] = cached_data.model_dump()

-        logger.info(f"
+        logger.info(f"User tokens set to: {self.max_output_tokens}, cumulative tokens: {total_tokens}, data record count change: {len(results)} -> {len(result)}")
         return result

     def get_all_files(self) -> List[FileInfo]:
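_process_search_results applies a simple token budget: documents are accumulated in result order until the next one would push the total past max_output_tokens. A reduced sketch of that cutoff (illustrative only; the real method works on CacheItem contents rather than flat dicts):

```python
from typing import Dict, List


def take_within_budget(ordered_docs: List[Dict], max_output_tokens: int) -> Dict[str, Dict]:
    result: Dict[str, Dict] = {}
    total_tokens = 0
    for doc in ordered_docs:
        if total_tokens + doc["tokens"] > max_output_tokens:
            break  # the diff returns early here and logs the cutoff
        total_tokens += doc["tokens"]
        result[doc["file_path"]] = doc
    return result


docs = [{"file_path": "a.py", "tokens": 900}, {"file_path": "b.py", "tokens": 300}]
print(take_within_budget(docs, 1000))  # only a.py fits the 1000-token budget
```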