auto-coder 0.1.347__py3-none-any.whl → 0.1.349__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of auto-coder might be problematic.
Files changed (37)
  1. {auto_coder-0.1.347.dist-info → auto_coder-0.1.349.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.347.dist-info → auto_coder-0.1.349.dist-info}/RECORD +37 -27
  3. autocoder/auto_coder_runner.py +19 -14
  4. autocoder/chat_auto_coder_lang.py +5 -3
  5. autocoder/common/auto_coder_lang.py +3 -3
  6. autocoder/common/model_speed_tester.py +392 -0
  7. autocoder/common/printer.py +7 -8
  8. autocoder/common/run_cmd.py +247 -0
  9. autocoder/common/test_run_cmd.py +110 -0
  10. autocoder/common/v2/agent/agentic_edit.py +82 -29
  11. autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
  12. autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
  13. autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
  14. autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
  15. autocoder/helper/rag_doc_creator.py +141 -0
  16. autocoder/ignorefiles/__init__.py +4 -0
  17. autocoder/ignorefiles/ignore_file_utils.py +63 -0
  18. autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
  19. autocoder/models.py +49 -9
  20. autocoder/plugins/__init__.py +20 -0
  21. autocoder/rag/cache/byzer_storage_cache.py +10 -4
  22. autocoder/rag/cache/file_monitor_cache.py +27 -24
  23. autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
  24. autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
  25. autocoder/rag/cache/simple_cache.py +56 -37
  26. autocoder/rag/loaders/filter_utils.py +106 -0
  27. autocoder/rag/loaders/image_loader.py +573 -0
  28. autocoder/rag/loaders/pdf_loader.py +3 -3
  29. autocoder/rag/loaders/test_image_loader.py +209 -0
  30. autocoder/rag/qa_conversation_strategy.py +3 -5
  31. autocoder/rag/utils.py +20 -9
  32. autocoder/utils/_markitdown.py +35 -0
  33. autocoder/version.py +1 -1
  34. {auto_coder-0.1.347.dist-info → auto_coder-0.1.349.dist-info}/LICENSE +0 -0
  35. {auto_coder-0.1.347.dist-info → auto_coder-0.1.349.dist-info}/WHEEL +0 -0
  36. {auto_coder-0.1.347.dist-info → auto_coder-0.1.349.dist-info}/entry_points.txt +0 -0
  37. {auto_coder-0.1.347.dist-info → auto_coder-0.1.349.dist-info}/top_level.txt +0 -0
@@ -5,15 +5,21 @@ import time
 import platform
 import threading
 from multiprocessing import Pool
+import functools
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Dict, Any, Optional, Tuple, Union
 import numpy as np
 from loguru import logger
+from typing import Union
+from byzerllm import SimpleByzerLLM, ByzerLLM
+from autocoder.utils.llms import get_llm_names

 try:
     import duckdb
 except ImportError:
-    logger.error("DuckDB is not installed, please install it using 'pip install duckdb'")
+    logger.error(
+        "DuckDB is not installed, please install it using 'pip install duckdb'"
+    )
     raise

 from autocoder.common import AutoCoderArgs
@@ -23,7 +29,7 @@ from autocoder.rag.cache.base_cache import (
     DeleteEvent,
     AddOrUpdateEvent,
     FileInfo,
-    CacheItem
+    CacheItem,
 )
 from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
@@ -36,11 +42,7 @@ else:
     fcntl = None


-default_ignore_dirs = [
-    "__pycache__",
-    "node_modules",
-    "_images"
-]
+default_ignore_dirs = ["__pycache__", "node_modules", "_images"]


 def generate_file_md5(file_path: str) -> str:
@@ -80,16 +82,19 @@ class DuckDBLocalContext:
 class LocalDuckdbStorage:

     def __init__(
-        self, llm: Union[ByzerLLM, SimpleByzerLLM] = None, database_name: str = ":memory:",
-        table_name: str = "documents",
-        embed_dim: Optional[int] = None, persist_dir: str = "./storage"
+        self,
+        llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        database_name: str = ":memory:",
+        table_name: str = "documents",
+        embed_dim: Optional[int] = None,
+        persist_dir: str = "./storage",
     ) -> None:
         self.llm = llm
         self.database_name = database_name
         self.table_name = table_name
         self.embed_dim = embed_dim
         self.persist_dir = persist_dir
-        self.cache_dir = os.path.join(self.persist_dir, '.cache')
+        self.cache_dir = os.path.join(self.persist_dir, ".cache")
         logger.info(f"正在启动 DuckDBVectorStore.")

         if self.database_name != ":memory:":
@@ -105,8 +110,10 @@ class LocalDuckdbStorage:
             os.makedirs(self.cache_dir)
         self._initialize()
         self._conn = None
-        logger.info(f"DuckDBVectorStore 初始化完成, 存储目录: {self.cache_dir}, "
-                    f"数据库名称: {self.database_name}, 数据表名称: {self.table_name}")
+        logger.info(
+            f"DuckDBVectorStore 初始化完成, 存储目录: {self.cache_dir}, "
+            f"数据库名称: {self.database_name}, 数据表名称: {self.table_name}"
+        )

     @classmethod
     def class_name(cls) -> str:
@@ -127,37 +134,47 @@ class LocalDuckdbStorage:
         # 生成固定随机投影矩阵(避免每次调用重新生成)
         np.random.seed(42)  # 固定随机种子保证一致性
         source_dim = len(embedding)
-        projection_matrix = np.random.randn(source_dim, target_dim) / np.sqrt(source_dim)
+        projection_matrix = np.random.randn(source_dim, target_dim) / np.sqrt(
+            source_dim
+        )

         # 执行投影
         reduced = np.dot(embedding, projection_matrix)
         return reduced

-    def _embedding(self, context: str, norm: bool = True, dim: int | None = None) -> List[float]:
+    def _embedding(
+        self, context: str, norm: bool = True, dim: int | None = None
+    ) -> List[float]:
         max_retries = 3
         retry_count = 0
-
+
         while retry_count < max_retries:
             try:
                 embedding = self.llm.emb_query(context)[0].output
-
+
                 if dim:
-                    embedding = self._apply_pca(embedding, target_dim=dim)  # 降维后形状 (1024,)
-
+                    embedding = self._apply_pca(
+                        embedding, target_dim=dim
+                    )  # 降维后形状 (1024,)
+
                 if norm:
                     embedding = embedding / np.linalg.norm(embedding)
-
+
                 return embedding.tolist()
             except Exception as e:
                 retry_count += 1
                 if retry_count >= max_retries:
-                    logger.error(f"Failed to get embedding after {max_retries} attempts: {str(e)}")
+                    logger.error(
+                        f"Failed to get embedding after {max_retries} attempts: {str(e)}"
+                    )
                     raise
-
+
                 # Sleep between 1-5 seconds before retrying
                 sleep_time = 1 + (retry_count * 1.5)
-                logger.warning(f"Embedding API call failed (attempt {retry_count}/{max_retries}). "
-                               f"Error: {str(e)}. Retrying in {sleep_time:.1f} seconds...")
+                logger.warning(
+                    f"Embedding API call failed (attempt {retry_count}/{max_retries}). "
+                    f"Error: {str(e)}. Retrying in {sleep_time:.1f} seconds..."
+                )
                 time.sleep(sleep_time)

     def _initialize(self) -> None:
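
For context on the hunk above: _apply_pca does not run a true PCA; it reduces the embedding with a fixed-seed random projection, and _embedding optionally L2-normalises the result and retries the embedding call with a growing sleep. A minimal standalone sketch of the projection idea (illustrative only; the helper name and dimensions below are not part of the package):

    import numpy as np

    def random_project(embedding, target_dim=1024, norm=True):
        # Fixed seed, so every call rebuilds the same projection matrix.
        np.random.seed(42)
        source_dim = len(embedding)
        projection = np.random.randn(source_dim, target_dim) / np.sqrt(source_dim)
        reduced = np.dot(embedding, projection)  # shape: (target_dim,)
        return (reduced / np.linalg.norm(reduced)).tolist() if norm else reduced.tolist()

    print(len(random_project([0.1] * 2048)))  # 1024
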
@@ -200,9 +217,7 @@ class LocalDuckdbStorage:

     def query_by_path(self, file_path: str):
         _exists_query = f"""SELECT _id FROM {self.table_name} WHERE file_path = ?"""
-        query_params = [
-            file_path
-        ]
+        query_params = [file_path]
         _final_results = []
         if self.database_name == ":memory:":
             _final_results = self._conn.execute(_exists_query, query_params).fetchall()
@@ -213,9 +228,7 @@ class LocalDuckdbStorage:

     def delete_by_ids(self, _ids: List[str]):
         _delete_query = f"""DELETE FROM {self.table_name} WHERE _id IN (?);"""
-        query_params = [
-            ','.join(_ids)
-        ]
+        query_params = [",".join(_ids)]
         if self.database_name == ":memory:":
             _final_results = self._conn.execute(_delete_query, query_params).fetchall()
         elif self.database_path is not None:
@@ -223,14 +236,16 @@ class LocalDuckdbStorage:
                 _final_results = _conn.execute(_delete_query, query_params).fetchall()
         return _final_results

-    def _node_to_table_row(self, context_chunk: Dict[str, str | float], dim: int | None = None) -> Any:
+    def _node_to_table_row(
+        self, context_chunk: Dict[str, str | float], dim: int | None = None
+    ) -> Any:
         return (
             context_chunk["_id"],
             context_chunk["file_path"],
             context_chunk["content"],
             context_chunk["raw_content"],
             self._embedding(context_chunk["raw_content"], norm=True, dim=dim),
-            context_chunk["mtime"]
+            context_chunk["mtime"],
         )

     def add_doc(self, context_chunk: Dict[str, str | float], dim: int | None = None):
@@ -255,7 +270,11 @@ class LocalDuckdbStorage:
                 _table.insert(_row)

     def vector_search(
-        self, query: str, similarity_value: float = 0.7, similarity_top_k: int = 10, query_dim: int | None = None
+        self,
+        query: str,
+        similarity_value: float = 0.7,
+        similarity_top_k: int = 10,
+        query_dim: int | None = None,
     ):
         """
         list_cosine_similarity: 计算两个列表之间的余弦相似度
@@ -287,23 +306,19 @@ class LocalDuckdbStorage:
         return _final_results


-efault_ignore_dirs = [
-    "__pycache__",
-    "node_modules",
-    "_images"
-]
+efault_ignore_dirs = ["__pycache__", "node_modules", "_images"]


 class LocalDuckDBStorageCache(BaseCacheManager):
     def __init__(
-            self,
-            path,
-            ignore_spec,
-            required_exts,
-            extra_params: Optional[AutoCoderArgs] = None,
-            emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
-            args=None,
-            llm=None
+        self,
+        path,
+        ignore_spec,
+        required_exts,
+        extra_params: Optional[AutoCoderArgs] = None,
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+        args: Optional[AutoCoderArgs] = None,
+        llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None,
     ):
         self.path = path
         self.ignore_spec = ignore_spec
@@ -316,7 +331,7 @@ class LocalDuckDBStorageCache(BaseCacheManager):
             llm=emb_llm,
             database_name="byzerai_store_duckdb.db",
             table_name="rag_duckdb",
-            persist_dir=self.path
+            persist_dir=self.path,
         )
         self.queue = []
         self.chunk_size = 1000
@@ -332,6 +347,7 @@ class LocalDuckDBStorageCache(BaseCacheManager):

         # failed files support
         from .failed_files_utils import load_failed_files
+
         self.failed_files_path = os.path.join(self.cache_dir, "failed_files.json")
         self.failed_files = load_failed_files(self.failed_files_path)

@@ -417,7 +433,12 @@ class LocalDuckDBStorageCache(BaseCacheManager):

     @staticmethod
     def fileinfo_to_tuple(file_info: FileInfo) -> Tuple[str, str, float, str]:
-        return file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5
+        return (
+            file_info.file_path,
+            file_info.relative_path,
+            file_info.modify_time,
+            file_info.file_md5,
+        )

     def build_cache(self):
         """Build the cache by reading files and storing in DuckDBVectorStore"""
@@ -426,8 +447,8 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         files_to_process = []
         for file_info in self.get_all_files():
             if (
-                    file_info.file_path not in self.cache
-                    or self.cache[file_info.file_path].md5 != file_info.file_md5
+                file_info.file_path not in self.cache
+                or self.cache[file_info.file_path].md5 != file_info.file_md5
             ):
                 files_to_process.append(file_info)

@@ -436,17 +457,20 @@ class LocalDuckDBStorageCache(BaseCacheManager):

         from autocoder.rag.token_counter import initialize_tokenizer

+        llm_name = get_llm_names(self.llm)[0] if self.llm else None
+        product_mode = self.args.product_mode
         with Pool(
-                processes=os.cpu_count(),
-                initializer=initialize_tokenizer,
-                initargs=(VariableHolder.TOKENIZER_PATH,),
+            processes=os.cpu_count(),
+            initializer=initialize_tokenizer,
+            initargs=(VariableHolder.TOKENIZER_PATH,),
         ) as pool:
             target_files_to_process = []
             for file_info in files_to_process:
-                target_files_to_process.append(
-                    self.fileinfo_to_tuple(file_info))
-            results = pool.map(process_file_in_multi_process,
-                               target_files_to_process)
+                target_files_to_process.append(self.fileinfo_to_tuple(file_info))
+            worker_func = functools.partial(
+                process_file_in_multi_process, llm=llm_name, product_mode=product_mode
+            )
+            results = pool.map(worker_func, target_files_to_process)

             items = []
             for file_info, result in zip(files_to_process, results):
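
The hunk above switches build_cache from passing process_file_in_multi_process straight to Pool.map to binding the extra llm and product_mode keyword arguments with functools.partial, since Pool.map only hands each worker a single positional item. A rough standalone sketch of that pattern, with a placeholder worker instead of the real one:

    import functools
    from multiprocessing import Pool

    def process_item(item, llm=None, product_mode=None):
        # Stand-in for process_file_in_multi_process.
        return f"{item}:{llm}:{product_mode}"

    if __name__ == "__main__":
        worker = functools.partial(process_item, llm="emb_model", product_mode="lite")
        with Pool(processes=2) as pool:
            print(pool.map(worker, ["a.md", "b.md"]))
            # ['a.md:emb_model:lite', 'b.md:emb_model:lite']
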
@@ -480,37 +504,43 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         if items:
             logger.info("[BUILD CACHE] Clearing existing cache from DuckDB Storage")
             self.storage.truncate_table()
-            logger.info(f"[BUILD CACHE] Preparing to write to DuckDB Storage, "
-                        f"total chunks: {len(items)}, total files: {len(files_to_process)}")
+            logger.info(
+                f"[BUILD CACHE] Preparing to write to DuckDB Storage, "
+                f"total chunks: {len(items)}, total files: {len(files_to_process)}"
+            )

             # Use a fixed optimal batch size instead of dividing by worker count
             batch_size = 100  # Optimal batch size for Byzer Storage
-            item_batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
+            item_batches = [
+                items[i : i + batch_size] for i in range(0, len(items), batch_size)
+            ]

             total_batches = len(item_batches)
             completed_batches = 0

-            logger.info(f"[BUILD CACHE] Starting to write to DuckDB Storage using {batch_size} items per batch, "
-                        f"total batches: {total_batches}")
+            logger.info(
+                f"[BUILD CACHE] Starting to write to DuckDB Storage using {batch_size} items per batch, "
+                f"total batches: {total_batches}"
+            )
             start_time = time.time()

-            # Use more workers to process the smaller batches efficiently
-            max_workers = min(self.extra_params.rag_index_build_workers, total_batches)  # Cap at 10 workers or total batch count
-            logger.info(f"[BUILD CACHE] Using {max_workers} parallel workers for processing")
+            # Use more workers to process the smaller batches efficiently
+            max_workers = min(
+                self.extra_params.rag_index_build_workers, total_batches
+            )  # Cap at 10 workers or total batch count
+            logger.info(
+                f"[BUILD CACHE] Using {max_workers} parallel workers for processing"
+            )

             def batch_add_doc(_batch):
                 for b in _batch:
                     self.storage.add_doc(b, dim=self.extra_params.rag_duckdb_vector_dim)

-            with (ThreadPoolExecutor(max_workers=max_workers) as executor):
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                 futures = []
                 # Submit all batches to the executor upfront (non-blocking)
                 for batch in item_batches:
-                    futures.append(
-                        executor.submit(
-                            batch_add_doc, batch
-                        )
-                    )
+                    futures.append(executor.submit(batch_add_doc, batch))

                 # Wait for futures to complete
                 for future in as_completed(futures):
@@ -518,13 +548,19 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                         future.result()
                         completed_batches += 1
                         elapsed = time.time() - start_time
-                        estimated_total = elapsed / completed_batches * total_batches if completed_batches > 0 else 0
+                        estimated_total = (
+                            elapsed / completed_batches * total_batches
+                            if completed_batches > 0
+                            else 0
+                        )
                         remaining = estimated_total - elapsed

                         # Only log progress at reasonable intervals to reduce log spam
-                        if ((completed_batches == 1) or
-                                (completed_batches == total_batches) or
-                                (completed_batches % max(1, total_batches // 10) == 0)):
+                        if (
+                            (completed_batches == 1)
+                            or (completed_batches == total_batches)
+                            or (completed_batches % max(1, total_batches // 10) == 0)
+                        ):
                             logger.info(
                                 f"[BUILD CACHE] Progress: {completed_batches}/{total_batches} batches completed "
                                 f"({(completed_batches / total_batches * 100):.1f}%) "
@@ -533,11 +569,15 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                     except Exception as e:
                         logger.error(f"[BUILD CACHE] Error saving batch: {str(e)}")
                         # Add more detailed error information
-                        logger.error(f"[BUILD CACHE] Error details: batch size: "
-                                     f"{len(batch) if 'batch' in locals() else 'unknown'}")
+                        logger.error(
+                            f"[BUILD CACHE] Error details: batch size: "
+                            f"{len(batch) if 'batch' in locals() else 'unknown'}"
+                        )

             total_time = time.time() - start_time
-            logger.info(f"[BUILD CACHE] All chunks written, total time: {total_time:.2f}s")
+            logger.info(
+                f"[BUILD CACHE] All chunks written, total time: {total_time:.2f}s"
+            )

     def update_storage(self, file_info: FileInfo, is_delete: bool):
         results = self.storage.query_by_path(file_info.file_path)
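
The reformatted block above keeps the same write strategy: slice the chunk list into fixed-size batches, submit every batch to a ThreadPoolExecutor up front, and log progress as futures complete. A condensed sketch of that pattern with a stand-in save_batch in place of storage.add_doc:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    items = list(range(1005))
    batch_size = 100
    batches = [items[i : i + batch_size] for i in range(0, len(items), batch_size)]

    def save_batch(batch):
        # Stand-in for writing one batch of chunks to storage.
        return len(batch)

    with ThreadPoolExecutor(max_workers=min(10, len(batches))) as executor:
        futures = [executor.submit(save_batch, b) for b in batches]
        completed = 0
        for future in as_completed(futures):
            completed += 1
            print(f"{completed}/{len(batches)} batches done, last size {future.result()}")
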
@@ -548,7 +588,8 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         items = []
         if not is_delete:
             content = [
-                SourceCode.model_validate(doc) for doc in self.cache[file_info.file_path].content
+                SourceCode.model_validate(doc)
+                for doc in self.cache[file_info.file_path].content
             ]
             modify_time = self.cache[file_info.file_path].modify_time
             for doc in content:
@@ -567,7 +608,9 @@ class LocalDuckDBStorageCache(BaseCacheManager):
         if items:
             for _chunk in items:
                 try:
-                    self.storage.add_doc(_chunk, dim=self.extra_params.rag_duckdb_vector_dim)
+                    self.storage.add_doc(
+                        _chunk, dim=self.extra_params.rag_duckdb_vector_dim
+                    )
                     time.sleep(self.extra_params.anti_quota_limit)
                 except Exception as err:
                     logger.error(f"Error in saving chunk: {str(err)}")
@@ -585,15 +628,19 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 save_failed_files(self.failed_files_path, self.failed_files)
                 # 创建一个临时的 FileInfo 对象
                 file_info = FileInfo(
-                    file_path=item, relative_path="", modify_time=0, file_md5="")
+                    file_path=item, relative_path="", modify_time=0, file_md5=""
+                )
                 self.update_storage(file_info, is_delete=True)

         elif isinstance(file_list, AddOrUpdateEvent):
             for file_info in file_list.file_infos:
-                logger.info(
-                    f"{file_info.file_path} is detected to be updated")
+                logger.info(f"{file_info.file_path} is detected to be updated")
                 try:
-                    content = process_file_local(file_info.file_path)
+                    content = process_file_local(
+                        file_info.file_path,
+                        llm=self.llm,
+                        product_mode=self.product_mode,
+                    )
                     if content:
                         self.cache[file_info.file_path] = CacheItem(
                             file_path=file_info.file_path,
@@ -606,9 +653,13 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                         # remove from failed files if present
                         if file_info.file_path in self.failed_files:
                             self.failed_files.remove(file_info.file_path)
-                            save_failed_files(self.failed_files_path, self.failed_files)
+                            save_failed_files(
+                                self.failed_files_path, self.failed_files
+                            )
                     else:
-                        logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                        logger.warning(
+                            f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update"
+                        )
                         self.failed_files.add(file_info.file_path)
                         save_failed_files(self.failed_files_path, self.failed_files)
                 except Exception as e:
@@ -629,8 +680,8 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 logger.info(f"文件 {file_info.file_path} 之前解析失败,跳过此次更新")
                 continue
             if (
-                    file_info.file_path not in self.cache
-                    or self.cache[file_info.file_path].md5 != file_info.file_md5
+                file_info.file_path not in self.cache
+                or self.cache[file_info.file_path].md5 != file_info.file_md5
             ):
                 files_to_process.append(file_info)

@@ -647,8 +698,11 @@ class LocalDuckDBStorageCache(BaseCacheManager):
     def get_all_files(self) -> List[FileInfo]:
         all_files = []
         for root, dirs, files in os.walk(self.path, followlinks=True):
-            dirs[:] = [d for d in dirs if not d.startswith(
-                ".") and d not in default_ignore_dirs]
+            dirs[:] = [
+                d
+                for d in dirs
+                if not d.startswith(".") and d not in default_ignore_dirs
+            ]

             if self.ignore_spec:
                 relative_root = os.path.relpath(root, self.path)
@@ -665,7 +719,7 @@ class LocalDuckDBStorageCache(BaseCacheManager):

             for file in files:
                 if self.required_exts and not any(
-                        file.endswith(ext) for ext in self.required_exts
+                    file.endswith(ext) for ext in self.required_exts
                 ):
                     continue

@@ -678,18 +732,22 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                         file_path=file_path,
                         relative_path=relative_path,
                         modify_time=modify_time,
-                        file_md5=file_md5))
+                        file_md5=file_md5,
+                    )
+                )

         return all_files

-    def _get_single_cache(self, query: str, options: Dict[str, Any]) -> List[Dict[str, Any]]:
+    def _get_single_cache(
+        self, query: str, options: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
         """
         使用单个查询检索缓存文档
-
+
         参数:
             query: 查询字符串
             options: 包含查询选项的字典
-
+
         返回:
             包含文档信息的字典列表,每个字典包含_id、file_path、mtime和score字段
         """
@@ -703,38 +761,35 @@ class LocalDuckDBStorageCache(BaseCacheManager):
             query,
             similarity_value=self.extra_params.rag_duckdb_query_similarity,
             similarity_top_k=self.extra_params.rag_duckdb_query_top_k,
-            query_dim=self.extra_params.rag_duckdb_vector_dim
+            query_dim=self.extra_params.rag_duckdb_vector_dim,
         )
-
+
         # Convert tuples to dictionaries for the merger
         for _id, file_path, mtime, score in search_results:
-            results.append({
-                "_id": _id,
-                "file_path": file_path,
-                "mtime": mtime,
-                "score": score
-            })
-
+            results.append(
+                {"_id": _id, "file_path": file_path, "mtime": mtime, "score": score}
+            )
+
         logger.info(f"查询 '{query}' 返回 {len(results)} 条记录")
         return results
-
+
     def _process_search_results(self, results: List[Dict[str, Any]]) -> Dict[str, Dict]:
         """
         处理搜索结果,提取文件路径并构建结果字典
-
+
         参数:
             results: 搜索结果列表,每项包含文档信息的字典
-
+
         返回:
             匹配文档的字典,键为文件路径,值为文件内容
-
+
         说明:
             该方法会根据查询结果从缓存中提取文件内容,并记录累计token数,
             当累计token数超过max_output_tokens时,将停止处理并返回已处理的结果。
         """
         # 记录被处理的总tokens数
         total_tokens = 0
-
+
         # Group results by file_path and reconstruct documents while preserving order
         # 这里还可以有排序优化,综合考虑一篇内容出现的次数以及排序位置
         file_paths = []
@@ -755,64 +810,84 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                 logger.info(
                     f"当前检索已超出用户设置 Hybrid Index Max Tokens:{self.max_output_tokens},"
                     f"累计tokens: {total_tokens}, "
-                    f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档")
+                    f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档"
+                )
                 return result
             total_tokens += doc["tokens"]
             result[file_path] = cached_data.model_dump()
         logger.info(
             f"用户Hybrid Index Max Tokens设置为:{self.max_output_tokens},"
             f"累计tokens: {total_tokens}, "
-            f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档")
+            f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档"
+        )
         return result

     def get_cache(self, options: Optional[Dict[str, Any]] = None) -> Dict[str, Dict]:
         """
         获取缓存中的文档信息
-
+
         参数:
             options: 包含查询参数的字典,可以包含以下键:
                 - queries: 查询列表,可以是单个查询或多个查询
                 - enable_vector_search: 是否启用向量搜索,默认为True
                 - merge_strategy: 多查询时的合并策略,默认为WEIGHTED_RANK
                 - max_results: 最大结果数,默认为None表示不限制
-
+
         返回:
             匹配文档的字典,键为文件路径,值为文件内容
         """
         self.trigger_update()  # 检查更新

         if options is None or "queries" not in options:
-            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+            return {
+                file_path: self.cache[file_path].model_dump()
+                for file_path in self.cache
+            }

         queries = options.get("queries", [])
-
+
         # 如果没有查询或只有一个查询,使用原来的方法
         if not queries:
-            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+            return {
+                file_path: self.cache[file_path].model_dump()
+                for file_path in self.cache
+            }
         elif len(queries) == 1:
             results = self._get_single_cache(queries[0], options)
             return self._process_search_results(results)
-
+
         # 导入合并策略
-        from autocoder.rag.cache.cache_result_merge import CacheResultMerger, MergeStrategy
-
+        from autocoder.rag.cache.cache_result_merge import (
+            CacheResultMerger,
+            MergeStrategy,
+        )
+
         # 获取合并策略
-        merge_strategy_name = options.get("merge_strategy", MergeStrategy.WEIGHTED_RANK.value)
+        merge_strategy_name = options.get(
+            "merge_strategy", MergeStrategy.WEIGHTED_RANK.value
+        )
         try:
             merge_strategy = MergeStrategy(merge_strategy_name)
         except ValueError:
-            logger.warning(f"未知的合并策略: {merge_strategy_name}, 使用默认策略 WEIGHTED_RANK")
+            logger.warning(
+                f"未知的合并策略: {merge_strategy_name}, 使用默认策略 WEIGHTED_RANK"
+            )
             merge_strategy = MergeStrategy.WEIGHTED_RANK
-
+
         # 限制最大结果数
         max_results = options.get("max_results", None)
         merger = CacheResultMerger(max_results=max_results)
-
+
         # 并发处理多个查询
-        logger.info(f"处理多查询请求,查询数量: {len(queries)}, 合并策略: {merge_strategy}")
+        logger.info(
+            f"处理多查询请求,查询数量: {len(queries)}, 合并策略: {merge_strategy}"
+        )
         query_results = []
         with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
-            future_to_query = {executor.submit(self._get_single_cache, query, options): query for query in queries}
+            future_to_query = {
+                executor.submit(self._get_single_cache, query, options): query
+                for query in queries
+            }
             for future in as_completed(future_to_query):
                 query = future_to_query[future]
                 try:
@@ -821,12 +896,12 @@ class LocalDuckDBStorageCache(BaseCacheManager):
                     query_results.append((query, query_result))
                 except Exception as e:
                     logger.error(f"处理查询 '{query}' 时出错: {str(e)}")
-
+
         logger.info(f"所有查询共返回 {sum(len(r) for _, r in query_results)} 条记录")
-
+
         # 使用策略合并结果
         merged_results = merger.merge(query_results, strategy=merge_strategy)
         logger.info(f"合并后的结果共 {len(merged_results)} 条记录")
-
+
         # 处理合并后的结果
         return self._process_search_results(merged_results)
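
For reference, the multi-query path in get_cache shown above fans each query out to _get_single_cache on a thread pool, keeps the originating query attached to its future, and only then merges the per-query results. A simplified sketch of that fan-out, with a placeholder fetch in place of _get_single_cache and the merge step omitted:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fetch(query: str) -> list:
        # Stand-in for _get_single_cache(query, options).
        return [f"doc-for-{query}"]

    queries = ["install", "configure", "search"]
    query_results = []
    with ThreadPoolExecutor(max_workers=min(len(queries), 10)) as executor:
        future_to_query = {executor.submit(fetch, q): q for q in queries}
        for future in as_completed(future_to_query):
            query = future_to_query[future]
            try:
                query_results.append((query, future.result()))
            except Exception as e:
                print(f"query {query!r} failed: {e}")
    print(query_results)
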