auto-coder 0.1.281-py3-none-any.whl → 0.1.283-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of auto-coder might be problematic.

auto_coder-0.1.281.dist-info/METADATA → auto_coder-0.1.283.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: auto-coder
- Version: 0.1.281
+ Version: 0.1.283
  Summary: AutoCoder: AutoCoder
  Author: allwefantasy
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -26,7 +26,7 @@ Requires-Dist: tabulate
  Requires-Dist: jupyter-client
  Requires-Dist: prompt-toolkit
  Requires-Dist: tokenizers
- Requires-Dist: byzerllm[saas] >=0.1.170
+ Requires-Dist: byzerllm[saas] >=0.1.171
  Requires-Dist: patch
  Requires-Dist: diff-match-patch
  Requires-Dist: GitPython
auto_coder-0.1.281.dist-info/RECORD → auto_coder-0.1.283.dist-info/RECORD CHANGED
@@ -1,7 +1,7 @@
  autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  autocoder/auto_coder.py,sha256=cxH77xjLPwCmE6R-o1KpONOZlVNOvbTYV4ilAjL8w8A,65592
  autocoder/auto_coder_lang.py,sha256=Rtupq6N3_HT7JRhDKdgCBcwRaiAnyCOR_Gsp4jUomrI,3229
- autocoder/auto_coder_rag.py,sha256=mX-szIG9T7Mzwoc4QwKp_GyYBcVf6dfsNJnKzYHHl6U,30329
+ autocoder/auto_coder_rag.py,sha256=cwsCpudP6d2CHOUTD-RXw7CCgx-c7sG7_cKqnjzphlk,32973
  autocoder/auto_coder_rag_client_mcp.py,sha256=QRxUbjc6A8UmDMQ8lXgZkjgqtq3lgKYeatJbDY6rSo0,6270
  autocoder/auto_coder_rag_mcp.py,sha256=-RrjNwFaS2e5v8XDIrKR-zlUNUE8UBaeOtojffBrvJo,8521
  autocoder/auto_coder_runner.py,sha256=w-4MCKhOFaoABcDfVoZoonF59UyRso3kghimQYLz3NA,100851
@@ -12,7 +12,7 @@ autocoder/chat_auto_coder_lang.py,sha256=ShOQVOnMA-WlT-fB9OrOer-xQkbcWxJGl-WMPuZ
  autocoder/command_args.py,sha256=9aYJ-AmPxP1sQh6ciw04FWHjSn31f2W9afXFwo8wgx4,30441
  autocoder/lang.py,sha256=U6AjVV8Rs1uLyjFCZ8sT6WWuNUxMBqkXXIOs4S120uk,14511
  autocoder/models.py,sha256=PlG1tKHSHwB57cKLOl5gTl5yTzFUDzCgeHPJU3N9F6Q,9106
- autocoder/version.py,sha256=Fso1WyP_6RMO2-L9K3ndYaM4yp608_gE7DE5dYI85fU,23
+ autocoder/version.py,sha256=gD3sSROI4mWkMlhRoIZLn--lc2LLLHyqeGIDWZ8UCTM,23
  autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  autocoder/agent/auto_demand_organizer.py,sha256=NWSAEsEk94vT3lGjfo25kKLMwYdPcpy9e-i21txPasQ,6942
  autocoder/agent/auto_filegroup.py,sha256=CW7bqp0FW1GIEMnl-blyAc2UGT7O9Mom0q66ITz1ckM,6635
@@ -109,10 +109,10 @@ autocoder/pyproject/__init__.py,sha256=ms-A_pocgGv0oZPEW8JAdXi7G-VSVhkQ6CnWFe535
  autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  autocoder/rag/api_server.py,sha256=xiypCkdbclY0Z3Cmq5FTvtKrfQUV7yKcDaFFUttA2n0,7242
  autocoder/rag/doc_filter.py,sha256=UduVO2mlrngwJICrefjDJTYfdmQ4GcRXrfWDQ7xXksk,14206
- autocoder/rag/document_retriever.py,sha256=5oThtxukGuRFF96o3pHKsk306a8diXbhgSrbqyU2BvM,8894
+ autocoder/rag/document_retriever.py,sha256=MGn6oIPo49BbRC99xmLMFkZrpHfcDfKoGYqWxXF554U,8051
  autocoder/rag/lang.py,sha256=TVNx5m7OtBcdfahzI29tMj9m1yrEm32G1c1zc4ZNIPs,3130
  autocoder/rag/llm_wrapper.py,sha256=Ht5GF5yJtrztoliujsZzx_ooWZmHkd5xLZKcGEiicZw,4303
- autocoder/rag/long_context_rag.py,sha256=X9oNsdgfeP6YuVqGlIWaibXoFgGAexVMKbi1_sTtb8c,39333
+ autocoder/rag/long_context_rag.py,sha256=mI7X_UT_QgL9uGmX1K5jSiRGC0K5o6m3CgtQESaG6Vk,40581
  autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
  autocoder/rag/rag_entry.py,sha256=6TKtErZ0Us9XSV6HgRKXA6yR3SiZGPHpynOKSaR1wgE,2463
  autocoder/rag/raw_rag.py,sha256=BOr0YGf3umjqXOIDVO1LXQ0bIHx8hzBdiubND2ezyxc,2946
@@ -130,6 +130,7 @@ autocoder/rag/cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
  autocoder/rag/cache/base_cache.py,sha256=EaYYYbclMBvnlOUoM7qonnluwZX5oSvUjdvGvFun8_8,742
  autocoder/rag/cache/byzer_storage_cache.py,sha256=gK90pf741CgccCzQ73urBorCqVyAfwU1FAqMtSorWVk,17232
  autocoder/rag/cache/file_monitor_cache.py,sha256=2TnOW8Y81Zc0WA1upRrkmQH18IMdv40CeNccmnTvd3c,4981
+ autocoder/rag/cache/local_byzer_storage_cache.py,sha256=Uhmu5JK0tfZ8NvlcjJzcwtQRhZDpbGp_U6qLXZxVwss,17495
  autocoder/rag/cache/simple_cache.py,sha256=8FMmBAfhAPcdSNUWC6Ga43LBFGXD-klwabVbzm_bciI,9347
  autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
  autocoder/rag/loaders/docx_loader.py,sha256=ZswPqiiLngUEpzLhNNm1nmwEYV7ZHFEfIoXoG7c5GDU,614
@@ -167,9 +168,9 @@ autocoder/utils/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  autocoder/utils/auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  autocoder/utils/auto_coder_utils/chat_stream_out.py,sha256=lkJ_A-sYU36JMzjFWkk3pR6uos8oZHYt9GPsPe_CPAo,11766
  autocoder/utils/chat_auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- auto_coder-0.1.281.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
- auto_coder-0.1.281.dist-info/METADATA,sha256=nEKe-EhczLI_Fo1z2gKr948i-fe07ivbw68WSxyYPlc,2643
- auto_coder-0.1.281.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- auto_coder-0.1.281.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
- auto_coder-0.1.281.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
- auto_coder-0.1.281.dist-info/RECORD,,
+ auto_coder-0.1.283.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ auto_coder-0.1.283.dist-info/METADATA,sha256=pLzj-iE-hpBIpDnMabXu-4cpgkQmR3qSrOMruAEY098,2643
+ auto_coder-0.1.283.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ auto_coder-0.1.283.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
+ auto_coder-0.1.283.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
+ auto_coder-0.1.283.dist-info/RECORD,,
autocoder/auto_coder_rag.py CHANGED
@@ -22,7 +22,7 @@ from loguru import logger
  import asyncio
  from datetime import datetime

- from autocoder.rag.document_retriever import process_file_local
+ from autocoder.rag.utils import process_file_local
  import pkg_resources
  from autocoder.rag.token_counter import TokenCounter
  from autocoder.rag.types import RAGServiceInfo
@@ -186,6 +186,11 @@ def main(input_args: Optional[List[str]] = None):
  build_index_parser.add_argument(
  "--model", default="v3_chat", help=desc["model"]
  )
+
+ build_index_parser.add_argument(
+ "--on_ray", action="store_true", help="Run on Ray"
+ )
+
  build_index_parser.add_argument(
  "--index_model", default="", help=desc["index_model"]
  )
@@ -216,8 +221,7 @@ def main(input_args: Optional[List[str]] = None):
  )
  serve_parser.add_argument("--file", default="", help=desc["file"])
  serve_parser.add_argument("--model", default="v3_chat", help=desc["model"])
- serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
- serve_parser.add_argument("--emb_model", default="", help=desc["emb_model"])
+ serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
  serve_parser.add_argument("--ray_address", default="auto", help=desc["ray_address"])
  serve_parser.add_argument(
  "--index_filter_workers",
@@ -377,6 +381,12 @@ def main(input_args: Optional[List[str]] = None):
  help="The model used for question answering",
  )

+ serve_parser.add_argument(
+ "--emb_model",
+ default="",
+ help="The model used for embedding documents",
+ )
+
  # Benchmark command
  benchmark_parser = subparsers.add_parser(
  "benchmark", help="Benchmark LLM client performance"
@@ -510,10 +520,7 @@ def main(input_args: Optional[List[str]] = None):
  # Generate unique name for RAG build if doc_dir exists
  if server_args.doc_dir:
  auto_coder_args.rag_build_name = generate_unique_name_from_path(server_args.doc_dir)
- logger.info(f"Generated RAG build name: {auto_coder_args.rag_build_name}")
-
- if auto_coder_args.enable_hybrid_index and args.product_mode == "lite":
- raise Exception("Hybrid index is not supported in lite mode")
+ logger.info(f"Generated RAG build name: {auto_coder_args.rag_build_name}")

  if auto_coder_args.enable_hybrid_index and args.product_mode == "pro":
  # Try to connect to storage
@@ -555,16 +562,22 @@ def main(input_args: Optional[List[str]] = None):
  qa_model.skip_nontext_check = True
  llm.setup_sub_client("qa_model", qa_model)

+ if args.emb_model:
+ emb_model = byzerllm.ByzerLLM()
+ emb_model.setup_default_model_name(args.emb_model)
+ emb_model.skip_nontext_check = True
+ llm.setup_sub_client("emb_model", emb_model)
+
  # When hybrid_index is enabled, check the required components
  if auto_coder_args.enable_hybrid_index:
- if not llm.is_model_exist("emb"):
+ if not args.emb_model and not llm.is_model_exist("emb"):
  logger.error(
  "When enable_hybrid_index is true, an 'emb' model must be deployed"
  )
  return
- llm.setup_default_emb_model_name("emb")
+ llm.setup_default_emb_model_name(args.emb_model or "emb")

- elif args.product_mode == "lite":
+ if args.product_mode == "lite":
  from autocoder import models as models_module
  model_info = models_module.get_model_by_name(args.model)
  llm = byzerllm.SimpleByzerLLM(default_model_name=args.model)
@@ -629,6 +642,26 @@ def main(input_args: Optional[List[str]] = None):
  )
  llm.setup_sub_client("qa_model", qa_model)

+ if args.emb_model:
+ model_info = models_module.get_model_by_name(args.emb_model)
+ emb_model = byzerllm.SimpleByzerLLM(default_model_name=args.emb_model)
+ emb_model.deploy(
+ model_path="",
+ pretrained_model_type=model_info["model_type"],
+ udf_name=args.emb_model,
+ infer_params={
+ "saas.base_url": model_info["base_url"],
+ "saas.api_key": model_info["api_key"],
+ "saas.model": model_info["model_name"],
+ "saas.is_reasoning": False
+ }
+ )
+ llm.setup_sub_client("emb_model", emb_model)
+
+ if args.enable_hybrid_index:
+ if not args.emb_model:
+ raise Exception("When enable_hybrid_index is true, an 'emb' model must be specified")
+
  if server_args.doc_dir:
  auto_coder_args.rag_type = "simple"
  auto_coder_args.rag_build_name = generate_unique_name_from_path(server_args.doc_dir)
@@ -675,31 +708,62 @@ def main(input_args: Optional[List[str]] = None):
  auto_coder_args.enable_hybrid_index = True
  auto_coder_args.rag_type = "simple"

- try:
- from byzerllm.apps.byzer_storage.simple_api import ByzerStorage
-
- storage = ByzerStorage("byzerai_store", "rag", "files")
- storage.retrieval.cluster_info("byzerai_store")
- except Exception as e:
- logger.error(
- "When enable_hybrid_index is true, ByzerStorage must be started"
- )
- logger.error("Please run 'byzerllm storage start' first")
- return
+ if args.on_ray:

- llm = byzerllm.ByzerLLM()
- llm.setup_default_model_name(args.model)
+ try:
+ from byzerllm.apps.byzer_storage.simple_api import ByzerStorage

- # When hybrid_index is enabled, check the required components
- if auto_coder_args.enable_hybrid_index:
- if not llm.is_model_exist("emb"):
+ storage = ByzerStorage("byzerai_store", "rag", "files")
+ storage.retrieval.cluster_info("byzerai_store")
+ except Exception as e:
  logger.error(
- "When enable_hybrid_index is true, an 'emb' model must be deployed"
+ "When enable_hybrid_index is true, ByzerStorage must be started"
  )
+ logger.error("Please run 'byzerllm storage start' first")
  return
- llm.setup_default_emb_model_name("emb")

- auto_coder_args.rag_build_name = generate_unique_name_from_path(args.doc_dir)
+ llm = byzerllm.ByzerLLM()
+ llm.setup_default_model_name(args.model)
+
+ # When hybrid_index is enabled, check the required components
+ if auto_coder_args.enable_hybrid_index:
+ if not llm.is_model_exist("emb"):
+ logger.error(
+ "When enable_hybrid_index is true, an 'emb' model must be deployed"
+ )
+ return
+ llm.setup_default_emb_model_name("emb")
+ else:
+ from autocoder import models as models_module
+ model_info = models_module.get_model_by_name(args.model)
+ llm = byzerllm.SimpleByzerLLM(default_model_name=args.model)
+ llm.deploy(
+ model_path="",
+ pretrained_model_type=model_info["model_type"],
+ udf_name=args.model,
+ infer_params={
+ "saas.base_url": model_info["base_url"],
+ "saas.api_key": model_info["api_key"],
+ "saas.model": model_info["model_name"],
+ "saas.is_reasoning": model_info["is_reasoning"]
+ }
+ )
+
+ model_info = models_module.get_model_by_name(args.emb_model)
+ emb_model = byzerllm.SimpleByzerLLM(default_model_name=args.emb_model)
+ emb_model.deploy(
+ model_path="",
+ pretrained_model_type=model_info["model_type"],
+ udf_name=args.emb_model,
+ infer_params={
+ "saas.base_url": model_info["base_url"],
+ "saas.api_key": model_info["api_key"],
+ "saas.model": model_info["model_name"],
+ "saas.is_reasoning": False
+ }
+ )
+ llm.setup_sub_client("emb_model", emb_model)
+
  rag = RAGFactory.get_rag(
  llm=llm,
  args=auto_coder_args,
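
Note on the changes above: 0.1.283 adds an --on_ray flag to build_hybrid_index and a separate --emb_model argument to serve, and attaches the embedding model to the main LLM as an "emb_model" sub-client instead of assuming a Ray-deployed "emb" model. A minimal sketch of that wiring in lite mode, mirroring the added lines (the model name, model type, and SaaS parameters below are placeholders, not values taken from the package):

    import byzerllm

    llm = byzerllm.SimpleByzerLLM(default_model_name="v3_chat")
    # Hypothetical embedding client; deploy() parameters mirror the diff above.
    emb_model = byzerllm.SimpleByzerLLM(default_model_name="emb_demo")
    emb_model.deploy(
        model_path="",
        pretrained_model_type="saas/openai",          # assumed model type
        udf_name="emb_demo",
        infer_params={
            "saas.base_url": "https://example.com/v1",  # placeholder endpoint
            "saas.api_key": "YOUR_API_KEY",
            "saas.model": "text-embedding-demo",
            "saas.is_reasoning": False,
        },
    )
    # Registered under the name that LongContextRAG looks up below.
    llm.setup_sub_client("emb_model", emb_model)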
autocoder/rag/cache/local_byzer_storage_cache.py ADDED
@@ -0,0 +1,457 @@
+ from autocoder.rag.cache.base_cache import (
+ BaseCacheManager,
+ DeleteEvent,
+ AddOrUpdateEvent,
+ FileInfo,
+ CacheItem
+ )
+ from typing import Generator, List, Dict, Any, Optional, Tuple
+ from autocoder.common import SourceCode
+ from loguru import logger
+ import pathspec
+ import os
+ import uuid
+ import json
+ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+ from byzerllm.apps.byzer_storage.local_simple_api import (
+ LocalByzerStorage,
+ DataType,
+ FieldOption,
+ SortOption,
+ )
+ from autocoder.common import AutoCoderArgs
+ import threading
+ from multiprocessing import Pool
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from autocoder.rag.variable_holder import VariableHolder
+ import platform
+ import hashlib
+ from typing import Union
+ from byzerllm import SimpleByzerLLM, ByzerLLM
+
+ if platform.system() != "Windows":
+ import fcntl
+ else:
+ fcntl = None
+
+
+ def generate_file_md5(file_path: str) -> str:
+ md5_hash = hashlib.md5()
+ with open(file_path, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ md5_hash.update(chunk)
+ return md5_hash.hexdigest()
+
+
+ def generate_content_md5(content: Union[str, bytes]) -> str:
+ if isinstance(content, str):
+ content = content.encode("utf-8")
+ md5_hash = hashlib.md5()
+ md5_hash.update(content)
+ return md5_hash.hexdigest()
+
+
+ default_ignore_dirs = [
+ "__pycache__",
+ "node_modules",
+ "_images"
+ ]
+
+
+ class LocalByzerStorageCache(BaseCacheManager):
+ def __init__(
+ self,
+ path,
+ ignore_spec,
+ required_exts,
+ extra_params: Optional[AutoCoderArgs] = None,
+ emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+ ):
+ self.path = path
+ self.ignore_spec = ignore_spec
+ self.required_exts = required_exts
+ self.rag_build_name = extra_params.rag_build_name
+ self.storage = LocalByzerStorage("byzerai_store",
+ "rag_test", self.rag_build_name, host="127.0.0.1", port=33333,emb_llm=emb_llm)
+ self.queue = []
+ self.chunk_size = 1000
+ self._init_schema()
+
+ if not extra_params:
+ raise ValueError("extra_params is required for ByzerStorageCache")
+
+ self.max_output_tokens = extra_params.hybrid_index_max_output_tokens
+
+ # Set the cache file path
+ self.cache_dir = os.path.join(self.path, ".cache")
+ self.cache_file = os.path.join(
+ self.cache_dir, "byzer_storage_speedup.jsonl")
+ self.cache: Dict[str, CacheItem] = {}
+
+ self.lock = threading.Lock()
+ self.stop_event = threading.Event()
+ self.thread = threading.Thread(target=self.process_queue)
+ self.thread.daemon = True
+ self.thread.start()
+
+ # Create the cache directory
+ if not os.path.exists(self.cache_dir):
+ os.makedirs(self.cache_dir)
+
+ # Load the cache
+ self.cache = self._load_cache()
+
+ def _chunk_text(self, text, max_length=1000):
+ """Split text into chunks"""
+ chunks = []
+ current_chunk = []
+ current_length = 0
+
+ for line in text.split("\n"):
+ if current_length + len(line) > max_length and current_chunk:
+ chunks.append("\n".join(current_chunk))
+ current_chunk = []
+ current_length = 0
+ current_chunk.append(line)
+ current_length += len(line)
+
+ if current_chunk:
+ chunks.append("\n".join(current_chunk))
+
+ return chunks
+
+ def _init_schema(self):
+ """Initialize the Byzer Storage schema"""
+ _ = (
+ self.storage.schema_builder()
+ .add_field("_id", DataType.STRING)
+ .add_field("file_path", DataType.STRING)
+ .add_field("content", DataType.STRING, [FieldOption.ANALYZE])
+ .add_field("raw_content", DataType.STRING, [FieldOption.NO_INDEX])
+ .add_array_field("vector", DataType.FLOAT)
+ .add_field("mtime", DataType.DOUBLE, [FieldOption.SORT])
+ .execute()
+ )
+
+ def _load_cache(self) -> Dict[str, CacheItem]:
+ """Load cache from file"""
+ if os.path.exists(self.cache_file):
+ try:
+ with open(self.cache_file, "r", encoding="utf-8") as f:
+ lines = f.readlines()
+ cache = {}
+ for line in lines:
+ try:
+ data = json.loads(line.strip())
+ if isinstance(data, dict) and "file_path" in data:
+ # Convert to a CacheItem object
+ cache_item = CacheItem.model_validate(data)
+ cache[data["file_path"]] = cache_item
+ except json.JSONDecodeError:
+ continue
+ return cache
+ except Exception as e:
+ logger.error(f"Error loading cache file: {str(e)}")
+ return {}
+ return {}
+
+ def write_cache(self):
+ cache_file = self.cache_file
+
+ if not fcntl:
+ try:
+ with open(cache_file, "w", encoding="utf-8") as f:
+ for cache_item in self.cache.values():
+ # Make sure the Pydantic model is serialized
+ json.dump(cache_item.model_dump(),
+ f, ensure_ascii=False)
+ f.write("\n")
+ except IOError as e:
+ logger.error(f"Error writing cache file: {str(e)}")
+ else:
+ lock_file = cache_file + ".lock"
+ with open(lock_file, "w", encoding="utf-8") as lockf:
+ try:
+ # Acquire the file lock
+ fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+ # Write the cache file
+ with open(cache_file, "w", encoding="utf-8") as f:
+ for cache_item in self.cache.values():
+ # Make sure the Pydantic model is serialized
+ json.dump(cache_item.model_dump(),
+ f, ensure_ascii=False)
+ f.write("\n")
+
+ finally:
+ # Release the file lock
+ fcntl.flock(lockf, fcntl.LOCK_UN)
+
+ def fileinfo_to_tuple(self, file_info: FileInfo) -> Tuple[str, str, float, str]:
+ return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
+
+ def build_cache(self):
+ """Build the cache by reading files and storing in Byzer Storage"""
+ logger.info(f"Building cache for path: {self.path}")
+
+ files_to_process = []
+ for file_info in self.get_all_files():
+ if (
+ file_info.file_path not in self.cache
+ or self.cache[file_info.file_path].md5 != file_info.file_md5
+ ):
+ files_to_process.append(file_info)
+
+ if not files_to_process:
+ return
+
+ from autocoder.rag.token_counter import initialize_tokenizer
+
+ with Pool(
+ processes=os.cpu_count(),
+ initializer=initialize_tokenizer,
+ initargs=(VariableHolder.TOKENIZER_PATH,),
+ ) as pool:
+ target_files_to_process = []
+ for file_info in files_to_process:
+ target_files_to_process.append(
+ self.fileinfo_to_tuple(file_info))
+ results = pool.map(process_file_in_multi_process,
+ target_files_to_process)
+
+ items = []
+ for file_info, result in zip(files_to_process, results):
+ content: List[SourceCode] = result
+ self.cache[file_info.file_path] = CacheItem(
+ file_path=file_info.file_path,
+ relative_path=file_info.relative_path,
+ content=[c.model_dump() for c in content],
+ modify_time=file_info.modify_time,
+ md5=file_info.file_md5,
+ )
+
+ for doc in content:
+ logger.info(f"Processing file: {doc.module_name}")
+ doc.module_name
+ chunks = self._chunk_text(doc.source_code, self.chunk_size)
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_item = {
+ "_id": f"{doc.module_name}_{chunk_idx}",
+ "file_path": file_info.file_path,
+ "content": chunk,
+ "raw_content": chunk,
+ "vector": chunk,
+ "mtime": file_info.modify_time,
+ }
+ items.append(chunk_item)
+
+ # Save to local cache
+ logger.info("Saving cache to local file")
+ self.write_cache()
+
+ if items:
+ logger.info("Clear cache from Byzer Storage")
+ self.storage.truncate_table()
+ logger.info("Save new cache to Byzer Storage")
+ max_workers = 5
+ chunk_size = max(1, len(items) // max_workers)
+ item_chunks = [items[i:i + chunk_size]
+ for i in range(0, len(items), chunk_size)]
+
+ total_chunks = len(item_chunks)
+ completed_chunks = 0
+
+ logger.info(f"Progress: {0}/{total_chunks} chunks completed")
+
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ futures = []
+ for chunk in item_chunks:
+ futures.append(
+ executor.submit(
+ lambda x: self.storage.write_builder().add_items(
+ x, vector_fields=["vector"], search_fields=["content"]
+ ).execute(),
+ chunk
+ )
+ )
+ # Wait for all futures to complete
+ for future in as_completed(futures):
+ try:
+ future.result()
+ completed_chunks += 1
+ logger.info(
+ f"Progress: {completed_chunks}/{total_chunks} chunks completed")
+ except Exception as e:
+ logger.error(f"Error in saving chunk: {str(e)}")
+
+ self.storage.commit()
+
+ def update_storage(self, file_info: FileInfo, is_delete: bool):
+ query = self.storage.query_builder()
+ query.and_filter().add_condition("file_path", file_info.file_path).build()
+ results = query.execute()
+ if results:
+ for result in results:
+ self.storage.delete_by_ids([result["_id"]])
+ items = []
+
+ if not is_delete:
+ content = [
+ SourceCode.model_validate(doc)
+ for doc in self.cache[file_info.file_path].content
+ ]
+ modify_time = self.cache[file_info.file_path].modify_time
+ for doc in content:
+ logger.info(f"Processing file: {doc.module_name}")
+ doc.module_name
+ chunks = self._chunk_text(doc.source_code, self.chunk_size)
+ for chunk_idx, chunk in enumerate(chunks):
+ chunk_item = {
+ "_id": f"{doc.module_name}_{chunk_idx}",
+ "file_path": file_info.file_path,
+ "content": chunk,
+ "raw_content": chunk,
+ "vector": chunk,
+ "mtime": modify_time,
+ }
+ items.append(chunk_item)
+ if items:
+ self.storage.write_builder().add_items(
+ items, vector_fields=["vector"], search_fields=["content"]
+ ).execute()
+ self.storage.commit()
+
+ def process_queue(self):
+ while self.queue:
+ file_list = self.queue.pop(0)
+ if isinstance(file_list, DeleteEvent):
+ for item in file_list.file_paths:
+ logger.info(f"{item} is detected to be removed")
+ del self.cache[item]
+ # Create a temporary FileInfo object
+ file_info = FileInfo(
+ file_path=item, relative_path="", modify_time=0, file_md5="")
+ self.update_storage(file_info, is_delete=True)
+
+ elif isinstance(file_list, AddOrUpdateEvent):
+ for file_info in file_list.file_infos:
+ logger.info(
+ f"{file_info.file_path} is detected to be updated")
+ # Process the file and create a CacheItem
+ content = process_file_local(
+ self.fileinfo_to_tuple(file_info))
+ self.cache[file_info.file_path] = CacheItem(
+ file_path=file_info.file_path,
+ relative_path=file_info.relative_path,
+ content=[c.model_dump() for c in content],
+ modify_time=file_info.modify_time,
+ md5=file_info.file_md5,
+ )
+ self.update_storage(file_info, is_delete=False)
+ self.write_cache()
+
+ def trigger_update(self):
+ logger.info("检查文件是否有更新.....")
+ files_to_process = []
+ current_files = set()
+ for file_info in self.get_all_files():
+ current_files.add(file_info.file_path)
+ if (
+ file_info.file_path not in self.cache
+ or self.cache[file_info.file_path].md5 != file_info.file_md5
+ ):
+ files_to_process.append(file_info)
+
+ deleted_files = set(self.cache.keys()) - current_files
+ logger.info(f"files_to_process: {files_to_process}")
+ logger.info(f"deleted_files: {deleted_files}")
+ if deleted_files:
+ with self.lock:
+ self.queue.append(DeleteEvent(file_paths=deleted_files))
+ if files_to_process:
+ with self.lock:
+ self.queue.append(AddOrUpdateEvent(
+ file_infos=files_to_process))
+
+ def get_cache(self, options: Dict[str, Any]) -> Dict[str, Dict]:
+ """Search cached documents using query"""
+
+ self.trigger_update()
+
+ if options is None or "query" not in options:
+ return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+
+ query = options.get("query", "")
+ total_tokens = 0
+
+ # Build query with both vector search and text search
+ query_builder = self.storage.query_builder()
+ query_builder.set_limit(100000)
+
+ # Add vector search if enabled
+ if options.get("enable_vector_search", True):
+ query_builder.set_vector_query(query, fields=["vector"])
+
+ # Add text search
+ if options.get("enable_text_search", True):
+ query_builder.set_search_query(query, fields=["content"])
+
+ results = query_builder.execute()
+
+ # Group results by file_path and reconstruct documents while preserving order
+ # Ranking could be further optimized here by weighing how often a document appears together with its position in the results
+ file_paths = []
+ seen = set()
+ for result in results:
+ file_path = result["file_path"]
+ if file_path not in seen:
+ seen.add(file_path)
+ file_paths.append(file_path)
+
+ # Fetch file contents from the cache
+ result = {}
+ for file_path in file_paths:
+ if file_path in self.cache:
+ cached_data = self.cache[file_path]
+ for doc in cached_data.content:
+ if total_tokens + doc["tokens"] > self.max_output_tokens:
+ return result
+ total_tokens += doc["tokens"]
+ result[file_path] = cached_data.model_dump()
+
+ return result
+
+ def get_all_files(self) -> List[FileInfo]:
+ all_files = []
+ for root, dirs, files in os.walk(self.path, followlinks=True):
+ dirs[:] = [d for d in dirs if not d.startswith(
+ ".") and d not in default_ignore_dirs]
+
+ if self.ignore_spec:
+ relative_root = os.path.relpath(root, self.path)
+ dirs[:] = [
+ d
+ for d in dirs
+ if not self.ignore_spec.match_file(os.path.join(relative_root, d))
+ ]
+ files = [
+ f
+ for f in files
+ if not self.ignore_spec.match_file(os.path.join(relative_root, f))
+ ]
+
+ for file in files:
+ if self.required_exts and not any(
+ file.endswith(ext) for ext in self.required_exts
+ ):
+ continue
+
+ file_path = os.path.join(root, file)
+ relative_path = os.path.relpath(file_path, self.path)
+ modify_time = os.path.getmtime(file_path)
+ file_md5 = generate_file_md5(file_path)
+ all_files.append(FileInfo(file_path=file_path,
+ relative_path=relative_path,
+ modify_time=modify_time,
+ file_md5=file_md5))
+
+ return all_files
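
The LocalByzerStorageCache added above backs the new no-Ray hybrid-index path. A rough instantiation sketch, inferred from the constructor and get_cache signature shown above (the directory, extension list, and AutoCoderArgs values are assumptions, and a local Byzer Storage instance is expected on 127.0.0.1:33333):

    import byzerllm
    from autocoder.common import AutoCoderArgs
    from autocoder.rag.cache.local_byzer_storage_cache import LocalByzerStorageCache

    # Hypothetical embedding client; in the package it arrives as the "emb_model" sub-client.
    emb_llm = byzerllm.SimpleByzerLLM(default_model_name="emb_demo")

    args = AutoCoderArgs(
        rag_build_name="docs_demo",            # used as the storage table name
        hybrid_index_max_output_tokens=10000,  # cap on tokens returned by get_cache()
    )
    cacher = LocalByzerStorageCache(
        path="/data/docs",          # directory to index (assumed)
        ignore_spec=None,
        required_exts=[".md"],
        extra_params=args,
        emb_llm=emb_llm,
    )
    cacher.build_cache()                                         # chunk files and write them to storage
    hits = cacher.get_cache({"query": "server configuration"})   # hybrid vector + text lookup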
autocoder/rag/document_retriever.py CHANGED
@@ -1,52 +1,20 @@
- import json
- import os
- import platform
- import time
- import traceback
-
  import threading
- from multiprocessing import Pool
- from typing import Dict, Generator, List, Tuple, Any, Optional
+ from typing import Dict, Generator, List, Tuple, Any, Optional,Union

- import ray
- from loguru import logger
- from pydantic import BaseModel
+ from byzerllm import ByzerLLM, SimpleByzerLLM

+ from loguru import logger
  from autocoder.common import SourceCode
  from uuid import uuid4
- from autocoder.rag.variable_holder import VariableHolder
  from abc import ABC, abstractmethod
- from autocoder.rag.cache.base_cache import BaseCacheManager
  from autocoder.rag.cache.simple_cache import AutoCoderRAGAsyncUpdateQueue
  from autocoder.rag.cache.file_monitor_cache import AutoCoderRAGDocListener
  from autocoder.rag.cache.byzer_storage_cache import ByzerStorageCache
- from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+ from autocoder.rag.cache.local_byzer_storage_cache import LocalByzerStorageCache
  from autocoder.common import AutoCoderArgs

  cache_lock = threading.Lock()

-
- def get_or_create_actor(path: str, ignore_spec, required_exts: list, cacher={}):
- with cache_lock:
- # Normalize the path name
- actor_name = "AutoCoderRAGAsyncUpdateQueue_" + path.replace(
- os.sep, "_"
- ).replace(" ", "")
- try:
- actor = ray.get_actor(actor_name)
- except ValueError:
- actor = None
- if actor is None:
- actor = (
- ray.remote(AutoCoderRAGAsyncUpdateQueue)
- .options(name=actor_name, num_cpus=0)
- .remote(path, ignore_spec, required_exts)
- )
- ray.get(actor.load_first.remote())
- cacher[actor_name] = actor
- return actor
-
-
  class BaseDocumentRetriever(ABC):
  """Abstract base class for document retrieval."""

@@ -77,6 +45,7 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
  disable_auto_window: bool = False,
  enable_hybrid_index: bool = False,
  extra_params: Optional[AutoCoderArgs] = None,
+ emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
  ) -> None:
  self.path = path
  self.ignore_spec = ignore_spec
@@ -91,23 +60,26 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
  # Maximum file size after merging
  self.small_file_merge_limit = self.single_file_token_limit / 2

- self.on_ray = on_ray
- if self.on_ray:
- self.cacher = get_or_create_actor(path, ignore_spec, required_exts)
- else:
- if self.enable_hybrid_index:
+ self.on_ray = on_ray
+ if self.enable_hybrid_index:
+ if self.on_ray:
  self.cacher = ByzerStorageCache(
  path, ignore_spec, required_exts, extra_params
  )
+ else:
+ self.cacher = LocalByzerStorageCache(
+ path, ignore_spec, required_exts, extra_params,
+ emb_llm = emb_llm
+ )
+ else:
+ if self.monitor_mode:
+ self.cacher = AutoCoderRAGDocListener(
+ path, ignore_spec, required_exts
+ )
  else:
- if self.monitor_mode:
- self.cacher = AutoCoderRAGDocListener(
- path, ignore_spec, required_exts
- )
- else:
- self.cacher = AutoCoderRAGAsyncUpdateQueue(
- path, ignore_spec, required_exts
- )
+ self.cacher = AutoCoderRAGAsyncUpdateQueue(
+ path, ignore_spec, required_exts
+ )

  logger.info(f"DocumentRetriever initialized with:")
  logger.info(f" Path: {self.path}")
@@ -123,10 +95,7 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
  )

  def get_cache(self, options: Optional[Dict[str, Any]] = None):
- if self.on_ray:
- return ray.get(self.cacher.get_cache.remote())
- else:
- return self.cacher.get_cache(options=options)
+ return self.cacher.get_cache(options=options)

  def retrieve_documents(
  self, options: Optional[Dict[str, Any]] = None
autocoder/rag/long_context_rag.py CHANGED
@@ -86,6 +86,7 @@ class LongContextRAG:
  self.recall_llm = self.llm
  self.chunk_llm = self.llm
  self.qa_llm = self.llm
+ self.emb_llm = None

  if self.llm.get_sub_client("qa_model"):
  self.qa_llm = self.llm.get_sub_client("qa_model")
@@ -96,6 +97,9 @@ class LongContextRAG:
  if self.llm.get_sub_client("chunk_model"):
  self.chunk_llm = self.llm.get_sub_client("chunk_model")

+ if self.llm.get_sub_client("emb_model"):
+ self.emb_llm = self.llm.get_sub_client("emb_model")
+
  self.args = args

  self.path = path
@@ -169,6 +173,11 @@ class LongContextRAG:

  self.token_limit = self.args.rag_context_window_limit or 120000
  retriever_class = self._get_document_retriever_class()
+
+ if self.args.enable_hybrid_index and not self.on_ray:
+ if self.emb_llm is None:
+ raise ValueError("emb_llm is required for local byzer storage cache")
+
  self.document_retriever = retriever_class(
  self.path,
  self.ignore_spec,
@@ -179,7 +188,8 @@ class LongContextRAG:
  single_file_token_limit=self.full_text_limit - 100,
  disable_auto_window=self.args.disable_auto_window,
  enable_hybrid_index=self.args.enable_hybrid_index,
- extra_params=self.args
+ extra_params=self.args,
+ emb_llm=self.emb_llm
  )

  self.doc_filter = DocFilter(
@@ -616,7 +626,11 @@ class LongContextRAG:
  return [json.dumps(final_docs, ensure_ascii=False)], []

  if not relevant_docs:
- return ["没有找到相关的文档来回答这个问题。"], []
+ yield ("没有找到可以回答你问题的相关文档", SingleOutputMeta(input_tokens_count=rag_stat.recall_stat.total_input_tokens + rag_stat.chunk_stat.total_input_tokens,
+ generated_tokens_count=rag_stat.recall_stat.total_generated_tokens +
+ rag_stat.chunk_stat.total_generated_tokens,
+ ))
+ return

  context = [doc.source_code.module_name for doc in relevant_docs]

@@ -778,6 +792,12 @@ class LongContextRAG:
  tokens=request_tokens
  )
  ))
+
+ yield ("", SingleOutputMeta(input_tokens_count=rag_stat.recall_stat.total_input_tokens + rag_stat.chunk_stat.total_input_tokens,
+ generated_tokens_count=rag_stat.recall_stat.total_generated_tokens +
+ rag_stat.chunk_stat.total_generated_tokens,
+ reasoning_content="qa_model_thinking"
+ ))

  if LLMComputeEngine is not None and not self.args.disable_inference_enhance:
  llm_compute_engine = LLMComputeEngine(
autocoder/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.281"
+ __version__ = "0.1.283"