auto-coder 0.1.281-py3-none-any.whl → 0.1.282-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic; see the registry's advisory page for more details.
- {auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/METADATA +2 -2
- {auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/RECORD +11 -10
- autocoder/auto_coder_rag.py +93 -29
- autocoder/rag/cache/local_byzer_storage_cache.py +457 -0
- autocoder/rag/document_retriever.py +22 -53
- autocoder/rag/long_context_rag.py +17 -1
- autocoder/version.py +1 -1
- {auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/top_level.txt +0 -0

{auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: auto-coder
-Version: 0.1.281
+Version: 0.1.282
 Summary: AutoCoder: AutoCoder
 Author: allwefantasy
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -26,7 +26,7 @@ Requires-Dist: tabulate
 Requires-Dist: jupyter-client
 Requires-Dist: prompt-toolkit
 Requires-Dist: tokenizers
-Requires-Dist: byzerllm[saas] >=0.1.
+Requires-Dist: byzerllm[saas] >=0.1.171
 Requires-Dist: patch
 Requires-Dist: diff-match-patch
 Requires-Dist: GitPython
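The metadata changes amount to the version bump and a higher floor on the byzerllm[saas] dependency. A minimal sketch for confirming an environment picked both up; it assumes the third-party packaging library is available, and the distribution names and minimum versions are taken from the METADATA above:

from importlib.metadata import version
from packaging.version import Version  # assumption: 'packaging' is installed

# Distribution names and floors come from the METADATA diff above.
assert Version(version("auto-coder")) >= Version("0.1.282")
assert Version(version("byzerllm")) >= Version("0.1.171")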

{auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/RECORD
CHANGED

@@ -1,7 +1,7 @@
 autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/auto_coder.py,sha256=cxH77xjLPwCmE6R-o1KpONOZlVNOvbTYV4ilAjL8w8A,65592
 autocoder/auto_coder_lang.py,sha256=Rtupq6N3_HT7JRhDKdgCBcwRaiAnyCOR_Gsp4jUomrI,3229
-autocoder/auto_coder_rag.py,sha256=
+autocoder/auto_coder_rag.py,sha256=cwsCpudP6d2CHOUTD-RXw7CCgx-c7sG7_cKqnjzphlk,32973
 autocoder/auto_coder_rag_client_mcp.py,sha256=QRxUbjc6A8UmDMQ8lXgZkjgqtq3lgKYeatJbDY6rSo0,6270
 autocoder/auto_coder_rag_mcp.py,sha256=-RrjNwFaS2e5v8XDIrKR-zlUNUE8UBaeOtojffBrvJo,8521
 autocoder/auto_coder_runner.py,sha256=w-4MCKhOFaoABcDfVoZoonF59UyRso3kghimQYLz3NA,100851
@@ -12,7 +12,7 @@ autocoder/chat_auto_coder_lang.py,sha256=ShOQVOnMA-WlT-fB9OrOer-xQkbcWxJGl-WMPuZ
 autocoder/command_args.py,sha256=9aYJ-AmPxP1sQh6ciw04FWHjSn31f2W9afXFwo8wgx4,30441
 autocoder/lang.py,sha256=U6AjVV8Rs1uLyjFCZ8sT6WWuNUxMBqkXXIOs4S120uk,14511
 autocoder/models.py,sha256=PlG1tKHSHwB57cKLOl5gTl5yTzFUDzCgeHPJU3N9F6Q,9106
-autocoder/version.py,sha256=
+autocoder/version.py,sha256=MuZdShxBktD9b-QDa1Sdv3QS_FjwPsAMhpVaUVk1n9A,23
 autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/agent/auto_demand_organizer.py,sha256=NWSAEsEk94vT3lGjfo25kKLMwYdPcpy9e-i21txPasQ,6942
 autocoder/agent/auto_filegroup.py,sha256=CW7bqp0FW1GIEMnl-blyAc2UGT7O9Mom0q66ITz1ckM,6635
@@ -109,10 +109,10 @@ autocoder/pyproject/__init__.py,sha256=ms-A_pocgGv0oZPEW8JAdXi7G-VSVhkQ6CnWFe535
 autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/rag/api_server.py,sha256=xiypCkdbclY0Z3Cmq5FTvtKrfQUV7yKcDaFFUttA2n0,7242
 autocoder/rag/doc_filter.py,sha256=UduVO2mlrngwJICrefjDJTYfdmQ4GcRXrfWDQ7xXksk,14206
-autocoder/rag/document_retriever.py,sha256=
+autocoder/rag/document_retriever.py,sha256=MGn6oIPo49BbRC99xmLMFkZrpHfcDfKoGYqWxXF554U,8051
 autocoder/rag/lang.py,sha256=TVNx5m7OtBcdfahzI29tMj9m1yrEm32G1c1zc4ZNIPs,3130
 autocoder/rag/llm_wrapper.py,sha256=Ht5GF5yJtrztoliujsZzx_ooWZmHkd5xLZKcGEiicZw,4303
-autocoder/rag/long_context_rag.py,sha256=
+autocoder/rag/long_context_rag.py,sha256=ZvTT3yO5FmJwWXHqqFrQgkb9YrVajrXrtB_tcFdDwAs,40172
 autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
 autocoder/rag/rag_entry.py,sha256=6TKtErZ0Us9XSV6HgRKXA6yR3SiZGPHpynOKSaR1wgE,2463
 autocoder/rag/raw_rag.py,sha256=BOr0YGf3umjqXOIDVO1LXQ0bIHx8hzBdiubND2ezyxc,2946
@@ -130,6 +130,7 @@ autocoder/rag/cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
 autocoder/rag/cache/base_cache.py,sha256=EaYYYbclMBvnlOUoM7qonnluwZX5oSvUjdvGvFun8_8,742
 autocoder/rag/cache/byzer_storage_cache.py,sha256=gK90pf741CgccCzQ73urBorCqVyAfwU1FAqMtSorWVk,17232
 autocoder/rag/cache/file_monitor_cache.py,sha256=2TnOW8Y81Zc0WA1upRrkmQH18IMdv40CeNccmnTvd3c,4981
+autocoder/rag/cache/local_byzer_storage_cache.py,sha256=Uhmu5JK0tfZ8NvlcjJzcwtQRhZDpbGp_U6qLXZxVwss,17495
 autocoder/rag/cache/simple_cache.py,sha256=8FMmBAfhAPcdSNUWC6Ga43LBFGXD-klwabVbzm_bciI,9347
 autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
 autocoder/rag/loaders/docx_loader.py,sha256=ZswPqiiLngUEpzLhNNm1nmwEYV7ZHFEfIoXoG7c5GDU,614
@@ -167,9 +168,9 @@ autocoder/utils/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/utils/auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/utils/auto_coder_utils/chat_stream_out.py,sha256=lkJ_A-sYU36JMzjFWkk3pR6uos8oZHYt9GPsPe_CPAo,11766
 autocoder/utils/chat_auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
+auto_coder-0.1.282.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+auto_coder-0.1.282.dist-info/METADATA,sha256=L2wd7XGt-KURDWFp-mn4HAo7K87iqwuPXKfepSBC3JA,2643
+auto_coder-0.1.282.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+auto_coder-0.1.282.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
+auto_coder-0.1.282.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
+auto_coder-0.1.282.dist-info/RECORD,,
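Each RECORD entry has the form "path,sha256=<digest>,<size>", where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing "=" padding stripped (PEP 376/427). The sketch below shows how an installed file can be re-hashed into that format to verify one of the new entries above; the path argument is a placeholder:

import base64
import hashlib
import os

def record_line(path: str) -> str:
    # Hash the file the way wheel RECORD files do: sha256, urlsafe base64, no '=' padding.
    digest = hashlib.sha256(open(path, "rb").read()).digest()
    encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={encoded},{os.path.getsize(path)}"

print(record_line("autocoder/version.py"))  # placeholder path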
autocoder/auto_coder_rag.py
CHANGED
@@ -22,7 +22,7 @@ from loguru import logger
 import asyncio
 from datetime import datetime
 
-from autocoder.rag.
+from autocoder.rag.utils import process_file_local
 import pkg_resources
 from autocoder.rag.token_counter import TokenCounter
 from autocoder.rag.types import RAGServiceInfo
@@ -186,6 +186,11 @@ def main(input_args: Optional[List[str]] = None):
     build_index_parser.add_argument(
         "--model", default="v3_chat", help=desc["model"]
     )
+
+    build_index_parser.add_argument(
+        "--on_ray", action="store_true", help="Run on Ray"
+    )
+
     build_index_parser.add_argument(
         "--index_model", default="", help=desc["index_model"]
     )
@@ -216,8 +221,7 @@ def main(input_args: Optional[List[str]] = None):
     )
     serve_parser.add_argument("--file", default="", help=desc["file"])
     serve_parser.add_argument("--model", default="v3_chat", help=desc["model"])
-    serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
-    serve_parser.add_argument("--emb_model", default="", help=desc["emb_model"])
+    serve_parser.add_argument("--index_model", default="", help=desc["index_model"])
     serve_parser.add_argument("--ray_address", default="auto", help=desc["ray_address"])
     serve_parser.add_argument(
         "--index_filter_workers",
@@ -377,6 +381,12 @@ def main(input_args: Optional[List[str]] = None):
         help="The model used for question answering",
     )
 
+    serve_parser.add_argument(
+        "--emb_model",
+        default="",
+        help="The model used for embedding documents",
+    )
+
     # Benchmark command
     benchmark_parser = subparsers.add_parser(
         "benchmark", help="Benchmark LLM client performance"
@@ -510,10 +520,7 @@ def main(input_args: Optional[List[str]] = None):
     # Generate unique name for RAG build if doc_dir exists
     if server_args.doc_dir:
         auto_coder_args.rag_build_name = generate_unique_name_from_path(server_args.doc_dir)
-        logger.info(f"Generated RAG build name: {auto_coder_args.rag_build_name}")
-
-    if auto_coder_args.enable_hybrid_index and args.product_mode == "lite":
-        raise Exception("Hybrid index is not supported in lite mode")
+        logger.info(f"Generated RAG build name: {auto_coder_args.rag_build_name}")
 
     if auto_coder_args.enable_hybrid_index and args.product_mode == "pro":
         # Try to connect to the storage
@@ -555,16 +562,22 @@ def main(input_args: Optional[List[str]] = None):
         qa_model.skip_nontext_check = True
         llm.setup_sub_client("qa_model", qa_model)
 
+        if args.emb_model:
+            emb_model = byzerllm.ByzerLLM()
+            emb_model.setup_default_model_name(args.emb_model)
+            emb_model.skip_nontext_check = True
+            llm.setup_sub_client("emb_model", emb_model)
+
         # When hybrid_index is enabled, check the required components
         if auto_coder_args.enable_hybrid_index:
-            if not llm.is_model_exist("emb"):
+            if not args.emb_model and not llm.is_model_exist("emb"):
                 logger.error(
                     "When enable_hybrid_index is true, an 'emb' model must be deployed"
                 )
                 return
-            llm.setup_default_emb_model_name("emb")
+            llm.setup_default_emb_model_name(args.emb_model or "emb")
 
-
+    if args.product_mode == "lite":
         from autocoder import models as models_module
         model_info = models_module.get_model_by_name(args.model)
         llm = byzerllm.SimpleByzerLLM(default_model_name=args.model)
@@ -629,6 +642,26 @@ def main(input_args: Optional[List[str]] = None):
         )
         llm.setup_sub_client("qa_model", qa_model)
 
+        if args.emb_model:
+            model_info = models_module.get_model_by_name(args.emb_model)
+            emb_model = byzerllm.SimpleByzerLLM(default_model_name=args.emb_model)
+            emb_model.deploy(
+                model_path="",
+                pretrained_model_type=model_info["model_type"],
+                udf_name=args.emb_model,
+                infer_params={
+                    "saas.base_url": model_info["base_url"],
+                    "saas.api_key": model_info["api_key"],
+                    "saas.model": model_info["model_name"],
+                    "saas.is_reasoning": False
+                }
+            )
+            llm.setup_sub_client("emb_model", emb_model)
+
+        if args.enable_hybrid_index:
+            if not args.emb_model:
+                raise Exception("When enable_hybrid_index is true, an 'emb' model must be specified")
+
     if server_args.doc_dir:
         auto_coder_args.rag_type = "simple"
         auto_coder_args.rag_build_name = generate_unique_name_from_path(server_args.doc_dir)
@@ -675,31 +708,62 @@ def main(input_args: Optional[List[str]] = None):
             auto_coder_args.enable_hybrid_index = True
             auto_coder_args.rag_type = "simple"
 
-
-            from byzerllm.apps.byzer_storage.simple_api import ByzerStorage
-
-            storage = ByzerStorage("byzerai_store", "rag", "files")
-            storage.retrieval.cluster_info("byzerai_store")
-        except Exception as e:
-            logger.error(
-                "When enable_hybrid_index is true, ByzerStorage must be started"
-            )
-            logger.error("Please run 'byzerllm storage start' first")
-            return
+            if args.on_ray:
 
-
-
+                try:
+                    from byzerllm.apps.byzer_storage.simple_api import ByzerStorage
 
-
-
-
+                    storage = ByzerStorage("byzerai_store", "rag", "files")
+                    storage.retrieval.cluster_info("byzerai_store")
+                except Exception as e:
                     logger.error(
-                        "When enable_hybrid_index is true,
+                        "When enable_hybrid_index is true, ByzerStorage must be started"
                     )
+                    logger.error("Please run 'byzerllm storage start' first")
                     return
-            llm.setup_default_emb_model_name("emb")
 
-
+                llm = byzerllm.ByzerLLM()
+                llm.setup_default_model_name(args.model)
+
+                # When hybrid_index is enabled, check the required components
+                if auto_coder_args.enable_hybrid_index:
+                    if not llm.is_model_exist("emb"):
+                        logger.error(
+                            "When enable_hybrid_index is true, an 'emb' model must be deployed"
+                        )
+                        return
+                    llm.setup_default_emb_model_name("emb")
+            else:
+                from autocoder import models as models_module
+                model_info = models_module.get_model_by_name(args.model)
+                llm = byzerllm.SimpleByzerLLM(default_model_name=args.model)
+                llm.deploy(
+                    model_path="",
+                    pretrained_model_type=model_info["model_type"],
+                    udf_name=args.model,
+                    infer_params={
+                        "saas.base_url": model_info["base_url"],
+                        "saas.api_key": model_info["api_key"],
+                        "saas.model": model_info["model_name"],
+                        "saas.is_reasoning": model_info["is_reasoning"]
+                    }
+                )
+
+                model_info = models_module.get_model_by_name(args.emb_model)
+                emb_model = byzerllm.SimpleByzerLLM(default_model_name=args.emb_model)
+                emb_model.deploy(
+                    model_path="",
+                    pretrained_model_type=model_info["model_type"],
+                    udf_name=args.emb_model,
+                    infer_params={
+                        "saas.base_url": model_info["base_url"],
+                        "saas.api_key": model_info["api_key"],
+                        "saas.model": model_info["model_name"],
+                        "saas.is_reasoning": False
+                    }
+                )
+                llm.setup_sub_client("emb_model", emb_model)
 
     rag = RAGFactory.get_rag(
         llm=llm,
         args=auto_coder_args,
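The net effect of these changes is a new --on_ray switch for the build_index command and an --emb_model option for serve, which is registered as an "emb_model" sub-client in both pro and lite modes. A minimal sketch of driving the new serve options programmatically through main(input_args=...), which is shown in the diff; the "serve" subcommand name and the --doc_dir and --enable_hybrid_index spellings are inferred from parser variable and attribute names visible above, and the model names and path are placeholders:

from autocoder.auto_coder_rag import main

# Placeholder values; --emb_model is the option added in this release.
main(input_args=[
    "serve",
    "--model", "v3_chat",
    "--emb_model", "emb",
    "--enable_hybrid_index",
    "--doc_dir", "/path/to/docs",
])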

autocoder/rag/cache/local_byzer_storage_cache.py
ADDED

@@ -0,0 +1,457 @@
+from autocoder.rag.cache.base_cache import (
+    BaseCacheManager,
+    DeleteEvent,
+    AddOrUpdateEvent,
+    FileInfo,
+    CacheItem
+)
+from typing import Generator, List, Dict, Any, Optional, Tuple
+from autocoder.common import SourceCode
+from loguru import logger
+import pathspec
+import os
+import uuid
+import json
+from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+from byzerllm.apps.byzer_storage.local_simple_api import (
+    LocalByzerStorage,
+    DataType,
+    FieldOption,
+    SortOption,
+)
+from autocoder.common import AutoCoderArgs
+import threading
+from multiprocessing import Pool
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from autocoder.rag.variable_holder import VariableHolder
+import platform
+import hashlib
+from typing import Union
+from byzerllm import SimpleByzerLLM, ByzerLLM
+
+if platform.system() != "Windows":
+    import fcntl
+else:
+    fcntl = None
+
+
+def generate_file_md5(file_path: str) -> str:
+    md5_hash = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5_hash.update(chunk)
+    return md5_hash.hexdigest()
+
+
+def generate_content_md5(content: Union[str, bytes]) -> str:
+    if isinstance(content, str):
+        content = content.encode("utf-8")
+    md5_hash = hashlib.md5()
+    md5_hash.update(content)
+    return md5_hash.hexdigest()
+
+
+default_ignore_dirs = [
+    "__pycache__",
+    "node_modules",
+    "_images"
+]
+
+
+class LocalByzerStorageCache(BaseCacheManager):
+    def __init__(
+        self,
+        path,
+        ignore_spec,
+        required_exts,
+        extra_params: Optional[AutoCoderArgs] = None,
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
+    ):
+        self.path = path
+        self.ignore_spec = ignore_spec
+        self.required_exts = required_exts
+        self.rag_build_name = extra_params.rag_build_name
+        self.storage = LocalByzerStorage("byzerai_store",
+                                         "rag_test", self.rag_build_name, host="127.0.0.1", port=33333, emb_llm=emb_llm)
+        self.queue = []
+        self.chunk_size = 1000
+        self._init_schema()
+
+        if not extra_params:
+            raise ValueError("extra_params is required for ByzerStorageCache")
+
+        self.max_output_tokens = extra_params.hybrid_index_max_output_tokens
+
+        # Set the cache file path
+        self.cache_dir = os.path.join(self.path, ".cache")
+        self.cache_file = os.path.join(
+            self.cache_dir, "byzer_storage_speedup.jsonl")
+        self.cache: Dict[str, CacheItem] = {}
+
+        self.lock = threading.Lock()
+        self.stop_event = threading.Event()
+        self.thread = threading.Thread(target=self.process_queue)
+        self.thread.daemon = True
+        self.thread.start()
+
+        # Create the cache directory
+        if not os.path.exists(self.cache_dir):
+            os.makedirs(self.cache_dir)
+
+        # Load the cache
+        self.cache = self._load_cache()
+
+    def _chunk_text(self, text, max_length=1000):
+        """Split text into chunks"""
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        for line in text.split("\n"):
+            if current_length + len(line) > max_length and current_chunk:
+                chunks.append("\n".join(current_chunk))
+                current_chunk = []
+                current_length = 0
+            current_chunk.append(line)
+            current_length += len(line)
+
+        if current_chunk:
+            chunks.append("\n".join(current_chunk))
+
+        return chunks
+
+    def _init_schema(self):
+        """Initialize the Byzer Storage schema"""
+        _ = (
+            self.storage.schema_builder()
+            .add_field("_id", DataType.STRING)
+            .add_field("file_path", DataType.STRING)
+            .add_field("content", DataType.STRING, [FieldOption.ANALYZE])
+            .add_field("raw_content", DataType.STRING, [FieldOption.NO_INDEX])
+            .add_array_field("vector", DataType.FLOAT)
+            .add_field("mtime", DataType.DOUBLE, [FieldOption.SORT])
+            .execute()
+        )
+
+    def _load_cache(self) -> Dict[str, CacheItem]:
+        """Load cache from file"""
+        if os.path.exists(self.cache_file):
+            try:
+                with open(self.cache_file, "r", encoding="utf-8") as f:
+                    lines = f.readlines()
+                    cache = {}
+                    for line in lines:
+                        try:
+                            data = json.loads(line.strip())
+                            if isinstance(data, dict) and "file_path" in data:
+                                # Convert to a CacheItem object
+                                cache_item = CacheItem.model_validate(data)
+                                cache[data["file_path"]] = cache_item
+                        except json.JSONDecodeError:
+                            continue
+                    return cache
+            except Exception as e:
+                logger.error(f"Error loading cache file: {str(e)}")
+                return {}
+        return {}
+
+    def write_cache(self):
+        cache_file = self.cache_file
+
+        if not fcntl:
+            try:
+                with open(cache_file, "w", encoding="utf-8") as f:
+                    for cache_item in self.cache.values():
+                        # Make sure the Pydantic model is serialized
+                        json.dump(cache_item.model_dump(),
+                                  f, ensure_ascii=False)
+                        f.write("\n")
+            except IOError as e:
+                logger.error(f"Error writing cache file: {str(e)}")
+        else:
+            lock_file = cache_file + ".lock"
+            with open(lock_file, "w", encoding="utf-8") as lockf:
+                try:
+                    # Acquire the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    # Write the cache file
+                    with open(cache_file, "w", encoding="utf-8") as f:
+                        for cache_item in self.cache.values():
+                            # Make sure the Pydantic model is serialized
+                            json.dump(cache_item.model_dump(),
+                                      f, ensure_ascii=False)
+                            f.write("\n")
+
+                finally:
+                    # Release the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_UN)
+
+    def fileinfo_to_tuple(self, file_info: FileInfo) -> Tuple[str, str, float, str]:
+        return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
+
+    def build_cache(self):
+        """Build the cache by reading files and storing in Byzer Storage"""
+        logger.info(f"Building cache for path: {self.path}")
+
+        files_to_process = []
+        for file_info in self.get_all_files():
+            if (
+                file_info.file_path not in self.cache
+                or self.cache[file_info.file_path].md5 != file_info.file_md5
+            ):
+                files_to_process.append(file_info)
+
+        if not files_to_process:
+            return
+
+        from autocoder.rag.token_counter import initialize_tokenizer
+
+        with Pool(
+            processes=os.cpu_count(),
+            initializer=initialize_tokenizer,
+            initargs=(VariableHolder.TOKENIZER_PATH,),
+        ) as pool:
+            target_files_to_process = []
+            for file_info in files_to_process:
+                target_files_to_process.append(
+                    self.fileinfo_to_tuple(file_info))
+            results = pool.map(process_file_in_multi_process,
+                               target_files_to_process)
+
+        items = []
+        for file_info, result in zip(files_to_process, results):
+            content: List[SourceCode] = result
+            self.cache[file_info.file_path] = CacheItem(
+                file_path=file_info.file_path,
+                relative_path=file_info.relative_path,
+                content=[c.model_dump() for c in content],
+                modify_time=file_info.modify_time,
+                md5=file_info.file_md5,
+            )
+
+            for doc in content:
+                logger.info(f"Processing file: {doc.module_name}")
+                doc.module_name
+                chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                for chunk_idx, chunk in enumerate(chunks):
+                    chunk_item = {
+                        "_id": f"{doc.module_name}_{chunk_idx}",
+                        "file_path": file_info.file_path,
+                        "content": chunk,
+                        "raw_content": chunk,
+                        "vector": chunk,
+                        "mtime": file_info.modify_time,
+                    }
+                    items.append(chunk_item)
+
+        # Save to local cache
+        logger.info("Saving cache to local file")
+        self.write_cache()
+
+        if items:
+            logger.info("Clear cache from Byzer Storage")
+            self.storage.truncate_table()
+            logger.info("Save new cache to Byzer Storage")
+            max_workers = 5
+            chunk_size = max(1, len(items) // max_workers)
+            item_chunks = [items[i:i + chunk_size]
+                           for i in range(0, len(items), chunk_size)]
+
+            total_chunks = len(item_chunks)
+            completed_chunks = 0
+
+            logger.info(f"Progress: {0}/{total_chunks} chunks completed")
+
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                futures = []
+                for chunk in item_chunks:
+                    futures.append(
+                        executor.submit(
+                            lambda x: self.storage.write_builder().add_items(
+                                x, vector_fields=["vector"], search_fields=["content"]
+                            ).execute(),
+                            chunk
+                        )
+                    )
+                # Wait for all futures to complete
+                for future in as_completed(futures):
+                    try:
+                        future.result()
+                        completed_chunks += 1
+                        logger.info(
+                            f"Progress: {completed_chunks}/{total_chunks} chunks completed")
+                    except Exception as e:
+                        logger.error(f"Error in saving chunk: {str(e)}")
+
+            self.storage.commit()
+
+    def update_storage(self, file_info: FileInfo, is_delete: bool):
+        query = self.storage.query_builder()
+        query.and_filter().add_condition("file_path", file_info.file_path).build()
+        results = query.execute()
+        if results:
+            for result in results:
+                self.storage.delete_by_ids([result["_id"]])
+        items = []
+
+        if not is_delete:
+            content = [
+                SourceCode.model_validate(doc)
+                for doc in self.cache[file_info.file_path].content
+            ]
+            modify_time = self.cache[file_info.file_path].modify_time
+            for doc in content:
+                logger.info(f"Processing file: {doc.module_name}")
+                doc.module_name
+                chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                for chunk_idx, chunk in enumerate(chunks):
+                    chunk_item = {
+                        "_id": f"{doc.module_name}_{chunk_idx}",
+                        "file_path": file_info.file_path,
+                        "content": chunk,
+                        "raw_content": chunk,
+                        "vector": chunk,
+                        "mtime": modify_time,
+                    }
+                    items.append(chunk_item)
+        if items:
+            self.storage.write_builder().add_items(
+                items, vector_fields=["vector"], search_fields=["content"]
+            ).execute()
+            self.storage.commit()
+
+    def process_queue(self):
+        while self.queue:
+            file_list = self.queue.pop(0)
+            if isinstance(file_list, DeleteEvent):
+                for item in file_list.file_paths:
+                    logger.info(f"{item} is detected to be removed")
+                    del self.cache[item]
+                    # Create a temporary FileInfo object
+                    file_info = FileInfo(
+                        file_path=item, relative_path="", modify_time=0, file_md5="")
+                    self.update_storage(file_info, is_delete=True)
+
+            elif isinstance(file_list, AddOrUpdateEvent):
+                for file_info in file_list.file_infos:
+                    logger.info(
+                        f"{file_info.file_path} is detected to be updated")
+                    # Process the file and create a CacheItem
+                    content = process_file_local(
+                        self.fileinfo_to_tuple(file_info))
+                    self.cache[file_info.file_path] = CacheItem(
+                        file_path=file_info.file_path,
+                        relative_path=file_info.relative_path,
+                        content=[c.model_dump() for c in content],
+                        modify_time=file_info.modify_time,
+                        md5=file_info.file_md5,
+                    )
+                    self.update_storage(file_info, is_delete=False)
+            self.write_cache()
+
+    def trigger_update(self):
+        logger.info("Checking whether any files have been updated.....")
+        files_to_process = []
+        current_files = set()
+        for file_info in self.get_all_files():
+            current_files.add(file_info.file_path)
+            if (
+                file_info.file_path not in self.cache
+                or self.cache[file_info.file_path].md5 != file_info.file_md5
+            ):
+                files_to_process.append(file_info)
+
+        deleted_files = set(self.cache.keys()) - current_files
+        logger.info(f"files_to_process: {files_to_process}")
+        logger.info(f"deleted_files: {deleted_files}")
+        if deleted_files:
+            with self.lock:
+                self.queue.append(DeleteEvent(file_paths=deleted_files))
+        if files_to_process:
+            with self.lock:
+                self.queue.append(AddOrUpdateEvent(
+                    file_infos=files_to_process))
+
+    def get_cache(self, options: Dict[str, Any]) -> Dict[str, Dict]:
+        """Search cached documents using query"""
+
+        self.trigger_update()
+
+        if options is None or "query" not in options:
+            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+
+        query = options.get("query", "")
+        total_tokens = 0
+
+        # Build query with both vector search and text search
+        query_builder = self.storage.query_builder()
+        query_builder.set_limit(100000)
+
+        # Add vector search if enabled
+        if options.get("enable_vector_search", True):
+            query_builder.set_vector_query(query, fields=["vector"])
+
+        # Add text search
+        if options.get("enable_text_search", True):
+            query_builder.set_search_query(query, fields=["content"])
+
+        results = query_builder.execute()
+
+        # Group results by file_path and reconstruct documents while preserving order
+        # Ranking could be refined further here, e.g. by combining how often a file appears with its rank position
+        file_paths = []
+        seen = set()
+        for result in results:
+            file_path = result["file_path"]
+            if file_path not in seen:
+                seen.add(file_path)
+                file_paths.append(file_path)
+
+        # Fetch the file contents from the cache
+        result = {}
+        for file_path in file_paths:
+            if file_path in self.cache:
+                cached_data = self.cache[file_path]
+                for doc in cached_data.content:
+                    if total_tokens + doc["tokens"] > self.max_output_tokens:
+                        return result
+                    total_tokens += doc["tokens"]
+                result[file_path] = cached_data.model_dump()
+
+        return result
+
+    def get_all_files(self) -> List[FileInfo]:
+        all_files = []
+        for root, dirs, files in os.walk(self.path, followlinks=True):
+            dirs[:] = [d for d in dirs if not d.startswith(
+                ".") and d not in default_ignore_dirs]
+
+            if self.ignore_spec:
+                relative_root = os.path.relpath(root, self.path)
+                dirs[:] = [
+                    d
+                    for d in dirs
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, d))
+                ]
+                files = [
+                    f
+                    for f in files
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, f))
+                ]
+
+            for file in files:
+                if self.required_exts and not any(
+                    file.endswith(ext) for ext in self.required_exts
+                ):
+                    continue
+
+                file_path = os.path.join(root, file)
+                relative_path = os.path.relpath(file_path, self.path)
+                modify_time = os.path.getmtime(file_path)
+                file_md5 = generate_file_md5(file_path)
+                all_files.append(FileInfo(file_path=file_path,
+                                          relative_path=relative_path,
+                                          modify_time=modify_time,
+                                          file_md5=file_md5))
+
+        return all_files
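For context, a minimal usage sketch of the new backend, based only on the constructor and methods shown above. How emb_llm and AutoCoderArgs are built here is an assumption (fields beyond rag_build_name and hybrid_index_max_output_tokens are not shown in this diff), and a local Byzer Storage instance must already be reachable on 127.0.0.1:33333:

import byzerllm
from autocoder.common import AutoCoderArgs
from autocoder.rag.cache.local_byzer_storage_cache import LocalByzerStorageCache

# Assumption: an embedding-capable client; deployment details are omitted here
# (see the lite-mode emb_model.deploy(...) call in the auto_coder_rag.py diff above).
emb_llm = byzerllm.SimpleByzerLLM(default_model_name="emb")

args = AutoCoderArgs(rag_build_name="my_docs", hybrid_index_max_output_tokens=30000)

cache = LocalByzerStorageCache(
    path="/path/to/docs",       # placeholder document directory
    ignore_spec=None,
    required_exts=[".md"],
    extra_params=args,
    emb_llm=emb_llm,
)
cache.build_cache()             # chunk every file and write it into local Byzer Storage
hits = cache.get_cache({"query": "how is the hybrid index configured?"})
for file_path in hits:
    print(file_path)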

autocoder/rag/document_retriever.py
CHANGED

@@ -1,52 +1,20 @@
-import json
-import os
-import platform
-import time
-import traceback
-
 import threading
-from
-from typing import Dict, Generator, List, Tuple, Any, Optional
+from typing import Dict, Generator, List, Tuple, Any, Optional,Union
 
-import
-from loguru import logger
-from pydantic import BaseModel
+from byzerllm import ByzerLLM, SimpleByzerLLM
 
+from loguru import logger
 from autocoder.common import SourceCode
 from uuid import uuid4
-from autocoder.rag.variable_holder import VariableHolder
 from abc import ABC, abstractmethod
-from autocoder.rag.cache.base_cache import BaseCacheManager
 from autocoder.rag.cache.simple_cache import AutoCoderRAGAsyncUpdateQueue
 from autocoder.rag.cache.file_monitor_cache import AutoCoderRAGDocListener
 from autocoder.rag.cache.byzer_storage_cache import ByzerStorageCache
-from autocoder.rag.
+from autocoder.rag.cache.local_byzer_storage_cache import LocalByzerStorageCache
 from autocoder.common import AutoCoderArgs
 
 cache_lock = threading.Lock()
 
-
-def get_or_create_actor(path: str, ignore_spec, required_exts: list, cacher={}):
-    with cache_lock:
-        # Normalize the path into an actor name
-        actor_name = "AutoCoderRAGAsyncUpdateQueue_" + path.replace(
-            os.sep, "_"
-        ).replace(" ", "")
-        try:
-            actor = ray.get_actor(actor_name)
-        except ValueError:
-            actor = None
-        if actor is None:
-            actor = (
-                ray.remote(AutoCoderRAGAsyncUpdateQueue)
-                .options(name=actor_name, num_cpus=0)
-                .remote(path, ignore_spec, required_exts)
-            )
-            ray.get(actor.load_first.remote())
-        cacher[actor_name] = actor
-        return actor
-
-
 class BaseDocumentRetriever(ABC):
     """Abstract base class for document retrieval."""
 
@@ -77,6 +45,7 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         disable_auto_window: bool = False,
         enable_hybrid_index: bool = False,
         extra_params: Optional[AutoCoderArgs] = None,
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None,
     ) -> None:
         self.path = path
         self.ignore_spec = ignore_spec
@@ -91,23 +60,26 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
         # Maximum file size after merging
         self.small_file_merge_limit = self.single_file_token_limit / 2
 
-        self.on_ray = on_ray
-        if self.
-            self.
-        else:
-            if self.enable_hybrid_index:
+        self.on_ray = on_ray
+        if self.enable_hybrid_index:
+            if self.on_ray:
                 self.cacher = ByzerStorageCache(
                     path, ignore_spec, required_exts, extra_params
                 )
+            else:
+                self.cacher = LocalByzerStorageCache(
+                    path, ignore_spec, required_exts, extra_params,
+                    emb_llm = emb_llm
+                )
+        else:
+            if self.monitor_mode:
+                self.cacher = AutoCoderRAGDocListener(
+                    path, ignore_spec, required_exts
+                )
             else:
-
-
-
-                )
-            else:
-                self.cacher = AutoCoderRAGAsyncUpdateQueue(
-                    path, ignore_spec, required_exts
-                )
+                self.cacher = AutoCoderRAGAsyncUpdateQueue(
+                    path, ignore_spec, required_exts
+                )
 
         logger.info(f"DocumentRetriever initialized with:")
         logger.info(f"  Path: {self.path}")
@@ -123,10 +95,7 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
     )
 
     def get_cache(self, options: Optional[Dict[str, Any]] = None):
-
-            return ray.get(self.cacher.get_cache.remote())
-        else:
-            return self.cacher.get_cache(options=options)
+        return self.cacher.get_cache(options=options)
 
     def retrieve_documents(
         self, options: Optional[Dict[str, Any]] = None
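Condensed, the constructor now chooses its cache backend from enable_hybrid_index, on_ray, and monitor_mode; only the non-Ray hybrid path needs the new emb_llm argument. A small sketch of that decision (names mirror the attributes in the diff; the real constructor takes more arguments):

def pick_cacher(enable_hybrid_index: bool, on_ray: bool, monitor_mode: bool) -> str:
    # Mirrors the branch structure of LocalDocumentRetriever.__init__ after this change.
    if enable_hybrid_index:
        return "ByzerStorageCache" if on_ray else "LocalByzerStorageCache (requires emb_llm)"
    return "AutoCoderRAGDocListener" if monitor_mode else "AutoCoderRAGAsyncUpdateQueue"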

autocoder/rag/long_context_rag.py
CHANGED

@@ -86,6 +86,7 @@ class LongContextRAG:
         self.recall_llm = self.llm
         self.chunk_llm = self.llm
         self.qa_llm = self.llm
+        self.emb_llm = None
 
         if self.llm.get_sub_client("qa_model"):
             self.qa_llm = self.llm.get_sub_client("qa_model")
@@ -96,6 +97,9 @@ class LongContextRAG:
         if self.llm.get_sub_client("chunk_model"):
             self.chunk_llm = self.llm.get_sub_client("chunk_model")
 
+        if self.llm.get_sub_client("emb_model"):
+            self.emb_llm = self.llm.get_sub_client("emb_model")
+
         self.args = args
 
         self.path = path
@@ -169,6 +173,11 @@ class LongContextRAG:
 
         self.token_limit = self.args.rag_context_window_limit or 120000
         retriever_class = self._get_document_retriever_class()
+
+        if self.args.enable_hybrid_index and not self.on_ray:
+            if self.emb_llm is None:
+                raise ValueError("emb_llm is required for local byzer storage cache")
+
         self.document_retriever = retriever_class(
             self.path,
             self.ignore_spec,
@@ -179,7 +188,8 @@ class LongContextRAG:
             single_file_token_limit=self.full_text_limit - 100,
             disable_auto_window=self.args.disable_auto_window,
             enable_hybrid_index=self.args.enable_hybrid_index,
-            extra_params=self.args
+            extra_params=self.args,
+            emb_llm=self.emb_llm
         )
 
         self.doc_filter = DocFilter(
@@ -778,6 +788,12 @@ class LongContextRAG:
                     tokens=request_tokens
                 )
             ))
+
+            yield ("", SingleOutputMeta(input_tokens_count=rag_stat.recall_stat.total_input_tokens + rag_stat.chunk_stat.total_input_tokens,
+                                        generated_tokens_count=rag_stat.recall_stat.total_generated_tokens +
+                                        rag_stat.chunk_stat.total_generated_tokens,
+                                        reasoning_content="qa_model_thinking"
+                                        ))
 
         if LLMComputeEngine is not None and not self.args.disable_inference_enhance:
             llm_compute_engine = LLMComputeEngine(
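LongContextRAG now looks for an "emb_model" sub-client on the incoming LLM and forwards it to the document retriever; without one, enabling the hybrid index off Ray raises a ValueError. A sketch of how a caller registers that sub-client (the deploy configuration is abbreviated; see the lite-mode branch in auto_coder_rag.py above):

import byzerllm

llm = byzerllm.SimpleByzerLLM(default_model_name="v3_chat")
emb_model = byzerllm.SimpleByzerLLM(default_model_name="emb")  # deploy(...) omitted
llm.setup_sub_client("emb_model", emb_model)

# LongContextRAG.__init__ then picks it up via get_sub_client("emb_model") and
# passes emb_llm=self.emb_llm to the retriever, as shown in the diff above.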
autocoder/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.1.281"
+__version__ = "0.1.282"

{auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/LICENSE
File without changes

{auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/WHEEL
File without changes

{auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/entry_points.txt
File without changes

{auto_coder-0.1.281.dist-info → auto_coder-0.1.282.dist-info}/top_level.txt
File without changes