auto-coder 0.1.291__py3-none-any.whl → 0.1.293__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of auto-coder might be problematic.
- {auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/METADATA +2 -1
- {auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/RECORD +16 -14
- autocoder/auto_coder_rag.py +16 -0
- autocoder/auto_coder_server.py +8 -0
- autocoder/common/__init__.py +8 -4
- autocoder/common/mcp_hub.py +59 -18
- autocoder/common/mcp_servers/mcp_server_gpt4o_mini_search.py +153 -0
- autocoder/rag/api_server.py +8 -0
- autocoder/rag/cache/local_duckdb_storage_cache.py +647 -0
- autocoder/rag/document_retriever.py +12 -5
- autocoder/rag/long_context_rag.py +16 -16
- autocoder/version.py +1 -1
- {auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/top_level.txt +0 -0
{auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: auto-coder
-Version: 0.1.291
+Version: 0.1.293
 Summary: AutoCoder: AutoCoder
 Author: allwefantasy
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -36,6 +36,7 @@ Requires-Dist: google-generativeai
 Requires-Dist: protobuf
 Requires-Dist: azure-cognitiveservices-speech
 Requires-Dist: real-agent
+Requires-Dist: duckdb
 Requires-Dist: python-docx
 Requires-Dist: docx2txt
 Requires-Dist: pdf2image

{auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/RECORD CHANGED
@@ -1,18 +1,18 @@
 autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/auto_coder.py,sha256=zKqfMdm2F419hrNGaosW4kYJ3IbaxBKlpTlVl6JFWmE,65563
 autocoder/auto_coder_lang.py,sha256=Rtupq6N3_HT7JRhDKdgCBcwRaiAnyCOR_Gsp4jUomrI,3229
-autocoder/auto_coder_rag.py,sha256=
+autocoder/auto_coder_rag.py,sha256=yhwRh_TJZyBxBCmUusZ8h5guU42i0Z6UJ10mT0FH3Rc,33857
 autocoder/auto_coder_rag_client_mcp.py,sha256=QRxUbjc6A8UmDMQ8lXgZkjgqtq3lgKYeatJbDY6rSo0,6270
 autocoder/auto_coder_rag_mcp.py,sha256=-RrjNwFaS2e5v8XDIrKR-zlUNUE8UBaeOtojffBrvJo,8521
 autocoder/auto_coder_runner.py,sha256=w-4MCKhOFaoABcDfVoZoonF59UyRso3kghimQYLz3NA,100851
-autocoder/auto_coder_server.py,sha256=
+autocoder/auto_coder_server.py,sha256=E3Z829TPSooRSNhuh3_x9yaZi0f5G0Lm0ntoZhjGaoQ,20576
 autocoder/benchmark.py,sha256=Ypomkdzd1T3GE6dRICY3Hj547dZ6_inqJbBJIp5QMco,4423
 autocoder/chat_auto_coder.py,sha256=z_Kqd7CAecuNMa77kJn7iko2zTdko-4-o72a58H-_s8,24655
 autocoder/chat_auto_coder_lang.py,sha256=CjsiJsUaWr-TJBCDDlDNnFpCDTd-itJhd9aid9DKlp8,20542
 autocoder/command_args.py,sha256=9aYJ-AmPxP1sQh6ciw04FWHjSn31f2W9afXFwo8wgx4,30441
 autocoder/lang.py,sha256=U6AjVV8Rs1uLyjFCZ8sT6WWuNUxMBqkXXIOs4S120uk,14511
 autocoder/models.py,sha256=AyoZ-Pzy0oyYUmWCxOIRiOImsqboSfRET7LO9-UOuxI,11172
-autocoder/version.py,sha256=
+autocoder/version.py,sha256=uJLvEc9fkxd409iL_wj7Xexi0uD8yIeEHv4m5yx5T6E,23
 autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/agent/auto_demand_organizer.py,sha256=NWSAEsEk94vT3lGjfo25kKLMwYdPcpy9e-i21txPasQ,6942
 autocoder/agent/auto_filegroup.py,sha256=CW7bqp0FW1GIEMnl-blyAc2UGT7O9Mom0q66ITz1ckM,6635
@@ -31,7 +31,7 @@ autocoder/commands/auto_web.py,sha256=_449f4rCoRG7Sv0SB0hIBRFLPLPJ5DgWW4DlI22a3X
 autocoder/commands/tools.py,sha256=lanjoBGR6H8HDJSY3KrM6ibrtHZbgKX6mKJHSSE66dg,20493
 autocoder/common/JupyterClient.py,sha256=O-wi6pXeAEYhAY24kDa0BINrLYvKS6rKyWe98pDClS0,2816
 autocoder/common/ShellClient.py,sha256=fM1q8t_XMSbLBl2zkCNC2J9xuyKN3eXzGm6hHhqL2WY,2286
-autocoder/common/__init__.py,sha256=
+autocoder/common/__init__.py,sha256=LE-HHb_HwpWwRKwbwDlTqRzMV6nTzqF3RB55zVMDS3c,13656
 autocoder/common/anything2images.py,sha256=0ILBbWzY02M-CiWB-vzuomb_J1hVdxRcenAfIrAXq9M,25283
 autocoder/common/anything2img.py,sha256=iZQmg8srXlD7N5uGl5b_ONKJMBjYoW8kPmokkG6ISF0,10118
 autocoder/common/audio.py,sha256=Kn9nWKQddWnUrAz0a_ZUgjcu4VUU_IcZBigT7n3N3qc,7439
@@ -66,7 +66,7 @@ autocoder/common/image_to_page.py,sha256=yWiTJQ49Lm3j0FngiJhQ9u7qayqE_bOGb8Rk0Tm
 autocoder/common/index_import_export.py,sha256=h758AYY1df6JMTKUXYmMkSgxItfymDt82XT7O-ygEuw,4565
 autocoder/common/interpreter.py,sha256=62-dIakOunYB4yjmX8SHC0Gdy2h8NtxdgbpdqRZJ5vk,2833
 autocoder/common/llm_rerank.py,sha256=FbvtCzaR661Mt2wn0qsuiEL1Y3puD6jeIJS4zg_e7Bs,3260
-autocoder/common/mcp_hub.py,sha256=
+autocoder/common/mcp_hub.py,sha256=ymy580rkv8kFx2zwQFpMg03s9K8KWsJP3dkfjoYbWSU,16573
 autocoder/common/mcp_server.py,sha256=gKaQDQWeRZgHtR9UnuxHVgVbo0acrT9qA1kwtgDpHZU,16551
 autocoder/common/mcp_tools.py,sha256=KsLvRrB6pvmebqd-lDaSH6IBJR0AIxWRE-dtCEG_w9k,12485
 autocoder/common/memory_manager.py,sha256=2ZjYG7BPyvbYalZBF6AM_G5e10Qkw_zrqtD4Zd7GSsQ,3663
@@ -84,6 +84,7 @@ autocoder/common/text.py,sha256=KGRQq314GHBmY4MWG8ossRoQi1_DTotvhxchpn78c-k,1003
 autocoder/common/types.py,sha256=PXTETrsTvhLE49jqAeUKGySvxBN9pjeyCgRHLDYdd9U,664
 autocoder/common/utils_code_auto_generate.py,sha256=oiBjdCgdcQErfhMozFdHxkU84WmDo2euBA86yezha-g,3597
 autocoder/common/mcp_servers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+autocoder/common/mcp_servers/mcp_server_gpt4o_mini_search.py,sha256=z-c3zq0YT7wK2XK2t-tDxdXFTUtCFDfvyGTYaYwRgtM,5661
 autocoder/common/mcp_servers/mcp_server_perplexity.py,sha256=IXTyMpd1CQcBLzVinA-_OIOHoNmbzvuW6pXIadaKHJE,5533
 autocoder/data/byzerllm.md,sha256=SGCMpEaUQ0ysPxQsgzyyp5sgvEr8dZsxEGAfVcPBIq0,47741
 autocoder/data/tokenizer.json,sha256=7Lb5_DaYlDRvBRH0B0ynXO5c1fOwbQLxujX805-OEh0,7847602
@@ -115,13 +116,13 @@ autocoder/privacy/__init__.py,sha256=LnIVvGu_K66zCE-yhN_-dPO8R80pQyedCsXJ7wRqQaI
 autocoder/privacy/model_filter.py,sha256=-N9ZvxxDKpxU7hkn-tKv-QHyXjvkCopUaKgvJwTOGQs,3369
 autocoder/pyproject/__init__.py,sha256=ms-A_pocgGv0oZPEW8JAdXi7G-VSVhkQ6CnWFe535Ec,14477
 autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-autocoder/rag/api_server.py,sha256=
+autocoder/rag/api_server.py,sha256=gsk450_B-qGtBwJ1niG9-QFJAG0RGr2s2KdiMrzzbyQ,9582
 autocoder/rag/conversation_to_queries.py,sha256=xwmErn4WbdADnhK1me-h_6fV3KYrl_y1qPNQl1aoI6o,4810
 autocoder/rag/doc_filter.py,sha256=UduVO2mlrngwJICrefjDJTYfdmQ4GcRXrfWDQ7xXksk,14206
-autocoder/rag/document_retriever.py,sha256=
+autocoder/rag/document_retriever.py,sha256=5BDqKVJqLPScEnua5S5suXhWuCaALIfPf5obXeJoWfs,8461
 autocoder/rag/lang.py,sha256=_jmUtxZDG1fmF4b2mhMJbYS1YQDb2ZE8nyAn5_vrvjA,3350
 autocoder/rag/llm_wrapper.py,sha256=Ht5GF5yJtrztoliujsZzx_ooWZmHkd5xLZKcGEiicZw,4303
-autocoder/rag/long_context_rag.py,sha256=
+autocoder/rag/long_context_rag.py,sha256=6rqq0pvYe9N4TvyLwd2OB21ZUrPC4FfxZuks0weAz4A,41935
 autocoder/rag/qa_conversation_strategy.py,sha256=_BFdgit2KkUkW_82jE67QLYS_d8BsGhU1pG73YhHJgE,5744
 autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
 autocoder/rag/rag_entry.py,sha256=6TKtErZ0Us9XSV6HgRKXA6yR3SiZGPHpynOKSaR1wgE,2463
@@ -143,6 +144,7 @@ autocoder/rag/cache/byzer_storage_cache.py,sha256=okmNUDRCDv81JOfBuTspmTxf8ltYmY
 autocoder/rag/cache/cache_result_merge.py,sha256=VnTdbT2OMBmWl_83bqds97d9_M33IhPNX8tF7KH2GMM,10556
 autocoder/rag/cache/file_monitor_cache.py,sha256=OdSXTH3vo6inAzkN5d55I0RN03GUlSlnUEKmXpjFl78,9443
 autocoder/rag/cache/local_byzer_storage_cache.py,sha256=7_6zCRY3BiCM0ec3U96i1G4l2SzmAedaTfkArNkMfQU,31925
+autocoder/rag/cache/local_duckdb_storage_cache.py,sha256=4lnxjwluKqVWWfmd4giJ81O6bZlqEhesHfHGgEFx55I,25128
 autocoder/rag/cache/rag_file_meta.py,sha256=RQ3n4wfkHlB-1ljS3sFSi8ijbsUPeIqBSgjmmbRuwRI,20521
 autocoder/rag/cache/simple_cache.py,sha256=j9dxhei-Nwq9FJrrGOWhaDIDSb_Iz6JSojT1pelS9k4,13084
 autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
@@ -181,9 +183,9 @@ autocoder/utils/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/utils/auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 autocoder/utils/auto_coder_utils/chat_stream_out.py,sha256=lkJ_A-sYU36JMzjFWkk3pR6uos8oZHYt9GPsPe_CPAo,11766
 autocoder/utils/chat_auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
-auto_coder-0.1.
+auto_coder-0.1.293.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+auto_coder-0.1.293.dist-info/METADATA,sha256=wM3jXJXkDW9JVdM0Oy1EVukWKunNk2NKrPjRn658wK4,2665
+auto_coder-0.1.293.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+auto_coder-0.1.293.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
+auto_coder-0.1.293.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
+auto_coder-0.1.293.dist-info/RECORD,,
autocoder/auto_coder_rag.py CHANGED
@@ -179,6 +179,14 @@ def main(input_args: Optional[List[str]] = None):
     build_index_parser = subparsers.add_parser(
         "build_hybrid_index", help="Build hybrid index for RAG"
     )
+
+    build_index_parser.add_argument(
+        "--rag_storage_type",
+        type=str,
+        default="duckdb",
+        help="The storage type of the RAG, duckdb or byzer-storage",
+    )
+
     build_index_parser.add_argument(
         "--quick", action="store_true", help="Skip system initialization"
     )
@@ -329,6 +337,14 @@ def main(input_args: Optional[List[str]] = None):
         action="store_true",
         help="Enable hybrid index",
     )
+
+    serve_parser.add_argument(
+        "--rag_storage_type",
+        type=str,
+        default="duckdb",
+        help="The storage type of the RAG, duckdb or byzer-storage",
+    )
+
     serve_parser.add_argument(
         "--hybrid_index_max_output_tokens",
        type=int,
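Both the build_hybrid_index and serve subcommands gain the same flag, so passing --rag_storage_type duckdb (via whichever console script entry_points.txt wires up; this diff does not show it) selects the DuckDB backend explicitly, and omitting the flag defaults to DuckDB.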
autocoder/auto_coder_server.py CHANGED
@@ -34,6 +34,14 @@ import sys
 import io
 from autocoder.utils.log_capture import LogCapture
 
+# If dotenv is supported, use it
+if os.path.exists(".env"):
+    try:
+        from dotenv import load_dotenv
+        load_dotenv()
+    except ImportError:
+        pass
+
 def convert_yaml_config_to_str(yaml_config):
     yaml_content = yaml.safe_dump(
         yaml_config,
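Guarding the import keeps python-dotenv optional: when a .env file is present but the package is not installed, the server simply starts without it instead of failing. The same block is added to autocoder/rag/api_server.py below, so environment variables such as OPENAI_API_KEY, which the new MCP search server requires, can be supplied per project.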
autocoder/common/__init__.py CHANGED
@@ -292,10 +292,14 @@ class AutoCoderArgs(pydantic.BaseModel):
 
     rag_url: Optional[str] = ""
     rag_token: Optional[str] = ""
-    rag_type: Optional[str] = "storage"
-
-
-
+    rag_type: Optional[str] = "storage"
+    rag_storage_type: Optional[str] = "duckdb"  # vector storage type: byzer-storage | duckdb
+    rag_params_max_tokens: Optional[int] = 500000
+    rag_doc_filter_relevance: Optional[int] = 2
+    rag_context_window_limit: Optional[int] = 120000
+    rag_duckdb_vector_dim: Optional[int] = 1024  # dimensionality of DuckDB vector storage
+    rag_duckdb_query_similarity: Optional[float] = 0.1  # similarity threshold for DuckDB vector retrieval
+    rag_duckdb_query_top_k: Optional[int] = 10000  # DuckDB vector retrieval returns the top K results (above the similarity threshold)
     # local image host for RAG
     local_image_host: Optional[str] = ""
     rag_recall_max_queries: Optional[int] = 5
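A minimal sketch of setting the new knobs programmatically; the field names and defaults come from the diff above, while overriding them at construction time this way is an assumption (it holds if the model's remaining fields all carry defaults, as the Optional annotations suggest):

# Sketch only: field names/defaults are taken from the diff above; the
# override values are illustrative.
from autocoder.common import AutoCoderArgs

args = AutoCoderArgs(
    rag_storage_type="duckdb",         # or "byzer-storage"
    rag_duckdb_vector_dim=1024,        # embeddings are projected down to this dimension
    rag_duckdb_query_similarity=0.3,   # stricter than the 0.1 default
    rag_duckdb_query_top_k=100,        # far fewer candidates than the 10000 default
)
print(args.rag_storage_type)  # -> "duckdb"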
autocoder/common/mcp_hub.py CHANGED
@@ -2,8 +2,12 @@ import os
 import json
 import asyncio
 import aiohttp
+import importlib
+import pkgutil
+import re
+import inspect
 from datetime import datetime, timedelta
-from typing import Dict, List, Optional, Any, Set, Optional
+from typing import Dict, List, Optional, Any, Set, Optional, Tuple
 from pathlib import Path
 from pydantic import BaseModel, Field
 
@@ -62,23 +66,53 @@ class McpConnection:
         self.session = session
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def _generate_server_configs() -> Tuple[Dict[str, Any], Dict[str, str]]:
+    """
+    Scan the autocoder.common.mcp_servers directory for mcp_server_*.py files
+    and generate server configurations.
+
+    Returns:
+        Tuple of (built-in servers dict, JSON templates dict)
+    """
+    servers = {}
+    templates = {}
+
+    try:
+        package_name = "autocoder.common.mcp_servers"
+        package = importlib.import_module(package_name)
+
+        # Find all modules in the package
+        for _, name, _ in pkgutil.iter_modules(package.__path__, package.__name__ + "."):
+            # Only process modules that start with "mcp_server_"
+            base_name = name.split(".")[-1]
+            if base_name.startswith("mcp_server_"):
+                # Generate a friendly server name
+                friendly_name = base_name[11:]
+
+                # Create env dictionary with placeholders
+                env_dict = {}
+
+                # Create server configuration
+                config = {
+                    "command": "python",
+                    "args": ["-m", name],
+                    "env": env_dict
+                }
+
+                # Store in dictionaries
+                servers[friendly_name] = config
+                templates[friendly_name] = json.dumps({friendly_name: config}, indent=4)
+
+                logger.info(f"Detected MCP server: {friendly_name}")
+
+    except Exception as e:
+        logger.error(f"Error generating server configs: {e}")
+
+    return servers, templates
+
+
+# Automatically generate server configurations
+MCP_BUILD_IN_SERVERS, MCP_SERVER_TEMPLATES = _generate_server_configs()
 
 
 class McpHub:
@@ -422,3 +456,10 @@
         """
         for name in list(self.connections.keys()):
             await self.delete_connection(name)
+
+    @classmethod
+    def get_server_templates(cls) -> Dict[str, str]:
+        """
+        Get all available server templates as JSON strings
+        """
+        return MCP_SERVER_TEMPLATES
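Given the two modules present under autocoder/common/mcp_servers in this release (mcp_server_perplexity.py and the new mcp_server_gpt4o_mini_search.py, per the RECORD diff), the scan should yield built-in configs of this shape. This is a sketch derived by hand from _generate_server_configs above, not output captured from a run:

# Expected shape of MCP_BUILD_IN_SERVERS after the scan above (derived by
# hand from _generate_server_configs; not captured from a real run).
MCP_BUILD_IN_SERVERS = {
    "perplexity": {
        "command": "python",
        "args": ["-m", "autocoder.common.mcp_servers.mcp_server_perplexity"],
        "env": {},
    },
    "gpt4o_mini_search": {
        "command": "python",
        "args": ["-m", "autocoder.common.mcp_servers.mcp_server_gpt4o_mini_search"],
        "env": {},
    },
}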
autocoder/common/mcp_servers/mcp_server_gpt4o_mini_search.py ADDED
@@ -0,0 +1,153 @@
+from os import getenv
+from textwrap import dedent
+import sys
+
+import mcp.server.stdio
+import mcp.types as types
+from mcp.server import NotificationOptions, Server
+from mcp.server.models import InitializationOptions
+import json
+from openai import OpenAI
+
+OPENAI_API_KEY = getenv("OPENAI_API_KEY")
+# Check if API key is empty or None
+if not OPENAI_API_KEY:
+    print("Error: OPENAI_API_KEY environment variable is not set. Please set it before running this server.", file=sys.stderr)
+    sys.exit(1)
+
+OPENAI_API_BASE_URL = getenv(
+    "OPENAI_API_BASE_URL", "https://api.openai.com/v1")
+
+server = Server("mcp-server-gpt4o-mini-search")
+
+client = OpenAI(
+    api_key=OPENAI_API_KEY,
+    base_url=OPENAI_API_BASE_URL
+)
+
+
+@server.list_tools()
+async def handle_list_tools() -> list[types.Tool]:
+    return [
+        types.Tool(
+            name="gpt4o_mini_search",
+            description=dedent(
+                """
+                GPT-4o mini with search enables agents to gather information from the internet
+                in real-time, providing up-to-date answers with source citations.
+                This tool is ideal for fact-checking, research, and accessing current information
+                that might not be in the model's training data.
+
+                The search-enhanced responses include relevant web sources to support the information
+                provided, making it useful for obtaining verified and recent information.
+
+                [Response structure]
+                - id: A unique identifier for the response
+                - model: The model used (gpt-4o-mini-search-preview)
+                - object: The object type ("chat.completion")
+                - created: The Unix timestamp when the completion was created
+                - choices[]: The list of completion choices generated
+                - usage: Usage statistics for the completion request
+                """
+            ),
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "system_message": {
+                        "type": "string",
+                        "description": "Optional custom system message. If not provided, a default search-optimized system message will be used.",
+                    },
+                    "messages": {
+                        "type": "array",
+                        "description": "A list of messages comprising the conversation so far (excluding system message which is handled separately).",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "content": {
+                                    "type": "string",
+                                    "description": "The contents of the message in this turn of conversation.",
+                                },
+                                "role": {
+                                    "type": "string",
+                                    "description": "The role of the speaker in this turn of conversation.",
+                                    "enum": ["user", "assistant"],
+                                },
+                            },
+                            "required": ["content", "role"],
+                        },
+                    },
+                },
+                "required": ["messages"],
+            },
+        )
+    ]
+
+
+@server.call_tool()
+async def handle_call_tool(
+    name: str, arguments: dict
+) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
+    if name != "gpt4o_mini_search":
+        raise ValueError(f"Unknown tool: {name}")
+
+    # Extract user messages
+    user_messages = arguments.get("messages", [])
+
+    # Define default system message if not provided
+    default_system_message = (
+        "You are a professional search assistant. You must:\n"
+        "1. Provide clearly formatted information based on the user's query\n"
+        "2. Embed links using the [title](URL) format\n"
+        "3. Attach the source after each piece of information\n"
+        "4. Separate different results with '---'\n"
+        "5. Cite directly in the text; do not use numbered references\n"
+        "6. Always provide complete URLs"
+    )
+
+    # Use custom system message if provided, otherwise use default
+    system_message = arguments.get("system_message", default_system_message)
+
+    # Prepare full message list with system message first
+    full_messages = [{"role": "system", "content": system_message}]
+    full_messages.extend(user_messages)
+
+    try:
+        # Initialize OpenAI client
+
+        # Make the API call using the OpenAI SDK
+        completion = client.chat.completions.create(
+            model="gpt-4o-mini-search-preview",
+            messages=full_messages
+        )
+
+        # Extract content from response
+        content = completion.choices[0].message.content
+
+    except Exception as e:
+        raise RuntimeError(f"API error: {str(e)}")
+
+    return [types.TextContent(
+        type="text",
+        text=content,
+    )]
+
+
+async def main():
+    async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
+        await server.run(
+            read_stream,
+            write_stream,
+            InitializationOptions(
+                server_name="mcp-server-gpt4o-mini-search",
+                server_version="0.1.0",
+                capabilities=server.get_capabilities(
+                    notification_options=NotificationOptions(
+                        tools_changed=True),
+                    experimental_capabilities={},
+                ),
+            ),
+        )
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
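The inputSchema above fixes the call shape for the tool; below is a sketch of arguments that would validate against it (the query text is illustrative). handle_call_tool prepends the default or supplied system message and forwards the resulting list to the gpt-4o-mini-search-preview model:

# Illustrative arguments for the gpt4o_mini_search tool, matching the
# inputSchema declared above.
arguments = {
    "messages": [
        {"role": "user", "content": "What is the latest stable DuckDB release?"},
    ],
    # "system_message" is optional; the server falls back to its built-in
    # search-optimized default when it is omitted.
}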
autocoder/rag/api_server.py CHANGED
@@ -31,6 +31,14 @@ from byzerllm.utils.client.entrypoints.openai.protocol import (
 from pydantic import BaseModel
 from typing import List,Optional
 
+# If dotenv is supported, use it
+if os.path.exists(".env"):
+    try:
+        from dotenv import load_dotenv
+        load_dotenv()
+    except ImportError:
+        pass
+
 logger = init_logger(__name__)
 
 llm_client: ByzerLLM = None
autocoder/rag/cache/local_duckdb_storage_cache.py ADDED
@@ -0,0 +1,647 @@
+import hashlib
+import json
+import os
+import time
+import platform
+import threading
+from multiprocessing import Pool
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Dict, Any, Optional, Tuple, Union
+import numpy as np
+from loguru import logger
+
+try:
+    import duckdb
+except ImportError:
+    logger.error("DuckDB is not installed, please install it using 'pip install duckdb'")
+    raise
+
+from autocoder.common import AutoCoderArgs
+from autocoder.common import SourceCode
+from autocoder.rag.cache.base_cache import (
+    BaseCacheManager,
+    DeleteEvent,
+    AddOrUpdateEvent,
+    FileInfo,
+    CacheItem
+)
+from autocoder.rag.utils import process_file_in_multi_process, process_file_local
+from autocoder.rag.variable_holder import VariableHolder
+from byzerllm import SimpleByzerLLM, ByzerLLM
+
+if platform.system() != "Windows":
+    import fcntl
+else:
+    fcntl = None
+
+
+default_ignore_dirs = [
+    "__pycache__",
+    "node_modules",
+    "_images"
+]
+
+
+def generate_file_md5(file_path: str) -> str:
+    md5_hash = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5_hash.update(chunk)
+    return md5_hash.hexdigest()
+
+
+class DuckDBLocalContext:
+    def __init__(self, database_path: str):
+        self.database_path = database_path
+        self._conn = None
+
+    def _install_load_extension(self, ext_list):
+        for ext in ext_list:
+            self._conn.install_extension(ext)
+            self._conn.load_extension(ext)
+
+    def __enter__(self) -> "duckdb.DuckDBPyConnection":
+        if not os.path.exists(os.path.dirname(self.database_path)):
+            raise ValueError(
+                f"Directory {os.path.dirname(self.database_path)} does not exist."
+            )
+
+        self._conn = duckdb.connect(self.database_path)
+        self._install_load_extension(["json", "fts", "vss"])
+
+        return self._conn
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self._conn:
+            self._conn.close()
+
+
+class LocalDuckdbStorage:
+
+    def __init__(
+        self, llm: Union[ByzerLLM, SimpleByzerLLM] = None, database_name: str = ":memory:", table_name: str = "documents",
+        embed_dim: Optional[int] = None, persist_dir: str = "./storage"
+    ) -> None:
+        self.llm = llm
+        self.database_name = database_name
+        self.table_name = table_name
+        self.embed_dim = embed_dim
+        self.persist_dir = persist_dir
+        self.cache_dir = os.path.join(self.persist_dir, '.cache')
+        logger.info("Starting DuckDBVectorStore.")
+
+        if self.database_name != ":memory:":
+            self.database_path = os.path.join(self.cache_dir, self.database_name)
+
+        if self.database_name == ":memory:":
+            self._conn = duckdb.connect(self.database_name)
+            self._install_load_extension(["json", "fts", "vss"])
+            self._initialize()
+        else:
+            if not os.path.exists(self.database_path):
+                if not os.path.exists(self.cache_dir):
+                    os.makedirs(self.cache_dir)
+            self._initialize()
+            self._conn = None
+        logger.info(f"DuckDBVectorStore initialized, storage dir: {self.cache_dir}, "
+                    f"database name: {self.database_name}, table name: {self.table_name}")
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "DuckDBVectorStore"
+
+    @property
+    def client(self) -> Any:
+        """Return client."""
+        return self._conn
+
+    def _install_load_extension(self, ext_list):
+        for ext in ext_list:
+            self._conn.install_extension(ext)
+            self._conn.load_extension(ext)
+
+    @staticmethod
+    def _apply_pca(embedding, target_dim):
+        # Generate a fixed random projection matrix (avoid regenerating it on every call)
+        np.random.seed(42)  # fix the random seed for consistency
+        source_dim = len(embedding)
+        projection_matrix = np.random.randn(source_dim, target_dim) / np.sqrt(source_dim)
+
+        # Apply the projection
+        reduced = np.dot(embedding, projection_matrix)
+        return reduced
+
+    def _embedding(self, context: str, norm: bool = True, dim: int | None = None) -> List[float]:
+        embedding = self.llm.emb_query(context)[0].output
+
+        if dim:
+            embedding = self._apply_pca(embedding, target_dim=dim)  # reduced shape, e.g. (1024,)
+
+        if norm:
+            embedding = embedding / np.linalg.norm(embedding)
+
+        return embedding.tolist()
+
+    def _initialize(self) -> None:
+        if self.embed_dim is None:
+            _query = f"""
+            CREATE TABLE IF NOT EXISTS {self.table_name} (
+                _id VARCHAR,
+                file_path VARCHAR,
+                content TEXT,
+                raw_content TEXT,
+                vector FLOAT[],
+                mtime FLOAT
+            );
+            """
+        else:
+            _query = f"""
+            CREATE TABLE IF NOT EXISTS {self.table_name} (
+                _id VARCHAR,
+                file_path VARCHAR,
+                content TEXT,
+                raw_content TEXT,
+                vector FLOAT[],
+                mtime FLOAT
+            );
+            """
+
+        if self.database_name == ":memory:":
+            self._conn.execute(_query)
+        elif self.database_path is not None:
+            with DuckDBLocalContext(self.database_path) as _conn:
+                _conn.execute(_query)
+
+    def truncate_table(self):
+        _truncate_query = f"""TRUNCATE TABLE {self.table_name};"""
+        if self.database_name == ":memory:":
+            self._conn.execute(_truncate_query)
+        elif self.database_path is not None:
+            with DuckDBLocalContext(self.database_path) as _conn:
+                _conn.execute(_truncate_query)
+
+    def query_by_path(self, file_path: str):
+        _exists_query = f"""SELECT _id FROM {self.table_name} WHERE file_path = ?"""
+        query_params = [
+            file_path
+        ]
+        _final_results = []
+        if self.database_name == ":memory:":
+            _final_results = self._conn.execute(_exists_query, query_params).fetchall()
+        elif self.database_path is not None:
+            with DuckDBLocalContext(self.database_path) as _conn:
+                _final_results = _conn.execute(_exists_query, query_params).fetchall()
+        return _final_results
+
+    def delete_by_ids(self, _ids: List[str]):
+        _delete_query = f"""DELETE FROM {self.table_name} WHERE _id IN (?);"""
+        query_params = [
+            ','.join(_ids)
+        ]
+        if self.database_name == ":memory:":
+            _final_results = self._conn.execute(_delete_query, query_params).fetchall()
+        elif self.database_path is not None:
+            with DuckDBLocalContext(self.database_path) as _conn:
+                _final_results = _conn.execute(_delete_query, query_params).fetchall()
+        return _final_results
+
+    def _node_to_table_row(self, context_chunk: Dict[str, str | float], dim: int | None = None) -> Any:
+        return (
+            context_chunk["_id"],
+            context_chunk["file_path"],
+            context_chunk["content"],
+            context_chunk["raw_content"],
+            self._embedding(context_chunk["raw_content"], norm=True, dim=dim),
+            context_chunk["mtime"]
+        )
+
+    def add_doc(self, context_chunk: Dict[str, str | float], dim: int | None = None):
+        """
+        {
+            "_id": f"{doc.module_name}_{chunk_idx}",
+            "file_path": file_info.file_path,
+            "content": chunk,
+            "raw_content": chunk,
+            "vector": chunk,
+            "mtime": file_info.modify_time,
+        }
+        """
+        if self.database_name == ":memory:":
+            _table = self._conn.table(self.table_name)
+            _row = self._node_to_table_row(context_chunk, dim=dim)
+            _table.insert(_row)
+        elif self.database_path is not None:
+            with DuckDBLocalContext(self.database_path) as _conn:
+                _table = _conn.table(self.table_name)
+                _row = self._node_to_table_row(context_chunk, dim=dim)
+                _table.insert(_row)
+
+    def vector_search(
+        self, query: str, similarity_value: float = 0.7, similarity_top_k: int = 10, query_dim: int | None = None
+    ):
+        """
+        list_cosine_similarity: computes the cosine similarity between two lists
+        list_cosine_distance: computes the cosine distance between two lists
+        list_dot_product: computes the dot product of two equal-sized numeric lists
+        """
+        _db_query = f"""
+            SELECT _id, file_path, mtime, score
+            FROM (
+                SELECT *, list_cosine_similarity(vector, ?) AS score
+                FROM {self.table_name}
+            ) sq
+            WHERE score IS NOT NULL
+            AND score >= ?
+            ORDER BY score DESC LIMIT ?;
+        """
+        query_params = [
+            self._embedding(query, norm=True, dim=query_dim),
+            similarity_value,
+            similarity_top_k,
+        ]
+
+        _final_results = []
+        if self.database_name == ":memory:":
+            _final_results = self._conn.execute(_db_query, query_params).fetchall()
+        elif self.database_path is not None:
+            with DuckDBLocalContext(self.database_path) as _conn:
+                _final_results = _conn.execute(_db_query, query_params).fetchall()
+        return _final_results
+
+
+default_ignore_dirs = [
+    "__pycache__",
+    "node_modules",
+    "_images"
+]
+
+
+class LocalDuckDBStorageCache(BaseCacheManager):
+    def __init__(
+        self,
+        path,
+        ignore_spec,
+        required_exts,
+        extra_params: Optional[AutoCoderArgs] = None,
+        emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None
+    ):
+        self.path = path
+        self.ignore_spec = ignore_spec
+        self.required_exts = required_exts
+        self.extra_params = extra_params
+
+        self.storage = LocalDuckdbStorage(
+            llm=emb_llm,
+            database_name="byzerai_store_duckdb.db",
+            table_name="rag_duckdb",
+            persist_dir=self.path
+        )
+        self.queue = []
+        self.chunk_size = 1000
+        self.max_output_tokens = extra_params.hybrid_index_max_output_tokens
+
+        # Set up the cache file paths
+        self.cache_dir = os.path.join(self.path, ".cache")
+        self.cache_file = os.path.join(self.cache_dir, "duckdb_storage_speedup.jsonl")
+        self.cache: Dict[str, CacheItem] = {}
+        # Create the cache directory
+        if not os.path.exists(self.cache_dir):
+            os.makedirs(self.cache_dir)
+
+        self.lock = threading.Lock()
+        self.stop_event = threading.Event()
+        self.thread = threading.Thread(target=self.process_queue)
+        self.thread.daemon = True
+        self.thread.start()
+
+        # Load the cache
+        self.cache = self._load_cache()
+
+    @staticmethod
+    def _chunk_text(text, max_length=1000):
+        """Split text into chunks"""
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        for line in text.split("\n"):
+            if current_length + len(line) > max_length and current_chunk:
+                chunks.append("\n".join(current_chunk))
+                current_chunk = []
+                current_length = 0
+            current_chunk.append(line)
+            current_length += len(line)
+
+        if current_chunk:
+            chunks.append("\n".join(current_chunk))
+
+        return chunks
+
+    def _load_cache(self) -> Dict[str, CacheItem]:
+        """Load cache from file"""
+        if os.path.exists(self.cache_file):
+            try:
+                with open(self.cache_file, "r", encoding="utf-8") as f:
+                    lines = f.readlines()
+                    cache = {}
+                    for line in lines:
+                        try:
+                            data = json.loads(line.strip())
+                            if isinstance(data, dict) and "file_path" in data:
+                                # Convert to a CacheItem object
+                                cache_item = CacheItem.model_validate(data)
+                                cache[data["file_path"]] = cache_item
+                        except json.JSONDecodeError:
+                            continue
+                    return cache
+            except Exception as e:
+                logger.error(f"Error loading cache file: {str(e)}")
+                return {}
+        return {}
+
+    def write_cache(self):
+        cache_file = self.cache_file
+
+        if not fcntl:
+            try:
+                with open(cache_file, "w", encoding="utf-8") as f:
+                    for cache_item in self.cache.values():
+                        # Make sure the Pydantic model is serialized
+                        json.dump(cache_item.model_dump(), f, ensure_ascii=False)
+                        f.write("\n")
+            except IOError as e:
+                logger.error(f"Error writing cache file: {str(e)}")
+        else:
+            lock_file = cache_file + ".lock"
+            with open(lock_file, "w", encoding="utf-8") as lockf:
+                try:
+                    # Acquire the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    # Write the cache file
+                    with open(cache_file, "w", encoding="utf-8") as f:
+                        for cache_item in self.cache.values():
+                            # Make sure the Pydantic model is serialized
+                            json.dump(cache_item.model_dump(), f, ensure_ascii=False)
+                            f.write("\n")
+
+                finally:
+                    # Release the file lock
+                    fcntl.flock(lockf, fcntl.LOCK_UN)
+
+    @staticmethod
+    def fileinfo_to_tuple(file_info: FileInfo) -> Tuple[str, str, float, str]:
+        return file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5
+
+    def build_cache(self):
+        """Build the cache by reading files and storing in DuckDBVectorStore"""
+        logger.info(f"Building cache for path: {self.path}")
+
+        files_to_process = []
+        for file_info in self.get_all_files():
+            if (
+                file_info.file_path not in self.cache
+                or self.cache[file_info.file_path].md5 != file_info.file_md5
+            ):
+                files_to_process.append(file_info)
+
+        if not files_to_process:
+            return
+
+        from autocoder.rag.token_counter import initialize_tokenizer
+
+        with Pool(
+            processes=os.cpu_count(),
+            initializer=initialize_tokenizer,
+            initargs=(VariableHolder.TOKENIZER_PATH,),
+        ) as pool:
+            target_files_to_process = []
+            for file_info in files_to_process:
+                target_files_to_process.append(
+                    self.fileinfo_to_tuple(file_info))
+            results = pool.map(process_file_in_multi_process,
+                               target_files_to_process)
+
+        items = []
+        for file_info, result in zip(files_to_process, results):
+            content: List[SourceCode] = result
+            self.cache[file_info.file_path] = CacheItem(
+                file_path=file_info.file_path,
+                relative_path=file_info.relative_path,
+                content=[c.model_dump() for c in content],
+                modify_time=file_info.modify_time,
+                md5=file_info.file_md5,
+            )
+
+            for doc in content:
+                logger.info(f"Processing file: {doc.module_name}")
+                chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                for chunk_idx, chunk in enumerate(chunks):
+                    chunk_item = {
+                        "_id": f"{doc.module_name}_{chunk_idx}",
+                        "file_path": file_info.file_path,
+                        "content": chunk,
+                        "raw_content": chunk,
+                        "vector": "",
+                        "mtime": file_info.modify_time,
+                    }
+                    items.append(chunk_item)
+
+        # Save to local cache
+        logger.info("Saving cache to local file")
+        self.write_cache()
+
+        if items:
+            logger.info("Clear cache from Byzer DuckDB Storage")
+            self.storage.truncate_table()
+            logger.info("Save new cache to Byzer DuckDB Storage")
+
+            total_chunks = len(items)
+            completed_chunks = 0
+
+            logger.info(f"Progress: {0}/{total_chunks} text chunks completed")
+
+            for _chunk in items:
+                try:
+                    self.storage.add_doc(_chunk, dim=self.extra_params.rag_duckdb_vector_dim)
+                    completed_chunks += 1
+                    logger.info(f"Progress: {completed_chunks}/{total_chunks} text chunks completed")
+                    time.sleep(self.extra_params.anti_quota_limit)
+                except Exception as err:
+                    logger.error(f"Error in saving chunk: {str(err)}")
+
+    def update_storage(self, file_info: FileInfo, is_delete: bool):
+        results = self.storage.query_by_path(file_info.file_path)
+        if results:  # [('_id',)]
+            for result in results:
+                self.storage.delete_by_ids([result[0]])
+
+        items = []
+        if not is_delete:
+            content = [
+                SourceCode.model_validate(doc) for doc in self.cache[file_info.file_path].content
+            ]
+            modify_time = self.cache[file_info.file_path].modify_time
+            for doc in content:
+                logger.info(f"Processing updated file: {doc.module_name}")
+                chunks = self._chunk_text(doc.source_code, self.chunk_size)
+                for chunk_idx, chunk in enumerate(chunks):
+                    chunk_item = {
+                        "_id": f"{doc.module_name}_{chunk_idx}",
+                        "file_path": file_info.file_path,
+                        "content": chunk,
+                        "raw_content": chunk,
+                        "vector": chunk,
+                        "mtime": modify_time,
+                    }
+                    items.append(chunk_item)
+        if items:
+            for _chunk in items:
+                try:
+                    self.storage.add_doc(_chunk, dim=self.extra_params.rag_duckdb_vector_dim)
+                    time.sleep(self.extra_params.anti_quota_limit)
+                except Exception as err:
+                    logger.error(f"Error in saving chunk: {str(err)}")
+
+    def process_queue(self):
+        while self.queue:
+            file_list = self.queue.pop(0)
+            if isinstance(file_list, DeleteEvent):
+                for item in file_list.file_paths:
+                    logger.info(f"{item} is detected to be removed")
+                    del self.cache[item]
+                    # Create a temporary FileInfo object
+                    file_info = FileInfo(
+                        file_path=item, relative_path="", modify_time=0, file_md5="")
+                    self.update_storage(file_info, is_delete=True)
+
+            elif isinstance(file_list, AddOrUpdateEvent):
+                for file_info in file_list.file_infos:
+                    logger.info(
+                        f"{file_info.file_path} is detected to be updated")
+                    # Process the file and create a CacheItem
+                    # content = process_file_local(
+                    #     self.fileinfo_to_tuple(file_info))
+                    content = process_file_local(file_info.file_path)
+                    self.cache[file_info.file_path] = CacheItem(
+                        file_path=file_info.file_path,
+                        relative_path=file_info.relative_path,
+                        content=[c.model_dump() for c in content],
+                        modify_time=file_info.modify_time,
+                        md5=file_info.file_md5,
+                    )
+                    self.update_storage(file_info, is_delete=False)
+            self.write_cache()
+
+    def trigger_update(self):
+        logger.info("Checking whether files have been updated.....")
+        files_to_process = []
+        current_files = set()
+        for file_info in self.get_all_files():
+            current_files.add(file_info.file_path)
+            if (
+                file_info.file_path not in self.cache
+                or self.cache[file_info.file_path].md5 != file_info.file_md5
+            ):
+                files_to_process.append(file_info)
+
+        deleted_files = set(self.cache.keys()) - current_files
+        logger.info(f"Files to process: {len(files_to_process)}")
+        logger.info(f"Deleted files: {len(deleted_files)}")
+        if deleted_files:
+            with self.lock:
+                self.queue.append(DeleteEvent(file_paths=deleted_files))
+        if files_to_process:
+            with self.lock:
+                self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
+
+    def get_all_files(self) -> List[FileInfo]:
+        all_files = []
+        for root, dirs, files in os.walk(self.path, followlinks=True):
+            dirs[:] = [d for d in dirs if not d.startswith(
+                ".") and d not in default_ignore_dirs]
+
+            if self.ignore_spec:
+                relative_root = os.path.relpath(root, self.path)
+                dirs[:] = [
+                    d
+                    for d in dirs
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, d))
+                ]
+                files = [
+                    f
+                    for f in files
+                    if not self.ignore_spec.match_file(os.path.join(relative_root, f))
+                ]
+
+            for file in files:
+                if self.required_exts and not any(
+                    file.endswith(ext) for ext in self.required_exts
+                ):
+                    continue
+
+                file_path = os.path.join(root, file)
+                relative_path = os.path.relpath(file_path, self.path)
+                modify_time = os.path.getmtime(file_path)
+                file_md5 = generate_file_md5(file_path)
+                all_files.append(
+                    FileInfo(
+                        file_path=file_path,
+                        relative_path=relative_path,
+                        modify_time=modify_time,
+                        file_md5=file_md5))
+
+        return all_files
+
+    def get_cache(self, options: Optional[Dict[str, Any]] = None) -> Dict[str, Dict]:
+        """Search cached documents using query"""
+        self.trigger_update()  # check for updates
+
+        if options is None or "query" not in options:
+            return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}
+
+        query = options.get("query", "")
+        logger.info(f"Retrieving data via vector search, query: {query}")
+        total_tokens = 0
+        results = []
+
+        # Add vector search if enabled
+        if options.get("enable_vector_search", True):
+            # Results look like [(_id, file_path, mtime, score), ...]
+            # results = self.storage.vector_search(query, similarity_value=0.7, similarity_top_k=200)
+            search_results = self.storage.vector_search(
+                query,
+                similarity_value=self.extra_params.duckdb_query_similarity,
+                similarity_top_k=self.extra_params.duckdb_query_top_k,
+                query_dim=self.extra_params.duckdb_vector_dim
+            )
+            results.extend(search_results)
+
+        # Group results by file_path and reconstruct documents while preserving order
+        # Ranking could be optimized further here, e.g. by weighing how often a document appears and where it ranks
+        file_paths = []
+        seen = set()
+        for result in results:
+            _id, _file_path, _mtime, _score = result
+            if _file_path not in seen:
+                seen.add(_file_path)
+                file_paths.append(_file_path)
+
+        # Fetch file contents from the cache
+        result = {}
+        for file_path in file_paths:
+            if file_path in self.cache:
+                cached_data = self.cache[file_path]
+                for doc in cached_data.content:
+                    if total_tokens + doc["tokens"] > self.max_output_tokens:
+                        logger.info(
+                            f"Retrieval exceeded the configured Hybrid Index Max Tokens: {self.max_output_tokens}, "
+                            f"cumulative tokens: {total_tokens}, "
+                            f"vector search returned {len(result.keys())} documents out of {len(self.cache.keys())}")
+                        return result
+                    total_tokens += doc["tokens"]
+                result[file_path] = cached_data.model_dump()
+        logger.info(
+            f"Hybrid Index Max Tokens is set to {self.max_output_tokens}, "
+            f"cumulative tokens: {total_tokens}, "
+            f"vector search returned {len(result.keys())} documents out of {len(self.cache.keys())}")
+        return result
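Two reading notes on the file above. First, _apply_pca is, despite its name, a fixed-seed random projection rather than PCA: each call reseeds NumPy and multiplies the embedding by a source_dim × target_dim Gaussian matrix scaled by 1/sqrt(source_dim). Second, build_cache reads rag_duckdb_vector_dim from extra_params, while get_cache reads duckdb_query_similarity, duckdb_query_top_k, and duckdb_vector_dim without the rag_ prefix; unless AutoCoderArgs also defines those un-prefixed fields (they do not appear in this diff), the query path would fail with an AttributeError. Below is a self-contained sketch of the vector_search query shape against an in-memory DuckDB database, using made-up 3-dimensional vectors in place of real embeddings:

# Self-contained sketch of the vector_search() query shape above, run against
# an in-memory DuckDB database with tiny made-up vectors instead of real
# embeddings. Assumes a recent DuckDB where list_cosine_similarity is built
# in. Requires: pip install duckdb numpy
import duckdb
import numpy as np

conn = duckdb.connect(":memory:")
conn.execute(
    "CREATE TABLE documents (_id VARCHAR, file_path VARCHAR, vector FLOAT[], mtime FLOAT)"
)
conn.executemany(
    "INSERT INTO documents VALUES (?, ?, ?, ?)",
    [
        ("a_0", "a.md", [1.0, 0.0, 0.0], 0.0),
        ("b_0", "b.md", [0.6, 0.8, 0.0], 0.0),
    ],
)

# Normalize the query vector, as _embedding(norm=True) does above.
query_vec = np.array([1.0, 0.0, 0.0])
query_vec = (query_vec / np.linalg.norm(query_vec)).tolist()

rows = conn.execute(
    """
    SELECT _id, file_path, mtime, score
    FROM (SELECT *, list_cosine_similarity(vector, ?) AS score FROM documents) sq
    WHERE score IS NOT NULL AND score >= ?
    ORDER BY score DESC LIMIT ?;
    """,
    [query_vec, 0.5, 10],
).fetchall()
print(rows)  # a.md scores 1.0; b.md scores about 0.6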
autocoder/rag/document_retriever.py CHANGED
@@ -11,6 +11,7 @@ from autocoder.rag.cache.simple_cache import AutoCoderRAGAsyncUpdateQueue
 from autocoder.rag.cache.file_monitor_cache import AutoCoderRAGDocListener
 from autocoder.rag.cache.byzer_storage_cache import ByzerStorageCache
 from autocoder.rag.cache.local_byzer_storage_cache import LocalByzerStorageCache
+from autocoder.rag.cache.local_duckdb_storage_cache import LocalDuckDBStorageCache
 from autocoder.common import AutoCoderArgs
 
 cache_lock = threading.Lock()
@@ -66,11 +67,17 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
                 self.cacher = ByzerStorageCache(
                     path, ignore_spec, required_exts, extra_params
                 )
-            else:
-
-
-
-
+            else:
+                if extra_params.rag_storage_type == "duckdb":
+                    self.cacher = LocalDuckDBStorageCache(
+                        path, ignore_spec, required_exts, extra_params,
+                        emb_llm=emb_llm
+                    )
+                elif extra_params.rag_storage_type in ["byzer-storage", "byzer_storage"]:
+                    self.cacher = LocalByzerStorageCache(
+                        path, ignore_spec, required_exts, extra_params,
+                        emb_llm=emb_llm
+                    )
         else:
             if self.monitor_mode:
                 self.cacher = AutoCoderRAGDocListener(
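Note how the defaults interact: rag_storage_type defaults to "duckdb" in AutoCoderArgs, so hybrid-index setups that previously got the local Byzer Storage cache must now pass byzer-storage (or byzer_storage) explicitly to keep that backend, and any other value leaves this branch without assigning a cacher at all.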
autocoder/rag/long_context_rag.py CHANGED
@@ -210,22 +210,22 @@ class LongContextRAG:
 
         avg_tokens = statistics.mean(token_counts) if token_counts else 0
         median_tokens = statistics.median(token_counts) if token_counts else 0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not self.client:
+            logger.info(
+                "RAG Configuration:\n"
+                f"  Total docs: {doc_num}\n"
+                f"  Total tokens: {token_num}\n"
+                f"  Tokenizer path: {self.tokenizer_path}\n"
+                f"  Relevant score: {self.relevant_score}\n"
+                f"  Token limit: {self.token_limit}\n"
+                f"  Full text limit: {self.full_text_limit}\n"
+                f"  Segment limit: {self.segment_limit}\n"
+                f"  Buff limit: {self.buff_limit}\n"
+                f"  Max doc tokens: {max(token_counts) if token_counts else 0}\n"
+                f"  Min doc tokens: {min(token_counts) if token_counts else 0}\n"
+                f"  Avg doc tokens: {avg_tokens:.2f}\n"
+                f"  Median doc tokens: {median_tokens:.2f}\n"
+            )
 
     def count_tokens(self, text: str) -> int:
         if self.tokenizer is None:
autocoder/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.1.291"
+__version__ = "0.1.293"
{auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/LICENSE: File without changes
{auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/WHEEL: File without changes
{auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/entry_points.txt: File without changes
{auto_coder-0.1.291.dist-info → auto_coder-0.1.293.dist-info}/top_level.txt: File without changes