auto-coder 0.1.291__py3-none-any.whl → 0.1.293__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic; review the diff details below.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: auto-coder
3
- Version: 0.1.291
3
+ Version: 0.1.293
4
4
  Summary: AutoCoder: AutoCoder
5
5
  Author: allwefantasy
6
6
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -36,6 +36,7 @@ Requires-Dist: google-generativeai
36
36
  Requires-Dist: protobuf
37
37
  Requires-Dist: azure-cognitiveservices-speech
38
38
  Requires-Dist: real-agent
39
+ Requires-Dist: duckdb
39
40
  Requires-Dist: python-docx
40
41
  Requires-Dist: docx2txt
41
42
  Requires-Dist: pdf2image
@@ -1,18 +1,18 @@
1
1
  autocoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  autocoder/auto_coder.py,sha256=zKqfMdm2F419hrNGaosW4kYJ3IbaxBKlpTlVl6JFWmE,65563
3
3
  autocoder/auto_coder_lang.py,sha256=Rtupq6N3_HT7JRhDKdgCBcwRaiAnyCOR_Gsp4jUomrI,3229
4
- autocoder/auto_coder_rag.py,sha256=NGg0X1E4RltWHDDvyfwPPv2DiaBP5cRr9FGBpzL85oo,33483
4
+ autocoder/auto_coder_rag.py,sha256=yhwRh_TJZyBxBCmUusZ8h5guU42i0Z6UJ10mT0FH3Rc,33857
5
5
  autocoder/auto_coder_rag_client_mcp.py,sha256=QRxUbjc6A8UmDMQ8lXgZkjgqtq3lgKYeatJbDY6rSo0,6270
6
6
  autocoder/auto_coder_rag_mcp.py,sha256=-RrjNwFaS2e5v8XDIrKR-zlUNUE8UBaeOtojffBrvJo,8521
7
7
  autocoder/auto_coder_runner.py,sha256=w-4MCKhOFaoABcDfVoZoonF59UyRso3kghimQYLz3NA,100851
8
- autocoder/auto_coder_server.py,sha256=6YQweNEKUrGAZ3yPvw8_qlNZJYLVSVUXGrn1K6udLts,20413
8
+ autocoder/auto_coder_server.py,sha256=E3Z829TPSooRSNhuh3_x9yaZi0f5G0Lm0ntoZhjGaoQ,20576
9
9
  autocoder/benchmark.py,sha256=Ypomkdzd1T3GE6dRICY3Hj547dZ6_inqJbBJIp5QMco,4423
10
10
  autocoder/chat_auto_coder.py,sha256=z_Kqd7CAecuNMa77kJn7iko2zTdko-4-o72a58H-_s8,24655
11
11
  autocoder/chat_auto_coder_lang.py,sha256=CjsiJsUaWr-TJBCDDlDNnFpCDTd-itJhd9aid9DKlp8,20542
12
12
  autocoder/command_args.py,sha256=9aYJ-AmPxP1sQh6ciw04FWHjSn31f2W9afXFwo8wgx4,30441
13
13
  autocoder/lang.py,sha256=U6AjVV8Rs1uLyjFCZ8sT6WWuNUxMBqkXXIOs4S120uk,14511
14
14
  autocoder/models.py,sha256=AyoZ-Pzy0oyYUmWCxOIRiOImsqboSfRET7LO9-UOuxI,11172
15
- autocoder/version.py,sha256=xRk7_FB9HaCsWF0se7FARGQ80UT42_XGKM6sHbcSNCc,23
15
+ autocoder/version.py,sha256=uJLvEc9fkxd409iL_wj7Xexi0uD8yIeEHv4m5yx5T6E,23
16
16
  autocoder/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  autocoder/agent/auto_demand_organizer.py,sha256=NWSAEsEk94vT3lGjfo25kKLMwYdPcpy9e-i21txPasQ,6942
18
18
  autocoder/agent/auto_filegroup.py,sha256=CW7bqp0FW1GIEMnl-blyAc2UGT7O9Mom0q66ITz1ckM,6635
@@ -31,7 +31,7 @@ autocoder/commands/auto_web.py,sha256=_449f4rCoRG7Sv0SB0hIBRFLPLPJ5DgWW4DlI22a3X
31
31
  autocoder/commands/tools.py,sha256=lanjoBGR6H8HDJSY3KrM6ibrtHZbgKX6mKJHSSE66dg,20493
32
32
  autocoder/common/JupyterClient.py,sha256=O-wi6pXeAEYhAY24kDa0BINrLYvKS6rKyWe98pDClS0,2816
33
33
  autocoder/common/ShellClient.py,sha256=fM1q8t_XMSbLBl2zkCNC2J9xuyKN3eXzGm6hHhqL2WY,2286
34
- autocoder/common/__init__.py,sha256=zvNkIt388i0afI-Mt6ekwl406mllvxPgBEyzLYWh0vk,13265
34
+ autocoder/common/__init__.py,sha256=LE-HHb_HwpWwRKwbwDlTqRzMV6nTzqF3RB55zVMDS3c,13656
35
35
  autocoder/common/anything2images.py,sha256=0ILBbWzY02M-CiWB-vzuomb_J1hVdxRcenAfIrAXq9M,25283
36
36
  autocoder/common/anything2img.py,sha256=iZQmg8srXlD7N5uGl5b_ONKJMBjYoW8kPmokkG6ISF0,10118
37
37
  autocoder/common/audio.py,sha256=Kn9nWKQddWnUrAz0a_ZUgjcu4VUU_IcZBigT7n3N3qc,7439
@@ -66,7 +66,7 @@ autocoder/common/image_to_page.py,sha256=yWiTJQ49Lm3j0FngiJhQ9u7qayqE_bOGb8Rk0Tm
66
66
  autocoder/common/index_import_export.py,sha256=h758AYY1df6JMTKUXYmMkSgxItfymDt82XT7O-ygEuw,4565
67
67
  autocoder/common/interpreter.py,sha256=62-dIakOunYB4yjmX8SHC0Gdy2h8NtxdgbpdqRZJ5vk,2833
68
68
  autocoder/common/llm_rerank.py,sha256=FbvtCzaR661Mt2wn0qsuiEL1Y3puD6jeIJS4zg_e7Bs,3260
69
- autocoder/common/mcp_hub.py,sha256=RPp7bnW6ij2EmBJMg2a5TN3U9G4oX_gH_vQKsIg7t40,14934
69
+ autocoder/common/mcp_hub.py,sha256=ymy580rkv8kFx2zwQFpMg03s9K8KWsJP3dkfjoYbWSU,16573
70
70
  autocoder/common/mcp_server.py,sha256=gKaQDQWeRZgHtR9UnuxHVgVbo0acrT9qA1kwtgDpHZU,16551
71
71
  autocoder/common/mcp_tools.py,sha256=KsLvRrB6pvmebqd-lDaSH6IBJR0AIxWRE-dtCEG_w9k,12485
72
72
  autocoder/common/memory_manager.py,sha256=2ZjYG7BPyvbYalZBF6AM_G5e10Qkw_zrqtD4Zd7GSsQ,3663
@@ -84,6 +84,7 @@ autocoder/common/text.py,sha256=KGRQq314GHBmY4MWG8ossRoQi1_DTotvhxchpn78c-k,1003
84
84
  autocoder/common/types.py,sha256=PXTETrsTvhLE49jqAeUKGySvxBN9pjeyCgRHLDYdd9U,664
85
85
  autocoder/common/utils_code_auto_generate.py,sha256=oiBjdCgdcQErfhMozFdHxkU84WmDo2euBA86yezha-g,3597
86
86
  autocoder/common/mcp_servers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
+ autocoder/common/mcp_servers/mcp_server_gpt4o_mini_search.py,sha256=z-c3zq0YT7wK2XK2t-tDxdXFTUtCFDfvyGTYaYwRgtM,5661
87
88
  autocoder/common/mcp_servers/mcp_server_perplexity.py,sha256=IXTyMpd1CQcBLzVinA-_OIOHoNmbzvuW6pXIadaKHJE,5533
88
89
  autocoder/data/byzerllm.md,sha256=SGCMpEaUQ0ysPxQsgzyyp5sgvEr8dZsxEGAfVcPBIq0,47741
89
90
  autocoder/data/tokenizer.json,sha256=7Lb5_DaYlDRvBRH0B0ynXO5c1fOwbQLxujX805-OEh0,7847602
@@ -115,13 +116,13 @@ autocoder/privacy/__init__.py,sha256=LnIVvGu_K66zCE-yhN_-dPO8R80pQyedCsXJ7wRqQaI
115
116
  autocoder/privacy/model_filter.py,sha256=-N9ZvxxDKpxU7hkn-tKv-QHyXjvkCopUaKgvJwTOGQs,3369
116
117
  autocoder/pyproject/__init__.py,sha256=ms-A_pocgGv0oZPEW8JAdXi7G-VSVhkQ6CnWFe535Ec,14477
117
118
  autocoder/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
- autocoder/rag/api_server.py,sha256=6daDd5tF_Z69cogl-nz-8ogvtKn-BNUdnnpXXxFK0uo,9419
119
+ autocoder/rag/api_server.py,sha256=gsk450_B-qGtBwJ1niG9-QFJAG0RGr2s2KdiMrzzbyQ,9582
119
120
  autocoder/rag/conversation_to_queries.py,sha256=xwmErn4WbdADnhK1me-h_6fV3KYrl_y1qPNQl1aoI6o,4810
120
121
  autocoder/rag/doc_filter.py,sha256=UduVO2mlrngwJICrefjDJTYfdmQ4GcRXrfWDQ7xXksk,14206
121
- autocoder/rag/document_retriever.py,sha256=MGn6oIPo49BbRC99xmLMFkZrpHfcDfKoGYqWxXF554U,8051
122
+ autocoder/rag/document_retriever.py,sha256=5BDqKVJqLPScEnua5S5suXhWuCaALIfPf5obXeJoWfs,8461
122
123
  autocoder/rag/lang.py,sha256=_jmUtxZDG1fmF4b2mhMJbYS1YQDb2ZE8nyAn5_vrvjA,3350
123
124
  autocoder/rag/llm_wrapper.py,sha256=Ht5GF5yJtrztoliujsZzx_ooWZmHkd5xLZKcGEiicZw,4303
124
- autocoder/rag/long_context_rag.py,sha256=THQakGbrr-kOn8Mu4PdJDMiiPq02FNZxZZUM8Du2YCw,41848
125
+ autocoder/rag/long_context_rag.py,sha256=6rqq0pvYe9N4TvyLwd2OB21ZUrPC4FfxZuks0weAz4A,41935
125
126
  autocoder/rag/qa_conversation_strategy.py,sha256=_BFdgit2KkUkW_82jE67QLYS_d8BsGhU1pG73YhHJgE,5744
126
127
  autocoder/rag/rag_config.py,sha256=8LwFcTd8OJWWwi1_WY4IzjqgtT6RyE2j4PjxS5cCTDE,802
127
128
  autocoder/rag/rag_entry.py,sha256=6TKtErZ0Us9XSV6HgRKXA6yR3SiZGPHpynOKSaR1wgE,2463
@@ -143,6 +144,7 @@ autocoder/rag/cache/byzer_storage_cache.py,sha256=okmNUDRCDv81JOfBuTspmTxf8ltYmY
143
144
  autocoder/rag/cache/cache_result_merge.py,sha256=VnTdbT2OMBmWl_83bqds97d9_M33IhPNX8tF7KH2GMM,10556
144
145
  autocoder/rag/cache/file_monitor_cache.py,sha256=OdSXTH3vo6inAzkN5d55I0RN03GUlSlnUEKmXpjFl78,9443
145
146
  autocoder/rag/cache/local_byzer_storage_cache.py,sha256=7_6zCRY3BiCM0ec3U96i1G4l2SzmAedaTfkArNkMfQU,31925
147
+ autocoder/rag/cache/local_duckdb_storage_cache.py,sha256=4lnxjwluKqVWWfmd4giJ81O6bZlqEhesHfHGgEFx55I,25128
146
148
  autocoder/rag/cache/rag_file_meta.py,sha256=RQ3n4wfkHlB-1ljS3sFSi8ijbsUPeIqBSgjmmbRuwRI,20521
147
149
  autocoder/rag/cache/simple_cache.py,sha256=j9dxhei-Nwq9FJrrGOWhaDIDSb_Iz6JSojT1pelS9k4,13084
148
150
  autocoder/rag/loaders/__init__.py,sha256=EQHEZ5Cmz-mGP2SllUTvcIbYCnF7W149dNpNItfs0yE,304
@@ -181,9 +183,9 @@ autocoder/utils/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
181
183
  autocoder/utils/auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
182
184
  autocoder/utils/auto_coder_utils/chat_stream_out.py,sha256=lkJ_A-sYU36JMzjFWkk3pR6uos8oZHYt9GPsPe_CPAo,11766
183
185
  autocoder/utils/chat_auto_coder_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
- auto_coder-0.1.291.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
185
- auto_coder-0.1.291.dist-info/METADATA,sha256=P_4fuBpcjGDCIfmcSYtOHKqqvP2hUmphsz0BddB0r0w,2643
186
- auto_coder-0.1.291.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
187
- auto_coder-0.1.291.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
188
- auto_coder-0.1.291.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
189
- auto_coder-0.1.291.dist-info/RECORD,,
186
+ auto_coder-0.1.293.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
187
+ auto_coder-0.1.293.dist-info/METADATA,sha256=wM3jXJXkDW9JVdM0Oy1EVukWKunNk2NKrPjRn658wK4,2665
188
+ auto_coder-0.1.293.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
189
+ auto_coder-0.1.293.dist-info/entry_points.txt,sha256=0nzHtHH4pNcM7xq4EBA2toS28Qelrvcbrr59GqD_0Ak,350
190
+ auto_coder-0.1.293.dist-info/top_level.txt,sha256=Jqc0_uJSw2GwoFQAa9iJxYns-2mWla-9ok_Y3Gcznjk,10
191
+ auto_coder-0.1.293.dist-info/RECORD,,
@@ -179,6 +179,14 @@ def main(input_args: Optional[List[str]] = None):
179
179
  build_index_parser = subparsers.add_parser(
180
180
  "build_hybrid_index", help="Build hybrid index for RAG"
181
181
  )
182
+
183
+ build_index_parser.add_argument(
184
+ "--rag_storage_type",
185
+ type=str,
186
+ default="duckdb",
187
+ help="The storage type of the RAG, duckdb or byzer-storage",
188
+ )
189
+
182
190
  build_index_parser.add_argument(
183
191
  "--quick", action="store_true", help="Skip system initialization"
184
192
  )
@@ -329,6 +337,14 @@ def main(input_args: Optional[List[str]] = None):
329
337
  action="store_true",
330
338
  help="Enable hybrid index",
331
339
  )
340
+
341
+ serve_parser.add_argument(
342
+ "--rag_storage_type",
343
+ type=str,
344
+ default="duckdb",
345
+ help="The storage type of the RAG, duckdb or byzer-storage",
346
+ )
347
+
332
348
  serve_parser.add_argument(
333
349
  "--hybrid_index_max_output_tokens",
334
350
  type=int,
@@ -34,6 +34,14 @@ import sys
34
34
  import io
35
35
  from autocoder.utils.log_capture import LogCapture
36
36
 
37
+ # If support dotenv, use it
38
+ if os.path.exists(".env"):
39
+ try:
40
+ from dotenv import load_dotenv
41
+ load_dotenv()
42
+ except ImportError:
43
+ pass
44
+
37
45
  def convert_yaml_config_to_str(yaml_config):
38
46
  yaml_content = yaml.safe_dump(
39
47
  yaml_config,
@@ -292,10 +292,14 @@ class AutoCoderArgs(pydantic.BaseModel):
292
292
 
293
293
  rag_url: Optional[str] = ""
294
294
  rag_token: Optional[str] = ""
295
- rag_type: Optional[str] = "storage"
296
- rag_params_max_tokens: Optional[int] = 4096
297
- rag_doc_filter_relevance: Optional[int] = 5
298
- rag_context_window_limit: Optional[int] = 120000
295
+ rag_type: Optional[str] = "storage"
296
+ rag_storage_type: Optional[str] = "duckdb" # 向量化存储类型 byzer-storage | duckdb
297
+ rag_params_max_tokens: Optional[int] = 500000
298
+ rag_doc_filter_relevance: Optional[int] = 2
299
+ rag_context_window_limit: Optional[int] = 120000
300
+ rag_duckdb_vector_dim: Optional[int] = 1024 # DuckDB 向量化存储的维度
301
+ rag_duckdb_query_similarity: Optional[float] = 0.1 # DuckDB 向量化检索 相似度 阈值
302
+ rag_duckdb_query_top_k: Optional[int] = 10000 # DuckDB 向量化检索 返回 TopK个结果(且大于相似度)
299
303
  # rag 本地图床地址
300
304
  local_image_host: Optional[str] = ""
301
305
  rag_recall_max_queries: Optional[int] = 5
@@ -2,8 +2,12 @@ import os
2
2
  import json
3
3
  import asyncio
4
4
  import aiohttp
5
+ import importlib
6
+ import pkgutil
7
+ import re
8
+ import inspect
5
9
  from datetime import datetime, timedelta
6
- from typing import Dict, List, Optional, Any, Set, Optional
10
+ from typing import Dict, List, Optional, Any, Set, Optional, Tuple
7
11
  from pathlib import Path
8
12
  from pydantic import BaseModel, Field
9
13
 
@@ -62,23 +66,53 @@ class McpConnection:
62
66
  self.session = session
63
67
 
64
68
 
65
- MCP_PERPLEXITY_SERVER = '''
66
- {
67
- "perplexity": {
68
- "command": "python",
69
- "args": [
70
- "-m", "autocoder.common.mcp_servers.mcp_server_perplexity"
71
- ],
72
- "env": {
73
- "PERPLEXITY_API_KEY": "{{PERPLEXITY_API_KEY}}"
74
- }
75
- }
76
- }
77
- '''
78
-
79
- MCP_BUILD_IN_SERVERS = {
80
- "perplexity": json.loads(MCP_PERPLEXITY_SERVER)["perplexity"]
81
- }
69
def _generate_server_configs() -> Tuple[Dict[str, Any], Dict[str, str]]:
    """Discover bundled MCP server modules and build their launch configs.

    Scans the ``autocoder.common.mcp_servers`` package for modules named
    ``mcp_server_*`` and derives, for each one, a configuration that launches
    the module via ``python -m``.

    Returns:
        Tuple of (built-in servers dict, JSON templates dict), both keyed by
        the server's friendly name (module name minus the prefix).
    """
    discovered: Dict[str, Any] = {}
    json_templates: Dict[str, str] = {}
    prefix = "mcp_server_"

    try:
        pkg = importlib.import_module("autocoder.common.mcp_servers")
        for _, module_name, _ in pkgutil.iter_modules(pkg.__path__, pkg.__name__ + "."):
            short_name = module_name.split(".")[-1]
            if not short_name.startswith(prefix):
                continue

            # Strip the "mcp_server_" prefix to get the user-facing name.
            friendly = short_name[len(prefix):]

            entry = {
                "command": "python",
                "args": ["-m", module_name],
                # Env placeholders are filled in later by the caller as needed.
                "env": {},
            }

            discovered[friendly] = entry
            json_templates[friendly] = json.dumps({friendly: entry}, indent=4)
            logger.info(f"Detected MCP server: {friendly}")

    except Exception as e:
        # Discovery is best-effort; a broken package must not kill the hub.
        logger.error(f"Error generating server configs: {e}")

    return discovered, json_templates


# Automatically generate server configurations
MCP_BUILD_IN_SERVERS, MCP_SERVER_TEMPLATES = _generate_server_configs()
82
116
 
83
117
 
84
118
  class McpHub:
@@ -422,3 +456,10 @@ class McpHub:
422
456
  """
423
457
  for name in list(self.connections.keys()):
424
458
  await self.delete_connection(name)
459
+
460
+ @classmethod
461
+ def get_server_templates(cls) -> Dict[str, str]:
462
+ """
463
+ Get all available server templates as JSON strings
464
+ """
465
+ return MCP_SERVER_TEMPLATES
@@ -0,0 +1,153 @@
1
from os import getenv
from textwrap import dedent
import sys

import mcp.server.stdio
import mcp.types as types
from mcp.server import NotificationOptions, Server
from mcp.server.models import InitializationOptions
import json
from openai import OpenAI

# The server cannot operate without OpenAI credentials, so fail fast.
OPENAI_API_KEY = getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print(
        "Error: OPENAI_API_KEY environment variable is not set. Please set it before running this server.",
        file=sys.stderr,
    )
    sys.exit(1)

# Allow pointing at any OpenAI-compatible gateway; default to the public API.
OPENAI_API_BASE_URL = getenv(
    "OPENAI_API_BASE_URL", "https://api.openai.com/v1")

server = Server("mcp-server-gpt4o-mini-search")

client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_API_BASE_URL)
27
+
28
+
29
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """Advertise the single search tool this server exposes."""
    # JSON schema for the tool arguments: an optional system-message override
    # plus the conversation turns (the system role is handled separately).
    input_schema = {
        "type": "object",
        "properties": {
            "system_message": {
                "type": "string",
                "description": "Optional custom system message. If not provided, a default search-optimized system message will be used.",
            },
            "messages": {
                "type": "array",
                "description": "A list of messages comprising the conversation so far (excluding system message which is handled separately).",
                "items": {
                    "type": "object",
                    "properties": {
                        "content": {
                            "type": "string",
                            "description": "The contents of the message in this turn of conversation.",
                        },
                        "role": {
                            "type": "string",
                            "description": "The role of the speaker in this turn of conversation.",
                            "enum": ["user", "assistant"],
                        },
                    },
                    "required": ["content", "role"],
                },
            },
        },
        "required": ["messages"],
    }

    description = dedent(
        """
        GPT-4o mini with search enables agents to gather information from the internet
        in real-time, providing up-to-date answers with source citations.
        This tool is ideal for fact-checking, research, and accessing current information
        that might not be in the model's training data.

        The search-enhanced responses include relevant web sources to support the information
        provided, making it useful for obtaining verified and recent information.

        [Response structure]
        - id: A unique identifier for the response
        - model: The model used (gpt-4o-mini-search-preview)
        - object: The object type ("chat.completion")
        - created: The Unix timestamp when the completion was created
        - choices[]: The list of completion choices generated
        - usage: Usage statistics for the completion request
        """
    )

    return [
        types.Tool(
            name="gpt4o_mini_search",
            description=description,
            inputSchema=input_schema,
        )
    ]
84
+
85
+
86
@server.call_tool()
async def handle_call_tool(
    name: str, arguments: dict
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """Forward the conversation to gpt-4o-mini-search-preview and return the
    answer as a single text content item.

    Args:
        name: Tool name requested by the client; must be "gpt4o_mini_search".
        arguments: Tool arguments; "messages" (required) and an optional
            "system_message" override.

    Raises:
        ValueError: If an unknown tool name is requested.
        RuntimeError: If the upstream API call fails (original error chained).
    """
    if name != "gpt4o_mini_search":
        raise ValueError(f"Unknown tool: {name}")

    # Conversation turns supplied by the caller (user/assistant roles only).
    user_messages = arguments.get("messages", [])

    # Default search-optimized system prompt (runtime string kept verbatim).
    default_system_message = (
        "你是专业搜索助手,需要:\n"
        "1. 提供基于用户查询的清晰格式化信息\n"
        "2. 使用[标题](URL)格式嵌入链接\n"
        "3. 每条信息后附上来源\n"
        "4. 用'---'分隔不同结果\n"
        "5. 直接在文本中引用,不使用编号引用\n"
        "6. 确保提供完整URL"
    )

    # Caller-supplied system message wins over the default.
    system_message = arguments.get("system_message", default_system_message)

    # The system message always leads the conversation sent upstream.
    full_messages = [{"role": "system", "content": system_message}]
    full_messages.extend(user_messages)

    try:
        # Make the API call using the module-level OpenAI client.
        completion = client.chat.completions.create(
            model="gpt-4o-mini-search-preview",
            messages=full_messages,
        )
        content = completion.choices[0].message.content
    except Exception as e:
        # Chain the original exception so callers can see the root cause
        # (the previous version re-raised without "from e", losing it).
        raise RuntimeError(f"API error: {str(e)}") from e

    return [types.TextContent(
        type="text",
        text=content,
    )]
133
+
134
+
135
async def main():
    """Serve MCP requests over stdio until the stream closes."""
    async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
        init_options = InitializationOptions(
            server_name="mcp-server-gpt4o-mini-search",
            server_version="0.1.0",
            capabilities=server.get_capabilities(
                notification_options=NotificationOptions(tools_changed=True),
                experimental_capabilities={},
            ),
        )
        await server.run(read_stream, write_stream, init_options)


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
@@ -31,6 +31,14 @@ from byzerllm.utils.client.entrypoints.openai.protocol import (
31
31
  from pydantic import BaseModel
32
32
  from typing import List,Optional
33
33
 
34
+ # If support dotenv, use it
35
+ if os.path.exists(".env"):
36
+ try:
37
+ from dotenv import load_dotenv
38
+ load_dotenv()
39
+ except ImportError:
40
+ pass
41
+
34
42
  logger = init_logger(__name__)
35
43
 
36
44
  llm_client: ByzerLLM = None
@@ -0,0 +1,647 @@
1
+ import hashlib
2
+ import json
3
+ import os
4
+ import time
5
+ import platform
6
+ import threading
7
+ from multiprocessing import Pool
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from typing import List, Dict, Any, Optional, Tuple, Union
10
+ import numpy as np
11
+ from loguru import logger
12
+
13
+ try:
14
+ import duckdb
15
+ except ImportError:
16
+ logger.error("DuckDB is not installed, please install it using 'pip install duckdb'")
17
+ raise
18
+
19
+ from autocoder.common import AutoCoderArgs
20
+ from autocoder.common import SourceCode
21
+ from autocoder.rag.cache.base_cache import (
22
+ BaseCacheManager,
23
+ DeleteEvent,
24
+ AddOrUpdateEvent,
25
+ FileInfo,
26
+ CacheItem
27
+ )
28
+ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
29
+ from autocoder.rag.variable_holder import VariableHolder
30
+ from byzerllm import SimpleByzerLLM, ByzerLLM
31
+
32
+ if platform.system() != "Windows":
33
+ import fcntl
34
+ else:
35
+ fcntl = None
36
+
37
+
38
+ default_ignore_dirs = [
39
+ "__pycache__",
40
+ "node_modules",
41
+ "_images"
42
+ ]
43
+
44
+
45
def generate_file_md5(file_path: str) -> str:
    """Return the hex MD5 digest of the file at *file_path*.

    The file is read in 4 KiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while chunk := stream.read(4096):
            digest.update(chunk)
    return digest.hexdigest()
51
+
52
+
53
class DuckDBLocalContext:
    """Context manager yielding a DuckDB connection to an on-disk database.

    On entry the database is opened and the json, fts and vss extensions are
    installed and loaded; on exit the connection is closed.
    """

    def __init__(self, database_path: str):
        self.database_path = database_path
        self._conn = None

    def _install_load_extension(self, ext_list):
        # Extensions must be installed before they can be loaded.
        for extension in ext_list:
            self._conn.install_extension(extension)
            self._conn.load_extension(extension)

    def __enter__(self) -> "duckdb.DuckDBPyConnection":
        parent_dir = os.path.dirname(self.database_path)
        if not os.path.exists(parent_dir):
            raise ValueError(
                f"Directory {parent_dir} does not exist."
            )

        self._conn = duckdb.connect(self.database_path)
        self._install_load_extension(["json", "fts", "vss"])
        return self._conn

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        if self._conn:
            self._conn.close()
77
+
78
+
79
class LocalDuckdbStorage:
    """DuckDB-backed vector store for RAG document chunks.

    Chunks live in one table (_id, file_path, content, raw_content, vector,
    mtime). Embeddings come from the provided *llm* and are compared with
    cosine similarity at query time.

    For ":memory:" databases a single long-lived connection is kept; for
    on-disk databases a fresh connection is opened per operation via
    DuckDBLocalContext.
    """

    def __init__(
        self, llm: Union[ByzerLLM, SimpleByzerLLM] = None, database_name: str = ":memory:", table_name: str = "documents",
        embed_dim: Optional[int] = None, persist_dir: str = "./storage"
    ) -> None:
        self.llm = llm
        self.database_name = database_name
        self.table_name = table_name
        self.embed_dim = embed_dim
        self.persist_dir = persist_dir
        self.cache_dir = os.path.join(self.persist_dir, '.cache')
        logger.info("正在启动 DuckDBVectorStore.")

        if self.database_name == ":memory:":
            # In-memory store: keep one persistent connection.
            self._conn = duckdb.connect(self.database_name)
            self._install_load_extension(["json", "fts", "vss"])
            self._initialize()
        else:
            self.database_path = os.path.join(self.cache_dir, self.database_name)
            # Create the schema only on first use (database file absent).
            if not os.path.exists(self.database_path):
                os.makedirs(self.cache_dir, exist_ok=True)
                self._initialize()
            self._conn = None
        logger.info(f"DuckDBVectorStore 初始化完成, 存储目录: {self.cache_dir}, "
                    f"数据库名称: {self.database_name}, 数据表名称: {self.table_name}")

    @classmethod
    def class_name(cls) -> str:
        return "DuckDBVectorStore"

    @property
    def client(self) -> Any:
        """Return the underlying connection (None in on-disk mode)."""
        return self._conn

    def _install_load_extension(self, ext_list):
        # Extensions must be installed before they can be loaded.
        for ext in ext_list:
            self._conn.install_extension(ext)
            self._conn.load_extension(ext)

    @staticmethod
    def _apply_pca(embedding, target_dim):
        """Randomly project *embedding* down to *target_dim* dimensions.

        NOTE(review): this seeds NumPy's *global* RNG on every call so the
        projection matrix is reproducible; any other code relying on
        np.random is reseeded as a side effect. Kept as-is so vectors
        already stored remain comparable.
        """
        np.random.seed(42)  # fixed seed keeps the projection stable across calls
        source_dim = len(embedding)
        projection_matrix = np.random.randn(source_dim, target_dim) / np.sqrt(source_dim)
        return np.dot(embedding, projection_matrix)

    def _embedding(self, context: str, norm: bool = True, dim: int | None = None) -> List[float]:
        """Embed *context* with the configured LLM; optionally project to
        *dim* dimensions and L2-normalize the result."""
        embedding = self.llm.emb_query(context)[0].output

        if dim:
            embedding = self._apply_pca(embedding, target_dim=dim)

        if norm:
            embedding = embedding / np.linalg.norm(embedding)

        return embedding.tolist()

    def _initialize(self) -> None:
        """Create the documents table if it does not exist.

        Bug fix: when embed_dim is known we now declare a fixed-size
        FLOAT[n] column (the vss extension requires a fixed dimension for
        indexing); the previous implementation emitted byte-identical DDL in
        both branches of the embed_dim check.
        """
        vector_type = "FLOAT[]" if self.embed_dim is None else f"FLOAT[{self.embed_dim}]"
        _query = f"""
        CREATE TABLE IF NOT EXISTS {self.table_name} (
            _id VARCHAR,
            file_path VARCHAR,
            content TEXT,
            raw_content TEXT,
            vector {vector_type},
            mtime FLOAT
        );
        """

        if self.database_name == ":memory:":
            self._conn.execute(_query)
        elif self.database_path is not None:
            with DuckDBLocalContext(self.database_path) as _conn:
                _conn.execute(_query)

    def truncate_table(self):
        """Remove all rows from the documents table."""
        _truncate_query = f"""TRUNCATE TABLE {self.table_name};"""
        if self.database_name == ":memory:":
            self._conn.execute(_truncate_query)
        elif self.database_path is not None:
            with DuckDBLocalContext(self.database_path) as _conn:
                _conn.execute(_truncate_query)

    def query_by_path(self, file_path: str):
        """Return the _id rows stored for *file_path* (one per chunk)."""
        _exists_query = f"""SELECT _id FROM {self.table_name} WHERE file_path = ?"""
        query_params = [file_path]
        _final_results = []
        if self.database_name == ":memory:":
            _final_results = self._conn.execute(_exists_query, query_params).fetchall()
        elif self.database_path is not None:
            with DuckDBLocalContext(self.database_path) as _conn:
                _final_results = _conn.execute(_exists_query, query_params).fetchall()
        return _final_results

    def delete_by_ids(self, _ids: List[str]):
        """Delete the rows whose _id is in *_ids*.

        Bug fix: the previous version bound a single comma-joined string to
        ``IN (?)``, which could only ever match a row whose id literally
        equals "a,b,c". We now bind one placeholder per id.
        """
        if not _ids:
            return []
        placeholders = ", ".join(["?"] * len(_ids))
        _delete_query = f"""DELETE FROM {self.table_name} WHERE _id IN ({placeholders});"""
        query_params = list(_ids)
        _final_results = []
        if self.database_name == ":memory:":
            _final_results = self._conn.execute(_delete_query, query_params).fetchall()
        elif self.database_path is not None:
            with DuckDBLocalContext(self.database_path) as _conn:
                _final_results = _conn.execute(_delete_query, query_params).fetchall()
        return _final_results

    def _node_to_table_row(self, context_chunk: Dict[str, str | float], dim: int | None = None) -> Any:
        """Map a chunk dict onto a table row tuple, embedding raw_content."""
        return (
            context_chunk["_id"],
            context_chunk["file_path"],
            context_chunk["content"],
            context_chunk["raw_content"],
            self._embedding(context_chunk["raw_content"], norm=True, dim=dim),
            context_chunk["mtime"],
        )

    def add_doc(self, context_chunk: Dict[str, str | float], dim: int | None = None):
        """Insert one chunk into the store.

        Expected shape of *context_chunk*::

            {
                "_id": f"{doc.module_name}_{chunk_idx}",
                "file_path": file_info.file_path,
                "content": chunk,
                "raw_content": chunk,
                "vector": chunk,
                "mtime": file_info.modify_time,
            }
        """
        if self.database_name == ":memory:":
            _table = self._conn.table(self.table_name)
            _table.insert(self._node_to_table_row(context_chunk, dim=dim))
        elif self.database_path is not None:
            with DuckDBLocalContext(self.database_path) as _conn:
                _table = _conn.table(self.table_name)
                _table.insert(self._node_to_table_row(context_chunk, dim=dim))

    def vector_search(
        self, query: str, similarity_value: float = 0.7, similarity_top_k: int = 10, query_dim: int | None = None
    ):
        """Cosine-similarity search over the stored vectors.

        Returns (_id, file_path, mtime, score) rows with
        score >= *similarity_value*, best first, at most *similarity_top_k*.

        DuckDB list functions for reference:
          - list_cosine_similarity: cosine similarity of two lists
          - list_cosine_distance: cosine distance of two lists
          - list_dot_product: dot product of two equal-length lists
        """
        _db_query = f"""
        SELECT _id, file_path, mtime, score
        FROM (
            SELECT *, list_cosine_similarity(vector, ?) AS score
            FROM {self.table_name}
        ) sq
        WHERE score IS NOT NULL
        AND score >= ?
        ORDER BY score DESC LIMIT ?;
        """
        query_params = [
            self._embedding(query, norm=True, dim=query_dim),
            similarity_value,
            similarity_top_k,
        ]

        _final_results = []
        if self.database_name == ":memory:":
            _final_results = self._conn.execute(_db_query, query_params).fetchall()
        elif self.database_path is not None:
            with DuckDBLocalContext(self.database_path) as _conn:
                _final_results = _conn.execute(_db_query, query_params).fetchall()
        return _final_results
270
+
271
+
272
# Bug fix: this constant was misspelled "efault_ignore_dirs" (missing the
# leading "d"), creating a dead stray global that nothing could reference.
# It duplicates the module-level default_ignore_dirs defined above.
default_ignore_dirs = [
    "__pycache__",
    "node_modules",
    "_images"
]
277
+
278
+
279
+ class LocalDuckDBStorageCache(BaseCacheManager):
280
+ def __init__(
281
+ self,
282
+ path,
283
+ ignore_spec,
284
+ required_exts,
285
+ extra_params: Optional[AutoCoderArgs] = None,
286
+ emb_llm: Union[ByzerLLM, SimpleByzerLLM] = None
287
+ ):
288
+ self.path = path
289
+ self.ignore_spec = ignore_spec
290
+ self.required_exts = required_exts
291
+ self.extra_params = extra_params
292
+
293
+ self.storage = LocalDuckdbStorage(
294
+ llm=emb_llm,
295
+ database_name="byzerai_store_duckdb.db",
296
+ table_name="rag_duckdb",
297
+ persist_dir=self.path
298
+ )
299
+ self.queue = []
300
+ self.chunk_size = 1000
301
+ self.max_output_tokens = extra_params.hybrid_index_max_output_tokens
302
+
303
+ # 设置缓存文件路径
304
+ self.cache_dir = os.path.join(self.path, ".cache")
305
+ self.cache_file = os.path.join(self.cache_dir, "duckdb_storage_speedup.jsonl")
306
+ self.cache: Dict[str, CacheItem] = {}
307
+ # 创建缓存目录
308
+ if not os.path.exists(self.cache_dir):
309
+ os.makedirs(self.cache_dir)
310
+
311
+ self.lock = threading.Lock()
312
+ self.stop_event = threading.Event()
313
+ self.thread = threading.Thread(target=self.process_queue)
314
+ self.thread.daemon = True
315
+ self.thread.start()
316
+
317
+ # 加载缓存
318
+ self.cache = self._load_cache()
319
+
320
+ @staticmethod
321
+ def _chunk_text(text, max_length=1000):
322
+ """Split text into chunks"""
323
+ chunks = []
324
+ current_chunk = []
325
+ current_length = 0
326
+
327
+ for line in text.split("\n"):
328
+ if current_length + len(line) > max_length and current_chunk:
329
+ chunks.append("\n".join(current_chunk))
330
+ current_chunk = []
331
+ current_length = 0
332
+ current_chunk.append(line)
333
+ current_length += len(line)
334
+
335
+ if current_chunk:
336
+ chunks.append("\n".join(current_chunk))
337
+
338
+ return chunks
339
+
340
+ def _load_cache(self) -> Dict[str, CacheItem]:
341
+ """Load cache from file"""
342
+ if os.path.exists(self.cache_file):
343
+ try:
344
+ with open(self.cache_file, "r", encoding="utf-8") as f:
345
+ lines = f.readlines()
346
+ cache = {}
347
+ for line in lines:
348
+ try:
349
+ data = json.loads(line.strip())
350
+ if isinstance(data, dict) and "file_path" in data:
351
+ # 转换为 CacheItem 对象
352
+ cache_item = CacheItem.model_validate(data)
353
+ cache[data["file_path"]] = cache_item
354
+ except json.JSONDecodeError:
355
+ continue
356
+ return cache
357
+ except Exception as e:
358
+ logger.error(f"Error loading cache file: {str(e)}")
359
+ return {}
360
+ return {}
361
+
362
def write_cache(self):
    """Persist ``self.cache`` to ``self.cache_file`` as JSON Lines.

    On platforms where ``fcntl`` is available, an exclusive non-blocking
    lock on a sidecar ``<cache_file>.lock`` guards against concurrent
    writers. Cache writing is best-effort: all failures — including a
    contended lock — are logged, never raised. (Previously the fcntl
    branch let BlockingIOError/IOError propagate and crash the caller,
    unlike the non-fcntl branch.)
    """
    cache_file = self.cache_file

    def _dump_all(f):
        # One JSON object per line (JSONL); serialize the Pydantic models.
        for cache_item in self.cache.values():
            json.dump(cache_item.model_dump(), f, ensure_ascii=False)
            f.write("\n")

    if not fcntl:
        # No file locking available (e.g. Windows): plain best-effort write.
        try:
            with open(cache_file, "w", encoding="utf-8") as f:
                _dump_all(f)
        except IOError as e:
            logger.error(f"Error writing cache file: {str(e)}")
    else:
        lock_file = cache_file + ".lock"
        try:
            with open(lock_file, "w", encoding="utf-8") as lockf:
                try:
                    # Non-blocking exclusive lock: raises BlockingIOError if
                    # another process currently holds it (handled below).
                    fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    with open(cache_file, "w", encoding="utf-8") as f:
                        _dump_all(f)
                finally:
                    # Release the file lock even if the write failed.
                    fcntl.flock(lockf, fcntl.LOCK_UN)
        except (IOError, OSError) as e:
            # Keep the locked path consistent with the non-fcntl path:
            # log and continue rather than crashing the update thread.
            logger.error(f"Error writing cache file: {str(e)}")
@staticmethod
def fileinfo_to_tuple(file_info: FileInfo) -> Tuple[str, str, float, str]:
    """Flatten a FileInfo into ``(file_path, relative_path, modify_time, file_md5)``.

    Used to make file metadata picklable for the multiprocessing pool.
    """
    return (
        file_info.file_path,
        file_info.relative_path,
        file_info.modify_time,
        file_info.file_md5,
    )
def build_cache(self):
    """Build the cache by reading files and storing in DuckDBVectorStore"""
    logger.info(f"Building cache for path: {self.path}")

    # Only (re)process files that are new or whose MD5 changed since the
    # last run; everything else is already in cache and storage.
    files_to_process = []
    for file_info in self.get_all_files():
        if (
            file_info.file_path not in self.cache
            or self.cache[file_info.file_path].md5 != file_info.file_md5
        ):
            files_to_process.append(file_info)

    if not files_to_process:
        return

    from autocoder.rag.token_counter import initialize_tokenizer

    # Parse files in parallel; each worker process initializes its own
    # tokenizer from the shared tokenizer path.
    with Pool(
        processes=os.cpu_count(),
        initializer=initialize_tokenizer,
        initargs=(VariableHolder.TOKENIZER_PATH,),
    ) as pool:
        target_files_to_process = []
        for file_info in files_to_process:
            # Tuples are picklable; FileInfo models may not be.
            target_files_to_process.append(
                self.fileinfo_to_tuple(file_info))
        results = pool.map(process_file_in_multi_process,
                           target_files_to_process)

    items = []
    for file_info, result in zip(files_to_process, results):
        content: List[SourceCode] = result
        # Refresh the in-memory cache entry for this file.
        self.cache[file_info.file_path] = CacheItem(
            file_path=file_info.file_path,
            relative_path=file_info.relative_path,
            content=[c.model_dump() for c in content],
            modify_time=file_info.modify_time,
            md5=file_info.file_md5,
        )

        # Chunk each parsed document and collect the rows for vector storage.
        for doc in content:
            logger.info(f"Processing file: {doc.module_name}")
            chunks = self._chunk_text(doc.source_code, self.chunk_size)
            for chunk_idx, chunk in enumerate(chunks):
                chunk_item = {
                    "_id": f"{doc.module_name}_{chunk_idx}",
                    "file_path": file_info.file_path,
                    "content": chunk,
                    "raw_content": chunk,
                    # NOTE(review): "vector" is "" here but update_storage()
                    # fills it with the chunk text — confirm which form
                    # storage.add_doc actually expects.
                    "vector": "",
                    "mtime": file_info.modify_time,
                }
                items.append(chunk_item)

    # Save to local cache
    logger.info("Saving cache to local file")
    self.write_cache()

    if items:
        # Full rebuild: truncate the storage table, then re-insert chunk by
        # chunk, sleeping anti_quota_limit between inserts to respect
        # embedding-service rate limits.
        logger.info("Clear cache from Byzer DuckDB Storage")
        self.storage.truncate_table()
        logger.info("Save new cache to Byzer DuckDB Storage")

        total_chunks = len(items)
        completed_chunks = 0

        logger.info(f"进度: 已完成 {0}/{total_chunks} 个文本块")

        for _chunk in items:
            try:
                self.storage.add_doc(_chunk, dim=self.extra_params.rag_duckdb_vector_dim)
                completed_chunks += 1
                logger.info(f"进度: 已完成 {completed_chunks}/{total_chunks} 个文本块")
                time.sleep(self.extra_params.anti_quota_limit)
            except Exception as err:
                # Best-effort: a failed chunk is logged and skipped.
                logger.error(f"Error in saving chunk: {str(err)}")
def update_storage(self, file_info: FileInfo, is_delete: bool):
    """Replace (or, when *is_delete*, just remove) one file's chunks in storage.

    Existing chunks for the file are always deleted first; unless the file
    was removed, fresh chunks are rebuilt from the local cache entry and
    re-inserted with a per-chunk anti_quota_limit pause.
    """
    # query_by_path returns id rows shaped like [('_id',), ...]
    results = self.storage.query_by_path(file_info.file_path)
    if results:  # [('_id',)]
        for result in results:
            self.storage.delete_by_ids([result[0]])

    items = []
    if not is_delete:
        # Rehydrate the parsed documents from the already-updated cache entry.
        content = [
            SourceCode.model_validate(doc) for doc in self.cache[file_info.file_path].content
        ]
        modify_time = self.cache[file_info.file_path].modify_time
        for doc in content:
            logger.info(f"正在处理更新文件: {doc.module_name}")
            chunks = self._chunk_text(doc.source_code, self.chunk_size)
            for chunk_idx, chunk in enumerate(chunks):
                chunk_item = {
                    "_id": f"{doc.module_name}_{chunk_idx}",
                    "file_path": file_info.file_path,
                    "content": chunk,
                    "raw_content": chunk,
                    # NOTE(review): "vector" carries the chunk text here but
                    # build_cache() stores "" — confirm which form
                    # storage.add_doc actually expects.
                    "vector": chunk,
                    "mtime": modify_time,
                }
                items.append(chunk_item)
    if items:
        for _chunk in items:
            try:
                self.storage.add_doc(_chunk, dim=self.extra_params.rag_duckdb_vector_dim)
                # Throttle to stay under the embedding-service quota.
                time.sleep(self.extra_params.anti_quota_limit)
            except Exception as err:
                logger.error(f"Error in saving chunk: {str(err)}")
def process_queue(self):
    """Drain pending file events and apply them to the cache and storage.

    DeleteEvent entries drop the cache record and purge the file's chunks
    from storage; AddOrUpdateEvent entries re-parse the file, refresh the
    cache record and rewrite its chunks. The cache file is persisted after
    each drained event.

    NOTE(review): the queue is popped here without holding self.lock while
    trigger_update() appends under the lock — presumably there is a single
    consumer thread; confirm, or protect the pop as well.
    """
    while self.queue:
        file_list = self.queue.pop(0)
        if isinstance(file_list, DeleteEvent):
            for item in file_list.file_paths:
                logger.info(f"{item} is detected to be removed")
                del self.cache[item]
                # Build a stub FileInfo so update_storage can locate the
                # stored chunks by path; other fields are unused on delete.
                file_info = FileInfo(
                    file_path=item, relative_path="", modify_time=0, file_md5="")
                self.update_storage(file_info, is_delete=True)

        elif isinstance(file_list, AddOrUpdateEvent):
            for file_info in file_list.file_infos:
                logger.info(
                    f"{file_info.file_path} is detected to be updated")
                # Re-parse the changed file and refresh its cache entry.
                content = process_file_local(file_info.file_path)
                self.cache[file_info.file_path] = CacheItem(
                    file_path=file_info.file_path,
                    relative_path=file_info.relative_path,
                    content=[c.model_dump() for c in content],
                    modify_time=file_info.modify_time,
                    md5=file_info.file_md5,
                )
                self.update_storage(file_info, is_delete=False)
        self.write_cache()
def trigger_update(self):
    """Scan the tree and enqueue add/update and delete events for changed files."""
    logger.info("检查文件是否有更新.....")
    current_files = set()
    files_to_process = []
    for file_info in self.get_all_files():
        current_files.add(file_info.file_path)
        # A file needs processing if it is new or its content hash changed.
        cached = self.cache.get(file_info.file_path)
        if cached is None or cached.md5 != file_info.file_md5:
            files_to_process.append(file_info)

    # Anything cached but no longer on disk has been deleted.
    deleted_files = set(self.cache.keys()) - current_files
    logger.info(f"待处理的文件: {len(files_to_process)}个")
    logger.info(f"已删除的文件: {len(deleted_files)}个")
    if deleted_files:
        with self.lock:
            self.queue.append(DeleteEvent(file_paths=deleted_files))
    if files_to_process:
        with self.lock:
            self.queue.append(AddOrUpdateEvent(file_infos=files_to_process))
def get_all_files(self) -> List[FileInfo]:
    """Walk ``self.path`` and describe every candidate file as a FileInfo.

    Prunes hidden and well-known junk directories, honors the optional
    .gitignore-style ``ignore_spec`` and the ``required_exts`` whitelist.
    Files that vanish or become unreadable between the walk and the
    stat/MD5 step are skipped with a warning instead of aborting the
    whole scan (previously a mid-scan deletion raised OSError and lost
    the entire result).
    """
    all_files = []
    for root, dirs, files in os.walk(self.path, followlinks=True):
        # Prune hidden and default-ignored directories in place so
        # os.walk never descends into them.
        dirs[:] = [d for d in dirs if not d.startswith(
            ".") and d not in default_ignore_dirs]

        if self.ignore_spec:
            relative_root = os.path.relpath(root, self.path)
            dirs[:] = [
                d
                for d in dirs
                if not self.ignore_spec.match_file(os.path.join(relative_root, d))
            ]
            files = [
                f
                for f in files
                if not self.ignore_spec.match_file(os.path.join(relative_root, f))
            ]

        for file in files:
            # Extension whitelist: skip files not matching any required ext.
            if self.required_exts and not any(
                file.endswith(ext) for ext in self.required_exts
            ):
                continue

            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, self.path)
            try:
                modify_time = os.path.getmtime(file_path)
                file_md5 = generate_file_md5(file_path)
            except OSError as e:
                # File disappeared (or is unreadable) mid-scan; skip it
                # rather than failing the entire walk.
                logger.warning(f"Skip unreadable file {file_path}: {str(e)}")
                continue
            all_files.append(
                FileInfo(
                    file_path=file_path,
                    relative_path=relative_path,
                    modify_time=modify_time,
                    file_md5=file_md5))

    return all_files
def get_cache(self, options: Optional[Dict[str, Any]] = None) -> Dict[str, Dict]:
    """Return cached documents, optionally filtered by a vector-search query.

    Without a "query" option every cached file is returned. With a query,
    files are ranked by vector similarity and accumulated until the
    ``max_output_tokens`` budget would be exceeded.
    """
    self.trigger_update()  # enqueue any pending add/update/delete events

    if options is None or "query" not in options:
        # No query: dump the entire cache.
        return {file_path: self.cache[file_path].model_dump() for file_path in self.cache}

    query = options.get("query", "")
    logger.info(f"正在使用向量搜索检索数据, 你的问题: {query}")
    total_tokens = 0
    results = []

    # Add vector search if enabled
    if options.get("enable_vector_search", True):
        # vector_search rows look like [(_id, file_path, mtime, score), ...]
        # NOTE(review): the query path reads duckdb_query_similarity /
        # duckdb_query_top_k / duckdb_vector_dim while indexing uses
        # rag_duckdb_vector_dim — confirm these extra_params names all
        # exist and agree on the embedding dimension.
        search_results = self.storage.vector_search(
            query,
            similarity_value=self.extra_params.duckdb_query_similarity,
            similarity_top_k=self.extra_params.duckdb_query_top_k,
            query_dim=self.extra_params.duckdb_vector_dim
        )
        results.extend(search_results)

    # Group results by file_path and reconstruct documents while preserving
    # order (best hit first). Ranking could additionally weigh how often a
    # file appears in the hits and at which positions.
    file_paths = []
    seen = set()
    for result in results:
        _id, _file_path, _mtime, _score = result
        if _file_path not in seen:
            seen.add(_file_path)
            file_paths.append(_file_path)

    # Fetch the actual documents from the local cache, stopping once the
    # token budget (max_output_tokens) would be exceeded.
    result = {}
    for file_path in file_paths:
        if file_path in self.cache:
            cached_data = self.cache[file_path]
            for doc in cached_data.content:
                if total_tokens + doc["tokens"] > self.max_output_tokens:
                    logger.info(
                        f"当前检索已超出用户设置 Hybrid Index Max Tokens:{self.max_output_tokens},"
                        f"累计tokens: {total_tokens}, "
                        f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档")
                    return result
                total_tokens += doc["tokens"]
            result[file_path] = cached_data.model_dump()
    logger.info(
        f"用户Hybrid Index Max Tokens设置为:{self.max_output_tokens},"
        f"累计tokens: {total_tokens}, "
        f"经过向量搜索共检索出 {len(result.keys())} 个文档, 共 {len(self.cache.keys())} 个文档")
    return result
@@ -11,6 +11,7 @@ from autocoder.rag.cache.simple_cache import AutoCoderRAGAsyncUpdateQueue
11
11
  from autocoder.rag.cache.file_monitor_cache import AutoCoderRAGDocListener
12
12
  from autocoder.rag.cache.byzer_storage_cache import ByzerStorageCache
13
13
  from autocoder.rag.cache.local_byzer_storage_cache import LocalByzerStorageCache
14
+ from autocoder.rag.cache.local_duckdb_storage_cache import LocalDuckDBStorageCache
14
15
  from autocoder.common import AutoCoderArgs
15
16
 
16
17
  cache_lock = threading.Lock()
@@ -66,11 +67,17 @@ class LocalDocumentRetriever(BaseDocumentRetriever):
66
67
  self.cacher = ByzerStorageCache(
67
68
  path, ignore_spec, required_exts, extra_params
68
69
  )
69
- else:
70
- self.cacher = LocalByzerStorageCache(
71
- path, ignore_spec, required_exts, extra_params,
72
- emb_llm = emb_llm
73
- )
70
+ else:
71
+ if extra_params.rag_storage_type == "duckdb":
72
+ self.cacher = LocalDuckDBStorageCache(
73
+ path, ignore_spec, required_exts, extra_params,
74
+ emb_llm=emb_llm
75
+ )
76
+ elif extra_params.rag_storage_type in ["byzer-storage", "byzer_storage"]:
77
+ self.cacher = LocalByzerStorageCache(
78
+ path, ignore_spec, required_exts, extra_params,
79
+ emb_llm=emb_llm
80
+ )
74
81
  else:
75
82
  if self.monitor_mode:
76
83
  self.cacher = AutoCoderRAGDocListener(
@@ -210,22 +210,22 @@ class LongContextRAG:
210
210
 
211
211
  avg_tokens = statistics.mean(token_counts) if token_counts else 0
212
212
  median_tokens = statistics.median(token_counts) if token_counts else 0
213
-
214
- logger.info(
215
- "RAG Configuration:\n"
216
- f" Total docs: {doc_num}\n"
217
- f" Total tokens: {token_num}\n"
218
- f" Tokenizer path: {self.tokenizer_path}\n"
219
- f" Relevant score: {self.relevant_score}\n"
220
- f" Token limit: {self.token_limit}\n"
221
- f" Full text limit: {self.full_text_limit}\n"
222
- f" Segment limit: {self.segment_limit}\n"
223
- f" Buff limit: {self.buff_limit}\n"
224
- f" Max doc tokens: {max(token_counts) if token_counts else 0}\n"
225
- f" Min doc tokens: {min(token_counts) if token_counts else 0}\n"
226
- f" Avg doc tokens: {avg_tokens:.2f}\n"
227
- f" Median doc tokens: {median_tokens:.2f}\n"
228
- )
213
+ if not self.client:
214
+ logger.info(
215
+ "RAG Configuration:\n"
216
+ f" Total docs: {doc_num}\n"
217
+ f" Total tokens: {token_num}\n"
218
+ f" Tokenizer path: {self.tokenizer_path}\n"
219
+ f" Relevant score: {self.relevant_score}\n"
220
+ f" Token limit: {self.token_limit}\n"
221
+ f" Full text limit: {self.full_text_limit}\n"
222
+ f" Segment limit: {self.segment_limit}\n"
223
+ f" Buff limit: {self.buff_limit}\n"
224
+ f" Max doc tokens: {max(token_counts) if token_counts else 0}\n"
225
+ f" Min doc tokens: {min(token_counts) if token_counts else 0}\n"
226
+ f" Avg doc tokens: {avg_tokens:.2f}\n"
227
+ f" Median doc tokens: {median_tokens:.2f}\n"
228
+ )
229
229
 
230
230
  def count_tokens(self, text: str) -> int:
231
231
  if self.tokenizer is None:
autocoder/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.291"
1
+ __version__ = "0.1.293"