eth-mcp 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eth_mcp-0.2.0.dist-info/METADATA +332 -0
- eth_mcp-0.2.0.dist-info/RECORD +21 -0
- eth_mcp-0.2.0.dist-info/WHEEL +4 -0
- eth_mcp-0.2.0.dist-info/entry_points.txt +3 -0
- ethereum_mcp/__init__.py +3 -0
- ethereum_mcp/cli.py +589 -0
- ethereum_mcp/clients.py +363 -0
- ethereum_mcp/config.py +324 -0
- ethereum_mcp/expert/__init__.py +1 -0
- ethereum_mcp/expert/guidance.py +300 -0
- ethereum_mcp/indexer/__init__.py +8 -0
- ethereum_mcp/indexer/chunker.py +563 -0
- ethereum_mcp/indexer/client_compiler.py +725 -0
- ethereum_mcp/indexer/compiler.py +245 -0
- ethereum_mcp/indexer/downloader.py +521 -0
- ethereum_mcp/indexer/embedder.py +627 -0
- ethereum_mcp/indexer/manifest.py +411 -0
- ethereum_mcp/logging.py +85 -0
- ethereum_mcp/models.py +126 -0
- ethereum_mcp/server.py +555 -0
- ethereum_mcp/tools/__init__.py +1 -0
ethereum_mcp/clients.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""Ethereum client tracking.
|
|
2
|
+
|
|
3
|
+
Tracks execution layer (EL) and consensus layer (CL) client implementations.
|
|
4
|
+
Ethereum has better client diversity than most chains - a key security feature.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
|
|
11
|
+
class EthereumClient:
|
|
12
|
+
"""An Ethereum client implementation."""
|
|
13
|
+
|
|
14
|
+
name: str
|
|
15
|
+
layer: str # "execution", "consensus", "both"
|
|
16
|
+
organization: str
|
|
17
|
+
language: str
|
|
18
|
+
repo: str
|
|
19
|
+
description: str
|
|
20
|
+
mainnet_status: str # "production", "beta", "development"
|
|
21
|
+
stake_percentage: float | None # For CL clients
|
|
22
|
+
node_percentage: float | None # For EL clients
|
|
23
|
+
key_features: list[str]
|
|
24
|
+
notes: list[str]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Execution Layer Clients
|
|
28
|
+
# Registry of execution-layer clients. Shares are hard-coded estimates (the
# Geth entry is marked approximate as of late 2025), not live measurements.
EXECUTION_CLIENTS: list[EthereumClient] = [
    # --- Geth ---
    EthereumClient(
        name="Geth",
        layer="execution",
        organization="Ethereum Foundation",
        language="Go",
        repo="https://github.com/ethereum/go-ethereum",
        description="Original and most widely used Ethereum execution client.",
        mainnet_status="production",
        stake_percentage=None,
        node_percentage=55.0,  # Approximate as of late 2025
        key_features=[
            "Reference implementation",
            "Most battle-tested",
            "Snap sync for fast initial sync",
            "Full and light node modes",
        ],
        notes=[
            "Historically >80% dominance, now improving",
            "Single Geth bug could halt network if >66%",
            "EF actively encouraging diversity",
        ],
    ),
    # --- Reth ---
    EthereumClient(
        name="Reth",
        layer="execution",
        organization="Paradigm",
        language="Rust",
        repo="https://github.com/paradigmxyz/reth",
        description="High-performance Rust implementation. Fast sync, modular architecture.",
        mainnet_status="production",
        stake_percentage=None,
        node_percentage=15.0,  # Growing rapidly
        key_features=[
            "Written in Rust for safety and performance",
            "Modular architecture (reth-* crates)",
            "Fast sync times",
            "Lower memory usage than Geth",
            "Active development by Paradigm",
        ],
        notes=[
            "Fastest growing EL client",
            "Production-ready since 2024",
            "Popular with infrastructure providers",
            "Excellent documentation",
        ],
    ),
    # --- Nethermind ---
    EthereumClient(
        name="Nethermind",
        layer="execution",
        organization="Nethermind",
        language="C#/.NET",
        repo="https://github.com/NethermindEth/nethermind",
        description="Enterprise-grade .NET implementation with extensive plugin system.",
        mainnet_status="production",
        stake_percentage=None,
        node_percentage=18.0,
        key_features=[
            ".NET ecosystem integration",
            "Extensive plugin system",
            "Good for enterprise deployments",
            "MEV-boost compatible",
        ],
        notes=[
            "Second most used EL client",
            "Popular with institutional stakers",
            "Good Windows support",
        ],
    ),
    # --- Besu ---
    EthereumClient(
        name="Besu",
        layer="execution",
        organization="Hyperledger / ConsenSys",
        language="Java",
        repo="https://github.com/hyperledger/besu",
        description="Enterprise Ethereum client, Apache 2.0 licensed.",
        mainnet_status="production",
        stake_percentage=None,
        node_percentage=8.0,
        key_features=[
            "Apache 2.0 license (enterprise-friendly)",
            "Privacy features (Tessera integration)",
            "Permissioning for private networks",
            "GraphQL API",
        ],
        notes=[
            "Popular for enterprise and consortium chains",
            "Hyperledger project governance",
            "Good for private Ethereum networks",
        ],
    ),
    # --- Erigon ---
    EthereumClient(
        name="Erigon",
        layer="execution",
        organization="Erigon (formerly Turbo-Geth)",
        language="Go",
        repo="https://github.com/ledgerwatch/erigon",
        description="Efficiency-focused client optimized for archive nodes.",
        mainnet_status="production",
        stake_percentage=None,
        node_percentage=4.0,
        key_features=[
            "Extremely efficient disk usage",
            "Best for archive nodes",
            "Staged sync architecture",
            "Lower hardware requirements for full history",
        ],
        notes=[
            "Forked from Geth, heavily modified",
            "Preferred for archive node operators",
            "Different database structure (MDBX)",
        ],
    ),
]
|
|
142
|
+
|
|
143
|
+
# Consensus Layer Clients
|
|
144
|
+
# Registry of consensus-layer clients. Stake shares are hard-coded estimates
# that mirror the "~N% of stake" text in each entry's notes.
CONSENSUS_CLIENTS: list[EthereumClient] = [
    # --- Prysm ---
    EthereumClient(
        name="Prysm",
        layer="consensus",
        organization="Prysmatic Labs (Offchain Labs)",
        language="Go",
        repo="https://github.com/prysmaticlabs/prysm",
        description="Most popular consensus client, originally the first to mainnet.",
        mainnet_status="production",
        stake_percentage=35.0,
        node_percentage=None,
        key_features=[
            "First production CL client",
            "Slasher included",
            "Web UI for monitoring",
            "gRPC and REST APIs",
        ],
        notes=[
            "~35% of stake (down from >60% historically)",
            "Now part of Offchain Labs (Arbitrum)",
            "Diversity improving",
        ],
    ),
    # --- Lighthouse ---
    EthereumClient(
        name="Lighthouse",
        layer="consensus",
        organization="Sigma Prime",
        language="Rust",
        repo="https://github.com/sigp/lighthouse",
        description="Rust implementation focused on security and performance.",
        mainnet_status="production",
        stake_percentage=33.0,
        node_percentage=None,
        key_features=[
            "Rust safety guarantees",
            "Security audit focused",
            "Efficient memory usage",
            "Good documentation",
        ],
        notes=[
            "~33% of stake - excellent diversity",
            "Security-focused development",
            "Popular with solo stakers",
        ],
    ),
    # --- Teku ---
    EthereumClient(
        name="Teku",
        layer="consensus",
        organization="ConsenSys",
        language="Java",
        repo="https://github.com/ConsenSys/teku",
        description="Enterprise-grade Java implementation, pairs well with Besu.",
        mainnet_status="production",
        stake_percentage=18.0,
        node_percentage=None,
        key_features=[
            "Java enterprise ecosystem",
            "Pairs naturally with Besu",
            "REST API focused",
            "Good institutional support",
        ],
        notes=[
            "~18% of stake",
            "ConsenSys enterprise offering",
            "Popular with institutions",
        ],
    ),
    # --- Nimbus ---
    EthereumClient(
        name="Nimbus",
        layer="consensus",
        organization="Status",
        language="Nim",
        repo="https://github.com/status-im/nimbus-eth2",
        description="Lightweight client designed for resource-constrained devices.",
        mainnet_status="production",
        stake_percentage=10.0,
        node_percentage=None,
        key_features=[
            "Extremely lightweight",
            "Can run on Raspberry Pi",
            "Low memory footprint",
            "Nim language (compiles to C)",
        ],
        notes=[
            "~10% of stake",
            "Best for home stakers with limited hardware",
            "Also developing EL client (nimbus-eth1)",
        ],
    ),
    # --- Lodestar ---
    EthereumClient(
        name="Lodestar",
        layer="consensus",
        organization="ChainSafe",
        language="TypeScript",
        repo="https://github.com/ChainSafe/lodestar",
        description="TypeScript implementation, great for JS/TS developers.",
        mainnet_status="production",
        stake_percentage=4.0,
        node_percentage=None,
        key_features=[
            "TypeScript/JavaScript ecosystem",
            "Good for web3 developers",
            "Light client focus",
            "Browser-compatible components",
        ],
        notes=[
            "~4% of stake",
            "Youngest production CL client",
            "Growing adoption",
        ],
    ),
]
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def list_execution_clients() -> list[EthereumClient]:
    """Return the known execution layer clients.

    Returns:
        A new list holding the entries of EXECUTION_CLIENTS in order. A
        shallow copy is returned so that callers appending to or sorting the
        result cannot alter the module-level registry.
    """
    return list(EXECUTION_CLIENTS)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def list_consensus_clients() -> list[EthereumClient]:
    """Return the known consensus layer clients.

    Returns:
        A new list holding the entries of CONSENSUS_CLIENTS in order. A
        shallow copy is returned so that callers appending to or sorting the
        result cannot alter the module-level registry.
    """
    return list(CONSENSUS_CLIENTS)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def list_all_clients() -> list[EthereumClient]:
    """Return every known client: execution clients first, then consensus.

    The result is a freshly built list on each call.
    """
    return [*EXECUTION_CLIENTS, *CONSENSUS_CLIENTS]
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def get_client(name: str) -> EthereumClient | None:
    """Look up a client by name, ignoring case and surrounding whitespace.

    An exact name match always wins. Otherwise the first client whose name
    contains ``name`` as a substring is returned, searching in the order
    produced by ``list_all_clients()``.

    Args:
        name: Full or partial client name, e.g. "geth" or "light".

    Returns:
        The matching client, or None when nothing matches or ``name`` is
        blank.
    """
    needle = name.strip().lower()
    if not needle:
        # "" is a substring of every string, so without this guard a blank
        # query would silently resolve to the first client in the list.
        return None

    clients = list_all_clients()

    # Pass 1: exact match, so a full name is never shadowed by an earlier
    # client that merely contains it.
    for client in clients:
        if client.name.lower() == needle:
            return client

    # Pass 2: substring match, preserving the original lenient lookup.
    for client in clients:
        if needle in client.name.lower():
            return client

    return None
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def get_client_diversity() -> dict:
    """Return a static, human-readable summary of client diversity.

    The result has five keys: per-layer share descriptions
    (``execution_layer``, ``consensus_layer``), a ``diversity_health``
    verdict per layer, a list of ``recommendations`` and a
    ``supermajority_risk`` explanation. All values are hard-coded text.
    """
    execution_shares = {
        "Geth (Go)": "~55% - Reference implementation, still dominant",
        "Nethermind (C#)": "~18% - Enterprise-focused",
        "Reth (Rust)": "~15% - Fastest growing, Paradigm",
        "Besu (Java)": "~8% - Enterprise/Hyperledger",
        "Erigon (Go)": "~4% - Archive node specialist",
    }
    consensus_shares = {
        "Prysm (Go)": "~35% - Prysmatic Labs/Offchain Labs",
        "Lighthouse (Rust)": "~33% - Sigma Prime",
        "Teku (Java)": "~18% - ConsenSys",
        "Nimbus (Nim)": "~10% - Status, lightweight",
        "Lodestar (TypeScript)": "~4% - ChainSafe",
    }
    # NOTE(review): the consensus verdict says "No client >34%" while the
    # Prysm entry above reads ~35% - confirm which figure is intended.
    health = {
        "consensus_layer": "GOOD - No client >34% (supermajority threshold)",
        "execution_layer": "MODERATE - Geth still >50%, improving",
    }
    advice = [
        "CL diversity is healthy - no client can cause finality failure alone",
        "EL still needs improvement - Geth bug could affect majority",
        "Reth growth is positive for EL diversity",
        "Run minority clients if possible to help the network",
    ]
    risk_text = (
        "If any client >66%, a bug in that client could cause "
        "incorrect finalization, requiring manual intervention to fix"
    )

    return {
        "execution_layer": execution_shares,
        "consensus_layer": consensus_shares,
        "diversity_health": health,
        "recommendations": advice,
        "supermajority_risk": risk_text,
    }
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def get_recommended_pairs() -> list[dict]:
    """Return the suggested execution + consensus client combinations.

    Each element is a dict with a ``pair`` label ("<EL> + <CL>") and a short
    ``notes`` string explaining the pairing.
    """
    pairings = (
        ("Geth + Lighthouse", "Most common, very stable"),
        ("Reth + Lighthouse", "Modern stack, both Rust, good performance"),
        ("Nethermind + Prysm", "Enterprise-friendly combination"),
        ("Besu + Teku", "Both ConsenSys/Java, enterprise pairing"),
        ("Geth + Nimbus", "Good for resource-constrained setups"),
        ("Reth + Nimbus", "Minimal resource usage, modern stack"),
    )
    return [{"pair": label, "notes": remark} for label, remark in pairings]
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
if __name__ == "__main__":
    # Manual smoke run: print a short summary of both client registries and
    # the diversity verdicts.
    print("Ethereum Execution Layer Clients:")
    print("=" * 60)
    for c in EXECUTION_CLIENTS:
        # Compare against None rather than truthiness so that a legitimate
        # 0.0 share is still printed instead of being silently dropped.
        pct = f" ({c.node_percentage}%)" if c.node_percentage is not None else ""
        print(f" {c.name}{pct} - {c.language} - {c.organization}")

    print("\nEthereum Consensus Layer Clients:")
    print("=" * 60)
    for c in CONSENSUS_CLIENTS:
        pct = f" ({c.stake_percentage}%)" if c.stake_percentage is not None else ""
        print(f" {c.name}{pct} - {c.language} - {c.organization}")

    print("\nClient Diversity:")
    print("=" * 60)
    diversity = get_client_diversity()
    print(f" EL: {diversity['diversity_health']['execution_layer']}")
    print(f" CL: {diversity['diversity_health']['consensus_layer']}")
|
ethereum_mcp/config.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
"""Configuration management for ethereum-mcp.
|
|
2
|
+
|
|
3
|
+
Supports loading configuration from:
|
|
4
|
+
1. Default values
|
|
5
|
+
2. Config file (~/.ethereum-mcp/config.yaml)
|
|
6
|
+
3. Environment variables (for secrets like API keys)
|
|
7
|
+
|
|
8
|
+
Configuration precedence: env vars > config file > defaults
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import yaml
|
|
17
|
+
|
|
18
|
+
from .logging import get_logger
|
|
19
|
+
|
|
20
|
+
logger = get_logger("config")
|
|
21
|
+
|
|
22
|
+
# Default values
|
|
23
|
+
# Fallback settings used when neither the config file nor the environment
# supplies a value.
DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
DEFAULT_BATCH_SIZE = 32
DEFAULT_CHUNK_SIZE = 1000
DEFAULT_CHUNK_OVERLAP = 200

# Catalogue of supported embedding models, keyed by model name. Every entry
# carries "dimensions", "max_tokens", "type" ("local" or "api") and a
# "description"; API-backed entries additionally name the environment
# variable that holds their key in "env_var".
EMBEDDING_MODELS = {
    # --- Local models (sentence-transformers) ---
    "all-MiniLM-L6-v2": {
        "dimensions": 384,
        "max_tokens": 256,
        "type": "local",
        "description": "Fast, lightweight fallback model",
    },
    "all-mpnet-base-v2": {
        "dimensions": 768,
        "max_tokens": 384,
        "type": "local",
        "description": "Better quality, moderate speed",
    },
    "codesage/codesage-large": {
        "dimensions": 1024,
        "max_tokens": 1024,
        "type": "local",
        "description": "Code-specialized, recommended for specs",
    },
    # --- API models (require API keys) ---
    "voyage:voyage-code-3": {
        "dimensions": 1024,
        "max_tokens": 16000,
        "type": "api",
        "env_var": "VOYAGE_API_KEY",
        "description": "Best quality, requires API key ($0.06/1M tokens)",
    },
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ConfigError(Exception):
    """Raised when configuration is invalid or cannot be interpreted."""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class EmbeddingConfig:
    """Settings that select the embedding model and its batch size."""

    # Key into EMBEDDING_MODELS; unknown names are tolerated with a warning.
    model: str = DEFAULT_EMBEDDING_MODEL
    # Number of items embedded per batch.
    batch_size: int = DEFAULT_BATCH_SIZE

    @property
    def model_info(self) -> dict[str, Any]:
        """Return the EMBEDDING_MODELS entry for ``model`` ({} if unknown)."""
        return EMBEDDING_MODELS.get(self.model, {})

    @property
    def dimensions(self) -> int:
        """Return the embedding width, falling back to 384 for unknown models."""
        info = self.model_info
        return info.get("dimensions", 384)

    @property
    def requires_api_key(self) -> bool:
        """Return True when the model entry is of type "api"."""
        model_type = self.model_info.get("type")
        return model_type == "api"

    @property
    def api_key_env_var(self) -> str | None:
        """Return the name of the env var holding the API key, if declared."""
        info = self.model_info
        return info.get("env_var")

    def validate(self) -> None:
        """Check the settings for consistency.

        Unknown models and very large batch sizes only produce warnings.

        Raises:
            ConfigError: If an API model's key variable is unset or empty,
                or if ``batch_size`` is below 1.
        """
        is_known = self.model in EMBEDDING_MODELS
        if not is_known:
            logger.warning(
                "Unknown embedding model: %s. Using default settings.", self.model
            )

        # Only API-backed models need a key; local ones skip this check.
        key_var = self.api_key_env_var if self.requires_api_key else None
        if key_var and not os.environ.get(key_var):
            raise ConfigError(
                f"Model {self.model} requires {key_var} environment variable"
            )

        size = self.batch_size
        if size < 1:
            raise ConfigError(f"batch_size must be >= 1, got {size}")
        if size > 256:
            logger.warning(
                "Large batch_size (%d) may cause memory issues", size
            )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass
class ChunkingConfig:
    """Settings controlling how documents are split into chunks."""

    # Target chunk size; validate() accepts 100..10000 inclusive.
    chunk_size: int = DEFAULT_CHUNK_SIZE
    # Overlap between chunks; must be non-negative and smaller than chunk_size.
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP

    def validate(self) -> None:
        """Check that size and overlap are within the supported bounds.

        Raises:
            ConfigError: If ``chunk_size`` is outside 100..10000, or
                ``chunk_overlap`` is negative or not below ``chunk_size``.
        """
        size = self.chunk_size
        overlap = self.chunk_overlap

        if size < 100:
            raise ConfigError(f"chunk_size must be >= 100, got {size}")
        if size > 10000:
            raise ConfigError(f"chunk_size must be <= 10000, got {size}")
        if overlap < 0:
            raise ConfigError(f"chunk_overlap must be >= 0, got {overlap}")
        if not overlap < size:
            raise ConfigError(
                f"chunk_overlap ({overlap}) must be < chunk_size ({size})"
            )

    def to_dict(self) -> dict[str, int]:
        """Return both settings as a plain dict for manifest storage."""
        as_dict = {}
        as_dict["chunk_size"] = self.chunk_size
        as_dict["chunk_overlap"] = self.chunk_overlap
        return as_dict
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@dataclass
class Config:
    """Top-level container bundling the embedding and chunking settings."""

    # default_factory gives every Config its own sub-config instances.
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)

    def validate(self) -> None:
        """Validate the embedding settings, then the chunking settings."""
        for section in (self.embedding, self.chunking):
            section.validate()
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def load_config(config_path: Path | None = None, data_dir: Path | None = None) -> Config:
    """
    Build the effective configuration from file and environment.

    A config file that cannot be loaded is logged as a warning and replaced
    by defaults; environment overrides are applied afterwards and the result
    is validated.

    Args:
        config_path: Explicit path to config file (optional)
        data_dir: Directory searched for ``config.yaml`` when no explicit
            path is given (optional)

    Returns:
        Validated Config object
    """
    # Resolve which file, if any, to read.
    path = config_path
    if path is None and data_dir is not None:
        path = data_dir / "config.yaml"

    result = Config()
    if path and path.exists():
        try:
            result = _load_config_file(path)
            logger.debug("Loaded config from %s", path)
        except Exception as exc:
            # Best effort: an unreadable or malformed file falls back to defaults.
            logger.warning("Failed to load config from %s: %s", path, exc)
            result = Config()

    # Environment variables take precedence over file values.
    result = _apply_env_overrides(result)
    result.validate()
    return result
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _config_section(data: dict[str, Any], key: str) -> dict[str, Any]:
    """Return the mapping stored under ``key``; a missing or null section is empty."""
    section = data.get(key)
    if section is None:
        # A bare "embedding:" line parses to None; treat it like an omitted section.
        return {}
    if not isinstance(section, dict):
        raise ConfigError(f"'{key}' must be a mapping")
    return section


def _int_setting(section: dict[str, Any], section_name: str, key: str, default: int) -> int:
    """Read ``key`` from ``section`` as an int, raising ConfigError on bad values."""
    value = section.get(key)
    if value is None:
        return default
    try:
        return int(value)
    except (TypeError, ValueError) as exc:
        raise ConfigError(
            f"'{section_name}.{key}' must be an integer, got {value!r}"
        ) from exc


def _load_config_file(config_path: Path) -> Config:
    """Load configuration from a YAML file.

    Unknown top-level keys are ignored with a warning. Missing or null
    sections and values fall back to the module defaults.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        A Config built from the file contents (not yet validated).

    Raises:
        ConfigError: If the file exceeds 1MB, is not a YAML mapping, has a
            non-mapping section, or contains a non-integer numeric setting.
    """
    # Security: limit file size
    max_size = 1024 * 1024  # 1MB
    file_size = config_path.stat().st_size
    if file_size > max_size:
        raise ConfigError(f"Config file too large: {file_size} > {max_size}")

    with open(config_path, encoding="utf-8") as f:
        # Use safe_load to prevent code execution
        data = yaml.safe_load(f)

    # An empty file parses to None.
    if data is None:
        return Config()

    if not isinstance(data, dict):
        raise ConfigError("Config file must be a YAML mapping")

    # Validate keys
    allowed_keys = {"embedding", "chunking"}
    unknown_keys = set(data) - allowed_keys
    if unknown_keys:
        logger.warning("Unknown config keys ignored: %s", unknown_keys)

    # Parse embedding config
    embedding_data = _config_section(data, "embedding")
    model = embedding_data.get("model")
    embedding = EmbeddingConfig(
        # A null model must not turn into the literal string "None".
        model=DEFAULT_EMBEDDING_MODEL if model is None else str(model),
        batch_size=_int_setting(embedding_data, "embedding", "batch_size", DEFAULT_BATCH_SIZE),
    )

    # Parse chunking config
    chunking_data = _config_section(data, "chunking")
    chunking = ChunkingConfig(
        chunk_size=_int_setting(chunking_data, "chunking", "chunk_size", DEFAULT_CHUNK_SIZE),
        chunk_overlap=_int_setting(
            chunking_data, "chunking", "chunk_overlap", DEFAULT_CHUNK_OVERLAP
        ),
    )

    return Config(embedding=embedding, chunking=chunking)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _apply_env_overrides(config: Config) -> Config:
    """Overlay environment-variable settings onto ``config``.

    ``ETHEREUM_MCP_EMBEDDING_MODEL`` replaces the embedding model and
    ``ETHEREUM_MCP_BATCH_SIZE`` replaces the batch size; unset or empty
    variables are ignored. ``config`` is modified in place and returned.
    """
    if model_override := os.environ.get("ETHEREUM_MCP_EMBEDDING_MODEL"):
        config.embedding.model = model_override
        logger.debug("Using embedding model from env: %s", model_override)

    if batch_override := os.environ.get("ETHEREUM_MCP_BATCH_SIZE"):
        try:
            config.embedding.batch_size = int(batch_override)
        except ValueError:
            # Keep the existing batch size when the value is not an integer.
            logger.warning("Invalid ETHEREUM_MCP_BATCH_SIZE: %s", batch_override)

    return config
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def save_config(config: Config, config_path: Path) -> None:
    """
    Write the configuration to a YAML file.

    Parent directories are created when missing and an existing file is
    overwritten.

    Args:
        config: Configuration to save
        config_path: Path to write config file
    """
    embedding_section = {
        "model": config.embedding.model,
        "batch_size": config.embedding.batch_size,
    }
    chunking_section = {
        "chunk_size": config.chunking.chunk_size,
        "chunk_overlap": config.chunking.chunk_overlap,
    }
    payload = {"embedding": embedding_section, "chunking": chunking_section}

    config_path.parent.mkdir(parents=True, exist_ok=True)

    # sort_keys=False keeps the sections in the order built above.
    with open(config_path, "w", encoding="utf-8") as out:
        yaml.dump(payload, out, default_flow_style=False, sort_keys=False)

    logger.info("Saved config to %s", config_path)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def get_model_info(model_name: str | None = None) -> str:
    """Describe one embedding model, or all of them, as display text.

    Args:
        model_name: Model to describe; None or "" lists every model.

    Returns:
        Formatted string with model information, or an "Unknown model"
        message when ``model_name`` is not in EMBEDDING_MODELS.
    """
    if not model_name:
        # Overview: one entry per model, flagging the default and API-backed ones.
        entries = ["Available embedding models:\n"]
        for name, info in EMBEDDING_MODELS.items():
            marker = " (default)" if name == DEFAULT_EMBEDDING_MODEL else ""
            api_note = " [requires API key]" if info["type"] == "api" else ""
            entry = (
                f" {name}{marker}{api_note}\n"
                f" {info['dimensions']} dims, {info['max_tokens']} tokens\n"
                f" {info['description']}\n"
            )
            entries.append(entry)
        return "\n".join(entries)

    if model_name not in EMBEDDING_MODELS:
        return f"Unknown model: {model_name}"

    info = EMBEDDING_MODELS[model_name]
    return (
        f"{model_name}:\n"
        f" Dimensions: {info['dimensions']}\n"
        f" Max tokens: {info['max_tokens']}\n"
        f" Type: {info['type']}\n"
        f" Description: {info['description']}"
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Expert guidance system for curated interpretations."""
|