eth-mcp 0.2.0 (eth_mcp-0.2.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eth_mcp-0.2.0.dist-info/METADATA +332 -0
- eth_mcp-0.2.0.dist-info/RECORD +21 -0
- eth_mcp-0.2.0.dist-info/WHEEL +4 -0
- eth_mcp-0.2.0.dist-info/entry_points.txt +3 -0
- ethereum_mcp/__init__.py +3 -0
- ethereum_mcp/cli.py +589 -0
- ethereum_mcp/clients.py +363 -0
- ethereum_mcp/config.py +324 -0
- ethereum_mcp/expert/__init__.py +1 -0
- ethereum_mcp/expert/guidance.py +300 -0
- ethereum_mcp/indexer/__init__.py +8 -0
- ethereum_mcp/indexer/chunker.py +563 -0
- ethereum_mcp/indexer/client_compiler.py +725 -0
- ethereum_mcp/indexer/compiler.py +245 -0
- ethereum_mcp/indexer/downloader.py +521 -0
- ethereum_mcp/indexer/embedder.py +627 -0
- ethereum_mcp/indexer/manifest.py +411 -0
- ethereum_mcp/logging.py +85 -0
- ethereum_mcp/models.py +126 -0
- ethereum_mcp/server.py +555 -0
- ethereum_mcp/tools/__init__.py +1 -0
ethereum_mcp/indexer/chunker.py
@@ -0,0 +1,563 @@
"""Chunk markdown documents and client code for embedding."""

import hashlib
import re
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING

from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

if TYPE_CHECKING:
    from .client_compiler import ExtractedConstant, ExtractedItem


def generate_chunk_id(
    project: str,
    source_type: str,
    source_file: str,
    chunk_index: int,
    content: str,
) -> str:
    """
    Generate a unique, deterministic chunk ID.

    Format: {project}_{source_type}_{path_hash}_{index:04d}_{content_hash}

    The content hash ensures that if content changes, the ID changes,
    enabling proper delta updates.

    Args:
        project: Project identifier (e.g., "eth")
        source_type: Type of source (e.g., "spec", "eip", "function")
        source_file: Relative path to source file
        chunk_index: Index of this chunk within the file
        content: Chunk content for hashing

    Returns:
        Unique chunk ID string
    """
    path_hash = hashlib.sha256(source_file.encode()).hexdigest()[:8]
    content_hash = hashlib.sha256(content.encode()).hexdigest()[:8]
    return f"{project}_{source_type}_{path_hash}_{chunk_index:04d}_{content_hash}"
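
# Illustrative example (hash values are hypothetical): the third chunk cut from
# "specs/deneb/beacon-chain.md" would get an ID shaped like
#     eth_spec_3f9c21aa_0002_b7d04e11
# i.e. project, source type, 8 hex chars of the path hash, the zero-padded
# chunk index, and 8 hex chars of the content hash.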


@dataclass(frozen=True)
class Chunk:
    """A document chunk with metadata."""

    content: str
    source: str  # File path
    fork: str | None  # Fork name if from specs
    section: str | None  # Section header
    chunk_type: str  # 'spec', 'eip', 'function', 'constant'
    metadata: dict
    chunk_id: str = ""  # Unique ID for incremental indexing
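
# Example (illustrative field values): a chunk cut from a Deneb spec section
# might look like
#     Chunk(
#         content="### Helper functions ...",
#         source="specs/deneb/beacon-chain.md",
#         fork="deneb",
#         section="Helper functions",
#         chunk_type="spec",
#         metadata={"h1": "Deneb -- The Beacon Chain", "h2": "Helper functions", "h3": None},
#     )
# chunk_id stays "" until _assign_chunk_ids() builds a new instance, since the
# dataclass is frozen.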


def chunk_documents(
    spec_files: list[Path],
    eip_files: list[Path],
    builder_spec_files: list[Path] | None = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    generate_ids: bool = False,
    base_path: Path | None = None,
) -> list[Chunk]:
    """
    Chunk spec and EIP documents for embedding.

    Args:
        spec_files: List of spec markdown files
        eip_files: List of EIP markdown files
        builder_spec_files: List of builder-specs markdown files
        chunk_size: Target chunk size in characters
        chunk_overlap: Overlap between chunks
        generate_ids: If True, generate unique chunk IDs for incremental indexing
        base_path: Base path for relative paths in chunk IDs (required if generate_ids=True)

    Returns:
        List of chunks with metadata (and chunk_id if generate_ids=True)
    """
    chunks = []
    builder_spec_files = builder_spec_files or []

    # Headers to split on for markdown
    headers_to_split_on = [
        ("#", "h1"),
        ("##", "h2"),
        ("###", "h3"),
    ]

    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Process spec files
    for spec_file in spec_files:
        fork = _extract_fork_from_path(spec_file)
        chunks.extend(_chunk_spec_file(spec_file, fork, md_splitter, text_splitter))

    # Process EIP files
    for eip_file in eip_files:
        chunks.extend(_chunk_eip_file(eip_file, md_splitter, text_splitter))

    # Process builder-specs files
    for builder_file in builder_spec_files:
        chunks.extend(_chunk_builder_spec_file(builder_file, md_splitter, text_splitter))

    # Generate chunk IDs if requested
    if generate_ids:
        chunks = _assign_chunk_ids(chunks, base_path)

    return chunks
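
# Usage sketch (paths are hypothetical):
#     chunks = chunk_documents(
#         spec_files=list(Path("consensus-specs/specs").rglob("*.md")),
#         eip_files=[Path("EIPs/EIPS/eip-4844.md")],
#         generate_ids=True,
#         base_path=Path("."),
#     )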


def _assign_chunk_ids(chunks: list[Chunk], base_path: Path | None = None) -> list[Chunk]:
    """
    Assign unique chunk IDs to chunks, grouped by source file.

    Args:
        chunks: List of chunks without IDs
        base_path: Base path for making source paths relative

    Returns:
        New list of chunks with chunk_id populated
    """
    import contextlib
    from collections import defaultdict

    # Group chunks by source file to assign sequential indices
    file_chunks: dict[str, list[tuple[int, Chunk]]] = defaultdict(list)
    for idx, chunk in enumerate(chunks):
        file_chunks[chunk.source].append((idx, chunk))

    # Assign IDs
    new_chunks = [None] * len(chunks)
    for source, indexed_chunks in file_chunks.items():
        # Make path relative if base_path provided
        rel_source = source
        if base_path:
            with contextlib.suppress(ValueError):
                rel_source = str(Path(source).relative_to(base_path))

        for chunk_idx, (original_idx, chunk) in enumerate(indexed_chunks):
            chunk_id = generate_chunk_id(
                project="eth",
                source_type=chunk.chunk_type,
                source_file=rel_source,
                chunk_index=chunk_idx,
                content=chunk.content,
            )
            # Create new chunk with ID (Chunk is frozen)
            new_chunk = Chunk(
                content=chunk.content,
                source=chunk.source,
                fork=chunk.fork,
                section=chunk.section,
                chunk_type=chunk.chunk_type,
                metadata=chunk.metadata,
                chunk_id=chunk_id,
            )
            new_chunks[original_idx] = new_chunk

    return new_chunks
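
# Note: chunk indices restart at 0 for every source file, so editing one file
# only changes that file's chunk IDs; chunks from untouched files keep stable
# IDs, which is what makes incremental (delta) re-indexing cheap.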


def chunk_single_file(
    file_path: Path,
    file_type: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    base_path: Path | None = None,
) -> list[Chunk]:
    """
    Chunk a single file for incremental indexing.

    Args:
        file_path: Path to the file to chunk
        file_type: Type of file ('spec', 'eip', 'builder')
        chunk_size: Target chunk size in characters
        chunk_overlap: Overlap between chunks
        base_path: Base path for relative paths in chunk IDs

    Returns:
        List of chunks with chunk_id populated
    """
    headers_to_split_on = [
        ("#", "h1"),
        ("##", "h2"),
        ("###", "h3"),
    ]

    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunks = []
    if file_type == "spec":
        fork = _extract_fork_from_path(file_path)
        chunks = _chunk_spec_file(file_path, fork, md_splitter, text_splitter)
    elif file_type == "eip":
        chunks = _chunk_eip_file(file_path, md_splitter, text_splitter)
    elif file_type == "builder":
        chunks = _chunk_builder_spec_file(file_path, md_splitter, text_splitter)

    # Assign chunk IDs
    return _assign_chunk_ids(chunks, base_path)
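
# Usage sketch (path is hypothetical); an unrecognized file_type falls through
# all branches and yields an empty list:
#     chunks = chunk_single_file(
#         Path("consensus-specs/specs/electra/beacon-chain.md"),
#         file_type="spec",
#         base_path=Path("consensus-specs"),
#     )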


def _extract_fork_from_path(path: Path) -> str | None:
    """Extract fork name from file path."""
    parts = path.parts
    known_forks = {"phase0", "altair", "bellatrix", "capella", "deneb", "electra", "fulu", "gloas"}

    for part in parts:
        if part.lower() in known_forks:
            return part.lower()

    return None
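
# For example, Path("specs/electra/beacon-chain.md") yields "electra";
# a path without a known fork segment yields None.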


def _chunk_spec_file(
    file_path: Path,
    fork: str | None,
    md_splitter: MarkdownHeaderTextSplitter,
    text_splitter: RecursiveCharacterTextSplitter,
) -> list[Chunk]:
    """Chunk a spec markdown file."""
    chunks = []
    content = file_path.read_text()

    # First split by markdown headers
    md_docs = md_splitter.split_text(content)

    for doc in md_docs:
        section = doc.metadata.get("h2") or doc.metadata.get("h1") or "unknown"

        # Detect chunk type based on content
        chunk_type = _detect_chunk_type(doc.page_content)

        # Further split if too large
        if len(doc.page_content) > 1500:
            sub_chunks = text_splitter.split_text(doc.page_content)
            for i, sub_content in enumerate(sub_chunks):
                chunks.append(
                    Chunk(
                        content=sub_content,
                        source=str(file_path),
                        fork=fork,
                        section=section,
                        chunk_type=chunk_type,
                        metadata={
                            "h1": doc.metadata.get("h1"),
                            "h2": doc.metadata.get("h2"),
                            "h3": doc.metadata.get("h3"),
                            "sub_chunk": i,
                        },
                    )
                )
        else:
            chunks.append(
                Chunk(
                    content=doc.page_content,
                    source=str(file_path),
                    fork=fork,
                    section=section,
                    chunk_type=chunk_type,
                    metadata={
                        "h1": doc.metadata.get("h1"),
                        "h2": doc.metadata.get("h2"),
                        "h3": doc.metadata.get("h3"),
                    },
                )
            )

    # Also extract functions as separate chunks
    function_chunks = _extract_function_chunks(content, file_path, fork)
    chunks.extend(function_chunks)

    return chunks
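
# Note: the 1500-char threshold above is fixed and independent of chunk_size,
# and the function definitions pulled out by _extract_function_chunks() also
# remain inside their section chunks, so spec functions are indexed twice:
# once in context and once standalone.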


def _chunk_eip_file(
    file_path: Path,
    md_splitter: MarkdownHeaderTextSplitter,
    text_splitter: RecursiveCharacterTextSplitter,
) -> list[Chunk]:
    """Chunk an EIP markdown file."""
    chunks = []
    content = file_path.read_text()

    # Extract EIP number from filename
    eip_match = re.search(r"eip-(\d+)", file_path.name)
    eip_number = eip_match.group(1) if eip_match else "unknown"

    # Extract frontmatter metadata
    frontmatter = _extract_eip_frontmatter(content)

    # Split by headers
    md_docs = md_splitter.split_text(content)

    for doc in md_docs:
        section = doc.metadata.get("h2") or doc.metadata.get("h1") or "unknown"

        if len(doc.page_content) > 1500:
            sub_chunks = text_splitter.split_text(doc.page_content)
            for i, sub_content in enumerate(sub_chunks):
                chunks.append(
                    Chunk(
                        content=sub_content,
                        source=str(file_path),
                        fork=None,
                        section=section,
                        chunk_type="eip",
                        metadata={
                            "eip": eip_number,
                            "title": frontmatter.get("title"),
                            "status": frontmatter.get("status"),
                            "category": frontmatter.get("category"),
                            "sub_chunk": i,
                            **doc.metadata,
                        },
                    )
                )
        else:
            chunks.append(
                Chunk(
                    content=doc.page_content,
                    source=str(file_path),
                    fork=None,
                    section=section,
                    chunk_type="eip",
                    metadata={
                        "eip": eip_number,
                        "title": frontmatter.get("title"),
                        "status": frontmatter.get("status"),
                        "category": frontmatter.get("category"),
                        **doc.metadata,
                    },
                )
            )

    return chunks


def _chunk_builder_spec_file(
    file_path: Path,
    md_splitter: MarkdownHeaderTextSplitter,
    text_splitter: RecursiveCharacterTextSplitter,
) -> list[Chunk]:
    """Chunk a builder-specs markdown file."""
    chunks = []
    content = file_path.read_text()

    # Extract fork from path (e.g., specs/bellatrix/builder.md)
    fork = _extract_fork_from_path(file_path)

    # Split by headers
    md_docs = md_splitter.split_text(content)

    for doc in md_docs:
        section = doc.metadata.get("h2") or doc.metadata.get("h1") or "unknown"

        # Detect chunk type
        chunk_type = _detect_builder_spec_chunk_type(doc.page_content)

        if len(doc.page_content) > 1500:
            sub_chunks = text_splitter.split_text(doc.page_content)
            for i, sub_content in enumerate(sub_chunks):
                chunks.append(
                    Chunk(
                        content=sub_content,
                        source=str(file_path),
                        fork=fork,
                        section=section,
                        chunk_type=chunk_type,
                        metadata={
                            "spec_type": "builder",
                            "h1": doc.metadata.get("h1"),
                            "h2": doc.metadata.get("h2"),
                            "h3": doc.metadata.get("h3"),
                            "sub_chunk": i,
                        },
                    )
                )
        else:
            chunks.append(
                Chunk(
                    content=doc.page_content,
                    source=str(file_path),
                    fork=fork,
                    section=section,
                    chunk_type=chunk_type,
                    metadata={
                        "spec_type": "builder",
                        "h1": doc.metadata.get("h1"),
                        "h2": doc.metadata.get("h2"),
                        "h3": doc.metadata.get("h3"),
                    },
                )
            )

    return chunks


def _detect_builder_spec_chunk_type(content: str) -> str:
    """Detect the type of content in a builder-spec chunk."""
    # Check for API endpoint definitions
    if re.search(r"(POST|GET|PUT|DELETE)\s+`?/", content):
        return "builder_api"
    # Check for SSZ container definitions
    if re.search(r"class\s+\w+\s*\(Container\)", content):
        return "builder_type"
    # Check for data structure definitions
    if re.search(r"```python\s*class", content):
        return "builder_type"
    return "builder_spec"
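
# Illustrative matches: "GET /eth/v1/builder/status" -> "builder_api";
# "class BuilderBid(Container):" -> "builder_type"; anything else -> "builder_spec".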


def _detect_chunk_type(content: str) -> str:
    """Detect the type of content in a chunk."""
    if re.search(r"^def\s+\w+\s*\(", content, re.MULTILINE):
        return "function"
    if re.search(r"\|\s*`?[A-Z][A-Z0-9_]+`?\s*\|", content):
        return "constant"
    if re.search(r"^class\s+\w+", content, re.MULTILINE):
        return "type"
    return "spec"
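
# Illustrative matches: a line starting with "def get_current_epoch(" ->
# "function"; a table row like "| MAX_REQUEST_BLOCKS | uint64(1024) |" ->
# "constant"; a line starting with "class BeaconState" -> "type"; else "spec".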


def _extract_function_chunks(content: str, file_path: Path, fork: str | None) -> list[Chunk]:
    """Extract complete function definitions as separate chunks."""
    chunks = []

    # Find all python code blocks with function definitions
    pattern = r"```python\n(def\s+(\w+)\s*\([^`]+?)```"

    for match in re.finditer(pattern, content, re.DOTALL):
        func_source = match.group(1).strip()
        func_name = match.group(2)

        chunks.append(
            Chunk(
                content=func_source,
                source=str(file_path),
                fork=fork,
                section=func_name,
                chunk_type="function",
                metadata={
                    "function_name": func_name,
                },
            )
        )

    return chunks
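
# Note: the pattern only captures fenced python blocks whose first line starts
# with "def", and "[^`]+?" cannot cross a backtick, so a function body that
# itself contains a backtick is skipped rather than truncated mid-function.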


def _extract_eip_frontmatter(content: str) -> dict:
    """Extract YAML frontmatter from EIP."""
    frontmatter = {}

    match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if match:
        for line in match.group(1).split("\n"):
            if ":" in line:
                key, value = line.split(":", 1)
                frontmatter[key.strip()] = value.strip()

    return frontmatter
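
# Example: a file beginning with
#     ---
#     eip: 4844
#     title: Shard Blob Transactions
#     status: Final
#     ---
# yields {"eip": "4844", "title": "Shard Blob Transactions", "status": "Final"}.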


def chunk_client_code(
    items: list["ExtractedItem"],
    constants: list["ExtractedConstant"],
    max_body_length: int = 2000,
) -> list[Chunk]:
    """
    Convert extracted client code items into chunks for embedding.

    Args:
        items: List of ExtractedItem (functions, structs, interfaces)
        constants: List of ExtractedConstant
        max_body_length: Truncate bodies longer than this

    Returns:
        List of chunks ready for embedding
    """
    chunks = []

    for item in items:
        # Build content with doc comment + signature + body
        content_parts = []
        if item.doc_comment:
            content_parts.append(f"// {item.doc_comment}")
        content_parts.append(item.signature)

        # Include body but truncate if too long
        body = item.body
        if len(body) > max_body_length:
            body = body[:max_body_length] + "\n// ... truncated ..."

        content_parts.append(body)
        content = "\n".join(content_parts)

        # Map item kind to chunk_type
        chunk_type_map = {
            "function": "client_function",
            "struct": "client_struct",
            "interface": "client_interface",
            "enum": "client_enum",
            "type": "client_type",
        }
        chunk_type = chunk_type_map.get(item.kind, f"client_{item.kind}")

        chunks.append(
            Chunk(
                content=content,
                source=item.file_path,
                fork=None,
                section=item.name,
                chunk_type=chunk_type,
                metadata={
                    "function_name": item.name if item.kind == "function" else "",
                    "client": item.client,
                    "language": item.language,
                    "line_number": item.line_number,
                    "visibility": item.visibility,
                },
            )
        )

    for const in constants:
        content_parts = []
        if const.doc_comment:
            content_parts.append(f"// {const.doc_comment}")
        if const.type_annotation:
            content_parts.append(f"const {const.name}: {const.type_annotation} = {const.value}")
        else:
            content_parts.append(f"const {const.name} = {const.value}")

        content = "\n".join(content_parts)

        chunks.append(
            Chunk(
                content=content,
                source=const.file_path,
                fork=None,
                section=const.name,
                chunk_type="client_constant",
                metadata={
                    "client": const.client,
                    "language": const.language,
                    "line_number": const.line_number,
                },
            )
        )

    return chunks
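
# Usage sketch (assuming items/constants come from client_compiler extraction):
#     code_chunks = chunk_client_code(items, constants)
# Constants are rendered in a normalized "const NAME: TYPE = VALUE" pseudo-syntax
# regardless of the client's source language, so constants from any client
# embed uniformly.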