eth-mcp 0.2.0__py3-none-any.whl

@@ -0,0 +1,563 @@
+ """Chunk markdown documents and client code for embedding."""
+
+ import hashlib
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
+
+ if TYPE_CHECKING:
+     from .client_compiler import ExtractedConstant, ExtractedItem
+
+
+ def generate_chunk_id(
+     project: str,
+     source_type: str,
+     source_file: str,
+     chunk_index: int,
+     content: str,
+ ) -> str:
+     """
+     Generate a unique, deterministic chunk ID.
+
+     Format: {project}_{source_type}_{path_hash}_{index:04d}_{content_hash}
+
+     The content hash ensures that if content changes, the ID changes,
+     enabling proper delta updates.
+
+     Args:
+         project: Project identifier (e.g., "eth")
+         source_type: Type of source (e.g., "spec", "eip", "function")
+         source_file: Relative path to source file
+         chunk_index: Index of this chunk within the file
+         content: Chunk content for hashing
+
+     Returns:
+         Unique chunk ID string
+     """
+     path_hash = hashlib.sha256(source_file.encode()).hexdigest()[:8]
+     content_hash = hashlib.sha256(content.encode()).hexdigest()[:8]
+     return f"{project}_{source_type}_{path_hash}_{chunk_index:04d}_{content_hash}"
+
+
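To make the ID scheme concrete, a minimal usage sketch of `generate_chunk_id`; the `eth_mcp.chunker` import path is an assumption for illustration, adjust it to wherever this module actually lives in the wheel:

```python
# Assumed import path, for illustration only.
from eth_mcp.chunker import generate_chunk_id

chunk_id = generate_chunk_id(
    project="eth",
    source_type="spec",
    source_file="specs/deneb/beacon-chain.md",
    chunk_index=3,
    content="### Helper functions\n...",
)
# Deterministic: the same inputs always yield the same ID, shaped like
# "eth_spec_<8 hex>_0003_<8 hex>". Editing the content changes only the
# trailing content-hash segment, which is what enables delta updates.
print(chunk_id)
```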
+ @dataclass(frozen=True)
+ class Chunk:
+     """A document chunk with metadata."""
+
+     content: str
+     source: str  # File path
+     fork: str | None  # Fork name if from specs
+     section: str | None  # Section header
+     chunk_type: str  # 'spec', 'eip', 'function', 'constant'
+     metadata: dict
+     chunk_id: str = ""  # Unique ID for incremental indexing
+
+
+ def chunk_documents(
+     spec_files: list[Path],
+     eip_files: list[Path],
+     builder_spec_files: list[Path] | None = None,
+     chunk_size: int = 1000,
+     chunk_overlap: int = 200,
+     generate_ids: bool = False,
+     base_path: Path | None = None,
+ ) -> list[Chunk]:
+     """
+     Chunk spec and EIP documents for embedding.
+
+     Args:
+         spec_files: List of spec markdown files
+         eip_files: List of EIP markdown files
+         builder_spec_files: List of builder-specs markdown files
+         chunk_size: Target chunk size in characters
+         chunk_overlap: Overlap between chunks
+         generate_ids: If True, generate unique chunk IDs for incremental indexing
+         base_path: Base path for relative paths in chunk IDs (required if generate_ids=True)
+
+     Returns:
+         List of chunks with metadata (and chunk_id if generate_ids=True)
+     """
+     chunks = []
+     builder_spec_files = builder_spec_files or []
+
+     # Headers to split on for markdown
+     headers_to_split_on = [
+         ("#", "h1"),
+         ("##", "h2"),
+         ("###", "h3"),
+     ]
+
+     md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+     )
+
+     # Process spec files
+     for spec_file in spec_files:
+         fork = _extract_fork_from_path(spec_file)
+         chunks.extend(_chunk_spec_file(spec_file, fork, md_splitter, text_splitter))
+
+     # Process EIP files
+     for eip_file in eip_files:
+         chunks.extend(_chunk_eip_file(eip_file, md_splitter, text_splitter))
+
+     # Process builder-specs files
+     for builder_file in builder_spec_files:
+         chunks.extend(_chunk_builder_spec_file(builder_file, md_splitter, text_splitter))
+
+     # Generate chunk IDs if requested
+     if generate_ids:
+         chunks = _assign_chunk_ids(chunks, base_path)
+
+     return chunks
+
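A hedged sketch of driving `chunk_documents` for a full index build; the repository layout and module path below are assumptions, not something this diff specifies:

```python
from pathlib import Path

from eth_mcp.chunker import chunk_documents  # assumed import path

specs_root = Path("data/consensus-specs")  # hypothetical checkout location
spec_files = sorted(specs_root.glob("specs/*/*.md"))
eip_files = sorted(Path("data/EIPs/EIPS").glob("eip-*.md"))

chunks = chunk_documents(
    spec_files=spec_files,
    eip_files=eip_files,
    generate_ids=True,     # request deterministic chunk_ids
    base_path=specs_root,  # spec paths inside the IDs become repo-relative
)
print(len(chunks), chunks[0].chunk_type, chunks[0].chunk_id)
```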
+
+ def _assign_chunk_ids(chunks: list[Chunk], base_path: Path | None = None) -> list[Chunk]:
+     """
+     Assign unique chunk IDs to chunks, grouped by source file.
+
+     Args:
+         chunks: List of chunks without IDs
+         base_path: Base path for making source paths relative
+
+     Returns:
+         New list of chunks with chunk_id populated
+     """
+     # Group chunks by source file to assign sequential indices
+     from collections import defaultdict
+
+     file_chunks: dict[str, list[tuple[int, Chunk]]] = defaultdict(list)
+     for idx, chunk in enumerate(chunks):
+         file_chunks[chunk.source].append((idx, chunk))
+
+     import contextlib
+
+     # Assign IDs
+     new_chunks = [None] * len(chunks)
+     for source, indexed_chunks in file_chunks.items():
+         # Make path relative if base_path provided
+         rel_source = source
+         if base_path:
+             with contextlib.suppress(ValueError):
+                 rel_source = str(Path(source).relative_to(base_path))
+
+         for chunk_idx, (original_idx, chunk) in enumerate(indexed_chunks):
+             chunk_id = generate_chunk_id(
+                 project="eth",
+                 source_type=chunk.chunk_type,
+                 source_file=rel_source,
+                 chunk_index=chunk_idx,
+                 content=chunk.content,
+             )
+             # Create new chunk with ID (Chunk is frozen)
+             new_chunk = Chunk(
+                 content=chunk.content,
+                 source=chunk.source,
+                 fork=chunk.fork,
+                 section=chunk.section,
+                 chunk_type=chunk.chunk_type,
+                 metadata=chunk.metadata,
+                 chunk_id=chunk_id,
+             )
+             new_chunks[original_idx] = new_chunk
+
+     return new_chunks
+
+
+ def chunk_single_file(
+     file_path: Path,
+     file_type: str,
+     chunk_size: int = 1000,
+     chunk_overlap: int = 200,
+     base_path: Path | None = None,
+ ) -> list[Chunk]:
+     """
+     Chunk a single file for incremental indexing.
+
+     Args:
+         file_path: Path to the file to chunk
+         file_type: Type of file ('spec', 'eip', 'builder')
+         chunk_size: Target chunk size in characters
+         chunk_overlap: Overlap between chunks
+         base_path: Base path for relative paths in chunk IDs
+
+     Returns:
+         List of chunks with chunk_id populated
+     """
+     headers_to_split_on = [
+         ("#", "h1"),
+         ("##", "h2"),
+         ("###", "h3"),
+     ]
+
+     md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+     )
+
+     chunks = []
+     if file_type == "spec":
+         fork = _extract_fork_from_path(file_path)
+         chunks = _chunk_spec_file(file_path, fork, md_splitter, text_splitter)
+     elif file_type == "eip":
+         chunks = _chunk_eip_file(file_path, md_splitter, text_splitter)
+     elif file_type == "builder":
+         chunks = _chunk_builder_spec_file(file_path, md_splitter, text_splitter)
+
+     # Assign chunk IDs
+     return _assign_chunk_ids(chunks, base_path)
+
+
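And the incremental path: re-chunk one edited file with `chunk_single_file` and use the content-hashed `chunk_id`s to decide what to re-embed. Paths and import path are again assumed for illustration:

```python
from pathlib import Path

from eth_mcp.chunker import chunk_single_file  # assumed import path

changed = Path("data/consensus-specs/specs/electra/beacon-chain.md")
new_chunks = chunk_single_file(
    changed,
    file_type="spec",
    base_path=Path("data/consensus-specs"),
)

# chunk_id embeds a content hash, so unchanged chunks keep their IDs and only
# edited chunks need to be re-embedded or upserted into the vector store.
for chunk in new_chunks:
    print(chunk.chunk_id, chunk.section)
```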
+ def _extract_fork_from_path(path: Path) -> str | None:
+     """Extract fork name from file path."""
+     parts = path.parts
+     known_forks = {"phase0", "altair", "bellatrix", "capella", "deneb", "electra", "fulu", "gloas"}
+
+     for part in parts:
+         if part.lower() in known_forks:
+             return part.lower()
+
+     return None
+
+
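Behavior sketch for the fork helper above, using hypothetical consensus-spec paths:

```python
from pathlib import Path

from eth_mcp.chunker import _extract_fork_from_path  # assumed path; private helper

# Any path component matching a known fork name wins; otherwise None.
assert _extract_fork_from_path(Path("specs/deneb/beacon-chain.md")) == "deneb"
assert _extract_fork_from_path(Path("specs/_features/eip7732/spec.md")) is None
```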
+ def _chunk_spec_file(
+     file_path: Path,
+     fork: str | None,
+     md_splitter: MarkdownHeaderTextSplitter,
+     text_splitter: RecursiveCharacterTextSplitter,
+ ) -> list[Chunk]:
+     """Chunk a spec markdown file."""
+     chunks = []
+     content = file_path.read_text()
+
+     # First split by markdown headers
+     md_docs = md_splitter.split_text(content)
+
+     for doc in md_docs:
+         section = doc.metadata.get("h2") or doc.metadata.get("h1") or "unknown"
+
+         # Detect chunk type based on content
+         chunk_type = _detect_chunk_type(doc.page_content)
+
+         # Further split if too large
+         if len(doc.page_content) > 1500:
+             sub_chunks = text_splitter.split_text(doc.page_content)
+             for i, sub_content in enumerate(sub_chunks):
+                 chunks.append(
+                     Chunk(
+                         content=sub_content,
+                         source=str(file_path),
+                         fork=fork,
+                         section=section,
+                         chunk_type=chunk_type,
+                         metadata={
+                             "h1": doc.metadata.get("h1"),
+                             "h2": doc.metadata.get("h2"),
+                             "h3": doc.metadata.get("h3"),
+                             "sub_chunk": i,
+                         },
+                     )
+                 )
+         else:
+             chunks.append(
+                 Chunk(
+                     content=doc.page_content,
+                     source=str(file_path),
+                     fork=fork,
+                     section=section,
+                     chunk_type=chunk_type,
+                     metadata={
+                         "h1": doc.metadata.get("h1"),
+                         "h2": doc.metadata.get("h2"),
+                         "h3": doc.metadata.get("h3"),
+                     },
+                 )
+             )
+
+     # Also extract functions as separate chunks
+     function_chunks = _extract_function_chunks(content, file_path, fork)
+     chunks.extend(function_chunks)
+
+     return chunks
+
+
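This two-stage pattern (split on headers first, then apply the character splitter only to sections over 1500 characters) repeats in the EIP and builder-spec chunkers below. A standalone toy sketch of the pattern, independent of this module:

```python
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

toy_doc = "# Beacon chain\n\n## Helpers\n\n" + "Some helper prose. " * 200

md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")]
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

for section in md_splitter.split_text(toy_doc):
    # Only oversized sections get the second, character-based split.
    pieces = (
        text_splitter.split_text(section.page_content)
        if len(section.page_content) > 1500
        else [section.page_content]
    )
    print(section.metadata, len(pieces))
```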
+ def _chunk_eip_file(
+     file_path: Path,
+     md_splitter: MarkdownHeaderTextSplitter,
+     text_splitter: RecursiveCharacterTextSplitter,
+ ) -> list[Chunk]:
+     """Chunk an EIP markdown file."""
+     chunks = []
+     content = file_path.read_text()
+
+     # Extract EIP number from filename
+     eip_match = re.search(r"eip-(\d+)", file_path.name)
+     eip_number = eip_match.group(1) if eip_match else "unknown"
+
+     # Extract frontmatter metadata
+     frontmatter = _extract_eip_frontmatter(content)
+
+     # Split by headers
+     md_docs = md_splitter.split_text(content)
+
+     for doc in md_docs:
+         section = doc.metadata.get("h2") or doc.metadata.get("h1") or "unknown"
+
+         if len(doc.page_content) > 1500:
+             sub_chunks = text_splitter.split_text(doc.page_content)
+             for i, sub_content in enumerate(sub_chunks):
+                 chunks.append(
+                     Chunk(
+                         content=sub_content,
+                         source=str(file_path),
+                         fork=None,
+                         section=section,
+                         chunk_type="eip",
+                         metadata={
+                             "eip": eip_number,
+                             "title": frontmatter.get("title"),
+                             "status": frontmatter.get("status"),
+                             "category": frontmatter.get("category"),
+                             "sub_chunk": i,
+                             **doc.metadata,
+                         },
+                     )
+                 )
+         else:
+             chunks.append(
+                 Chunk(
+                     content=doc.page_content,
+                     source=str(file_path),
+                     fork=None,
+                     section=section,
+                     chunk_type="eip",
+                     metadata={
+                         "eip": eip_number,
+                         "title": frontmatter.get("title"),
+                         "status": frontmatter.get("status"),
+                         "category": frontmatter.get("category"),
+                         **doc.metadata,
+                     },
+                 )
+             )
+
+     return chunks
+
+
+ def _chunk_builder_spec_file(
+     file_path: Path,
+     md_splitter: MarkdownHeaderTextSplitter,
+     text_splitter: RecursiveCharacterTextSplitter,
+ ) -> list[Chunk]:
+     """Chunk a builder-specs markdown file."""
+     chunks = []
+     content = file_path.read_text()
+
+     # Extract fork from path (e.g., specs/bellatrix/builder.md)
+     fork = _extract_fork_from_path(file_path)
+
+     # Split by headers
+     md_docs = md_splitter.split_text(content)
+
+     for doc in md_docs:
+         section = doc.metadata.get("h2") or doc.metadata.get("h1") or "unknown"
+
+         # Detect chunk type
+         chunk_type = _detect_builder_spec_chunk_type(doc.page_content)
+
+         if len(doc.page_content) > 1500:
+             sub_chunks = text_splitter.split_text(doc.page_content)
+             for i, sub_content in enumerate(sub_chunks):
+                 chunks.append(
+                     Chunk(
+                         content=sub_content,
+                         source=str(file_path),
+                         fork=fork,
+                         section=section,
+                         chunk_type=chunk_type,
+                         metadata={
+                             "spec_type": "builder",
+                             "h1": doc.metadata.get("h1"),
+                             "h2": doc.metadata.get("h2"),
+                             "h3": doc.metadata.get("h3"),
+                             "sub_chunk": i,
+                         },
+                     )
+                 )
+         else:
+             chunks.append(
+                 Chunk(
+                     content=doc.page_content,
+                     source=str(file_path),
+                     fork=fork,
+                     section=section,
+                     chunk_type=chunk_type,
+                     metadata={
+                         "spec_type": "builder",
+                         "h1": doc.metadata.get("h1"),
+                         "h2": doc.metadata.get("h2"),
+                         "h3": doc.metadata.get("h3"),
+                     },
+                 )
+             )
+
+     return chunks
+
+
+ def _detect_builder_spec_chunk_type(content: str) -> str:
+     """Detect the type of content in a builder-spec chunk."""
+     # Check for API endpoint definitions
+     if re.search(r"(POST|GET|PUT|DELETE)\s+`?/", content):
+         return "builder_api"
+     # Check for SSZ container definitions
+     if re.search(r"class\s+\w+\s*\(Container\)", content):
+         return "builder_type"
+     # Check for data structure definitions
+     if re.search(r"```python\s*class", content):
+         return "builder_type"
+     return "builder_spec"
+
+
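To illustrate the classifier above, toy builder-spec fragments (endpoint and container names invented) and the labels they receive:

```python
from eth_mcp.chunker import _detect_builder_spec_chunk_type  # assumed path

api_section = "Register validators via POST `/eth/v1/builder/validators`."
type_section = "class BuilderBid(Container):\n    value: uint256"
prose_section = "The builder returns a signed header to the proposer."

assert _detect_builder_spec_chunk_type(api_section) == "builder_api"
assert _detect_builder_spec_chunk_type(type_section) == "builder_type"
assert _detect_builder_spec_chunk_type(prose_section) == "builder_spec"
```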
+ def _detect_chunk_type(content: str) -> str:
+     """Detect the type of content in a chunk."""
+     if re.search(r"^def\s+\w+\s*\(", content, re.MULTILINE):
+         return "function"
+     if re.search(r"\|\s*`?[A-Z][A-Z0-9_]+`?\s*\|", content):
+         return "constant"
+     if re.search(r"^class\s+\w+", content, re.MULTILINE):
+         return "type"
+     return "spec"
+
+
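The spec classifier works the same way. The checks run in order, so a section containing a `def` is labelled "function" even if it also carries a constants table. Illustrative fragments:

```python
from eth_mcp.chunker import _detect_chunk_type  # assumed path

func_section = "def get_current_epoch(state: BeaconState) -> Epoch:\n    ..."
const_section = "| `MAX_EFFECTIVE_BALANCE` | `Gwei(2**5 * 10**9)` |"
type_section = "class Checkpoint(Container):\n    epoch: Epoch\n    root: Root"
prose_section = "Validators are rewarded for timely attestations."

assert _detect_chunk_type(func_section) == "function"
assert _detect_chunk_type(const_section) == "constant"
assert _detect_chunk_type(type_section) == "type"
assert _detect_chunk_type(prose_section) == "spec"
```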
+ def _extract_function_chunks(content: str, file_path: Path, fork: str | None) -> list[Chunk]:
+     """Extract complete function definitions as separate chunks."""
+     chunks = []
+
+     # Find all python code blocks with function definitions
+     pattern = r"```python\n(def\s+(\w+)\s*\([^`]+?)```"
+
+     for match in re.finditer(pattern, content, re.DOTALL):
+         func_source = match.group(1).strip()
+         func_name = match.group(2)
+
+         chunks.append(
+             Chunk(
+                 content=func_source,
+                 source=str(file_path),
+                 fork=fork,
+                 section=func_name,
+                 chunk_type="function",
+                 metadata={
+                     "function_name": func_name,
+                 },
+             )
+         )
+
+     return chunks
+
+
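The regex above lifts whole `def` blocks out of the spec's fenced Python, so every pyspec function becomes its own searchable chunk. A self-contained toy example (spec text and path are invented):

```python
from pathlib import Path

from eth_mcp.chunker import _extract_function_chunks  # assumed path

FENCE = "`" * 3  # avoids nesting literal fences inside this example
spec_md = (
    "## Helper functions\n\n"
    f"{FENCE}python\n"
    "def compute_epoch_at_slot(slot: Slot) -> Epoch:\n"
    "    return Epoch(slot // SLOTS_PER_EPOCH)\n"
    f"{FENCE}\n"
)

chunks = _extract_function_chunks(spec_md, Path("specs/phase0/beacon-chain.md"), "phase0")
assert chunks[0].section == "compute_epoch_at_slot"
assert chunks[0].chunk_type == "function"
assert chunks[0].content.startswith("def compute_epoch_at_slot")
```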
+ def _extract_eip_frontmatter(content: str) -> dict:
+     """Extract YAML frontmatter from EIP."""
+     frontmatter = {}
+
+     match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
+     if match:
+         for line in match.group(1).split("\n"):
+             if ":" in line:
+                 key, value = line.split(":", 1)
+                 frontmatter[key.strip()] = value.strip()
+
+     return frontmatter
+
+
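The frontmatter parse is a deliberately minimal line-based split rather than a full YAML load. An input/output sketch modeled on EIP-4844's header:

```python
from eth_mcp.chunker import _extract_eip_frontmatter  # assumed path

eip_text = (
    "---\n"
    "eip: 4844\n"
    "title: Shard Blob Transactions\n"
    "status: Final\n"
    "category: Core\n"
    "---\n"
    "\n## Abstract\n"
)

meta = _extract_eip_frontmatter(eip_text)
assert meta["eip"] == "4844"
assert meta["title"] == "Shard Blob Transactions"
assert meta["status"] == "Final"
```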
+ def chunk_client_code(
+     items: list["ExtractedItem"],
+     constants: list["ExtractedConstant"],
+     max_body_length: int = 2000,
+ ) -> list[Chunk]:
+     """
+     Convert extracted client code items into chunks for embedding.
+
+     Args:
+         items: List of ExtractedItem (functions, structs, interfaces)
+         constants: List of ExtractedConstant
+         max_body_length: Truncate bodies longer than this
+
+     Returns:
+         List of chunks ready for embedding
+     """
+     chunks = []
+
+     for item in items:
+         # Build content with doc comment + signature + body
+         content_parts = []
+         if item.doc_comment:
+             content_parts.append(f"// {item.doc_comment}")
+         content_parts.append(item.signature)
+
+         # Include body but truncate if too long
+         body = item.body
+         if len(body) > max_body_length:
+             body = body[:max_body_length] + "\n// ... truncated ..."
+
+         content_parts.append(body)
+         content = "\n".join(content_parts)
+
+         # Map item kind to chunk_type
+         chunk_type_map = {
+             "function": "client_function",
+             "struct": "client_struct",
+             "interface": "client_interface",
+             "enum": "client_enum",
+             "type": "client_type",
+         }
+         chunk_type = chunk_type_map.get(item.kind, f"client_{item.kind}")
+
+         chunks.append(
+             Chunk(
+                 content=content,
+                 source=item.file_path,
+                 fork=None,
+                 section=item.name,
+                 chunk_type=chunk_type,
+                 metadata={
+                     "function_name": item.name if item.kind == "function" else "",
+                     "client": item.client,
+                     "language": item.language,
+                     "line_number": item.line_number,
+                     "visibility": item.visibility,
+                 },
+             )
+         )
+
+     for const in constants:
+         content_parts = []
+         if const.doc_comment:
+             content_parts.append(f"// {const.doc_comment}")
+         if const.type_annotation:
+             content_parts.append(f"const {const.name}: {const.type_annotation} = {const.value}")
+         else:
+             content_parts.append(f"const {const.name} = {const.value}")
+
+         content = "\n".join(content_parts)
+
+         chunks.append(
+             Chunk(
+                 content=content,
+                 source=const.file_path,
+                 fork=None,
+                 section=const.name,
+                 chunk_type="client_constant",
+                 metadata={
+                     "client": const.client,
+                     "language": const.language,
+                     "line_number": const.line_number,
+                 },
+             )
+         )
+
+     return chunks
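Finally, a hedged sketch of `chunk_client_code` in use. `ExtractedItem` and `ExtractedConstant` come from the package's `client_compiler` module and their constructors are not shown in this diff, so the example uses `SimpleNamespace` stand-ins that carry only the attributes the function reads; the client names and values are invented for illustration.

```python
from types import SimpleNamespace

from eth_mcp.chunker import chunk_client_code  # assumed path

# Stand-ins with only the attributes chunk_client_code accesses; the real
# ExtractedItem / ExtractedConstant dataclasses may have more fields.
item = SimpleNamespace(
    doc_comment="Computes the post-state root for a block.",
    signature="func (p *StateProcessor) Process(block *types.Block) error",
    body="{\n    // ... implementation ...\n}",
    kind="function",
    name="Process",
    file_path="core/state_processor.go",
    client="geth",
    language="go",
    line_number=120,
    visibility="public",
)
const = SimpleNamespace(
    doc_comment="Blob gas limit per block.",
    name="MaxBlobGasPerBlock",
    type_annotation="uint64",
    value="786432",
    file_path="params/protocol_params.go",
    client="geth",
    language="go",
    line_number=60,
)

chunks = chunk_client_code([item], [const])
assert chunks[0].chunk_type == "client_function"
assert chunks[1].chunk_type == "client_constant"
```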