code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1129 @@
1
+ """
2
+ Enhanced AST-based Code Chunker with Rich Metadata Extraction
3
+
4
+ This module ports key features from supermemoryai/code-chunk to provide
5
+ richer semantic metadata for code chunks, improving retrieval and
6
+ documentation generation quality.
7
+
8
+ Key enhancements over basic ast_chunker:
9
+ - Full scope chain tracking (class > method > nested function)
10
+ - Complete signature extraction with parameters and types
11
+ - Import dependency tracking per chunk
12
+ - Contextualized text generation for LLM consumption
13
+
14
+ Supports: Python, JavaScript, TypeScript, Go
15
+ """
16
+
17
+ import re
18
+ import logging
19
+ from typing import List, Dict, Any, Optional, Tuple, Set
20
+ from dataclasses import dataclass, field
21
+ from pathlib import Path
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Try to import tree-sitter dependencies
26
+ try:
27
+ import tree_sitter_python as tspython
28
+ import tree_sitter_javascript as tsjavascript
29
+ import tree_sitter_typescript as tstypescript
30
+ import tree_sitter_go as tsgo
31
+ from tree_sitter import Language, Parser, Node
32
+ HAS_TREE_SITTER = True
33
+ logger.info("Tree-sitter available for enhanced AST parsing")
34
+ except ImportError as e:
35
+ HAS_TREE_SITTER = False
36
+ Node = Any # Type hint fallback
37
+ logger.warning(f"Tree-sitter not available: {e}")
38
+
39
+
40
@dataclass
class ImportInfo:
    """One imported name as recorded by the import parsers.

    Attributes:
        module: Module the name is imported from.
        name: Imported name (the module name itself for simple imports).
        alias: Local alias when ``as X`` is used, otherwise None.
        is_from_import: True for the ``from X import Y`` form.
        line: Line of the import statement (defaults to 0).
    """

    module: str
    name: str
    alias: Optional[str] = None
    is_from_import: bool = False
    line: int = 0
48
+
49
+
50
@dataclass
class ParameterInfo:
    """One function parameter as extracted from a signature.

    Attributes:
        name: Parameter name.
        type_annotation: Source text of the type annotation, if any.
        default_value: Source text of the default value, if any.
        is_variadic: True for a ``*args``-style parameter.
        is_keyword: True for a ``**kwargs``-style parameter.
    """

    name: str
    type_annotation: Optional[str] = None
    default_value: Optional[str] = None
    is_variadic: bool = False
    is_keyword: bool = False
58
+
59
+
60
@dataclass
class EnhancedCodeChunk:
    """
    A code chunk together with the metadata used for retrieval.

    Besides the raw text and its location, a chunk carries the entity's
    name/signature/docstring, its parameters and return type, the enclosing
    scope chain, and the imports its text refers to. ``size_chars`` is not a
    constructor argument; it is computed in ``__post_init__``.
    """
    # --- content ---
    text: str  # raw code text
    contextualized_text: str  # code with context header prepended

    # --- location ---
    line_range: Tuple[int, int]  # (start, end), 0-indexed
    byte_range: Tuple[int, int]  # (start, end)

    # --- entity identification ---
    chunk_type: str  # function, class, method, import_block, module, type
    name: Optional[str] = None
    signature: Optional[str] = None
    docstring: Optional[str] = None

    # --- callable details (functions/methods only) ---
    parameters: List[ParameterInfo] = field(default_factory=list)
    return_type: Optional[str] = None

    # --- hierarchical context: full scope chain ---
    scope: List[str] = field(default_factory=list)

    # --- dependencies: imports used by this chunk ---
    imports: List[ImportInfo] = field(default_factory=list)

    # --- metadata ---
    language: str = ""
    filepath: str = ""
    is_partial: bool = False  # True if the chunk was split due to size

    # --- size tracking (derived, never passed to __init__) ---
    size_chars: int = field(default=0, init=False)

    def __post_init__(self):
        """Fill in ``size_chars`` as the length of ``text`` without whitespace."""
        if self.size_chars != 0:
            return
        without_whitespace = re.sub(r'\s', '', self.text)
        self.size_chars = len(without_whitespace)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the chunk for storage/serialization."""
        parameter_dicts = []
        for param in self.parameters:
            parameter_dicts.append({
                "name": param.name,
                "type": param.type_annotation,
                "default": param.default_value,
                "is_variadic": param.is_variadic,
                "is_keyword": param.is_keyword,
            })

        import_dicts = []
        for imported in self.imports:
            import_dicts.append({
                "module": imported.module,
                "name": imported.name,
                "alias": imported.alias,
            })

        # Key order matches the field order above.
        payload: Dict[str, Any] = {}
        payload["text"] = self.text
        payload["contextualized_text"] = self.contextualized_text
        payload["line_range"] = self.line_range
        payload["byte_range"] = self.byte_range
        payload["chunk_type"] = self.chunk_type
        payload["name"] = self.name
        payload["signature"] = self.signature
        payload["docstring"] = self.docstring
        payload["parameters"] = parameter_dicts
        payload["return_type"] = self.return_type
        payload["scope"] = self.scope
        payload["imports"] = import_dicts
        payload["language"] = self.language
        payload["filepath"] = self.filepath
        payload["is_partial"] = self.is_partial
        payload["size_chars"] = self.size_chars
        return payload
143
+
144
+
145
+ class EnhancedASTChunker:
146
+ """
147
+ Enhanced AST chunker with rich metadata extraction.
148
+
149
+ Ports key features from supermemoryai/code-chunk:
150
+ - Scope chain tracking
151
+ - Signature extraction
152
+ - Import dependency linking
153
+ - Contextualized text generation
154
+ """
155
+
156
+ # Node types that represent scope boundaries
157
+ SCOPE_NODES = {
158
+ 'python': ['class_definition', 'function_definition', 'async_function_definition'],
159
+ 'javascript': ['class_declaration', 'function_declaration', 'arrow_function', 'method_definition'],
160
+ 'typescript': ['class_declaration', 'function_declaration', 'arrow_function', 'method_definition',
161
+ 'interface_declaration'],
162
+ 'go': ['function_declaration', 'method_declaration', 'type_declaration']
163
+ }
164
+
165
+ # Node types that should become chunks
166
+ CHUNK_NODES = {
167
+ 'python': [
168
+ 'function_definition', 'async_function_definition', 'class_definition',
169
+ 'decorated_definition'
170
+ ],
171
+ 'javascript': [
172
+ 'function_declaration', 'class_declaration', 'method_definition',
173
+ 'arrow_function', 'export_statement'
174
+ ],
175
+ 'typescript': [
176
+ 'function_declaration', 'class_declaration', 'method_definition',
177
+ 'arrow_function', 'interface_declaration', 'type_alias_declaration',
178
+ 'export_statement'
179
+ ],
180
+ 'go': [
181
+ 'function_declaration', 'method_declaration', 'type_declaration'
182
+ ]
183
+ }
184
+
185
def __init__(
    self,
    max_chunk_size: int = 1500,
    context_mode: str = "full",
    merge_small_chunks: bool = False,
    min_chunk_size: int = 100
):
    """
    Store the chunking options and set up parsers when tree-sitter is present.

    Args:
        max_chunk_size: Maximum chunk size in characters
        context_mode: Context detail level ('none', 'minimal', 'full')
        merge_small_chunks: Whether to merge adjacent small chunks
        min_chunk_size: Minimum chunk size for merging
    """
    # Parser registry starts empty and stays empty without tree-sitter.
    self.supported_languages = ['python', 'javascript', 'typescript', 'go']
    self.parsers: Dict[str, Parser] = {}

    # Options are stored verbatim.
    self.max_chunk_size = max_chunk_size
    self.min_chunk_size = min_chunk_size
    self.merge_small_chunks = merge_small_chunks
    self.context_mode = context_mode

    if not HAS_TREE_SITTER:
        return
    self._initialize_parsers()
211
+
212
def _initialize_parsers(self) -> None:
    """Create one tree-sitter parser per supported language.

    A language whose parser cannot be created is logged as a warning and
    left out of ``self.parsers``; the other languages are still set up.
    """
    language_factories = {
        'python': tspython.language,
        'javascript': tsjavascript.language,
        # TypeScript files are parsed with the TSX grammar.
        'typescript': lambda: tstypescript.language_tsx(),
        'go': tsgo.language,
    }

    for lang, make_language in language_factories.items():
        try:
            self.parsers[lang] = Parser(Language(make_language()))
            logger.debug(f"{lang} parser initialized")
        except Exception as e:
            logger.warning(f"Failed to initialize {lang} parser: {e}")
228
+
229
def chunk_file(
    self,
    file_path: Path,
    content: Optional[str] = None
) -> List[EnhancedCodeChunk]:
    """
    Split one source file into chunks carrying rich metadata.

    The file is read from disk (UTF-8, undecodable bytes ignored) unless
    ``content`` is supplied. AST chunking is used when a parser exists for
    the detected language; otherwise, or if AST chunking raises, the
    line-based fallback is used.

    Args:
        file_path: Path to the source file
        content: Optional pre-loaded content

    Returns:
        List of EnhancedCodeChunk objects (empty if the file cannot be read)
    """
    file_path = Path(file_path)

    if content is None:
        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')
        except Exception as e:
            logger.error(f"Failed to read {file_path}: {e}")
            return []

    language = self._detect_language(file_path)
    path_text = str(file_path)

    ast_usable = HAS_TREE_SITTER and language in self.parsers
    if not ast_usable:
        logger.debug(f"Using fallback chunking for {file_path}")
        return self._chunk_fallback(content, language, path_text)

    try:
        return self._chunk_with_ast(content, language, path_text)
    except Exception as e:
        # Best effort: a parse failure degrades to line-based chunking.
        logger.warning(f"AST parsing failed for {file_path}, using fallback: {e}")
        return self._chunk_fallback(content, language, path_text)
264
+
265
+ def _detect_language(self, file_path: Path) -> str:
266
+ """Detect programming language from file extension"""
267
+ ext_map = {
268
+ '.py': 'python',
269
+ '.js': 'javascript',
270
+ '.jsx': 'javascript',
271
+ '.ts': 'typescript',
272
+ '.tsx': 'typescript',
273
+ '.go': 'go'
274
+ }
275
+ return ext_map.get(file_path.suffix.lower(), 'unknown')
276
+
277
+ def _bs(self, node) -> str:
278
+ """Slice source bytes by node byte offsets and decode to str.
279
+
280
+ Tree-sitter nodes report byte offsets, not character offsets.
281
+ Using ``source_str[start_byte:end_byte]`` gives wrong results
282
+ when the file contains multi-byte UTF-8 characters (emojis, etc.).
283
+ """
284
+ return self._source_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='replace')
285
+
286
+ def _chunk_with_ast(
287
+ self,
288
+ content: str,
289
+ language: str,
290
+ filepath: str
291
+ ) -> List[EnhancedCodeChunk]:
292
+ """
293
+ Chunk code using tree-sitter AST parsing with rich metadata extraction.
294
+ """
295
+ parser = self.parsers[language]
296
+ self._source_bytes = bytes(content, 'utf-8')
297
+ tree = parser.parse(self._source_bytes)
298
+
299
+ if not tree.root_node:
300
+ return self._chunk_fallback(content, language, filepath)
301
+
302
+ # First pass: parse all imports
303
+ all_imports = self._parse_all_imports(tree.root_node, content, language)
304
+
305
+ # Second pass: extract chunks with scope tracking
306
+ chunks = self._extract_chunks(
307
+ tree.root_node,
308
+ content,
309
+ language,
310
+ filepath,
311
+ scope=[],
312
+ all_imports=all_imports
313
+ )
314
+
315
+ # Generate contextualized text for each chunk
316
+ for chunk in chunks:
317
+ chunk.contextualized_text = self._build_contextualized_text(chunk)
318
+
319
+ return chunks
320
+
321
+ def _parse_all_imports(
322
+ self,
323
+ root: Node,
324
+ source: str,
325
+ language: str
326
+ ) -> Dict[str, ImportInfo]:
327
+ """
328
+ Parse all imports in the file and return a mapping of names to import info.
329
+ """
330
+ imports: Dict[str, ImportInfo] = {}
331
+
332
+ if language == 'python':
333
+ self._parse_python_imports(root, source, imports)
334
+ elif language in ['javascript', 'typescript']:
335
+ self._parse_js_imports(root, source, imports)
336
+ elif language == 'go':
337
+ self._parse_go_imports(root, source, imports)
338
+
339
+ return imports
340
+
341
def _parse_python_imports(
    self,
    node: Node,
    source: str,
    imports: Dict[str, ImportInfo]
) -> None:
    """Record Python imports found at or below ``node`` into ``imports``.

    Entries are keyed by the locally bound name (the alias when ``as`` is
    used). Handles ``import X``, ``import X as Y``, ``from X import Y`` and
    ``from X import Y as Z``, including relative modules such as ``.``;
    wildcard imports are not recorded.

    In a ``from`` import the module and the imported names are
    distinguished by their position relative to the ``import`` keyword,
    because the grammar represents both as ``dotted_name`` nodes.

    Args:
        node: AST node to inspect (the walk recurses into its children).
        source: Source text; unused here, node text comes from ``_bs``.
        imports: Mapping updated in place.
    """
    name_types = ('dotted_name', 'identifier')

    if node.type == 'import_statement':
        # import X, Y as Z
        for child in node.children:
            if child.type == 'dotted_name':
                name = self._bs(child)
                imports[name] = ImportInfo(
                    module=name,
                    name=name,
                    line=child.start_point[0]
                )
            elif child.type == 'aliased_import':
                dotted = None
                alias = None
                for part in child.children:
                    if part.type == 'dotted_name':
                        dotted = self._bs(part)
                    elif part.type == 'identifier':
                        alias = self._bs(part)
                if dotted:
                    imports[alias or dotted] = ImportInfo(
                        module=dotted,
                        name=dotted,
                        alias=alias,
                        line=child.start_point[0]
                    )
        return  # import statements do not contain nested imports

    if node.type == 'import_from_statement':
        # from X import Y, Z as W
        module = None
        past_import_keyword = False
        for child in node.children:
            if child.type == 'import':
                past_import_keyword = True
            elif not past_import_keyword:
                # Before the keyword the only name-like child is the module.
                if child.type in ('dotted_name', 'relative_import'):
                    module = self._bs(child)
            elif module is None:
                continue
            elif child.type in name_types:
                name = self._bs(child)
                imports[name] = ImportInfo(
                    module=module,
                    name=name,
                    is_from_import=True,
                    line=child.start_point[0]
                )
            elif child.type == 'aliased_import':
                # Name-like parts in order: original name, then alias.
                parts = [
                    self._bs(part)
                    for part in child.children
                    if part.type in name_types
                ]
                if parts:
                    original = parts[0]
                    alias = parts[-1] if len(parts) > 1 else None
                    imports[alias or original] = ImportInfo(
                        module=module,
                        name=original,
                        alias=alias,
                        is_from_import=True,
                        line=child.start_point[0]
                    )
        return

    # Not an import: keep walking so imports inside functions/classes are found.
    for child in node.children:
        self._parse_python_imports(child, source, imports)
410
+
411
+ def _parse_js_imports(
412
+ self,
413
+ node: Node,
414
+ source: str,
415
+ imports: Dict[str, ImportInfo]
416
+ ) -> None:
417
+ """Parse JavaScript/TypeScript import statements"""
418
+ if node.type == 'import_statement':
419
+ module = None
420
+ # Find the module string
421
+ for child in node.children:
422
+ if child.type == 'string':
423
+ module = self._bs(child).strip("'\"")
424
+
425
+ if module:
426
+ # Find imported names
427
+ for child in node.children:
428
+ if child.type == 'import_clause':
429
+ self._extract_js_import_names(child, source, module, imports, node.start_point[0])
430
+
431
+ # Recurse
432
+ for child in node.children:
433
+ self._parse_js_imports(child, source, imports)
434
+
435
def _extract_js_import_names(
    self,
    node: Node,
    source: str,
    module: str,
    imports: Dict[str, ImportInfo],
    line: int
) -> None:
    """Record the names bound by one JS import clause.

    Covers default imports (``X``), named imports (``{ X, Y as Z }``) and
    namespace imports (``* as X``). Entries are keyed by the local name.

    Args:
        node: The ``import_clause`` node.
        source: Source text; unused here, node text comes from ``_bs``.
        module: Module specifier the clause imports from.
        imports: Mapping updated in place.
        line: Line recorded on every created entry.
    """
    for clause_part in node.children:
        kind = clause_part.type

        if kind == 'identifier':
            # Default import
            default_name = self._bs(clause_part)
            imports[default_name] = ImportInfo(module=module, name=default_name, line=line)

        elif kind == 'namespace_import':
            # import * as X
            for token in clause_part.children:
                if token.type == 'identifier':
                    namespace = self._bs(token)
                    imports[namespace] = ImportInfo(module=module, name='*', alias=namespace, line=line)

        elif kind == 'named_imports':
            # { X, Y as Z }
            for spec in clause_part.children:
                if spec.type != 'import_specifier':
                    continue
                original = None
                alias = None
                for token in spec.children:
                    if token.type != 'identifier':
                        continue
                    if original is None:
                        original = self._bs(token)
                    else:
                        alias = self._bs(token)
                if original:
                    imports[alias or original] = ImportInfo(
                        module=module,
                        name=original,
                        alias=alias,
                        is_from_import=True,
                        line=line
                    )
475
+
476
def _parse_go_imports(
    self,
    node: Node,
    source: str,
    imports: Dict[str, ImportInfo]
) -> None:
    """Record Go imports found at or below ``node`` into ``imports``.

    Every ``import_spec`` is handled wherever it appears, so both the
    single form (``import "fmt"``) and the grouped form
    (``import ( "fmt"; "os" )``, where specs sit inside an
    ``import_spec_list``) are captured. Entries are keyed by the explicit
    alias when present, otherwise by the last segment of the package path.

    Args:
        node: AST node to inspect (the walk recurses into its children).
        source: Source text; unused here, node text comes from ``_bs``.
        imports: Mapping updated in place.
    """
    if node.type == 'import_spec':
        pkg = None
        alias = None
        for part in node.children:
            if part.type in ('interpreted_string_literal', 'raw_string_literal'):
                # Paths may be written with double quotes or backticks.
                pkg = self._bs(part).strip('"`')
            elif part.type in ('package_identifier', 'identifier'):
                alias = self._bs(part)
        if pkg:
            # Use last part of package path as name
            name = pkg.split('/')[-1]
            imports[alias or name] = ImportInfo(
                module=pkg,
                name=name,
                alias=alias,
                line=node.start_point[0]
            )
        return  # an import spec contains no further specs

    for child in node.children:
        self._parse_go_imports(child, source, imports)
507
+
508
+ def _extract_chunks(
509
+ self,
510
+ node: Node,
511
+ source: str,
512
+ language: str,
513
+ filepath: str,
514
+ scope: List[str],
515
+ all_imports: Dict[str, ImportInfo]
516
+ ) -> List[EnhancedCodeChunk]:
517
+ """
518
+ Recursively extract chunks from AST with scope tracking.
519
+ """
520
+ chunks = []
521
+ chunk_node_types = self.CHUNK_NODES.get(language, [])
522
+ scope_node_types = self.SCOPE_NODES.get(language, [])
523
+
524
+ # Check if this node should be a chunk
525
+ if node.type in chunk_node_types:
526
+ chunk = self._create_chunk(node, source, language, filepath, scope, all_imports)
527
+ if chunk:
528
+ chunks.append(chunk)
529
+
530
+ # Update scope for children if this is a scope boundary
531
+ if node.type in scope_node_types and chunk.name:
532
+ scope = scope + [chunk.name]
533
+
534
+ # Process children
535
+ for child in node.children:
536
+ child_chunks = self._extract_chunks(
537
+ child, source, language, filepath, scope, all_imports
538
+ )
539
+ chunks.extend(child_chunks)
540
+
541
+ return chunks
542
+
543
def _create_chunk(
    self,
    node: Node,
    source: str,
    language: str,
    filepath: str,
    scope: List[str],
    all_imports: Dict[str, ImportInfo]
) -> Optional[EnhancedCodeChunk]:
    """Build an EnhancedCodeChunk for ``node``, or None if its text is blank.

    Signature details are extracted only for chunks classified as
    'function' or 'method'. ``contextualized_text`` is left empty here and
    filled in later by the caller.
    """
    text = self._bs(node)
    if not text.strip():
        return None

    chunk_type, name = self._analyze_node(node, source, language)

    if chunk_type in ('function', 'method'):
        sig_info = self._extract_signature(node, source, language)
        signature = sig_info.get('signature')
        parameters = sig_info.get('parameters', [])
        return_type = sig_info.get('return_type')
    else:
        signature, parameters, return_type = None, [], None

    docstring = self._extract_docstring(node, source, language)
    used_imports = self._find_used_imports(text, all_imports)

    start_row, end_row = node.start_point[0], node.end_point[0]
    return EnhancedCodeChunk(
        text=text,
        contextualized_text="",  # Will be filled later
        line_range=(start_row, end_row),
        byte_range=(node.start_byte, node.end_byte),
        chunk_type=chunk_type,
        name=name,
        signature=signature,
        docstring=docstring,
        parameters=parameters,
        return_type=return_type,
        scope=list(scope),  # snapshot so later scope changes do not leak in
        imports=used_imports,
        language=language,
        filepath=filepath
    )
592
+
593
+ def _analyze_node(
594
+ self,
595
+ node: Node,
596
+ source: str,
597
+ language: str
598
+ ) -> Tuple[str, Optional[str]]:
599
+ """Determine chunk type and extract name from node"""
600
+ node_type = node.type
601
+ name = None
602
+
603
+ # Python
604
+ if language == 'python':
605
+ if node_type in ['function_definition', 'async_function_definition']:
606
+ name = self._get_identifier(node, source)
607
+ return ('function', name)
608
+ elif node_type == 'class_definition':
609
+ name = self._get_identifier(node, source)
610
+ return ('class', name)
611
+ elif node_type == 'decorated_definition':
612
+ # Get the actual definition inside
613
+ for child in node.children:
614
+ if child.type in ['function_definition', 'async_function_definition']:
615
+ return self._analyze_node(child, source, language)
616
+ elif child.type == 'class_definition':
617
+ return self._analyze_node(child, source, language)
618
+
619
+ # JavaScript/TypeScript
620
+ elif language in ['javascript', 'typescript']:
621
+ if node_type == 'function_declaration':
622
+ name = self._get_identifier(node, source)
623
+ return ('function', name)
624
+ elif node_type == 'class_declaration':
625
+ name = self._get_identifier(node, source)
626
+ return ('class', name)
627
+ elif node_type == 'method_definition':
628
+ name = self._get_property_identifier(node, source)
629
+ return ('method', name)
630
+ elif node_type == 'arrow_function':
631
+ return ('function', 'arrow_function')
632
+ elif node_type == 'interface_declaration':
633
+ name = self._get_identifier(node, source)
634
+ return ('interface', name)
635
+ elif node_type == 'type_alias_declaration':
636
+ name = self._get_identifier(node, source)
637
+ return ('type', name)
638
+
639
+ # Go
640
+ elif language == 'go':
641
+ if node_type == 'function_declaration':
642
+ name = self._get_identifier(node, source)
643
+ return ('function', name)
644
+ elif node_type == 'method_declaration':
645
+ name = self._get_field_identifier(node, source)
646
+ return ('method', name)
647
+ elif node_type == 'type_declaration':
648
+ name = self._get_type_identifier(node, source)
649
+ return ('type', name)
650
+
651
+ return ('block', None)
652
+
653
+ def _get_identifier(self, node: Node, source: str) -> Optional[str]:
654
+ """Get identifier name from node's children"""
655
+ for child in node.children:
656
+ if child.type == 'identifier':
657
+ return self._bs(child)
658
+ return None
659
+
660
+ def _get_property_identifier(self, node: Node, source: str) -> Optional[str]:
661
+ """Get property identifier for JS methods"""
662
+ for child in node.children:
663
+ if child.type == 'property_identifier':
664
+ return self._bs(child)
665
+ return None
666
+
667
+ def _get_field_identifier(self, node: Node, source: str) -> Optional[str]:
668
+ """Get field identifier for Go methods"""
669
+ for child in node.children:
670
+ if child.type == 'field_identifier':
671
+ return self._bs(child)
672
+ return None
673
+
674
+ def _get_type_identifier(self, node: Node, source: str) -> Optional[str]:
675
+ """Get type identifier for Go type declarations"""
676
+ for child in node.children:
677
+ if child.type == 'type_spec':
678
+ for c in child.children:
679
+ if c.type == 'type_identifier':
680
+ return self._bs(c)
681
+ return None
682
+
683
+ def _extract_signature(
684
+ self,
685
+ node: Node,
686
+ source: str,
687
+ language: str
688
+ ) -> Dict[str, Any]:
689
+ """Extract complete function signature"""
690
+ result = {'signature': None, 'parameters': [], 'return_type': None}
691
+
692
+ if language == 'python':
693
+ result = self._extract_python_signature(node, source)
694
+ elif language in ['javascript', 'typescript']:
695
+ result = self._extract_js_signature(node, source)
696
+ elif language == 'go':
697
+ result = self._extract_go_signature(node, source)
698
+
699
+ return result
700
+
701
+ def _extract_python_signature(self, node: Node, source: str) -> Dict[str, Any]:
702
+ """Extract Python function signature"""
703
+ name = self._get_identifier(node, source)
704
+ parameters = []
705
+ return_type = None
706
+
707
+ for child in node.children:
708
+ if child.type == 'parameters':
709
+ parameters = self._parse_python_parameters(child, source)
710
+ elif child.type == 'type':
711
+ return_type = self._bs(child)
712
+
713
+ # Build signature string
714
+ param_str = ', '.join(self._format_parameter(p) for p in parameters)
715
+ sig = f"def {name}({param_str})"
716
+ if return_type:
717
+ sig += f" -> {return_type}"
718
+
719
+ return {
720
+ 'signature': sig,
721
+ 'parameters': parameters,
722
+ 'return_type': return_type
723
+ }
724
+
725
def _parse_python_parameters(self, params_node: Node, source: str) -> List[ParameterInfo]:
    """Parse the parameters of a Python function.

    Handles plain, typed, defaulted and typed-defaulted parameters as well
    as ``*args`` / ``**kwargs``, including their annotated forms
    (``*args: int``). Within a parameter, everything after the ``=`` token
    is the default value, so a default that is itself an identifier
    (``x=DEFAULT``) is not mistaken for the parameter name.

    Args:
        params_node: The ``parameters`` node of the function.
        source: Source text, forwarded to ``_get_identifier``.

    Returns:
        Parameters in declaration order. Separator tokens are skipped.
    """
    parameters = []
    compound_types = ('typed_parameter', 'default_parameter', 'typed_default_parameter')

    for child in params_node.children:
        kind = child.type

        if kind == 'identifier':
            parameters.append(ParameterInfo(name=self._bs(child)))

        elif kind == 'list_splat_pattern':
            name = self._get_identifier(child, source)
            if name:
                parameters.append(ParameterInfo(name=name, is_variadic=True))

        elif kind == 'dictionary_splat_pattern':
            name = self._get_identifier(child, source)
            if name:
                parameters.append(ParameterInfo(name=name, is_keyword=True))

        elif kind in compound_types:
            name = None
            type_ann = None
            default = None
            is_variadic = False
            is_keyword = False
            after_equals = False

            for part in child.children:
                if part.type == '=':
                    after_equals = True
                elif after_equals:
                    # Whatever follows '=' is the default expression.
                    default = self._bs(part)
                elif part.type == 'identifier':
                    name = self._bs(part)
                elif part.type == 'list_splat_pattern':
                    # Annotated *args: the name sits inside the splat pattern.
                    name = self._get_identifier(part, source)
                    is_variadic = True
                elif part.type == 'dictionary_splat_pattern':
                    name = self._get_identifier(part, source)
                    is_keyword = True
                elif part.type == 'type':
                    type_ann = self._bs(part)

            if name:
                parameters.append(ParameterInfo(
                    name=name,
                    type_annotation=type_ann,
                    default_value=default,
                    is_variadic=is_variadic,
                    is_keyword=is_keyword
                ))

    return parameters
788
+
789
+ def _extract_js_signature(self, node: Node, source: str) -> Dict[str, Any]:
790
+ """Extract JavaScript/TypeScript function signature"""
791
+ name = self._get_identifier(node, source) or self._get_property_identifier(node, source)
792
+ parameters = []
793
+ return_type = None
794
+
795
+ for child in node.children:
796
+ if child.type == 'formal_parameters':
797
+ parameters = self._parse_js_parameters(child, source)
798
+ elif child.type == 'type_annotation':
799
+ return_type = self._bs(child).lstrip(': ')
800
+
801
+ # Build signature
802
+ param_str = ', '.join(self._format_parameter(p) for p in parameters)
803
+ sig = f"function {name or 'anonymous'}({param_str})"
804
+ if return_type:
805
+ sig += f": {return_type}"
806
+
807
+ return {
808
+ 'signature': sig,
809
+ 'parameters': parameters,
810
+ 'return_type': return_type
811
+ }
812
+
813
def _parse_js_parameters(self, params_node: Node, source: str) -> List[ParameterInfo]:
    """Parse the parameters of a JavaScript/TypeScript function.

    Recognises bare identifiers, TypeScript required/optional parameters
    (with their type annotation, leading ``:`` removed) and rest
    parameters, which are marked variadic. Other child kinds are skipped.
    """
    collected = []

    for param in params_node.children:
        kind = param.type

        if kind == 'identifier':
            collected.append(ParameterInfo(name=self._bs(param)))
            continue

        if kind in ('required_parameter', 'optional_parameter'):
            ident = None
            annotation = None
            for part in param.children:
                if part.type == 'identifier':
                    ident = self._bs(part)
                elif part.type == 'type_annotation':
                    annotation = self._bs(part).lstrip(': ')
            if ident:
                collected.append(ParameterInfo(name=ident, type_annotation=annotation))
            continue

        if kind == 'rest_pattern':
            rest_name = self._get_identifier(param, source)
            if rest_name:
                collected.append(ParameterInfo(name=rest_name, is_variadic=True))

    return collected
838
+
839
+ def _extract_go_signature(self, node: Node, source: str) -> Dict[str, Any]:
840
+ """Extract Go function signature"""
841
+ name = self._get_identifier(node, source) or self._get_field_identifier(node, source)
842
+ parameters = []
843
+ return_type = None
844
+
845
+ for child in node.children:
846
+ if child.type == 'parameter_list':
847
+ parameters = self._parse_go_parameters(child, source)
848
+ elif child.type == 'result':
849
+ return_type = self._bs(child)
850
+
851
+ # Build signature
852
+ param_str = ', '.join(self._format_parameter(p) for p in parameters)
853
+ sig = f"func {name or 'anonymous'}({param_str})"
854
+ if return_type:
855
+ sig += f" {return_type}"
856
+
857
+ return {
858
+ 'signature': sig,
859
+ 'parameters': parameters,
860
+ 'return_type': return_type
861
+ }
862
+
863
def _parse_go_parameters(self, params_node: Node, source: str) -> List[ParameterInfo]:
    """Parse the parameters of a Go function.

    Each declaration may name several parameters sharing one type
    (``a, b int``); one ParameterInfo is produced per name. Any child that
    is not a name, separator or comment is taken as the type, so
    qualified, function, channel and generic types are kept as well as the
    simple ones. Variadic declarations (``args ...T``) are included and
    marked ``is_variadic``. Declarations without names produce nothing.

    Args:
        params_node: The ``parameter_list`` node.
        source: Source text; unused here, node text comes from ``_bs``.
    """
    parameters = []
    declaration_types = ('parameter_declaration', 'variadic_parameter_declaration')
    non_type_tokens = (',', '...', 'comment')

    for child in params_node.children:
        if child.type not in declaration_types:
            continue

        names = []
        type_ann = None
        for part in child.children:
            if part.type == 'identifier':
                names.append(self._bs(part))
            elif part.type not in non_type_tokens:
                type_ann = self._bs(part)

        is_variadic = child.type == 'variadic_parameter_declaration'
        for name in names:
            parameters.append(ParameterInfo(
                name=name,
                type_annotation=type_ann,
                is_variadic=is_variadic
            ))

    return parameters
881
+
882
+ def _format_parameter(self, param: ParameterInfo) -> str:
883
+ """Format a parameter for signature display"""
884
+ prefix = ''
885
+ if param.is_variadic:
886
+ prefix = '*'
887
+ elif param.is_keyword:
888
+ prefix = '**'
889
+
890
+ result = f"{prefix}{param.name}"
891
+ if param.type_annotation:
892
+ result += f": {param.type_annotation}"
893
+ if param.default_value:
894
+ result += f" = {param.default_value}"
895
+ return result
896
+
897
+ def _extract_docstring(
898
+ self,
899
+ node: Node,
900
+ source: str,
901
+ language: str
902
+ ) -> Optional[str]:
903
+ """Extract docstring from a function/class node"""
904
+ if language == 'python':
905
+ # Look for string as first statement in body
906
+ for child in node.children:
907
+ if child.type == 'block':
908
+ for stmt in child.children:
909
+ if stmt.type == 'expression_statement':
910
+ for expr in stmt.children:
911
+ if expr.type == 'string':
912
+ docstring = self._bs(expr)
913
+ # Clean up the docstring
914
+ return docstring.strip('"""\'\'\'').strip()
915
+ elif stmt.type not in ['comment', 'pass_statement']:
916
+ break # Docstring must be first
917
+ break
918
+ elif language in ['javascript', 'typescript']:
919
+ # Look for JSDoc comment before the node
920
+ # This would require looking at preceding siblings or comments
921
+ pass
922
+
923
+ return None
924
+
925
+ def _find_used_imports(
926
+ self,
927
+ chunk_text: str,
928
+ all_imports: Dict[str, ImportInfo]
929
+ ) -> List[ImportInfo]:
930
+ """Find which imports are used in a chunk"""
931
+ used = []
932
+ # Simple word boundary check for each import name
933
+ for name, import_info in all_imports.items():
934
+ # Use word boundary to avoid partial matches
935
+ if re.search(rf'\b{re.escape(name)}\b', chunk_text):
936
+ used.append(import_info)
937
+ return used
938
+
939
+ def _build_contextualized_text(self, chunk: EnhancedCodeChunk) -> str:
940
+ """
941
+ Build contextualized text with metadata prepended.
942
+
943
+ This format is optimized for LLM consumption and embedding.
944
+ """
945
+ if self.context_mode == 'none':
946
+ return chunk.text
947
+
948
+ lines = []
949
+
950
+ # File path
951
+ lines.append(f"# {chunk.filepath}")
952
+
953
+ if self.context_mode in ['minimal', 'full']:
954
+ # Scope chain
955
+ if chunk.scope:
956
+ scope_str = ' > '.join(chunk.scope)
957
+ lines.append(f"# Scope: {scope_str}")
958
+
959
+ if self.context_mode == 'full':
960
+ # Signature
961
+ if chunk.signature:
962
+ lines.append(f"# Signature: {chunk.signature}")
963
+
964
+ # Used imports
965
+ if chunk.imports:
966
+ import_names = [f"{i.name}" for i in chunk.imports[:5]] # Limit to 5
967
+ if len(chunk.imports) > 5:
968
+ import_names.append(f"... +{len(chunk.imports) - 5} more")
969
+ lines.append(f"# Uses: {', '.join(import_names)}")
970
+
971
+ # Docstring summary (first line)
972
+ if chunk.docstring:
973
+ first_line = chunk.docstring.split('\n')[0][:100]
974
+ lines.append(f"# Doc: {first_line}")
975
+
976
+ lines.append("") # Blank line before code
977
+ lines.append(chunk.text)
978
+
979
+ return '\n'.join(lines)
980
+
981
def _chunk_fallback(
    self,
    content: str,
    language: str,
    filepath: str
) -> List[EnhancedCodeChunk]:
    """
    Fallback line-based chunking when tree-sitter is unavailable.

    Lines are accumulated into a pending chunk. The pending chunk is
    closed in two situations:

    * the next line is a boundary according to ``_is_chunk_boundary``;
      the split happens *before* that line so a definition header stays
      together with its body. No split is made while the pending lines
      are only blank or start with ``@``, so decorators remain attached
      to the definition that follows them.
    * the pending text has grown past ``self.max_chunk_size`` characters;
      the split happens after the line that crossed the limit.

    Whitespace-only chunks are dropped.

    Args:
        content: Full file content.
        language: Language name forwarded to ``_is_chunk_boundary``.
        filepath: Path recorded on each chunk and in its header line.

    Returns:
        ``EnhancedCodeChunk`` objects of type ``'block'`` with 0-based
        inclusive ``line_range`` values; ``byte_range`` is left at
        ``(0, 0)`` because it is not calculated in fallback mode.
    """
    lines = content.split('\n')
    chunks = []

    def emit(block_lines, first_line, last_line):
        """Append a chunk for ``block_lines`` unless it is blank."""
        text = '\n'.join(block_lines)
        if text.strip():
            chunks.append(EnhancedCodeChunk(
                text=text,
                contextualized_text=f"# {filepath}\n\n{text}",
                line_range=(first_line, last_line),
                byte_range=(0, 0),  # Not calculated for fallback
                chunk_type='block',
                language=language,
                filepath=filepath
            ))

    pending = []
    pending_start = 0
    pending_size = 0          # len('\n'.join(pending)) + 1, kept incrementally
    pending_has_code = False  # any line that is neither blank nor a decorator

    for i, line in enumerate(lines):
        # Split BEFORE a boundary line so "def foo():" opens a new chunk
        # instead of being glued to the end of the previous one.
        if pending_has_code and self._is_chunk_boundary(line, language):
            emit(pending, pending_start, i - 1)
            pending = []
            pending_start = i
            pending_size = 0
            pending_has_code = False

        pending.append(line)
        pending_size += len(line) + 1
        stripped = line.strip()
        if stripped and not stripped.startswith('@'):
            pending_has_code = True

        # Size-based split: close the chunk once it exceeds the limit.
        if pending_size - 1 > self.max_chunk_size:
            emit(pending, pending_start, i)
            pending = []
            pending_start = i + 1
            pending_size = 0
            pending_has_code = False

    # Handle remaining lines
    if pending:
        emit(pending, pending_start, len(lines) - 1)

    return chunks
1036
+
1037
+ def _is_chunk_boundary(self, line: str, language: str) -> bool:
1038
+ """Detect potential chunk boundaries for fallback mode"""
1039
+ line = line.strip()
1040
+
1041
+ if language == 'python':
1042
+ return line.startswith(('def ', 'class ', 'async def ', '@'))
1043
+ elif language in ['javascript', 'typescript']:
1044
+ return (line.startswith(('function ', 'class ', 'export ')) or
1045
+ 'function(' in line or '=>' in line)
1046
+ elif language == 'go':
1047
+ return line.startswith(('func ', 'type '))
1048
+
1049
+ return False
1050
+
1051
+
1052
+ # Convenience function
1053
def enhanced_chunk_code(
    file_path: Path,
    content: Optional[str] = None,
    max_chunk_size: int = 1500,
    context_mode: str = "full"
) -> List[EnhancedCodeChunk]:
    """
    Chunk a source file in one call, without managing a chunker instance.

    A fresh ``EnhancedASTChunker`` is configured with the given size and
    context settings and its ``chunk_file`` result is returned.

    Args:
        file_path: Path to the source file
        content: Optional pre-loaded content
        max_chunk_size: Maximum chunk size
        context_mode: Context detail level ('none', 'minimal', 'full')

    Returns:
        List of EnhancedCodeChunk objects
    """
    settings = {
        'max_chunk_size': max_chunk_size,
        'context_mode': context_mode,
    }
    return EnhancedASTChunker(**settings).chunk_file(file_path, content)
1076
+
1077
+
1078
if __name__ == "__main__":
    # Manual smoke test: chunk the file named on the command line and
    # print a summary of every chunk that was produced.
    import sys

    logging.basicConfig(level=logging.DEBUG)

    if len(sys.argv) < 2:
        print("Usage: python enhanced_ast_chunker.py <file_path>")
        sys.exit(1)

    test_file = Path(sys.argv[1])
    banner = "=" * 60

    print(f"\nTesting EnhancedASTChunker on: {test_file}")
    print(banner)

    chunker = EnhancedASTChunker(context_mode="full")

    try:
        chunks = chunker.chunk_file(test_file)

        print(f"\nFound {len(chunks)} chunks:\n")

        for number, chunk in enumerate(chunks, start=1):
            print(banner)
            print(f"Chunk {number}: {chunk.chunk_type} - {chunk.name or '(unnamed)'}")
            print(banner)

            # line_range is 0-based; show it 1-based.
            first_line = chunk.line_range[0] + 1
            last_line = chunk.line_range[1] + 1
            print(f"Lines: {first_line}-{last_line}")
            print(f"Scope: {' > '.join(chunk.scope) or '(module level)'}")

            if chunk.signature:
                print(f"Signature: {chunk.signature}")
            if chunk.parameters:
                print(f"Parameters: {len(chunk.parameters)}")
                for p in chunk.parameters[:3]:
                    print(f"  - {p.name}: {p.type_annotation or 'untyped'}")
            if chunk.return_type:
                print(f"Return type: {chunk.return_type}")
            if chunk.imports:
                print(f"Uses imports: {[i.name for i in chunk.imports[:5]]}")
            if chunk.docstring:
                print(f"Docstring: {chunk.docstring[:80]}...")

            print("\nContextualized text preview:")
            print("-" * 40)
            preview = chunk.contextualized_text
            print(preview[:500])
            if len(preview) > 500:
                print("...")
            print()

    except Exception as e:
        # Top-level boundary of a debugging script: report and exit non-zero.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)