code-finder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_context/__init__.py +33 -0
- claude_context/agentic_integration.py +309 -0
- claude_context/ast_chunker.py +646 -0
- claude_context/config.py +239 -0
- claude_context/context_manager.py +627 -0
- claude_context/embeddings.py +307 -0
- claude_context/embeddings_interface.py +226 -0
- claude_context/enhanced_ast_chunker.py +1129 -0
- claude_context/explorer.py +951 -0
- claude_context/explorer_with_context.py +1008 -0
- claude_context/indexer.py +893 -0
- claude_context/markdown_chunker.py +421 -0
- claude_context/mode_handler.py +1774 -0
- claude_context/query_metrics.py +164 -0
- claude_context/question_generator.py +800 -0
- claude_context/readme_extractor.py +485 -0
- claude_context/repository_adapter.py +399 -0
- claude_context/search.py +493 -0
- claude_context/skills/__init__.py +11 -0
- claude_context/skills/_cli_common.py +74 -0
- claude_context/skills/_index_manager.py +98 -0
- claude_context/skills/api_surface.py +219 -0
- claude_context/skills/evidence_retrieval.py +151 -0
- claude_context/skills/grounded_review.py +212 -0
- claude_context/synthesis/__init__.py +8 -0
- claude_context/synthesis/editor_agent.py +391 -0
- claude_context/synthesis/llm_synthesizer.py +153 -0
- claude_context/synthesis/logic_explainer.py +235 -0
- claude_context/synthesis/multi_review_pipeline.py +717 -0
- claude_context/synthesis/prompt_builder.py +439 -0
- claude_context/synthesis/providers.py +115 -0
- claude_context/synthesis/validators.py +458 -0
- code_finder-0.1.0.dist-info/METADATA +823 -0
- code_finder-0.1.0.dist-info/RECORD +37 -0
- code_finder-0.1.0.dist-info/WHEEL +5 -0
- code_finder-0.1.0.dist-info/entry_points.txt +4 -0
- code_finder-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enhanced AST-based Code Chunker with Rich Metadata Extraction
|
|
3
|
+
|
|
4
|
+
This module ports key features from supermemoryai/code-chunk to provide
|
|
5
|
+
richer semantic metadata for code chunks, improving retrieval and
|
|
6
|
+
documentation generation quality.
|
|
7
|
+
|
|
8
|
+
Key enhancements over basic ast_chunker:
|
|
9
|
+
- Full scope chain tracking (class > method > nested function)
|
|
10
|
+
- Complete signature extraction with parameters and types
|
|
11
|
+
- Import dependency tracking per chunk
|
|
12
|
+
- Contextualized text generation for LLM consumption
|
|
13
|
+
|
|
14
|
+
Supports: Python, JavaScript, TypeScript, Go
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
import logging
|
|
19
|
+
from typing import List, Dict, Any, Optional, Tuple, Set
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Try to import tree-sitter dependencies
|
|
26
|
+
try:
|
|
27
|
+
import tree_sitter_python as tspython
|
|
28
|
+
import tree_sitter_javascript as tsjavascript
|
|
29
|
+
import tree_sitter_typescript as tstypescript
|
|
30
|
+
import tree_sitter_go as tsgo
|
|
31
|
+
from tree_sitter import Language, Parser, Node
|
|
32
|
+
HAS_TREE_SITTER = True
|
|
33
|
+
logger.info("Tree-sitter available for enhanced AST parsing")
|
|
34
|
+
except ImportError as e:
|
|
35
|
+
HAS_TREE_SITTER = False
|
|
36
|
+
Node = Any # Type hint fallback
|
|
37
|
+
logger.warning(f"Tree-sitter not available: {e}")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class ImportInfo:
    """Record describing one name brought into a file by an import statement.

    Attributes:
        module: The module being imported from.
        name: The specific name imported (or the module name for simple
            imports).
        alias: Alias when an ``as X`` clause is used, otherwise ``None``.
        is_from_import: ``True`` for ``from X import Y`` style imports.
        line: Line number associated with the import (defaults to 0).
    """

    module: str
    name: str
    alias: Optional[str] = None
    is_from_import: bool = False
    line: int = 0
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class ParameterInfo:
    """Description of a single function parameter.

    Attributes:
        name: Parameter name.
        type_annotation: Source text of the type annotation, if any.
        default_value: Source text of the default value, if any.
        is_variadic: ``True`` for a ``*args`` style parameter.
        is_keyword: ``True`` for a ``**kwargs`` style parameter.
    """

    name: str
    type_annotation: Optional[str] = None
    default_value: Optional[str] = None
    is_variadic: bool = False
    is_keyword: bool = False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class EnhancedCodeChunk:
    """
    A code chunk bundled with the metadata used for retrieval and prompting.

    Beyond the raw text it carries:
    - the full scope chain (hierarchical context)
    - the complete function signature and parameter list
    - the imports the chunk depends on
    - a contextualized rendering of the text for LLM consumption

    ``size_chars`` is not a constructor argument; it is derived from
    ``text`` after initialisation as the number of non-whitespace characters.
    """

    # -- core content --
    text: str                    # raw code text
    contextualized_text: str     # code with context prepended

    # -- location --
    line_range: Tuple[int, int]  # (start, end), 0-indexed
    byte_range: Tuple[int, int]  # (start, end)

    # -- entity identification --
    chunk_type: str              # function, class, method, import_block, module, type
    name: Optional[str] = None
    signature: Optional[str] = None
    docstring: Optional[str] = None

    # -- parameters (functions/methods only) --
    parameters: List[ParameterInfo] = field(default_factory=list)
    return_type: Optional[str] = None

    # -- hierarchical context: full scope chain --
    scope: List[str] = field(default_factory=list)

    # -- dependencies: imports used by this chunk --
    imports: List[ImportInfo] = field(default_factory=list)

    # -- metadata --
    language: str = ""
    filepath: str = ""
    is_partial: bool = False     # True if the chunk was split due to size

    # -- size tracking --
    size_chars: int = field(default=0, init=False)

    def __post_init__(self):
        """Fill in ``size_chars`` from ``text`` when it is still zero."""
        if self.size_chars != 0:
            return
        without_whitespace = re.sub(r'\s', '', self.text)
        self.size_chars = len(without_whitespace)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the chunk for storage/serialization.

        Parameters and imports are flattened into lists of small dicts;
        every other field is copied through as-is.
        """
        parameter_rows = []
        for param in self.parameters:
            parameter_rows.append({
                "name": param.name,
                "type": param.type_annotation,
                "default": param.default_value,
                "is_variadic": param.is_variadic,
                "is_keyword": param.is_keyword,
            })

        import_rows = []
        for imp in self.imports:
            import_rows.append({
                "module": imp.module,
                "name": imp.name,
                "alias": imp.alias,
            })

        payload: Dict[str, Any] = {}
        payload["text"] = self.text
        payload["contextualized_text"] = self.contextualized_text
        payload["line_range"] = self.line_range
        payload["byte_range"] = self.byte_range
        payload["chunk_type"] = self.chunk_type
        payload["name"] = self.name
        payload["signature"] = self.signature
        payload["docstring"] = self.docstring
        payload["parameters"] = parameter_rows
        payload["return_type"] = self.return_type
        payload["scope"] = self.scope
        payload["imports"] = import_rows
        payload["language"] = self.language
        payload["filepath"] = self.filepath
        payload["is_partial"] = self.is_partial
        payload["size_chars"] = self.size_chars
        return payload
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class EnhancedASTChunker:
|
|
146
|
+
"""
|
|
147
|
+
Enhanced AST chunker with rich metadata extraction.
|
|
148
|
+
|
|
149
|
+
Ports key features from supermemoryai/code-chunk:
|
|
150
|
+
- Scope chain tracking
|
|
151
|
+
- Signature extraction
|
|
152
|
+
- Import dependency linking
|
|
153
|
+
- Contextualized text generation
|
|
154
|
+
"""
|
|
155
|
+
|
|
156
|
+
# Node types that represent scope boundaries
|
|
157
|
+
SCOPE_NODES = {
|
|
158
|
+
'python': ['class_definition', 'function_definition', 'async_function_definition'],
|
|
159
|
+
'javascript': ['class_declaration', 'function_declaration', 'arrow_function', 'method_definition'],
|
|
160
|
+
'typescript': ['class_declaration', 'function_declaration', 'arrow_function', 'method_definition',
|
|
161
|
+
'interface_declaration'],
|
|
162
|
+
'go': ['function_declaration', 'method_declaration', 'type_declaration']
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
# Node types that should become chunks
|
|
166
|
+
CHUNK_NODES = {
|
|
167
|
+
'python': [
|
|
168
|
+
'function_definition', 'async_function_definition', 'class_definition',
|
|
169
|
+
'decorated_definition'
|
|
170
|
+
],
|
|
171
|
+
'javascript': [
|
|
172
|
+
'function_declaration', 'class_declaration', 'method_definition',
|
|
173
|
+
'arrow_function', 'export_statement'
|
|
174
|
+
],
|
|
175
|
+
'typescript': [
|
|
176
|
+
'function_declaration', 'class_declaration', 'method_definition',
|
|
177
|
+
'arrow_function', 'interface_declaration', 'type_alias_declaration',
|
|
178
|
+
'export_statement'
|
|
179
|
+
],
|
|
180
|
+
'go': [
|
|
181
|
+
'function_declaration', 'method_declaration', 'type_declaration'
|
|
182
|
+
]
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
def __init__(
    self,
    max_chunk_size: int = 1500,
    context_mode: str = "full",
    merge_small_chunks: bool = False,
    min_chunk_size: int = 100
):
    """
    Set up chunking options and, when tree-sitter is importable, the parsers.

    Args:
        max_chunk_size: Maximum chunk size in characters
        context_mode: Context detail level ('none', 'minimal', 'full')
        merge_small_chunks: Whether to merge adjacent small chunks
        min_chunk_size: Minimum chunk size for merging
    """
    # Parser registry stays empty unless tree-sitter is available.
    self.parsers: Dict[str, Parser] = {}
    self.supported_languages = ['python', 'javascript', 'typescript', 'go']

    # Store the chunking options verbatim.
    self.max_chunk_size = max_chunk_size
    self.min_chunk_size = min_chunk_size
    self.merge_small_chunks = merge_small_chunks
    self.context_mode = context_mode

    if not HAS_TREE_SITTER:
        return
    self._initialize_parsers()
|
|
211
|
+
|
|
212
|
+
def _initialize_parsers(self) -> None:
    """Build one tree-sitter parser per supported language.

    Each language is set up independently: a failure is logged as a
    warning and the remaining languages are still attempted, so
    ``self.parsers`` only contains the languages that initialised.
    """
    # Language name -> zero-argument callable returning the grammar handle.
    grammar_factories = {
        'python': tspython.language,
        'javascript': tsjavascript.language,
        # The TSX grammar is the one registered for 'typescript'.
        'typescript': lambda: tstypescript.language_tsx(),
        'go': tsgo.language,
    }

    for lang, lang_fn in grammar_factories.items():
        try:
            grammar = Language(lang_fn())
            self.parsers[lang] = Parser(grammar)
            logger.debug(f"{lang} parser initialized")
        except Exception as e:
            logger.warning(f"Failed to initialize {lang} parser: {e}")
|
|
228
|
+
|
|
229
|
+
def chunk_file(
    self,
    file_path: Path,
    content: Optional[str] = None
) -> List[EnhancedCodeChunk]:
    """
    Split a source file into chunks carrying rich metadata.

    AST-based chunking is used when tree-sitter is available and a parser
    exists for the detected language; otherwise, or when AST chunking
    raises, the fallback chunker is used instead.

    Args:
        file_path: Path to the source file
        content: Optional pre-loaded content; read from disk when ``None``

    Returns:
        List of EnhancedCodeChunk objects (empty if the file cannot be read)
    """
    file_path = Path(file_path)

    if content is None:
        try:
            # Undecodable bytes are dropped rather than raising.
            content = file_path.read_text(encoding='utf-8', errors='ignore')
        except Exception as e:
            logger.error(f"Failed to read {file_path}: {e}")
            return []

    language = self._detect_language(file_path)
    ast_usable = HAS_TREE_SITTER and language in self.parsers

    if ast_usable:
        try:
            return self._chunk_with_ast(content, language, str(file_path))
        except Exception as e:
            logger.warning(f"AST parsing failed for {file_path}, using fallback: {e}")
    else:
        logger.debug(f"Using fallback chunking for {file_path}")

    return self._chunk_fallback(content, language, str(file_path))
|
|
264
|
+
|
|
265
|
+
def _detect_language(self, file_path: Path) -> str:
|
|
266
|
+
"""Detect programming language from file extension"""
|
|
267
|
+
ext_map = {
|
|
268
|
+
'.py': 'python',
|
|
269
|
+
'.js': 'javascript',
|
|
270
|
+
'.jsx': 'javascript',
|
|
271
|
+
'.ts': 'typescript',
|
|
272
|
+
'.tsx': 'typescript',
|
|
273
|
+
'.go': 'go'
|
|
274
|
+
}
|
|
275
|
+
return ext_map.get(file_path.suffix.lower(), 'unknown')
|
|
276
|
+
|
|
277
|
+
def _bs(self, node) -> str:
|
|
278
|
+
"""Slice source bytes by node byte offsets and decode to str.
|
|
279
|
+
|
|
280
|
+
Tree-sitter nodes report byte offsets, not character offsets.
|
|
281
|
+
Using ``source_str[start_byte:end_byte]`` gives wrong results
|
|
282
|
+
when the file contains multi-byte UTF-8 characters (emojis, etc.).
|
|
283
|
+
"""
|
|
284
|
+
return self._source_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='replace')
|
|
285
|
+
|
|
286
|
+
def _chunk_with_ast(
|
|
287
|
+
self,
|
|
288
|
+
content: str,
|
|
289
|
+
language: str,
|
|
290
|
+
filepath: str
|
|
291
|
+
) -> List[EnhancedCodeChunk]:
|
|
292
|
+
"""
|
|
293
|
+
Chunk code using tree-sitter AST parsing with rich metadata extraction.
|
|
294
|
+
"""
|
|
295
|
+
parser = self.parsers[language]
|
|
296
|
+
self._source_bytes = bytes(content, 'utf-8')
|
|
297
|
+
tree = parser.parse(self._source_bytes)
|
|
298
|
+
|
|
299
|
+
if not tree.root_node:
|
|
300
|
+
return self._chunk_fallback(content, language, filepath)
|
|
301
|
+
|
|
302
|
+
# First pass: parse all imports
|
|
303
|
+
all_imports = self._parse_all_imports(tree.root_node, content, language)
|
|
304
|
+
|
|
305
|
+
# Second pass: extract chunks with scope tracking
|
|
306
|
+
chunks = self._extract_chunks(
|
|
307
|
+
tree.root_node,
|
|
308
|
+
content,
|
|
309
|
+
language,
|
|
310
|
+
filepath,
|
|
311
|
+
scope=[],
|
|
312
|
+
all_imports=all_imports
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
# Generate contextualized text for each chunk
|
|
316
|
+
for chunk in chunks:
|
|
317
|
+
chunk.contextualized_text = self._build_contextualized_text(chunk)
|
|
318
|
+
|
|
319
|
+
return chunks
|
|
320
|
+
|
|
321
|
+
def _parse_all_imports(
|
|
322
|
+
self,
|
|
323
|
+
root: Node,
|
|
324
|
+
source: str,
|
|
325
|
+
language: str
|
|
326
|
+
) -> Dict[str, ImportInfo]:
|
|
327
|
+
"""
|
|
328
|
+
Parse all imports in the file and return a mapping of names to import info.
|
|
329
|
+
"""
|
|
330
|
+
imports: Dict[str, ImportInfo] = {}
|
|
331
|
+
|
|
332
|
+
if language == 'python':
|
|
333
|
+
self._parse_python_imports(root, source, imports)
|
|
334
|
+
elif language in ['javascript', 'typescript']:
|
|
335
|
+
self._parse_js_imports(root, source, imports)
|
|
336
|
+
elif language == 'go':
|
|
337
|
+
self._parse_go_imports(root, source, imports)
|
|
338
|
+
|
|
339
|
+
return imports
|
|
340
|
+
|
|
341
|
+
def _parse_python_imports(
|
|
342
|
+
self,
|
|
343
|
+
node: Node,
|
|
344
|
+
source: str,
|
|
345
|
+
imports: Dict[str, ImportInfo]
|
|
346
|
+
) -> None:
|
|
347
|
+
"""Parse Python import statements"""
|
|
348
|
+
if node.type == 'import_statement':
|
|
349
|
+
# import X, Y, Z
|
|
350
|
+
for child in node.children:
|
|
351
|
+
if child.type == 'dotted_name':
|
|
352
|
+
name = self._bs(child)
|
|
353
|
+
imports[name] = ImportInfo(
|
|
354
|
+
module=name,
|
|
355
|
+
name=name,
|
|
356
|
+
line=child.start_point[0]
|
|
357
|
+
)
|
|
358
|
+
elif child.type == 'aliased_import':
|
|
359
|
+
# import X as Y
|
|
360
|
+
dotted = None
|
|
361
|
+
alias = None
|
|
362
|
+
for c in child.children:
|
|
363
|
+
if c.type == 'dotted_name':
|
|
364
|
+
dotted = self._bs(c)
|
|
365
|
+
elif c.type == 'identifier':
|
|
366
|
+
alias = self._bs(c)
|
|
367
|
+
if dotted:
|
|
368
|
+
imports[alias or dotted] = ImportInfo(
|
|
369
|
+
module=dotted,
|
|
370
|
+
name=dotted,
|
|
371
|
+
alias=alias,
|
|
372
|
+
line=child.start_point[0]
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
elif node.type == 'import_from_statement':
|
|
376
|
+
# from X import Y, Z
|
|
377
|
+
module = None
|
|
378
|
+
for child in node.children:
|
|
379
|
+
if child.type == 'dotted_name':
|
|
380
|
+
module = self._bs(child)
|
|
381
|
+
elif child.type == 'identifier' and module:
|
|
382
|
+
name = self._bs(child)
|
|
383
|
+
imports[name] = ImportInfo(
|
|
384
|
+
module=module,
|
|
385
|
+
name=name,
|
|
386
|
+
is_from_import=True,
|
|
387
|
+
line=child.start_point[0]
|
|
388
|
+
)
|
|
389
|
+
elif child.type == 'aliased_import':
|
|
390
|
+
original = None
|
|
391
|
+
alias = None
|
|
392
|
+
for c in child.children:
|
|
393
|
+
if c.type == 'identifier':
|
|
394
|
+
if original is None:
|
|
395
|
+
original = self._bs(c)
|
|
396
|
+
else:
|
|
397
|
+
alias = self._bs(c)
|
|
398
|
+
if original and module:
|
|
399
|
+
imports[alias or original] = ImportInfo(
|
|
400
|
+
module=module,
|
|
401
|
+
name=original,
|
|
402
|
+
alias=alias,
|
|
403
|
+
is_from_import=True,
|
|
404
|
+
line=child.start_point[0]
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
# Recurse
|
|
408
|
+
for child in node.children:
|
|
409
|
+
self._parse_python_imports(child, source, imports)
|
|
410
|
+
|
|
411
|
+
def _parse_js_imports(
|
|
412
|
+
self,
|
|
413
|
+
node: Node,
|
|
414
|
+
source: str,
|
|
415
|
+
imports: Dict[str, ImportInfo]
|
|
416
|
+
) -> None:
|
|
417
|
+
"""Parse JavaScript/TypeScript import statements"""
|
|
418
|
+
if node.type == 'import_statement':
|
|
419
|
+
module = None
|
|
420
|
+
# Find the module string
|
|
421
|
+
for child in node.children:
|
|
422
|
+
if child.type == 'string':
|
|
423
|
+
module = self._bs(child).strip("'\"")
|
|
424
|
+
|
|
425
|
+
if module:
|
|
426
|
+
# Find imported names
|
|
427
|
+
for child in node.children:
|
|
428
|
+
if child.type == 'import_clause':
|
|
429
|
+
self._extract_js_import_names(child, source, module, imports, node.start_point[0])
|
|
430
|
+
|
|
431
|
+
# Recurse
|
|
432
|
+
for child in node.children:
|
|
433
|
+
self._parse_js_imports(child, source, imports)
|
|
434
|
+
|
|
435
|
+
def _extract_js_import_names(
|
|
436
|
+
self,
|
|
437
|
+
node: Node,
|
|
438
|
+
source: str,
|
|
439
|
+
module: str,
|
|
440
|
+
imports: Dict[str, ImportInfo],
|
|
441
|
+
line: int
|
|
442
|
+
) -> None:
|
|
443
|
+
"""Extract names from JS import clause"""
|
|
444
|
+
for child in node.children:
|
|
445
|
+
if child.type == 'identifier':
|
|
446
|
+
# Default import
|
|
447
|
+
name = self._bs(child)
|
|
448
|
+
imports[name] = ImportInfo(module=module, name=name, line=line)
|
|
449
|
+
elif child.type == 'named_imports':
|
|
450
|
+
# { X, Y as Z }
|
|
451
|
+
for spec in child.children:
|
|
452
|
+
if spec.type == 'import_specifier':
|
|
453
|
+
original = None
|
|
454
|
+
alias = None
|
|
455
|
+
for c in spec.children:
|
|
456
|
+
if c.type == 'identifier':
|
|
457
|
+
if original is None:
|
|
458
|
+
original = self._bs(c)
|
|
459
|
+
else:
|
|
460
|
+
alias = self._bs(c)
|
|
461
|
+
if original:
|
|
462
|
+
imports[alias or original] = ImportInfo(
|
|
463
|
+
module=module,
|
|
464
|
+
name=original,
|
|
465
|
+
alias=alias,
|
|
466
|
+
is_from_import=True,
|
|
467
|
+
line=line
|
|
468
|
+
)
|
|
469
|
+
elif child.type == 'namespace_import':
|
|
470
|
+
# import * as X
|
|
471
|
+
for c in child.children:
|
|
472
|
+
if c.type == 'identifier':
|
|
473
|
+
name = self._bs(c)
|
|
474
|
+
imports[name] = ImportInfo(module=module, name='*', alias=name, line=line)
|
|
475
|
+
|
|
476
|
+
def _parse_go_imports(
|
|
477
|
+
self,
|
|
478
|
+
node: Node,
|
|
479
|
+
source: str,
|
|
480
|
+
imports: Dict[str, ImportInfo]
|
|
481
|
+
) -> None:
|
|
482
|
+
"""Parse Go import statements"""
|
|
483
|
+
if node.type == 'import_declaration':
|
|
484
|
+
for child in node.children:
|
|
485
|
+
if child.type == 'import_spec':
|
|
486
|
+
pkg = None
|
|
487
|
+
alias = None
|
|
488
|
+
for c in child.children:
|
|
489
|
+
if c.type == 'interpreted_string_literal':
|
|
490
|
+
pkg = self._bs(c).strip('"')
|
|
491
|
+
elif c.type == 'package_identifier' or c.type == 'identifier':
|
|
492
|
+
alias = self._bs(c)
|
|
493
|
+
if pkg:
|
|
494
|
+
# Use last part of package path as name
|
|
495
|
+
name = pkg.split('/')[-1]
|
|
496
|
+
imports[alias or name] = ImportInfo(
|
|
497
|
+
module=pkg,
|
|
498
|
+
name=name,
|
|
499
|
+
alias=alias,
|
|
500
|
+
line=child.start_point[0]
|
|
501
|
+
)
|
|
502
|
+
elif child.type == 'import_spec_list':
|
|
503
|
+
self._parse_go_imports(child, source, imports)
|
|
504
|
+
|
|
505
|
+
for child in node.children:
|
|
506
|
+
self._parse_go_imports(child, source, imports)
|
|
507
|
+
|
|
508
|
+
def _extract_chunks(
|
|
509
|
+
self,
|
|
510
|
+
node: Node,
|
|
511
|
+
source: str,
|
|
512
|
+
language: str,
|
|
513
|
+
filepath: str,
|
|
514
|
+
scope: List[str],
|
|
515
|
+
all_imports: Dict[str, ImportInfo]
|
|
516
|
+
) -> List[EnhancedCodeChunk]:
|
|
517
|
+
"""
|
|
518
|
+
Recursively extract chunks from AST with scope tracking.
|
|
519
|
+
"""
|
|
520
|
+
chunks = []
|
|
521
|
+
chunk_node_types = self.CHUNK_NODES.get(language, [])
|
|
522
|
+
scope_node_types = self.SCOPE_NODES.get(language, [])
|
|
523
|
+
|
|
524
|
+
# Check if this node should be a chunk
|
|
525
|
+
if node.type in chunk_node_types:
|
|
526
|
+
chunk = self._create_chunk(node, source, language, filepath, scope, all_imports)
|
|
527
|
+
if chunk:
|
|
528
|
+
chunks.append(chunk)
|
|
529
|
+
|
|
530
|
+
# Update scope for children if this is a scope boundary
|
|
531
|
+
if node.type in scope_node_types and chunk.name:
|
|
532
|
+
scope = scope + [chunk.name]
|
|
533
|
+
|
|
534
|
+
# Process children
|
|
535
|
+
for child in node.children:
|
|
536
|
+
child_chunks = self._extract_chunks(
|
|
537
|
+
child, source, language, filepath, scope, all_imports
|
|
538
|
+
)
|
|
539
|
+
chunks.extend(child_chunks)
|
|
540
|
+
|
|
541
|
+
return chunks
|
|
542
|
+
|
|
543
|
+
def _create_chunk(
|
|
544
|
+
self,
|
|
545
|
+
node: Node,
|
|
546
|
+
source: str,
|
|
547
|
+
language: str,
|
|
548
|
+
filepath: str,
|
|
549
|
+
scope: List[str],
|
|
550
|
+
all_imports: Dict[str, ImportInfo]
|
|
551
|
+
) -> Optional[EnhancedCodeChunk]:
|
|
552
|
+
"""Create an EnhancedCodeChunk from an AST node"""
|
|
553
|
+
text = self._bs(node)
|
|
554
|
+
if not text.strip():
|
|
555
|
+
return None
|
|
556
|
+
|
|
557
|
+
# Determine chunk type and extract name
|
|
558
|
+
chunk_type, name = self._analyze_node(node, source, language)
|
|
559
|
+
|
|
560
|
+
# Extract signature for functions/methods
|
|
561
|
+
signature = None
|
|
562
|
+
parameters = []
|
|
563
|
+
return_type = None
|
|
564
|
+
if chunk_type in ['function', 'method']:
|
|
565
|
+
sig_info = self._extract_signature(node, source, language)
|
|
566
|
+
signature = sig_info.get('signature')
|
|
567
|
+
parameters = sig_info.get('parameters', [])
|
|
568
|
+
return_type = sig_info.get('return_type')
|
|
569
|
+
|
|
570
|
+
# Extract docstring
|
|
571
|
+
docstring = self._extract_docstring(node, source, language)
|
|
572
|
+
|
|
573
|
+
# Find which imports are used in this chunk
|
|
574
|
+
used_imports = self._find_used_imports(text, all_imports)
|
|
575
|
+
|
|
576
|
+
return EnhancedCodeChunk(
|
|
577
|
+
text=text,
|
|
578
|
+
contextualized_text="", # Will be filled later
|
|
579
|
+
line_range=(node.start_point[0], node.end_point[0]),
|
|
580
|
+
byte_range=(node.start_byte, node.end_byte),
|
|
581
|
+
chunk_type=chunk_type,
|
|
582
|
+
name=name,
|
|
583
|
+
signature=signature,
|
|
584
|
+
docstring=docstring,
|
|
585
|
+
parameters=parameters,
|
|
586
|
+
return_type=return_type,
|
|
587
|
+
scope=scope.copy(),
|
|
588
|
+
imports=used_imports,
|
|
589
|
+
language=language,
|
|
590
|
+
filepath=filepath
|
|
591
|
+
)
|
|
592
|
+
|
|
593
|
+
def _analyze_node(
|
|
594
|
+
self,
|
|
595
|
+
node: Node,
|
|
596
|
+
source: str,
|
|
597
|
+
language: str
|
|
598
|
+
) -> Tuple[str, Optional[str]]:
|
|
599
|
+
"""Determine chunk type and extract name from node"""
|
|
600
|
+
node_type = node.type
|
|
601
|
+
name = None
|
|
602
|
+
|
|
603
|
+
# Python
|
|
604
|
+
if language == 'python':
|
|
605
|
+
if node_type in ['function_definition', 'async_function_definition']:
|
|
606
|
+
name = self._get_identifier(node, source)
|
|
607
|
+
return ('function', name)
|
|
608
|
+
elif node_type == 'class_definition':
|
|
609
|
+
name = self._get_identifier(node, source)
|
|
610
|
+
return ('class', name)
|
|
611
|
+
elif node_type == 'decorated_definition':
|
|
612
|
+
# Get the actual definition inside
|
|
613
|
+
for child in node.children:
|
|
614
|
+
if child.type in ['function_definition', 'async_function_definition']:
|
|
615
|
+
return self._analyze_node(child, source, language)
|
|
616
|
+
elif child.type == 'class_definition':
|
|
617
|
+
return self._analyze_node(child, source, language)
|
|
618
|
+
|
|
619
|
+
# JavaScript/TypeScript
|
|
620
|
+
elif language in ['javascript', 'typescript']:
|
|
621
|
+
if node_type == 'function_declaration':
|
|
622
|
+
name = self._get_identifier(node, source)
|
|
623
|
+
return ('function', name)
|
|
624
|
+
elif node_type == 'class_declaration':
|
|
625
|
+
name = self._get_identifier(node, source)
|
|
626
|
+
return ('class', name)
|
|
627
|
+
elif node_type == 'method_definition':
|
|
628
|
+
name = self._get_property_identifier(node, source)
|
|
629
|
+
return ('method', name)
|
|
630
|
+
elif node_type == 'arrow_function':
|
|
631
|
+
return ('function', 'arrow_function')
|
|
632
|
+
elif node_type == 'interface_declaration':
|
|
633
|
+
name = self._get_identifier(node, source)
|
|
634
|
+
return ('interface', name)
|
|
635
|
+
elif node_type == 'type_alias_declaration':
|
|
636
|
+
name = self._get_identifier(node, source)
|
|
637
|
+
return ('type', name)
|
|
638
|
+
|
|
639
|
+
# Go
|
|
640
|
+
elif language == 'go':
|
|
641
|
+
if node_type == 'function_declaration':
|
|
642
|
+
name = self._get_identifier(node, source)
|
|
643
|
+
return ('function', name)
|
|
644
|
+
elif node_type == 'method_declaration':
|
|
645
|
+
name = self._get_field_identifier(node, source)
|
|
646
|
+
return ('method', name)
|
|
647
|
+
elif node_type == 'type_declaration':
|
|
648
|
+
name = self._get_type_identifier(node, source)
|
|
649
|
+
return ('type', name)
|
|
650
|
+
|
|
651
|
+
return ('block', None)
|
|
652
|
+
|
|
653
|
+
def _get_identifier(self, node: Node, source: str) -> Optional[str]:
|
|
654
|
+
"""Get identifier name from node's children"""
|
|
655
|
+
for child in node.children:
|
|
656
|
+
if child.type == 'identifier':
|
|
657
|
+
return self._bs(child)
|
|
658
|
+
return None
|
|
659
|
+
|
|
660
|
+
def _get_property_identifier(self, node: Node, source: str) -> Optional[str]:
|
|
661
|
+
"""Get property identifier for JS methods"""
|
|
662
|
+
for child in node.children:
|
|
663
|
+
if child.type == 'property_identifier':
|
|
664
|
+
return self._bs(child)
|
|
665
|
+
return None
|
|
666
|
+
|
|
667
|
+
def _get_field_identifier(self, node: Node, source: str) -> Optional[str]:
|
|
668
|
+
"""Get field identifier for Go methods"""
|
|
669
|
+
for child in node.children:
|
|
670
|
+
if child.type == 'field_identifier':
|
|
671
|
+
return self._bs(child)
|
|
672
|
+
return None
|
|
673
|
+
|
|
674
|
+
def _get_type_identifier(self, node: Node, source: str) -> Optional[str]:
|
|
675
|
+
"""Get type identifier for Go type declarations"""
|
|
676
|
+
for child in node.children:
|
|
677
|
+
if child.type == 'type_spec':
|
|
678
|
+
for c in child.children:
|
|
679
|
+
if c.type == 'type_identifier':
|
|
680
|
+
return self._bs(c)
|
|
681
|
+
return None
|
|
682
|
+
|
|
683
|
+
def _extract_signature(
|
|
684
|
+
self,
|
|
685
|
+
node: Node,
|
|
686
|
+
source: str,
|
|
687
|
+
language: str
|
|
688
|
+
) -> Dict[str, Any]:
|
|
689
|
+
"""Extract complete function signature"""
|
|
690
|
+
result = {'signature': None, 'parameters': [], 'return_type': None}
|
|
691
|
+
|
|
692
|
+
if language == 'python':
|
|
693
|
+
result = self._extract_python_signature(node, source)
|
|
694
|
+
elif language in ['javascript', 'typescript']:
|
|
695
|
+
result = self._extract_js_signature(node, source)
|
|
696
|
+
elif language == 'go':
|
|
697
|
+
result = self._extract_go_signature(node, source)
|
|
698
|
+
|
|
699
|
+
return result
|
|
700
|
+
|
|
701
|
+
def _extract_python_signature(self, node: Node, source: str) -> Dict[str, Any]:
|
|
702
|
+
"""Extract Python function signature"""
|
|
703
|
+
name = self._get_identifier(node, source)
|
|
704
|
+
parameters = []
|
|
705
|
+
return_type = None
|
|
706
|
+
|
|
707
|
+
for child in node.children:
|
|
708
|
+
if child.type == 'parameters':
|
|
709
|
+
parameters = self._parse_python_parameters(child, source)
|
|
710
|
+
elif child.type == 'type':
|
|
711
|
+
return_type = self._bs(child)
|
|
712
|
+
|
|
713
|
+
# Build signature string
|
|
714
|
+
param_str = ', '.join(self._format_parameter(p) for p in parameters)
|
|
715
|
+
sig = f"def {name}({param_str})"
|
|
716
|
+
if return_type:
|
|
717
|
+
sig += f" -> {return_type}"
|
|
718
|
+
|
|
719
|
+
return {
|
|
720
|
+
'signature': sig,
|
|
721
|
+
'parameters': parameters,
|
|
722
|
+
'return_type': return_type
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
def _parse_python_parameters(self, params_node: Node, source: str) -> List[ParameterInfo]:
|
|
726
|
+
"""Parse Python function parameters"""
|
|
727
|
+
parameters = []
|
|
728
|
+
|
|
729
|
+
for child in params_node.children:
|
|
730
|
+
if child.type == 'identifier':
|
|
731
|
+
parameters.append(ParameterInfo(
|
|
732
|
+
name=self._bs(child)
|
|
733
|
+
))
|
|
734
|
+
elif child.type == 'typed_parameter':
|
|
735
|
+
name = None
|
|
736
|
+
type_ann = None
|
|
737
|
+
for c in child.children:
|
|
738
|
+
if c.type == 'identifier':
|
|
739
|
+
name = self._bs(c)
|
|
740
|
+
elif c.type == 'type':
|
|
741
|
+
type_ann = self._bs(c)
|
|
742
|
+
if name:
|
|
743
|
+
parameters.append(ParameterInfo(name=name, type_annotation=type_ann))
|
|
744
|
+
elif child.type == 'default_parameter':
|
|
745
|
+
name = None
|
|
746
|
+
type_ann = None
|
|
747
|
+
default = None
|
|
748
|
+
for c in child.children:
|
|
749
|
+
if c.type == 'identifier':
|
|
750
|
+
name = self._bs(c)
|
|
751
|
+
elif c.type == 'type':
|
|
752
|
+
type_ann = self._bs(c)
|
|
753
|
+
elif c.type not in ['identifier', 'type', '=', ':']:
|
|
754
|
+
default = self._bs(c)
|
|
755
|
+
if name:
|
|
756
|
+
parameters.append(ParameterInfo(
|
|
757
|
+
name=name,
|
|
758
|
+
type_annotation=type_ann,
|
|
759
|
+
default_value=default
|
|
760
|
+
))
|
|
761
|
+
elif child.type == 'typed_default_parameter':
|
|
762
|
+
name = None
|
|
763
|
+
type_ann = None
|
|
764
|
+
default = None
|
|
765
|
+
for c in child.children:
|
|
766
|
+
if c.type == 'identifier':
|
|
767
|
+
name = self._bs(c)
|
|
768
|
+
elif c.type == 'type':
|
|
769
|
+
type_ann = self._bs(c)
|
|
770
|
+
elif c.type not in ['identifier', 'type', '=', ':']:
|
|
771
|
+
default = self._bs(c)
|
|
772
|
+
if name:
|
|
773
|
+
parameters.append(ParameterInfo(
|
|
774
|
+
name=name,
|
|
775
|
+
type_annotation=type_ann,
|
|
776
|
+
default_value=default
|
|
777
|
+
))
|
|
778
|
+
elif child.type == 'list_splat_pattern':
|
|
779
|
+
name = self._get_identifier(child, source)
|
|
780
|
+
if name:
|
|
781
|
+
parameters.append(ParameterInfo(name=name, is_variadic=True))
|
|
782
|
+
elif child.type == 'dictionary_splat_pattern':
|
|
783
|
+
name = self._get_identifier(child, source)
|
|
784
|
+
if name:
|
|
785
|
+
parameters.append(ParameterInfo(name=name, is_keyword=True))
|
|
786
|
+
|
|
787
|
+
return parameters
|
|
788
|
+
|
|
789
|
+
def _extract_js_signature(self, node: Node, source: str) -> Dict[str, Any]:
|
|
790
|
+
"""Extract JavaScript/TypeScript function signature"""
|
|
791
|
+
name = self._get_identifier(node, source) or self._get_property_identifier(node, source)
|
|
792
|
+
parameters = []
|
|
793
|
+
return_type = None
|
|
794
|
+
|
|
795
|
+
for child in node.children:
|
|
796
|
+
if child.type == 'formal_parameters':
|
|
797
|
+
parameters = self._parse_js_parameters(child, source)
|
|
798
|
+
elif child.type == 'type_annotation':
|
|
799
|
+
return_type = self._bs(child).lstrip(': ')
|
|
800
|
+
|
|
801
|
+
# Build signature
|
|
802
|
+
param_str = ', '.join(self._format_parameter(p) for p in parameters)
|
|
803
|
+
sig = f"function {name or 'anonymous'}({param_str})"
|
|
804
|
+
if return_type:
|
|
805
|
+
sig += f": {return_type}"
|
|
806
|
+
|
|
807
|
+
return {
|
|
808
|
+
'signature': sig,
|
|
809
|
+
'parameters': parameters,
|
|
810
|
+
'return_type': return_type
|
|
811
|
+
}
|
|
812
|
+
|
|
813
|
+
def _parse_js_parameters(self, params_node: Node, source: str) -> List[ParameterInfo]:
    """Parse JavaScript/TypeScript function parameters.

    Recognised direct children of the parameter list:

    * ``identifier`` - a plain JavaScript parameter.
    * ``assignment_pattern`` - a JavaScript parameter with a default
      (``b = 1``); previously these were silently dropped.
    * ``required_parameter`` / ``optional_parameter`` - TypeScript
      parameters, optionally carrying a type annotation, a default value
      and/or a nested rest pattern (``...args: string[]``).
    * ``rest_pattern`` - a JavaScript rest parameter (``...args``).

    Within a parameter, everything before the ``=`` token describes the
    parameter itself and the first node after it is taken as the default.
    This keeps an identifier used as a default (``x = y``) from replacing
    the parameter name.

    Args:
        params_node: The parameter-list node whose children are scanned.
        source: Source text, forwarded to ``_get_identifier``.

    Returns:
        One ``ParameterInfo`` per parameter for which a name was found, in
        declaration order. Destructuring patterns have no single name and
        are skipped.
    """
    parameters = []

    for child in params_node.children:
        kind = child.type

        if kind == 'identifier':
            parameters.append(ParameterInfo(name=self._bs(child)))

        elif kind in ('required_parameter', 'optional_parameter', 'assignment_pattern'):
            name = None
            type_ann = None
            # Only pass optional fields that were actually found so that
            # ParameterInfo's own defaults apply otherwise.
            extras = {}
            after_equals = False
            for c in child.children:
                if c.type == '=':
                    after_equals = True
                elif after_equals:
                    # First real node after '=' is the default expression.
                    if 'default_value' not in extras and c.type != 'comment':
                        extras['default_value'] = self._bs(c)
                elif c.type == 'identifier':
                    if name is None:
                        name = self._bs(c)
                elif c.type == 'rest_pattern':
                    if name is None:
                        name = self._get_identifier(c, source)
                        extras['is_variadic'] = True
                elif c.type == 'type_annotation':
                    type_ann = self._bs(c).lstrip(': ')
            if name:
                parameters.append(ParameterInfo(
                    name=name,
                    type_annotation=type_ann,
                    **extras
                ))

        elif kind == 'rest_pattern':
            name = self._get_identifier(child, source)
            if name:
                parameters.append(ParameterInfo(name=name, is_variadic=True))

    return parameters
|
|
838
|
+
|
|
839
|
+
def _extract_go_signature(self, node: Node, source: str) -> Dict[str, Any]:
|
|
840
|
+
"""Extract Go function signature"""
|
|
841
|
+
name = self._get_identifier(node, source) or self._get_field_identifier(node, source)
|
|
842
|
+
parameters = []
|
|
843
|
+
return_type = None
|
|
844
|
+
|
|
845
|
+
for child in node.children:
|
|
846
|
+
if child.type == 'parameter_list':
|
|
847
|
+
parameters = self._parse_go_parameters(child, source)
|
|
848
|
+
elif child.type == 'result':
|
|
849
|
+
return_type = self._bs(child)
|
|
850
|
+
|
|
851
|
+
# Build signature
|
|
852
|
+
param_str = ', '.join(self._format_parameter(p) for p in parameters)
|
|
853
|
+
sig = f"func {name or 'anonymous'}({param_str})"
|
|
854
|
+
if return_type:
|
|
855
|
+
sig += f" {return_type}"
|
|
856
|
+
|
|
857
|
+
return {
|
|
858
|
+
'signature': sig,
|
|
859
|
+
'parameters': parameters,
|
|
860
|
+
'return_type': return_type
|
|
861
|
+
}
|
|
862
|
+
|
|
863
|
+
def _parse_go_parameters(self, params_node: Node, source: str) -> List[ParameterInfo]:
    """Parse Go function parameters.

    The parameter type is read from the declaration's ``type`` field rather
    than matched against a fixed list of node types, so qualified types
    (``context.Context``), function, channel, generic and struct types are
    captured as well. Variadic declarations (``args ...string``) are
    reported with ``is_variadic=True``.

    Args:
        params_node: The ``parameter_list`` node to scan.
        source: Source text (unused here; kept for a uniform helper
            signature).

    Returns:
        One ``ParameterInfo`` per declared name. A declaration such as
        ``a, b int`` yields two entries sharing the same type. Unnamed
        parameters produce no entry.
    """
    parameters = []

    for child in params_node.children:
        if child.type not in ('parameter_declaration', 'variadic_parameter_declaration'):
            continue

        type_node = child.child_by_field_name('type')
        type_ann = self._bs(type_node) if type_node is not None else None

        # Parameter names are plain `identifier` nodes; type names use
        # other node types (e.g. `type_identifier`), so this does not pick
        # up the type by accident.
        names = [self._bs(c) for c in child.children if c.type == 'identifier']

        if child.type == 'variadic_parameter_declaration':
            for name in names:
                parameters.append(ParameterInfo(
                    name=name,
                    type_annotation=type_ann,
                    is_variadic=True
                ))
        else:
            for name in names:
                parameters.append(ParameterInfo(name=name, type_annotation=type_ann))

    return parameters
|
|
881
|
+
|
|
882
|
+
def _format_parameter(self, param: ParameterInfo) -> str:
|
|
883
|
+
"""Format a parameter for signature display"""
|
|
884
|
+
prefix = ''
|
|
885
|
+
if param.is_variadic:
|
|
886
|
+
prefix = '*'
|
|
887
|
+
elif param.is_keyword:
|
|
888
|
+
prefix = '**'
|
|
889
|
+
|
|
890
|
+
result = f"{prefix}{param.name}"
|
|
891
|
+
if param.type_annotation:
|
|
892
|
+
result += f": {param.type_annotation}"
|
|
893
|
+
if param.default_value:
|
|
894
|
+
result += f" = {param.default_value}"
|
|
895
|
+
return result
|
|
896
|
+
|
|
897
|
+
def _extract_docstring(
|
|
898
|
+
self,
|
|
899
|
+
node: Node,
|
|
900
|
+
source: str,
|
|
901
|
+
language: str
|
|
902
|
+
) -> Optional[str]:
|
|
903
|
+
"""Extract docstring from a function/class node"""
|
|
904
|
+
if language == 'python':
|
|
905
|
+
# Look for string as first statement in body
|
|
906
|
+
for child in node.children:
|
|
907
|
+
if child.type == 'block':
|
|
908
|
+
for stmt in child.children:
|
|
909
|
+
if stmt.type == 'expression_statement':
|
|
910
|
+
for expr in stmt.children:
|
|
911
|
+
if expr.type == 'string':
|
|
912
|
+
docstring = self._bs(expr)
|
|
913
|
+
# Clean up the docstring
|
|
914
|
+
return docstring.strip('"""\'\'\'').strip()
|
|
915
|
+
elif stmt.type not in ['comment', 'pass_statement']:
|
|
916
|
+
break # Docstring must be first
|
|
917
|
+
break
|
|
918
|
+
elif language in ['javascript', 'typescript']:
|
|
919
|
+
# Look for JSDoc comment before the node
|
|
920
|
+
# This would require looking at preceding siblings or comments
|
|
921
|
+
pass
|
|
922
|
+
|
|
923
|
+
return None
|
|
924
|
+
|
|
925
|
+
def _find_used_imports(
|
|
926
|
+
self,
|
|
927
|
+
chunk_text: str,
|
|
928
|
+
all_imports: Dict[str, ImportInfo]
|
|
929
|
+
) -> List[ImportInfo]:
|
|
930
|
+
"""Find which imports are used in a chunk"""
|
|
931
|
+
used = []
|
|
932
|
+
# Simple word boundary check for each import name
|
|
933
|
+
for name, import_info in all_imports.items():
|
|
934
|
+
# Use word boundary to avoid partial matches
|
|
935
|
+
if re.search(rf'\b{re.escape(name)}\b', chunk_text):
|
|
936
|
+
used.append(import_info)
|
|
937
|
+
return used
|
|
938
|
+
|
|
939
|
+
def _build_contextualized_text(self, chunk: EnhancedCodeChunk) -> str:
|
|
940
|
+
"""
|
|
941
|
+
Build contextualized text with metadata prepended.
|
|
942
|
+
|
|
943
|
+
This format is optimized for LLM consumption and embedding.
|
|
944
|
+
"""
|
|
945
|
+
if self.context_mode == 'none':
|
|
946
|
+
return chunk.text
|
|
947
|
+
|
|
948
|
+
lines = []
|
|
949
|
+
|
|
950
|
+
# File path
|
|
951
|
+
lines.append(f"# {chunk.filepath}")
|
|
952
|
+
|
|
953
|
+
if self.context_mode in ['minimal', 'full']:
|
|
954
|
+
# Scope chain
|
|
955
|
+
if chunk.scope:
|
|
956
|
+
scope_str = ' > '.join(chunk.scope)
|
|
957
|
+
lines.append(f"# Scope: {scope_str}")
|
|
958
|
+
|
|
959
|
+
if self.context_mode == 'full':
|
|
960
|
+
# Signature
|
|
961
|
+
if chunk.signature:
|
|
962
|
+
lines.append(f"# Signature: {chunk.signature}")
|
|
963
|
+
|
|
964
|
+
# Used imports
|
|
965
|
+
if chunk.imports:
|
|
966
|
+
import_names = [f"{i.name}" for i in chunk.imports[:5]] # Limit to 5
|
|
967
|
+
if len(chunk.imports) > 5:
|
|
968
|
+
import_names.append(f"... +{len(chunk.imports) - 5} more")
|
|
969
|
+
lines.append(f"# Uses: {', '.join(import_names)}")
|
|
970
|
+
|
|
971
|
+
# Docstring summary (first line)
|
|
972
|
+
if chunk.docstring:
|
|
973
|
+
first_line = chunk.docstring.split('\n')[0][:100]
|
|
974
|
+
lines.append(f"# Doc: {first_line}")
|
|
975
|
+
|
|
976
|
+
lines.append("") # Blank line before code
|
|
977
|
+
lines.append(chunk.text)
|
|
978
|
+
|
|
979
|
+
return '\n'.join(lines)
|
|
980
|
+
|
|
981
|
+
def _chunk_fallback(
    self,
    content: str,
    language: str,
    filepath: str
) -> List[EnhancedCodeChunk]:
    """
    Fallback line-based chunking when tree-sitter is unavailable.

    Lines are accumulated into a pending chunk, which is closed *before*
    a line that

    * starts a new definition according to ``_is_chunk_boundary`` (so the
      ``def``/``class``/``func`` line opens the next chunk instead of
      being appended to the end of the previous one), or
    * would push the pending text past ``self.max_chunk_size`` characters.

    A Python definition line directly after a decorator line does not
    start a new chunk, which keeps decorators attached to what they
    decorate. A single line longer than ``max_chunk_size`` still becomes
    a chunk of its own. Whitespace-only chunks are discarded.

    Args:
        content: Full text of the file.
        language: Language identifier passed to ``_is_chunk_boundary``.
        filepath: Path recorded on each chunk and used in its header.

    Returns:
        Chunks of type ``'block'`` with 0-based inclusive ``line_range``;
        ``byte_range`` is not computed in fallback mode and is ``(0, 0)``.
    """
    lines = content.split('\n')
    chunks = []

    def emit(block_lines, first, last):
        # Close one chunk; runs that contain only whitespace are dropped.
        text = '\n'.join(block_lines)
        if text.strip():
            chunks.append(EnhancedCodeChunk(
                text=text,
                contextualized_text=f"# {filepath}\n\n{text}",
                line_range=(first, last),
                byte_range=(0, 0),  # Not calculated for fallback
                chunk_type='block',
                language=language,
                filepath=filepath
            ))

    pending = []
    pending_size = 0  # == len('\n'.join(pending)), maintained incrementally
    start = 0
    after_decorator = False

    for i, line in enumerate(lines):
        if pending:
            grown_size = pending_size + 1 + len(line)
            starts_definition = (
                not after_decorator
                and self._is_chunk_boundary(line, language)
            )
            if starts_definition or grown_size > self.max_chunk_size:
                emit(pending, start, i - 1)
                pending = []
                pending_size = 0
                start = i

        pending_size = pending_size + 1 + len(line) if pending else len(line)
        pending.append(line)
        after_decorator = language == 'python' and line.lstrip().startswith('@')

    # Handle remaining lines
    if pending:
        emit(pending, start, len(lines) - 1)

    return chunks
|
|
1036
|
+
|
|
1037
|
+
def _is_chunk_boundary(self, line: str, language: str) -> bool:
|
|
1038
|
+
"""Detect potential chunk boundaries for fallback mode"""
|
|
1039
|
+
line = line.strip()
|
|
1040
|
+
|
|
1041
|
+
if language == 'python':
|
|
1042
|
+
return line.startswith(('def ', 'class ', 'async def ', '@'))
|
|
1043
|
+
elif language in ['javascript', 'typescript']:
|
|
1044
|
+
return (line.startswith(('function ', 'class ', 'export ')) or
|
|
1045
|
+
'function(' in line or '=>' in line)
|
|
1046
|
+
elif language == 'go':
|
|
1047
|
+
return line.startswith(('func ', 'type '))
|
|
1048
|
+
|
|
1049
|
+
return False
|
|
1050
|
+
|
|
1051
|
+
|
|
1052
|
+
# Convenience function
|
|
1053
|
+
def enhanced_chunk_code(
    file_path: Path,
    content: Optional[str] = None,
    max_chunk_size: int = 1500,
    context_mode: str = "full"
) -> List[EnhancedCodeChunk]:
    """
    Chunk a single file with a freshly configured ``EnhancedASTChunker``.

    One-call shortcut for creating the chunker and invoking its
    ``chunk_file`` method.

    Args:
        file_path: Path to the source file
        content: Optional pre-loaded content, passed through to
            ``chunk_file``
        max_chunk_size: Maximum chunk size handed to the chunker
        context_mode: Context detail level ('none', 'minimal', 'full')

    Returns:
        List of EnhancedCodeChunk objects
    """
    return EnhancedASTChunker(
        max_chunk_size=max_chunk_size,
        context_mode=context_mode,
    ).chunk_file(file_path, content)
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
if __name__ == "__main__":
    # Manual smoke test: chunk the file named on the command line and dump
    # the metadata extracted for every chunk.
    import sys

    logging.basicConfig(level=logging.DEBUG)

    if len(sys.argv) < 2:
        print("Usage: python enhanced_ast_chunker.py <file_path>")
        sys.exit(1)

    test_file = Path(sys.argv[1])
    heavy_rule = "=" * 60

    print(f"\nTesting EnhancedASTChunker on: {test_file}")
    print(heavy_rule)

    chunker = EnhancedASTChunker(context_mode="full")

    try:
        chunks = chunker.chunk_file(test_file)

        print(f"\nFound {len(chunks)} chunks:\n")

        for number, chunk in enumerate(chunks, start=1):
            # Header for this chunk
            print(heavy_rule)
            print(f"Chunk {number}: {chunk.chunk_type} - {chunk.name or '(unnamed)'}")
            print(heavy_rule)

            # line_range is 0-based; show 1-based line numbers
            print(f"Lines: {chunk.line_range[0]+1}-{chunk.line_range[1]+1}")
            print(f"Scope: {' > '.join(chunk.scope) or '(module level)'}")

            # Optional metadata, printed only when present
            if chunk.signature:
                print(f"Signature: {chunk.signature}")
            if chunk.parameters:
                print(f"Parameters: {len(chunk.parameters)}")
                for p in chunk.parameters[:3]:
                    print(f"  - {p.name}: {p.type_annotation or 'untyped'}")
            if chunk.return_type:
                print(f"Return type: {chunk.return_type}")
            if chunk.imports:
                print(f"Uses imports: {[imp.name for imp in chunk.imports[:5]]}")
            if chunk.docstring:
                print(f"Docstring: {chunk.docstring[:80]}...")

            # First 500 characters of the text handed to the embedder
            print("\nContextualized text preview:")
            print("-" * 40)
            print(chunk.contextualized_text[:500])
            if len(chunk.contextualized_text) > 500:
                print("...")
            print()

    except Exception as e:
        # Top-level boundary of a debugging script: report and exit non-zero.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|