code-finder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_context/__init__.py +33 -0
- claude_context/agentic_integration.py +309 -0
- claude_context/ast_chunker.py +646 -0
- claude_context/config.py +239 -0
- claude_context/context_manager.py +627 -0
- claude_context/embeddings.py +307 -0
- claude_context/embeddings_interface.py +226 -0
- claude_context/enhanced_ast_chunker.py +1129 -0
- claude_context/explorer.py +951 -0
- claude_context/explorer_with_context.py +1008 -0
- claude_context/indexer.py +893 -0
- claude_context/markdown_chunker.py +421 -0
- claude_context/mode_handler.py +1774 -0
- claude_context/query_metrics.py +164 -0
- claude_context/question_generator.py +800 -0
- claude_context/readme_extractor.py +485 -0
- claude_context/repository_adapter.py +399 -0
- claude_context/search.py +493 -0
- claude_context/skills/__init__.py +11 -0
- claude_context/skills/_cli_common.py +74 -0
- claude_context/skills/_index_manager.py +98 -0
- claude_context/skills/api_surface.py +219 -0
- claude_context/skills/evidence_retrieval.py +151 -0
- claude_context/skills/grounded_review.py +212 -0
- claude_context/synthesis/__init__.py +8 -0
- claude_context/synthesis/editor_agent.py +391 -0
- claude_context/synthesis/llm_synthesizer.py +153 -0
- claude_context/synthesis/logic_explainer.py +235 -0
- claude_context/synthesis/multi_review_pipeline.py +717 -0
- claude_context/synthesis/prompt_builder.py +439 -0
- claude_context/synthesis/providers.py +115 -0
- claude_context/synthesis/validators.py +458 -0
- code_finder-0.1.0.dist-info/METADATA +823 -0
- code_finder-0.1.0.dist-info/RECORD +37 -0
- code_finder-0.1.0.dist-info/WHEEL +5 -0
- code_finder-0.1.0.dist-info/entry_points.txt +4 -0
- code_finder-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,646 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AST-based Code Chunker for Claude Context
|
|
3
|
+
|
|
4
|
+
Adapted from the original vibe2doc AST chunker to provide semantic code chunking
|
|
5
|
+
for the Claude Context MCP integration. Uses tree-sitter for accurate parsing
|
|
6
|
+
with a simple line-based fallback for edge cases.
|
|
7
|
+
|
|
8
|
+
Supports: Python, JavaScript, TypeScript, Go
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import logging
|
|
13
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Try to import tree-sitter dependencies
|
|
20
|
+
try:
|
|
21
|
+
import tree_sitter_python as tspython
|
|
22
|
+
import tree_sitter_javascript as tsjavascript
|
|
23
|
+
import tree_sitter_typescript as tstypescript
|
|
24
|
+
import tree_sitter_go as tsgo
|
|
25
|
+
from tree_sitter import Language, Parser, Node
|
|
26
|
+
HAS_TREE_SITTER = True
|
|
27
|
+
logger.info("Tree-sitter available for AST parsing")
|
|
28
|
+
except ImportError as e:
|
|
29
|
+
HAS_TREE_SITTER = False
|
|
30
|
+
Node = Any # Type hint fallback
|
|
31
|
+
logger.warning(f"Tree-sitter not available, using line-based fallback: {e}")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class CodeChunk:
    """One semantically complete slice of a source file plus its metadata.

    ``size_chars`` is derived rather than passed in: it is the number of
    non-whitespace characters in ``content`` and is filled in after
    construction.
    """
    content: str
    # Values produced in this module: 'function', 'class', 'method',
    # 'import_block', 'module', 'module_docstring', 'type', 'block', 'unknown'
    chunk_type: str
    name: Optional[str] = None
    start_line: int = 0
    end_line: int = 0
    language: str = ""
    parent_context: Optional[str] = None
    # Cleaned docstring text, when one was extracted for this chunk
    docstring: Optional[str] = None
    size_chars: int = field(default=0, init=False)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # Only compute the size when it has not been set yet.
        if self.size_chars != 0:
            return
        self.size_chars = len(re.findall(r'\S', self.content))

    def to_dict(self) -> Dict[str, Any]:
        """Return the chunk as a plain dict suitable for storage.

        ``metadata`` is returned by reference, not copied.
        """
        exported = (
            "content",
            "chunk_type",
            "name",
            "start_line",
            "end_line",
            "language",
            "parent_context",
            "docstring",
            "size_chars",
            "metadata",
        )
        return {key: getattr(self, key) for key in exported}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ASTChunker:
|
|
70
|
+
"""
|
|
71
|
+
AST-based code chunker for semantic code understanding.
|
|
72
|
+
|
|
73
|
+
Primary: Uses tree-sitter for accurate AST parsing
|
|
74
|
+
Fallback: Simple line-based chunking for edge cases
|
|
75
|
+
"""
|
|
76
|
+
|
|
77
|
+
def __init__(self, max_chunk_size: int = 1500, chunk_overlap: int = 100):
    """
    Set up the chunker and, when tree-sitter imported successfully, its parsers.

    Args:
        max_chunk_size: Upper bound on lines per chunk in the line-based fallback
        chunk_overlap: Lines shared between consecutive fallback chunks
    """
    self.supported_languages = ['python', 'javascript', 'typescript', 'go']
    self.parsers = {}
    self.max_chunk_size = max_chunk_size
    self.chunk_overlap = chunk_overlap

    if not HAS_TREE_SITTER:
        # No grammars available: every file will go through line-based chunking.
        logger.warning("Using line-based chunking fallback")
        return

    self._initialize_parsers()
|
|
94
|
+
|
|
95
|
+
def _initialize_parsers(self):
    """Initialize tree-sitter parsers for supported languages.

    Each language is set up independently: a failure for one grammar is
    logged as a warning and does not prevent the others from loading.
    Successfully built parsers are stored in ``self.parsers`` keyed by
    language name.
    """
    # (parsers key, label used in log messages, grammar loader).
    # Loaders are lambdas so that any lookup/call error is raised inside the
    # per-language try block below. TypeScript uses the tsx grammar variant.
    grammar_loaders = (
        ('python', 'Python', lambda: tspython.language()),
        ('javascript', 'JavaScript', lambda: tsjavascript.language()),
        ('typescript', 'TypeScript', lambda: tstypescript.language_tsx()),
        ('go', 'Go', lambda: tsgo.language()),
    )

    for key, label, load_grammar in grammar_loaders:
        try:
            self.parsers[key] = Parser(Language(load_grammar()))
            logger.debug(f"{label} parser initialized")
        except Exception as e:
            logger.warning(f"Failed to initialize {label} parser: {e}")
|
|
128
|
+
|
|
129
|
+
def chunk_file(self, file_path: Path, content: Optional[str] = None) -> List[CodeChunk]:
    """
    Split one file into chunks, preferring AST parsing over line heuristics.

    Args:
        file_path: Path to the file (its suffix selects the language)
        content: File text, if the caller has already loaded it

    Returns:
        List of CodeChunk objects; empty if the file could not be read
    """
    if content is None:
        # Undecodable bytes are dropped rather than raising.
        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')
        except Exception as e:
            logger.error(f"Failed to read {file_path}: {e}")
            return []

    language = self._detect_language(file_path)

    ast_usable = HAS_TREE_SITTER and language in self.parsers
    if ast_usable:
        try:
            return self._chunk_with_ast(content, language, str(file_path))
        except Exception as e:
            # Any AST failure degrades to the line-based path below.
            logger.warning(f"AST parsing failed for {file_path}, using fallback: {e}")

    return self._chunk_with_lines(content, language, str(file_path))
|
|
159
|
+
|
|
160
|
+
def _detect_language(self, file_path: Path) -> str:
|
|
161
|
+
"""Detect programming language from file extension"""
|
|
162
|
+
ext_to_lang = {
|
|
163
|
+
'.py': 'python',
|
|
164
|
+
'.js': 'javascript',
|
|
165
|
+
'.jsx': 'javascript',
|
|
166
|
+
'.ts': 'typescript',
|
|
167
|
+
'.tsx': 'typescript',
|
|
168
|
+
'.go': 'go',
|
|
169
|
+
}
|
|
170
|
+
return ext_to_lang.get(file_path.suffix.lower(), 'unknown')
|
|
171
|
+
|
|
172
|
+
def _chunk_with_ast(self, content: str, language: str, file_path: str) -> List[CodeChunk]:
|
|
173
|
+
"""
|
|
174
|
+
Chunk code using tree-sitter AST parsing.
|
|
175
|
+
|
|
176
|
+
This provides semantic chunking that respects:
|
|
177
|
+
- Module-level docstrings (extracted first for Python)
|
|
178
|
+
- Function boundaries
|
|
179
|
+
- Class definitions
|
|
180
|
+
- Method boundaries
|
|
181
|
+
- Import blocks
|
|
182
|
+
"""
|
|
183
|
+
if language not in self.parsers:
|
|
184
|
+
raise ValueError(f"No parser available for {language}")
|
|
185
|
+
|
|
186
|
+
parser = self.parsers[language]
|
|
187
|
+
tree = parser.parse(bytes(content, "utf8"))
|
|
188
|
+
|
|
189
|
+
chunks = []
|
|
190
|
+
|
|
191
|
+
# Extract module-level docstring for Python files
|
|
192
|
+
if language == 'python':
|
|
193
|
+
module_docstring = self._extract_module_docstring(tree.root_node, content)
|
|
194
|
+
if module_docstring and len(module_docstring.strip()) > 20:
|
|
195
|
+
# Create a module chunk with the docstring
|
|
196
|
+
# Extract module name from file path
|
|
197
|
+
module_name = Path(file_path).stem
|
|
198
|
+
chunks.append(CodeChunk(
|
|
199
|
+
content=module_docstring,
|
|
200
|
+
chunk_type='module_docstring',
|
|
201
|
+
name=module_name,
|
|
202
|
+
start_line=1,
|
|
203
|
+
end_line=module_docstring.count('\n') + 1,
|
|
204
|
+
language=language,
|
|
205
|
+
parent_context=None,
|
|
206
|
+
docstring=module_docstring,
|
|
207
|
+
metadata={
|
|
208
|
+
"node_type": "module",
|
|
209
|
+
"has_error": False,
|
|
210
|
+
"has_docstring": True,
|
|
211
|
+
"is_module_docstring": True
|
|
212
|
+
}
|
|
213
|
+
))
|
|
214
|
+
|
|
215
|
+
# Extract semantic units from the AST
|
|
216
|
+
chunks.extend(self._extract_chunks_from_node(
|
|
217
|
+
tree.root_node, content, language, file_path
|
|
218
|
+
))
|
|
219
|
+
|
|
220
|
+
return chunks
|
|
221
|
+
|
|
222
|
+
def _extract_chunks_from_node(
|
|
223
|
+
self,
|
|
224
|
+
node: Node,
|
|
225
|
+
source: str,
|
|
226
|
+
language: str,
|
|
227
|
+
file_path: str,
|
|
228
|
+
parent_context: Optional[str] = None
|
|
229
|
+
) -> List[CodeChunk]:
|
|
230
|
+
"""Recursively extract chunks from AST nodes"""
|
|
231
|
+
chunks = []
|
|
232
|
+
|
|
233
|
+
# Determine if this node should be a chunk
|
|
234
|
+
if self._should_chunk_node(node, language):
|
|
235
|
+
chunk = self._create_chunk_from_node(node, source, language, parent_context)
|
|
236
|
+
if chunk:
|
|
237
|
+
chunks.append(chunk)
|
|
238
|
+
# Update parent context for children
|
|
239
|
+
if chunk.name:
|
|
240
|
+
parent_context = chunk.name
|
|
241
|
+
|
|
242
|
+
# Process children
|
|
243
|
+
for child in node.children:
|
|
244
|
+
chunks.extend(self._extract_chunks_from_node(
|
|
245
|
+
child, source, language, file_path, parent_context
|
|
246
|
+
))
|
|
247
|
+
|
|
248
|
+
return chunks
|
|
249
|
+
|
|
250
|
+
def _should_chunk_node(self, node: Node, language: str) -> bool:
|
|
251
|
+
"""Determine if a node should become its own chunk"""
|
|
252
|
+
chunk_node_types = {
|
|
253
|
+
'python': [
|
|
254
|
+
'function_definition', 'class_definition',
|
|
255
|
+
'decorated_definition', 'import_from_statement',
|
|
256
|
+
'import_statement'
|
|
257
|
+
],
|
|
258
|
+
'javascript': [
|
|
259
|
+
'function_declaration', 'class_declaration',
|
|
260
|
+
'method_definition', 'arrow_function',
|
|
261
|
+
'function_expression', 'import_statement',
|
|
262
|
+
'export_statement'
|
|
263
|
+
],
|
|
264
|
+
'typescript': [
|
|
265
|
+
'function_declaration', 'class_declaration',
|
|
266
|
+
'method_definition', 'arrow_function',
|
|
267
|
+
'interface_declaration', 'type_alias_declaration',
|
|
268
|
+
'import_statement', 'export_statement'
|
|
269
|
+
],
|
|
270
|
+
'go': [
|
|
271
|
+
'function_declaration', 'method_declaration',
|
|
272
|
+
'type_declaration', 'interface_declaration',
|
|
273
|
+
'import_declaration'
|
|
274
|
+
]
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
node_types = chunk_node_types.get(language, [])
|
|
278
|
+
return node.type in node_types
|
|
279
|
+
|
|
280
|
+
def _create_chunk_from_node(
|
|
281
|
+
self,
|
|
282
|
+
node: Node,
|
|
283
|
+
source: str,
|
|
284
|
+
language: str,
|
|
285
|
+
parent_context: Optional[str] = None
|
|
286
|
+
) -> Optional[CodeChunk]:
|
|
287
|
+
"""Create a CodeChunk from an AST node"""
|
|
288
|
+
# Extract content
|
|
289
|
+
start_byte = node.start_byte
|
|
290
|
+
end_byte = node.end_byte
|
|
291
|
+
content = source[start_byte:end_byte]
|
|
292
|
+
|
|
293
|
+
if not content.strip():
|
|
294
|
+
return None
|
|
295
|
+
|
|
296
|
+
# Determine chunk type and name
|
|
297
|
+
chunk_type, name = self._analyze_node(node, language)
|
|
298
|
+
|
|
299
|
+
# Extract docstring for Python functions and classes
|
|
300
|
+
docstring = None
|
|
301
|
+
if language == 'python' and node.type in ['function_definition', 'class_definition']:
|
|
302
|
+
docstring = self._extract_python_docstring(node, source)
|
|
303
|
+
|
|
304
|
+
return CodeChunk(
|
|
305
|
+
content=content,
|
|
306
|
+
chunk_type=chunk_type,
|
|
307
|
+
name=name,
|
|
308
|
+
start_line=node.start_point[0] + 1,
|
|
309
|
+
end_line=node.end_point[0] + 1,
|
|
310
|
+
language=language,
|
|
311
|
+
parent_context=parent_context,
|
|
312
|
+
docstring=docstring,
|
|
313
|
+
metadata={
|
|
314
|
+
"node_type": node.type,
|
|
315
|
+
"has_error": node.has_error,
|
|
316
|
+
"has_docstring": docstring is not None
|
|
317
|
+
}
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
def _analyze_node(self, node: Node, language: str) -> Tuple[str, Optional[str]]:
|
|
321
|
+
"""Analyze node to determine chunk type and extract name"""
|
|
322
|
+
node_type = node.type
|
|
323
|
+
name = None
|
|
324
|
+
|
|
325
|
+
# Language-specific analysis
|
|
326
|
+
if language == 'python':
|
|
327
|
+
if node_type == 'function_definition':
|
|
328
|
+
chunk_type = 'function'
|
|
329
|
+
name = self._extract_python_function_name(node)
|
|
330
|
+
elif node_type == 'class_definition':
|
|
331
|
+
chunk_type = 'class'
|
|
332
|
+
name = self._extract_python_class_name(node)
|
|
333
|
+
elif node_type in ['import_statement', 'import_from_statement']:
|
|
334
|
+
chunk_type = 'import_block'
|
|
335
|
+
else:
|
|
336
|
+
chunk_type = 'module'
|
|
337
|
+
|
|
338
|
+
elif language in ['javascript', 'typescript']:
|
|
339
|
+
if node_type in ['function_declaration', 'function_expression']:
|
|
340
|
+
chunk_type = 'function'
|
|
341
|
+
name = self._extract_js_function_name(node)
|
|
342
|
+
elif node_type == 'class_declaration':
|
|
343
|
+
chunk_type = 'class'
|
|
344
|
+
name = self._extract_js_class_name(node)
|
|
345
|
+
elif node_type == 'arrow_function':
|
|
346
|
+
chunk_type = 'function'
|
|
347
|
+
name = 'arrow_function'
|
|
348
|
+
elif node_type == 'method_definition':
|
|
349
|
+
chunk_type = 'method'
|
|
350
|
+
name = self._extract_js_method_name(node)
|
|
351
|
+
elif node_type in ['import_statement', 'export_statement']:
|
|
352
|
+
chunk_type = 'import_block'
|
|
353
|
+
else:
|
|
354
|
+
chunk_type = 'module'
|
|
355
|
+
|
|
356
|
+
elif language == 'go':
|
|
357
|
+
if node_type == 'function_declaration':
|
|
358
|
+
chunk_type = 'function'
|
|
359
|
+
name = self._extract_go_function_name(node)
|
|
360
|
+
elif node_type == 'method_declaration':
|
|
361
|
+
chunk_type = 'method'
|
|
362
|
+
name = self._extract_go_method_name(node)
|
|
363
|
+
elif node_type in ['type_declaration', 'interface_declaration']:
|
|
364
|
+
chunk_type = 'type'
|
|
365
|
+
name = self._extract_go_type_name(node)
|
|
366
|
+
elif node_type == 'import_declaration':
|
|
367
|
+
chunk_type = 'import_block'
|
|
368
|
+
else:
|
|
369
|
+
chunk_type = 'module'
|
|
370
|
+
else:
|
|
371
|
+
chunk_type = 'unknown'
|
|
372
|
+
|
|
373
|
+
return chunk_type, name
|
|
374
|
+
|
|
375
|
+
def _extract_python_function_name(self, node: Node) -> Optional[str]:
|
|
376
|
+
"""Extract function name from Python AST node"""
|
|
377
|
+
for child in node.children:
|
|
378
|
+
if child.type == 'identifier':
|
|
379
|
+
return child.text.decode('utf-8')
|
|
380
|
+
return None
|
|
381
|
+
|
|
382
|
+
def _extract_python_class_name(self, node: Node) -> Optional[str]:
|
|
383
|
+
"""Extract class name from Python AST node"""
|
|
384
|
+
for child in node.children:
|
|
385
|
+
if child.type == 'identifier':
|
|
386
|
+
return child.text.decode('utf-8')
|
|
387
|
+
return None
|
|
388
|
+
|
|
389
|
+
def _extract_js_function_name(self, node: Node) -> Optional[str]:
|
|
390
|
+
"""Extract function name from JavaScript/TypeScript AST node"""
|
|
391
|
+
for child in node.children:
|
|
392
|
+
if child.type == 'identifier':
|
|
393
|
+
return child.text.decode('utf-8')
|
|
394
|
+
return None
|
|
395
|
+
|
|
396
|
+
def _extract_js_class_name(self, node: Node) -> Optional[str]:
|
|
397
|
+
"""Extract class name from JavaScript/TypeScript AST node"""
|
|
398
|
+
for child in node.children:
|
|
399
|
+
if child.type == 'identifier':
|
|
400
|
+
return child.text.decode('utf-8')
|
|
401
|
+
return None
|
|
402
|
+
|
|
403
|
+
def _extract_js_method_name(self, node: Node) -> Optional[str]:
|
|
404
|
+
"""Extract method name from JavaScript/TypeScript AST node"""
|
|
405
|
+
for child in node.children:
|
|
406
|
+
if child.type == 'property_identifier':
|
|
407
|
+
return child.text.decode('utf-8')
|
|
408
|
+
return None
|
|
409
|
+
|
|
410
|
+
def _extract_go_function_name(self, node: Node) -> Optional[str]:
|
|
411
|
+
"""Extract function name from Go AST node"""
|
|
412
|
+
for child in node.children:
|
|
413
|
+
if child.type == 'identifier':
|
|
414
|
+
return child.text.decode('utf-8')
|
|
415
|
+
return None
|
|
416
|
+
|
|
417
|
+
def _extract_go_method_name(self, node: Node) -> Optional[str]:
|
|
418
|
+
"""Extract method name from Go AST node"""
|
|
419
|
+
for child in node.children:
|
|
420
|
+
if child.type == 'field_identifier':
|
|
421
|
+
return child.text.decode('utf-8')
|
|
422
|
+
return None
|
|
423
|
+
|
|
424
|
+
def _extract_go_type_name(self, node: Node) -> Optional[str]:
|
|
425
|
+
"""Extract type name from Go AST node"""
|
|
426
|
+
spec_list = None
|
|
427
|
+
for child in node.children:
|
|
428
|
+
if child.type == 'type_spec':
|
|
429
|
+
spec_list = child
|
|
430
|
+
break
|
|
431
|
+
|
|
432
|
+
if spec_list:
|
|
433
|
+
for child in spec_list.children:
|
|
434
|
+
if child.type == 'type_identifier':
|
|
435
|
+
return child.text.decode('utf-8')
|
|
436
|
+
return None
|
|
437
|
+
|
|
438
|
+
def _extract_python_docstring(self, node: Node, source: str) -> Optional[str]:
|
|
439
|
+
"""
|
|
440
|
+
Extract docstring from a Python function/class node.
|
|
441
|
+
|
|
442
|
+
Python docstrings are the first expression_statement containing a string
|
|
443
|
+
inside the block of a function_definition or class_definition.
|
|
444
|
+
"""
|
|
445
|
+
for child in node.children:
|
|
446
|
+
if child.type == 'block':
|
|
447
|
+
for block_child in child.children:
|
|
448
|
+
if block_child.type == 'expression_statement':
|
|
449
|
+
for expr_child in block_child.children:
|
|
450
|
+
if expr_child.type == 'string':
|
|
451
|
+
raw = source[expr_child.start_byte:expr_child.end_byte]
|
|
452
|
+
return self._clean_docstring(raw)
|
|
453
|
+
# First expression_statement wasn't a string, no docstring
|
|
454
|
+
return None
|
|
455
|
+
return None
|
|
456
|
+
|
|
457
|
+
def _extract_module_docstring(self, root_node: Node, source: str) -> Optional[str]:
|
|
458
|
+
"""
|
|
459
|
+
Extract module-level docstring from the root node of a Python file.
|
|
460
|
+
|
|
461
|
+
Module docstrings are the first expression_statement containing a string
|
|
462
|
+
at the top level of the module (direct child of the module/root node).
|
|
463
|
+
These often contain:
|
|
464
|
+
- Overall module purpose and description
|
|
465
|
+
- Usage examples
|
|
466
|
+
- Algorithm explanations
|
|
467
|
+
- Paper references (arXiv, DOI, etc.)
|
|
468
|
+
"""
|
|
469
|
+
for child in root_node.children:
|
|
470
|
+
if child.type == 'expression_statement':
|
|
471
|
+
for expr_child in child.children:
|
|
472
|
+
if expr_child.type == 'string':
|
|
473
|
+
raw = source[expr_child.start_byte:expr_child.end_byte]
|
|
474
|
+
return self._clean_docstring(raw)
|
|
475
|
+
# First expression_statement wasn't a string, no module docstring
|
|
476
|
+
return None
|
|
477
|
+
# Skip comments but stop at any other statement type
|
|
478
|
+
elif child.type not in ['comment']:
|
|
479
|
+
return None
|
|
480
|
+
return None
|
|
481
|
+
|
|
482
|
+
def _clean_docstring(self, raw: str) -> str:
|
|
483
|
+
"""Clean docstring by removing quotes and normalizing whitespace."""
|
|
484
|
+
# Remove triple quotes (both styles)
|
|
485
|
+
if raw.startswith('"""') and raw.endswith('"""'):
|
|
486
|
+
raw = raw[3:-3]
|
|
487
|
+
elif raw.startswith("'''") and raw.endswith("'''"):
|
|
488
|
+
raw = raw[3:-3]
|
|
489
|
+
# Remove single quotes (less common but valid)
|
|
490
|
+
elif raw.startswith('"') and raw.endswith('"'):
|
|
491
|
+
raw = raw[1:-1]
|
|
492
|
+
elif raw.startswith("'") and raw.endswith("'"):
|
|
493
|
+
raw = raw[1:-1]
|
|
494
|
+
return raw.strip()
|
|
495
|
+
|
|
496
|
+
def _chunk_with_lines(self, content: str, language: str, file_path: str) -> List[CodeChunk]:
|
|
497
|
+
"""
|
|
498
|
+
Simple line-based chunking fallback.
|
|
499
|
+
|
|
500
|
+
This is used when tree-sitter is unavailable or fails.
|
|
501
|
+
Creates overlapping chunks based on line count.
|
|
502
|
+
"""
|
|
503
|
+
lines = content.split('\n')
|
|
504
|
+
chunks = []
|
|
505
|
+
|
|
506
|
+
# Simple heuristic: try to detect function/class boundaries
|
|
507
|
+
i = 0
|
|
508
|
+
while i < len(lines):
|
|
509
|
+
# Look for potential start markers
|
|
510
|
+
chunk_lines = []
|
|
511
|
+
chunk_start = i
|
|
512
|
+
|
|
513
|
+
# Determine chunk type based on content
|
|
514
|
+
first_line = lines[i].strip() if i < len(lines) else ""
|
|
515
|
+
chunk_type = self._detect_chunk_type_from_line(first_line, language)
|
|
516
|
+
name = self._extract_name_from_line(first_line, language)
|
|
517
|
+
|
|
518
|
+
# Collect lines for this chunk
|
|
519
|
+
indent_level = len(lines[i]) - len(lines[i].lstrip()) if i < len(lines) else 0
|
|
520
|
+
|
|
521
|
+
# Add lines until we hit size limit or dedent
|
|
522
|
+
while i < len(lines) and len(chunk_lines) < self.max_chunk_size:
|
|
523
|
+
line = lines[i]
|
|
524
|
+
current_indent = len(line) - len(line.lstrip())
|
|
525
|
+
|
|
526
|
+
# Check for end of logical block (dedent)
|
|
527
|
+
if chunk_lines and line.strip() and current_indent < indent_level:
|
|
528
|
+
# Skip if it's just a blank line
|
|
529
|
+
if i + 1 < len(lines):
|
|
530
|
+
next_indent = len(lines[i + 1]) - len(lines[i + 1].lstrip())
|
|
531
|
+
if next_indent >= indent_level:
|
|
532
|
+
chunk_lines.append(line)
|
|
533
|
+
i += 1
|
|
534
|
+
continue
|
|
535
|
+
break
|
|
536
|
+
|
|
537
|
+
chunk_lines.append(line)
|
|
538
|
+
i += 1
|
|
539
|
+
|
|
540
|
+
# Create chunk if we have content
|
|
541
|
+
if chunk_lines and any(line.strip() for line in chunk_lines):
|
|
542
|
+
chunks.append(CodeChunk(
|
|
543
|
+
content='\n'.join(chunk_lines),
|
|
544
|
+
chunk_type=chunk_type if chunk_type else 'block',
|
|
545
|
+
name=name,
|
|
546
|
+
start_line=chunk_start + 1,
|
|
547
|
+
end_line=chunk_start + len(chunk_lines),
|
|
548
|
+
language=language,
|
|
549
|
+
metadata={"chunking_method": "line-based"}
|
|
550
|
+
))
|
|
551
|
+
|
|
552
|
+
# Move back for overlap if chunk is large
|
|
553
|
+
if len(chunk_lines) > self.chunk_overlap:
|
|
554
|
+
i -= self.chunk_overlap
|
|
555
|
+
|
|
556
|
+
# Ensure we make progress
|
|
557
|
+
if i <= chunk_start:
|
|
558
|
+
i = chunk_start + 1
|
|
559
|
+
|
|
560
|
+
return chunks
|
|
561
|
+
|
|
562
|
+
def _detect_chunk_type_from_line(self, line: str, language: str) -> Optional[str]:
|
|
563
|
+
"""Simple heuristic to detect chunk type from a line"""
|
|
564
|
+
line = line.strip()
|
|
565
|
+
|
|
566
|
+
if language == 'python':
|
|
567
|
+
if line.startswith('def '):
|
|
568
|
+
return 'function'
|
|
569
|
+
elif line.startswith('class '):
|
|
570
|
+
return 'class'
|
|
571
|
+
elif line.startswith(('import ', 'from ')):
|
|
572
|
+
return 'import_block'
|
|
573
|
+
|
|
574
|
+
elif language in ['javascript', 'typescript']:
|
|
575
|
+
if 'function ' in line or 'const ' in line and '=>' in line:
|
|
576
|
+
return 'function'
|
|
577
|
+
elif line.startswith('class '):
|
|
578
|
+
return 'class'
|
|
579
|
+
elif line.startswith(('import ', 'export ')):
|
|
580
|
+
return 'import_block'
|
|
581
|
+
|
|
582
|
+
elif language == 'go':
|
|
583
|
+
if line.startswith('func '):
|
|
584
|
+
return 'function'
|
|
585
|
+
elif line.startswith('type '):
|
|
586
|
+
return 'type'
|
|
587
|
+
elif line.startswith('import'):
|
|
588
|
+
return 'import_block'
|
|
589
|
+
|
|
590
|
+
return None
|
|
591
|
+
|
|
592
|
+
def _extract_name_from_line(self, line: str, language: str) -> Optional[str]:
|
|
593
|
+
"""Simple heuristic to extract name from a line"""
|
|
594
|
+
line = line.strip()
|
|
595
|
+
|
|
596
|
+
if language == 'python':
|
|
597
|
+
if line.startswith('def '):
|
|
598
|
+
match = re.match(r'def\s+(\w+)', line)
|
|
599
|
+
return match.group(1) if match else None
|
|
600
|
+
elif line.startswith('class '):
|
|
601
|
+
match = re.match(r'class\s+(\w+)', line)
|
|
602
|
+
return match.group(1) if match else None
|
|
603
|
+
|
|
604
|
+
elif language in ['javascript', 'typescript']:
|
|
605
|
+
if 'function ' in line:
|
|
606
|
+
match = re.search(r'function\s+(\w+)', line)
|
|
607
|
+
return match.group(1) if match else None
|
|
608
|
+
elif line.startswith('class '):
|
|
609
|
+
match = re.match(r'class\s+(\w+)', line)
|
|
610
|
+
return match.group(1) if match else None
|
|
611
|
+
elif 'const ' in line and '=>' in line:
|
|
612
|
+
match = re.match(r'const\s+(\w+)', line)
|
|
613
|
+
return match.group(1) if match else None
|
|
614
|
+
|
|
615
|
+
elif language == 'go':
|
|
616
|
+
if line.startswith('func '):
|
|
617
|
+
match = re.match(r'func\s+(?:\([^)]+\)\s+)?(\w+)', line)
|
|
618
|
+
return match.group(1) if match else None
|
|
619
|
+
elif line.startswith('type '):
|
|
620
|
+
match = re.match(r'type\s+(\w+)', line)
|
|
621
|
+
return match.group(1) if match else None
|
|
622
|
+
|
|
623
|
+
return None
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
# Convenience function for easy import
|
|
627
|
+
def chunk_code(
    file_path: Path,
    content: Optional[str] = None,
    max_chunk_size: int = 1500,
    chunk_overlap: int = 100
) -> List[CodeChunk]:
    """
    One-shot helper: build an ASTChunker and chunk a single file with it.

    Args:
        file_path: Path to the file
        content: File text, if already loaded by the caller
        max_chunk_size: Line limit per chunk for the line-based fallback
        chunk_overlap: Overlapping lines for the line-based fallback

    Returns:
        List of CodeChunk objects
    """
    return ASTChunker(
        max_chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
    ).chunk_file(file_path, content)
|