tarang 4.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tarang/__init__.py +23 -0
- tarang/cli.py +1168 -0
- tarang/client/__init__.py +19 -0
- tarang/client/api_client.py +701 -0
- tarang/client/auth.py +178 -0
- tarang/context/__init__.py +41 -0
- tarang/context/bm25.py +218 -0
- tarang/context/chunker.py +984 -0
- tarang/context/graph.py +464 -0
- tarang/context/indexer.py +514 -0
- tarang/context/retriever.py +270 -0
- tarang/context/skeleton.py +282 -0
- tarang/context_collector.py +449 -0
- tarang/executor/__init__.py +6 -0
- tarang/executor/diff_apply.py +246 -0
- tarang/executor/linter.py +184 -0
- tarang/stream.py +1346 -0
- tarang/ui/__init__.py +7 -0
- tarang/ui/console.py +407 -0
- tarang/ui/diff_viewer.py +146 -0
- tarang/ui/formatter.py +1151 -0
- tarang/ui/keyboard.py +197 -0
- tarang/ws/__init__.py +14 -0
- tarang/ws/client.py +464 -0
- tarang/ws/executor.py +638 -0
- tarang/ws/handlers.py +590 -0
- tarang-4.4.0.dist-info/METADATA +102 -0
- tarang-4.4.0.dist-info/RECORD +31 -0
- tarang-4.4.0.dist-info/WHEEL +5 -0
- tarang-4.4.0.dist-info/entry_points.txt +2 -0
- tarang-4.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,984 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Code Chunker - AST-based code parsing using tree-sitter.
|
|
3
|
+
|
|
4
|
+
Extracts semantic chunks (functions, classes, methods) from source files
|
|
5
|
+
for efficient indexing and retrieval.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Dict, List, Optional, Tuple
|
|
14
|
+
|
|
15
|
+
# Tree-sitter imports (lazy loaded)
|
|
16
|
+
_ts_python = None
|
|
17
|
+
_ts_javascript = None
|
|
18
|
+
_ts_sql = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _wrap_language(lang_ptr, name: str):
|
|
22
|
+
"""Wrap language pointer for tree-sitter 0.21+ compatibility."""
|
|
23
|
+
try:
|
|
24
|
+
from tree_sitter import Language
|
|
25
|
+
# New API: wrap PyCapsule with Language
|
|
26
|
+
return Language(lang_ptr)
|
|
27
|
+
except TypeError:
|
|
28
|
+
# Older API: Language expects (library_path, name) or already wrapped
|
|
29
|
+
return lang_ptr
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _get_python_language():
|
|
33
|
+
"""Lazy load Python language."""
|
|
34
|
+
global _ts_python
|
|
35
|
+
if _ts_python is None:
|
|
36
|
+
try:
|
|
37
|
+
import tree_sitter_python as tspython
|
|
38
|
+
_ts_python = _wrap_language(tspython.language(), "python")
|
|
39
|
+
except ImportError:
|
|
40
|
+
return None
|
|
41
|
+
return _ts_python
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _get_javascript_language():
|
|
45
|
+
"""Lazy load JavaScript/TypeScript language."""
|
|
46
|
+
global _ts_javascript
|
|
47
|
+
if _ts_javascript is None:
|
|
48
|
+
try:
|
|
49
|
+
# Try typescript first (handles .ts, .tsx, .js, .jsx)
|
|
50
|
+
import tree_sitter_typescript as tsts
|
|
51
|
+
_ts_javascript = _wrap_language(tsts.language_tsx(), "tsx")
|
|
52
|
+
except ImportError:
|
|
53
|
+
try:
|
|
54
|
+
import tree_sitter_javascript as tsjs
|
|
55
|
+
_ts_javascript = _wrap_language(tsjs.language(), "javascript")
|
|
56
|
+
except ImportError:
|
|
57
|
+
return None
|
|
58
|
+
return _ts_javascript
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _get_sql_language():
|
|
62
|
+
"""Lazy load SQL language."""
|
|
63
|
+
global _ts_sql
|
|
64
|
+
if _ts_sql is None:
|
|
65
|
+
try:
|
|
66
|
+
import tree_sitter_sql as tssql
|
|
67
|
+
_ts_sql = _wrap_language(tssql.language(), "sql")
|
|
68
|
+
except ImportError:
|
|
69
|
+
return None
|
|
70
|
+
return _ts_sql
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class Chunk:
    """A semantic code chunk extracted from source."""
    id: str           # Unique ID: "file.py:function_name"
    file: str         # Relative file path
    type: str         # "function" | "method" | "class" | "module"
    name: str         # Symbol name
    signature: str    # Function/class signature line
    content: str      # Full code content
    line_start: int   # Starting line (1-indexed)
    line_end: int     # Ending line (1-indexed)
    tokens: List[str] = field(default_factory=list)  # Tokenized for BM25
    parent: Optional[str] = None                     # Parent class for methods

    @property
    def hash(self) -> str:
        """First 16 hex digits of the SHA-256 of the content (change detection)."""
        digest = hashlib.sha256(self.content.encode())
        return digest.hexdigest()[:16]

    def to_dict(self) -> Dict:
        """Serialize to a plain dictionary, including the content hash."""
        keys = (
            "id", "file", "type", "name", "signature", "content",
            "line_start", "line_end", "tokens", "parent",
        )
        payload = {key: getattr(self, key) for key in keys}
        payload["hash"] = self.hash
        return payload

    @classmethod
    def from_dict(cls, data: Dict) -> "Chunk":
        """Rebuild a Chunk from a dict produced by to_dict(); the stored
        hash is ignored because it is recomputed on access."""
        return cls(
            data["id"],
            data["file"],
            data["type"],
            data["name"],
            data["signature"],
            data["content"],
            data["line_start"],
            data["line_end"],
            data.get("tokens", []),
            data.get("parent"),
        )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@dataclass
class SymbolInfo:
    """Information about a symbol for graph building.

    Captures the identity of a definition plus its outgoing edges
    (``calls`` and ``imports``) so a call/dependency graph can be built.
    """
    id: str            # "file.py:function_name"
    name: str          # Symbol name
    type: str          # "function" | "method" | "class" (also "module" and
                       # SQL kinds like "table"/"view" elsewhere in this file)
    file: str          # File path
    line: int          # Definition line (1-indexed)
    signature: str     # Signature line
    calls: List[str] = field(default_factory=list)    # Functions called
    imports: List[str] = field(default_factory=list)  # Modules imported; also
                                                      # reused for class bases
                                                      # and SQL table deps
    parent_class: Optional[str] = None  # Enclosing class name, for methods
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class Chunker:
    """
    AST-based code chunker using tree-sitter.

    Extracts functions, classes, and methods as semantic chunks.
    """

    # Supported file extensions mapped to the grammar name used to parse them.
    LANGUAGE_MAP = {
        # Python
        ".py": "python",
        ".pyw": "python",
        # JavaScript/TypeScript
        ".js": "javascript",
        ".jsx": "javascript",
        ".ts": "javascript",  # tree-sitter-javascript handles TS basics
        ".tsx": "javascript",
        ".mjs": "javascript",
        ".cjs": "javascript",
        # SQL
        ".sql": "sql",
    }

    # Max lines per chunk (split if larger)
    MAX_CHUNK_LINES = 200

    # Max file size to process (100KB); larger files are skipped entirely.
    MAX_FILE_SIZE = 100 * 1024

    def __init__(self):
        # Cache of grammar name -> tree-sitter Parser instance.
        self._parsers: Dict[str, object] = {}
|
|
170
|
+
|
|
171
|
+
def _get_parser(self, language: str):
|
|
172
|
+
"""Get or create parser for language."""
|
|
173
|
+
if language in self._parsers:
|
|
174
|
+
return self._parsers[language]
|
|
175
|
+
|
|
176
|
+
try:
|
|
177
|
+
from tree_sitter import Parser
|
|
178
|
+
except ImportError:
|
|
179
|
+
return None
|
|
180
|
+
|
|
181
|
+
lang = None
|
|
182
|
+
if language == "python":
|
|
183
|
+
lang = _get_python_language()
|
|
184
|
+
elif language in ("javascript", "typescript", "tsx", "jsx"):
|
|
185
|
+
lang = _get_javascript_language()
|
|
186
|
+
elif language == "sql":
|
|
187
|
+
lang = _get_sql_language()
|
|
188
|
+
|
|
189
|
+
if lang is None:
|
|
190
|
+
return None
|
|
191
|
+
|
|
192
|
+
# Create parser - handle both old and new tree-sitter API
|
|
193
|
+
parser = Parser()
|
|
194
|
+
try:
|
|
195
|
+
# New API (0.21+): set language via property
|
|
196
|
+
parser.language = lang
|
|
197
|
+
except AttributeError:
|
|
198
|
+
# Old API: pass language to constructor (already created above, need to recreate)
|
|
199
|
+
try:
|
|
200
|
+
parser = Parser(lang)
|
|
201
|
+
except TypeError:
|
|
202
|
+
return None
|
|
203
|
+
|
|
204
|
+
self._parsers[language] = parser
|
|
205
|
+
return parser
|
|
206
|
+
|
|
207
|
+
def chunk_file(self, file_path: Path, project_root: Path) -> Tuple[List[Chunk], List[SymbolInfo]]:
    """
    Parse a file and extract chunks and symbol info.

    Args:
        file_path: Absolute path to file
        project_root: Project root for relative paths

    Returns:
        Tuple of (chunks, symbols)
    """
    # Compute a project-relative path; fall back to the absolute path when
    # the file lives outside the project root.
    try:
        rel_path = str(file_path.relative_to(project_root))
    except ValueError:
        rel_path = str(file_path)

    # Skip oversized files, and files whose size cannot be read at all.
    try:
        oversized = file_path.stat().st_size > self.MAX_FILE_SIZE
    except OSError:
        return [], []
    if oversized:
        return [], []

    # Map the extension to a grammar; unsupported extensions become a
    # single module-level chunk.
    language = self.LANGUAGE_MAP.get(file_path.suffix.lower())
    if language is None:
        return self._chunk_as_module(file_path, rel_path)

    # No usable parser (tree-sitter missing) -> same module-level fallback.
    parser = self._get_parser(language)
    if parser is None:
        return self._chunk_as_module(file_path, rel_path)

    try:
        source_text = file_path.read_text(encoding="utf-8", errors="replace")
    except Exception:
        return [], []

    tree = parser.parse(source_text.encode("utf-8"))

    # Dispatch to the language-specific extractor.
    extractors = {
        "python": self._extract_python,
        "javascript": self._extract_javascript,
        "sql": self._extract_sql,
    }
    extractor = extractors.get(language)
    if extractor is None:
        return [], []
    return extractor(tree, source_text, rel_path)
|
|
261
|
+
|
|
262
|
+
def _chunk_as_module(self, file_path: Path, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
    """Treat the entire file as one module-level chunk plus one symbol."""
    try:
        text = file_path.read_text(encoding="utf-8", errors="replace")
    except Exception:
        return [], []

    all_lines = text.splitlines()
    total_lines = len(all_lines)
    # Cap oversized files so a single chunk never blows up the index;
    # line_end still reports the real (pre-truncation) length.
    if total_lines > self.MAX_CHUNK_LINES:
        text = "\n".join(all_lines[:self.MAX_CHUNK_LINES]) + "\n... (truncated)"

    module_chunk = Chunk(
        id=f"{rel_path}:module",
        file=rel_path,
        type="module",
        name=Path(rel_path).stem,
        signature=f"# {rel_path}",
        content=text,
        line_start=1,
        line_end=total_lines,
        tokens=self._tokenize(text),
    )

    module_symbol = SymbolInfo(
        id=module_chunk.id,
        name=module_chunk.name,
        type="module",
        file=rel_path,
        line=1,
        signature=module_chunk.signature,
    )

    return [module_chunk], [module_symbol]
|
|
295
|
+
|
|
296
|
+
def _extract_python(self, tree, content: str, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
    """Extract chunks from Python AST.

    Walks the top-level nodes of the parse tree, emitting one chunk per
    function and class; functions nested inside a class body become
    "method" chunks with ``parent`` set to the class name.

    Args:
        tree: tree-sitter parse tree for the file.
        content: Full source text the tree was parsed from.
        rel_path: Project-relative path used to build chunk IDs.

    Returns:
        Tuple of (chunks, symbols).
    """
    chunks = []
    symbols = []
    # NOTE(review): 'lines' is computed but never used below.
    lines = content.splitlines()

    def get_node_text(node) -> str:
        # NOTE(review): start_byte/end_byte are byte offsets but 'content'
        # is a str, so offsets drift on non-ASCII source -- confirm.
        return content[node.start_byte:node.end_byte]

    def get_signature(node) -> str:
        """Extract just the signature line."""
        text = get_node_text(node)
        first_line = text.split("\n")[0]
        # For functions, include up to the colon
        if ":" in first_line:
            return first_line.rstrip()
        return first_line

    def extract_calls(node) -> List[str]:
        """Extract function calls from a node."""
        calls = []

        def walk(n):
            if n.type == "call":
                func = n.child_by_field_name("function")
                if func:
                    call_name = get_node_text(func)
                    # Handle method calls: obj.method -> method
                    if "." in call_name:
                        call_name = call_name.split(".")[-1]
                    calls.append(call_name)
            for child in n.children:
                walk(child)

        walk(node)
        return calls

    def extract_imports(node) -> List[str]:
        """Extract imports from module level.

        Recursion deliberately stops at function/class definitions so only
        module-level import statements are collected.
        """
        imports = []

        def walk(n):
            if n.type == "import_statement":
                # import foo, bar
                for child in n.children:
                    if child.type == "dotted_name":
                        imports.append(get_node_text(child))
            elif n.type == "import_from_statement":
                # from foo import bar
                module = n.child_by_field_name("module_name")
                if module:
                    imports.append(get_node_text(module))
            for child in n.children:
                if child.type not in ("function_definition", "class_definition"):
                    walk(child)

        walk(node)
        return imports

    # First pass: extract module-level imports
    module_imports = extract_imports(tree.root_node)

    # Process top-level nodes
    # NOTE(review): 'current_class' is assigned but never read.
    current_class = None

    def process_node(node, parent_class=None):
        # nonlocal is not strictly required here (lists are only mutated),
        # but kept for clarity of intent.
        nonlocal chunks, symbols

        if node.type == "function_definition":
            name_node = node.child_by_field_name("name")
            if name_node:
                name = get_node_text(name_node)
                node_content = get_node_text(node)

                # Methods get "Class.method" IDs; free functions just "name".
                chunk_type = "method" if parent_class else "function"
                chunk_id = f"{rel_path}:{parent_class}.{name}" if parent_class else f"{rel_path}:{name}"

                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type=chunk_type,
                    name=name,
                    signature=get_signature(node),
                    content=node_content,
                    line_start=node.start_point[0] + 1,  # tree-sitter is 0-indexed
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                    parent=parent_class,
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=name,
                    type=chunk_type,
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=chunk.signature,
                    calls=extract_calls(node),
                    parent_class=parent_class,
                )
                symbols.append(symbol)

        elif node.type == "class_definition":
            name_node = node.child_by_field_name("name")
            if name_node:
                class_name = get_node_text(name_node)
                node_content = get_node_text(node)

                # Extract class signature (just the class line)
                class_sig = get_signature(node)

                # Create class chunk (without method bodies for summary)
                chunk_id = f"{rel_path}:{class_name}"

                # Get just class definition without full method bodies
                class_summary = self._get_class_summary(node, content)

                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type="class",
                    name=class_name,
                    signature=class_sig,
                    content=class_summary,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(class_summary),
                )
                chunks.append(chunk)

                # Extract parent classes
                superclasses = []
                args = node.child_by_field_name("superclasses")
                if args:
                    for arg in args.children:
                        if arg.type == "identifier":
                            superclasses.append(get_node_text(arg))

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=class_name,
                    type="class",
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=class_sig,
                    imports=superclasses,  # Reuse imports for inheritance
                )
                symbols.append(symbol)

                # Process methods inside class
                body = node.child_by_field_name("body")
                if body:
                    for child in body.children:
                        process_node(child, parent_class=class_name)

    # Process all top-level nodes
    for child in tree.root_node.children:
        process_node(child)

    # Add module-level symbol with imports
    if module_imports:
        module_symbol = SymbolInfo(
            id=f"{rel_path}:module",
            name=Path(rel_path).stem,
            type="module",
            file=rel_path,
            line=1,
            signature=f"# {rel_path}",
            imports=module_imports,
        )
        symbols.append(module_symbol)

    return chunks, symbols
|
|
470
|
+
|
|
471
|
+
def _get_class_summary(self, class_node, content: str) -> str:
    """Get class with method signatures only (not full bodies).

    Produces a compact text: the ``class`` line, the class docstring if
    present, then each method's first line followed by ``...`` -- used as
    the class chunk's content so BM25 indexes signatures, not bodies.

    Args:
        class_node: tree-sitter node for the class_definition.
        content: Full source text the node's byte offsets refer to.

    Returns:
        The summary joined with newlines.
    """
    lines = []

    def get_node_text(node) -> str:
        return content[node.start_byte:node.end_byte]

    # Get class definition line
    first_line = get_node_text(class_node).split("\n")[0]
    lines.append(first_line)

    # Get docstring if present: a string expression as the first body child.
    body = class_node.child_by_field_name("body")
    if body and body.children:
        first_child = body.children[0]
        if first_child.type == "expression_statement":
            expr = first_child.children[0] if first_child.children else None
            if expr and expr.type == "string":
                docstring = get_node_text(expr)
                # Indent docstring
                for doc_line in docstring.split("\n"):
                    lines.append("    " + doc_line)

    # Get method signatures
    if body:
        for child in body.children:
            if child.type == "function_definition":
                sig = get_node_text(child).split("\n")[0]
                lines.append("    " + sig)
                lines.append("        ...")

    return "\n".join(lines)
|
|
503
|
+
|
|
504
|
+
def _extract_javascript(self, tree, content: str, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
    """Extract chunks from JavaScript/TypeScript AST.

    Handles function declarations, ``const f = () => ...`` arrow functions,
    and class declarations. Unlike the Python extractor, this walks the
    whole tree (process_node recurses unconditionally), so nested
    definitions are also emitted.

    Args:
        tree: tree-sitter parse tree for the file.
        content: Full source text the tree was parsed from.
        rel_path: Project-relative path used to build chunk IDs.

    Returns:
        Tuple of (chunks, symbols).
    """
    chunks = []
    symbols = []

    def get_node_text(node) -> str:
        # NOTE(review): byte offsets against a str -- drifts on non-ASCII.
        return content[node.start_byte:node.end_byte]

    def get_signature(node) -> str:
        """Extract just the signature line."""
        text = get_node_text(node)
        first_line = text.split("\n")[0]
        # Truncate at opening brace
        if "{" in first_line:
            return first_line[:first_line.index("{")].strip() + " {"
        return first_line

    def extract_calls(node) -> List[str]:
        """Extract function calls."""
        calls = []

        def walk(n):
            if n.type == "call_expression":
                func = n.child_by_field_name("function")
                if func:
                    call_name = get_node_text(func)
                    # obj.method(...) -> "method"
                    if "." in call_name:
                        call_name = call_name.split(".")[-1]
                    calls.append(call_name)
            for child in n.children:
                walk(child)

        walk(node)
        return calls

    def process_node(node, parent_class=None):
        # nonlocal kept for clarity; the lists are only mutated.
        nonlocal chunks, symbols

        # Function declarations
        if node.type in ("function_declaration", "function"):
            name_node = node.child_by_field_name("name")
            if name_node:
                name = get_node_text(name_node)
                node_content = get_node_text(node)

                chunk_id = f"{rel_path}:{name}"

                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type="function",
                    name=name,
                    signature=get_signature(node),
                    content=node_content,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=name,
                    type="function",
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=chunk.signature,
                    calls=extract_calls(node),
                )
                symbols.append(symbol)

        # Arrow functions assigned to variables
        elif node.type == "lexical_declaration":
            for decl in node.children:
                if decl.type == "variable_declarator":
                    name_node = decl.child_by_field_name("name")
                    value_node = decl.child_by_field_name("value")
                    if name_node and value_node and value_node.type == "arrow_function":
                        name = get_node_text(name_node)
                        # Chunk covers the whole declaration, not just the arrow.
                        node_content = get_node_text(node)

                        chunk_id = f"{rel_path}:{name}"

                        chunk = Chunk(
                            id=chunk_id,
                            file=rel_path,
                            type="function",
                            name=name,
                            signature=get_signature(node),
                            content=node_content,
                            line_start=node.start_point[0] + 1,
                            line_end=node.end_point[0] + 1,
                            tokens=self._tokenize(node_content),
                        )
                        chunks.append(chunk)

                        symbol = SymbolInfo(
                            id=chunk_id,
                            name=name,
                            type="function",
                            file=rel_path,
                            line=node.start_point[0] + 1,
                            signature=chunk.signature,
                            calls=extract_calls(value_node),
                        )
                        symbols.append(symbol)

        # Class declarations
        elif node.type == "class_declaration":
            name_node = node.child_by_field_name("name")
            if name_node:
                class_name = get_node_text(name_node)
                node_content = get_node_text(node)

                chunk_id = f"{rel_path}:{class_name}"

                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type="class",
                    name=class_name,
                    signature=get_signature(node),
                    content=node_content,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=class_name,
                    type="class",
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=chunk.signature,
                )
                symbols.append(symbol)

        # Recurse into children
        # NOTE(review): recursion also descends into nodes already emitted
        # above, so functions nested inside a matched declaration are
        # chunked again separately -- confirm this is intended.
        for child in node.children:
            process_node(child, parent_class)

    # Process all nodes
    for child in tree.root_node.children:
        process_node(child)

    return chunks, symbols
|
|
652
|
+
|
|
653
|
+
def _tokenize(self, content: str) -> List[str]:
|
|
654
|
+
"""
|
|
655
|
+
Tokenize content for BM25 indexing.
|
|
656
|
+
|
|
657
|
+
Handles:
|
|
658
|
+
- snake_case splitting
|
|
659
|
+
- camelCase splitting
|
|
660
|
+
- Code-specific tokens
|
|
661
|
+
"""
|
|
662
|
+
# Split on whitespace and punctuation
|
|
663
|
+
words = re.findall(r'\b\w+\b', content.lower())
|
|
664
|
+
|
|
665
|
+
tokens = []
|
|
666
|
+
for word in words:
|
|
667
|
+
# Split snake_case
|
|
668
|
+
if "_" in word:
|
|
669
|
+
tokens.extend(word.split("_"))
|
|
670
|
+
# Split camelCase
|
|
671
|
+
elif any(c.isupper() for c in word[1:]):
|
|
672
|
+
parts = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)', word)
|
|
673
|
+
tokens.extend(p.lower() for p in parts)
|
|
674
|
+
else:
|
|
675
|
+
tokens.append(word)
|
|
676
|
+
|
|
677
|
+
# Filter very short tokens and common keywords
|
|
678
|
+
stop_words = {
|
|
679
|
+
"def", "class", "self", "return", "if", "else", "elif", "for",
|
|
680
|
+
"while", "try", "except", "finally", "with", "as", "import",
|
|
681
|
+
"from", "in", "is", "not", "and", "or", "true", "false", "none",
|
|
682
|
+
"function", "const", "let", "var", "this", "new", "async", "await",
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
return [t for t in tokens if len(t) > 2 and t not in stop_words]
|
|
686
|
+
|
|
687
|
+
def _extract_sql(self, tree, content: str, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
    """
    Extract chunks from SQL AST.

    Handles:
    - CREATE TABLE statements
    - CREATE VIEW statements
    - CREATE FUNCTION/PROCEDURE statements
    - CREATE INDEX statements
    - CREATE TRIGGER statements

    Dependencies (view -> tables, index/trigger -> table) are stored in
    SymbolInfo.imports. Falls back to a single module chunk when nothing
    is recognized.

    Args:
        tree: tree-sitter parse tree for the file.
        content: Full source text the tree was parsed from.
        rel_path: Project-relative path used to build chunk IDs.

    Returns:
        Tuple of (chunks, symbols).
    """
    chunks = []
    symbols = []

    def get_node_text(node) -> str:
        return content[node.start_byte:node.end_byte]

    def extract_identifier(node):
        """Extract identifier name from various node structures."""
        if node is None:
            return None

        # Direct identifier
        if node.type == "identifier":
            return get_node_text(node)

        # Object reference (schema.table)
        if node.type == "object_reference":
            parts = []
            for child in node.children:
                if child.type == "identifier":
                    parts.append(get_node_text(child))
            return ".".join(parts) if parts else None

        # Search children for identifier
        for child in node.children:
            if child.type == "identifier":
                return get_node_text(child)
            if child.type == "object_reference":
                return extract_identifier(child)

        return None

    def extract_table_refs(node) -> List[str]:
        """Extract table references from a statement (for views, functions).

        NOTE(review): returns list(set(...)), so the ordering of refs is
        not deterministic across runs.
        """
        refs = []

        def walk(n):
            if n.type in ("object_reference", "table_reference"):
                name = extract_identifier(n)
                if name:
                    refs.append(name)
            elif n.type == "identifier" and n.parent and n.parent.type in (
                "from_clause", "join_clause", "table_expression"
            ):
                refs.append(get_node_text(n))
            for child in n.children:
                walk(child)

        walk(node)
        return list(set(refs))

    def process_statement(node):
        """Process a SQL statement node."""
        node_type = node.type.lower()
        node_content = get_node_text(node)

        # CREATE TABLE
        if "create" in node_type and "table" in node_type:
            name = None
            # Find table name
            for child in node.children:
                if child.type in ("object_reference", "identifier"):
                    name = extract_identifier(child)
                    if name:
                        break

            if name:
                # Extract column names for signature
                columns = []
                for child in node.children:
                    if child.type == "column_definitions":
                        for col_def in child.children:
                            if col_def.type == "column_definition":
                                col_name = extract_identifier(col_def)
                                if col_name:
                                    columns.append(col_name)

                # Show at most five columns in the signature.
                signature = f"CREATE TABLE {name}"
                if columns:
                    signature += f" ({', '.join(columns[:5])}{'...' if len(columns) > 5 else ''})"

                chunk_id = f"{rel_path}:table:{name}"
                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type="table",
                    name=name,
                    signature=signature,
                    content=node_content,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=name,
                    type="table",
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=signature,
                )
                symbols.append(symbol)

        # CREATE VIEW
        elif "create" in node_type and "view" in node_type:
            name = None
            for child in node.children:
                if child.type in ("object_reference", "identifier"):
                    name = extract_identifier(child)
                    if name:
                        break

            if name:
                table_refs = extract_table_refs(node)
                signature = f"CREATE VIEW {name}"

                chunk_id = f"{rel_path}:view:{name}"
                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type="view",
                    name=name,
                    signature=signature,
                    content=node_content,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=name,
                    type="view",
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=signature,
                    imports=table_refs,  # Views depend on tables
                )
                symbols.append(symbol)

        # CREATE FUNCTION / CREATE PROCEDURE
        elif "create" in node_type and ("function" in node_type or "procedure" in node_type):
            name = None
            obj_type = "procedure" if "procedure" in node_type else "function"

            for child in node.children:
                if child.type in ("object_reference", "identifier", "function_name"):
                    name = extract_identifier(child)
                    if name:
                        break

            if name:
                table_refs = extract_table_refs(node)
                signature = f"CREATE {obj_type.upper()} {name}()"

                chunk_id = f"{rel_path}:{obj_type}:{name}"
                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type=obj_type,
                    name=name,
                    signature=signature,
                    content=node_content,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=name,
                    type=obj_type,
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=signature,
                    imports=table_refs,  # Functions/procedures reference tables
                )
                symbols.append(symbol)

        # CREATE INDEX
        elif "create" in node_type and "index" in node_type:
            index_name = None
            table_name = None

            # First identifier-ish child is the index name, second the table.
            for child in node.children:
                if child.type in ("object_reference", "identifier"):
                    if index_name is None:
                        index_name = extract_identifier(child)
                    else:
                        table_name = extract_identifier(child)
                        break

            if index_name:
                signature = f"CREATE INDEX {index_name}"
                if table_name:
                    signature += f" ON {table_name}"

                chunk_id = f"{rel_path}:index:{index_name}"
                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type="index",
                    name=index_name,
                    signature=signature,
                    content=node_content,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=index_name,
                    type="index",
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=signature,
                    imports=[table_name] if table_name else [],
                )
                symbols.append(symbol)

        # CREATE TRIGGER
        elif "create" in node_type and "trigger" in node_type:
            trigger_name = None
            table_name = None

            for child in node.children:
                if child.type in ("object_reference", "identifier"):
                    if trigger_name is None:
                        trigger_name = extract_identifier(child)
                    else:
                        table_name = extract_identifier(child)
                        break

            if trigger_name:
                signature = f"CREATE TRIGGER {trigger_name}"
                if table_name:
                    signature += f" ON {table_name}"

                chunk_id = f"{rel_path}:trigger:{trigger_name}"
                chunk = Chunk(
                    id=chunk_id,
                    file=rel_path,
                    type="trigger",
                    name=trigger_name,
                    signature=signature,
                    content=node_content,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    tokens=self._tokenize(node_content),
                )
                chunks.append(chunk)

                symbol = SymbolInfo(
                    id=chunk_id,
                    name=trigger_name,
                    type="trigger",
                    file=rel_path,
                    line=node.start_point[0] + 1,
                    signature=signature,
                    imports=[table_name] if table_name else [],
                )
                symbols.append(symbol)

    # Walk the AST and process statements
    def walk(node):
        node_type = node.type.lower()

        # Check if this is a CREATE statement
        # NOTE(review): when a generic "statement" node wraps a CREATE
        # child, process_statement sees type "statement", matches nothing,
        # and walk does not recurse into it -- confirm against the grammar.
        if "create" in node_type or node.type == "statement":
            process_statement(node)
        else:
            for child in node.children:
                walk(child)

    walk(tree.root_node)

    # If no chunks extracted, fall back to module chunk
    # NOTE(review): Path(rel_path) is resolved relative to the CWD, not the
    # original file's location -- may fail to re-read the file; verify.
    if not chunks:
        return self._chunk_as_module(Path(rel_path), rel_path)

    return chunks, symbols
|