tarang 4.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,984 @@
1
+ """
2
+ Code Chunker - AST-based code parsing using tree-sitter.
3
+
4
+ Extracts semantic chunks (functions, classes, methods) from source files
5
+ for efficient indexing and retrieval.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import re
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Tuple
14
+
15
+ # Tree-sitter imports (lazy loaded)
16
+ _ts_python = None
17
+ _ts_javascript = None
18
+ _ts_sql = None
19
+
20
+
21
+ def _wrap_language(lang_ptr, name: str):
22
+ """Wrap language pointer for tree-sitter 0.21+ compatibility."""
23
+ try:
24
+ from tree_sitter import Language
25
+ # New API: wrap PyCapsule with Language
26
+ return Language(lang_ptr)
27
+ except TypeError:
28
+ # Older API: Language expects (library_path, name) or already wrapped
29
+ return lang_ptr
30
+
31
+
32
def _get_python_language():
    """Return the cached Python grammar, loading it on first call.

    Returns None when the tree_sitter_python package is not installed;
    the cache stays unset so a later call can retry.
    """
    global _ts_python
    if _ts_python is not None:
        return _ts_python
    try:
        import tree_sitter_python as tspython
    except ImportError:
        return None
    _ts_python = _wrap_language(tspython.language(), "python")
    return _ts_python
42
+
43
+
44
def _get_javascript_language():
    """Return the cached JS/TS grammar, preferring the TSX parser.

    The TSX grammar handles .ts/.tsx/.js/.jsx sources; plain
    tree_sitter_javascript is the fallback. Returns None when neither
    grammar package is installed.
    """
    global _ts_javascript
    if _ts_javascript is not None:
        return _ts_javascript
    try:
        import tree_sitter_typescript as tsts
        _ts_javascript = _wrap_language(tsts.language_tsx(), "tsx")
        return _ts_javascript
    except ImportError:
        pass
    try:
        import tree_sitter_javascript as tsjs
    except ImportError:
        return None
    _ts_javascript = _wrap_language(tsjs.language(), "javascript")
    return _ts_javascript
59
+
60
+
61
def _get_sql_language():
    """Return the cached SQL grammar, loading it on first call.

    Returns None when the tree_sitter_sql package is not installed.
    """
    global _ts_sql
    if _ts_sql is not None:
        return _ts_sql
    try:
        import tree_sitter_sql as tssql
    except ImportError:
        return None
    _ts_sql = _wrap_language(tssql.language(), "sql")
    return _ts_sql
71
+
72
+
73
@dataclass
class Chunk:
    """A semantic code chunk (function, method, class, or module)."""

    id: str            # unique id, e.g. "file.py:function_name"
    file: str          # path relative to the project root
    type: str          # "function" | "method" | "class" | "module"
    name: str          # symbol name
    signature: str     # signature line of the function/class
    content: str       # full source text of the chunk
    line_start: int    # first line, 1-indexed
    line_end: int      # last line, 1-indexed
    tokens: List[str] = field(default_factory=list)  # BM25 tokens
    parent: Optional[str] = None  # enclosing class name for methods

    @property
    def hash(self) -> str:
        """First 16 hex chars of the content's SHA-256 (change detection)."""
        digest = hashlib.sha256(self.content.encode())
        return digest.hexdigest()[:16]

    def to_dict(self) -> Dict:
        """Serialize to a plain dictionary, including the content hash."""
        out = {key: getattr(self, key) for key in (
            "id", "file", "type", "name", "signature", "content",
            "line_start", "line_end", "tokens", "parent",
        )}
        out["hash"] = self.hash
        return out

    @classmethod
    def from_dict(cls, data: Dict) -> "Chunk":
        """Rebuild a Chunk from a dictionary produced by to_dict()."""
        return cls(
            id=data["id"],
            file=data["file"],
            type=data["type"],
            name=data["name"],
            signature=data["signature"],
            content=data["content"],
            line_start=data["line_start"],
            line_end=data["line_end"],
            tokens=data.get("tokens", []),
            parent=data.get("parent"),
        )
123
+
124
+
125
@dataclass
class SymbolInfo:
    """Metadata about one symbol, used for dependency-graph building."""

    id: str          # unique id, e.g. "file.py:function_name"
    name: str        # symbol name
    type: str        # "function" | "method" | "class"
    file: str        # file path
    line: int        # 1-indexed definition line
    signature: str   # signature line
    calls: List[str] = field(default_factory=list)    # names of callees
    imports: List[str] = field(default_factory=list)  # imported modules/deps
    parent_class: Optional[str] = None                # owning class for methods
137
+
138
+
139
class Chunker:
    """
    AST-based code chunker using tree-sitter.

    Extracts functions, classes, and methods as semantic chunks, with a
    whole-file "module" fallback for unsupported or unparseable files.
    """

    # Supported file extensions mapped to internal language keys.
    LANGUAGE_MAP = {
        # Python
        ".py": "python",
        ".pyw": "python",
        # JavaScript/TypeScript
        ".js": "javascript",
        ".jsx": "javascript",
        ".ts": "javascript",  # tree-sitter-javascript handles TS basics
        ".tsx": "javascript",
        ".mjs": "javascript",
        ".cjs": "javascript",
        # SQL
        ".sql": "sql",
    }

    # Max lines per chunk (the module fallback truncates beyond this).
    MAX_CHUNK_LINES = 200

    # Max file size to process (100KB); larger files are skipped.
    MAX_FILE_SIZE = 100 * 1024

    def __init__(self):
        # Cache of tree-sitter Parser instances keyed by language name.
        # Fix: annotation previously used the builtin `any` (a function,
        # not a type); `object` correctly expresses "any parser object".
        self._parsers: Dict[str, object] = {}
170
+
171
+ def _get_parser(self, language: str):
172
+ """Get or create parser for language."""
173
+ if language in self._parsers:
174
+ return self._parsers[language]
175
+
176
+ try:
177
+ from tree_sitter import Parser
178
+ except ImportError:
179
+ return None
180
+
181
+ lang = None
182
+ if language == "python":
183
+ lang = _get_python_language()
184
+ elif language in ("javascript", "typescript", "tsx", "jsx"):
185
+ lang = _get_javascript_language()
186
+ elif language == "sql":
187
+ lang = _get_sql_language()
188
+
189
+ if lang is None:
190
+ return None
191
+
192
+ # Create parser - handle both old and new tree-sitter API
193
+ parser = Parser()
194
+ try:
195
+ # New API (0.21+): set language via property
196
+ parser.language = lang
197
+ except AttributeError:
198
+ # Old API: pass language to constructor (already created above, need to recreate)
199
+ try:
200
+ parser = Parser(lang)
201
+ except TypeError:
202
+ return None
203
+
204
+ self._parsers[language] = parser
205
+ return parser
206
+
207
+ def chunk_file(self, file_path: Path, project_root: Path) -> Tuple[List[Chunk], List[SymbolInfo]]:
208
+ """
209
+ Parse a file and extract chunks and symbol info.
210
+
211
+ Args:
212
+ file_path: Absolute path to file
213
+ project_root: Project root for relative paths
214
+
215
+ Returns:
216
+ Tuple of (chunks, symbols)
217
+ """
218
+ # Get relative path
219
+ try:
220
+ rel_path = str(file_path.relative_to(project_root))
221
+ except ValueError:
222
+ rel_path = str(file_path)
223
+
224
+ # Check file size
225
+ try:
226
+ if file_path.stat().st_size > self.MAX_FILE_SIZE:
227
+ return [], []
228
+ except OSError:
229
+ return [], []
230
+
231
+ # Determine language
232
+ ext = file_path.suffix.lower()
233
+ language = self.LANGUAGE_MAP.get(ext)
234
+
235
+ if language is None:
236
+ # Return file as single module chunk for unsupported languages
237
+ return self._chunk_as_module(file_path, rel_path)
238
+
239
+ # Get parser
240
+ parser = self._get_parser(language)
241
+ if parser is None:
242
+ return self._chunk_as_module(file_path, rel_path)
243
+
244
+ # Read and parse
245
+ try:
246
+ content = file_path.read_text(encoding="utf-8", errors="replace")
247
+ except Exception:
248
+ return [], []
249
+
250
+ tree = parser.parse(content.encode("utf-8"))
251
+
252
+ # Extract based on language
253
+ if language == "python":
254
+ return self._extract_python(tree, content, rel_path)
255
+ elif language == "javascript":
256
+ return self._extract_javascript(tree, content, rel_path)
257
+ elif language == "sql":
258
+ return self._extract_sql(tree, content, rel_path)
259
+
260
+ return [], []
261
+
262
+ def _chunk_as_module(self, file_path: Path, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
263
+ """Treat entire file as a single module chunk."""
264
+ try:
265
+ content = file_path.read_text(encoding="utf-8", errors="replace")
266
+ except Exception:
267
+ return [], []
268
+
269
+ lines = content.splitlines()
270
+ if len(lines) > self.MAX_CHUNK_LINES:
271
+ content = "\n".join(lines[:self.MAX_CHUNK_LINES]) + "\n... (truncated)"
272
+
273
+ chunk = Chunk(
274
+ id=f"{rel_path}:module",
275
+ file=rel_path,
276
+ type="module",
277
+ name=Path(rel_path).stem,
278
+ signature=f"# {rel_path}",
279
+ content=content,
280
+ line_start=1,
281
+ line_end=len(lines),
282
+ tokens=self._tokenize(content),
283
+ )
284
+
285
+ symbol = SymbolInfo(
286
+ id=chunk.id,
287
+ name=chunk.name,
288
+ type="module",
289
+ file=rel_path,
290
+ line=1,
291
+ signature=chunk.signature,
292
+ )
293
+
294
+ return [chunk], [symbol]
295
+
296
    def _extract_python(self, tree, content: str, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
        """Extract chunks and symbols from a Python tree-sitter AST.

        Produces one Chunk per top-level function, per class (summarized
        without method bodies), and per method; SymbolInfo records carry
        call names and, for the module symbol, module-level imports.
        """
        chunks = []
        symbols = []
        # NOTE(review): `lines` is computed but never used in this method.
        lines = content.splitlines()

        def get_node_text(node) -> str:
            # Slice the original source by the node's byte span.
            return content[node.start_byte:node.end_byte]

        def get_signature(node) -> str:
            """Extract just the signature line."""
            text = get_node_text(node)
            first_line = text.split("\n")[0]
            # For functions, include up to the colon
            if ":" in first_line:
                return first_line.rstrip()
            return first_line

        def extract_calls(node) -> List[str]:
            """Extract function calls from a node."""
            calls = []

            def walk(n):
                if n.type == "call":
                    func = n.child_by_field_name("function")
                    if func:
                        call_name = get_node_text(func)
                        # Handle method calls: obj.method -> method
                        if "." in call_name:
                            call_name = call_name.split(".")[-1]
                        calls.append(call_name)
                for child in n.children:
                    walk(child)

            walk(node)
            return calls

        def extract_imports(node) -> List[str]:
            """Extract imports from module level."""
            imports = []

            def walk(n):
                if n.type == "import_statement":
                    # import foo, bar
                    for child in n.children:
                        if child.type == "dotted_name":
                            imports.append(get_node_text(child))
                elif n.type == "import_from_statement":
                    # from foo import bar
                    module = n.child_by_field_name("module_name")
                    if module:
                        imports.append(get_node_text(module))
                # Skip function/class bodies so only module-level (and
                # other non-definition-scoped) imports are collected.
                for child in n.children:
                    if child.type not in ("function_definition", "class_definition"):
                        walk(child)

            walk(node)
            return imports

        # First pass: extract module-level imports
        module_imports = extract_imports(tree.root_node)

        # Process top-level nodes
        # NOTE(review): `current_class` is assigned but never used;
        # nesting is tracked via the `parent_class` parameter instead.
        current_class = None

        def process_node(node, parent_class=None):
            nonlocal chunks, symbols

            if node.type == "function_definition":
                name_node = node.child_by_field_name("name")
                if name_node:
                    name = get_node_text(name_node)
                    node_content = get_node_text(node)

                    # Methods are namespaced under their class in the id.
                    chunk_type = "method" if parent_class else "function"
                    chunk_id = f"{rel_path}:{parent_class}.{name}" if parent_class else f"{rel_path}:{name}"

                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type=chunk_type,
                        name=name,
                        signature=get_signature(node),
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                        parent=parent_class,
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=name,
                        type=chunk_type,
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=chunk.signature,
                        calls=extract_calls(node),
                        parent_class=parent_class,
                    )
                    symbols.append(symbol)

            elif node.type == "class_definition":
                name_node = node.child_by_field_name("name")
                if name_node:
                    class_name = get_node_text(name_node)
                    node_content = get_node_text(node)

                    # Extract class signature (just the class line)
                    class_sig = get_signature(node)

                    # Create class chunk (without method bodies for summary)
                    chunk_id = f"{rel_path}:{class_name}"

                    # Get just class definition without full method bodies
                    class_summary = self._get_class_summary(node, content)

                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type="class",
                        name=class_name,
                        signature=class_sig,
                        content=class_summary,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(class_summary),
                    )
                    chunks.append(chunk)

                    # Extract parent classes
                    superclasses = []
                    args = node.child_by_field_name("superclasses")
                    if args:
                        for arg in args.children:
                            if arg.type == "identifier":
                                superclasses.append(get_node_text(arg))

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=class_name,
                        type="class",
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=class_sig,
                        imports=superclasses,  # Reuse imports for inheritance
                    )
                    symbols.append(symbol)

                    # Process methods inside class
                    body = node.child_by_field_name("body")
                    if body:
                        for child in body.children:
                            process_node(child, parent_class=class_name)

        # Process all top-level nodes
        for child in tree.root_node.children:
            process_node(child)

        # Add module-level symbol with imports
        if module_imports:
            module_symbol = SymbolInfo(
                id=f"{rel_path}:module",
                name=Path(rel_path).stem,
                type="module",
                file=rel_path,
                line=1,
                signature=f"# {rel_path}",
                imports=module_imports,
            )
            symbols.append(module_symbol)

        return chunks, symbols
470
+
471
    def _get_class_summary(self, class_node, content: str) -> str:
        """Get class with method signatures only (not full bodies).

        Builds a compact textual summary: the class definition line, the
        class docstring (if the first body statement is a string), then
        each method's first line followed by an elided "..." body.
        """
        lines = []

        def get_node_text(node) -> str:
            # Slice the original source by the node's byte span.
            return content[node.start_byte:node.end_byte]

        # Get class definition line
        first_line = get_node_text(class_node).split("\n")[0]
        lines.append(first_line)

        # Get docstring if present
        body = class_node.child_by_field_name("body")
        if body and body.children:
            first_child = body.children[0]
            if first_child.type == "expression_statement":
                expr = first_child.children[0] if first_child.children else None
                if expr and expr.type == "string":
                    docstring = get_node_text(expr)
                    # Indent docstring
                    for doc_line in docstring.split("\n"):
                        lines.append("    " + doc_line)

        # Get method signatures
        if body:
            for child in body.children:
                if child.type == "function_definition":
                    sig = get_node_text(child).split("\n")[0]
                    lines.append("    " + sig)
                    lines.append("        ...")

        return "\n".join(lines)
503
+
504
    def _extract_javascript(self, tree, content: str, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
        """Extract chunks and symbols from a JavaScript/TypeScript AST.

        Captures function declarations, arrow functions bound via
        const/let declarations, and class declarations. Recursion at the
        end of process_node means nested definitions are captured too.
        """
        chunks = []
        symbols = []

        def get_node_text(node) -> str:
            # Slice the original source by the node's byte span.
            return content[node.start_byte:node.end_byte]

        def get_signature(node) -> str:
            """Extract just the signature line."""
            text = get_node_text(node)
            first_line = text.split("\n")[0]
            # Truncate at opening brace
            if "{" in first_line:
                return first_line[:first_line.index("{")].strip() + " {"
            return first_line

        def extract_calls(node) -> List[str]:
            """Extract function calls."""
            calls = []

            def walk(n):
                if n.type == "call_expression":
                    func = n.child_by_field_name("function")
                    if func:
                        call_name = get_node_text(func)
                        # obj.method() -> method
                        if "." in call_name:
                            call_name = call_name.split(".")[-1]
                        calls.append(call_name)
                for child in n.children:
                    walk(child)

            walk(node)
            return calls

        def process_node(node, parent_class=None):
            nonlocal chunks, symbols

            # Function declarations
            if node.type in ("function_declaration", "function"):
                name_node = node.child_by_field_name("name")
                if name_node:
                    name = get_node_text(name_node)
                    node_content = get_node_text(node)

                    chunk_id = f"{rel_path}:{name}"

                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type="function",
                        name=name,
                        signature=get_signature(node),
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=name,
                        type="function",
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=chunk.signature,
                        calls=extract_calls(node),
                    )
                    symbols.append(symbol)

            # Arrow functions assigned to variables
            elif node.type == "lexical_declaration":
                for decl in node.children:
                    if decl.type == "variable_declarator":
                        name_node = decl.child_by_field_name("name")
                        value_node = decl.child_by_field_name("value")
                        if name_node and value_node and value_node.type == "arrow_function":
                            name = get_node_text(name_node)
                            # Chunk content is the whole declaration, so
                            # the `const name = ` prefix is preserved.
                            node_content = get_node_text(node)

                            chunk_id = f"{rel_path}:{name}"

                            chunk = Chunk(
                                id=chunk_id,
                                file=rel_path,
                                type="function",
                                name=name,
                                signature=get_signature(node),
                                content=node_content,
                                line_start=node.start_point[0] + 1,
                                line_end=node.end_point[0] + 1,
                                tokens=self._tokenize(node_content),
                            )
                            chunks.append(chunk)

                            symbol = SymbolInfo(
                                id=chunk_id,
                                name=name,
                                type="function",
                                file=rel_path,
                                line=node.start_point[0] + 1,
                                signature=chunk.signature,
                                # Calls are collected from the arrow body only.
                                calls=extract_calls(value_node),
                            )
                            symbols.append(symbol)

            # Class declarations
            elif node.type == "class_declaration":
                name_node = node.child_by_field_name("name")
                if name_node:
                    class_name = get_node_text(name_node)
                    node_content = get_node_text(node)

                    chunk_id = f"{rel_path}:{class_name}"

                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type="class",
                        name=class_name,
                        signature=get_signature(node),
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=class_name,
                        type="class",
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=chunk.signature,
                    )
                    symbols.append(symbol)

            # Recurse into children
            for child in node.children:
                process_node(child, parent_class)

        # Process all nodes
        for child in tree.root_node.children:
            process_node(child)

        return chunks, symbols
652
+
653
+ def _tokenize(self, content: str) -> List[str]:
654
+ """
655
+ Tokenize content for BM25 indexing.
656
+
657
+ Handles:
658
+ - snake_case splitting
659
+ - camelCase splitting
660
+ - Code-specific tokens
661
+ """
662
+ # Split on whitespace and punctuation
663
+ words = re.findall(r'\b\w+\b', content.lower())
664
+
665
+ tokens = []
666
+ for word in words:
667
+ # Split snake_case
668
+ if "_" in word:
669
+ tokens.extend(word.split("_"))
670
+ # Split camelCase
671
+ elif any(c.isupper() for c in word[1:]):
672
+ parts = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)', word)
673
+ tokens.extend(p.lower() for p in parts)
674
+ else:
675
+ tokens.append(word)
676
+
677
+ # Filter very short tokens and common keywords
678
+ stop_words = {
679
+ "def", "class", "self", "return", "if", "else", "elif", "for",
680
+ "while", "try", "except", "finally", "with", "as", "import",
681
+ "from", "in", "is", "not", "and", "or", "true", "false", "none",
682
+ "function", "const", "let", "var", "this", "new", "async", "await",
683
+ }
684
+
685
+ return [t for t in tokens if len(t) > 2 and t not in stop_words]
686
+
687
    def _extract_sql(self, tree, content: str, rel_path: str) -> Tuple[List[Chunk], List[SymbolInfo]]:
        """
        Extract chunks from SQL AST.

        Handles:
        - CREATE TABLE statements
        - CREATE VIEW statements
        - CREATE FUNCTION/PROCEDURE statements
        - CREATE INDEX statements
        - CREATE TRIGGER statements

        Referenced tables are recorded in SymbolInfo.imports so the
        dependency graph can link views/functions/indexes/triggers to
        the tables they touch.
        """
        chunks = []
        symbols = []

        def get_node_text(node) -> str:
            # Slice the original source by the node's byte span.
            return content[node.start_byte:node.end_byte]

        def extract_identifier(node):
            """Extract identifier name from various node structures."""
            if node is None:
                return None

            # Direct identifier
            if node.type == "identifier":
                return get_node_text(node)

            # Object reference (schema.table)
            if node.type == "object_reference":
                parts = []
                for child in node.children:
                    if child.type == "identifier":
                        parts.append(get_node_text(child))
                return ".".join(parts) if parts else None

            # Search children for identifier
            for child in node.children:
                if child.type == "identifier":
                    return get_node_text(child)
                if child.type == "object_reference":
                    return extract_identifier(child)

            return None

        def extract_table_refs(node) -> List[str]:
            """Extract table references from a statement (for views, functions)."""
            refs = []

            def walk(n):
                if n.type in ("object_reference", "table_reference"):
                    name = extract_identifier(n)
                    if name:
                        refs.append(name)
                elif n.type == "identifier" and n.parent and n.parent.type in (
                    "from_clause", "join_clause", "table_expression"
                ):
                    refs.append(get_node_text(n))
                for child in n.children:
                    walk(child)

            walk(node)
            # De-duplicated; note the resulting order is unspecified.
            return list(set(refs))

        def process_statement(node):
            """Process a SQL statement node."""
            # Matching is substring-based on the node type name, so it
            # tolerates grammar variations like "create_table_statement".
            node_type = node.type.lower()
            node_content = get_node_text(node)

            # CREATE TABLE
            if "create" in node_type and "table" in node_type:
                name = None
                # Find table name
                for child in node.children:
                    if child.type in ("object_reference", "identifier"):
                        name = extract_identifier(child)
                        if name:
                            break

                if name:
                    # Extract column names for signature
                    columns = []
                    for child in node.children:
                        if child.type == "column_definitions":
                            for col_def in child.children:
                                if col_def.type == "column_definition":
                                    col_name = extract_identifier(col_def)
                                    if col_name:
                                        columns.append(col_name)

                    # Show at most the first 5 columns in the signature.
                    signature = f"CREATE TABLE {name}"
                    if columns:
                        signature += f" ({', '.join(columns[:5])}{'...' if len(columns) > 5 else ''})"

                    chunk_id = f"{rel_path}:table:{name}"
                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type="table",
                        name=name,
                        signature=signature,
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=name,
                        type="table",
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=signature,
                    )
                    symbols.append(symbol)

            # CREATE VIEW
            elif "create" in node_type and "view" in node_type:
                name = None
                for child in node.children:
                    if child.type in ("object_reference", "identifier"):
                        name = extract_identifier(child)
                        if name:
                            break

                if name:
                    table_refs = extract_table_refs(node)
                    signature = f"CREATE VIEW {name}"

                    chunk_id = f"{rel_path}:view:{name}"
                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type="view",
                        name=name,
                        signature=signature,
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=name,
                        type="view",
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=signature,
                        imports=table_refs,  # Views depend on tables
                    )
                    symbols.append(symbol)

            # CREATE FUNCTION / CREATE PROCEDURE
            elif "create" in node_type and ("function" in node_type or "procedure" in node_type):
                name = None
                obj_type = "procedure" if "procedure" in node_type else "function"

                for child in node.children:
                    if child.type in ("object_reference", "identifier", "function_name"):
                        name = extract_identifier(child)
                        if name:
                            break

                if name:
                    table_refs = extract_table_refs(node)
                    signature = f"CREATE {obj_type.upper()} {name}()"

                    chunk_id = f"{rel_path}:{obj_type}:{name}"
                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type=obj_type,
                        name=name,
                        signature=signature,
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=name,
                        type=obj_type,
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=signature,
                        imports=table_refs,  # Functions/procedures reference tables
                    )
                    symbols.append(symbol)

            # CREATE INDEX
            elif "create" in node_type and "index" in node_type:
                index_name = None
                table_name = None

                # Positional assumption: first identifier is the index
                # name, the next one is the indexed table.
                for child in node.children:
                    if child.type in ("object_reference", "identifier"):
                        if index_name is None:
                            index_name = extract_identifier(child)
                        else:
                            table_name = extract_identifier(child)
                            break

                if index_name:
                    signature = f"CREATE INDEX {index_name}"
                    if table_name:
                        signature += f" ON {table_name}"

                    chunk_id = f"{rel_path}:index:{index_name}"
                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type="index",
                        name=index_name,
                        signature=signature,
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=index_name,
                        type="index",
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=signature,
                        imports=[table_name] if table_name else [],
                    )
                    symbols.append(symbol)

            # CREATE TRIGGER
            elif "create" in node_type and "trigger" in node_type:
                trigger_name = None
                table_name = None

                # Same positional assumption as CREATE INDEX above.
                for child in node.children:
                    if child.type in ("object_reference", "identifier"):
                        if trigger_name is None:
                            trigger_name = extract_identifier(child)
                        else:
                            table_name = extract_identifier(child)
                            break

                if trigger_name:
                    signature = f"CREATE TRIGGER {trigger_name}"
                    if table_name:
                        signature += f" ON {table_name}"

                    chunk_id = f"{rel_path}:trigger:{trigger_name}"
                    chunk = Chunk(
                        id=chunk_id,
                        file=rel_path,
                        type="trigger",
                        name=trigger_name,
                        signature=signature,
                        content=node_content,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        tokens=self._tokenize(node_content),
                    )
                    chunks.append(chunk)

                    symbol = SymbolInfo(
                        id=chunk_id,
                        name=trigger_name,
                        type="trigger",
                        file=rel_path,
                        line=node.start_point[0] + 1,
                        signature=signature,
                        imports=[table_name] if table_name else [],
                    )
                    symbols.append(symbol)

        # Walk the AST and process statements
        def walk(node):
            node_type = node.type.lower()

            # Check if this is a CREATE statement
            if "create" in node_type or node.type == "statement":
                process_statement(node)
            else:
                for child in node.children:
                    walk(child)

        walk(tree.root_node)

        # If no chunks extracted, fall back to module chunk
        # NOTE(review): the fallback re-reads the file via the *relative*
        # path, which only resolves if cwd is the project root — confirm.
        if not chunks:
            return self._chunk_as_module(Path(rel_path), rel_path)

        return chunks, symbols