codebase-intel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. codebase_intel/__init__.py +3 -0
  2. codebase_intel/analytics/__init__.py +1 -0
  3. codebase_intel/analytics/benchmark.py +406 -0
  4. codebase_intel/analytics/feedback.py +496 -0
  5. codebase_intel/analytics/tracker.py +439 -0
  6. codebase_intel/cli/__init__.py +1 -0
  7. codebase_intel/cli/main.py +740 -0
  8. codebase_intel/contracts/__init__.py +1 -0
  9. codebase_intel/contracts/auto_generator.py +438 -0
  10. codebase_intel/contracts/evaluator.py +531 -0
  11. codebase_intel/contracts/models.py +433 -0
  12. codebase_intel/contracts/registry.py +225 -0
  13. codebase_intel/core/__init__.py +1 -0
  14. codebase_intel/core/config.py +248 -0
  15. codebase_intel/core/exceptions.py +454 -0
  16. codebase_intel/core/types.py +375 -0
  17. codebase_intel/decisions/__init__.py +1 -0
  18. codebase_intel/decisions/miner.py +297 -0
  19. codebase_intel/decisions/models.py +302 -0
  20. codebase_intel/decisions/store.py +411 -0
  21. codebase_intel/drift/__init__.py +1 -0
  22. codebase_intel/drift/detector.py +443 -0
  23. codebase_intel/graph/__init__.py +1 -0
  24. codebase_intel/graph/builder.py +391 -0
  25. codebase_intel/graph/parser.py +1232 -0
  26. codebase_intel/graph/query.py +377 -0
  27. codebase_intel/graph/storage.py +736 -0
  28. codebase_intel/mcp/__init__.py +1 -0
  29. codebase_intel/mcp/server.py +710 -0
  30. codebase_intel/orchestrator/__init__.py +1 -0
  31. codebase_intel/orchestrator/assembler.py +649 -0
  32. codebase_intel-0.1.0.dist-info/METADATA +361 -0
  33. codebase_intel-0.1.0.dist-info/RECORD +36 -0
  34. codebase_intel-0.1.0.dist-info/WHEEL +4 -0
  35. codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
  36. codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1232 @@
1
+ """Tree-sitter based source code parser — extracts nodes and edges from files.
2
+
3
+ This is the most edge-case-heavy module in the system. Source code is wild:
4
+ people do bizarre things with imports, naming, and structure.
5
+
6
+ Edge cases handled:
7
+ - Binary files disguised as source (UTF-8 decode fails): skip gracefully
8
+ - Mixed encodings in a file: try UTF-8, fall back to latin-1, then skip
9
+ - Syntax errors in user code: tree-sitter still produces a partial AST,
10
+ we extract what we can and flag errors
11
+ - Generated code: detected via header markers, tagged is_generated=True
12
+ - Huge files (>1MB): skip entirely, log warning
13
+ - Empty files: valid — produce MODULE node with no children
14
+ - Circular imports: not our problem at parse time (graph handles it)
15
+ - Dynamic imports: `importlib.import_module("x")`, `__import__("x")`,
16
+ `require(variable)` — extract target string if it's a literal, flag
17
+ as dynamic_import with lower confidence if it's a variable
18
+ - Conditional imports: `if TYPE_CHECKING:`, `try/except ImportError:` —
19
+ tagged as type_only or optional
20
+ - Re-exports: barrel files that `from x import *` or explicit re-export
21
+ - Star imports: `from x import *` — edge exists but target is the module,
22
+ not specific symbols (can't resolve without runtime)
23
+ - Relative imports: `from . import x`, `from ..utils import y` — resolved
24
+ relative to file position in the project
25
+ - Decorator detection: @app.route, @router.get — used to identify endpoints
26
+ - Async vs sync: tracked in metadata for quality contracts
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import logging
32
+ from pathlib import Path
33
+ from typing import TYPE_CHECKING
34
+
35
+ import xxhash
36
+
37
+ from codebase_intel.core.exceptions import ErrorContext, ParseError, UnsupportedLanguageError
38
+ from codebase_intel.core.types import (
39
+ EdgeKind,
40
+ GraphEdge,
41
+ GraphNode,
42
+ Language,
43
+ LineRange,
44
+ NodeKind,
45
+ )
46
+
47
+ if TYPE_CHECKING:
48
+ from codebase_intel.core.config import ParserConfig
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
# File extension → Language mapping (19 languages).
# Keys are lower-case suffixes including the leading dot; several suffixes
# may map to the same Language (e.g. ".py" and ".pyi").
EXTENSION_MAP: dict[str, Language] = {
    ".py": Language.PYTHON,
    ".pyi": Language.PYTHON,
    ".js": Language.JAVASCRIPT,
    ".mjs": Language.JAVASCRIPT,
    ".cjs": Language.JAVASCRIPT,
    ".jsx": Language.JAVASCRIPT,
    ".ts": Language.TYPESCRIPT,
    ".tsx": Language.TSX,
    ".go": Language.GO,
    ".rs": Language.RUST,
    ".java": Language.JAVA,
    ".rb": Language.RUBY,
    ".c": Language.C,
    ".h": Language.C,
    ".cpp": Language.CPP,
    ".cc": Language.CPP,
    ".cxx": Language.CPP,
    ".hpp": Language.CPP,
    ".cs": Language.CSHARP,
    ".php": Language.PHP,
    ".swift": Language.SWIFT,
    ".kt": Language.KOTLIN,
    ".kts": Language.KOTLIN,
    ".scala": Language.SCALA,
    ".lua": Language.LUA,
    ".dart": Language.DART,
    ".ex": Language.ELIXIR,
    ".exs": Language.ELIXIR,
    ".hs": Language.HASKELL,
}
84
+
85
# tree-sitter-language-pack grammar names.
# Maps each Language to the grammar-name string handed to the parser factory;
# a Language missing from this table has no tree-sitter grammar configured.
LANGUAGE_GRAMMAR_MAP: dict[Language, str] = {
    Language.PYTHON: "python",
    Language.JAVASCRIPT: "javascript",
    Language.TYPESCRIPT: "typescript",
    Language.TSX: "tsx",
    Language.GO: "go",
    Language.RUST: "rust",
    Language.JAVA: "java",
    Language.RUBY: "ruby",
    Language.C: "c",
    Language.CPP: "cpp",
    Language.CSHARP: "c_sharp",
    Language.PHP: "php",
    Language.SWIFT: "swift",
    Language.KOTLIN: "kotlin",
    Language.SCALA: "scala",
    Language.LUA: "lua",
    Language.DART: "dart",
    Language.ELIXIR: "elixir",
    Language.HASKELL: "haskell",
}
107
+
108
+
109
def detect_language(file_path: Path) -> Language:
    """Return the Language registered for ``file_path``'s extension.

    The suffix is lower-cased before the lookup, so ``.PY`` and ``.py`` are
    treated alike. Extensions that are not present in ``EXTENSION_MAP``
    yield ``Language.UNKNOWN``.

    Notes:
        - ``.tsx`` maps to TSX rather than TypeScript (separate grammar).
        - ``.mjs`` / ``.cjs`` map to JavaScript.
        - ``.pyi`` stubs map to Python.
    """
    suffix = file_path.suffix.lower()
    if suffix in EXTENSION_MAP:
        return EXTENSION_MAP[suffix]
    return Language.UNKNOWN
117
+
118
+
119
def compute_file_hash(content: bytes) -> str:
    """Return the hexadecimal xxh64 digest of ``content``.

    The digest is used as a content-addressable fingerprint of a file's
    raw bytes.
    """
    hasher = xxhash.xxh64(content)
    return hasher.hexdigest()
122
+
123
+
124
class ParseResult:
    """Everything produced while parsing one file.

    Holds the extracted graph nodes and edges, human-readable warnings,
    and a few file-level facts (hash, size, generated flag, syntax-error
    flag). All collections start empty and all flags start false; the
    parser fills them in after construction.
    """

    def __init__(self, file_path: Path, language: Language) -> None:
        # Identity of the parsed file.
        self.file_path = file_path
        self.language = language

        # Extraction output.
        self.nodes: list[GraphNode] = []
        self.edges: list[GraphEdge] = []
        self.warnings: list[str] = []

        # File-level facts, populated by the parser.
        self.content_hash: str = ""
        self.size_bytes: int = 0
        self.is_generated: bool = False
        self.had_syntax_errors: bool = False

    @property
    def module_node_id(self) -> str:
        """ID of this file's MODULE node, derived from the path and its stem."""
        stem = self.file_path.stem
        return GraphNode.make_id(self.file_path, NodeKind.MODULE, stem)
142
+
143
+
144
+ class FileParser:
145
+ """Parses source files into graph nodes and edges.
146
+
147
+ Uses tree-sitter for language-aware AST parsing. Falls back to
148
+ regex-based extraction for unsupported languages (limited but
149
+ better than nothing).
150
+ """
151
+
152
+ def __init__(self, config: ParserConfig, project_root: Path) -> None:
153
+ self._config = config
154
+ self._project_root = project_root
155
+ self._grammars: dict[Language, object] = {}
156
+
157
+ def _is_ignored(self, file_path: Path) -> bool:
158
+ """Check if file matches any ignore pattern.
159
+
160
+ Edge case: patterns are relative to project root.
161
+ Edge case: symlinks — we resolve before checking to avoid
162
+ processing the same file twice via different paths.
163
+ """
164
+ import fnmatch
165
+
166
+ try:
167
+ rel = file_path.resolve().relative_to(self._project_root)
168
+ except ValueError:
169
+ return True # Outside project root — skip
170
+
171
+ rel_str = str(rel)
172
+ return any(
173
+ fnmatch.fnmatch(rel_str, pattern)
174
+ for pattern in self._config.ignored_patterns
175
+ )
176
+
177
+ def _is_generated(self, content: str) -> bool:
178
+ """Check if file is generated code by examining its header.
179
+
180
+ Edge case: some generated files put the marker on line 3 (after
181
+ a shebang and encoding declaration). We check first 5 lines.
182
+ """
183
+ header = "\n".join(content.split("\n")[:5]).lower()
184
+ return any(marker.lower() in header for marker in self._config.generated_markers)
185
+
186
+ def _read_file_safe(self, file_path: Path) -> tuple[bytes, str] | None:
187
+ """Read file with encoding fallback.
188
+
189
+ Edge cases:
190
+ - Binary file: UTF-8 decode fails, latin-1 produces garbage but doesn't crash
191
+ - Mixed encoding: some lines UTF-8, some not — we get partial content
192
+ - Null bytes in file: strong indicator of binary, skip
193
+ - Symlink to file outside project: resolve and check
194
+
195
+ Returns (raw_bytes, decoded_text) or None if unreadable.
196
+ """
197
+ try:
198
+ raw = file_path.read_bytes()
199
+ except (OSError, PermissionError) as exc:
200
+ logger.warning("Cannot read %s: %s", file_path, exc)
201
+ return None
202
+
203
+ # Size check
204
+ if len(raw) > self._config.max_file_size_bytes:
205
+ logger.info(
206
+ "Skipping %s: %d bytes exceeds limit %d",
207
+ file_path,
208
+ len(raw),
209
+ self._config.max_file_size_bytes,
210
+ )
211
+ return None
212
+
213
+ # Binary detection: null bytes in first 8KB
214
+ if b"\x00" in raw[:8192]:
215
+ logger.debug("Skipping binary file: %s", file_path)
216
+ return None
217
+
218
+ # Decode with fallback
219
+ try:
220
+ text = raw.decode("utf-8")
221
+ except UnicodeDecodeError:
222
+ try:
223
+ text = raw.decode("latin-1")
224
+ logger.debug("Fell back to latin-1 for %s", file_path)
225
+ except UnicodeDecodeError:
226
+ logger.warning("Cannot decode %s with any encoding", file_path)
227
+ return None
228
+
229
+ return raw, text
230
+
231
async def parse_file(self, file_path: Path) -> ParseResult | None:
    """Parse one file into a ``ParseResult``.

    This is the main entry point. It returns ``None`` when the file is
    ignored or cannot be read. Otherwise it records the content hash,
    size and generated flag, creates the file-level MODULE node, and
    hands the text to the extractor for the detected language.
    """
    if self._is_ignored(file_path):
        return None

    language = detect_language(file_path)
    result = ParseResult(file_path, language)

    payload = self._read_file_safe(file_path)
    if payload is None:
        return None
    raw_bytes, text = payload

    result.content_hash = compute_file_hash(raw_bytes)
    result.size_bytes = len(raw_bytes)
    result.is_generated = self._is_generated(text)

    # Every file gets a MODULE node spanning all of its lines.
    is_test = self._is_test_file(file_path)
    last_line = max(1, text.count("\n") + 1)
    result.nodes.append(
        GraphNode(
            node_id=result.module_node_id,
            kind=NodeKind.MODULE,
            name=file_path.stem,
            qualified_name=self._qualified_module_name(file_path),
            file_path=file_path,
            line_range=LineRange(start=1, end=last_line),
            language=language,
            content_hash=result.content_hash,
            is_generated=result.is_generated,
            is_test=is_test,
            is_entry_point=self._is_entry_point(file_path, text),
        )
    )

    # Dispatch to the language-specific extractor.
    js_family = (Language.JAVASCRIPT, Language.TYPESCRIPT, Language.TSX)
    if language == Language.PYTHON:
        await self._extract_python(text, file_path, result)
    elif language in js_family:
        await self._extract_javascript_family(text, file_path, result)
    elif language == Language.UNKNOWN:
        result.warnings.append(f"No parser for {file_path.suffix}")
    elif language in LANGUAGE_GRAMMAR_MAP:
        # Remaining supported languages get basic structure via tree-sitter.
        await self._extract_generic(text, file_path, language, result)
    else:
        result.warnings.append(f"Language {language.value} not in enabled_languages")

    return result
287
+
288
+ def _is_test_file(self, file_path: Path) -> bool:
289
+ """Detect if a file is a test file.
290
+
291
+ Edge cases:
292
+ - test_*.py, *_test.py, *_spec.ts, *.test.js — all common patterns
293
+ - Files inside tests/, __tests__/, spec/ directories
294
+ - conftest.py is test infrastructure, not a test itself (still flagged)
295
+ """
296
+ name = file_path.stem.lower()
297
+ parts = [p.lower() for p in file_path.parts]
298
+
299
+ is_test_name = (
300
+ name.startswith("test_")
301
+ or name.endswith("_test")
302
+ or name.endswith("_spec")
303
+ or name.endswith(".test")
304
+ or name.endswith(".spec")
305
+ or name == "conftest"
306
+ )
307
+ is_test_dir = any(
308
+ p in ("tests", "test", "__tests__", "spec", "specs") for p in parts
309
+ )
310
+ return is_test_name or is_test_dir
311
+
312
+ def _is_entry_point(self, file_path: Path, content: str) -> bool:
313
+ """Detect if a file is an application entry point.
314
+
315
+ Edge cases:
316
+ - Python: `if __name__ == "__main__"`, main.py, app.py, manage.py
317
+ - JS/TS: package.json "main" field (not detectable here — handled in builder)
318
+ - Multiple entry points: CLI, web server, worker — all valid
319
+ """
320
+ name = file_path.stem.lower()
321
+ if name in ("main", "app", "manage", "server", "worker", "cli"):
322
+ return True
323
+ if '__name__' in content and '__main__' in content:
324
+ return True
325
+ return False
326
+
327
+ def _qualified_module_name(self, file_path: Path) -> str:
328
+ """Build a Python-style qualified module name from file path.
329
+
330
+ Edge case: __init__.py represents the package, not a module named "init".
331
+ Edge case: files outside src/ — use path from project root.
332
+ """
333
+ try:
334
+ rel = file_path.resolve().relative_to(self._project_root)
335
+ except ValueError:
336
+ return file_path.stem
337
+
338
+ parts = list(rel.with_suffix("").parts)
339
+
340
+ # Remove common source directories from the qualified name
341
+ for prefix in ("src", "lib", "app"):
342
+ if parts and parts[0] == prefix:
343
+ parts = parts[1:]
344
+ break
345
+
346
+ # __init__ represents the package
347
+ if parts and parts[-1] == "__init__":
348
+ parts = parts[:-1]
349
+
350
+ # index.ts/js represents the directory
351
+ if parts and parts[-1] in ("index", "mod"):
352
+ parts = parts[:-1]
353
+
354
+ return ".".join(parts) if parts else file_path.stem
355
+
356
+ # -------------------------------------------------------------------
357
+ # Python extraction
358
+ # -------------------------------------------------------------------
359
+
360
async def _extract_python(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Extract Python nodes and edges from ``source`` using tree-sitter.

    Only the direct children of the module are visited:
        - ``import`` / ``from … import`` statements become import edges;
        - ``if`` statements whose condition text mentions
          ``TYPE_CHECKING`` have the imports in their body recorded as
          type-only edges;
        - class definitions become nodes plus a DEPENDS_ON edge from the
          module;
        - function definitions become nodes;
        - expression statements / assignments are passed to the
          assignment handler (``__all__`` re-exports).

    Decorated classes and functions appear in the tree-sitter Python
    grammar as a ``decorated_definition`` wrapper whose ``definition``
    field holds the real node; the wrapper is unwrapped so decorated
    definitions are not silently dropped.
    NOTE(review): decorator-based metadata still depends on how
    ``_has_decorator`` locates decorators for the inner node — confirm.

    If ``tree_sitter_language_pack`` is not installed a warning is
    recorded and the regex-based extractor is used instead. Syntax
    errors do not abort extraction; they set ``had_syntax_errors`` and
    add a warning.
    """
    try:
        from tree_sitter_language_pack import get_parser
    except ImportError:
        result.warnings.append("tree-sitter-language-pack not installed")
        await self._extract_python_regex(source, file_path, result)
        return

    parser = get_parser("python")
    tree = parser.parse(source.encode("utf-8"))

    if tree.root_node.has_error:
        result.had_syntax_errors = True
        result.warnings.append(f"Syntax errors in {file_path} — partial extraction")

    module_id = result.module_node_id
    import_types = ("import_statement", "import_from_statement")

    for child in tree.root_node.children:
        # Unwrap `@decorator` wrappers to reach the class/function node.
        if child.type == "decorated_definition":
            inner = child.child_by_field_name("definition")
            if inner is None:
                continue
            child = inner

        node_type = child.type

        # --- Imports ---
        if node_type in import_types:
            result.edges.extend(
                self._parse_python_import(child, source, file_path, module_id)
            )

        # --- If TYPE_CHECKING block ---
        elif node_type == "if_statement":
            condition_text = self._node_text(
                child.child_by_field_name("condition"), source
            )
            if condition_text and "TYPE_CHECKING" in condition_text:
                # Imports inside this block are type-only.
                body = child.child_by_field_name("consequence")
                if body:
                    for stmt in body.children:
                        if stmt.type in import_types:
                            result.edges.extend(
                                self._parse_python_import(
                                    stmt, source, file_path, module_id,
                                    is_type_only=True,
                                )
                            )

        # --- Classes ---
        elif node_type == "class_definition":
            class_node = self._parse_python_class(child, source, file_path, result)
            if class_node:
                result.nodes.append(class_node)
                result.edges.append(GraphEdge(
                    source_id=module_id,
                    target_id=class_node.node_id,
                    kind=EdgeKind.DEPENDS_ON,
                ))

        # --- Functions ---
        elif node_type == "function_definition":
            func_node = self._parse_python_function(
                child, source, file_path, parent_qualified=""
            )
            if func_node:
                result.nodes.append(func_node)

        # --- Assignments (module-level constants, __all__) ---
        elif node_type in ("expression_statement", "assignment"):
            self._parse_python_assignment(child, source, file_path, result)
442
+
443
+ def _node_text(self, node: object | None, source: str) -> str | None:
444
+ """Extract text from a tree-sitter node safely."""
445
+ if node is None:
446
+ return None
447
+ start = getattr(node, "start_byte", 0)
448
+ end = getattr(node, "end_byte", 0)
449
+ return source[start:end]
450
+
451
def _parse_python_import(
    self,
    node: object,
    source: str,
    file_path: Path,
    module_id: str,
    is_type_only: bool = False,
) -> list[GraphEdge]:
    """Turn one Python import statement into IMPORTS edges.

    Handling:
        - ``from __future__ import …`` produces no edge;
        - ``import a, b.c as d`` produces one edge per listed module
          (aliases do not affect the target);
        - ``from x import …`` (absolute or relative, including star
          imports) produces a single edge to the module, resolved by
          ``_extract_import_module``.

    Args:
        node: The tree-sitter import node.
        source: Full text of the file.
        file_path: File containing the statement.
        module_id: Node ID of the importing module (edge source).
        is_type_only: Mark the edges as type-only.

    Returns:
        The edges for this statement; empty when nothing could be
        extracted.
    """
    import re

    text = self._node_text(node, source)
    if not text:
        return []

    # Skip __future__ imports — not a real dependency.
    if "__future__" in text:
        return []

    plain = re.match(r"import\s+(.+)", text.strip(), re.DOTALL)
    if plain:
        # A backslash can only be a line continuation here; neutralise it
        # and pick the dotted name that starts each comma-separated item.
        listing = plain.group(1).replace("\\", " ")
        module_names = re.findall(r"(?:^|,)\s*([\w.]+)", listing)
    else:
        single = self._extract_import_module(text, file_path)
        module_names = [single] if single else []

    raw_import = text.strip()
    edges: list[GraphEdge] = []
    for module_name in module_names:
        target_id = GraphNode.make_id(
            self._resolve_module_path(module_name),
            NodeKind.MODULE,
            module_name.split(".")[-1],
        )
        edges.append(GraphEdge(
            source_id=module_id,
            target_id=target_id,
            kind=EdgeKind.IMPORTS,
            confidence=1.0,
            is_type_only=is_type_only,
            metadata={"raw_import": raw_import},
        ))

    return edges
500
+
501
+ def _extract_import_module(self, import_text: str, file_path: Path) -> str | None:
502
+ """Extract module name from import statement text.
503
+
504
+ Edge cases:
505
+ - `from .sibling import func` → resolve to package.sibling
506
+ - `from .. import parent_func` → resolve to parent package
507
+ - `import os.path` → "os.path"
508
+ - `from typing import List` → "typing"
509
+ """
510
+ import re
511
+
512
+ # from X import Y
513
+ match = re.match(r"from\s+([\w.]+)\s+import", import_text)
514
+ if match:
515
+ module = match.group(1)
516
+ # Handle relative imports
517
+ if import_text.strip().startswith("from ."):
518
+ dots = re.match(r"from\s+(\.+)", import_text)
519
+ if dots:
520
+ level = len(dots.group(1))
521
+ package_parts = self._get_package_parts(file_path)
522
+ if level <= len(package_parts):
523
+ base = ".".join(package_parts[: -level] if level > 0 else package_parts)
524
+ rest = re.match(r"from\s+\.+\s*([\w.]*)\s+import", import_text)
525
+ if rest and rest.group(1):
526
+ return f"{base}.{rest.group(1)}" if base else rest.group(1)
527
+ return base
528
+ return module
529
+
530
+ # import X
531
+ match = re.match(r"import\s+([\w.]+)", import_text)
532
+ if match:
533
+ return match.group(1)
534
+
535
+ return None
536
+
537
+ def _get_package_parts(self, file_path: Path) -> list[str]:
538
+ """Get the package path components for resolving relative imports."""
539
+ try:
540
+ rel = file_path.resolve().relative_to(self._project_root)
541
+ except ValueError:
542
+ return []
543
+
544
+ parts = list(rel.parent.parts)
545
+ # Remove common source directories
546
+ for prefix in ("src", "lib"):
547
+ if parts and parts[0] == prefix:
548
+ parts = parts[1:]
549
+ break
550
+ return parts
551
+
552
+ def _resolve_module_path(self, module_name: str) -> Path:
553
+ """Best-effort resolution of a module name to a file path.
554
+
555
+ Edge case: module might be a package (directory with __init__.py)
556
+ or a file. We try both and return whichever exists, defaulting
557
+ to file path if neither exists (the node will be created as a
558
+ placeholder).
559
+ """
560
+ parts = module_name.split(".")
561
+ # Try as file
562
+ file_path = self._project_root / Path(*parts).with_suffix(".py")
563
+ if file_path.exists():
564
+ return file_path
565
+
566
+ # Try common source directories
567
+ for src_dir in ("src", "lib"):
568
+ file_path = self._project_root / src_dir / Path(*parts).with_suffix(".py")
569
+ if file_path.exists():
570
+ return file_path
571
+
572
+ # Try as package
573
+ pkg_path = self._project_root / Path(*parts) / "__init__.py"
574
+ if pkg_path.exists():
575
+ return pkg_path
576
+
577
+ # Return a synthetic path — the node will exist but may not resolve to a real file
578
+ return self._project_root / Path(*parts).with_suffix(".py")
579
+
580
def _parse_python_class(
    self,
    node: object,
    source: str,
    file_path: Path,
    result: ParseResult,
) -> GraphNode | None:
    """Build a graph node for a Python class definition.

    The node kind is INTERFACE when any extracted superclass is
    ``Protocol``, ``ABC`` or ``ABCMeta``, otherwise CLASS. The
    superclass list and a dataclass flag are stored in the node's
    metadata. One INHERITS edge per superclass is appended to
    ``result.edges`` as a side effect.

    Returns:
        The class node, or ``None`` when the class name cannot be read.
    """
    lookup_field = getattr(node, "child_by_field_name", lambda _: None)
    name_node = lookup_field("name")
    name = self._node_text(name_node, source) if name_node else None
    if not name:
        return None

    # tree-sitter rows are 0-based; line ranges are 1-based.
    first_line = getattr(node, "start_point", (0, 0))[0] + 1
    last_line = getattr(node, "end_point", (0, 0))[0] + 1

    superclasses = self._extract_superclasses(node, source)
    interface_bases = ("Protocol", "ABC", "ABCMeta")
    if any(base in interface_bases for base in superclasses):
        kind = NodeKind.INTERFACE
    else:
        kind = NodeKind.CLASS

    docstring = self._extract_docstring(node, source)
    qualified = f"{self._qualified_module_name(file_path)}.{name}"

    # The first node in the result is the MODULE node; inherit its test flag.
    inherited_is_test = result.nodes[0].is_test if result.nodes else False

    class_node = GraphNode(
        node_id=GraphNode.make_id(file_path, kind, name),
        kind=kind,
        name=name,
        qualified_name=qualified,
        file_path=file_path,
        line_range=LineRange(start=first_line, end=last_line),
        language=Language.PYTHON,
        docstring=docstring,
        is_generated=result.is_generated,
        is_test=inherited_is_test,
        metadata={
            "superclasses": superclasses,
            "is_dataclass": self._has_decorator(node, source, "dataclass"),
        },
    )

    # One INHERITS edge per superclass.
    for superclass in superclasses:
        base_id = GraphNode.make_id(
            self._resolve_module_path(superclass),
            NodeKind.CLASS,
            superclass.split(".")[-1],
        )
        inherits_edge = GraphEdge(
            source_id=class_node.node_id,
            target_id=base_id,
            kind=EdgeKind.INHERITS,
        )
        result.edges.append(inherits_edge)

    return class_node
647
+
648
def _parse_python_function(
    self,
    node: object,
    source: str,
    file_path: Path,
    parent_qualified: str = "",
) -> GraphNode | None:
    """Build a graph node for a Python function definition.

    Kind selection:
        - METHOD when ``parent_qualified`` is given (this takes
          precedence over endpoint detection);
        - ENDPOINT when a ``route``, ``get`` or ``post`` decorator is
          detected;
        - FUNCTION otherwise.

    Functions decorated with ``overload`` are skipped. Metadata records
    whether the definition text starts with ``async`` and whether the
    ``staticmethod``, ``classmethod`` or ``property`` decorators are
    present.

    Returns:
        The function node, or ``None`` for unnamed nodes and overload
        stubs.
    """
    lookup_field = getattr(node, "child_by_field_name", lambda _: None)
    name_node = lookup_field("name")
    name = self._node_text(name_node, source) if name_node else None
    if not name:
        return None

    # tree-sitter rows are 0-based; line ranges are 1-based.
    first_line = getattr(node, "start_point", (0, 0))[0] + 1
    last_line = getattr(node, "end_point", (0, 0))[0] + 1

    # Overload stubs carry no implementation.
    if self._has_decorator(node, source, "overload"):
        return None

    is_endpoint = any(
        self._has_decorator(node, source, decorator)
        for decorator in ("route", "get", "post")
    )
    module_level_name = f"{self._qualified_module_name(file_path)}.{name}"

    if parent_qualified:
        kind = NodeKind.METHOD
        qualified = f"{parent_qualified}.{name}"
    else:
        kind = NodeKind.ENDPOINT if is_endpoint else NodeKind.FUNCTION
        qualified = module_level_name

    definition_text = self._node_text(node, source)
    if definition_text:
        is_async = definition_text.strip().startswith("async ")
    else:
        is_async = False

    metadata = {
        "is_async": is_async,
        "is_staticmethod": self._has_decorator(node, source, "staticmethod"),
        "is_classmethod": self._has_decorator(node, source, "classmethod"),
        "is_property": self._has_decorator(node, source, "property"),
    }

    return GraphNode(
        node_id=GraphNode.make_id(file_path, kind, qualified),
        kind=kind,
        name=name,
        qualified_name=qualified,
        file_path=file_path,
        line_range=LineRange(start=first_line, end=last_line),
        language=Language.PYTHON,
        docstring=self._extract_docstring(node, source),
        metadata=metadata,
    )
711
+
712
def _parse_python_assignment(
    self,
    node: object,
    source: str,
    file_path: Path,
    result: ParseResult,
) -> None:
    """Record re-exports declared by a module-level ``__all__``.

    When the statement text mentions ``__all__``, every quoted word in
    it is treated as an exported symbol and a RE_EXPORTS edge from the
    module to that symbol is appended to ``result.edges``. The target
    ID is built with the FUNCTION kind regardless of what the symbol
    actually is. Other assignments are ignored.
    """
    import re

    text = self._node_text(node, source)
    if not text or "__all__" not in text:
        return

    module_id = result.module_node_id
    exported_symbols = re.findall(r'["\'](\w+)["\']', text)
    result.edges.extend(
        GraphEdge(
            source_id=module_id,
            target_id=GraphNode.make_id(file_path, NodeKind.FUNCTION, symbol),
            kind=EdgeKind.RE_EXPORTS,
        )
        for symbol in exported_symbols
    )
739
+
740
+ # -------------------------------------------------------------------
741
+ # JavaScript/TypeScript extraction
742
+ # -------------------------------------------------------------------
743
+
744
async def _extract_javascript_family(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Extract nodes and edges from a JS/TS/TSX file.

    The grammar is chosen from the file's detected language (defaulting
    to ``javascript``), the source is parsed with tree-sitter, and the
    top-level statements are walked. Syntax errors set
    ``had_syntax_errors`` and add a warning but do not stop extraction.

    This is deliberately best-effort: if the tree-sitter package or
    grammar cannot be loaded, or parsing/walking fails for any reason,
    a warning is recorded and the regex-based extractor runs instead.
    Any nodes and edges added by the failed tree-sitter pass are
    discarded first so the fallback does not duplicate them.
    """
    nodes_before = len(result.nodes)
    edges_before = len(result.edges)

    try:
        from tree_sitter_language_pack import get_parser

        grammar_name = LANGUAGE_GRAMMAR_MAP.get(detect_language(file_path), "javascript")
        parser = get_parser(grammar_name)

        tree = parser.parse(source.encode("utf-8"))

        if tree.root_node.has_error:
            result.had_syntax_errors = True
            result.warnings.append(f"Syntax errors in {file_path}")

        await self._walk_js_tree(tree.root_node, source, file_path, result)

    except Exception as exc:  # best-effort boundary; ImportError included
        logger.debug(
            "tree-sitter extraction failed for %s; using regex fallback",
            file_path,
            exc_info=True,
        )
        # Roll back partial output from the failed pass.
        del result.nodes[nodes_before:]
        del result.edges[edges_before:]
        result.warnings.append(f"tree-sitter grammar not available for {file_path.suffix}: {exc}")
        await self._extract_js_regex(source, file_path, result)
778
+
779
async def _walk_js_tree(
    self, root: object, source: str, file_path: Path, result: ParseResult
) -> None:
    """Collect nodes and edges from the top-level statements of a JS/TS AST.

    Handled statement types:
        - ``import_statement`` → import edge;
        - ``export_statement`` wrapping a function, generator function
          or class declaration → node for the wrapped declaration;
        - top-level function / generator function / class declarations
          → node;
        - top-level ``lexical_declaration`` / ``variable_declaration``
          whose text contains ``require(`` → require edge.

    Everything else is ignored, including exported variable declarations.
    """
    module_id = result.module_node_id
    function_types = ("function_declaration", "generator_function_declaration")
    variable_types = ("lexical_declaration", "variable_declaration")

    for child in getattr(root, "children", []):
        child_type = getattr(child, "type", "")

        # --- ESM imports ---
        if child_type == "import_statement":
            import_edge = self._parse_js_import(child, source, file_path, module_id)
            if import_edge:
                result.edges.append(import_edge)
            continue

        # --- Exports: look through to the wrapped declaration ---
        exported = child_type == "export_statement"
        if exported:
            target = getattr(child, "child_by_field_name", lambda _: None)("declaration")
            if not target:
                continue
            target_type = getattr(target, "type", "")
        else:
            target, target_type = child, child_type

        # --- Functions and classes (top-level or exported) ---
        if target_type in function_types:
            parsed = self._parse_js_function(target, source, file_path)
        elif target_type == "class_declaration":
            parsed = self._parse_js_class(target, source, file_path)
        else:
            # --- CommonJS require() in non-exported declarations ---
            if not exported and target_type in variable_types:
                text = self._node_text(target, source) or ""
                if "require(" in text:
                    require_edge = self._parse_require(text, file_path, module_id)
                    if require_edge:
                        result.edges.append(require_edge)
            continue

        if parsed:
            result.nodes.append(parsed)
828
+
829
def _parse_js_import(
    self, node: object, source: str, file_path: Path, module_id: str
) -> GraphEdge | None:
    """Build an IMPORTS edge from an ESM ``import`` statement node.

    The statement text is obtained with ``_node_text`` and the module
    specifier is pulled out with a regex, then resolved through
    ``_resolve_js_module``.

    Edge cases:
    - `import type { Foo }` → type-only edge (``is_type_only=True``)
    - `import typeDefs from "x"` → NOT type-only; ``type`` must be its own word
    - `import type from "x"` → NOT type-only; ``type`` is the default binding
    - `import "./side-effect"` → side-effect import, edge with no target symbol
    - `import * as ns from "x"` → namespace import
    - `import("./lazy")` → dynamic import (different AST node type)

    Returns:
        The import edge, or ``None`` when no quoted module specifier is found.
    """
    import re

    text = self._node_text(node, source) or ""

    # A plain substring test for "import type" also fires on identifiers such
    # as `typeDefs`, so require the keyword to be a whole word, and rule out
    # the forms where `type` is itself the default binding.
    is_type_only = bool(
        re.match(r'''\s*import\s+type\b(?!\s*,)(?!\s+from\s*['"])''', text)
    )

    # Extract module path. Whitespace before the quote is optional so that
    # minified input (`from"x"`, `import"x"`) still resolves.
    match = re.search(r'''from\s*['"]([^'"]+)['"]''', text)
    if not match:
        match = re.search(r'''import\s*['"]([^'"]+)['"]''', text)
    if not match:
        return None

    module_path = match.group(1)
    target_path = self._resolve_js_module(module_path, file_path)
    target_id = GraphNode.make_id(target_path, NodeKind.MODULE, target_path.stem)

    return GraphEdge(
        source_id=module_id,
        target_id=target_id,
        kind=EdgeKind.IMPORTS,
        is_type_only=is_type_only,
        metadata={"raw_import": text.strip(), "module_specifier": module_path},
    )
863
+
864
def _parse_require(
    self, text: str, file_path: Path, module_id: str
) -> GraphEdge | None:
    """Build an edge from the first CommonJS ``require()`` call in ``text``.

    A string-literal argument yields an IMPORTS edge resolved through
    ``_resolve_js_module``. Whitespace inside the parentheses is tolerated
    (``require( "x" )``), and ``require`` must start at a word boundary so
    that identifiers merely ending in "require" are not treated as imports.

    Edge case: `require(variable)` — can't resolve, create edge
    with low confidence and the variable name in metadata.

    Returns:
        The edge, or ``None`` when ``text`` holds neither form of call.
    """
    import re

    static_call = re.search(
        r'''\brequire\s*\(\s*['"]([^'"]+)['"]\s*\)''', text
    )
    if static_call is None:
        # Dynamic require — check if it's require(variable)
        dynamic_call = re.search(r"\brequire\s*\(\s*(\w+)\s*\)", text)
        if dynamic_call is None:
            return None
        variable = dynamic_call.group(1)
        return GraphEdge(
            source_id=module_id,
            target_id="unresolved:" + variable,
            kind=EdgeKind.DYNAMIC_IMPORT,
            confidence=0.3,
            metadata={"dynamic_variable": variable},
        )

    module_path = static_call.group(1)
    target_path = self._resolve_js_module(module_path, file_path)
    target_id = GraphNode.make_id(target_path, NodeKind.MODULE, target_path.stem)

    return GraphEdge(
        source_id=module_id,
        target_id=target_id,
        kind=EdgeKind.IMPORTS,
        metadata={"raw_import": text.strip(), "style": "commonjs"},
    )
898
+
899
def _resolve_js_module(self, specifier: str, from_file: Path) -> Path:
    """Resolve a JS module specifier to a file path.

    Edge cases:
    - Relative: "./foo" → look for foo.ts, foo.tsx, foo.js, foo.jsx, then
      foo/index.ts, foo/index.tsx, foo/index.js, foo/index.jsx
    - Explicit extension: "./data.json" → returned as-is when that file
      exists and no ".ts" sibling does
    - Directory-only: "." or "./" → the importing directory's index file
    - Bare: "react" → node_modules/react/index.js (not checked on disk)
    - Alias: "@/utils" → treated like a bare specifier (can't fully resolve)
    - Missing extension: JS allows importing without extension

    Args:
        specifier: Module specifier exactly as written in the import.
        from_file: Path of the file containing the import; relative
            specifiers are resolved against its parent directory.

    Returns:
        The first probe that exists on disk, otherwise a best-guess path
        (the specifier with a ".ts" suffix for relative imports).
    """
    if not specifier.startswith("."):
        # Bare specifier (package import) or alias
        return Path(f"node_modules/{specifier}/index.js")

    # Relative import
    candidate = from_file.parent / specifier

    probes: list[Path] = []
    if candidate.name:
        # An empty name means the specifier points at a directory ("."), so
        # there is no file stem to append an extension to.
        probes.extend(
            candidate.parent / (candidate.name + ext)
            for ext in (".ts", ".tsx", ".js", ".jsx")
        )
    # Joining with "/" (rather than string-appending "/index.ts") keeps the
    # probe inside the candidate directory even when the name is empty.
    probes.extend(
        candidate / index_name
        for index_name in ("index.ts", "index.tsx", "index.js", "index.jsx")
    )

    for probe in probes:
        if probe.exists():
            return probe

    if not candidate.name:
        # with_suffix() raises ValueError on an empty name.
        return candidate / "index.ts"

    fallback = candidate.with_suffix(".ts")  # Default assumption
    if candidate.suffix and candidate.is_file() and not fallback.exists():
        # The specifier already named a real file (e.g. "./data.json");
        # prefer it over a ".ts" sibling that does not exist.
        return candidate
    return fallback
921
+
922
def _parse_js_function(
    self, node: object, source: str, file_path: Path
) -> GraphNode | None:
    """Build a FUNCTION graph node from a JS/TS function declaration.

    Metadata flags are derived from the declaration header only (the text
    before the parameter list), so nested functions in the body cannot
    influence them:
    - ``is_async``: the header starts with the ``async`` keyword
    - ``is_generator``: the node type is ``generator_function_declaration``
      or the header contains ``function*`` (whitespace ignored)
    - ``is_exported``: the node's parent is an ``export_statement``

    Returns:
        The graph node, or ``None`` when the declaration has no name.
    """
    name_node = getattr(node, "child_by_field_name", lambda _: None)("name")
    name = self._node_text(name_node, source) if name_node else None
    if not name:
        return None

    start_line = getattr(node, "start_point", (0, 0))[0] + 1
    end_line = getattr(node, "end_point", (0, 0))[0] + 1
    text = self._node_text(node, source) or ""

    # Everything before the first "(" is the header: `async function* name`.
    header = text.split("(", 1)[0]
    is_async = header.split()[:1] == ["async"]
    is_generator = (
        getattr(node, "type", "") == "generator_function_declaration"
        or "function*" in "".join(header.split())
    )
    # This parser is reached for both exported and plain top-level
    # functions, so the flag has to come from the tree, not a constant.
    parent_type = getattr(getattr(node, "parent", None), "type", "")
    is_exported = parent_type == "export_statement"

    return GraphNode(
        node_id=GraphNode.make_id(file_path, NodeKind.FUNCTION, name),
        kind=NodeKind.FUNCTION,
        name=name,
        qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
        file_path=file_path,
        line_range=LineRange(start=start_line, end=end_line),
        language=detect_language(file_path),
        metadata={
            "is_async": is_async,
            "is_generator": is_generator,
            "is_exported": is_exported,
        },
    )
949
+
950
def _parse_js_class(
    self, node: object, source: str, file_path: Path
) -> GraphNode | None:
    """Build a CLASS graph node from a JS/TS class declaration.

    The class name comes from the node's ``name`` field and the line range
    from its start/end points (converted to 1-based line numbers).

    Returns:
        The graph node, or ``None`` when the declaration has no name.
    """
    field_lookup = getattr(node, "child_by_field_name", lambda _: None)
    identifier = field_lookup("name")
    if not identifier:
        return None

    class_name = self._node_text(identifier, source)
    if not class_name:
        return None

    # Tree-sitter rows are 0-based; graph line ranges are 1-based.
    first_row = getattr(node, "start_point", (0, 0))[0]
    last_row = getattr(node, "end_point", (0, 0))[0]

    return GraphNode(
        node_id=GraphNode.make_id(file_path, NodeKind.CLASS, class_name),
        kind=NodeKind.CLASS,
        name=class_name,
        qualified_name=f"{self._qualified_module_name(file_path)}.{class_name}",
        file_path=file_path,
        line_range=LineRange(start=first_row + 1, end=last_row + 1),
        language=detect_language(file_path),
    )
971
+
972
+ # -------------------------------------------------------------------
973
+ # Regex fallbacks (when tree-sitter grammar not available)
974
+ # -------------------------------------------------------------------
975
+
976
async def _extract_python_regex(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Regex-based Python extraction — fallback when tree-sitter unavailable.

    This is intentionally limited. It works line by line on stripped text
    and catches the most common patterns (imports, class/function
    definitions) but misses nesting, decorators, and complex constructs.
    Every definition gets a single-line range, and every emitted node and
    edge carries ``"extraction": "regex"`` in its metadata.

    Results are appended to ``result.nodes`` and ``result.edges`` in place.
    """
    import re

    # Compiled once; the loop below runs for every line of the file.
    class_pattern = re.compile(r"class\s+(\w+)")
    def_pattern = re.compile(r"(async\s+)?def\s+(\w+)")

    module_id = result.module_node_id

    for line_no, line in enumerate(source.split("\n"), 1):
        stripped = line.strip()

        # Imports
        if stripped.startswith(("import ", "from ")):
            module_name = self._extract_import_module(stripped, file_path)
            if module_name:
                target_path = self._resolve_module_path(module_name)
                target_id = GraphNode.make_id(
                    target_path, NodeKind.MODULE, module_name.split(".")[-1]
                )
                result.edges.append(GraphEdge(
                    source_id=module_id,
                    target_id=target_id,
                    kind=EdgeKind.IMPORTS,
                    confidence=0.9,  # Lower confidence for regex extraction
                    metadata={"extraction": "regex"},
                ))

        # Class definitions
        class_match = class_pattern.match(stripped)
        if class_match:
            name = class_match.group(1)
            result.nodes.append(GraphNode(
                node_id=GraphNode.make_id(file_path, NodeKind.CLASS, name),
                kind=NodeKind.CLASS,
                name=name,
                qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
                file_path=file_path,
                line_range=LineRange(start=line_no, end=line_no),
                language=Language.PYTHON,
                metadata={"extraction": "regex"},
            ))

        # Function definitions
        def_match = def_pattern.match(stripped)
        if def_match:
            name = def_match.group(2)
            # Use the captured keyword, not a substring test: "async" also
            # occurs inside names such as `def asyncio_helper()`.
            is_async = def_match.group(1) is not None
            result.nodes.append(GraphNode(
                node_id=GraphNode.make_id(file_path, NodeKind.FUNCTION, name),
                kind=NodeKind.FUNCTION,
                name=name,
                qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
                file_path=file_path,
                line_range=LineRange(start=line_no, end=line_no),
                language=Language.PYTHON,
                metadata={"is_async": is_async, "extraction": "regex"},
            ))
1035
+
1036
async def _extract_js_regex(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Regex-based JS/TS extraction — fallback.

    Works line by line on stripped text and is intentionally limited.
    Per line it records at most one import edge and one definition node:

    - imports: ``... from "x"`` (imports and re-exports), side-effect
      ``import "x"``, and CommonJS ``require("x")``
    - functions: ``function name``, ``function* name``, optionally preceded
      by ``export``, ``default`` and/or ``async``
    - classes: ``class Name``, optionally preceded by ``export``,
      ``default`` and/or ``abstract``

    Every definition gets a single-line range. Edges use confidence 0.9,
    and all emitted items carry ``"extraction": "regex"`` in their metadata.
    Results are appended to ``result.nodes`` and ``result.edges`` in place.
    """
    import re

    # Compiled once; the loop below runs for every line of the file.
    from_pattern = re.compile(r'''from\s+['"]([^'"]+)['"]''')
    side_effect_pattern = re.compile(r'''import\s*['"]([^'"]+)['"]''')
    require_pattern = re.compile(r'''\brequire\s*\(\s*['"]([^'"]+)['"]\s*\)''')
    # After `function` require either whitespace or a generator star, so a
    # call such as `functionCall(x)` is not mistaken for a declaration.
    function_pattern = re.compile(
        r"(?:export\s+)?(?:default\s+)?(?:async\s+)?function(?:\s*\*\s*|\s+)(\w+)"
    )
    class_pattern = re.compile(
        r"(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\s+(\w+)"
    )

    module_id = result.module_node_id

    for line_no, line in enumerate(source.split("\n"), 1):
        stripped = line.strip()

        # Imports: ESM `from`, side-effect import, then CommonJS require()
        specifier_match = (
            from_pattern.search(stripped)
            or side_effect_pattern.match(stripped)
            or require_pattern.search(stripped)
        )
        if specifier_match:
            target_path = self._resolve_js_module(specifier_match.group(1), file_path)
            target_id = GraphNode.make_id(target_path, NodeKind.MODULE, target_path.stem)
            result.edges.append(GraphEdge(
                source_id=module_id,
                target_id=target_id,
                kind=EdgeKind.IMPORTS,
                confidence=0.9,
                metadata={"extraction": "regex"},
            ))

        # Function and class declarations (a line can only start with one)
        function_match = function_pattern.match(stripped)
        class_match = None if function_match else class_pattern.match(stripped)
        if function_match:
            kind, name = NodeKind.FUNCTION, function_match.group(1)
        elif class_match:
            kind, name = NodeKind.CLASS, class_match.group(1)
        else:
            continue

        result.nodes.append(GraphNode(
            node_id=GraphNode.make_id(file_path, kind, name),
            kind=kind,
            name=name,
            qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
            file_path=file_path,
            line_range=LineRange(start=line_no, end=line_no),
            language=detect_language(file_path),
            metadata={"extraction": "regex"},
        ))
1074
+
1075
+ # -------------------------------------------------------------------
1076
+ # Generic language extraction (all 19 languages via tree-sitter-language-pack)
1077
+ # -------------------------------------------------------------------
1078
+
1079
async def _extract_generic(
    self, source: str, file_path: Path, language: Language, result: ParseResult
) -> None:
    """Extract basic structure from any tree-sitter supported language.

    Works for Go, Rust, Java, C, C++, C#, PHP, Swift, Kotlin, Scala,
    Lua, Dart, Elixir, Haskell, Ruby.

    Extracts (from the direct children of the root node only):
    - Top-level function/method definitions
    - Class/struct/interface/trait definitions
    - Import/include/use statements

    This is intentionally broad — we use tree-sitter node type names
    that are common across grammars (function_definition, class_definition,
    import_declaration, etc.) and skip what we don't recognize. A
    definition is only recorded when the node exposes a ``name`` field.

    Import edges point at a placeholder module id derived from a CRC32 of
    the import text. Unlike the built-in ``hash()``, which is salted per
    process for strings, this keeps the id identical across runs.

    Returns early when ``language`` has no entry in ``LANGUAGE_GRAMMAR_MAP``;
    if loading the parser fails, a warning is appended to
    ``result.warnings`` instead of raising.
    """
    import zlib

    grammar_name = LANGUAGE_GRAMMAR_MAP.get(language)
    if not grammar_name:
        return

    try:
        from tree_sitter_language_pack import get_parser
        parser = get_parser(grammar_name)
    except Exception as exc:
        # Best effort: a missing grammar degrades to "no structure found".
        result.warnings.append(f"No grammar for {language.value}: {exc}")
        return

    tree = parser.parse(source.encode("utf-8"))
    if tree.root_node.has_error:
        result.had_syntax_errors = True

    module_id = result.module_node_id

    # Node types that represent function/method definitions across languages
    function_types = {
        "function_definition", "function_declaration", "method_definition",
        "method_declaration", "function_item", "fun_spec",
    }
    # Node types that represent class/struct/interface definitions
    class_types = {
        "class_definition", "class_declaration", "struct_item",
        "interface_declaration", "trait_item", "type_declaration",
        "struct_declaration", "enum_declaration", "enum_item",
        "object_declaration",
    }
    # Node types that represent imports
    import_types = {
        "import_declaration", "import_statement", "use_declaration",
        "include_statement", "require_expression", "using_directive",
        "import_from_statement", "preproc_include",
    }

    for child in tree.root_node.children:
        child_type = getattr(child, "type", "")

        if child_type in import_types:
            # Record a basic import edge
            text = self._node_text(child, source)
            if text:
                import_key = zlib.crc32(text.encode("utf-8")) % 10000
                result.edges.append(GraphEdge(
                    source_id=module_id,
                    target_id=GraphNode.make_id(
                        file_path, NodeKind.MODULE, f"_import_{import_key}"
                    ),
                    kind=EdgeKind.IMPORTS,
                    confidence=0.7,
                    metadata={"raw_import": text.strip()[:200], "language": language.value},
                ))
            continue

        # The three type sets are disjoint, so the check order is irrelevant.
        if child_type in function_types:
            kind = NodeKind.FUNCTION
        elif child_type in class_types:
            is_interface = "interface" in child_type or "trait" in child_type
            kind = NodeKind.INTERFACE if is_interface else NodeKind.CLASS
        else:
            continue

        name_node = getattr(child, "child_by_field_name", lambda _: None)("name")
        name = self._node_text(name_node, source) if name_node else None
        if not name:
            continue

        start_line = getattr(child, "start_point", (0, 0))[0] + 1
        end_line = getattr(child, "end_point", (0, 0))[0] + 1
        result.nodes.append(GraphNode(
            node_id=GraphNode.make_id(file_path, kind, name),
            kind=kind,
            name=name,
            qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
            file_path=file_path,
            line_range=LineRange(start=start_line, end=end_line),
            language=language,
        ))
1179
+
1180
+ # -------------------------------------------------------------------
1181
+ # Utility methods
1182
+ # -------------------------------------------------------------------
1183
+
1184
def _extract_superclasses(self, class_node: object, source: str) -> list[str]:
    """Extract superclass names from a class definition node.

    Reads the node's ``superclasses`` field, removes the one enclosing pair
    of parentheses, and splits on top-level commas only, so subscripted or
    called bases such as ``Generic[K, V]`` stay in one piece. Keyword
    arguments (``metaclass=ABCMeta``) are not base classes and are dropped.

    Returns:
        Base-class expressions in source order; an empty list when the
        node has no ``superclasses`` field or its text is empty.
    """
    import re

    superclasses_node = getattr(class_node, "child_by_field_name", lambda _: None)("superclasses")
    if not superclasses_node:
        return []

    text = self._node_text(superclasses_node, source)
    if not text:
        return []

    # Remove exactly one enclosing pair of parens; str.strip("()") would
    # also eat the parentheses of a trailing call like `make_base()`.
    text = text.strip()
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1]

    # Split on commas that are not nested inside (), [] or {}.
    pieces: list[str] = []
    current: list[str] = []
    depth = 0
    for char in text:
        if char in "([{":
            depth += 1
        elif char in ")]}":
            depth -= 1
        if char == "," and depth <= 0:
            pieces.append("".join(current))
            current = []
        else:
            current.append(char)
    pieces.append("".join(current))

    keyword_argument = re.compile(r"\w+\s*=(?!=)")
    return [
        piece.strip()
        for piece in pieces
        if piece.strip() and not keyword_argument.match(piece.strip())
    ]
1198
+
1199
def _extract_docstring(self, node: object, source: str) -> str | None:
    """Extract docstring from a class or function body.

    The docstring is the first non-comment statement of the node's ``body``
    field, provided it is an ``expression_statement`` whose first child is
    a ``string`` node. Any string prefix (``r``, ``u``, ...) and exactly one
    pair of matching quote delimiters are removed, then surrounding
    whitespace is stripped.

    Returns:
        The docstring text (possibly empty), or ``None`` when the body does
        not start with a string literal.
    """
    body = getattr(node, "child_by_field_name", lambda _: None)("body")
    if not body:
        return None

    # Comments appear as nodes in the tree; skip them so a comment placed
    # above the docstring does not hide it.
    first = next(
        (
            child
            for child in getattr(body, "children", [])
            if getattr(child, "type", "") != "comment"
        ),
        None,
    )
    if first is None or getattr(first, "type", "") != "expression_statement":
        return None

    inner_children = getattr(first, "children", [])
    if not inner_children or getattr(inner_children[0], "type", "") != "string":
        return None

    text = self._node_text(inner_children[0], source)
    if not text:
        return None

    # Drop a string prefix such as r/u/b/f before looking at the quotes.
    literal = text.strip().lstrip("rRuUbBfF")
    # Remove one matching delimiter pair, triple quotes first, so quote
    # characters that belong to the text itself survive.
    for quote in ('"""', "'''", '"', "'"):
        if (
            len(literal) >= 2 * len(quote)
            and literal.startswith(quote)
            and literal.endswith(quote)
        ):
            literal = literal[len(quote):-len(quote)]
            break
    return literal.strip()
1217
+
1218
def _has_decorator(self, node: object, source: str, decorator_name: str) -> bool:
    """Check if a node has a specific decorator.

    Scans the children of ``node.parent`` in order, looking at ``decorator``
    nodes until ``node`` itself is reached.

    Matching is by dotted name, not substring: ``"property"`` matches
    ``@property`` and ``@abc.property`` but not ``@cached_property``. The
    decorator's argument list is ignored, and a leading ``@`` in
    ``decorator_name`` is accepted. If ``decorator_name`` is not a plain
    dotted identifier (e.g. it is empty or contains a call), the check
    falls back to a substring test on the decorator text.

    Returns:
        ``True`` if a matching decorator precedes ``node``, else ``False``
        (including when ``node`` has no parent).
    """
    import re

    # Look for decorator nodes before this node
    parent = getattr(node, "parent", None)
    if not parent:
        return False

    wanted = decorator_name.strip().lstrip("@")
    is_dotted_name = re.fullmatch(r"[\w.]+", wanted) is not None

    for child in getattr(parent, "children", []):
        if getattr(child, "type", "") == "decorator":
            text = self._node_text(child, source) or ""
            if is_dotted_name:
                # "@pkg.name(args)" -> "pkg.name"
                expression = text.strip().lstrip("@").lstrip()
                dotted = re.split(r"[\s(]", expression, maxsplit=1)[0]
                if dotted == wanted or dotted.endswith("." + wanted):
                    return True
            elif decorator_name in text:
                return True
        # Equality as well as identity: the children list may hold a
        # different wrapper object for the same underlying tree node.
        if child is node or child == node:
            break
    return False