interlinked-mapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1141 @@
1
+ """AST-based parser that extracts symbols and relationships from Python source.
2
+
3
+ Architecture:
4
+ Pass 1 (_SymbolVisitor) -- Extracts all nodes and **raw** edges. Edge targets
5
+ are the literal names from the AST (e.g. "self.state.zoom_level", "n.id",
6
+ "graph"). No resolution happens here.
7
+ Pass 2 (_TypeInferencer) -- Collects type annotations from the AST, then
8
+ resolves every raw edge target using the full type map. Handles self/cls,
9
+ dotted attribute chains, typed loop variables, assignment propagation.
10
+ Pass 3 (Structural inference) -- For any unresolved dotted name like "n.id",
11
+ builds a reverse index (field_name -> classes), intersects all field
12
+ accesses for a variable, and infers its type from the unique match.
13
+ Pass 4 (Progressive truncation + drop) -- For any remaining unresolved edge,
14
+ progressively strips attrs from the right until a known node is hit.
15
+ If nothing resolves, a READS/WRITES/RETURNS edge is external and is dropped; unresolved CALLS edges are kept with their raw target.
16
+ Pass 5 (CodeGraph.build_from) -- Final short-name to qualified-name resolution
17
+ for bare names (cross-module calls).
18
+
19
+ No hardcoded external module or builtin method lists. Resolution is entirely
20
+ dynamic: if a target resolves to a project node, it's kept; otherwise it's
21
+ progressively truncated or dropped.
22
+
23
+ Extracts:
24
+ Nodes -- modules, classes, functions/methods, variables (module/class/instance
25
+ scope), parameters, local variables (function scope).
26
+ Edges -- contains, calls, imports, inherits, reads, writes, returns.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import ast
32
+ import builtins
33
+ from pathlib import Path
34
+ from typing import Any
35
+
36
+ from interlinked.models import NodeData, EdgeData, SymbolType, EdgeType
37
+
38
+ # Python builtins we should never create nodes/edges for
39
+ _BUILTINS: frozenset[str] = frozenset(dir(builtins)) | frozenset({
40
+ "None", "True", "False", "__name__", "__file__", "__doc__",
41
+ "__all__", "__spec__", "__loader__", "__package__", "__builtins__",
42
+ })
43
+
44
+
45
+
46
def parse_project(root: str | Path) -> tuple[list[NodeData], list[EdgeData]]:
    """Walk a Python project directory and extract all symbols and edges.

    Every ``*.py`` file under *root* is parsed. Files that cannot be read or
    parsed are skipped so that a single bad file does not abort the scan.

    Args:
        root: Directory to scan recursively.

    Returns:
        A ``(nodes, edges)`` tuple. ``nodes`` holds every extracted symbol.
        ``edges`` holds structural edges unchanged, CALLS edges (resolved to a
        project node when possible, otherwise kept with their raw target,
        except calls rooted at a builtin name, which are removed), and
        READS/WRITES/RETURNS edges that resolved to a project symbol.
    """

    def _retarget(edge: EdgeData, target: str) -> EdgeData:
        # Copy of ``edge`` pointing at ``target``; the listed fields are carried over.
        return EdgeData(
            source=edge.source, target=target,
            edge_type=edge.edge_type, is_dead=edge.is_dead,
            is_proposed=edge.is_proposed, line=edge.line,
            metadata=edge.metadata,
        )

    root = Path(root).resolve()
    nodes: list[NodeData] = []
    edges: list[EdgeData] = []

    # Pass 1: extract all symbols and raw (unresolved) edges
    trees: list[tuple[ast.Module, str, str]] = []
    for py_file in sorted(root.rglob("*.py")):
        try:
            source = py_file.read_text(encoding="utf-8", errors="replace")
            tree = ast.parse(source, filename=str(py_file))
        except (OSError, SyntaxError, ValueError):
            # OSError: unreadable path (broken symlink, directory named *.py, ...).
            # ValueError: source containing NUL bytes on interpreters that do
            # not report it as SyntaxError.
            continue

        module_qname = _path_to_module(py_file.relative_to(root))
        trees.append((tree, module_qname, str(py_file)))

        file_nodes, file_edges = _extract_from_module(
            tree, source, module_qname, str(py_file)
        )
        nodes.extend(file_nodes)
        edges.extend(file_edges)

    # Pass 2: type inference from annotations
    node_ids = {n.id for n in nodes}

    # Class/module short name (and every dotted suffix) -> qualified name.
    # The short name maps to the last node seen; suffixes keep the first.
    type_index: dict[str, str] = {}
    for n in nodes:
        if n.symbol_type in (SymbolType.CLASS, SymbolType.MODULE):
            type_index[n.name] = n.id
            parts = n.qualified_name.split(".")
            for i in range(len(parts)):
                type_index.setdefault(".".join(parts[i:]), n.id)

    inferencer = _TypeInferencer(type_index, node_ids)
    for tree, module_qname, _fp in trees:
        inferencer.collect_types(tree, module_qname)

    # Pass 3: structural type inference — infer types from field access patterns
    inferencer.infer_structural_types(edges)

    # Short names and proper dotted suffixes of every project symbol. Only
    # membership is needed here, so a set is enough.
    known_names: set[str] = set()
    for n in nodes:
        known_names.add(n.name)
        parts = n.qualified_name.split(".")
        for i in range(1, len(parts)):
            known_names.add(".".join(parts[i:]))

    # Pass 4: resolve all data-flow edges, progressive truncation, drop external
    #
    # Edge type handling:
    #   CALLS   — kept even when unresolved (external calls matter for audits),
    #             except calls whose root name is a builtin.
    #   READS / WRITES / RETURNS — resolve, truncate, or drop.
    #   everything else (CONTAINS, INHERITS, IMPORTS, ...) — passed through.
    data_flow = (EdgeType.READS, EdgeType.WRITES, EdgeType.CALLS, EdgeType.RETURNS)
    resolved_edges: list[EdgeData] = []
    for e in edges:
        if e.edge_type not in data_flow:
            resolved_edges.append(e)
            continue

        raw_target = e.target

        if e.edge_type == EdgeType.CALLS:
            # Filter builtin function calls (len, str, isinstance, etc.)
            if raw_target.split(".")[0] in _BUILTINS:
                continue
            resolved = inferencer.resolve(raw_target, e.source)
            if resolved and resolved in node_ids:
                resolved_edges.append(_retarget(e, resolved))
            else:
                # Keep the raw call target so external calls stay visible.
                resolved_edges.append(e)
            continue

        # READS / WRITES / RETURNS
        resolved = inferencer.resolve(raw_target, e.source)
        if resolved is None:
            continue  # the resolver filtered this target out

        if resolved in node_ids:
            resolved_edges.append(e if resolved == raw_target else _retarget(e, resolved))
            continue

        if "." not in resolved:
            # Bare name: keep only if it matches a project symbol's short name.
            if resolved in known_names:
                resolved_edges.append(_retarget(e, resolved))
            continue

        # Progressive truncation: strip attributes from the right until a known
        # node is hit. The full name was already checked above, so start one
        # segment shorter. If nothing matches the edge is external and dropped.
        parts = resolved.split(".")
        for i in range(len(parts) - 1, 0, -1):
            candidate = ".".join(parts[:i])
            if candidate in node_ids:
                resolved_edges.append(_retarget(e, candidate))
                break

    return nodes, resolved_edges
188
+
189
+
190
+ def _path_to_module(rel_path: Path) -> str:
191
+ """Convert a relative file path to a dotted module name."""
192
+ parts = list(rel_path.parts)
193
+ if parts[-1] == "__init__.py":
194
+ parts = parts[:-1]
195
+ else:
196
+ parts[-1] = parts[-1].removesuffix(".py")
197
+ return ".".join(parts) if parts else "__root__"
198
+
199
+
200
def _extract_from_module(
    tree: ast.Module,
    source: str,
    module_qname: str,
    file_path: str,
) -> tuple[list[NodeData], list[EdgeData]]:
    """Build the node and edge lists for one parsed module.

    The first node is the MODULE node itself, spanning line 1 to the number of
    lines in *source*. The remaining nodes and all edges are appended by a
    ``_SymbolVisitor`` run over *tree*.
    """
    module_node = NodeData(
        id=module_qname,
        name=module_qname.rpartition(".")[2],
        qualified_name=module_qname,
        symbol_type=SymbolType.MODULE,
        file_path=file_path,
        line_start=1,
        line_end=len(source.splitlines()),
        docstring=ast.get_docstring(tree),
    )
    collected_nodes: list[NodeData] = [module_node]
    collected_edges: list[EdgeData] = []

    # The visitor appends into both lists in place.
    _SymbolVisitor(module_qname, file_path, collected_nodes, collected_edges).visit(tree)

    return collected_nodes, collected_edges
226
+
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # Pass 1: AST visitor -- node creation + raw edge emission
230
+ # ---------------------------------------------------------------------------
231
+
232
class _SymbolVisitor(ast.NodeVisitor):
    """Walks the AST, creates graph nodes, and emits raw (unresolved) edges.

    Edge targets are the literal AST names: "self.state", "n.id", "graph".
    Resolution is deferred entirely to pass 2 (_TypeInferencer).

    Nodes and edges are appended in place to the lists handed to the
    constructor.
    """

    # Definitions that open their own scope. A ``return`` inside one of these
    # belongs to that definition, not to the function enclosing it.
    _NESTED_SCOPES = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)

    def __init__(
        self,
        module_qname: str,
        file_path: str,
        nodes: list[NodeData],
        edges: list[EdgeData],
    ):
        self._module = module_qname
        self._file = file_path
        self._nodes = nodes
        self._edges = edges
        self._scope_stack: list[str] = [module_qname]
        # Ids of nodes added through _add_node (used for de-duplication).
        self._node_ids: set[str] = set()
        # Ids of CLASS nodes in ``nodes``; lets the scope checks be a set
        # lookup instead of a scan over every node collected so far.
        self._class_ids: set[str] = {
            n.id for n in nodes if n.symbol_type == SymbolType.CLASS
        }

    @property
    def _current_scope(self) -> str:
        """Qualified name of the innermost scope being visited."""
        return self._scope_stack[-1]

    def _add_node(self, node: NodeData) -> None:
        """Append *node* unless a node with the same id was already added."""
        if node.id in self._node_ids:
            return
        self._nodes.append(node)
        self._node_ids.add(node.id)
        if node.symbol_type == SymbolType.CLASS:
            self._class_ids.add(node.id)

    def _is_inside_class(self) -> bool:
        """Return True when the current scope is a recorded CLASS node."""
        return self._current_scope in self._class_ids

    def _class_scope(self) -> str | None:
        """Return the innermost enclosing scope that is a CLASS node, if any."""
        for scope in reversed(self._scope_stack):
            if scope in self._class_ids:
                return scope
        return None

    @classmethod
    def _walk_own_scope(cls, root: ast.AST):
        """Yield *root* and its descendants breadth-first, skipping nested def/class subtrees."""
        pending: list[ast.AST] = [root]
        cursor = 0
        while cursor < len(pending):
            current = pending[cursor]
            cursor += 1
            yield current
            pending.extend(
                child for child in ast.iter_child_nodes(current)
                if not isinstance(child, cls._NESTED_SCOPES)
            )

    @staticmethod
    def _flat_targets(target: ast.AST):
        """Yield the leaf targets of *target*, expanding tuple/list unpacking."""
        if isinstance(target, (ast.Tuple, ast.List)):
            for elt in target.elts:
                yield from _SymbolVisitor._flat_targets(elt)
        else:
            yield target

    @staticmethod
    def _simple_loop_names(target: ast.AST) -> list[str]:
        """Names bound by a loop target: a bare name, or the bare names directly inside a tuple/list."""
        if isinstance(target, ast.Name):
            return [target.id]
        if isinstance(target, (ast.Tuple, ast.List)):
            return [elt.id for elt in target.elts if isinstance(elt, ast.Name)]
        return []

    # -- Classes -----------------------------------------------------------

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        """Record a CLASS node, its CONTAINS edge and one INHERITS edge per nameable base."""
        qname = f"{self._current_scope}.{node.name}"
        self._add_node(NodeData(
            id=qname, name=node.name, qualified_name=qname,
            symbol_type=SymbolType.CLASS, file_path=self._file,
            line_start=node.lineno, line_end=node.end_lineno,
            docstring=ast.get_docstring(node),
        ))
        self._edges.append(EdgeData(
            source=self._current_scope, target=qname,
            edge_type=EdgeType.CONTAINS, line=node.lineno,
        ))
        for base in node.bases:
            base_name = _name_from_node(base)
            if base_name:
                self._edges.append(EdgeData(
                    source=qname, target=base_name,
                    edge_type=EdgeType.INHERITS, line=node.lineno,
                ))
        self._scope_stack.append(qname)
        self.generic_visit(node)
        self._scope_stack.pop()

    # -- Functions / Methods -----------------------------------------------

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._handle_funcdef(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._handle_funcdef(node)

    def _handle_funcdef(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> None:
        """Record a FUNCTION/METHOD node and everything extracted from its body."""
        qname = f"{self._current_scope}.{node.name}"
        # Decided before the function's own scope is pushed: a def directly
        # inside a class body is a method.
        is_method = self._is_inside_class()
        sym_type = SymbolType.METHOD if is_method else SymbolType.FUNCTION
        sig = _signature_from_funcdef(node)

        self._add_node(NodeData(
            id=qname, name=node.name, qualified_name=qname,
            symbol_type=sym_type, file_path=self._file,
            line_start=node.lineno, line_end=node.end_lineno,
            docstring=ast.get_docstring(node), signature=sig,
        ))
        self._edges.append(EdgeData(
            source=self._current_scope, target=qname,
            edge_type=EdgeType.CONTAINS, line=node.lineno,
        ))

        self._extract_parameters(node, qname)

        if node.name == "__init__" and is_method:
            self._extract_instance_attrs(node)

        self._scope_stack.append(qname)
        self._extract_calls(node, qname)
        self._extract_variable_access(node, qname)
        self._extract_returns(node, qname)
        self.generic_visit(node)
        self._scope_stack.pop()

    # -- Imports -----------------------------------------------------------

    def visit_Import(self, node: ast.Import) -> None:
        """Emit one IMPORTS edge from the module per imported name."""
        for alias in node.names:
            self._edges.append(EdgeData(
                source=self._module, target=alias.name,
                edge_type=EdgeType.IMPORTS, line=node.lineno,
            ))

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Emit one IMPORTS edge per name, prefixed with the source module when there is one."""
        base = node.module or ""
        for alias in node.names:
            target = f"{base}.{alias.name}" if base else alias.name
            self._edges.append(EdgeData(
                source=self._module, target=target,
                edge_type=EdgeType.IMPORTS, line=node.lineno,
            ))

    # -- Assignments at module / class scope --------------------------------

    def visit_Assign(self, node: ast.Assign) -> None:
        """Create VARIABLE nodes for assignments made at module or class scope."""
        if self._current_scope == self._module or self._is_inside_class():
            for target in node.targets:
                for name in _assigned_names(target):
                    qname = f"{self._current_scope}.{name}"
                    self._add_node(NodeData(
                        id=qname, name=name, qualified_name=qname,
                        symbol_type=SymbolType.VARIABLE, file_path=self._file,
                        line_start=node.lineno, line_end=node.end_lineno,
                    ))
                    self._edges.append(EdgeData(
                        source=self._current_scope, target=qname,
                        edge_type=EdgeType.CONTAINS, line=node.lineno,
                    ))
        self.generic_visit(node)

    def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
        """Create a VARIABLE node for an annotated bare-name assignment at module or class scope."""
        if self._current_scope == self._module or self._is_inside_class():
            if node.target:
                name = _name_from_node(node.target)
                if name and "." not in name:
                    qname = f"{self._current_scope}.{name}"
                    self._add_node(NodeData(
                        id=qname, name=name, qualified_name=qname,
                        symbol_type=SymbolType.VARIABLE, file_path=self._file,
                        line_start=node.lineno, line_end=node.end_lineno,
                    ))
                    self._edges.append(EdgeData(
                        source=self._current_scope, target=qname,
                        edge_type=EdgeType.CONTAINS, line=node.lineno,
                    ))
        self.generic_visit(node)

    # -- Internal helpers --------------------------------------------------

    def _extract_instance_attrs(self, init_node: ast.FunctionDef | ast.AsyncFunctionDef) -> None:
        """Create VARIABLE nodes for self.X = ... and self.X: T = ... in __init__.

        Tuple/list unpacking targets (``self.a, self.b = ...``) are expanded,
        matching how ``_emit_raw_write`` treats them.
        """
        class_scope = self._class_scope()
        if not class_scope:
            return
        for node in ast.walk(init_node):
            targets: list[ast.AST] = []
            if isinstance(node, ast.Assign):
                targets = node.targets
            elif isinstance(node, ast.AnnAssign) and node.target:
                targets = [node.target]
            for outer in targets:
                for target in self._flat_targets(outer):
                    if not (isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name)):
                        continue
                    if target.value.id != "self":
                        continue
                    attr_qname = f"{class_scope}.{target.attr}"
                    self._add_node(NodeData(
                        id=attr_qname, name=target.attr,
                        qualified_name=attr_qname,
                        symbol_type=SymbolType.VARIABLE,
                        file_path=self._file,
                        line_start=node.lineno,
                        line_end=getattr(node, "end_lineno", None),
                    ))
                    self._edges.append(EdgeData(
                        source=class_scope, target=attr_qname,
                        edge_type=EdgeType.CONTAINS, line=node.lineno,
                    ))

    def _extract_parameters(self, func_node: ast.FunctionDef | ast.AsyncFunctionDef, func_qname: str) -> None:
        """Create a PARAMETER node and CONTAINS edge for every parameter except self/cls."""
        args = func_node.args
        # ``+`` builds a fresh list, so the appends below never touch the AST.
        all_args: list[ast.arg] = (
            args.posonlyargs + args.args + args.kwonlyargs
        )
        if args.vararg:
            all_args.append(args.vararg)
        if args.kwarg:
            all_args.append(args.kwarg)
        for arg in all_args:
            if arg.arg in ("self", "cls"):
                continue
            param_qname = f"{func_qname}.{arg.arg}"
            self._add_node(NodeData(
                id=param_qname, name=arg.arg, qualified_name=param_qname,
                symbol_type=SymbolType.PARAMETER, file_path=self._file,
                line_start=func_node.lineno,
            ))
            self._edges.append(EdgeData(
                source=func_qname, target=param_qname,
                edge_type=EdgeType.CONTAINS, line=func_node.lineno,
            ))

    def _extract_calls(self, func_node: ast.AST, caller_qname: str) -> None:
        """Emit raw CALLS edges. Targets are unresolved (e.g. 'self.add_node').

        Nameable positional and keyword arguments are recorded in the edge
        metadata under ``"args"`` and ``"kwargs"``.
        """
        for node in ast.walk(func_node):
            if not isinstance(node, ast.Call):
                continue
            callee = _name_from_node(node.func)
            if not callee:
                continue

            arg_names: list[str] = []
            for arg in node.args:
                aname = _name_from_node(arg)
                if aname:
                    arg_names.append(aname)
            kwarg_names: dict[str, str] = {}
            for kw in node.keywords:
                if kw.arg:  # ``**mapping`` arguments have no keyword name
                    vname = _name_from_node(kw.value)
                    if vname:
                        kwarg_names[kw.arg] = vname

            metadata: dict[str, Any] = {}
            if arg_names:
                metadata["args"] = arg_names
            if kwarg_names:
                metadata["kwargs"] = kwarg_names

            self._edges.append(EdgeData(
                source=caller_qname, target=callee,
                edge_type=EdgeType.CALLS,
                line=getattr(node, "lineno", None),
                metadata=metadata,
            ))

    def _extract_variable_access(self, func_node: ast.AST, scope_qname: str) -> None:
        """Emit raw READS/WRITES edges. Targets are unresolved.

        Also creates VARIABLE nodes for the function's local names (assignment
        targets, for-loop targets and comprehension variables) that are not
        parameters.
        """
        param_names: set[str] = set()
        local_names: set[str] = set()

        if isinstance(func_node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            for arg in (func_node.args.posonlyargs + func_node.args.args +
                        func_node.args.kwonlyargs):
                if arg.arg not in ("self", "cls"):
                    param_names.add(arg.arg)
            if func_node.args.vararg:
                param_names.add(func_node.args.vararg.arg)
            if func_node.args.kwarg:
                param_names.add(func_node.args.kwarg.arg)

        def _is_local_candidate(name: str | None) -> bool:
            # Bare, non-builtin name other than self/cls.
            return bool(
                name
                and name not in ("self", "cls")
                and name not in _BUILTINS
                and "." not in name
            )

        # Collect local assignment targets
        for node in ast.walk(func_node):
            if isinstance(node, ast.Assign):
                for t in node.targets:
                    for name in _assigned_names(t):
                        if name not in ("self", "cls") and name not in _BUILTINS:
                            local_names.add(name)
            elif isinstance(node, ast.AugAssign):
                name = _name_from_node(node.target)
                if _is_local_candidate(name):
                    local_names.add(name)
            elif isinstance(node, ast.AnnAssign) and node.value and node.target:
                name = _name_from_node(node.target)
                if _is_local_candidate(name):
                    local_names.add(name)
            # For-loop targets
            elif isinstance(node, ast.For):
                local_names.update(self._simple_loop_names(node.target))
            # Comprehension loop variables (listcomp, setcomp, genexpr, dictcomp)
            elif isinstance(node, (ast.ListComp, ast.SetComp, ast.GeneratorExp, ast.DictComp)):
                for gen in node.generators:
                    local_names.update(self._simple_loop_names(gen.target))

        # Create local variable nodes (not params -- those already exist).
        # Sorted so node/edge order does not depend on set iteration order.
        for lname in sorted(local_names):
            if lname not in param_names:
                lvar_qname = f"{scope_qname}.{lname}"
                self._add_node(NodeData(
                    id=lvar_qname, name=lname, qualified_name=lvar_qname,
                    symbol_type=SymbolType.VARIABLE, file_path=self._file,
                    line_start=getattr(func_node, "lineno", None),
                ))
                self._edges.append(EdgeData(
                    source=scope_qname, target=lvar_qname,
                    edge_type=EdgeType.CONTAINS,
                    line=getattr(func_node, "lineno", None),
                ))

        known_locals = param_names | local_names

        # Emit raw reads/writes
        for node in ast.walk(func_node):
            if isinstance(node, ast.Assign):
                for target in node.targets:
                    self._emit_raw_write(target, scope_qname, known_locals, node.lineno)
            elif isinstance(node, ast.AugAssign):
                self._emit_raw_write(node.target, scope_qname, known_locals, node.lineno)
                # AugAssign also reads
                name = _name_from_node(node.target)
                if name:
                    raw = self._raw_target(name, scope_qname, known_locals)
                    if raw:
                        self._edges.append(EdgeData(
                            source=scope_qname, target=raw,
                            edge_type=EdgeType.READS, line=node.lineno,
                        ))
            elif isinstance(node, ast.AnnAssign) and node.value and node.target:
                self._emit_raw_write(node.target, scope_qname, known_locals, node.lineno)

            elif isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
                if node.id in _BUILTINS or node.id in ("self", "cls"):
                    continue
                raw = self._raw_target(node.id, scope_qname, known_locals)
                if raw:
                    self._edges.append(EdgeData(
                        source=scope_qname, target=raw,
                        edge_type=EdgeType.READS, line=node.lineno,
                    ))

            elif isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load):
                dotted = _name_from_node(node)
                if dotted and dotted.split(".")[0] not in _BUILTINS:
                    self._edges.append(EdgeData(
                        source=scope_qname, target=dotted,
                        edge_type=EdgeType.READS, line=node.lineno,
                    ))

    def _emit_raw_write(self, target: ast.AST, scope_qname: str, known_locals: set[str], lineno: int) -> None:
        """Emit a WRITES edge for *target*, recursing into tuple/list unpacking."""
        if isinstance(target, ast.Name):
            if target.id in _BUILTINS or target.id in ("self", "cls"):
                return
            raw = self._raw_target(target.id, scope_qname, known_locals)
            if raw:
                self._edges.append(EdgeData(
                    source=scope_qname, target=raw,
                    edge_type=EdgeType.WRITES, line=lineno,
                ))
        elif isinstance(target, ast.Attribute):
            dotted = _name_from_node(target)
            if dotted and dotted.split(".")[0] not in _BUILTINS:
                self._edges.append(EdgeData(
                    source=scope_qname, target=dotted,
                    edge_type=EdgeType.WRITES, line=lineno,
                ))
        elif isinstance(target, (ast.Tuple, ast.List)):
            for elt in target.elts:
                self._emit_raw_write(elt, scope_qname, known_locals, lineno)

    @staticmethod
    def _raw_target(name: str, scope_qname: str, known_locals: set[str]) -> str | None:
        """Return the raw edge target for a bare name.

        Known locals/params get scope-qualified so they match their node IDs.
        Everything else is left bare for the resolver. Builtin names yield None.
        """
        if name in _BUILTINS:
            return None
        if name in known_locals:
            return f"{scope_qname}.{name}"
        return name

    def _extract_returns(self, func_node: ast.AST, func_qname: str) -> None:
        """Emit a raw RETURNS edge for each nameable value returned by this function.

        Nested function and class definitions are not descended into: their
        ``return`` statements are recorded when those definitions are visited
        in their own right.
        """
        for node in self._walk_own_scope(func_node):
            if isinstance(node, ast.Return) and node.value:
                ret_name = _name_from_node(node.value)
                if ret_name:
                    self._edges.append(EdgeData(
                        source=func_qname, target=ret_name,
                        edge_type=EdgeType.RETURNS, line=node.lineno,
                    ))
623
+
624
+
625
+
626
+ # ---------------------------------------------------------------------------
627
+ # Pass 2: Type inference and unified resolution
628
+ # ---------------------------------------------------------------------------
629
+
630
+ class _TypeInferencer:
631
+ """Collects type annotations and resolves ALL edge targets.
632
+
633
+ Single point of resolution for every name pattern:
634
+ - "self.X" -> Class.X (via scope)
635
+ - "self.X.Y" -> resolve X's type, then Type.Y
636
+ - "n.id" -> look up n's type, then NodeData.id
637
+ - "graph.all_nodes" -> look up graph's type, then CodeGraph.all_nodes
638
+ - bare "result" -> already scope-qualified by visitor
639
+ - "ast.Name" -> filtered (external)
640
+ """
641
+
642
+ def __init__(self, type_index: dict[str, str], node_ids: set[str]) -> None:
643
+ self._type_index = type_index
644
+ self._node_ids = node_ids
645
+ # (func_qname, var_name) -> class_qname
646
+ self._var_types: dict[tuple[str, str], str] = {}
647
+ # func_qname -> return type annotation AST
648
+ self._return_types: dict[str, ast.AST] = {}
649
+
650
+ # -- Annotation helpers ------------------------------------------------
651
+
652
+ def _resolve_annotation(self, ann: ast.AST) -> str | None:
653
+ """Extract the resolved class qname from a type annotation."""
654
+ if isinstance(ann, ast.Name):
655
+ return self._type_index.get(ann.id)
656
+ if isinstance(ann, ast.Attribute):
657
+ dotted = _name_from_node(ann)
658
+ return self._type_index.get(dotted) if dotted else None
659
+ if isinstance(ann, ast.Subscript):
660
+ return self._resolve_annotation(ann.value)
661
+ if isinstance(ann, ast.BinOp) and isinstance(ann.op, ast.BitOr):
662
+ return self._resolve_annotation(ann.left) or self._resolve_annotation(ann.right)
663
+ if isinstance(ann, ast.Constant) and isinstance(ann.value, str):
664
+ return self._type_index.get(ann.value)
665
+ return None
666
+
667
+ def _resolve_subscript_inner(self, ann: ast.AST) -> str | None:
668
+ """For list[NodeData] or set[X], resolve the element type."""
669
+ if isinstance(ann, ast.Subscript):
670
+ sl = ann.slice
671
+ if isinstance(sl, ast.Name):
672
+ return self._type_index.get(sl.id)
673
+ if isinstance(sl, ast.Attribute):
674
+ dotted = _name_from_node(sl)
675
+ return self._type_index.get(dotted) if dotted else None
676
+ # dict[K, V] -- return V for .values() iteration
677
+ if isinstance(sl, ast.Tuple) and len(sl.elts) >= 2:
678
+ return self._resolve_annotation(sl.elts[-1])
679
+ # Handle X | None wrapping
680
+ if isinstance(ann, ast.BinOp) and isinstance(ann.op, ast.BitOr):
681
+ return self._resolve_subscript_inner(ann.left) or self._resolve_subscript_inner(ann.right)
682
+ return None
683
+
684
+ # -- Type collection ---------------------------------------------------
685
+
686
def collect_types(self, tree: ast.Module, module_qname: str) -> None:
    """Walk an AST and collect all type information.

    Two sub-passes per module:
    1. Collect ALL return type annotations (so method call assignments
    can look up return types regardless of definition order).
    2. Collect param types, local types, assignments, for-loop types.

    Results are stored in ``self._return_types`` (function qname -> return
    annotation AST) and ``self._var_types`` ((function qname, variable name)
    -> class qname). Nothing is returned.

    NOTE(review): the nesting below was reconstructed from a source dump that
    had lost its indentation; the two spots where more than one nesting was
    syntactically possible are marked inline — confirm against the original.
    """
    # Precompute func node id -> qualified name in one walk (O(n) not O(n²))
    func_qnames = self._build_func_qname_map(tree, module_qname)

    # Sub-pass 1: return types for every function in this module
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            fq = func_qnames.get(id(node))
            if fq and node.returns:
                self._return_types[fq] = node.returns

    # Sub-pass 2: everything else
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        func_qname = func_qnames.get(id(node))
        if not func_qname:
            continue

        # Parameter annotations
        # Raw annotation ASTs are kept as well as the resolved types: the
        # element-type inference further down needs the unresolved subscript.
        param_annotations: dict[str, ast.AST] = {}
        for arg in (node.args.posonlyargs + node.args.args + node.args.kwonlyargs):
            if arg.arg in ("self", "cls") or not arg.annotation:
                continue
            param_annotations[arg.arg] = arg.annotation
            resolved = self._resolve_annotation(arg.annotation)
            if resolved:
                self._var_types[(func_qname, arg.arg)] = resolved

        # Local annotations and constructor assignments
        local_annotations: dict[str, ast.AST] = {}
        for child in ast.walk(node):
            if isinstance(child, ast.AnnAssign) and child.target:
                name = _name_from_node(child.target)
                if not name or name in ("self", "cls"):
                    continue
                if name.startswith("self."):
                    # Annotated instance attribute: stored under the attribute
                    # path with the leading "self." removed.
                    attr = name.split(".", 1)[1]
                    resolved = self._resolve_annotation(child.annotation)
                    if resolved:
                        self._var_types[(func_qname, attr)] = resolved
                elif "." not in name:
                    local_annotations[name] = child.annotation
                    resolved = self._resolve_annotation(child.annotation)
                    if resolved:
                        self._var_types[(func_qname, name)] = resolved

            # Only single-target assignments are considered (``a = b = f()`` is skipped).
            if isinstance(child, ast.Assign) and len(child.targets) == 1:
                target = child.targets[0]
                if isinstance(child.value, ast.Call):
                    callee = _name_from_node(child.value.func)
                    if callee:
                        # Case A: constructor call — x = CodeGraph()
                        resolved = self._type_index.get(callee)
                        if resolved:
                            if isinstance(target, ast.Name):
                                self._var_types[(func_qname, target.id)] = resolved
                            elif isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name):
                                if target.value.id in ("self", "cls"):
                                    self._var_types[(func_qname, target.attr)] = resolved

                        # Case B: method call — x = obj.method()
                        elif "." in callee and isinstance(target, ast.Name):
                            obj_name, method = callee.rsplit(".", 1)
                            cls = self._resolve_var_type(obj_name, func_qname)
                            if cls:
                                method_qname = f"{cls}.{method}"
                                ret_ann = self._return_types.get(method_qname)
                                if ret_ann:
                                    ret_type = self._resolve_annotation(ret_ann)
                                    if ret_type:
                                        self._var_types[(func_qname, target.id)] = ret_type
                                    # NOTE(review): placed under ``if ret_ann`` rather than
                                    # ``if ret_type`` so the raw annotation stays available
                                    # when it does not resolve by itself — confirm.
                                    local_annotations[target.id] = ret_ann

                # Case C: assignment type propagation
                # self.x = param or x = other_typed_var
                elif isinstance(child.value, ast.Name):
                    rhs_name = child.value.id
                    rhs_type = self._resolve_var_type(rhs_name, func_qname)
                    if not rhs_type:
                        # Check param annotations directly
                        rhs_ann = param_annotations.get(rhs_name)
                        if rhs_ann:
                            rhs_type = self._resolve_annotation(rhs_ann)
                            # Also store raw annotation for subscript extraction
                            # NOTE(review): assumed to sit under ``if rhs_ann`` — confirm.
                            if isinstance(target, ast.Name):
                                local_annotations[target.id] = rhs_ann
                    if rhs_type:
                        if isinstance(target, ast.Name):
                            self._var_types[(func_qname, target.id)] = rhs_type
                        elif isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name):
                            if target.value.id in ("self", "cls"):
                                self._var_types[(func_qname, target.attr)] = rhs_type

        # For-loop and comprehension element type inference
        # Runs as a second walk so every annotation gathered above is
        # available, whatever its position in the function body.
        for child in ast.walk(node):
            if isinstance(child, ast.For) and isinstance(child.target, ast.Name):
                elem_type = self._infer_iter_element_type(
                    child.iter, func_qname, local_annotations, param_annotations,
                )
                if elem_type:
                    self._var_types[(func_qname, child.target.id)] = elem_type

            elif isinstance(child, (ast.ListComp, ast.SetComp, ast.GeneratorExp, ast.DictComp)):
                for gen in child.generators:
                    if isinstance(gen.target, ast.Name):
                        elem_type = self._infer_iter_element_type(
                            gen.iter, func_qname, local_annotations, param_annotations,
                        )
                        if elem_type:
                            self._var_types[(func_qname, gen.target.id)] = elem_type
805
+
806
def _infer_iter_element_type(
    self,
    it: ast.AST,
    func_qname: str,
    local_annotations: dict[str, ast.AST],
    param_annotations: dict[str, ast.AST],
) -> str | None:
    """Infer the element type produced by iterating over ``it``.

    Three iterator shapes are recognised, tried in this order:

    1. A bare name (``for x in items``): the name's local annotation, then
       its parameter annotation, is handed to
       ``self._resolve_subscript_inner``.
    2. A dotted call (``for x in obj.method()``): the receiver is typed via
       ``self._resolve_var_type`` and the return annotation stored in
       ``self._return_types`` under ``"<type>.<method>"`` is handed to
       ``self._resolve_subscript_inner``.
    3. A ``.values()`` call on an annotated name whose annotation is a
       subscript with at least two slots (``dict[K, V]``): the last slot is
       resolved with ``self._resolve_annotation``.

    Optional unions are unwrapped before cases 1 and 3 are applied, with
    ``None`` accepted on either side of the ``|``.

    Args:
        it: The expression being iterated over.
        func_qname: Qualified name of the function containing the loop.
        local_annotations: Annotation nodes of local variables, by name.
        param_annotations: Annotation nodes of parameters, by name.

    Returns:
        The inferred element type, or ``None`` when no case applies.
    """

    def strip_optional(ann):
        # ``X | None`` -> X and ``None | X`` -> X.  Any other union keeps its
        # left operand, which is what the previous implementation did.
        if isinstance(ann, ast.BinOp) and isinstance(ann.op, ast.BitOr):
            left = ann.left
            if isinstance(left, ast.Constant) and left.value is None:
                return ann.right
            return left
        return ann

    # Case 1: for x in local_var -- check annotations for list[X]
    if isinstance(it, ast.Name):
        for ann_map in (local_annotations, param_annotations):
            ann = ann_map.get(it.id)
            if ann:
                inner = self._resolve_subscript_inner(strip_optional(ann))
                if inner:
                    return inner

    # Case 2: for x in obj.method() -- resolve obj type, look up method return
    if isinstance(it, ast.Call):
        callee = _name_from_node(it.func)
        if callee and "." in callee:
            obj_name, method = callee.rsplit(".", 1)
            cls = self._resolve_var_type(obj_name, func_qname)
            if cls:
                ret_ann = self._return_types.get(f"{cls}.{method}")
                if ret_ann:
                    inner = self._resolve_subscript_inner(ret_ann)
                    if inner:
                        return inner

    # Case 3: for x in obj.values() on dict[K, V]
    if (
        isinstance(it, ast.Call)
        and isinstance(it.func, ast.Attribute)
        and it.func.attr == "values"
    ):
        obj = _name_from_node(it.func.value)
        if obj:
            for ann_map in (local_annotations, param_annotations):
                ann = strip_optional(ann_map.get(obj))
                if isinstance(ann, ast.Subscript):
                    sl = ann.slice
                    if isinstance(sl, ast.Tuple) and len(sl.elts) >= 2:
                        # The value type is the last slot of dict[K, V].
                        return self._resolve_annotation(sl.elts[-1])

    return None
855
+
856
def _resolve_var_type(self, name: str, func_qname: str) -> str | None:
    """Return the type recorded for ``name`` as seen from ``func_qname``.

    Three kinds of name are understood:

    - ``self`` / ``cls``: the nearest enclosing scope of ``func_qname``
      that appears among the values of ``self._type_index``.
    - A dotted chain such as ``self.graph``: the root is resolved first,
      then each attribute is followed with ``self._find_attr_type``.
    - A plain name: looked up in ``self._var_types`` under ``func_qname``
      and then under each enclosing scope in turn.

    Returns ``None`` when any step cannot be resolved.
    """
    # Receiver names: peel one trailing component at a time off the
    # function's qualified name until a known type is found.
    if name == "self" or name == "cls":
        enclosing = func_qname
        while "." in enclosing:
            enclosing = enclosing.rsplit(".", 1)[0]
            if enclosing in self._type_index.values():
                return enclosing
        return None

    # Dotted chain: type the root, then follow every attribute in order.
    root, dot, tail = name.partition(".")
    if dot:
        resolved = self._resolve_var_type(root, func_qname)
        if not resolved:
            return None
        for attr in tail.split("."):
            resolved = self._find_attr_type(resolved, attr)
            if not resolved:
                return None
        return resolved

    # Plain name: innermost scope first, widening outwards.
    scope = func_qname
    while scope:
        key = (scope, name)
        if key in self._var_types:
            return self._var_types[key]
        scope, dot, _ = scope.rpartition(".")
        if not dot:
            break
    return None
898
+
899
+ # -- Structural type inference -----------------------------------------
900
+
901
def infer_structural_types(self, edges: list[EdgeData]) -> None:
    """Type untyped variables from the fields they are seen to access.

    For a variable ``n`` with READS/WRITES edges to ``n.id`` and
    ``n.symbol_type``, find every known node that owns children with those
    names.  When exactly one owner has all of them, record it in
    ``self._var_types`` as the type of ``n`` in the edge's source scope.

    Only the first attribute after the variable is considered, and
    ``self`` / ``cls`` roots are ignored.

    NOTE(review): an "owner" is any node ID that is the dotted parent of
    another node ID; nothing here restricts owners to classes.
    """
    known_ids = self._node_ids

    # Reverse index: member name -> owners that have a child of that name.
    owners_by_field: dict[str, set[str]] = {}
    for node_id in known_ids:
        owner, dot, member = node_id.rpartition(".")
        if dot and owner in known_ids:
            owners_by_field.setdefault(member, set()).add(owner)

    # Fields accessed per (scope, variable), for variables with no type yet.
    accessed: dict[tuple[str, str], set[str]] = {}
    for edge in edges:
        if edge.edge_type not in (EdgeType.READS, EdgeType.WRITES):
            continue
        pieces = edge.target.split(".")
        if len(pieces) < 2 or pieces[0] in ("self", "cls"):
            continue
        slot = (edge.source, pieces[0])
        if slot in self._var_types:
            continue
        accessed.setdefault(slot, set()).add(pieces[1])

    # A variable is typed only when a single owner provides every field.
    for slot, fields in accessed.items():
        if not fields:
            continue
        if any(name not in owners_by_field for name in fields):
            # One unknown field rules out every candidate.
            continue
        common = set.intersection(*(owners_by_field[name] for name in fields))
        if len(common) == 1:
            (only_owner,) = common
            self._var_types[slot] = only_owner
959
+
960
+ # -- Unified resolution ------------------------------------------------
961
+
962
def resolve(self, target: str, source: str) -> str | None:
    """Resolve a raw edge target to a qualified node ID.

    Outcomes, checked in order:

    - ``target`` is already a known node ID: returned unchanged.
    - The first dotted component is in ``_BUILTINS``: ``None`` (filtered).
    - ``target`` has no dot: returned unchanged for later resolution.
    - The first component (including ``self`` / ``cls``) has a resolvable
      type: the rest of the chain is resolved against that type and the
      result is returned if there is one.
    - Otherwise the raw ``target`` is returned for later truncation.
    """
    if target in self._node_ids:
        return target

    head, dot, tail = target.partition(".")
    if head in _BUILTINS:
        return None
    if not dot:
        # Bare name -- left for the short-name resolution pass.
        return target

    # ``self``/``cls`` and ordinary variables go through the same lookup.
    owner_type = self._resolve_var_type(head, source)
    if owner_type:
        resolved = self._resolve_attr_chain(owner_type, tail.split("."))
        if resolved:
            return resolved

    # Untyped or unresolvable root -- hand back the raw target.
    return target
1003
+
1004
def _resolve_attr_chain(self, class_qname: str, attrs: list[str]) -> str | None:
    """Resolve an attribute chain against a known class.

    With an empty chain, returns ``class_qname`` if it is a node, else
    ``None``.  Otherwise the first of the following that yields a known
    node is returned:

    1. the full dotted chain under ``class_qname``;
    2. the remaining chain resolved against the type of the first
       attribute (when that type is known);
    3. the longest prefix of the chain under ``class_qname``;
    4. ``class_qname`` itself.

    If none is a node, ``"<class_qname>.<first attr>"`` is returned as a
    best-effort guess.

    Example: class_qname='CodeGraph', attrs=['_edges', 'append']
        'CodeGraph._edges.append' is not a node, 'CodeGraph._edges' is,
        so 'CodeGraph._edges' is returned (assuming no type is recorded
        for ``_edges``).
    """
    known = self._node_ids

    if not attrs:
        if class_qname in known:
            return class_qname
        return None

    # Cumulative paths: class.a, class.a.b, ... up to the full chain.
    prefixes: list[str] = []
    path = class_qname
    for attr in attrs:
        path = f"{path}.{attr}"
        prefixes.append(path)

    if prefixes[-1] in known:
        return prefixes[-1]

    # Follow the first attribute's type and resolve the rest against it.
    if len(attrs) > 1:
        attr_type = self._find_attr_type(class_qname, attrs[0])
        if attr_type:
            deeper = self._resolve_attr_chain(attr_type, attrs[1:])
            if deeper and deeper in known:
                return deeper

    # Progressive truncation: longest known prefix wins.
    for candidate in reversed(prefixes):
        if candidate in known:
            return candidate

    if class_qname in known:
        return class_qname

    return prefixes[0]  # best-effort
1045
+
1046
def _find_attr_type(self, class_qname: str, attr_name: str) -> str | None:
    """Return the recorded type of ``attr_name`` on ``class_qname``.

    ``self._var_types`` is consulted under the class's ``__init__`` scope
    first and under the class scope second; the first entry found wins.
    Returns ``None`` when neither scope has an entry.
    """
    recorded = self._var_types

    init_key = (f"{class_qname}.__init__", attr_name)
    if init_key in recorded:
        return recorded[init_key]

    class_key = (class_qname, attr_name)
    if class_key in recorded:
        return recorded[class_key]

    return None
1053
+
1054
@staticmethod
def _build_func_qname_map(tree: ast.Module, module_qname: str) -> dict[int, str]:
    """Map ``id()`` of every function node in ``tree`` to its qualified name.

    Qualified names are built by joining ``module_qname`` with the names of
    the enclosing classes and functions.  Both ``def`` and ``async def``
    nodes are recorded; classes only contribute to the prefix.

    The whole tree is covered in a single pre-order traversal, so callers
    do not need a separate walk per function.
    """
    mapping: dict[int, str] = {}

    # Explicit stack of (node, qualified name of the scope containing it).
    pending: list[tuple[ast.AST, str]] = [(tree, module_qname)]
    while pending:
        current, scope = pending.pop()

        if isinstance(current, ast.ClassDef):
            scope = f"{scope}.{current.name}"
        elif isinstance(current, (ast.FunctionDef, ast.AsyncFunctionDef)):
            scope = f"{scope}.{current.name}"
            mapping[id(current)] = scope

        # Push children reversed so they are popped in source order.
        children = list(ast.iter_child_nodes(current))
        pending.extend((child, scope) for child in reversed(children))

    return mapping
1078
+
1079
+
1080
+ # ---------------------------------------------------------------------------
1081
+ # Utility functions
1082
+ # ---------------------------------------------------------------------------
1083
+
1084
def _name_from_node(node: ast.AST) -> str | None:
    """Return the dotted name spelled by a Name/Attribute chain.

    ``a`` yields ``"a"`` and ``a.b.c`` yields ``"a.b.c"``.  Any chain that
    does not bottom out in a plain ``ast.Name`` (a call, a subscript, a
    literal, ...) yields ``None``.
    """
    # Unwind the attribute accesses from the outside in.
    trailing: list[str] = []
    while isinstance(node, ast.Attribute):
        trailing.append(node.attr)
        node = node.value

    if not isinstance(node, ast.Name):
        return None
    if not trailing:
        return node.id
    if not node.id:
        # An empty root identifier cannot anchor a dotted name.
        return None

    dotted = node.id
    for attr in reversed(trailing):
        dotted = f"{dotted}.{attr}"
    return dotted
1093
+
1094
+
1095
def _assigned_names(target: ast.AST) -> list[str]:
    """Return the plain names bound by an assignment target, in source order.

    Tuple and list targets are flattened recursively, and a starred element
    (``*rest``) contributes the name it wraps.  Targets that do not bind a
    plain name, such as attributes and subscripts, contribute nothing.
    """
    if isinstance(target, ast.Name):
        return [target.id]
    if isinstance(target, ast.Starred):
        # ``a, *rest = xs`` binds ``rest`` as well; unwrap the star.
        return _assigned_names(target.value)
    if isinstance(target, (ast.Tuple, ast.List)):
        return [name for elt in target.elts for name in _assigned_names(elt)]
    return []
1105
+
1106
+
1107
def _signature_from_funcdef(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
    """Reconstruct a human-readable signature string.

    Parameter names are listed in declaration order with ``/`` and ``*``
    separators where required.  A parameter with a default is rendered as
    ``name=...``; annotations and default values are not reproduced.

    Returns:
        ``"def name(...)"`` or ``"async def name(...)"``.
    """
    spec = node.args
    rendered: list[str] = []

    # ``defaults`` is right-aligned over positional-only AND regular
    # parameters together, so both groups must be indexed as one sequence.
    positional = [*spec.posonlyargs, *spec.args]
    first_defaulted = len(positional) - len(spec.defaults)
    last_posonly = len(spec.posonlyargs) - 1
    for index, param in enumerate(positional):
        rendered.append(f"{param.arg}=..." if index >= first_defaulted else param.arg)
        if index == last_posonly:
            rendered.append("/")

    if spec.vararg:
        rendered.append(f"*{spec.vararg.arg}")
    elif spec.kwonlyargs:
        # A bare ``*`` marks the start of keyword-only parameters.
        rendered.append("*")

    # ``kw_defaults`` uses None for keyword-only parameters without a default.
    for index, param in enumerate(spec.kwonlyargs):
        has_default = (
            index < len(spec.kw_defaults) and spec.kw_defaults[index] is not None
        )
        rendered.append(f"{param.arg}=..." if has_default else param.arg)

    if spec.kwarg:
        rendered.append(f"**{spec.kwarg.arg}")

    prefix = "async def" if isinstance(node, ast.AsyncFunctionDef) else "def"
    return f"{prefix} {node.name}({', '.join(rendered)})"