polycodegraph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. codegraph/__init__.py +10 -0
  2. codegraph/analysis/__init__.py +30 -0
  3. codegraph/analysis/_common.py +125 -0
  4. codegraph/analysis/blast_radius.py +63 -0
  5. codegraph/analysis/cycles.py +79 -0
  6. codegraph/analysis/dataflow.py +861 -0
  7. codegraph/analysis/dead_code.py +165 -0
  8. codegraph/analysis/hotspots.py +68 -0
  9. codegraph/analysis/infrastructure.py +439 -0
  10. codegraph/analysis/metrics.py +52 -0
  11. codegraph/analysis/report.py +222 -0
  12. codegraph/analysis/roles.py +323 -0
  13. codegraph/analysis/untested.py +79 -0
  14. codegraph/cli.py +1506 -0
  15. codegraph/config.py +64 -0
  16. codegraph/embed/__init__.py +35 -0
  17. codegraph/embed/chunker.py +120 -0
  18. codegraph/embed/embedder.py +113 -0
  19. codegraph/embed/query.py +181 -0
  20. codegraph/embed/store.py +360 -0
  21. codegraph/graph/__init__.py +0 -0
  22. codegraph/graph/builder.py +212 -0
  23. codegraph/graph/schema.py +69 -0
  24. codegraph/graph/store_networkx.py +55 -0
  25. codegraph/graph/store_sqlite.py +249 -0
  26. codegraph/mcp_server/__init__.py +6 -0
  27. codegraph/mcp_server/server.py +933 -0
  28. codegraph/parsers/__init__.py +0 -0
  29. codegraph/parsers/base.py +70 -0
  30. codegraph/parsers/go.py +570 -0
  31. codegraph/parsers/python.py +1707 -0
  32. codegraph/parsers/typescript.py +1397 -0
  33. codegraph/py.typed +0 -0
  34. codegraph/resolve/__init__.py +4 -0
  35. codegraph/resolve/calls.py +480 -0
  36. codegraph/review/__init__.py +31 -0
  37. codegraph/review/baseline.py +32 -0
  38. codegraph/review/differ.py +211 -0
  39. codegraph/review/hook.py +70 -0
  40. codegraph/review/risk.py +219 -0
  41. codegraph/review/rules.py +342 -0
  42. codegraph/viz/__init__.py +17 -0
  43. codegraph/viz/_style.py +45 -0
  44. codegraph/viz/dashboard.py +740 -0
  45. codegraph/viz/diagrams.py +370 -0
  46. codegraph/viz/explore.py +453 -0
  47. codegraph/viz/hld.py +683 -0
  48. codegraph/viz/html.py +115 -0
  49. codegraph/viz/mermaid.py +111 -0
  50. codegraph/viz/svg.py +77 -0
  51. codegraph/web/__init__.py +4 -0
  52. codegraph/web/server.py +165 -0
  53. codegraph/web/static/app.css +664 -0
  54. codegraph/web/static/app.js +919 -0
  55. codegraph/web/static/index.html +112 -0
  56. codegraph/web/static/views/architecture.js +1671 -0
  57. codegraph/web/static/views/graph3d.css +564 -0
  58. codegraph/web/static/views/graph3d.js +999 -0
  59. codegraph/web/static/views/graph3d_transform.js +984 -0
  60. codegraph/workspace/__init__.py +34 -0
  61. codegraph/workspace/config.py +110 -0
  62. codegraph/workspace/operations.py +294 -0
  63. polycodegraph-0.1.0.dist-info/METADATA +687 -0
  64. polycodegraph-0.1.0.dist-info/RECORD +67 -0
  65. polycodegraph-0.1.0.dist-info/WHEEL +4 -0
  66. polycodegraph-0.1.0.dist-info/entry_points.txt +2 -0
  67. polycodegraph-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1707 @@
1
+ """Python source extractor using tree-sitter."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from pathlib import Path, PurePosixPath
6
+
7
+ import tree_sitter
8
+
9
+ from codegraph.graph.schema import Edge, EdgeKind, Node, NodeKind, make_node_id
10
+ from codegraph.parsers.base import (
11
+ ExtractorBase,
12
+ load_parser,
13
+ node_text,
14
+ register_extractor,
15
+ )
16
+
17
+
18
+ def _is_test_file(rel_path: str) -> bool:
19
+ return bool(
20
+ re.search(r"(^|[/\\])(tests?[/\\]|test_)", rel_path)
21
+ or rel_path.endswith("_test.py")
22
+ )
23
+
24
+
25
+ def _file_to_qualname(rel_path: str) -> str:
26
+ """Convert repo-relative path like 'src/foo/bar.py' to 'src.foo.bar'."""
27
+ p = PurePosixPath(rel_path)
28
+ parts = list(p.with_suffix("").parts)
29
+ if parts and parts[-1] == "__init__":
30
+ parts.pop()
31
+ return ".".join(parts)
32
+
33
+
34
def _get_docstring(block_node: tree_sitter.Node, src: bytes) -> str | None:
    """Return the docstring text of a module/class/function body, if any.

    Only the first real statement of ``block_node`` is inspected: a
    docstring is by definition the leading string-literal statement, so a
    bare string that appears later in the body is not a docstring.
    Comment nodes and anonymous tokens in front of that statement are
    skipped.

    The surrounding quotes (triple or single) are removed, as is an
    ``r`` / ``u`` string prefix, and the result is whitespace-stripped.
    Returns ``None`` when the body does not start with a string literal.
    """
    for child in block_node.children:
        if not child.is_named or child.type == "comment":
            # Comments may precede the docstring without displacing it.
            continue
        if child.type != "expression_statement":
            # The first statement is something else: no docstring.
            return None
        for sub in child.children:
            if sub.type != "string":
                continue
            raw = node_text(sub, src).strip()
            # Drop a raw/unicode prefix so the quote stripping below can
            # see the opening quote (``r"""..."""`` is a valid docstring).
            unprefixed = raw.lstrip("rRuU")
            if unprefixed[:1] in ('"', "'"):
                raw = unprefixed
            # Strip triple quotes first so they are not mistaken for
            # three separate single-quote characters.
            for q in ('"""', "'''", '"', "'"):
                if raw.startswith(q) and raw.endswith(q):
                    raw = raw[len(q):-len(q)]
                    break
            return raw.strip()
        return None
    return None
47
+
48
+
49
def _extract_types_from_type_node(
    type_node: tree_sitter.Node, src: bytes
) -> list[str]:
    """List the simple type names mentioned by a ``type`` AST node.

    The first named child of ``type_node`` is handed to
    ``_flatten_type_expr``, which understands:

    * a single identifier / attribute  -> one-element list
    * a ``A | B | ...`` union          -> the operand names, flattened
    * ``Union[A, B]`` / ``Optional[A]`` -> the inner names

    Every other shape (string forward references, generics such as
    ``list[Foo]``) yields an empty list, as does a ``type`` node with no
    named child at all.
    """
    first_named = next(
        (kid for kid in type_node.children if kid.is_named), None
    )
    if first_named is None:
        return []
    return _flatten_type_expr(first_named, src)
72
+
73
+
74
def _flatten_type_expr(node: tree_sitter.Node, src: bytes) -> list[str]:
    """Recursively flatten a type expression into bare type names.

    Supported shapes:

    * ``identifier`` / ``attribute`` -> ``[text]``
    * ``type`` wrapper               -> result for its first named child
    * ``binary_operator`` using ``|`` -> concatenated operand results
    * ``subscript`` / ``generic_type`` whose head is ``Union`` or
      ``Optional`` (optionally dotted, e.g. ``typing.Optional``)
      -> concatenated results for the bracketed operands

    Anything else returns an empty list.
    """
    kind = node.type
    if kind in ("identifier", "attribute"):
        return [node_text(node, src)]

    if kind == "type":
        # Wrapping ``type`` node: descend into its first named child.
        for c in node.children:
            if c.is_named:
                return _flatten_type_expr(c, src)
        return []

    if kind == "binary_operator":
        # Only ``A | B`` is a union; other operators are not type syntax.
        if not any(c.type == "|" for c in node.children if not c.is_named):
            return []
        names: list[str] = []
        for c in node.children:
            if c.is_named:
                names.extend(_flatten_type_expr(c, src))
        return names

    if kind in ("subscript", "generic_type"):
        # Depending on grammar version ``Union[A, B]`` / ``Optional[A]``
        # arrive either as ``generic_type`` (head identifier followed by
        # a ``type_parameter``) or as a plain ``subscript``.
        head_node: tree_sitter.Node | None
        if kind == "subscript":
            head_node = node.child_by_field_name("value")
        else:
            head_node = next(
                (c for c in node.children
                 if c.type in ("identifier", "attribute")),
                None,
            )
        if head_node is None:
            return []
        head_leaf = node_text(head_node, src).rsplit(".", 1)[-1]
        if head_leaf not in ("Union", "Optional"):
            return []

        # Identify the head by its byte span rather than by object
        # identity: the binding hands out a new Python wrapper per lookup,
        # so ``c is head_node`` is not a reliable way to recognise it and
        # the head name would otherwise leak into the result.
        head_span = (head_node.start_byte, head_node.end_byte)
        operands: list[str] = []
        for c in node.children:
            if not c.is_named:
                continue
            if (c.start_byte, c.end_byte) == head_span:
                continue
            if c.type == "type_parameter":
                for inner_c in c.children:
                    if inner_c.is_named:
                        operands.extend(_flatten_type_expr(inner_c, src))
            else:
                operands.extend(_flatten_type_expr(c, src))
        return operands

    return []
125
+
126
+
127
def _collect_class_attr_types(
    body: tree_sitter.Node, src: bytes
) -> dict[str, list[str]]:
    """Map each annotated class attribute to the type names bound to it.

    Two passes are made over the class ``body``:

    1. class-level annotated assignments (``svc: Service``,
       ``svc: Foo | Bar``, ``svc: Union[Foo, Bar]``);
    2. ``self.X`` bindings inside every ``__init__`` (plain or decorated),
       including those nested in ``if/else`` branches.

    So a backend-facade pattern like::

        def __init__(self, x):
            if x:
                self._b: Foo = Foo()
            else:
                self._b = Bar()

    yields ``{"_b": ["Foo", "Bar"]}``. Type names keep first-seen order
    and are never duplicated within one attribute's list.
    """
    attr_types: dict[str, list[str]] = {}

    # Pass 1: ``name: Type [= value]`` statements directly in the body.
    for stmt in body.children:
        if stmt.type != "expression_statement":
            continue
        for candidate in stmt.children:
            if candidate.type != "assignment":
                continue
            # First identifier child is the attribute name; the (last)
            # ``type`` child is its annotation.
            target = next(
                (k for k in candidate.children if k.type == "identifier"),
                None,
            )
            annotation: tree_sitter.Node | None = None
            for k in candidate.children:
                if k.type == "type":
                    annotation = k
            if target is None or annotation is None:
                continue
            attr_name = node_text(target, src)
            found = _extract_types_from_type_node(annotation, src)
            if not attr_name or not found:
                continue
            known = attr_types.setdefault(attr_name, [])
            known.extend(t for t in dict.fromkeys(found) if t not in known)

    # Pass 2: ``self.X = ...`` / ``self.X: T = ...`` inside ``__init__``.
    for stmt in body.children:
        if stmt.type == "function_definition":
            init_def = stmt
        elif stmt.type == "decorated_definition":
            init_def = next(
                (k for k in stmt.children
                 if k.type == "function_definition"),
                None,
            )
        else:
            init_def = None
        if init_def is None:
            continue
        def_name = init_def.child_by_field_name("name")
        if def_name is None or node_text(def_name, src) != "__init__":
            continue
        init_body = init_def.child_by_field_name("body")
        if init_body is not None:
            _collect_self_attr_types_in_block(init_body, src, attr_types)
    return attr_types
190
+
191
+
192
def _collect_self_attr_types_in_block(
    block: tree_sitter.Node,
    src: bytes,
    out: dict[str, list[str]],
) -> None:
    """Record ``self.X[: T] = Y(...)`` bindings found under ``block``.

    Results are merged into ``out`` in place. The walk descends through
    nested ``block`` nodes and through ``if`` / ``with`` / ``try`` /
    ``for`` / ``while`` statements together with their ``elif`` /
    ``else`` / ``except`` / ``finally`` clauses, so every arm of a
    conditional contributes. Only ``assignment`` nodes are examined, so
    walrus expressions and ``setattr`` calls are not picked up.
    """
    compound_kinds = (
        "if_statement", "with_statement", "try_statement",
        "for_statement", "while_statement", "elif_clause", "else_clause",
        "except_clause", "finally_clause",
    )
    for entry in block.children:
        kind = entry.type
        if kind == "expression_statement":
            for part in entry.children:
                if part.type == "assignment":
                    _maybe_record_self_assign(part, src, out)
        elif kind == "block":
            # Clause bodies are wrapped in a ``block``; its children are
            # the real statements.
            _collect_self_attr_types_in_block(entry, src, out)
        elif kind in compound_kinds:
            # Visiting every named child reaches both the clause's own
            # ``block`` and any chained elif/else/except clauses.
            for nested in entry.children:
                if nested.is_named:
                    _collect_self_attr_types_in_block(nested, src, out)
225
+
226
+
227
def _maybe_record_self_assign(
    assignment: tree_sitter.Node,
    src: bytes,
    out: dict[str, list[str]],
) -> None:
    """Record the type(s) of a ``self.X[: T] = expr`` assignment in ``out``.

    Nothing is recorded unless the left-hand side is an attribute access
    on the literal name ``self``. The annotation is preferred as the
    source of type names; when it is missing or yields nothing, the
    constructor name of the right-hand side is used instead. Names are
    appended to ``out[X]`` without duplicates.
    """
    # Split the children into target / annotation / value. The first
    # child before ``=`` is the target, the first one after it the value.
    target: tree_sitter.Node | None = None
    annotation: tree_sitter.Node | None = None
    value: tree_sitter.Node | None = None
    past_equals = False
    for part in assignment.children:
        kind = part.type
        if kind == "=":
            past_equals = True
        elif kind == "type":
            annotation = part
        elif past_equals:
            if value is None:
                value = part
        elif target is None:
            target = part

    if target is None or target.type != "attribute":
        return
    receiver = target.child_by_field_name("object")
    member = target.child_by_field_name("attribute")
    if receiver is None or member is None:
        return
    if node_text(receiver, src) != "self":
        return
    attr_name = node_text(member, src)
    if not attr_name:
        return

    found: list[str] = (
        list(_extract_types_from_type_node(annotation, src))
        if annotation is not None
        else []
    )
    # No usable annotation: fall back to the RHS constructor name.
    if not found and value is not None:
        ctor = _ctor_name_from_expr(value, src)
        if ctor:
            found.append(ctor)
    if not found:
        return

    known = out.setdefault(attr_name, [])
    for type_name in found:
        if type_name not in known:
            known.append(type_name)
280
+
281
+
282
def _ctor_name_from_expr(
    node: tree_sitter.Node, src: bytes
) -> str | None:
    """Return the constructor name of a call expression such as ``Foo(...)``.

    Only ``call`` nodes are considered. The callee text is cut at its last
    dot, so ``mod.Foo(...)`` gives ``Foo``. The name is returned only when
    its first character is uppercase, which keeps ordinary function calls
    like ``make_thing()`` from being treated as types. Every other
    expression kind (bare names, walrus expressions, ...) gives ``None``.
    """
    if node.type != "call":
        return None
    callee = node.child_by_field_name("function")
    if callee is None:
        return None
    leaf = node_text(callee, src).rsplit(".", 1)[-1]
    return leaf if leaf and leaf[0].isupper() else None
304
+
305
+
306
+ # --- Argument expression simplification ---------------------------------
307
+ #
308
+ # Per DF0 spec: "simple" arg expressions (literals, identifiers, attributes,
309
+ # subscripts) are captured verbatim; anything else collapses to "<expr>".
310
# Node types whose source text is recorded verbatim as an argument value.
_SIMPLE_ARG_TYPES: frozenset[str] = frozenset((
    "identifier",
    "string",
    "integer",
    "float",
    "true",
    "false",
    "none",
    "attribute",
    "subscript",
))


def _simplify_arg(node: tree_sitter.Node, src: bytes) -> str:
    """Give the node's source text for simple forms, ``"<expr>"`` otherwise."""
    if node.type not in _SIMPLE_ARG_TYPES:
        return "<expr>"
    return node_text(node, src)
322
+
323
+
324
def _variadic_param_name(pattern: tree_sitter.Node, src: bytes) -> str | None:
    """Return ``*name`` / ``**name`` for a splat-pattern node, else ``None``."""
    stars = {
        "list_splat_pattern": "*",
        "dictionary_splat_pattern": "**",
    }.get(pattern.type)
    if stars is None:
        return None
    ident = next(
        (c for c in pattern.children if c.type == "identifier"), None
    )
    if ident is None:
        return None
    return f"{stars}{node_text(ident, src)}"


def _extract_params(
    params_node: tree_sitter.Node,
    src: bytes,
    *,
    skip_self_or_cls: bool,
) -> list[dict[str, str | None]]:
    """Walk a ``parameters`` AST block and return DF0 param descriptors.

    Each descriptor is ``{"name": ..., "type": ..., "default": ...}``
    where ``type`` and ``default`` hold the raw source text or ``None``.
    Variadic parameters are captured with a ``*`` / ``**`` prefix on the
    name, whether or not they carry an annotation (``*args`` and
    ``*args: int`` are both reported as ``*args``).

    When ``skip_self_or_cls`` is True and the first recognised parameter
    is named ``self`` or ``cls``, that parameter is omitted. Children
    that are not parameters (separators such as a bare ``*`` or ``/``,
    comments) are ignored.
    """
    out: list[dict[str, str | None]] = []
    first_seen = False
    for child in params_node.children:
        if not child.is_named:
            continue

        name: str | None = None
        type_text: str | None = None
        default_text: str | None = None
        kind = child.type

        if kind == "identifier":
            name = node_text(child, src)
        elif kind == "typed_parameter":
            # The annotated target is either a plain identifier or a
            # splat pattern (``*args: int`` / ``**kw: Any``); the latter
            # has no direct identifier child and must be unwrapped.
            for c in child.children:
                if c.type == "identifier":
                    name = node_text(c, src)
                else:
                    name = _variadic_param_name(c, src)
                if name is not None:
                    break
            type_n = next(
                (c for c in child.children if c.type == "type"), None
            )
            if type_n is not None:
                type_text = node_text(type_n, src)
        elif kind == "default_parameter":
            name_n = child.child_by_field_name("name")
            value_n = child.child_by_field_name("value")
            if name_n is not None:
                name = node_text(name_n, src)
            if value_n is not None:
                default_text = node_text(value_n, src)
        elif kind == "typed_default_parameter":
            name_n = child.child_by_field_name("name")
            type_n = child.child_by_field_name("type")
            value_n = child.child_by_field_name("value")
            if name_n is not None:
                name = node_text(name_n, src)
            if type_n is not None:
                type_text = node_text(type_n, src)
            if value_n is not None:
                default_text = node_text(value_n, src)
        else:
            # Bare ``*args`` / ``**kwargs``; anything else yields None.
            name = _variadic_param_name(child, src)

        if name is None:
            continue
        if skip_self_or_cls and not first_seen and name in ("self", "cls"):
            first_seen = True
            continue
        first_seen = True
        out.append({"name": name, "type": type_text, "default": default_text})
    return out
412
+
413
+
414
def _extract_call_args(
    arg_list: tree_sitter.Node, src: bytes
) -> tuple[list[str], dict[str, str]]:
    """Return ``(args, kwargs)`` for a ``call.argument_list`` AST node.

    Capture rules:

    * positional arguments are reduced with ``_simplify_arg``;
    * ``name=value`` becomes ``kwargs[name] = simplified value``;
    * ``*spread`` becomes ``"*<text>"`` in ``args`` (``"<expr>"`` when the
      splat has no named operand);
    * ``**spread`` becomes ``kwargs["**"] = <text>``; with several
      ``**`` spreads the last one wins.

    Comments placed between arguments are skipped so they are not
    mistaken for positional arguments.
    """
    args: list[str] = []
    kwargs: dict[str, str] = {}
    for child in arg_list.children:
        # Punctuation is anonymous; comments are named but not arguments.
        if not child.is_named or child.type == "comment":
            continue
        if child.type == "keyword_argument":
            name_n = child.child_by_field_name("name")
            value_n = child.child_by_field_name("value")
            if name_n is not None and value_n is not None:
                kwargs[node_text(name_n, src)] = _simplify_arg(value_n, src)
        elif child.type == "list_splat":
            inner = next((c for c in child.children if c.is_named), None)
            if inner is not None:
                args.append(f"*{node_text(inner, src)}")
            else:
                args.append("<expr>")
        elif child.type == "dictionary_splat":
            inner = next((c for c in child.children if c.is_named), None)
            if inner is not None:
                kwargs["**"] = node_text(inner, src)
        else:
            args.append(_simplify_arg(child, src))
    return args, kwargs
451
+
452
+
453
+ # --- DF1: HTTP route + SQLAlchemy detection ---------------------------
454
+ #
455
+ # Patterns are regex-based on the raw decorator / call text. Tree-sitter
456
+ # gives us reliable syntactic boundaries; we lean on regex for the inner
457
+ # semantic shape (HTTP method names, model arguments) since the surface
458
+ # vocabulary is small and well-known.
459
+
460
+ # Recognised HTTP verbs / route helpers across FastAPI, Flask, aiohttp.
461
+ _HTTP_VERBS: tuple[str, ...] = (
462
+ "get", "post", "put", "delete", "patch",
463
+ "head", "options", "trace", "websocket",
464
+ )
465
+
466
+ # Decorator forms:
467
+ # @<router>.<verb>("/path", ...)
468
+ # @<router>.<verb>('/path', ...)
469
+ # router is any identifier (app, router, blueprint, bp, ...).
470
+ _ROUTE_VERB_RE = re.compile(
471
+ r"@\s*(?P<router>[\w.]+)\.(?P<verb>"
472
+ + "|".join(_HTTP_VERBS)
473
+ + r")\s*\(\s*['\"](?P<path>[^'\"]+)['\"]"
474
+ )
475
+ # @<router>.route("/path", methods=[...]) — Flask shape.
476
+ _ROUTE_GENERIC_RE = re.compile(
477
+ r"@\s*(?P<router>[\w.]+)\.route\s*\(\s*['\"](?P<path>[^'\"]+)['\"]"
478
+ )
479
+ _METHODS_KW_RE = re.compile(
480
+ r"methods\s*=\s*\[(?P<methods>[^\]]*)\]"
481
+ )
482
+ _METHOD_TOKEN_RE = re.compile(r"['\"]([A-Za-z]+)['\"]")
483
+
484
+ # FastAPI-style routers vs Flask app/blueprint heuristic for `framework`.
485
+ _FASTAPI_ROUTER_TOKENS: frozenset[str] = frozenset({
486
+ "router", "api_router", "apirouter",
487
+ })
488
+ _FLASK_ROUTER_TOKENS: frozenset[str] = frozenset({
489
+ "blueprint", "bp", "blueprints",
490
+ })
491
+
492
+
493
def _classify_framework(router_name: str, has_methods_kw: bool) -> str:
    """Guess the web framework for ROUTE edge metadata.

    The answer is ``"flask"`` when the decorator is flagged as carrying a
    ``methods=[...]`` keyword, or when the last dotted segment of
    ``router_name`` (case-insensitive) is a known Flask name such as
    ``bp`` / ``blueprint``. Everything else, recognised FastAPI router
    names included, is reported as ``"fastapi"``.
    """
    leaf = router_name.rsplit(".", 1)[-1].lower()
    if has_methods_kw or leaf in _FLASK_ROUTER_TOKENS:
        return "flask"
    # Known FastAPI names and unrecognised names share the same answer.
    return "fastapi"
511
+
512
+
513
def _extract_route_specs(
    decorators: list[str],
) -> list[dict[str, str]]:
    """Describe every HTTP route declared by the given decorator strings.

    ``@x.route("/p", methods=["GET", "POST"])`` contributes one dict per
    listed method (``GET`` alone when ``methods=`` is absent), so the
    caller can emit one ROUTE edge each. ``@x.get("/p")`` and the other
    per-verb forms contribute a single dict. Decorators matching neither
    shape are ignored.

    Each dict has the keys ``method`` (uppercase), ``path``,
    ``framework`` and ``router`` (the raw router-variable text).
    """
    specs: list[dict[str, str]] = []
    for decorator in decorators:
        text = decorator.strip()

        # The generic ``.route(...)`` form is tried first because it
        # carries no verb for the per-verb pattern to find.
        generic = _ROUTE_GENERIC_RE.search(text)
        if generic is not None:
            router_name = generic.group("router")
            route_path = generic.group("path")
            flavour = _classify_framework(router_name, has_methods_kw=True)
            methods_kw = _METHODS_KW_RE.search(text)
            if methods_kw is None:
                verbs = ["GET"]
            else:
                listed = methods_kw.group("methods")
                verbs = [t.upper() for t in _METHOD_TOKEN_RE.findall(listed)]
            specs.extend(
                {
                    "method": verb,
                    "path": route_path,
                    "framework": flavour,
                    "router": router_name,
                }
                for verb in verbs
            )
            continue

        per_verb = _ROUTE_VERB_RE.search(text)
        if per_verb is None:
            continue
        router_name = per_verb.group("router")
        specs.append({
            "method": per_verb.group("verb").upper(),
            "path": per_verb.group("path"),
            "framework": _classify_framework(
                router_name, has_methods_kw=False
            ),
            "router": router_name,
        })
    return specs
566
+
567
+
568
+ # --- SQLAlchemy detection ----------------------------------------------
569
+ #
570
+ # We detect data-access patterns at parse time and emit READS_FROM /
571
+ # WRITES_TO edges with ``dst="unresolved::<ModelName>"``. The post-build
572
+ # resolver rewrites these to real CLASS node ids when the model is in
573
+ # repo; any that remain unresolved are dropped (per DF1 spec).
574
+
575
+ # Outer verbs we recognise on session/db/conn.
576
+ _SQL_READ_OUTER: frozenset[str] = frozenset({"query", "get", "scalar", "scalars"})
577
+ _SQL_WRITE_OUTER: frozenset[str] = frozenset({"add", "add_all", "delete", "merge"})
578
+ # Inner verbs in session.execute(<inner>(Model)).
579
+ _SQL_READ_INNER: frozenset[str] = frozenset({"select"})
580
+ _SQL_WRITE_INNER: frozenset[str] = frozenset({"insert", "update", "delete"})
581
+
582
+ # `session`, `db.session`, `db`, `conn`, `cursor`, ... — left-most token
583
+ # of a chain that suggests an ORM/connection root. We accept any
584
+ # identifier and rely on ``execute``/``query``/``add``/etc. as the verb
585
+ # trigger, but record the chain's last identifier in metadata.
586
+ _SESSION_HEAD_TOKENS: frozenset[str] = frozenset({
587
+ "session", "db", "conn", "connection", "cursor",
588
+ })
589
+
590
+
591
+ def _strip_call_suffix(name: str) -> str:
592
+ """Drop `()` and trailing chained calls — `Foo().bar` -> `Foo.bar`."""
593
+ out: list[str] = []
594
+ depth = 0
595
+ for ch in name:
596
+ if ch == "(":
597
+ depth += 1
598
+ continue
599
+ if ch == ")":
600
+ if depth > 0:
601
+ depth -= 1
602
+ continue
603
+ if depth == 0:
604
+ out.append(ch)
605
+ return "".join(out).strip().rstrip(".")
606
+
607
+
608
def _is_session_chain(target: str) -> bool:
    """Tell whether a dotted chain starts at a session/db-like handle.

    True when the first segment (case-insensitive) is one of
    ``_SESSION_HEAD_TOKENS`` (``session.query``, ``db.session.query``,
    ...), or when the chain starts with ``self`` and its second segment
    is such a token (``self.session.X``, ``self.db.X``). An empty
    ``target`` is never a session chain.
    """
    if not target:
        return False
    head, *rest = target.split(".")
    head = head.lower()
    if head in _SESSION_HEAD_TOKENS:
        return True
    # ``self.<session-token>.X``: the handle is an instance attribute.
    return (
        head == "self"
        and bool(rest)
        and rest[0].lower() in _SESSION_HEAD_TOKENS
    )
628
+
629
+
630
def _unwrap_to_root_call(node: tree_sitter.Node | None) -> tree_sitter.Node | None:
    """Descend a method-call chain to its left-most ``call`` node.

    While the current node is a ``call`` whose callee is an ``attribute``
    whose ``object`` is itself a ``call``, step into that inner call. For
    ``select(Model).where(...).order_by(...)`` this lands on
    ``select(Model)``, whose arguments name the model. A node that is not
    a ``call`` is returned unchanged.
    """
    current: tree_sitter.Node | None = node
    while current is not None and current.type == "call":
        callee = current.child_by_field_name("function")
        if callee is None or callee.type != "attribute":
            break
        receiver = callee.child_by_field_name("object")
        if receiver is None or receiver.type != "call":
            break
        current = receiver
    return current
652
+
653
+
654
def _model_name_from_call_arg(arg_text: str) -> str | None:
    """Pull a model (class-like) name out of a call-argument expression.

    Recognised inputs:

    * ``User``                      -> ``User``
    * ``User(...)``                 -> ``User``
    * ``some_chain.User``           -> ``User`` (last dotted segment)
    * ``[User(...), Other()]``      -> ``User`` (first identifier in the
      list that starts with an uppercase letter)

    Returns ``None`` for empty input, or when the candidate is not a
    plain identifier starting with an uppercase letter.
    """
    if not arg_text:
        return None
    text = arg_text.strip()

    if text.startswith("[") and text.endswith("]"):
        # List literal, e.g. the argument of ``add_all([...])``.
        identifiers: list[str] = re.findall(
            r"[A-Za-z_][A-Za-z0-9_]*", text[1:-1]
        )
        return next((tok for tok in identifiers if tok[0].isupper()), None)

    # Drop call parentheses, then keep the last dotted segment.
    leaf = _strip_call_suffix(text).rsplit(".", 1)[-1]
    if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", leaf) is None:
        return None
    return leaf if leaf[0].isupper() else None
684
+
685
+
686
+ # --- Public-API pragma detection ----------------------------------------
687
+ #
688
+ # A function or class can be exempted from dead-code analysis by prefixing
689
+ # its definition with one of these pragma comments on the line immediately
690
+ # before the def/class (or before the topmost decorator). A trailing
691
+ # same-line pragma (``def foo(): ... # pragma: codegraph-public-api``) is
692
+ # also accepted.
693
+ _PUBLIC_API_PRAGMAS: tuple[str, ...] = (
694
+ "# pragma: codegraph-public-api",
695
+ "# codegraph: public-api",
696
+ )
697
+
698
+
699
+ def _line_has_public_api_pragma(line: str) -> bool:
700
+ stripped = line.strip()
701
+ return any(pragma in stripped for pragma in _PUBLIC_API_PRAGMAS)
702
+
703
+
704
def _has_public_api_pragma(def_node: tree_sitter.Node, src: bytes) -> bool:
    """Return True if the def/class node is marked with a public-API pragma.

    A pragma is honoured in either of two places:

    * as a trailing comment on the first line of the definition. When the
      definition is decorated, both the topmost decorator's line and the
      ``def`` / ``class`` line itself are checked;
    * on the nearest non-blank line above the definition (above the
      topmost decorator when decorators are present).
    """
    decorated = (
        def_node.parent is not None
        and def_node.parent.type == "decorated_definition"
    )
    container: tree_sitter.Node = def_node.parent if decorated else def_node
    start_byte = container.start_byte

    # Same-line trailing pragma. For a decorated definition the container
    # starts at the first decorator, so the def/class line needs its own
    # check or a pragma written there would be missed.
    line_starts = [start_byte]
    if decorated:
        line_starts.append(def_node.start_byte)
    for line_start in line_starts:
        line_end = src.find(b"\n", line_start)
        if line_end == -1:
            # Last line of the file: a trailing comment lies beyond the
            # node's own end, so read through to the end of the source.
            line_end = len(src)
        sig_line = src[line_start:line_end].decode("utf-8", errors="replace")
        if _line_has_public_api_pragma(sig_line):
            return True

    # Walk backward over blank lines; the first non-blank line decides.
    cursor = start_byte
    # Step back past the newline that ends the previous line.
    if cursor > 0 and src[cursor - 1:cursor] == b"\n":
        cursor -= 1
    while cursor > 0:
        prev_nl = src.rfind(b"\n", 0, cursor)
        line_start = prev_nl + 1 if prev_nl != -1 else 0
        line = src[line_start:cursor].decode("utf-8", errors="replace")
        if not line.strip():
            # Blank line (or just the definition's own indentation).
            cursor = prev_nl
            if cursor <= 0:
                return False
            continue
        return _line_has_public_api_pragma(line)
    return False
749
+
750
+
751
def _get_function_decorators(func_node: tree_sitter.Node, src: bytes) -> list[str]:
    """Return the source text of each decorator on a function/class node.

    Decorators are not children of the definition itself: a decorated
    definition is wrapped in a ``decorated_definition`` parent that holds
    the ``decorator`` nodes. When such a parent exists its children are
    scanned; otherwise the node's own children are scanned.
    """
    parent = func_node.parent
    if parent is not None and parent.type == "decorated_definition":
        holder = parent
    else:
        holder = func_node
    return [
        node_text(kid, src)
        for kid in holder.children
        if kid.type == "decorator"
    ]
772
+
773
+
774
+ # --- Entry-point decorator catalog ---------------------------------------
775
+ #
776
# Decorator fragments that mark a definition as an entry point. Each
# fragment is matched as a plain substring of the decorator text after its
# leading ``@`` has been dropped, so ``.command`` matches ``@app.command()``
# whatever the receiver is called. Order is irrelevant: any single match
# is enough.
781
+ _ENTRYPOINT_DECORATOR_SUFFIXES: tuple[str, ...] = (
782
+ # Typer / Click — bound to any local Typer/Click instance.
783
+ ".command", ".callback", ".group",
784
+ # FastAPI / Flask / aiohttp — HTTP and websocket route decorators.
785
+ ".get", ".post", ".put", ".delete", ".patch", ".head", ".options",
786
+ ".trace", ".websocket", ".route", ".on_event", ".middleware",
787
+ ".before_request", ".after_request", ".teardown_request",
788
+ ".errorhandler",
789
+ # Celery.
790
+ ".task",
791
+ # SQLAlchemy.
792
+ ".listens_for",
793
+ # MCP protocol server (anthropic mcp-python-sdk and similar).
794
+ ".list_tools", ".call_tool", ".list_resources", ".read_resource",
795
+ ".list_prompts", ".get_prompt",
796
+ )
797
+
798
+ # Decorator names matched anywhere in the raw decorator text (covers bare
799
+ # ``@shared_task`` as well as ``@app.shared_task`` and ``@pytest.fixture``).
800
+ _ENTRYPOINT_DECORATOR_CONTAINS: tuple[str, ...] = (
801
+ "shared_task",
802
+ "pytest.fixture",
803
+ "pytest.mark",
804
+ "abstractmethod",
805
+ "abc.abstractmethod",
806
+ "admin.register",
807
+ "receiver",
808
+ "login_required",
809
+ "permission_required",
810
+ "event.listens_for",
811
+ # Local registry decorators commonly used in this codebase / MCP servers.
812
+ "_register",
813
+ )
814
+
815
+
816
+ def _is_entry_point(
817
+ decorators: list[str],
818
+ name: str,
819
+ *,
820
+ extra_decorator_patterns: tuple[str, ...] = (),
821
+ ) -> bool:
822
+ """Return True if any decorator matches a known entry-point pattern.
823
+
824
+ ``name`` is currently unused but kept for forward compatibility with
825
+ name-glob configuration in DeadCodeConfig.
826
+ """
827
+ if not decorators:
828
+ return False
829
+ for raw in decorators:
830
+ text = raw.strip()
831
+ # Drop the leading '@' for substring matching, but keep the raw
832
+ # form for prefix matching.
833
+ body = text[1:] if text.startswith("@") else text
834
+ for suffix in _ENTRYPOINT_DECORATOR_SUFFIXES:
835
+ if suffix in body:
836
+ return True
837
+ for needle in _ENTRYPOINT_DECORATOR_CONTAINS:
838
+ if needle in body:
839
+ return True
840
+ for pattern in extra_decorator_patterns:
841
+ stripped = pattern.lstrip("@").strip()
842
+ if stripped and stripped in body:
843
+ return True
844
+ return False
845
+
846
+
847
@register_extractor
class PythonExtractor(ExtractorBase):
    """Extractor for Python (``.py``) source files.

    ``parse_file`` parses one file with the tree-sitter Python parser and
    returns the graph nodes and edges found in it.
    """

    # Language identifier declared by this extractor.
    language = "python"
    # File extensions declared by this extractor.
    # NOTE(review): presumably consumed by ``register_extractor`` / the
    # builder to route files here - confirm against ExtractorBase.
    extensions = (".py",)

    # Optional user-supplied decorator patterns (set by GraphBuilder before
    # parsing). Matched as substring of the raw decorator text via
    # ``_is_entry_point``.
    extra_entry_point_decorators: tuple[str, ...] = ()
856
+
857
def parse_file(
    self, path: Path, repo_root: Path
) -> tuple[list[Node], list[Edge]]:
    """Parse one Python file into graph nodes and edges.

    Always emits a MODULE node for the file; when the file is recognised as
    a test file a TEST node with the same qualname is emitted as well.
    Definitions and imports are then collected via ``_visit_block`` and
    module-level call expressions via ``_collect_calls``.

    Args:
        path: Location of the file to read.
        repo_root: Root that ``path`` is made relative to; the relative
            POSIX path is what gets recorded on nodes and edges.

    Returns:
        ``(nodes, edges)`` for this file.
    """
    src = path.read_bytes()
    rel = path.relative_to(repo_root).as_posix()
    tree = load_parser("python").parse(src)
    root = tree.root_node

    is_test = _is_test_file(rel)
    qualname = _file_to_qualname(rel)
    display_name = qualname.split(".")[-1] if qualname else rel
    last_line = root.end_point[0] + 1

    def whole_file_node(
        node_id: str, kind: NodeKind, metadata: dict[str, object]
    ) -> Node:
        # MODULE and TEST nodes differ only in id, kind and metadata.
        return Node(
            id=node_id,
            kind=kind,
            name=display_name,
            qualname=qualname,
            file=rel,
            line_start=1,
            line_end=last_line,
            language="python",
            metadata=metadata,
        )

    module_id = make_node_id(NodeKind.MODULE, qualname, rel)
    nodes: list[Node] = [
        whole_file_node(module_id, NodeKind.MODULE, {"is_test": is_test}),
    ]
    edges: list[Edge] = []

    if is_test:
        test_id = make_node_id(NodeKind.TEST, qualname, rel)
        nodes.append(
            whole_file_node(test_id, NodeKind.TEST, {"is_test": True})
        )

    self._visit_block(root, rel, qualname, module_id, None, src, nodes, edges)
    # Top-level call expressions (e.g. `Widget("a")`) are attributed to the
    # module itself; `_collect_calls` does not descend into function/class
    # definitions, so their inner calls are not counted a second time here.
    self._collect_calls(root, rel, module_id, src, edges)
    return nodes, edges
910
+
911
def _visit_block(
    self,
    block: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    enclosing_class_id: str | None,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register the definitions and imports found in a statement block.

    Classes, functions (plain or decorated) and import statements that are
    children of ``block`` are handed to their handlers.  Compound statements
    are searched recursively, including their ``elif`` / ``else`` /
    ``except`` / ``finally`` / ``case`` clauses, so that e.g. a fallback
    ``def`` under ``except ImportError:`` is not lost.

    Args:
        block: Node whose children are inspected (module root, a ``block``,
            or a compound statement / clause during recursion).
        rel: Repo-relative POSIX path of the file.
        parent_qualname: Qualname prefix for definitions found here.
        parent_id: Node id that definitions are DEFINED_IN and that import
            edges originate from.
        enclosing_class_id: Truthy when ``block`` belongs to a class body;
            functions are then registered as METHOD instead of FUNCTION.
        src: Raw file bytes, used to slice node text.
        nodes: Output list, extended in place.
        edges: Output list, extended in place.
    """
    definition_types = ("function_definition", "class_definition")
    # Node types that may (transitively) contain a ``block`` of statements
    # sharing the current scope.  Clause nodes wrap their block one level
    # deeper than the statement they belong to, hence the recursion.
    nested_scope_types = (
        "if_statement", "with_statement", "try_statement",
        "for_statement", "while_statement", "match_statement",
        "elif_clause", "else_clause", "except_clause",
        "except_group_clause", "finally_clause", "case_clause",
        "block",
    )
    function_kind = (
        NodeKind.METHOD if enclosing_class_id else NodeKind.FUNCTION
    )

    for child in block.children:
        target = child
        if child.type == "decorated_definition":
            # Unwrap to the decorated def/class; the handlers look at the
            # parent node themselves to recover the decorators.
            target = next(
                (c for c in child.children if c.type in definition_types),
                None,
            )
            if target is None:
                continue

        if target.type == "class_definition":
            self._handle_class(
                target, rel, parent_qualname, parent_id, src, nodes, edges
            )
        elif target.type == "function_definition":
            self._handle_function(
                target, rel, parent_qualname, parent_id, function_kind,
                src, nodes, edges,
            )
        elif child.type == "import_statement":
            self._handle_import(child, rel, parent_id, src, edges)
        elif child.type == "import_from_statement":
            self._handle_import_from(child, rel, parent_id, src, edges)
        elif child.type in nested_scope_types:
            self._visit_block(
                child, rel, parent_qualname, parent_id,
                enclosing_class_id, src, nodes, edges,
            )
968
+
969
def _handle_class(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register a class definition and everything declared in its body.

    Emits, in order: the CLASS node, a DEFINED_IN edge to ``parent_id``,
    CALLS edges for decorator invocations, one INHERITS edge per base class
    written as a plain name or dotted attribute, and finally the members of
    the class body (methods, nested classes, imports).

    Args:
        node: The ``class_definition`` node (not its ``decorated_definition``
            wrapper).
        rel: Repo-relative POSIX path of the file.
        parent_qualname: Qualname of the enclosing scope ("" for none).
        parent_id: Node id of the enclosing scope.
        src: Raw file bytes, used to slice node text.
        nodes: Output list, extended in place.
        edges: Output list, extended in place.
    """
    name_node = node.child_by_field_name("name")
    if name_node is None:
        return
    name = node_text(name_node, src)
    qualname = f"{parent_qualname}.{name}" if parent_qualname else name
    class_id = make_node_id(NodeKind.CLASS, qualname, rel)
    start_line = node.start_point[0] + 1

    # Signature = first physical line of the class statement without ':'.
    sig = node_text(node, src).split("\n")[0].rstrip(":")

    # Looked up once and reused for docstring, attribute types and members.
    body = node.child_by_field_name("body")
    docstring = _get_docstring(body, src) if body else None

    decorators = _get_function_decorators(node, src)
    cls_metadata: dict[str, object] = {}
    if decorators:
        cls_metadata["decorators"] = decorators
    if _is_entry_point(
        decorators,
        name,
        extra_decorator_patterns=self.extra_entry_point_decorators,
    ):
        cls_metadata["entry_point"] = True
    if _has_public_api_pragma(node, src):
        cls_metadata["public_api"] = True

    attr_types = (
        _collect_class_attr_types(body, src) if body is not None else {}
    )
    if attr_types:
        cls_metadata["attr_types"] = attr_types

    nodes.append(Node(
        id=class_id,
        kind=NodeKind.CLASS,
        name=name,
        qualname=qualname,
        file=rel,
        line_start=start_line,
        line_end=node.end_point[0] + 1,
        signature=sig,
        docstring=docstring,
        language="python",
        metadata=cls_metadata,
    ))

    edges.append(Edge(
        src=class_id, dst=parent_id, kind=EdgeKind.DEFINED_IN,
        file=rel, line=start_line,
    ))

    self._emit_decorator_calls(node, rel, class_id, src, edges)

    # Base classes: prefer the named field, fall back to the first
    # ``argument_list`` child when the field is not exposed.
    arg_list = node.child_by_field_name("superclasses")
    if arg_list is None:
        arg_list = next(
            (c for c in node.children if c.type == "argument_list"), None
        )
    if arg_list is not None:
        for base in arg_list.children:
            # Only plain names / dotted attributes become INHERITS edges;
            # other argument forms (keywords, subscripts, calls) are skipped.
            if base.is_named and base.type in ("identifier", "attribute"):
                base_name = node_text(base, src)
                edges.append(Edge(
                    src=class_id,
                    dst=f"unresolved::{base_name}",
                    kind=EdgeKind.INHERITS,
                    file=rel,
                    line=start_line,
                    metadata={"target_name": base_name},
                ))

    if body is not None:
        # Delegate member discovery to ``_visit_block`` with the class as
        # both parent and enclosing class: functions become METHOD nodes,
        # nested classes and imports are attributed to this class, and
        # members defined under ``if`` / ``try`` inside the class body are
        # picked up as well.
        self._visit_block(
            body, rel, qualname, class_id, class_id, src, nodes, edges
        )
1083
+
1084
def _handle_function(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    kind: NodeKind,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register a function or method and the edges originating from it.

    Emits, in order: the function node, a DEFINED_IN edge to ``parent_id``,
    CALLS edges for decorator invocations, ROUTE edges derived from the
    decorators, then (when a body exists) CALLS edges, SQL read/write edges
    and any nested definitions.

    Args:
        node: The ``function_definition`` node.
        rel: Repo-relative POSIX path of the file.
        parent_qualname: Qualname of the enclosing scope ("" for none).
        parent_id: Node id of the enclosing scope.
        kind: ``NodeKind.FUNCTION`` or ``NodeKind.METHOD``.
        src: Raw file bytes, used to slice node text.
        nodes: Output list, extended in place.
        edges: Output list, extended in place.
    """
    name_node = node.child_by_field_name("name")
    if name_node is None:
        return
    name = node_text(name_node, src)
    qualname = name if not parent_qualname else f"{parent_qualname}.{name}"
    func_id = make_node_id(kind, qualname, rel)
    start_line = node.start_point[0] + 1
    is_method = kind == NodeKind.METHOD

    params = node.child_by_field_name("parameters")
    sig = name if params is None else f"{name}{node_text(params, src)}"

    body = node.child_by_field_name("body")
    docstring = _get_docstring(body, src) if body else None

    decorators = _get_function_decorators(node, src)
    metadata: dict[str, object] = {"decorators": decorators}
    # NOTE(review): a function literally named ``__main__`` is unusual;
    # possibly ``main`` was intended - confirm before changing.
    if _is_entry_point(
        decorators,
        name,
        extra_decorator_patterns=self.extra_entry_point_decorators,
    ) or name == "__main__":
        metadata["entry_point"] = True
    if _has_public_api_pragma(node, src):
        metadata["public_api"] = True

    # DF0: parameter descriptors and return annotation.  For methods the
    # extractor is asked to skip the leading ``self`` / ``cls`` parameter.
    metadata["params"] = (
        []
        if params is None
        else _extract_params(params, src, skip_self_or_cls=is_method)
    )
    return_type_node = node.child_by_field_name("return_type")
    metadata["returns"] = (
        node_text(return_type_node, src) if return_type_node else None
    )

    nodes.append(Node(
        id=func_id,
        kind=kind,
        name=name,
        qualname=qualname,
        file=rel,
        line_start=start_line,
        line_end=node.end_point[0] + 1,
        signature=sig,
        docstring=docstring,
        language="python",
        metadata=metadata,
    ))

    edges.append(Edge(
        src=func_id, dst=parent_id, kind=EdgeKind.DEFINED_IN,
        file=rel, line=start_line,
    ))

    self._emit_decorator_calls(node, rel, func_id, src, edges)

    # DF1: HTTP routes - one ROUTE edge per spec returned for the
    # decorators.
    for spec in _extract_route_specs(decorators):
        self._emit_route_edge(spec, func_id, rel, start_line, nodes, edges)

    if body is None:
        return

    self._collect_calls(body, rel, func_id, src, edges)
    # DF1: SQLAlchemy READS_FROM / WRITES_TO edges, emitted against
    # ``unresolved::Model`` targets for a later resolver pass.
    self._collect_sql_io(body, rel, func_id, src, edges)
    # Nested defs are registered separately so that the innermost named
    # function owns its own calls.
    self._visit_nested_defs(
        body, rel, qualname, func_id, is_method, src, nodes, edges,
    )
1178
+
1179
def _visit_nested_defs(
    self,
    block: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    in_method: bool,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register function/class definitions nested anywhere below ``block``.

    The subtree is walked with an explicit stack.  Once a definition has
    been handed to ``_handle_function`` / ``_handle_class`` the walk does
    not descend into it; the handler is responsible for its body.  Nested
    functions are always registered as ``NodeKind.FUNCTION``, also inside
    methods.  ``in_method`` is accepted but not consulted here.
    """
    definition_types = ("function_definition", "class_definition")
    pending: list[tree_sitter.Node] = list(block.children)
    while pending:
        current = pending.pop()

        # Resolve the definition this node stands for, if any.
        definition = None
        if current.type in definition_types:
            definition = current
        elif current.type == "decorated_definition":
            definition = next(
                (c for c in current.children if c.type in definition_types),
                None,
            )

        if definition is None:
            # Not a definition: keep searching below it.
            pending.extend(current.children)
        elif definition.type == "function_definition":
            self._handle_function(
                definition, rel, parent_qualname, parent_id,
                NodeKind.FUNCTION, src, nodes, edges,
            )
        else:
            self._handle_class(
                definition, rel, parent_qualname, parent_id,
                src, nodes, edges,
            )
1235
+
1236
def _collect_calls(
    self,
    node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Emit a CALLS edge from ``scope_id`` for every call below ``node``.

    The walk does not enter nested class/function definitions.  It also
    skips ``decorator`` subtrees: those are handled by
    ``_emit_decorator_calls`` so that decorator factories are attributed to
    the decorated symbol rather than the surrounding scope.  Calls nested
    inside other calls are still visited.
    """
    opaque_types = ("class_definition", "function_definition", "decorator")
    pending: list[tree_sitter.Node] = list(node.children)
    while pending:
        current = pending.pop()
        if current.type in opaque_types:
            continue

        if current.type == "call":
            callee = current.child_by_field_name("function")
            if callee is None and current.children:
                # Fall back to the first child when the field is missing.
                callee = current.children[0]
            if callee is not None:
                target_name = node_text(callee, src)
                arg_list = current.child_by_field_name("arguments")
                if arg_list is None:
                    args: list[str] = []
                    kwargs: dict[str, str] = {}
                else:
                    args, kwargs = _extract_call_args(arg_list, src)
                edges.append(Edge(
                    src=scope_id,
                    dst=f"unresolved::{target_name}",
                    kind=EdgeKind.CALLS,
                    file=rel,
                    line=current.start_point[0] + 1,
                    metadata={
                        "target_name": target_name,
                        "args": args,
                        "kwargs": kwargs,
                    },
                ))

        pending.extend(current.children)
1279
+
1280
def _emit_decorator_calls(
    self,
    def_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Emit a CALLS edge for each decorator that is written as a call.

    ``@_register("name")`` and ``@my_decorator(arg)`` invoke the decorator
    factory at definition time, so they produce an edge from ``scope_id``
    (the decorated symbol).  A bare reference such as ``@foo`` produces no
    edge.
    """
    # Decorators are siblings of the definition under its
    # ``decorated_definition`` wrapper, when there is one.
    container = def_node
    wrapper = def_node.parent
    if wrapper is not None and wrapper.type == "decorated_definition":
        container = wrapper

    for decorator in container.children:
        if decorator.type != "decorator":
            continue
        for expr in decorator.children:
            # Only the `@foo("x")` form is a real invocation.
            if expr.type != "call":
                continue
            callee = expr.child_by_field_name("function")
            if callee is None and expr.children:
                callee = expr.children[0]
            if callee is None:
                continue

            target_name = node_text(callee, src)
            arg_list = expr.child_by_field_name("arguments")
            if arg_list is None:
                args: list[str] = []
                kwargs: dict[str, str] = {}
            else:
                args, kwargs = _extract_call_args(arg_list, src)
            edges.append(Edge(
                src=scope_id,
                dst=f"unresolved::{target_name}",
                kind=EdgeKind.CALLS,
                file=rel,
                line=expr.start_point[0] + 1,
                metadata={
                    "target_name": target_name,
                    "args": args,
                    "kwargs": kwargs,
                },
            ))
1330
+
1331
+ # --- DF1: route + SQL emission helpers ----------------------------
1332
+
1333
def _emit_route_edge(
    self,
    spec: dict[str, str],
    func_id: str,
    rel: str,
    line: int,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Link a handler to a synthetic route node via a ROUTE edge.

    The route node is a ``NodeKind.VARIABLE`` used as a sentinel and tagged
    with ``metadata["synthetic_kind"] == "ROUTE"``.  Its id is
    ``route::<METHOD>::<PATH>``, so handlers that bind the same route point
    at the same destination.  The node is appended only if ``nodes`` does
    not already contain that id.

    Args:
        spec: Route description with ``"method"``, ``"path"`` and
            ``"framework"`` keys.
        func_id: Node id of the handler function.
        rel: Repo-relative POSIX path of the file.
        line: Line recorded on the route node and the edge.
        nodes: Output list, extended in place.
        edges: Output list, extended in place.
    """
    method = spec["method"]
    path = spec["path"]
    framework = spec["framework"]
    # The same string serves as both node id and qualname.
    route_key = f"route::{method}::{path}"

    if all(existing.id != route_key for existing in nodes):
        nodes.append(Node(
            id=route_key,
            kind=NodeKind.VARIABLE,
            name=f"{method} {path}",
            qualname=route_key,
            file=rel,
            line_start=line,
            line_end=line,
            language="python",
            metadata={
                "synthetic_kind": "ROUTE",
                "method": method,
                "path": path,
                "framework": framework,
            },
        ))

    edges.append(Edge(
        src=func_id,
        dst=route_key,
        kind=EdgeKind.ROUTE,
        file=rel,
        line=line,
        metadata={
            "method": method,
            "path": path,
            "framework": framework,
        },
    ))
1383
+
1384
+ def _collect_sql_io(
1385
+ self,
1386
+ body: tree_sitter.Node,
1387
+ rel: str,
1388
+ scope_id: str,
1389
+ src: bytes,
1390
+ edges: list[Edge],
1391
+ ) -> None:
1392
+ """Walk a function body for SQLAlchemy data-access patterns.
1393
+
1394
+ Emits ``READS_FROM`` / ``WRITES_TO`` edges with
1395
+ ``dst="unresolved::<ModelName>"`` so the post-build resolver can
1396
+ rewrite them to real CLASS node ids by qualname/tail match.
1397
+ """
1398
+ stack: list[tree_sitter.Node] = list(body.children)
1399
+ while stack:
1400
+ child = stack.pop()
1401
+ if child.type == "call":
1402
+ self._maybe_emit_sql_edge(child, rel, scope_id, src, edges)
1403
+ # Stop at nested defs — their bodies own their own edges.
1404
+ if child.type not in (
1405
+ "class_definition", "function_definition", "decorator",
1406
+ ):
1407
+ stack.extend(child.children)
1408
+
1409
def _maybe_emit_sql_edge(
    self,
    call_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Emit an SQL data-access edge if ``call_node`` looks like one.

    Two shapes are recognised from the text of the called expression:

    * ``Model.query...`` (capitalised name followed by ``.query``) - a
      READS_FROM edge to ``Model``.
    * a session-style chain, as judged by ``_is_session_chain``; the last
      dotted component (the verb) selects between a read, a write, or an
      ``execute(...)`` whose argument is inspected further.

    Anything else is ignored.
    """
    callee = call_node.child_by_field_name("function")
    if callee is None:
        return
    target = node_text(callee, src)

    # `Model.query.filter(...)` or `Model.query` — Flask-SQLAlchemy.
    flask_query = re.match(
        r"^([A-Z][\w]*)\.query(?:\.|$)", target,
    )
    if flask_query is not None:
        model = flask_query.group(1)
        edges.append(Edge(
            src=scope_id,
            dst=f"unresolved::{model}",
            kind=EdgeKind.READS_FROM,
            file=rel,
            line=call_node.start_point[0] + 1,
            metadata={
                "operation": "select",
                "via": "Model.query",
                "model_name": model,
                "target_name": model,
            },
        ))
        return

    # session-style chain — `session.query(Model)`, `db.session.add(...)`.
    if not _is_session_chain(target):
        return
    verb = target.rsplit(".", 1)[-1]

    if verb in _SQL_READ_OUTER:
        edge_kind = EdgeKind.READS_FROM
        operation = "select"
    elif verb in _SQL_WRITE_OUTER:
        edge_kind = EdgeKind.WRITES_TO
        # Every write verb other than ``delete`` is recorded as "insert".
        operation = "delete" if verb == "delete" else "insert"
    elif verb == "execute":
        # session.execute(select(Model)) / insert(Model) / etc.
        self._emit_sql_from_execute(call_node, rel, scope_id, src, edges)
        return
    else:
        return

    # For read and write verbs the model is taken from the first argument.
    self._emit_sql_from_first_arg(
        call_node, rel, scope_id, src, edges,
        kind=edge_kind, operation=operation,
        via=f"session.{verb}",
    )
1467
+
1468
def _emit_sql_from_first_arg(
    self,
    call_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
    *,
    kind: EdgeKind,
    operation: str,
    via: str,
) -> None:
    """Emit an SQL edge whose model is named by the call's first argument.

    Nothing is emitted when the call has no argument list, no arguments, or
    when ``_model_name_from_call_arg`` cannot derive a model name from the
    first argument's text.

    Args:
        call_node: The ``call`` node being inspected.
        rel: Repo-relative POSIX path of the file.
        scope_id: Node id the edge originates from.
        src: Raw file bytes, used to slice node text.
        edges: Output list, extended in place.
        kind: Edge kind to emit (READS_FROM or WRITES_TO).
        operation: Operation label stored in the edge metadata.
        via: Description of the matched call form, stored in the metadata.
    """
    arg_list = call_node.child_by_field_name("arguments")
    if arg_list is None:
        return
    # Comments are named nodes too and may precede the first real argument
    # in a multi-line call; skip them so they are not taken for the model.
    first_named = next(
        (
            c for c in arg_list.children
            if c.is_named and c.type != "comment"
        ),
        None,
    )
    if first_named is None:
        return
    model = _model_name_from_call_arg(node_text(first_named, src))
    if not model:
        return
    edges.append(Edge(
        src=scope_id,
        dst=f"unresolved::{model}",
        kind=kind,
        file=rel,
        line=call_node.start_point[0] + 1,
        metadata={
            "operation": operation,
            "via": via,
            "model_name": model,
            "target_name": model,
        },
    ))
1504
+
1505
def _emit_sql_from_execute(
    self,
    call_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Handle ``session.execute(select|insert|update|delete(Model))``.

    The first argument of ``execute`` is unwrapped to its root constructor
    call via ``_unwrap_to_root_call``.  The last dotted component of that
    call's name selects a READS_FROM edge (names in ``_SQL_READ_INNER``) or
    a WRITES_TO edge (names in ``_SQL_WRITE_INNER``, with the name itself as
    the operation).  The model is taken from the constructor's first
    argument.  Nothing is emitted when any of these steps fails.
    """

    def first_argument(arg_list: tree_sitter.Node) -> tree_sitter.Node | None:
        # Comments are named nodes as well; skip them so a comment before
        # the first argument is not mistaken for it.
        return next(
            (
                c for c in arg_list.children
                if c.is_named and c.type != "comment"
            ),
            None,
        )

    arg_list = call_node.child_by_field_name("arguments")
    if arg_list is None:
        return
    first_named = first_argument(arg_list)
    if first_named is None:
        return

    # Drill through ``.values(...)`` / ``.where(...)`` chains —
    # ``select(Model).where(...)`` keeps wrapping the original
    # constructor call inside ``function -> attribute -> object``.
    first_named = _unwrap_to_root_call(first_named)
    if first_named is None or first_named.type != "call":
        return
    inner_func = first_named.child_by_field_name("function")
    if inner_func is None:
        return

    inner_name = node_text(inner_func, src).rsplit(".", 1)[-1]
    if inner_name in _SQL_READ_INNER:
        kind = EdgeKind.READS_FROM
        operation = "select"
    elif inner_name in _SQL_WRITE_INNER:
        kind = EdgeKind.WRITES_TO
        operation = inner_name
    else:
        return

    inner_args = first_named.child_by_field_name("arguments")
    if inner_args is None:
        return
    first_inner = first_argument(inner_args)
    if first_inner is None:
        return
    model = _model_name_from_call_arg(node_text(first_inner, src))
    if not model:
        return

    edges.append(Edge(
        src=scope_id,
        dst=f"unresolved::{model}",
        kind=kind,
        file=rel,
        line=call_node.start_point[0] + 1,
        metadata={
            "operation": operation,
            "via": f"session.execute({inner_name})",
            "model_name": model,
            "target_name": model,
        },
    ))
1564
+
1565
def _handle_import(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Emit one IMPORTS edge per module named in an ``import`` statement.

    For ``import a.b as c`` the edge targets the original name (the first
    child of the ``aliased_import`` node); the alias is not recorded.
    """
    line = node.start_point[0] + 1
    for child in node.children:
        if child.type == "dotted_name":
            name_node = child
        elif child.type == "aliased_import":
            name_node = child.children[0] if child.children else child
        else:
            # Keywords, commas and anything else carry no module name.
            continue
        name = node_text(name_node, src)
        edges.append(Edge(
            src=parent_id,
            dst=f"unresolved::{name}",
            kind=EdgeKind.IMPORTS,
            file=rel,
            line=line,
            metadata={"target_name": name},
        ))
1588
+
1589
def _handle_import_from(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Emit IMPORTS edges for a ``from X import ...`` statement.

    One edge is emitted per imported name, targeting ``<module>.<name>`` and
    carrying the bare name as ``imported_name`` in the metadata.  For
    ``from m import X as Y`` the original name ``X`` is used.  When no
    names are collected (e.g. ``from m import *``) a single edge to the
    module itself is emitted instead.
    """
    name_types = ("dotted_name", "identifier")
    children = iter(node.children)

    # Phase 1: everything before the `import` keyword - find the module
    # portion (first relative_import or dotted_name).
    module_node: tree_sitter.Node | None = None
    for child in children:
        if child.type == "import":
            break
        if module_node is None and child.type in (
            "relative_import", "dotted_name",
        ):
            module_node = child

    # Phase 2: everything after the keyword - collect imported names.
    # A wildcard import contributes no per-name entries.
    name_nodes: list[tree_sitter.Node] = []
    for child in children:
        if child.type in name_types:
            name_nodes.append(child)
        elif child.type == "aliased_import":
            original = next(
                (c for c in child.children if c.type in name_types),
                None,
            )
            if original is not None:
                name_nodes.append(original)

    # Relative imports are resolved against the importing file's location.
    module_name = self._resolve_from_module(module_node, rel, src)
    line = node.start_point[0] + 1

    # Without per-name edges keep a module-level edge so the import is not
    # lost; with them it would only be redundant.
    if module_name and not name_nodes:
        edges.append(Edge(
            src=parent_id,
            dst=f"unresolved::{module_name}",
            kind=EdgeKind.IMPORTS,
            file=rel,
            line=line,
            metadata={"target_name": module_name},
        ))

    for name_node in name_nodes:
        imported = node_text(name_node, src)
        if not imported:
            continue
        full = imported if not module_name else f"{module_name}.{imported}"
        edges.append(Edge(
            src=parent_id,
            dst=f"unresolved::{full}",
            kind=EdgeKind.IMPORTS,
            file=rel,
            line=line,
            metadata={
                "target_name": full,
                "imported_name": imported,
            },
        ))
1668
+
1669
def _resolve_from_module(
    self,
    module_node: tree_sitter.Node | None,
    rel: str,
    src: bytes,
) -> str:
    """Return the absolute module qualname for a `from X import ...`.

    Absolute module names are returned as written.  For relative imports
    (`from . import x`, `from ..pkg import x`) the leading dots are counted
    and the importing file's package path is walked up accordingly (one dot
    = the current package), then the relative module name is appended.

    A package ``__init__.py`` is its own "current package": in
    ``pkg/__init__.py``, ``from .sub import x`` resolves to ``pkg.sub``.

    Args:
        module_node: The module portion of the statement, or None.
        rel: Repo-relative POSIX path of the importing file.
        src: Raw file bytes, used to slice node text.

    Returns:
        The dotted module name; "" when ``module_node`` is None or the
        relative import resolves to the repository root.
    """
    if module_node is None:
        return ""
    if module_node.type != "relative_import":
        return node_text(module_node, src)

    # Count leading dots and find the trailing dotted_name (if any).
    dots = 0
    rel_module = ""
    for child in module_node.children:
        if child.type == "import_prefix":
            dots = sum(1 for c in child.children if c.type == ".")
        elif child.type == "dotted_name":
            rel_module = node_text(child, src)

    file_qual = _file_to_qualname(rel)
    pkg_parts = file_qual.split(".") if file_qual else []

    # Reduce the file's qualname to the package that contains it.
    if rel.rsplit("/", 1)[-1] == "__init__.py":
        # For a package initialiser the qualname is normally the package
        # itself, so nothing must be dropped; only strip a literal trailing
        # "__init__" component in case the qualname helper keeps it.
        if pkg_parts and pkg_parts[-1] == "__init__":
            pkg_parts = pkg_parts[:-1]
    elif pkg_parts:
        # Regular module: drop the module's own name.
        pkg_parts = pkg_parts[:-1]

    # Walk up `dots - 1` further levels (one dot = current package).
    # Walking past the top of the path yields an empty package.
    if dots > 1:
        cut = dots - 1
        pkg_parts = pkg_parts[:-cut] if cut <= len(pkg_parts) else []

    parts = pkg_parts + ([rel_module] if rel_module else [])
    return ".".join(p for p in parts if p)