polycodegraph 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph/__init__.py +10 -0
- codegraph/analysis/__init__.py +30 -0
- codegraph/analysis/_common.py +125 -0
- codegraph/analysis/blast_radius.py +63 -0
- codegraph/analysis/cycles.py +79 -0
- codegraph/analysis/dataflow.py +861 -0
- codegraph/analysis/dead_code.py +165 -0
- codegraph/analysis/hotspots.py +68 -0
- codegraph/analysis/infrastructure.py +439 -0
- codegraph/analysis/metrics.py +52 -0
- codegraph/analysis/report.py +222 -0
- codegraph/analysis/roles.py +323 -0
- codegraph/analysis/untested.py +79 -0
- codegraph/cli.py +1506 -0
- codegraph/config.py +64 -0
- codegraph/embed/__init__.py +35 -0
- codegraph/embed/chunker.py +120 -0
- codegraph/embed/embedder.py +113 -0
- codegraph/embed/query.py +181 -0
- codegraph/embed/store.py +360 -0
- codegraph/graph/__init__.py +0 -0
- codegraph/graph/builder.py +212 -0
- codegraph/graph/schema.py +69 -0
- codegraph/graph/store_networkx.py +55 -0
- codegraph/graph/store_sqlite.py +249 -0
- codegraph/mcp_server/__init__.py +6 -0
- codegraph/mcp_server/server.py +933 -0
- codegraph/parsers/__init__.py +0 -0
- codegraph/parsers/base.py +70 -0
- codegraph/parsers/go.py +570 -0
- codegraph/parsers/python.py +1707 -0
- codegraph/parsers/typescript.py +1397 -0
- codegraph/py.typed +0 -0
- codegraph/resolve/__init__.py +4 -0
- codegraph/resolve/calls.py +480 -0
- codegraph/review/__init__.py +31 -0
- codegraph/review/baseline.py +32 -0
- codegraph/review/differ.py +211 -0
- codegraph/review/hook.py +70 -0
- codegraph/review/risk.py +219 -0
- codegraph/review/rules.py +342 -0
- codegraph/viz/__init__.py +17 -0
- codegraph/viz/_style.py +45 -0
- codegraph/viz/dashboard.py +740 -0
- codegraph/viz/diagrams.py +370 -0
- codegraph/viz/explore.py +453 -0
- codegraph/viz/hld.py +683 -0
- codegraph/viz/html.py +115 -0
- codegraph/viz/mermaid.py +111 -0
- codegraph/viz/svg.py +77 -0
- codegraph/web/__init__.py +4 -0
- codegraph/web/server.py +165 -0
- codegraph/web/static/app.css +664 -0
- codegraph/web/static/app.js +919 -0
- codegraph/web/static/index.html +112 -0
- codegraph/web/static/views/architecture.js +1671 -0
- codegraph/web/static/views/graph3d.css +564 -0
- codegraph/web/static/views/graph3d.js +999 -0
- codegraph/web/static/views/graph3d_transform.js +984 -0
- codegraph/workspace/__init__.py +34 -0
- codegraph/workspace/config.py +110 -0
- codegraph/workspace/operations.py +294 -0
- polycodegraph-0.1.0.dist-info/METADATA +687 -0
- polycodegraph-0.1.0.dist-info/RECORD +67 -0
- polycodegraph-0.1.0.dist-info/WHEEL +4 -0
- polycodegraph-0.1.0.dist-info/entry_points.txt +2 -0
- polycodegraph-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1707 @@
|
|
|
1
|
+
"""Python source extractor using tree-sitter."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path, PurePosixPath
|
|
6
|
+
|
|
7
|
+
import tree_sitter
|
|
8
|
+
|
|
9
|
+
from codegraph.graph.schema import Edge, EdgeKind, Node, NodeKind, make_node_id
|
|
10
|
+
from codegraph.parsers.base import (
|
|
11
|
+
ExtractorBase,
|
|
12
|
+
load_parser,
|
|
13
|
+
node_text,
|
|
14
|
+
register_extractor,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _is_test_file(rel_path: str) -> bool:
|
|
19
|
+
return bool(
|
|
20
|
+
re.search(r"(^|[/\\])(tests?[/\\]|test_)", rel_path)
|
|
21
|
+
or rel_path.endswith("_test.py")
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _file_to_qualname(rel_path: str) -> str:
|
|
26
|
+
"""Convert repo-relative path like 'src/foo/bar.py' to 'src.foo.bar'."""
|
|
27
|
+
p = PurePosixPath(rel_path)
|
|
28
|
+
parts = list(p.with_suffix("").parts)
|
|
29
|
+
if parts and parts[-1] == "__init__":
|
|
30
|
+
parts.pop()
|
|
31
|
+
return ".".join(parts)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _get_docstring(block_node: tree_sitter.Node, src: bytes) -> str | None:
|
|
35
|
+
for child in block_node.children:
|
|
36
|
+
if child.type == "expression_statement":
|
|
37
|
+
for sub in child.children:
|
|
38
|
+
if sub.type == "string":
|
|
39
|
+
raw = node_text(sub, src).strip()
|
|
40
|
+
# Strip triple/single quotes
|
|
41
|
+
for q in ('"""', "'''", '"', "'"):
|
|
42
|
+
if raw.startswith(q) and raw.endswith(q):
|
|
43
|
+
raw = raw[len(q):-len(q)]
|
|
44
|
+
break
|
|
45
|
+
return raw.strip()
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _extract_types_from_type_node(
|
|
50
|
+
type_node: tree_sitter.Node, src: bytes
|
|
51
|
+
) -> list[str]:
|
|
52
|
+
"""Return the list of simple type names from a ``type`` AST node.
|
|
53
|
+
|
|
54
|
+
Handles three shapes:
|
|
55
|
+
* single identifier / attribute -> one-element list
|
|
56
|
+
* binary union ``A | B | ...`` -> flattened list of operand names
|
|
57
|
+
* subscript ``Union[A, B]`` / ``Optional[A]`` -> list of inner names
|
|
58
|
+
|
|
59
|
+
Anything else (string forward refs, generics like ``list[Foo]``)
|
|
60
|
+
returns an empty list — the resolver will simply not bind that
|
|
61
|
+
attribute, which is safe.
|
|
62
|
+
"""
|
|
63
|
+
# ``type`` typically has a single inner expression child; descend.
|
|
64
|
+
inner: tree_sitter.Node | None = None
|
|
65
|
+
for c in type_node.children:
|
|
66
|
+
if c.is_named:
|
|
67
|
+
inner = c
|
|
68
|
+
break
|
|
69
|
+
if inner is None:
|
|
70
|
+
return []
|
|
71
|
+
return _flatten_type_expr(inner, src)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _flatten_type_expr(node: tree_sitter.Node, src: bytes) -> list[str]:
|
|
75
|
+
"""Recursively flatten a type expression into bare type names."""
|
|
76
|
+
if node.type in ("identifier", "attribute"):
|
|
77
|
+
return [node_text(node, src)]
|
|
78
|
+
if node.type == "binary_operator":
|
|
79
|
+
# ``A | B`` — only honor union when the operator is ``|``.
|
|
80
|
+
op_is_pipe = any(
|
|
81
|
+
c.type == "|" for c in node.children if not c.is_named
|
|
82
|
+
)
|
|
83
|
+
if not op_is_pipe:
|
|
84
|
+
return []
|
|
85
|
+
out: list[str] = []
|
|
86
|
+
for c in node.children:
|
|
87
|
+
if c.is_named:
|
|
88
|
+
out.extend(_flatten_type_expr(c, src))
|
|
89
|
+
return out
|
|
90
|
+
if node.type in ("subscript", "generic_type"):
|
|
91
|
+
# ``Union[A, B]`` / ``Optional[A]`` — both flatten to operand list.
|
|
92
|
+
# Tree-sitter parses ``Union[A, B]`` as ``generic_type`` with a
|
|
93
|
+
# leading identifier and a ``type_parameter`` child; ``Optional[A]``
|
|
94
|
+
# may be a ``subscript`` depending on grammar version.
|
|
95
|
+
head_node: tree_sitter.Node | None = None
|
|
96
|
+
if node.type == "subscript":
|
|
97
|
+
head_node = node.child_by_field_name("value")
|
|
98
|
+
else:
|
|
99
|
+
for c in node.children:
|
|
100
|
+
if c.type in ("identifier", "attribute"):
|
|
101
|
+
head_node = c
|
|
102
|
+
break
|
|
103
|
+
head = node_text(head_node, src) if head_node is not None else ""
|
|
104
|
+
head_leaf = head.rsplit(".", 1)[-1]
|
|
105
|
+
if head_leaf not in ("Union", "Optional"):
|
|
106
|
+
return []
|
|
107
|
+
out2: list[str] = []
|
|
108
|
+
for c in node.children:
|
|
109
|
+
if not c.is_named or c is head_node:
|
|
110
|
+
continue
|
|
111
|
+
if c.type == "type_parameter":
|
|
112
|
+
for inner_c in c.children:
|
|
113
|
+
if inner_c.is_named:
|
|
114
|
+
out2.extend(_flatten_type_expr(inner_c, src))
|
|
115
|
+
else:
|
|
116
|
+
out2.extend(_flatten_type_expr(c, src))
|
|
117
|
+
return out2
|
|
118
|
+
if node.type == "type":
|
|
119
|
+
# Wrapping ``type`` node — descend into its named child.
|
|
120
|
+
for c in node.children:
|
|
121
|
+
if c.is_named:
|
|
122
|
+
return _flatten_type_expr(c, src)
|
|
123
|
+
return []
|
|
124
|
+
return []
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _collect_class_attr_types(
|
|
128
|
+
body: tree_sitter.Node, src: bytes
|
|
129
|
+
) -> dict[str, list[str]]:
|
|
130
|
+
"""Return ``{attr_name: [type_qualname, ...]}`` for class annotations.
|
|
131
|
+
|
|
132
|
+
Captures both class-level direct annotations (``svc: Service``,
|
|
133
|
+
``svc: Foo | Bar``, ``svc: Union[Foo, Bar]``) AND attribute
|
|
134
|
+
assignments inside ``__init__`` (including ``if/else`` branches), so
|
|
135
|
+
a backend-facade pattern like::
|
|
136
|
+
|
|
137
|
+
def __init__(self, x):
|
|
138
|
+
if x:
|
|
139
|
+
self._b: Foo = Foo()
|
|
140
|
+
else:
|
|
141
|
+
self._b = Bar()
|
|
142
|
+
|
|
143
|
+
yields ``{"_b": ["Foo", "Bar"]}``.
|
|
144
|
+
"""
|
|
145
|
+
out: dict[str, list[str]] = {}
|
|
146
|
+
for stmt in body.children:
|
|
147
|
+
if stmt.type != "expression_statement":
|
|
148
|
+
continue
|
|
149
|
+
for assignment in stmt.children:
|
|
150
|
+
if assignment.type != "assignment":
|
|
151
|
+
continue
|
|
152
|
+
name_node: tree_sitter.Node | None = None
|
|
153
|
+
type_node: tree_sitter.Node | None = None
|
|
154
|
+
for c in assignment.children:
|
|
155
|
+
if c.type == "identifier" and name_node is None:
|
|
156
|
+
name_node = c
|
|
157
|
+
elif c.type == "type":
|
|
158
|
+
type_node = c
|
|
159
|
+
if name_node is None or type_node is None:
|
|
160
|
+
continue
|
|
161
|
+
attr_name = node_text(name_node, src)
|
|
162
|
+
type_names = _extract_types_from_type_node(type_node, src)
|
|
163
|
+
if not attr_name or not type_names:
|
|
164
|
+
continue
|
|
165
|
+
existing = out.setdefault(attr_name, [])
|
|
166
|
+
for t in type_names:
|
|
167
|
+
if t not in existing:
|
|
168
|
+
existing.append(t)
|
|
169
|
+
|
|
170
|
+
# Walk __init__ for ``self.X = ...`` and ``self.X: T = ...`` bindings.
|
|
171
|
+
for stmt in body.children:
|
|
172
|
+
func: tree_sitter.Node | None = None
|
|
173
|
+
if stmt.type == "function_definition":
|
|
174
|
+
func = stmt
|
|
175
|
+
elif stmt.type == "decorated_definition":
|
|
176
|
+
for c in stmt.children:
|
|
177
|
+
if c.type == "function_definition":
|
|
178
|
+
func = c
|
|
179
|
+
break
|
|
180
|
+
if func is None:
|
|
181
|
+
continue
|
|
182
|
+
name_n = func.child_by_field_name("name")
|
|
183
|
+
if name_n is None or node_text(name_n, src) != "__init__":
|
|
184
|
+
continue
|
|
185
|
+
init_body = func.child_by_field_name("body")
|
|
186
|
+
if init_body is None:
|
|
187
|
+
continue
|
|
188
|
+
_collect_self_attr_types_in_block(init_body, src, out)
|
|
189
|
+
return out
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _collect_self_attr_types_in_block(
|
|
193
|
+
block: tree_sitter.Node,
|
|
194
|
+
src: bytes,
|
|
195
|
+
out: dict[str, list[str]],
|
|
196
|
+
) -> None:
|
|
197
|
+
"""Walk a function body collecting ``self.X[: T] = Y(...)`` bindings.
|
|
198
|
+
|
|
199
|
+
Recurses into ``if/else`` (and ``try/with/for/while``) branches so
|
|
200
|
+
both arms of a conditional contribute to the attribute's type list.
|
|
201
|
+
Walrus (``:=``) and dynamic ``setattr`` are deliberately ignored —
|
|
202
|
+
those are R4+ territory.
|
|
203
|
+
"""
|
|
204
|
+
for child in block.children:
|
|
205
|
+
if child.type == "expression_statement":
|
|
206
|
+
for assignment in child.children:
|
|
207
|
+
if assignment.type != "assignment":
|
|
208
|
+
continue
|
|
209
|
+
_maybe_record_self_assign(assignment, src, out)
|
|
210
|
+
elif child.type == "block":
|
|
211
|
+
# Tree-sitter wraps clause bodies in a ``block`` whose entries
|
|
212
|
+
# are the actual statements; recurse straight into it.
|
|
213
|
+
_collect_self_attr_types_in_block(child, src, out)
|
|
214
|
+
elif child.type in (
|
|
215
|
+
"if_statement", "with_statement", "try_statement",
|
|
216
|
+
"for_statement", "while_statement", "elif_clause", "else_clause",
|
|
217
|
+
"except_clause", "finally_clause",
|
|
218
|
+
):
|
|
219
|
+
# Recurse into all named children — this picks up the clause's
|
|
220
|
+
# inner ``block`` plus any sibling ``elif_clause`` / ``else_clause``
|
|
221
|
+
# / ``except_clause`` chains.
|
|
222
|
+
for sub in child.children:
|
|
223
|
+
if sub.is_named:
|
|
224
|
+
_collect_self_attr_types_in_block(sub, src, out)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _maybe_record_self_assign(
|
|
228
|
+
assignment: tree_sitter.Node,
|
|
229
|
+
src: bytes,
|
|
230
|
+
out: dict[str, list[str]],
|
|
231
|
+
) -> None:
|
|
232
|
+
"""If ``assignment`` is ``self.X[: T] = expr``, record the type(s)."""
|
|
233
|
+
# Find the LHS (attribute), the optional type annotation, and RHS.
|
|
234
|
+
lhs: tree_sitter.Node | None = None
|
|
235
|
+
type_node: tree_sitter.Node | None = None
|
|
236
|
+
rhs: tree_sitter.Node | None = None
|
|
237
|
+
seen_eq = False
|
|
238
|
+
for c in assignment.children:
|
|
239
|
+
if c.type == "=":
|
|
240
|
+
seen_eq = True
|
|
241
|
+
continue
|
|
242
|
+
if c.type == "type":
|
|
243
|
+
type_node = c
|
|
244
|
+
continue
|
|
245
|
+
if not seen_eq:
|
|
246
|
+
if lhs is None:
|
|
247
|
+
lhs = c
|
|
248
|
+
else:
|
|
249
|
+
if rhs is None:
|
|
250
|
+
rhs = c
|
|
251
|
+
if lhs is None or lhs.type != "attribute":
|
|
252
|
+
return
|
|
253
|
+
obj = lhs.child_by_field_name("object")
|
|
254
|
+
attr = lhs.child_by_field_name("attribute")
|
|
255
|
+
if obj is None or attr is None:
|
|
256
|
+
return
|
|
257
|
+
if node_text(obj, src) != "self":
|
|
258
|
+
return
|
|
259
|
+
attr_name = node_text(attr, src)
|
|
260
|
+
if not attr_name:
|
|
261
|
+
return
|
|
262
|
+
|
|
263
|
+
type_names: list[str] = []
|
|
264
|
+
if type_node is not None:
|
|
265
|
+
type_names.extend(_extract_types_from_type_node(type_node, src))
|
|
266
|
+
|
|
267
|
+
# If no annotation (or annotation gave nothing useful), fall back
|
|
268
|
+
# to the constructor name on the RHS.
|
|
269
|
+
if not type_names and rhs is not None:
|
|
270
|
+
ctor = _ctor_name_from_expr(rhs, src)
|
|
271
|
+
if ctor:
|
|
272
|
+
type_names.append(ctor)
|
|
273
|
+
|
|
274
|
+
if not type_names:
|
|
275
|
+
return
|
|
276
|
+
existing = out.setdefault(attr_name, [])
|
|
277
|
+
for t in type_names:
|
|
278
|
+
if t not in existing:
|
|
279
|
+
existing.append(t)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _ctor_name_from_expr(
|
|
283
|
+
node: tree_sitter.Node, src: bytes
|
|
284
|
+
) -> str | None:
|
|
285
|
+
"""Return the constructor name from an RHS expression like ``Foo(...)``.
|
|
286
|
+
|
|
287
|
+
Handles ``Foo(...)``, ``mod.Foo(...)`` (returns ``Foo``), and simple
|
|
288
|
+
identifier references ``Foo`` (when a name is being aliased without
|
|
289
|
+
instantiation, we still record the type so ``self._b = some_factory``
|
|
290
|
+
style does NOT match — only ``identifier`` / ``attribute`` whose leaf
|
|
291
|
+
looks PascalCase counts as a "type-ish" reference).
|
|
292
|
+
|
|
293
|
+
Walrus (``named_expression``) is intentionally skipped.
|
|
294
|
+
"""
|
|
295
|
+
if node.type == "call":
|
|
296
|
+
func = node.child_by_field_name("function")
|
|
297
|
+
if func is None:
|
|
298
|
+
return None
|
|
299
|
+
text = node_text(func, src).rsplit(".", 1)[-1]
|
|
300
|
+
if text and text[0].isupper():
|
|
301
|
+
return text
|
|
302
|
+
return None
|
|
303
|
+
return None
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
# --- Argument expression simplification ---------------------------------
|
|
307
|
+
#
|
|
308
|
+
# Per DF0 spec: "simple" arg expressions (literals, identifiers, attributes,
|
|
309
|
+
# subscripts) are captured verbatim; anything else collapses to "<expr>".
|
|
310
|
+
_SIMPLE_ARG_TYPES: frozenset[str] = frozenset({
|
|
311
|
+
"identifier", "string", "integer", "float",
|
|
312
|
+
"true", "false", "none",
|
|
313
|
+
"attribute", "subscript",
|
|
314
|
+
})
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _simplify_arg(node: tree_sitter.Node, src: bytes) -> str:
|
|
318
|
+
"""Return arg text if the AST node is a simple form, else ``"<expr>"``."""
|
|
319
|
+
if node.type in _SIMPLE_ARG_TYPES:
|
|
320
|
+
return node_text(node, src)
|
|
321
|
+
return "<expr>"
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _extract_params(
|
|
325
|
+
params_node: tree_sitter.Node,
|
|
326
|
+
src: bytes,
|
|
327
|
+
*,
|
|
328
|
+
skip_self_or_cls: bool,
|
|
329
|
+
) -> list[dict[str, str | None]]:
|
|
330
|
+
"""Walk a ``parameters`` AST block and return DF0 param descriptors.
|
|
331
|
+
|
|
332
|
+
Skip the first parameter when ``skip_self_or_cls`` is True and that
|
|
333
|
+
first parameter is named ``self`` or ``cls``. Variadic forms are
|
|
334
|
+
captured with ``*`` / ``**`` prefixes on the name.
|
|
335
|
+
"""
|
|
336
|
+
out: list[dict[str, str | None]] = []
|
|
337
|
+
first_seen = False
|
|
338
|
+
for child in params_node.children:
|
|
339
|
+
if not child.is_named:
|
|
340
|
+
continue
|
|
341
|
+
descriptor: dict[str, str | None] | None = None
|
|
342
|
+
if child.type == "identifier":
|
|
343
|
+
descriptor = {
|
|
344
|
+
"name": node_text(child, src),
|
|
345
|
+
"type": None,
|
|
346
|
+
"default": None,
|
|
347
|
+
}
|
|
348
|
+
elif child.type == "typed_parameter":
|
|
349
|
+
name_n = next(
|
|
350
|
+
(c for c in child.children if c.type == "identifier"), None
|
|
351
|
+
)
|
|
352
|
+
type_n = next(
|
|
353
|
+
(c for c in child.children if c.type == "type"), None
|
|
354
|
+
)
|
|
355
|
+
if name_n is not None:
|
|
356
|
+
descriptor = {
|
|
357
|
+
"name": node_text(name_n, src),
|
|
358
|
+
"type": node_text(type_n, src) if type_n else None,
|
|
359
|
+
"default": None,
|
|
360
|
+
}
|
|
361
|
+
elif child.type == "default_parameter":
|
|
362
|
+
name_n = child.child_by_field_name("name")
|
|
363
|
+
value_n = child.child_by_field_name("value")
|
|
364
|
+
if name_n is not None:
|
|
365
|
+
descriptor = {
|
|
366
|
+
"name": node_text(name_n, src),
|
|
367
|
+
"type": None,
|
|
368
|
+
"default": node_text(value_n, src) if value_n else None,
|
|
369
|
+
}
|
|
370
|
+
elif child.type == "typed_default_parameter":
|
|
371
|
+
name_n = child.child_by_field_name("name")
|
|
372
|
+
type_n = child.child_by_field_name("type")
|
|
373
|
+
value_n = child.child_by_field_name("value")
|
|
374
|
+
if name_n is not None:
|
|
375
|
+
descriptor = {
|
|
376
|
+
"name": node_text(name_n, src),
|
|
377
|
+
"type": node_text(type_n, src) if type_n else None,
|
|
378
|
+
"default": node_text(value_n, src) if value_n else None,
|
|
379
|
+
}
|
|
380
|
+
elif child.type == "list_splat_pattern":
|
|
381
|
+
inner = next(
|
|
382
|
+
(c for c in child.children if c.type == "identifier"), None
|
|
383
|
+
)
|
|
384
|
+
if inner is not None:
|
|
385
|
+
descriptor = {
|
|
386
|
+
"name": f"*{node_text(inner, src)}",
|
|
387
|
+
"type": None,
|
|
388
|
+
"default": None,
|
|
389
|
+
}
|
|
390
|
+
elif child.type == "dictionary_splat_pattern":
|
|
391
|
+
inner = next(
|
|
392
|
+
(c for c in child.children if c.type == "identifier"), None
|
|
393
|
+
)
|
|
394
|
+
if inner is not None:
|
|
395
|
+
descriptor = {
|
|
396
|
+
"name": f"**{node_text(inner, src)}",
|
|
397
|
+
"type": None,
|
|
398
|
+
"default": None,
|
|
399
|
+
}
|
|
400
|
+
if descriptor is None:
|
|
401
|
+
continue
|
|
402
|
+
if (
|
|
403
|
+
skip_self_or_cls
|
|
404
|
+
and not first_seen
|
|
405
|
+
and descriptor["name"] in ("self", "cls")
|
|
406
|
+
):
|
|
407
|
+
first_seen = True
|
|
408
|
+
continue
|
|
409
|
+
first_seen = True
|
|
410
|
+
out.append(descriptor)
|
|
411
|
+
return out
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _extract_call_args(
|
|
415
|
+
arg_list: tree_sitter.Node, src: bytes
|
|
416
|
+
) -> tuple[list[str], dict[str, str]]:
|
|
417
|
+
"""Return ``(args, kwargs)`` for a ``call.argument_list`` AST node.
|
|
418
|
+
|
|
419
|
+
Follows the DF0 capture rules: positional args are simplified via
|
|
420
|
+
``_simplify_arg``; keyword args become ``kwargs[name] = simplified``;
|
|
421
|
+
``*spread`` becomes ``"*name"`` in args; ``**spread`` becomes
|
|
422
|
+
``kwargs["**"] = name``.
|
|
423
|
+
"""
|
|
424
|
+
args: list[str] = []
|
|
425
|
+
kwargs: dict[str, str] = {}
|
|
426
|
+
for child in arg_list.children:
|
|
427
|
+
if not child.is_named:
|
|
428
|
+
continue
|
|
429
|
+
if child.type == "keyword_argument":
|
|
430
|
+
name_n = child.child_by_field_name("name")
|
|
431
|
+
value_n = child.child_by_field_name("value")
|
|
432
|
+
if name_n is not None and value_n is not None:
|
|
433
|
+
kwargs[node_text(name_n, src)] = _simplify_arg(value_n, src)
|
|
434
|
+
elif child.type == "list_splat":
|
|
435
|
+
inner = next(
|
|
436
|
+
(c for c in child.children if c.is_named), None
|
|
437
|
+
)
|
|
438
|
+
if inner is not None:
|
|
439
|
+
args.append(f"*{node_text(inner, src)}")
|
|
440
|
+
else:
|
|
441
|
+
args.append("<expr>")
|
|
442
|
+
elif child.type == "dictionary_splat":
|
|
443
|
+
inner = next(
|
|
444
|
+
(c for c in child.children if c.is_named), None
|
|
445
|
+
)
|
|
446
|
+
if inner is not None:
|
|
447
|
+
kwargs["**"] = node_text(inner, src)
|
|
448
|
+
else:
|
|
449
|
+
args.append(_simplify_arg(child, src))
|
|
450
|
+
return args, kwargs
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
# --- DF1: HTTP route + SQLAlchemy detection ---------------------------
|
|
454
|
+
#
|
|
455
|
+
# Patterns are regex-based on the raw decorator / call text. Tree-sitter
|
|
456
|
+
# gives us reliable syntactic boundaries; we lean on regex for the inner
|
|
457
|
+
# semantic shape (HTTP method names, model arguments) since the surface
|
|
458
|
+
# vocabulary is small and well-known.
|
|
459
|
+
|
|
460
|
+
# Recognised HTTP verbs / route helpers across FastAPI, Flask, aiohttp.
|
|
461
|
+
_HTTP_VERBS: tuple[str, ...] = (
|
|
462
|
+
"get", "post", "put", "delete", "patch",
|
|
463
|
+
"head", "options", "trace", "websocket",
|
|
464
|
+
)
|
|
465
|
+
|
|
466
|
+
# Decorator forms:
|
|
467
|
+
# @<router>.<verb>("/path", ...)
|
|
468
|
+
# @<router>.<verb>('/path', ...)
|
|
469
|
+
# router is any identifier (app, router, blueprint, bp, ...).
|
|
470
|
+
_ROUTE_VERB_RE = re.compile(
|
|
471
|
+
r"@\s*(?P<router>[\w.]+)\.(?P<verb>"
|
|
472
|
+
+ "|".join(_HTTP_VERBS)
|
|
473
|
+
+ r")\s*\(\s*['\"](?P<path>[^'\"]+)['\"]"
|
|
474
|
+
)
|
|
475
|
+
# @<router>.route("/path", methods=[...]) — Flask shape.
|
|
476
|
+
_ROUTE_GENERIC_RE = re.compile(
|
|
477
|
+
r"@\s*(?P<router>[\w.]+)\.route\s*\(\s*['\"](?P<path>[^'\"]+)['\"]"
|
|
478
|
+
)
|
|
479
|
+
_METHODS_KW_RE = re.compile(
|
|
480
|
+
r"methods\s*=\s*\[(?P<methods>[^\]]*)\]"
|
|
481
|
+
)
|
|
482
|
+
_METHOD_TOKEN_RE = re.compile(r"['\"]([A-Za-z]+)['\"]")
|
|
483
|
+
|
|
484
|
+
# FastAPI-style routers vs Flask app/blueprint heuristic for `framework`.
|
|
485
|
+
_FASTAPI_ROUTER_TOKENS: frozenset[str] = frozenset({
|
|
486
|
+
"router", "api_router", "apirouter",
|
|
487
|
+
})
|
|
488
|
+
_FLASK_ROUTER_TOKENS: frozenset[str] = frozenset({
|
|
489
|
+
"blueprint", "bp", "blueprints",
|
|
490
|
+
})
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _classify_framework(router_name: str, has_methods_kw: bool) -> str:
|
|
494
|
+
"""Best-effort framework guess for ROUTE edge metadata.
|
|
495
|
+
|
|
496
|
+
Heuristics:
|
|
497
|
+
* ``methods=[...]`` keyword is Flask-shaped; FastAPI's per-verb
|
|
498
|
+
decorators don't accept it.
|
|
499
|
+
* Names containing ``router`` lean FastAPI; ``blueprint``/``bp``
|
|
500
|
+
lean Flask. Fallback is ``fastapi`` since it is by far the most
|
|
501
|
+
common modern Python web framework.
|
|
502
|
+
"""
|
|
503
|
+
head = router_name.rsplit(".", 1)[-1].lower()
|
|
504
|
+
if has_methods_kw:
|
|
505
|
+
return "flask"
|
|
506
|
+
if head in _FLASK_ROUTER_TOKENS:
|
|
507
|
+
return "flask"
|
|
508
|
+
if head in _FASTAPI_ROUTER_TOKENS:
|
|
509
|
+
return "fastapi"
|
|
510
|
+
return "fastapi"
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def _extract_route_specs(
|
|
514
|
+
decorators: list[str],
|
|
515
|
+
) -> list[dict[str, str]]:
|
|
516
|
+
"""Return one dict per HTTP route described by the decorators.
|
|
517
|
+
|
|
518
|
+
Flask's ``@app.route("/x", methods=["GET", "POST"])`` produces ONE
|
|
519
|
+
dict per method (so caller emits one ROUTE edge per method).
|
|
520
|
+
FastAPI's ``@app.get("/x")`` produces a single dict.
|
|
521
|
+
|
|
522
|
+
Each dict has keys: ``method`` (uppercase), ``path``, ``framework``,
|
|
523
|
+
``router`` (raw router-variable text).
|
|
524
|
+
"""
|
|
525
|
+
out: list[dict[str, str]] = []
|
|
526
|
+
for raw in decorators:
|
|
527
|
+
text = raw.strip()
|
|
528
|
+
# @<router>.route(...) — handle FIRST so methods kw is honored,
|
|
529
|
+
# otherwise the verb regex would never match (no verb in decl).
|
|
530
|
+
m = _ROUTE_GENERIC_RE.search(text)
|
|
531
|
+
if m:
|
|
532
|
+
router = m.group("router")
|
|
533
|
+
path = m.group("path")
|
|
534
|
+
framework = _classify_framework(router, has_methods_kw=True)
|
|
535
|
+
mm = _METHODS_KW_RE.search(text)
|
|
536
|
+
if mm:
|
|
537
|
+
methods = [
|
|
538
|
+
tok.upper()
|
|
539
|
+
for tok in _METHOD_TOKEN_RE.findall(mm.group("methods"))
|
|
540
|
+
]
|
|
541
|
+
else:
|
|
542
|
+
# Default Flask method when methods= is absent.
|
|
543
|
+
methods = ["GET"]
|
|
544
|
+
for method in methods:
|
|
545
|
+
out.append({
|
|
546
|
+
"method": method,
|
|
547
|
+
"path": path,
|
|
548
|
+
"framework": framework,
|
|
549
|
+
"router": router,
|
|
550
|
+
})
|
|
551
|
+
continue
|
|
552
|
+
# @<router>.<verb>(path, ...)
|
|
553
|
+
m2 = _ROUTE_VERB_RE.search(text)
|
|
554
|
+
if m2:
|
|
555
|
+
router = m2.group("router")
|
|
556
|
+
verb = m2.group("verb")
|
|
557
|
+
path = m2.group("path")
|
|
558
|
+
framework = _classify_framework(router, has_methods_kw=False)
|
|
559
|
+
out.append({
|
|
560
|
+
"method": verb.upper(),
|
|
561
|
+
"path": path,
|
|
562
|
+
"framework": framework,
|
|
563
|
+
"router": router,
|
|
564
|
+
})
|
|
565
|
+
return out
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
# --- SQLAlchemy detection ----------------------------------------------
|
|
569
|
+
#
|
|
570
|
+
# We detect data-access patterns at parse time and emit READS_FROM /
|
|
571
|
+
# WRITES_TO edges with ``dst="unresolved::<ModelName>"``. The post-build
|
|
572
|
+
# resolver rewrites these to real CLASS node ids when the model is in
|
|
573
|
+
# repo; any that remain unresolved are dropped (per DF1 spec).
|
|
574
|
+
|
|
575
|
+
# Outer verbs we recognise on session/db/conn.
|
|
576
|
+
_SQL_READ_OUTER: frozenset[str] = frozenset({"query", "get", "scalar", "scalars"})
|
|
577
|
+
_SQL_WRITE_OUTER: frozenset[str] = frozenset({"add", "add_all", "delete", "merge"})
|
|
578
|
+
# Inner verbs in session.execute(<inner>(Model)).
|
|
579
|
+
_SQL_READ_INNER: frozenset[str] = frozenset({"select"})
|
|
580
|
+
_SQL_WRITE_INNER: frozenset[str] = frozenset({"insert", "update", "delete"})
|
|
581
|
+
|
|
582
|
+
# `session`, `db.session`, `db`, `conn`, `cursor`, ... — left-most token
|
|
583
|
+
# of a chain that suggests an ORM/connection root. We accept any
|
|
584
|
+
# identifier and rely on ``execute``/``query``/``add``/etc. as the verb
|
|
585
|
+
# trigger, but record the chain's last identifier in metadata.
|
|
586
|
+
_SESSION_HEAD_TOKENS: frozenset[str] = frozenset({
|
|
587
|
+
"session", "db", "conn", "connection", "cursor",
|
|
588
|
+
})
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def _strip_call_suffix(name: str) -> str:
|
|
592
|
+
"""Drop `()` and trailing chained calls — `Foo().bar` -> `Foo.bar`."""
|
|
593
|
+
out: list[str] = []
|
|
594
|
+
depth = 0
|
|
595
|
+
for ch in name:
|
|
596
|
+
if ch == "(":
|
|
597
|
+
depth += 1
|
|
598
|
+
continue
|
|
599
|
+
if ch == ")":
|
|
600
|
+
if depth > 0:
|
|
601
|
+
depth -= 1
|
|
602
|
+
continue
|
|
603
|
+
if depth == 0:
|
|
604
|
+
out.append(ch)
|
|
605
|
+
return "".join(out).strip().rstrip(".")
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
def _is_session_chain(target: str) -> bool:
|
|
609
|
+
"""Return True if the dotted chain's left-most segment looks like a
|
|
610
|
+
session/db handle (``session.query``, ``db.session.query``, ...).
|
|
611
|
+
|
|
612
|
+
Also matches ``self.session.X`` / ``self.db.X`` patterns common in
|
|
613
|
+
repository-style code where the session is held as an instance
|
|
614
|
+
attribute.
|
|
615
|
+
"""
|
|
616
|
+
if not target:
|
|
617
|
+
return False
|
|
618
|
+
parts = target.split(".")
|
|
619
|
+
head = parts[0].lower()
|
|
620
|
+
if head in _SESSION_HEAD_TOKENS:
|
|
621
|
+
return True
|
|
622
|
+
# self.<session-token>.X — repository-pattern method bodies.
|
|
623
|
+
if head == "self" and len(parts) >= 2:
|
|
624
|
+
second = parts[1].lower()
|
|
625
|
+
if second in _SESSION_HEAD_TOKENS:
|
|
626
|
+
return True
|
|
627
|
+
return False
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def _unwrap_to_root_call(node: tree_sitter.Node) -> tree_sitter.Node | None:
|
|
631
|
+
"""Follow ``call.function -> attribute.object`` chains down to the
|
|
632
|
+
leftmost ``call`` node.
|
|
633
|
+
|
|
634
|
+
Used for ``select(Model).where(...).order_by(...)`` style chains so we
|
|
635
|
+
extract ``select(Model)``'s argument, not the outer chained call's.
|
|
636
|
+
"""
|
|
637
|
+
cur: tree_sitter.Node | None = node
|
|
638
|
+
while cur is not None and cur.type == "call":
|
|
639
|
+
func_child = cur.child_by_field_name("function")
|
|
640
|
+
# If function is itself an attribute whose object is a call, the
|
|
641
|
+
# inner call is the "root"; descend.
|
|
642
|
+
if (
|
|
643
|
+
func_child is not None
|
|
644
|
+
and func_child.type == "attribute"
|
|
645
|
+
):
|
|
646
|
+
obj = func_child.child_by_field_name("object")
|
|
647
|
+
if obj is not None and obj.type == "call":
|
|
648
|
+
cur = obj
|
|
649
|
+
continue
|
|
650
|
+
break
|
|
651
|
+
return cur
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def _model_name_from_call_arg(arg_text: str) -> str | None:
|
|
655
|
+
"""Extract a Model name from a call-argument expression.
|
|
656
|
+
|
|
657
|
+
Handles:
|
|
658
|
+
* ``User`` — bare identifier
|
|
659
|
+
* ``User(...)`` — constructor call (returns ``User``)
|
|
660
|
+
* ``[User(...), Other()]`` — list with a Model constructor (returns
|
|
661
|
+
``User``, the first model)
|
|
662
|
+
* ``some_chain.User`` — last segment
|
|
663
|
+
"""
|
|
664
|
+
if not arg_text:
|
|
665
|
+
return None
|
|
666
|
+
text = arg_text.strip()
|
|
667
|
+
if text.startswith("[") and text.endswith("]"):
|
|
668
|
+
# ``add_all([User(...), ...])`` — pick the first PascalCase token.
|
|
669
|
+
inner = text[1:-1]
|
|
670
|
+
tokens: list[str] = re.findall(r"[A-Za-z_][A-Za-z0-9_]*", inner)
|
|
671
|
+
for tok in tokens:
|
|
672
|
+
if tok and tok[0].isupper():
|
|
673
|
+
return tok
|
|
674
|
+
return None
|
|
675
|
+
# Drop call args / parens.
|
|
676
|
+
no_parens = _strip_call_suffix(text)
|
|
677
|
+
# Last identifier segment after dotting.
|
|
678
|
+
leaf = no_parens.rsplit(".", 1)[-1]
|
|
679
|
+
if not leaf or not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", leaf):
|
|
680
|
+
return None
|
|
681
|
+
if not leaf[0].isupper():
|
|
682
|
+
return None
|
|
683
|
+
return leaf
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
# --- Public-API pragma detection ----------------------------------------
#
# A function or class can be exempted from dead-code analysis by putting one
# of these pragma comments on its own line above the def/class (or above the
# topmost decorator, when decorators are present).  Blank lines between the
# pragma and the definition are skipped.  A trailing same-line pragma
# (``def foo(): ...  # pragma: codegraph-public-api``) is also accepted.
_PUBLIC_API_PRAGMAS: tuple[str, ...] = (
    "# pragma: codegraph-public-api",
    "# codegraph: public-api",
)


def _line_has_public_api_pragma(line: str) -> bool:
    """Return True if ``line`` contains one of ``_PUBLIC_API_PRAGMAS``."""
    return any(pragma in line for pragma in _PUBLIC_API_PRAGMAS)


def _rest_of_line(src: bytes, start: int) -> str:
    """Decode ``src`` from byte offset ``start`` to the end of that line."""
    end = src.find(b"\n", start)
    if end == -1:
        # Last line of a file without a trailing newline: read to EOF so a
        # comment that follows the definition on that line is still seen.
        end = len(src)
    return src[start:end].decode("utf-8", errors="replace")


def _has_public_api_pragma(def_node: tree_sitter.Node, src: bytes) -> bool:
    """Return True if the def/class node carries a public-API pragma.

    Two placements are recognised:

    * a trailing comment on the first line of the ``def``/``class``
      statement itself, or on the first decorator line when the definition
      is decorated;
    * a comment-only line that is the nearest non-blank line above the
      definition (above the topmost decorator when decorators are present).

    A pragma that trails *other code* on the preceding line is ignored, so
    the trailing pragma of one definition does not leak onto the next.

    Args:
        def_node: ``function_definition`` or ``class_definition`` node.
        src: Raw bytes of the file ``def_node`` was parsed from.
    """
    container = def_node
    parent = def_node.parent
    if parent is not None and parent.type == "decorated_definition":
        container = parent

    # Same-line pragma.  For a decorated definition the container starts at
    # the first decorator, so the def/class line has to be checked as well.
    for start in (container.start_byte, def_node.start_byte):
        if _line_has_public_api_pragma(_rest_of_line(src, start)):
            return True

    # Walk backward line by line.  The first slice is whatever precedes the
    # definition on its own line (indentation, or nothing), which is blank
    # and therefore skipped like any other blank line.
    cursor = container.start_byte
    while cursor > 0:
        prev_nl = src.rfind(b"\n", 0, cursor)
        # prev_nl == -1 means we reached the first line; the slice then
        # starts at offset 0.
        text = src[prev_nl + 1:cursor].decode("utf-8", errors="replace")
        stripped = text.strip()
        if stripped:
            # Only a standalone comment line counts as a leading pragma.
            return stripped.startswith("#") and _line_has_public_api_pragma(
                stripped
            )
        if prev_nl <= 0:
            return False
        cursor = prev_nl
    return False
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
def _get_function_decorators(func_node: tree_sitter.Node, src: bytes) -> list[str]:
    """Return the source text of each decorator on a function/class definition.

    Tree-sitter wraps a decorated definition in a ``decorated_definition``
    node whose children are the ``decorator`` nodes followed by the actual
    ``function_definition``/``class_definition``; the definition node itself
    has no decorator children.  When ``func_node`` has such a parent, the
    decorators are read from the parent; otherwise ``func_node``'s own
    children are scanned.

    Returns:
        Decorator texts in child order; empty when there are none.
    """
    parent = func_node.parent
    wrapped = parent is not None and parent.type == "decorated_definition"
    holder = parent if wrapped else func_node
    return [
        node_text(entry, src)
        for entry in holder.children
        if entry.type == "decorator"
    ]
|
|
772
|
+
|
|
773
|
+
|
|
774
|
+
# --- Entry-point decorator catalog ---------------------------------------
#
# Attribute-style decorator names.  Each pattern is matched as a substring
# of the decorator's dotted name, i.e. the text between the leading ``@``
# and the first ``(``.  Decorator *arguments* are deliberately not searched:
# ``@lru_cache(maxsize=cfg.get("n"))`` must not count as an HTTP ``.get``
# route.  Order is irrelevant; any match is sufficient.
_ENTRYPOINT_DECORATOR_SUFFIXES: tuple[str, ...] = (
    # Typer / Click — bound to any local Typer/Click instance.
    ".command", ".callback", ".group",
    # FastAPI / Flask / aiohttp — HTTP and websocket route decorators.
    ".get", ".post", ".put", ".delete", ".patch", ".head", ".options",
    ".trace", ".websocket", ".route", ".on_event", ".middleware",
    ".before_request", ".after_request", ".teardown_request",
    ".errorhandler",
    # Celery.
    ".task",
    # SQLAlchemy.
    ".listens_for",
    # MCP protocol server (anthropic mcp-python-sdk and similar).
    ".list_tools", ".call_tool", ".list_resources", ".read_resource",
    ".list_prompts", ".get_prompt",
)

# Decorator names matched anywhere in the raw decorator text, arguments
# included (covers bare ``@shared_task`` as well as ``@app.shared_task``,
# ``@pytest.fixture`` and wrappers that receive the name as an argument).
_ENTRYPOINT_DECORATOR_CONTAINS: tuple[str, ...] = (
    "shared_task",
    "pytest.fixture",
    "pytest.mark",
    "abstractmethod",
    "abc.abstractmethod",
    "admin.register",
    "receiver",
    "login_required",
    "permission_required",
    "event.listens_for",
    # Local registry decorators commonly used in this codebase / MCP servers.
    "_register",
)


def _is_entry_point(
    decorators: list[str],
    name: str,
    *,
    extra_decorator_patterns: tuple[str, ...] = (),
) -> bool:
    """Return True if any decorator matches a known entry-point pattern.

    Args:
        decorators: Raw decorator texts (``"@app.get('/x')"``); a leading
            ``@`` and surrounding whitespace are ignored.
        name: Currently unused; kept for forward compatibility with
            name-glob configuration in DeadCodeConfig.
        extra_decorator_patterns: Additional user patterns, matched as a
            substring of the decorator text after any leading ``@`` is
            removed from the pattern.  Empty patterns never match.

    Returns:
        True as soon as one decorator matches; False for an empty list.
    """
    if not decorators:
        return False
    for raw in decorators:
        text = raw.strip()
        body = text[1:] if text.startswith("@") else text
        # Suffix patterns describe the decorator itself, so look only at
        # its dotted name and ignore whatever appears in the call arguments.
        dotted_name = body.split("(", 1)[0]
        if any(suffix in dotted_name for suffix in _ENTRYPOINT_DECORATOR_SUFFIXES):
            return True
        if any(needle in body for needle in _ENTRYPOINT_DECORATOR_CONTAINS):
            return True
        for pattern in extra_decorator_patterns:
            stripped = pattern.lstrip("@").strip()
            if stripped and stripped in body:
                return True
    return False
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
@register_extractor
|
|
848
|
+
class PythonExtractor(ExtractorBase):
|
|
849
|
+
language = "python"
|
|
850
|
+
extensions = (".py",)
|
|
851
|
+
|
|
852
|
+
# Optional user-supplied decorator patterns (set by GraphBuilder before
|
|
853
|
+
# parsing). Matched as substring of the raw decorator text via
|
|
854
|
+
# ``_is_entry_point``.
|
|
855
|
+
extra_entry_point_decorators: tuple[str, ...] = ()
|
|
856
|
+
|
|
857
|
+
def parse_file(
    self, path: Path, repo_root: Path
) -> tuple[list[Node], list[Edge]]:
    """Parse one Python file into graph nodes and edges.

    Emits a MODULE node for the file, plus a TEST node with the same
    qualname when ``_is_test_file`` flags the path, then walks the syntax
    tree for definitions/imports and for module-level calls.

    Args:
        path: File to read and parse.
        repo_root: Root that ``path`` is made relative to; the relative
            POSIX path is what gets stored on nodes and edges.

    Returns:
        ``(nodes, edges)`` collected from this file.
    """
    src = path.read_bytes()
    rel = path.relative_to(repo_root).as_posix()
    parser = load_parser("python")
    tree = parser.parse(src)
    root = tree.root_node

    is_test = _is_test_file(rel)
    qualname = _file_to_qualname(rel)
    module_id = make_node_id(NodeKind.MODULE, qualname, rel)

    # Fields the MODULE node and the optional TEST node have in common.
    file_fields = {
        "name": qualname.split(".")[-1] if qualname else rel,
        "qualname": qualname,
        "file": rel,
        "line_start": 1,
        "line_end": root.end_point[0] + 1,
        "language": "python",
    }

    nodes: list[Node] = [
        Node(
            id=module_id,
            kind=NodeKind.MODULE,
            metadata={"is_test": is_test},
            **file_fields,
        )
    ]
    edges: list[Edge] = []

    if is_test:
        nodes.append(
            Node(
                id=make_node_id(NodeKind.TEST, qualname, rel),
                kind=NodeKind.TEST,
                metadata={"is_test": True},
                **file_fields,
            )
        )

    self._visit_block(root, rel, qualname, module_id, None, src, nodes, edges)
    # Calls made at module level (e.g. `Widget("a")`) become CALLS edges
    # owned by the module.  `_collect_calls` does not descend into
    # function/class definitions, so their inner calls are not counted
    # a second time here.
    self._collect_calls(root, rel, module_id, src, edges)
    return nodes, edges
|
|
910
|
+
|
|
911
|
+
def _visit_block(
    self,
    block: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    enclosing_class_id: str | None,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register the definitions and imports found directly in ``block``.

    Classes, functions (plain or decorated) and import statements that are
    children of ``block`` are dispatched to their handlers.  ``if`` /
    ``with`` / ``try`` / ``for`` / ``while`` statements are transparent:
    every suite they own is visited with the same parent, including the
    suites of their ``elif`` / ``else`` / ``except`` / ``finally`` clauses,
    so that e.g. a fallback defined under ``except ImportError:`` is not
    lost.

    Args:
        block: Node whose children are scanned (module root or a suite).
        rel: Repo-relative path of the file being parsed.
        parent_qualname: Qualified name new definitions are nested under.
        parent_id: Node id that DEFINED_IN / import edges point at.
        enclosing_class_id: When truthy, functions are registered as
            ``NodeKind.METHOD`` instead of ``NodeKind.FUNCTION``.
        src: Raw file bytes.
        nodes: Output list, appended to by the handlers.
        edges: Output list, appended to by the handlers.
    """
    def_types = ("function_definition", "class_definition")
    compound_types = (
        "if_statement", "with_statement", "try_statement",
        "for_statement", "while_statement",
    )
    # Clause nodes that tree-sitter nests inside the compound statement and
    # that carry their own ``block`` child.
    clause_types = (
        "elif_clause", "else_clause", "except_clause",
        "except_group_clause", "finally_clause",
    )

    for child in block.children:
        child_type = child.type
        if child_type in def_types or child_type == "decorated_definition":
            target = child
            if child_type == "decorated_definition":
                # The real definition is a child of the wrapper node.
                target = next(
                    (c for c in child.children if c.type in def_types), None,
                )
                if target is None:
                    continue
            if target.type == "class_definition":
                self._handle_class(
                    target, rel, parent_qualname, parent_id, src, nodes, edges
                )
            else:
                kind = (
                    NodeKind.METHOD if enclosing_class_id else NodeKind.FUNCTION
                )
                self._handle_function(
                    target, rel, parent_qualname, parent_id, kind,
                    src, nodes, edges,
                )
        elif child_type == "import_statement":
            self._handle_import(child, rel, parent_id, src, edges)
        elif child_type == "import_from_statement":
            self._handle_import_from(child, rel, parent_id, src, edges)
        elif child_type in compound_types:
            for sub in child.children:
                if sub.type == "block":
                    suites = [sub]
                elif sub.type in clause_types:
                    suites = [c for c in sub.children if c.type == "block"]
                else:
                    continue
                for suite in suites:
                    self._visit_block(
                        suite, rel, parent_qualname, parent_id,
                        enclosing_class_id, src, nodes, edges,
                    )
|
|
968
|
+
|
|
969
|
+
def _handle_class(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register a class, its edges, and everything defined in its body.

    Appends a CLASS node, a DEFINED_IN edge to ``parent_id``, CALLS edges
    for decorator invocations and one unresolved INHERITS edge per base
    that is a plain identifier or dotted attribute.  The class body is then
    walked with ``_visit_block`` (with the class as enclosing scope), so
    methods, nested classes and imports are picked up even when they sit
    inside ``if`` / ``try`` / ``with`` / loop statements in the body.

    Does nothing when the definition has no ``name`` field.

    Args:
        node: The ``class_definition`` node.
        rel: Repo-relative path of the file being parsed.
        parent_qualname: Qualified name of the enclosing scope ("" if none).
        parent_id: Node id of the enclosing scope.
        src: Raw file bytes.
        nodes: Output list of nodes (mutated).
        edges: Output list of edges (mutated).
    """
    name_node = node.child_by_field_name("name")
    if name_node is None:
        return
    name = node_text(name_node, src)
    qualname = f"{parent_qualname}.{name}" if parent_qualname else name
    class_id = make_node_id(NodeKind.CLASS, qualname, rel)
    def_line = node.start_point[0] + 1

    # First physical line of the class statement, without the trailing ":".
    sig = node_text(node, src).split("\n")[0].rstrip(":")

    body = node.child_by_field_name("body")
    docstring = _get_docstring(body, src) if body else None

    decorators = _get_function_decorators(node, src)
    cls_metadata: dict[str, object] = {}
    if decorators:
        cls_metadata["decorators"] = decorators
    if _is_entry_point(
        decorators,
        name,
        extra_decorator_patterns=self.extra_entry_point_decorators,
    ):
        cls_metadata["entry_point"] = True
    if _has_public_api_pragma(node, src):
        cls_metadata["public_api"] = True

    attr_types = (
        _collect_class_attr_types(body, src) if body is not None else {}
    )
    if attr_types:
        cls_metadata["attr_types"] = attr_types

    nodes.append(Node(
        id=class_id,
        kind=NodeKind.CLASS,
        name=name,
        qualname=qualname,
        file=rel,
        line_start=def_line,
        line_end=node.end_point[0] + 1,
        signature=sig,
        docstring=docstring,
        language="python",
        metadata=cls_metadata,
    ))

    edges.append(Edge(
        src=class_id, dst=parent_id, kind=EdgeKind.DEFINED_IN,
        file=rel, line=def_line,
    ))

    self._emit_decorator_calls(node, rel, class_id, src, edges)

    # Base classes: prefer the named field, fall back to the first
    # ``argument_list`` child.
    arg_list = node.child_by_field_name("superclasses")
    if arg_list is None:
        arg_list = next(
            (c for c in node.children if c.type == "argument_list"), None,
        )
    if arg_list is not None:
        for base in arg_list.children:
            if base.is_named and base.type in ("identifier", "attribute"):
                base_name = node_text(base, src)
                edges.append(Edge(
                    src=class_id,
                    dst=f"unresolved::{base_name}",
                    kind=EdgeKind.INHERITS,
                    file=rel,
                    line=def_line,
                    metadata={"target_name": base_name},
                ))

    if body is not None:
        # Passing the class id as ``enclosing_class_id`` makes
        # ``_visit_block`` register functions as methods; it also descends
        # into compound statements, which a flat scan of the body would skip.
        self._visit_block(
            body, rel, qualname, class_id, class_id, src, nodes, edges
        )
|
|
1083
|
+
|
|
1084
|
+
def _handle_function(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    kind: NodeKind,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register a function or method plus the edges that originate from it.

    Appends, in this order: the function node, a DEFINED_IN edge to
    ``parent_id``, CALLS edges for decorator invocations, ROUTE edges for
    every spec returned by ``_extract_route_specs``, and then (when the
    definition has a body) call edges, SQL read/write edges and any nested
    definitions.

    Node metadata always holds ``decorators``, ``params`` and ``returns``;
    ``entry_point`` and ``public_api`` are added only when they apply.

    Does nothing when the definition has no ``name`` field.
    """
    ident = node.child_by_field_name("name")
    if ident is None:
        return
    name = node_text(ident, src)
    if parent_qualname:
        qualname = f"{parent_qualname}.{name}"
    else:
        qualname = name
    func_id = make_node_id(kind, qualname, rel)
    def_line = node.start_point[0] + 1

    params = node.child_by_field_name("parameters")
    if params is None:
        sig = name
    else:
        sig = f"{name}{node_text(params, src)}"

    body = node.child_by_field_name("body")
    docstring = _get_docstring(body, src) if body else None

    decorators = _get_function_decorators(node, src)
    metadata: dict[str, object] = {"decorators": decorators}
    matched_decorator = _is_entry_point(
        decorators,
        name,
        extra_decorator_patterns=self.extra_entry_point_decorators,
    )
    if matched_decorator or name == "__main__":
        metadata["entry_point"] = True
    if _has_public_api_pragma(node, src):
        metadata["public_api"] = True

    # DF0: parameter descriptors and return annotation.  For methods the
    # extractor is asked to skip the leading ``self`` / ``cls`` parameter.
    if params is None:
        metadata["params"] = []
    else:
        metadata["params"] = _extract_params(
            params, src, skip_self_or_cls=(kind == NodeKind.METHOD),
        )
    return_type_node = node.child_by_field_name("return_type")
    if return_type_node:
        metadata["returns"] = node_text(return_type_node, src)
    else:
        metadata["returns"] = None

    nodes.append(Node(
        id=func_id,
        kind=kind,
        name=name,
        qualname=qualname,
        file=rel,
        line_start=def_line,
        line_end=node.end_point[0] + 1,
        signature=sig,
        docstring=docstring,
        language="python",
        metadata=metadata,
    ))

    edges.append(Edge(
        src=func_id, dst=parent_id, kind=EdgeKind.DEFINED_IN,
        file=rel, line=def_line,
    ))

    self._emit_decorator_calls(node, rel, func_id, src, edges)

    # DF1: HTTP routes.  One ROUTE edge per spec; a decorator declaring
    # several methods is expected to yield several specs.
    for spec in _extract_route_specs(decorators):
        self._emit_route_edge(spec, func_id, rel, def_line, nodes, edges)

    if body is None:
        return

    self._collect_calls(body, rel, func_id, src, edges)
    # DF1: SQLAlchemy READS_FROM / WRITES_TO edges, emitted against
    # ``unresolved::Model`` targets for a later resolver pass.
    self._collect_sql_io(body, rel, func_id, src, edges)
    # Nested defs are registered separately so the innermost named
    # function owns its own calls.
    self._visit_nested_defs(
        body, rel, qualname, func_id, kind == NodeKind.METHOD,
        src, nodes, edges,
    )
|
|
1178
|
+
|
|
1179
|
+
def _visit_nested_defs(
    self,
    block: tree_sitter.Node,
    rel: str,
    parent_qualname: str,
    parent_id: str,
    in_method: bool,
    src: bytes,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Register function/class definitions nested anywhere under ``block``.

    Performs an iterative depth-first walk.  A definition (bare or wrapped
    in ``decorated_definition``) is handed to ``_handle_function`` /
    ``_handle_class`` and not descended into, because those handlers walk
    their own bodies.  Every other node is simply expanded.

    Nested functions are always registered as ``NodeKind.FUNCTION``, even
    inside a method.  ``in_method`` is accepted but not used here.
    """
    def_types = ("function_definition", "class_definition")
    pending: list[tree_sitter.Node] = list(block.children)
    while pending:
        current = pending.pop()

        # Resolve the node that actually carries the definition, if any.
        target = current
        if current.type == "decorated_definition":
            target = next(
                (c for c in current.children if c.type in def_types), None,
            )
            if target is None:
                # Wrapper without a definition: treat it like any other node.
                pending.extend(current.children)
                continue

        if target.type == "function_definition":
            self._handle_function(
                target, rel, parent_qualname, parent_id,
                NodeKind.FUNCTION, src, nodes, edges,
            )
        elif target.type == "class_definition":
            self._handle_class(
                target, rel, parent_qualname, parent_id,
                src, nodes, edges,
            )
        else:
            pending.extend(current.children)
|
|
1235
|
+
|
|
1236
|
+
def _collect_calls(
    self,
    node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Append an unresolved CALLS edge for each call expression under ``node``.

    The walk is an iterative depth-first traversal of ``node``'s
    descendants.  It does not enter class definitions, function definitions
    or decorators: nested definitions own their calls, and decorator calls
    are emitted by ``_emit_decorator_calls`` against the decorated symbol.
    Calls nested inside another call's arguments are still visited.
    """
    opaque = ("class_definition", "function_definition", "decorator")
    pending: list[tree_sitter.Node] = list(node.children)
    while pending:
        current = pending.pop()
        if current.type == "call":
            callee = current.child_by_field_name("function")
            if callee is None and current.children:
                # No ``function`` field: fall back to the first child.
                callee = current.children[0]
            if callee is not None:
                target_name = node_text(callee, src)
                arg_list = current.child_by_field_name("arguments")
                if arg_list is None:
                    args: list[str] = []
                    kwargs: dict[str, str] = {}
                else:
                    args, kwargs = _extract_call_args(arg_list, src)
                edges.append(Edge(
                    src=scope_id,
                    dst=f"unresolved::{target_name}",
                    kind=EdgeKind.CALLS,
                    file=rel,
                    line=current.start_point[0] + 1,
                    metadata={
                        "target_name": target_name,
                        "args": args,
                        "kwargs": kwargs,
                    },
                ))
        if current.type in opaque:
            continue
        pending.extend(current.children)
|
|
1279
|
+
|
|
1280
|
+
def _emit_decorator_calls(
    self,
    def_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Append a CALLS edge for every decorator that is written as a call.

    ``@_register("name")`` invokes the decorator factory at definition
    time, so it is recorded as a call made by the decorated symbol
    (``scope_id``).  A bare reference such as ``@foo`` contains no ``call``
    node and produces no edge.
    """
    parent = def_node.parent
    if parent is not None and parent.type == "decorated_definition":
        holder = parent
    else:
        holder = def_node

    decorator_nodes = [c for c in holder.children if c.type == "decorator"]
    for decorator in decorator_nodes:
        for part in decorator.children:
            if part.type != "call":
                continue
            callee = part.child_by_field_name("function")
            if callee is None and part.children:
                # No ``function`` field: fall back to the first child.
                callee = part.children[0]
            if callee is None:
                continue
            target_name = node_text(callee, src)
            arg_list = part.child_by_field_name("arguments")
            if arg_list is None:
                args: list[str] = []
                kwargs: dict[str, str] = {}
            else:
                args, kwargs = _extract_call_args(arg_list, src)
            edges.append(Edge(
                src=scope_id,
                dst=f"unresolved::{target_name}",
                kind=EdgeKind.CALLS,
                file=rel,
                line=part.start_point[0] + 1,
                metadata={
                    "target_name": target_name,
                    "args": args,
                    "kwargs": kwargs,
                },
            ))
|
|
1330
|
+
|
|
1331
|
+
# --- DF1: route + SQL emission helpers ----------------------------
|
|
1332
|
+
|
|
1333
|
+
def _emit_route_edge(
    self,
    spec: dict[str, str],
    func_id: str,
    rel: str,
    line: int,
    nodes: list[Node],
    edges: list[Edge],
) -> None:
    """Link a handler to a synthetic route node with a ROUTE edge.

    The route node is a ``NodeKind.VARIABLE`` tagged with
    ``metadata["synthetic_kind"] == "ROUTE"``.  Its id and qualname are both
    ``route::METHOD::PATH``, so handlers that bind the same method and path
    point at the same destination.  The node is appended only if ``nodes``
    does not already hold one with that id; the edge is always appended.

    Args:
        spec: Must provide ``"method"``, ``"path"`` and ``"framework"``.
        func_id: Id of the handler function (edge source).
        rel: Repo-relative path of the file being parsed.
        line: Line recorded on both the synthetic node and the edge.
        nodes: Output list of nodes (mutated).
        edges: Output list of edges (mutated).
    """
    method = spec["method"]
    path = spec["path"]
    framework = spec["framework"]
    route_id = f"route::{method}::{path}"

    already_present = False
    for existing in nodes:
        if existing.id == route_id:
            already_present = True
            break

    if not already_present:
        route_node = Node(
            id=route_id,
            kind=NodeKind.VARIABLE,
            name=f"{method} {path}",
            qualname=route_id,
            file=rel,
            line_start=line,
            line_end=line,
            language="python",
            metadata={
                "synthetic_kind": "ROUTE",
                "method": method,
                "path": path,
                "framework": framework,
            },
        )
        nodes.append(route_node)

    route_edge = Edge(
        src=func_id,
        dst=route_id,
        kind=EdgeKind.ROUTE,
        file=rel,
        line=line,
        metadata={
            "method": method,
            "path": path,
            "framework": framework,
        },
    )
    edges.append(route_edge)
|
|
1383
|
+
|
|
1384
|
+
def _collect_sql_io(
    self,
    body: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Scan a function body for SQLAlchemy-style data access.

    Every ``call`` node found is passed to ``_maybe_emit_sql_edge``, which
    decides whether to append a ``READS_FROM`` / ``WRITES_TO`` edge whose
    destination is ``unresolved::`` followed by the model name.  The walk
    does not enter nested class/function definitions or decorators; nested
    definitions are scanned when they are handled themselves.
    """
    opaque = ("class_definition", "function_definition", "decorator")
    pending: list[tree_sitter.Node] = list(body.children)
    while pending:
        current = pending.pop()
        if current.type == "call":
            self._maybe_emit_sql_edge(current, rel, scope_id, src, edges)
        if current.type in opaque:
            continue
        pending.extend(current.children)
|
|
1408
|
+
|
|
1409
|
+
def _maybe_emit_sql_edge(
    self,
    call_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Append a data-access edge if ``call_node`` looks like an ORM operation.

    Recognised shapes, checked in this order:

    * ``Model.query`` / ``Model.query.<something>(...)`` where ``Model``
      starts with an uppercase letter: a READS_FROM edge to ``Model``.
    * A callee accepted by ``_is_session_chain`` whose last dotted segment
      is in ``_SQL_READ_OUTER`` (read), in ``_SQL_WRITE_OUTER`` (write;
      operation ``"delete"`` for ``delete``, ``"insert"`` otherwise) or
      equal to ``execute`` (delegated to ``_emit_sql_from_execute``).

    Anything else leaves ``edges`` untouched.
    """
    callee = call_node.child_by_field_name("function")
    if callee is None:
        return
    target = node_text(callee, src)

    # Flask-SQLAlchemy style: `Model.query.filter(...)` or `Model.query`.
    query_match = re.match(r"^([A-Z][\w]*)\.query(?:\.|$)", target)
    if query_match:
        model = query_match.group(1)
        edges.append(Edge(
            src=scope_id,
            dst=f"unresolved::{model}",
            kind=EdgeKind.READS_FROM,
            file=rel,
            line=call_node.start_point[0] + 1,
            metadata={
                "operation": "select",
                "via": "Model.query",
                "model_name": model,
                "target_name": model,
            },
        ))
        return

    # Session style: `session.query(Model)`, `db.session.add(...)`, ...
    if not _is_session_chain(target):
        return

    verb = target.rsplit(".", 1)[-1]
    if verb in _SQL_READ_OUTER:
        self._emit_sql_from_first_arg(
            call_node, rel, scope_id, src, edges,
            kind=EdgeKind.READS_FROM, operation="select",
            via=f"session.{verb}",
        )
    elif verb in _SQL_WRITE_OUTER:
        if verb == "delete":
            operation = "delete"
        else:
            operation = "insert"
        self._emit_sql_from_first_arg(
            call_node, rel, scope_id, src, edges,
            kind=EdgeKind.WRITES_TO, operation=operation,
            via=f"session.{verb}",
        )
    elif verb == "execute":
        # session.execute(select(Model)) / insert(Model) / etc.
        self._emit_sql_from_execute(call_node, rel, scope_id, src, edges)
|
|
1467
|
+
|
|
1468
|
+
def _emit_sql_from_first_arg(
    self,
    call_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
    *,
    kind: EdgeKind,
    operation: str,
    via: str,
) -> None:
    """Append a data-access edge whose model comes from the first argument.

    The text of the call's first named argument is passed to
    ``_model_name_from_call_arg``.  When that yields a model name, one edge
    of ``kind`` is appended with destination ``unresolved::`` plus the
    name; otherwise (no argument list, no named argument, or no model name)
    nothing is emitted.

    Args:
        call_node: The session-style ``call`` node.
        rel: Repo-relative path of the file being parsed.
        scope_id: Id of the function the call belongs to (edge source).
        src: Raw file bytes.
        edges: Output list of edges (mutated).
        kind: Edge kind to emit.
        operation: Stored under ``metadata["operation"]``.
        via: Stored under ``metadata["via"]``.
    """
    arg_list = call_node.child_by_field_name("arguments")
    if arg_list is None:
        return

    leading = None
    for candidate in arg_list.children:
        if candidate.is_named:
            leading = candidate
            break
    if leading is None:
        return

    model = _model_name_from_call_arg(node_text(leading, src))
    if not model:
        return

    details = {
        "operation": operation,
        "via": via,
        "model_name": model,
        "target_name": model,
    }
    edges.append(Edge(
        src=scope_id,
        dst=f"unresolved::{model}",
        kind=kind,
        file=rel,
        line=call_node.start_point[0] + 1,
        metadata=details,
    ))
|
|
1504
|
+
|
|
1505
|
+
def _emit_sql_from_execute(
    self,
    call_node: tree_sitter.Node,
    rel: str,
    scope_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Handle ``session.execute(select|insert|update|delete(Model))``.

    The first real argument of the ``execute`` call is reduced to its root
    constructor call via ``_unwrap_to_root_call``. The last dotted segment
    of that call's function text selects the edge:

    * in ``_SQL_READ_INNER``  -> ``READS_FROM`` with operation ``"select"``
    * in ``_SQL_WRITE_INNER`` -> ``WRITES_TO`` with the constructor name
      as the operation

    Any other constructor, or a first constructor argument for which
    ``_model_name_from_call_arg`` yields nothing, emits no edge.

    Args:
        call_node: The ``execute`` call expression node.
        rel: File path recorded on the edge.
        scope_id: Id of the enclosing scope; used as the edge source.
        src: Raw source bytes the node offsets refer to.
        edges: Output list, mutated in place.
    """
    arg_list = call_node.child_by_field_name("arguments")
    if arg_list is None:
        return
    # Comments are *named* nodes in tree-sitter; skip them so a leading
    # ``# ...`` inside the parentheses is not mistaken for the statement.
    statement = next(
        (
            c for c in arg_list.children
            if c.is_named and c.type != "comment"
        ),
        None,
    )
    if statement is None:
        return
    # Drill through ``.values(...)`` / ``.where(...)`` chains —
    # ``select(Model).where(...)`` keeps wrapping the original
    # constructor call inside ``function -> attribute -> object``.
    root_call = _unwrap_to_root_call(statement)
    if root_call is None or root_call.type != "call":
        return
    inner_func = root_call.child_by_field_name("function")
    if inner_func is None:
        return
    # ``sa.select`` and ``select`` are treated alike: keep the last segment.
    inner_name = node_text(inner_func, src).rsplit(".", 1)[-1]
    if inner_name in _SQL_READ_INNER:
        kind = EdgeKind.READS_FROM
        operation = "select"
    elif inner_name in _SQL_WRITE_INNER:
        kind = EdgeKind.WRITES_TO
        operation = inner_name
    else:
        return
    inner_args = root_call.child_by_field_name("arguments")
    if inner_args is None:
        return
    # Same comment-skipping rule for the constructor's own argument list.
    first_inner = next(
        (
            c for c in inner_args.children
            if c.is_named and c.type != "comment"
        ),
        None,
    )
    if first_inner is None:
        return
    model = _model_name_from_call_arg(node_text(first_inner, src))
    if not model:
        return
    edges.append(Edge(
        src=scope_id,
        dst=f"unresolved::{model}",
        kind=kind,
        file=rel,
        # Tree-sitter rows are zero-based; store the start row plus one.
        line=call_node.start_point[0] + 1,
        metadata={
            "operation": operation,
            "via": f"session.execute({inner_name})",
            "model_name": model,
            "target_name": model,
        },
    ))
|
|
1564
|
+
|
|
1565
|
+
def _handle_import(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Append one IMPORTS edge per module named in an ``import`` statement.

    Only direct children of type ``dotted_name`` or ``aliased_import`` are
    considered. Each edge runs from ``parent_id`` to
    ``unresolved::<name>`` and carries the name as
    ``metadata["target_name"]``.
    """
    for entry in node.children:
        entry_type = entry.type
        if entry_type == "dotted_name":
            target = entry
        elif entry_type == "aliased_import":
            # For an aliased entry the first child is used as the name; an
            # entry with no children falls back to the entry itself.
            target = entry.children[0] if entry.children else entry
        else:
            continue
        module = node_text(target, src)
        import_edge = Edge(
            src=parent_id,
            dst=f"unresolved::{module}",
            kind=EdgeKind.IMPORTS,
            file=rel,
            line=node.start_point[0] + 1,
            metadata={"target_name": module},
        )
        edges.append(import_edge)
|
|
1588
|
+
|
|
1589
|
+
def _handle_import_from(
    self,
    node: tree_sitter.Node,
    rel: str,
    parent_id: str,
    src: bytes,
    edges: list[Edge],
) -> None:
    """Append IMPORTS edges for a ``from <module> import <names>`` statement.

    Children before the ``import`` keyword supply the module
    (``relative_import`` or ``dotted_name``, first match wins); children
    after it supply the imported names. The module is resolved through
    ``self._resolve_from_module``.

    * With at least one imported name: one edge per non-empty name, to
      ``unresolved::<module>.<name>`` (or just ``<name>`` when the module
      resolves to an empty string), with ``imported_name`` in the metadata.
    * With no imported names (e.g. ``from m import *``): a single edge to
      the module itself, provided the module name is non-empty.
    """
    name_types = ("dotted_name", "identifier")
    module_node: tree_sitter.Node | None = None
    name_nodes: list[tree_sitter.Node] = []
    past_import_kw = False

    for child in node.children:
        child_type = child.type
        if past_import_kw:
            if child_type in name_types:
                name_nodes.append(child)
            elif child_type == "aliased_import":
                # `from m import X as Y` — bind the original name X.
                original = next(
                    (c for c in child.children if c.type in name_types),
                    None,
                )
                if original is not None:
                    name_nodes.append(original)
            # Anything else (`wildcard_import`, punctuation, ...) adds no
            # per-name entry.
            continue
        if child_type == "import":
            past_import_kw = True
        elif module_node is None and child_type in (
            "relative_import", "dotted_name",
        ):
            module_node = child

    # Relative imports are turned into a qualname based on the importing
    # file's location.
    module_name = self._resolve_from_module(module_node, rel, src)

    if not name_nodes:
        # No per-name edges will exist, so keep a module-level edge rather
        # than losing the import entirely. When names are present the
        # per-name edges already carry the binding info and a module-level
        # edge would only be redundant noise.
        if module_name:
            edges.append(Edge(
                src=parent_id,
                dst=f"unresolved::{module_name}",
                kind=EdgeKind.IMPORTS,
                file=rel,
                line=node.start_point[0] + 1,
                metadata={"target_name": module_name},
            ))
        return

    # One IMPORTS edge per imported name; `imported_name` lets the resolver
    # bind the local name to the full qualname.
    for name_node in name_nodes:
        imported = node_text(name_node, src)
        if not imported:
            continue
        if module_name:
            full = f"{module_name}.{imported}"
        else:
            full = imported
        edges.append(Edge(
            src=parent_id,
            dst=f"unresolved::{full}",
            kind=EdgeKind.IMPORTS,
            file=rel,
            line=node.start_point[0] + 1,
            metadata={
                "target_name": full,
                "imported_name": imported,
            },
        ))
|
|
1668
|
+
|
|
1669
|
+
def _resolve_from_module(
    self,
    module_node: tree_sitter.Node | None,
    rel: str,
    src: bytes,
) -> str:
    """Return the module qualname named by a ``from X import ...`` clause.

    * ``module_node`` is ``None``: returns ``""``.
    * Any node other than ``relative_import``: returns its source text.
    * ``relative_import``: the dots in its ``import_prefix`` are counted,
      the qualname derived from ``rel`` is stripped of its last segment to
      get the containing package, ``dots - 1`` further trailing segments
      are removed (one dot means "current package"), and the trailing
      ``dotted_name``, if any, is appended. Climbing past the top of the
      package path leaves only the trailing module name.
    """
    if module_node is None:
        return ""
    if module_node.type != "relative_import":
        return node_text(module_node, src)

    # Gather the dot count and the optional module path after the dots.
    level = 0
    tail = ""
    for part in module_node.children:
        if part.type == "import_prefix":
            level = len([tok for tok in part.children if tok.type == "."])
        elif part.type == "dotted_name":
            tail = node_text(part, src)

    # Containing package = importing file's qualname minus its last segment.
    qualname = _file_to_qualname(rel)
    package = qualname.split(".")[:-1] if qualname else []

    # Each dot beyond the first climbs one more package level.
    climb = level - 1
    if climb > 0:
        package = package[:-climb] if climb <= len(package) else []

    if tail:
        package = package + [tail]
    return ".".join(segment for segment in package if segment)
|