codeanalyzer-python 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeanalyzer/__main__.py +99 -11
- codeanalyzer/core.py +154 -19
- codeanalyzer/neo4j/__init__.py +46 -0
- codeanalyzer/neo4j/bolt.py +223 -0
- codeanalyzer/neo4j/catalog.py +245 -0
- codeanalyzer/neo4j/cypher.py +138 -0
- codeanalyzer/neo4j/emit.py +74 -0
- codeanalyzer/neo4j/project.py +322 -0
- codeanalyzer/neo4j/rows.py +176 -0
- codeanalyzer/neo4j/schema.py +39 -0
- codeanalyzer/options/__init__.py +2 -2
- codeanalyzer/options/options.py +20 -1
- codeanalyzer/schema/py_schema.py +20 -0
- codeanalyzer/semantic_analysis/call_graph.py +266 -0
- codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +318 -69
- codeanalyzer/semantic_analysis/codeql/codeql_loader.py +32 -4
- codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +51 -31
- codeanalyzer/syntactic_analysis/symbol_table_builder.py +87 -4
- codeanalyzer_python-0.2.0.dist-info/METADATA +393 -0
- codeanalyzer_python-0.2.0.dist-info/RECORD +39 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/WHEEL +1 -1
- codeanalyzer_python-0.2.0.dist-info/entry_points.txt +3 -0
- codeanalyzer/semantic_analysis/wala/__init__.py +0 -15
- codeanalyzer_python-0.1.13.dist-info/METADATA +0 -414
- codeanalyzer_python-0.1.13.dist-info/RECORD +0 -31
- codeanalyzer_python-0.1.13.dist-info/entry_points.txt +0 -2
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""Adapters between the persisted call-graph schema and ``networkx``.
|
|
18
|
+
|
|
19
|
+
The schema persists the call graph as ``List[PyCallEdge]`` with signatures
|
|
20
|
+
referencing ``PyCallable`` entries already in the symbol table. These
|
|
21
|
+
helpers rehydrate it into a ``networkx.DiGraph`` for in-process queries
|
|
22
|
+
(paths, callers, callees) and reduce a built ``DiGraph`` back to the
|
|
23
|
+
serializable edge list.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from collections import Counter
|
|
27
|
+
from typing import Dict, Iterator, List, Tuple
|
|
28
|
+
|
|
29
|
+
import networkx as nx
|
|
30
|
+
|
|
31
|
+
from codeanalyzer.schema.py_schema import (
|
|
32
|
+
PyApplication,
|
|
33
|
+
PyCallable,
|
|
34
|
+
PyCallEdge,
|
|
35
|
+
PyClass,
|
|
36
|
+
PyModule,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _walk_class_callables(cls: PyClass) -> Iterator[PyCallable]:
    """Yield every callable reachable from ``cls``.

    The callable subtree of each method is emitted first (in the iteration
    order of ``cls.methods``), then the callables of each inner class,
    recursively.
    """
    for member in cls.methods.values():
        for found in _walk_callable(member):
            yield found
    for nested_cls in cls.inner_classes.values():
        for found in _walk_class_callables(nested_cls):
            yield found
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _walk_callable(c: PyCallable) -> Iterator[PyCallable]:
    """Yield ``c`` itself, then everything callable nested inside it.

    Nested callables are visited before the callables that belong to
    classes defined inside ``c``; both are walked recursively.
    """
    yield c
    for nested_fn in c.inner_callables.values():
        for found in _walk_callable(nested_fn):
            yield found
    for nested_cls in c.inner_classes.values():
        for found in _walk_class_callables(nested_cls):
            yield found
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _walk_module_callables(module: PyModule) -> Iterator[PyCallable]:
    """Yield every callable in ``module``.

    Module-level functions (with their nested callables) come first,
    followed by the callables of each module-level class.
    """
    for top_level_fn in module.functions.values():
        for found in _walk_callable(top_level_fn):
            yield found
    for top_level_cls in module.classes.values():
        for found in _walk_class_callables(top_level_cls):
            yield found
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def iter_callables_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyCallable]:
    """Yield every ``PyCallable`` in a symbol table, recursively.

    Modules are visited in the iteration order of ``symbol_table``; the
    keys of the mapping are not used.
    """
    for mod in symbol_table.values():
        for found in _walk_module_callables(mod):
            yield found
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _walk_classes_in_class(cls: PyClass) -> Iterator[PyClass]:
    """Yield ``cls`` and every class nested anywhere beneath it.

    Order: ``cls`` itself, then the subtree of each inner class, then any
    classes found inside the callable subtree of each method.
    """
    yield cls
    for nested_cls in cls.inner_classes.values():
        for found in _walk_classes_in_class(nested_cls):
            yield found
    # A method body may itself define classes (for instance a factory
    # method declaring a helper class), so descend into every method too.
    for member in cls.methods.values():
        for found in _walk_classes_in_callable(member):
            yield found
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _walk_classes_in_callable(c: PyCallable) -> Iterator[PyClass]:
    """Yield every class defined inside ``c``, recursively.

    Classes declared directly in ``c`` (and their own nested classes) are
    emitted first, then the classes found inside each nested callable.
    ``c`` itself is a callable and is never yielded.
    """
    for nested_cls in c.inner_classes.values():
        for found in _walk_classes_in_class(nested_cls):
            yield found
    for nested_fn in c.inner_callables.values():
        for found in _walk_classes_in_callable(nested_fn):
            yield found
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def iter_classes_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyClass]:
    """Yield every ``PyClass`` in a symbol table, recursively.

    Covers module-level classes, inner classes, classes nested in
    functions, and classes nested in class methods. Within each module the
    class subtrees are emitted before the classes found under module-level
    functions.
    """
    for mod in symbol_table.values():
        for top_level_cls in mod.classes.values():
            for found in _walk_classes_in_class(top_level_cls):
                yield found
        for top_level_fn in mod.functions.values():
            for found in _walk_classes_in_callable(top_level_fn):
                yield found
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def iter_callables(app: PyApplication) -> Iterator[PyCallable]:
    """Yield every ``PyCallable`` held in ``app.symbol_table``, recursively."""
    for found in iter_callables_in_symbol_table(app.symbol_table):
        yield found
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def callables_by_signature(app: PyApplication) -> Dict[str, PyCallable]:
    """Build a flat ``signature -> PyCallable`` index for O(1) node lookup.

    If two callables share a signature, the one encountered later in the
    traversal replaces the earlier entry.
    """
    index = {}
    for fn in iter_callables(app):
        index[fn.signature] = fn
    return index
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def to_digraph(app: PyApplication) -> nx.DiGraph:
    """Rehydrate a ``PyApplication`` into a ``networkx.DiGraph``.

    Every callable in the symbol table becomes a node keyed by its
    ``signature`` with attributes ``callable=<PyCallable>`` and
    ``ghost=False``. Edge endpoints that have no symbol-table entry (RPC
    targets, third-party libraries, framework callbacks, dynamically
    resolved callees) are inserted as **ghost** nodes with
    ``callable=None`` and ``ghost=True`` so that no edge is dropped.

    Each edge carries ``type``, ``weight`` and ``provenance`` attributes;
    ``provenance`` is stored as a fresh list copy.
    """
    graph = nx.DiGraph()

    # In-source callables first, so ghost detection below can rely on them.
    for signature, known in callables_by_signature(app).items():
        graph.add_node(signature, callable=known, ghost=False)

    for edge in app.call_graph:
        for endpoint in (edge.source, edge.target):
            if endpoint in graph.nodes:
                continue
            graph.add_node(endpoint, callable=None, ghost=True)
        edge_attrs = {
            "type": edge.type,
            "weight": edge.weight,
            "provenance": list(edge.provenance),
        }
        graph.add_edge(edge.source, edge.target, **edge_attrs)

    return graph
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def from_digraph(g: nx.DiGraph) -> list:
    """Flatten a ``DiGraph`` into the persisted ``List[PyCallEdge]`` form.

    Nodes are deliberately not serialized: they are expected to already
    exist as ``PyCallable`` entries in the symbol table. Missing edge
    attributes fall back to ``type="CALL_DEP"``, ``weight=1`` and an empty
    ``provenance`` list; ``weight`` is coerced with ``int``.
    """
    return [
        PyCallEdge(
            source=tail,
            target=head,
            type=attrs.get("type", "CALL_DEP"),
            weight=int(attrs.get("weight", 1)),
            provenance=list(attrs.get("provenance", [])),
        )
        for tail, head, attrs in g.edges(data=True)
    ]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def jedi_call_graph_edges(
    symbol_table: Dict[str, PyModule],
) -> List[PyCallEdge]:
    """Turn Jedi's per-callable ``call_sites`` into ``PyCallEdge`` entries.

    Each call site with a truthy ``callee_signature`` contributes one
    occurrence of the pair ``(caller.signature, site.callee_signature)``.
    Sites whose ``callee_signature`` is ``None`` or empty are ignored
    because there is no target to anchor an edge to.

    Pairs are coalesced: one edge is produced per distinct
    ``(source, target)`` and its ``weight`` is the number of contributing
    sites. ``provenance`` is always ``["jedi"]``; use ``merge_edges`` to
    combine the result with CodeQL-derived edges.
    """
    site_counts: Counter = Counter()
    for fn in iter_callables_in_symbol_table(symbol_table):
        site_counts.update(
            (fn.signature, site.callee_signature)
            for site in fn.call_sites
            if site.callee_signature
        )

    jedi_edges = []
    for (source_sig, target_sig), occurrences in site_counts.items():
        jedi_edges.append(
            PyCallEdge(
                source=source_sig,
                target=target_sig,
                weight=occurrences,
                provenance=["jedi"],
            )
        )
    return jedi_edges
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _constructor_scope_score(class_sig: str, caller_sig: str) -> int:
    """Rank how closely the scope that declares a class encloses a caller.

    ``class_sig`` is split at its last ``.`` into ``parent`` and the class
    name. The result is:

    * ``0`` when there is no parent part (the signature has no ``.`` or
      starts with one);
    * ``len(parent)`` when ``caller_sig`` equals ``parent`` or lives
      beneath it, i.e. starts with ``parent + "."`` — a longer parent
      means a closer enclosing scope;
    * ``-1`` otherwise (the class is not declared in a scope that encloses
      the caller).
    """
    parent_sig = class_sig.rpartition(".")[0]
    if not parent_sig:
        return 0
    # Compare on a dotted-segment boundary: a bare ``startswith(parent)``
    # would wrongly treat ``pkg.mod.foo`` as enclosing ``pkg.mod.foobar.x``.
    if caller_sig == parent_sig or caller_sig.startswith(parent_sig + "."):
        return len(parent_sig)
    return -1


def resolve_unresolved_constructors(symbol_table: Dict[str, PyModule]) -> int:
    """Fill in ``PyCallsite.callee_signature`` for unresolved constructor sites.

    When both Jedi and CodeQL fail to resolve a constructor call (commonly
    for classes nested inside functions or methods, where static-analysis
    points-to is weakest), Jedi still flags the site as
    ``is_constructor_call=True`` with ``method_name`` set to the class's
    short name. This pass does the resolution heuristically:

    1. Build a ``short_name -> [PyClass]`` index from all classes in the
       symbol table.
    2. For each unresolved constructor site under a caller ``C``, look up
       candidates by ``site.method_name`` and prefer the class whose
       declaring scope (its ``signature`` minus the final segment) is the
       longest dotted prefix of ``C.signature`` — this approximates
       Python's LEGB scoping for nested classes.
    3. Set ``callee_signature = f"{class.signature}.__init__"``.

    The call sites in ``symbol_table`` are mutated in place.

    Args:
        symbol_table: Modules to scan; both the class index and the call
            sites come from this mapping.

    Returns:
        The number of sites resolved. Best-effort: a site is left as-is
        when no class has a matching short name or when none of the
        candidates is declared in a scope enclosing the caller. If several
        candidates tie on score, the first one in index order is used.
    """
    by_name: Dict[str, List[PyClass]] = {}
    for cls in iter_classes_in_symbol_table(symbol_table):
        by_name.setdefault(cls.name, []).append(cls)

    resolved = 0
    for caller in iter_callables_in_symbol_table(symbol_table):
        caller_sig = caller.signature

        # Built once per caller; it is only used inside this iteration, so
        # closing over ``caller_sig`` is safe.
        def scope_score(candidate: PyClass, _caller_sig: str = caller_sig) -> int:
            return _constructor_scope_score(candidate.signature, _caller_sig)

        for site in caller.call_sites:
            # Only constructor sites that are still unresolved are touched.
            if not site.is_constructor_call or site.callee_signature:
                continue
            candidates = by_name.get(site.method_name)
            if not candidates:
                continue

            best = max(candidates, key=scope_score)
            if scope_score(best) < 0:
                # No candidate is declared in a scope enclosing the caller.
                continue

            site.callee_signature = f"{best.signature}.__init__"
            resolved += 1

    return resolved
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def merge_edges(*edge_lists: list) -> list:
    """Merge multiple ``List[PyCallEdge]`` into one.

    Edges with the same ``(source, target)`` are coalesced: weights sum and
    provenance is the sorted union. Useful for combining edges produced by
    different backends (e.g. Jedi + CodeQL).

    Every returned edge is a copy with its own ``provenance`` list, sorted
    and de-duplicated whether or not a merge took place, so the input
    edges are never modified and never share a list with the output.
    All other fields (for example ``type``) are taken from the first edge
    seen for a given key.

    Args:
        *edge_lists: Any number of edge sequences; an empty call returns
            an empty list.

    Returns:
        One edge per distinct ``(source, target)``, in first-seen order.
    """
    by_key: Dict[Tuple[str, str], PyCallEdge] = {}
    for edges in edge_lists:
        for e in edges:
            k = (e.source, e.target)
            cur = by_key.get(k)
            if cur is None:
                # ``model_copy()`` is shallow: without replacing the list
                # the copy would alias ``e.provenance``.
                cur = e.model_copy()
                cur.provenance = sorted(set(e.provenance))
                by_key[k] = cur
            else:
                cur.weight += e.weight
                cur.provenance = sorted(set(cur.provenance) | set(e.provenance))
    return list(by_key.values())
|