codeanalyzer-python 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,266 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """Adapters between the persisted call-graph schema and ``networkx``.
18
+
19
+ The schema persists the call graph as ``List[PyCallEdge]`` with signatures
20
+ referencing ``PyCallable`` entries already in the symbol table. These
21
+ helpers rehydrate it into a ``networkx.DiGraph`` for in-process queries
22
+ (paths, callers, callees) and reduce a built ``DiGraph`` back to the
23
+ serializable edge list.
24
+ """
25
+
26
+ from collections import Counter
27
+ from typing import Dict, Iterator, List, Tuple
28
+
29
+ import networkx as nx
30
+
31
+ from codeanalyzer.schema.py_schema import (
32
+ PyApplication,
33
+ PyCallable,
34
+ PyCallEdge,
35
+ PyClass,
36
+ PyModule,
37
+ )
38
+
39
+
40
def _walk_class_callables(cls: PyClass) -> Iterator[PyCallable]:
    """Yield every callable reachable from ``cls``.

    The subtree of each method is emitted first (in ``cls.methods`` order),
    followed by the callables found under each entry of
    ``cls.inner_classes``.
    """
    # Methods and whatever is nested inside them.
    yield from (
        found
        for method in cls.methods.values()
        for found in _walk_callable(method)
    )
    # Then the same walk over every directly nested class.
    yield from (
        found
        for nested_cls in cls.inner_classes.values()
        for found in _walk_class_callables(nested_cls)
    )
45
+
46
+
47
def _walk_callable(c: PyCallable) -> Iterator[PyCallable]:
    """Yield ``c`` itself, then everything nested beneath it.

    After ``c`` come the subtrees of its ``inner_callables``, and finally
    the callables found inside its ``inner_classes``.
    """
    yield c
    for nested_fn in c.inner_callables.values():
        for found in _walk_callable(nested_fn):
            yield found
    for nested_cls in c.inner_classes.values():
        for found in _walk_class_callables(nested_cls):
            yield found
53
+
54
+
55
def _walk_module_callables(module: PyModule) -> Iterator[PyCallable]:
    """Yield every callable defined in ``module``.

    Module-level functions (with their nested callables) come first,
    followed by the callables found inside the module's classes.
    """
    yield from (
        found
        for function in module.functions.values()
        for found in _walk_callable(function)
    )
    yield from (
        found
        for klass in module.classes.values()
        for found in _walk_class_callables(klass)
    )
60
+
61
+
62
def iter_callables_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyCallable]:
    """Yield every ``PyCallable`` in a symbol table, recursively.

    Modules are visited in the mapping's iteration order; within a module,
    functions are walked before classes.
    """
    for module in symbol_table.values():
        # Module-level functions and everything nested inside them.
        for function in module.functions.values():
            yield from _walk_callable(function)
        # Methods (and their nested callables) of the module's classes.
        for klass in module.classes.values():
            yield from _walk_class_callables(klass)
68
+
69
+
70
def _walk_classes_in_class(cls: PyClass) -> Iterator[PyClass]:
    """Yield ``cls`` followed by every class nested anywhere beneath it.

    Directly nested classes are walked first. Classes defined inside
    methods (e.g. a factory method that declares a helper class) follow,
    found by walking each method's callable subtree.
    """
    yield cls
    yield from (
        found
        for nested_cls in cls.inner_classes.values()
        for found in _walk_classes_in_class(nested_cls)
    )
    yield from (
        found
        for method in cls.methods.values()
        for found in _walk_classes_in_callable(method)
    )
78
+
79
+
80
def _walk_classes_in_callable(c: PyCallable) -> Iterator[PyClass]:
    """Yield every class defined inside callable ``c``, at any depth.

    Classes declared directly in ``c`` (and their nested classes) are
    emitted before those found inside ``c``'s inner callables.
    """
    for local_cls in c.inner_classes.values():
        for found in _walk_classes_in_class(local_cls):
            yield found
    for nested_fn in c.inner_callables.values():
        for found in _walk_classes_in_callable(nested_fn):
            yield found
85
+
86
+
87
def iter_classes_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyClass]:
    """Yield every ``PyClass`` in a symbol table, recursively.

    This covers module-level classes, inner classes, classes nested in
    functions, and classes nested in class methods. Per module, the
    module's classes are walked before its functions.
    """
    for module in symbol_table.values():
        yield from (
            found
            for klass in module.classes.values()
            for found in _walk_classes_in_class(klass)
        )
        yield from (
            found
            for function in module.functions.values()
            for found in _walk_classes_in_callable(function)
        )
98
+
99
+
100
def iter_callables(app: PyApplication) -> Iterator[PyCallable]:
    """Yield every ``PyCallable`` in the application, recursively.

    Thin convenience wrapper that walks ``app.symbol_table``.
    """
    for found in iter_callables_in_symbol_table(app.symbol_table):
        yield found
103
+
104
+
105
def callables_by_signature(app: PyApplication) -> Dict[str, PyCallable]:
    """Build a flat ``signature -> PyCallable`` index for O(1) node lookup.

    If two callables share a signature, the one encountered later in the
    walk replaces the earlier entry.
    """
    index: Dict[str, PyCallable] = {}
    for found in iter_callables(app):
        index[found.signature] = found
    return index
108
+
109
+
110
def to_digraph(app: PyApplication) -> nx.DiGraph:
    """Build a ``networkx.DiGraph`` from a ``PyApplication``.

    Nodes are keyed by ``PyCallable.signature``. Every callable found in
    the symbol table becomes a node carrying ``callable=<PyCallable>`` and
    ``ghost=False``. Edge endpoints that are absent from the symbol table
    (RPC targets, third-party libraries, framework callbacks, dynamically
    resolved callees) are added as **ghost** nodes (``callable=None``,
    ``ghost=True``) so the edges are preserved.

    Each edge carries ``type``, ``weight`` and ``provenance`` attributes;
    ``provenance`` is stored as a fresh list so the graph does not share
    that list with the source ``PyCallEdge``.
    """
    graph = nx.DiGraph()

    # Real (in-source) callables first, so ghosts are only created for
    # signatures that genuinely have no symbol-table entry.
    for signature, node in callables_by_signature(app).items():
        graph.add_node(signature, callable=node, ghost=False)

    for edge in app.call_graph:
        if edge.source not in graph.nodes:
            graph.add_node(edge.source, callable=None, ghost=True)
        if edge.target not in graph.nodes:
            graph.add_node(edge.target, callable=None, ghost=True)
        attributes = {
            "type": edge.type,
            "weight": edge.weight,
            "provenance": list(edge.provenance),
        }
        graph.add_edge(edge.source, edge.target, **attributes)

    return graph
139
+
140
+
141
def from_digraph(g: nx.DiGraph) -> list:
    """Reduce a ``DiGraph`` to the persisted ``List[PyCallEdge]`` form.

    Only edges are extracted; nodes are not serialized here, as they are
    expected to already exist as ``PyCallable`` entries in the symbol
    table. Missing edge attributes fall back to ``"CALL_DEP"`` for
    ``type``, ``1`` for ``weight`` and an empty list for ``provenance``.
    """
    return [
        PyCallEdge(
            source=origin,
            target=destination,
            type=attrs.get("type", "CALL_DEP"),
            weight=int(attrs.get("weight", 1)),
            # Copy so the edge does not share the graph's list object.
            provenance=list(attrs.get("provenance", [])),
        )
        for origin, destination, attrs in g.edges(data=True)
    ]
161
+
162
+
163
def jedi_call_graph_edges(
    symbol_table: Dict[str, PyModule],
) -> List[PyCallEdge]:
    """Derive ``PyCallEdge`` entries from Jedi's per-callable ``call_sites``.

    For every ``PyCallable`` in the symbol table, each call site with a
    non-empty ``callee_signature`` contributes an edge
    ``caller.signature -> site.callee_signature``. Sites whose
    ``callee_signature`` is ``None`` or empty are skipped, since they have
    no anchor to put on the graph.

    Edges are coalesced on ``(source, target)``: ``weight`` is the number
    of matching sites. Provenance is always ``["jedi"]``; combine with
    edges from other backends via ``merge_edges``.
    """
    # Counter keeps first-seen order, so the output order follows the
    # order in which distinct (caller, callee) pairs are discovered.
    site_counts: Counter = Counter(
        (caller.signature, site.callee_signature)
        for caller in iter_callables_in_symbol_table(symbol_table)
        for site in caller.call_sites
        if site.callee_signature
    )

    edges = []
    for (origin, destination), occurrences in site_counts.items():
        edges.append(
            PyCallEdge(
                source=origin,
                target=destination,
                weight=occurrences,
                provenance=["jedi"],
            )
        )
    return edges
189
+
190
+
191
def _constructor_scope_score(cls_sig: str, caller_sig: str) -> int:
    """Rank how closely the scope that defines a class encloses a caller.

    The class's *parent scope* is its signature with the last dotted
    component removed.

    Returns:
        ``len(parent_scope)`` when the parent scope is the caller itself or
        a dotted ancestor of it (longer means a closer enclosing scope),
        ``0`` when the class signature has no parent component, and ``-1``
        when the class is not in any scope enclosing the caller.
    """
    parent_sig = cls_sig.rpartition(".")[0]
    if not parent_sig:
        # No parent component: neutral base score, still beats "-1".
        return 0
    # Match on whole dotted components only. A bare ``startswith`` would
    # wrongly treat "pkg.mod" as enclosing a caller in "pkg.mod2.func".
    if caller_sig == parent_sig or caller_sig.startswith(parent_sig + "."):
        return len(parent_sig)
    return -1


def resolve_unresolved_constructors(symbol_table: Dict[str, PyModule]) -> int:
    """Fill in ``PyCallsite.callee_signature`` for unresolved constructor sites.

    Some constructor call sites are flagged ``is_constructor_call=True``
    with ``method_name`` set to the class's short name, yet carry no
    ``callee_signature`` (commonly for classes nested inside functions or
    methods). This pass resolves them heuristically:

    1. Build a ``short_name -> [PyClass]`` index from all classes in the
       symbol table.
    2. For each unresolved constructor site under a caller ``C``, look up
       candidates by ``site.method_name`` and prefer the class whose parent
       scope is the longest dotted ancestor of ``C.signature``, which
       approximates Python's LEGB scoping for nested classes.
    3. Set ``callee_signature = f"{class.signature}.__init__"``.

    The matching call sites are mutated in place.

    Args:
        symbol_table: Mapping of module key to ``PyModule``.

    Returns:
        The number of sites resolved. Best-effort: sites with no class of
        that name, or whose candidates are all outside the caller's
        enclosing scopes, are left as-is.
    """
    by_name: Dict[str, List[PyClass]] = {}
    for cls in iter_classes_in_symbol_table(symbol_table):
        by_name.setdefault(cls.name, []).append(cls)

    resolved = 0
    for caller in iter_callables_in_symbol_table(symbol_table):
        caller_sig = caller.signature
        for site in caller.call_sites:
            # Only constructor sites that are still unresolved.
            if not site.is_constructor_call or site.callee_signature:
                continue
            candidates = by_name.get(site.method_name)
            if not candidates:
                continue

            # Score each candidate once; ``max`` keeps the first candidate
            # on ties, i.e. symbol-table discovery order.
            scored = [
                (_constructor_scope_score(c.signature, caller_sig), c)
                for c in candidates
            ]
            best_score, best = max(scored, key=lambda pair: pair[0])
            if best_score < 0:
                # No candidate is reachable from the caller's scope.
                continue

            site.callee_signature = f"{best.signature}.__init__"
            resolved += 1

    return resolved
247
+
248
+
249
def merge_edges(*edge_lists: list) -> list:
    """Merge multiple ``List[PyCallEdge]`` into one.

    Edges with the same ``(source, target)`` are coalesced: weights sum and
    provenance becomes the sorted union. Useful for combining edges
    produced by different backends (e.g. Jedi + CodeQL).

    Args:
        *edge_lists: Any number of edge lists; they are processed in the
            order given.

    Returns:
        A new list with one edge per distinct ``(source, target)``, in
        first-seen order. The input edges are never mutated, and no
        returned edge shares its ``provenance`` list with an input edge.
    """
    merged: Dict[Tuple[str, str], PyCallEdge] = {}
    for edges in edge_lists:
        for edge in edges:
            key = (edge.source, edge.target)
            existing = merged.get(key)
            if existing is None:
                fresh = edge.model_copy()
                # ``model_copy()`` is shallow: give the copy its own list
                # so later in-place changes to the result cannot leak back
                # into the caller's edge.
                fresh.provenance = list(edge.provenance)
                merged[key] = fresh
            else:
                existing.weight += edge.weight
                existing.provenance = sorted(
                    set(existing.provenance) | set(edge.provenance)
                )
    return list(merged.values())