codeanalyzer-python 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeanalyzer/__main__.py +99 -11
- codeanalyzer/core.py +154 -19
- codeanalyzer/neo4j/__init__.py +46 -0
- codeanalyzer/neo4j/bolt.py +223 -0
- codeanalyzer/neo4j/catalog.py +245 -0
- codeanalyzer/neo4j/cypher.py +138 -0
- codeanalyzer/neo4j/emit.py +74 -0
- codeanalyzer/neo4j/project.py +322 -0
- codeanalyzer/neo4j/rows.py +176 -0
- codeanalyzer/neo4j/schema.py +39 -0
- codeanalyzer/options/__init__.py +2 -2
- codeanalyzer/options/options.py +20 -1
- codeanalyzer/schema/py_schema.py +20 -0
- codeanalyzer/semantic_analysis/call_graph.py +266 -0
- codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +318 -69
- codeanalyzer/semantic_analysis/codeql/codeql_loader.py +32 -4
- codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +51 -31
- codeanalyzer/syntactic_analysis/symbol_table_builder.py +87 -4
- codeanalyzer_python-0.2.0.dist-info/METADATA +393 -0
- codeanalyzer_python-0.2.0.dist-info/RECORD +39 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/WHEEL +1 -1
- codeanalyzer_python-0.2.0.dist-info/entry_points.txt +3 -0
- codeanalyzer/semantic_analysis/wala/__init__.py +0 -15
- codeanalyzer_python-0.1.13.dist-info/METADATA +0 -414
- codeanalyzer_python-0.1.13.dist-info/RECORD +0 -31
- codeanalyzer_python-0.1.13.dist-info/entry_points.txt +0 -2
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""``project()`` — the pure projection from the canonical :class:`PyApplication`
|
|
18
|
+
IR to graph rows. It walks the same recursive symbol table the call-graph builder
|
|
19
|
+
walks, but instead of collecting callables it emits nodes + edges. No I/O: the
|
|
20
|
+
writers (cypher snapshot / bolt incremental) consume the returned
|
|
21
|
+
:class:`GraphRows`.
|
|
22
|
+
|
|
23
|
+
Modelling decisions (mirror of the TypeScript backend):
|
|
24
|
+
- signature-keyed declarations (PyClass, PyCallable) carry a shared ``:PySymbol``
|
|
25
|
+
label (the global-identity / MERGE key).
|
|
26
|
+
- call sites, decorators, class attributes and variables are first-class nodes.
|
|
27
|
+
- call-graph endpoints absent from the symbol table become ``:PyExternal`` ghost
|
|
28
|
+
nodes, so RPC / third-party / framework edges are preserved (matching the
|
|
29
|
+
analyzer's own ghost-node behaviour).
|
|
30
|
+
- every project-owned node carries an internal ``_module`` provenance prop, so
|
|
31
|
+
the incremental writer can delete exactly what a re-analyzed module emitted.
|
|
32
|
+
"""
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import json
|
|
36
|
+
from pathlib import Path
|
|
37
|
+
from typing import Any, List, Optional
|
|
38
|
+
|
|
39
|
+
from codeanalyzer.neo4j.catalog import SCHEMA_VERSION
|
|
40
|
+
from codeanalyzer.neo4j.rows import GraphRows, NodeRef, Props, RowBuilder, prune
|
|
41
|
+
from codeanalyzer.schema import (
|
|
42
|
+
PyApplication,
|
|
43
|
+
PyCallable,
|
|
44
|
+
PyClass,
|
|
45
|
+
PyClassAttribute,
|
|
46
|
+
PyComment,
|
|
47
|
+
PyModule,
|
|
48
|
+
PyVariableDeclaration,
|
|
49
|
+
)
|
|
50
|
+
from codeanalyzer.schema.py_schema import PyCallsite
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def project(app: PyApplication, app_name: str) -> GraphRows:
    """Project a :class:`PyApplication` into deduplicated graph rows.

    Emits one ``:PyApplication`` root node keyed by ``app_name``, one
    ``:PyModule`` node per symbol-table entry (attached with ``PY_HAS_MODULE``)
    plus everything the module body declares, and then one ``PY_CALLS`` edge
    per call-graph entry.

    Args:
        app: The analysis result; its ``symbol_table`` and ``call_graph`` are read.
        app_name: Value of the ``name`` key on the application root node.

    Returns:
        The rows produced by ``RowBuilder.finish()``.
    """
    builder = RowBuilder()
    root = builder.node(
        ["PyApplication"], "name", app_name, {"schema_version": SCHEMA_VERSION}
    )

    # Pass 1: modules and the declarations nested inside them.
    for file_key, module in app.symbol_table.items():
        module_ref = builder.node(
            ["PyModule"], "file_key", file_key, _module_props(module, file_key)
        )
        builder.edge("PY_HAS_MODULE", root, module_ref)
        _project_module_body(builder, file_key, module_ref, module)

    # Pass 2: the aggregated :PY_CALLS twin. It runs after pass 1 because
    # _call_endpoint decides "known vs. :PyExternal ghost" from what was
    # already emitted.
    for call in app.call_graph:
        caller = _call_endpoint(builder, call.source)
        callee = _call_endpoint(builder, call.target)
        provenance = list(call.provenance or [])
        builder.edge("PY_CALLS", caller, callee, _call_edge_props(call.weight, provenance))

    return builder.finish()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _sym(signature: str) -> NodeRef:
    """Return the reference addressing the ``:PySymbol`` node keyed by ``signature``."""
    return NodeRef(label="PySymbol", key_prop="signature", value=signature)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef:
|
|
78
|
+
"""A call-graph endpoint: a known callable already emitted, or a phantom
|
|
79
|
+
:PyExternal symbol materialized on demand for a ghost target."""
|
|
80
|
+
if b.has_key(signature):
|
|
81
|
+
return _sym(signature)
|
|
82
|
+
name = signature.rsplit(".", 1)[-1] if "." in signature else signature
|
|
83
|
+
return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": name})
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ----------------------------------------------------------------------------------------------
|
|
87
|
+
# Module body
|
|
88
|
+
# ----------------------------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _project_module_body(b: RowBuilder, file_key: str, mod_ref: NodeRef, mod: PyModule) -> None:
    """Emit everything a module declares at top level.

    Functions and classes hang off the module with ``PY_DECLARES``; module
    variables are owned by the module and use ``file_key`` as their owner id;
    imports are projected last.
    """
    for function in (mod.functions or {}).values():
        _project_callable(b, file_key, mod_ref, "PY_DECLARES", function)

    for klass in (mod.classes or {}).values():
        _project_class(b, file_key, mod_ref, "PY_DECLARES", klass)

    for variable in mod.variables or []:
        _project_variable(b, file_key, mod_ref, file_key, variable)

    _project_imports(b, mod_ref, mod)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _project_imports(b: RowBuilder, mod_ref: NodeRef, mod: PyModule) -> None:
|
|
102
|
+
# Per-target-module aggregation: collapse all bindings for a given imported
|
|
103
|
+
# module into one PY_IMPORTS edge to a shared :PyPackage node.
|
|
104
|
+
agg: dict = {}
|
|
105
|
+
for im in mod.imports or []:
|
|
106
|
+
if not im.module:
|
|
107
|
+
continue # relative `from . import x` — no resolvable package
|
|
108
|
+
a = agg.setdefault(im.module, {"names": set(), "aliases": set()})
|
|
109
|
+
if im.name:
|
|
110
|
+
a["names"].add(im.name)
|
|
111
|
+
if im.alias:
|
|
112
|
+
a["aliases"].add(im.alias)
|
|
113
|
+
for module_name, a in agg.items():
|
|
114
|
+
pkg = b.node(["PyPackage"], "name", module_name, {})
|
|
115
|
+
b.edge(
|
|
116
|
+
"PY_IMPORTS",
|
|
117
|
+
mod_ref,
|
|
118
|
+
pkg,
|
|
119
|
+
prune(
|
|
120
|
+
{
|
|
121
|
+
"imported_names": sorted(a["names"]) or None,
|
|
122
|
+
"aliases": sorted(a["aliases"]) or None,
|
|
123
|
+
}
|
|
124
|
+
),
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ----------------------------------------------------------------------------------------------
|
|
129
|
+
# Declarations
|
|
130
|
+
# ----------------------------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _project_class(
    b: RowBuilder, file_key: str, parent: NodeRef, parent_rel: str, cl: PyClass
) -> None:
    """Emit a class node, attach it to ``parent`` and recurse into its members.

    The class becomes a ``:PySymbol:PyClass`` node keyed by its signature.
    Each base class yields a ``PY_EXTENDS`` edge through ``edge_to_symbol``;
    methods, attributes and inner classes are then projected in that order.
    """
    class_ref = b.node(
        ["PySymbol", "PyClass"], "signature", cl.signature, _class_props(cl, file_key)
    )
    b.edge(parent_rel, parent, class_ref)

    for base_signature in cl.base_classes or []:
        b.edge_to_symbol("PY_EXTENDS", class_ref, base_signature)

    for method in (cl.methods or {}).values():
        _project_callable(b, file_key, class_ref, "PY_HAS_METHOD", method)

    for attribute in (cl.attributes or {}).values():
        _project_attribute(b, file_key, class_ref, cl.signature, attribute)

    for inner in (cl.inner_classes or {}).values():
        _project_class(b, file_key, class_ref, "PY_DECLARES", inner)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _project_callable(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_rel: str, c: PyCallable
) -> None:
    """Emit a callable node, attach it to ``owner`` and project what it contains.

    The callable becomes a ``:PySymbol:PyCallable`` node keyed by its
    signature. Decorators, call sites, local variables, inner callables and
    inner classes are projected afterwards, in that order.
    """
    callable_ref = b.node(
        ["PySymbol", "PyCallable"], "signature", c.signature, _callable_props(c, file_key)
    )
    b.edge(owner_rel, owner, callable_ref)

    for decorator in c.decorators or []:
        _project_decorator(b, callable_ref, decorator)

    for site in c.call_sites or []:
        # A call site lives in its callable's file, so the id is built from the
        # relative file key plus the source span; that keeps ids portable.
        site_id = f"{file_key}#{site.start_line}:{site.start_column}-{site.end_line}:{site.end_column}"
        site_ref = b.node(["PyCallSite"], "id", site_id, _call_site_props(site, file_key))
        b.edge("PY_HAS_CALLSITE", callable_ref, site_ref)
        if site.callee_signature:
            b.edge_to_symbol("PY_RESOLVES_TO", site_ref, site.callee_signature)

    for local in c.local_variables or []:
        _project_variable(b, file_key, callable_ref, c.signature, local)

    for nested_callable in (c.inner_callables or {}).values():
        _project_callable(b, file_key, callable_ref, "PY_DECLARES", nested_callable)

    for nested_class in (c.inner_classes or {}).values():
        _project_class(b, file_key, callable_ref, "PY_DECLARES", nested_class)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _project_attribute(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_sig: str, a: PyClassAttribute
) -> None:
    """Emit a ``:PyAttribute`` node and link it to ``owner`` with ``PY_HAS_ATTRIBUTE``.

    The node id is the owner signature and the attribute name joined by a dot.
    """
    attribute_id = f"{owner_sig}.{a.name}"
    attribute_props = _attribute_props(a, attribute_id, file_key)
    attribute_ref = b.node(["PyAttribute"], "id", attribute_id, attribute_props)
    b.edge("PY_HAS_ATTRIBUTE", owner, attribute_ref)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _project_variable(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_id: str, v: PyVariableDeclaration
) -> None:
    """Emit a ``:PyVariable`` node and link it to ``owner`` with ``PY_DECLARES_VAR``.

    The node id combines the owner id, the variable name and its start line.
    """
    variable_id = f"{owner_id}#{v.name}@{v.start_line}"
    variable_props = _variable_props(v, variable_id, file_key)
    variable_ref = b.node(["PyVariable"], "id", variable_id, variable_props)
    b.edge("PY_DECLARES_VAR", owner, variable_ref)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _project_decorator(b: RowBuilder, on: NodeRef, decorator: str) -> None:
|
|
192
|
+
dec = b.node(["PyDecorator"], "name", decorator, {"name": decorator})
|
|
193
|
+
b.edge("PY_DECORATED_BY", on, dec)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ----------------------------------------------------------------------------------------------
|
|
197
|
+
# Property flattening
|
|
198
|
+
# ----------------------------------------------------------------------------------------------
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _module_props(mod: PyModule, file_key: str) -> Props:
    """Flatten a module into node properties; ``None`` values are pruned.

    ``_module`` records ``file_key`` as the provenance of the node.
    """
    raw = {
        "module_name": mod.module_name,
        "content_hash": mod.content_hash,
        "last_modified": mod.last_modified,
        "file_size": mod.file_size,
    }
    raw["_module"] = file_key
    return prune(raw)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _class_props(cl: PyClass, file_key: str) -> Props:
    """Flatten a class into node properties; ``None`` values are pruned.

    ``base_classes`` is always a list (possibly empty) and ``docstring`` is
    derived from the class comments.
    """
    raw = {
        "name": cl.name,
        "code": cl.code,
        "base_classes": list(cl.base_classes or []),
        "docstring": _docstring_of(cl.comments),
        "start_line": cl.start_line,
        "end_line": cl.end_line,
    }
    raw["_module"] = file_key
    return prune(raw)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _callable_props(c: PyCallable, file_key: str) -> Props:
    """Flatten a callable into node properties; ``None`` values are pruned.

    Parameters and accessed symbols are not legal property shapes as-is, so
    they are stored JSON-encoded under ``*_json`` keys (absent when empty).
    """
    raw = {
        "name": c.name,
        "path": c.path,
        "return_type": c.return_type,
        "cyclomatic_complexity": c.cyclomatic_complexity,
        "code": c.code,
        "code_start_line": c.code_start_line,
        "start_line": c.start_line,
        "end_line": c.end_line,
        "docstring": _docstring_of(c.comments),
        "decorators": list(c.decorators or []),
        "parameters_json": _stringify_if(c.parameters),
        "accessed_symbols_json": _stringify_if(c.accessed_symbols),
    }
    raw["_module"] = file_key
    return prune(raw)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _attribute_props(a: PyClassAttribute, attr_id: str, file_key: str) -> Props:
    """Flatten a class attribute into node properties; ``None`` values are pruned."""
    raw = {
        "id": attr_id,
        "name": a.name,
        "type": a.type,
        "docstring": _docstring_of(a.comments),
        "start_line": a.start_line,
        "end_line": a.end_line,
    }
    raw["_module"] = file_key
    return prune(raw)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _variable_props(v: PyVariableDeclaration, var_id: str, file_key: str) -> Props:
    """Flatten a variable declaration into node properties; ``None`` values are pruned."""
    raw = {
        "id": var_id,
        "name": v.name,
        "type": v.type,
        "initializer": v.initializer,
        "scope": v.scope,
        "start_line": v.start_line,
        "end_line": v.end_line,
    }
    raw["_module"] = file_key
    return prune(raw)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _call_site_props(s: PyCallsite, file_key: str) -> Props:
    """Flatten a call site into node properties; ``None`` values are pruned.

    The ``id`` is rebuilt here from ``file_key`` and the call site's source
    span (start line/column to end line/column).
    """
    span = f"{s.start_line}:{s.start_column}-{s.end_line}:{s.end_column}"
    raw = {
        "id": f"{file_key}#{span}",
        "method_name": s.method_name,
        "receiver_expr": s.receiver_expr,
        "receiver_type": s.receiver_type,
        "argument_types": list(s.argument_types or []),
        "return_type": s.return_type,
        "callee_signature": s.callee_signature,
        "is_constructor_call": s.is_constructor_call,
        "start_line": s.start_line,
        "start_column": s.start_column,
        "end_line": s.end_line,
        "end_column": s.end_column,
    }
    raw["_module"] = file_key
    return prune(raw)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _call_edge_props(weight: int, provenance: List[str]) -> Props:
    """Build the property map of a ``PY_CALLS`` edge.

    ``provenance`` is copied so the returned map never aliases the caller's list.
    """
    raw = {"weight": weight, "provenance": list(provenance)}
    return prune(raw)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _docstring_of(comments: Optional[List[PyComment]]) -> Optional[str]:
|
|
302
|
+
docs = [c.content for c in (comments or []) if c.is_docstring]
|
|
303
|
+
return "\n".join(docs) if docs else None
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _stringify_if(value: Any) -> Optional[str]:
|
|
307
|
+
"""JSON-encode a list/dict of pydantic models, or None when empty."""
|
|
308
|
+
if value is None:
|
|
309
|
+
return None
|
|
310
|
+
if isinstance(value, (list, dict)) and len(value) == 0:
|
|
311
|
+
return None
|
|
312
|
+
return json.dumps(value, default=_jsonable, sort_keys=True)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _jsonable(o: Any) -> Any:
|
|
316
|
+
if hasattr(o, "model_dump"):
|
|
317
|
+
return o.model_dump()
|
|
318
|
+
if hasattr(o, "dict"):
|
|
319
|
+
return o.dict()
|
|
320
|
+
if isinstance(o, Path):
|
|
321
|
+
return str(o)
|
|
322
|
+
return str(o)
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""The output-agnostic intermediate between :func:`project` and the two writers
|
|
18
|
+
(cypher snapshot / bolt incremental). Pure data — no I/O, no driver. A
|
|
19
|
+
:class:`GraphRows` is a deterministic, deduped bag of nodes and edges that both
|
|
20
|
+
writers consume identically.
|
|
21
|
+
|
|
22
|
+
Property values are restricted to Neo4j-legal shapes: primitives and homogeneous
|
|
23
|
+
arrays of primitives. ``None`` values are pruned (in Neo4j a null property is
|
|
24
|
+
simply absence).
|
|
25
|
+
"""
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from typing import Dict, List, Optional, Union
|
|
30
|
+
|
|
31
|
+
# A property value: a primitive, or a homogeneous list of primitives.
|
|
32
|
+
Scalar = Union[str, int, float, bool]
|
|
33
|
+
Prop = Union[Scalar, List[str], List[int], List[float], List[bool]]
|
|
34
|
+
Props = Dict[str, Prop]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class NodeRef:
    """Address of an edge endpoint: which label/key property to MATCH on, and the value.

    Attributes:
        label: The label carrying the uniqueness constraint
            (e.g. "PySymbol", "PyModule").
        key_prop: The key property name: "signature" | "file_key" | "name" | "id".
        value: The key property value identifying the node.
    """

    label: str
    key_prop: str
    value: str
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class NodeRow:
    """One node to write.

    Attributes:
        labels: ``labels[0]`` is the constrained MERGE label; the rest are SET
            as extra labels.
        key_prop: Name of the key property.
        value: Value of the key property.
        props: The node's remaining properties.
    """

    labels: List[str]
    key_prop: str
    value: str
    props: Props
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class EdgeRow:
    """One relationship to write.

    Attributes:
        type: The relationship type.
        from_ref: Reference to the start node.
        to_ref: Reference to the end node.
        props: The relationship's properties.
    """

    type: str
    from_ref: NodeRef
    to_ref: NodeRef
    props: Props
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class GraphRows:
    """The node rows and edge rows handed to a writer.

    Both lists default to a fresh empty list per instance.
    """

    nodes: List[NodeRow] = field(default_factory=list)
    edges: List[EdgeRow] = field(default_factory=list)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def prune(p: Dict[str, Optional[Prop]]) -> Props:
    """Return a copy of ``p`` without its ``None`` entries.

    In Neo4j a null property means "absent", so one is never stored. Only
    ``None`` is removed: empty lists and other falsy values are kept (a
    present-but-empty array is legal).
    """
    kept: Props = {}
    for key, value in p.items():
        if value is None:
            continue
        kept[key] = value
    return kept
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class RowBuilder:
    """Accumulates nodes/edges with ``MERGE`` semantics in memory.

    The same node touched many times (a hot external symbol, a canonical
    decorator) collapses to one row, and cross-reference edges to a target
    that never materialized are dropped (the "edge-only-when-resolved" rule).
    """

    def __init__(self) -> None:
        self._nodes: Dict[str, NodeRow] = {}  # key: f"{labels[0]} {value}"
        self._edges: List[EdgeRow] = []
        self._deferred: List[EdgeRow] = []  # edges gated against node existence at finish()
        self._keys: set = set()  # every node value seen, regardless of label

    @staticmethod
    def _node_id(label: str, value: str) -> str:
        """Identity of a node inside the builder: constrained label + key value."""
        return f"{label} {value}"

    def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef:
        """Upsert a node and return a reference to it.

        Re-seeing the same ``(labels[0], value)`` merges props (last write
        wins) and unions labels — the in-memory analog of
        ``MERGE (n:Label {key}) SET n += props``.

        Args:
            labels: Node labels; ``labels[0]`` is the identity label.
            key_prop: Name of the key property.
            value: Value of the key property.
            props: Properties to set; copied on first insert.

        Returns:
            A ``NodeRef`` addressing the node by ``labels[0]`` and ``value``.
        """
        node_id = self._node_id(labels[0], value)
        existing = self._nodes.get(node_id)
        if existing is None:
            self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props))
        else:
            existing.props.update(props)
            for label in labels:
                if label not in existing.labels:
                    existing.labels.append(label)
        self._keys.add(value)
        return NodeRef(labels[0], key_prop, value)

    def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None:
        """Record an edge whose endpoints are known to exist (both ends emitted this run)."""
        self._edges.append(EdgeRow(type_, from_ref, to_ref, dict(props or {})))

    def edge_to_symbol(
        self, type_: str, from_ref: NodeRef, target_signature: str, props: Optional[Props] = None
    ) -> None:
        """Record an edge to a ``:PySymbol`` target that may not be in the graph.

        The target may be external/library code. The edge is deferred and kept
        by :meth:`finish` only if a ``:PySymbol`` node with that signature was
        actually emitted — so PY_EXTENDS / PY_RESOLVES_TO never dangle (the
        string fallback lives on the source node's props).
        """
        target = NodeRef("PySymbol", "signature", target_signature)
        self._deferred.append(EdgeRow(type_, from_ref, target, dict(props or {})))

    def has_key(self, value: str, label: Optional[str] = None) -> bool:
        """Report whether a node with key value ``value`` has been emitted.

        Args:
            value: The key value to look for.
            label: When given, only a node whose identity label (``labels[0]``)
                equals ``label`` counts. When omitted, a node of any label
                with that value counts.
        """
        if label is None:
            return value in self._keys
        return self._node_id(label, value) in self._nodes

    def finish(self) -> GraphRows:
        """Return the accumulated rows in a deterministic order.

        Deferred edges are kept only when their target node exists under the
        target's own label. A bare value match is not enough: an attribute id,
        decorator name or package name can equal a signature without any
        ``:PySymbol`` node existing, and the edge would then dangle.

        The builder's state is not modified, so calling this again returns
        the same rows instead of duplicating the resolved deferred edges.
        """
        resolved = [
            e
            for e in self._deferred
            if self._node_id(e.to_ref.label, e.to_ref.value) in self._nodes
        ]
        nodes = sorted(self._nodes.values(), key=lambda n: self._node_id(n.labels[0], n.value))
        edges = sorted(
            self._edges + resolved,
            key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}",
        )
        return GraphRows(nodes, edges)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ----------------------------------------------------------------------------------------------
|
|
137
|
+
# Cypher literal rendering (used by the snapshot writer; the bolt writer passes params instead).
|
|
138
|
+
# ----------------------------------------------------------------------------------------------
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def cypher_value(v: Prop) -> str:
    """Render a property value as a Cypher literal.

    Booleans become ``true``/``false``, strings are quoted and escaped,
    integers and finite floats are printed as numbers, lists are rendered
    element by element, and everything else — including NaN and the
    infinities — becomes ``null``.
    """
    # bool is tested first because it is a subclass of int.
    if isinstance(v, bool):
        if v:
            return "true"
        return "false"
    if isinstance(v, str):
        return _cypher_string(v)
    if isinstance(v, float):
        not_finite = v != v or v in (float("inf"), float("-inf"))
        return "null" if not_finite else repr(v)
    if isinstance(v, int):
        return str(v)
    if isinstance(v, list):
        rendered = [cypher_value(item) for item in v]
        return "[" + ", ".join(rendered) + "]"
    return "null"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def cypher_map(props: Props) -> str:
    """Render a props map as a Cypher map literal: ``{key: value, ...}``.

    Keys are written verbatim, so they must already be valid identifiers.
    """
    entries = []
    for key, value in props.items():
        entries.append(f"{key}: {cypher_value(value)}")
    return "{" + ", ".join(entries) + "}"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _cypher_string(s: str) -> str:
|
|
164
|
+
escaped = (
|
|
165
|
+
s.replace("\\", "\\\\")
|
|
166
|
+
.replace("'", "\\'")
|
|
167
|
+
.replace("\n", "\\n")
|
|
168
|
+
.replace("\r", "\\r")
|
|
169
|
+
.replace("\t", "\\t")
|
|
170
|
+
)
|
|
171
|
+
return f"'{escaped}'"
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def chunk(items: list, size: int) -> list:
    """Split a list into chunks of at most ``size`` (UNWIND batch sizing).

    Args:
        items: The list to split; it is not modified.
        size: Maximum chunk length; must be at least 1.

    Returns:
        A list of consecutive slices of ``items`` (empty when ``items`` is empty).

    Raises:
        ValueError: If ``size`` is smaller than 1. A negative size would
            otherwise yield an empty result and silently drop every item.
    """
    if size < 1:
        raise ValueError(f"chunk size must be at least 1, got {size}")
    return [items[start : start + size] for start in range(0, len(items), size)]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""The Cypher DDL — uniqueness constraints and indexes — shared by both writers.
|
|
18
|
+
Run BEFORE any load so MERGE uses an index seek (not a label scan) and the
|
|
19
|
+
identity invariant is enforced by the database. Every statement is idempotent
|
|
20
|
+
(``IF NOT EXISTS``).
|
|
21
|
+
"""
|
|
22
|
+
from typing import List
|
|
23
|
+
|
|
24
|
+
# One uniqueness constraint per keyed node label; each statement names the
# label and the property that must be unique. Every statement uses
# ``IF NOT EXISTS``.
CONSTRAINTS: List[str] = [
    "CREATE CONSTRAINT py_symbol_sig IF NOT EXISTS FOR (s:PySymbol) REQUIRE s.signature IS UNIQUE",
    "CREATE CONSTRAINT py_app_name IF NOT EXISTS FOR (a:PyApplication) REQUIRE a.name IS UNIQUE",
    "CREATE CONSTRAINT py_module_key IF NOT EXISTS FOR (m:PyModule) REQUIRE m.file_key IS UNIQUE",
    "CREATE CONSTRAINT py_package_name IF NOT EXISTS FOR (p:PyPackage) REQUIRE p.name IS UNIQUE",
    "CREATE CONSTRAINT py_decorator_name IF NOT EXISTS FOR (d:PyDecorator) REQUIRE d.name IS UNIQUE",
    "CREATE CONSTRAINT py_callsite_id IF NOT EXISTS FOR (c:PyCallSite) REQUIRE c.id IS UNIQUE",
    "CREATE CONSTRAINT py_attribute_id IF NOT EXISTS FOR (a:PyAttribute) REQUIRE a.id IS UNIQUE",
    "CREATE CONSTRAINT py_variable_id IF NOT EXISTS FOR (v:PyVariable) REQUIRE v.id IS UNIQUE",
]
|
|
34
|
+
|
|
35
|
+
# Secondary indexes: name lookups on callables and classes, plus a full-text
# index over callable code and docstrings. Every statement uses
# ``IF NOT EXISTS``.
INDEXES: List[str] = [
    "CREATE INDEX py_callable_name IF NOT EXISTS FOR (c:PyCallable) ON (c.name)",
    "CREATE INDEX py_class_name IF NOT EXISTS FOR (c:PyClass) ON (c.name)",
    "CREATE FULLTEXT INDEX py_code_fts IF NOT EXISTS FOR (c:PyCallable) ON EACH [c.code, c.docstring]",
]
|
codeanalyzer/options/__init__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
from .options import AnalysisOptions
|
|
1
|
+
from .options import AnalysisOptions, EmitTarget, OutputFormat
|
|
2
2
|
|
|
3
|
-
__all__ = ["AnalysisOptions"]
|
|
3
|
+
__all__ = ["AnalysisOptions", "EmitTarget", "OutputFormat"]
|
codeanalyzer/options/options.py
CHANGED
|
@@ -9,12 +9,31 @@ class OutputFormat(str, Enum):
|
|
|
9
9
|
MSGPACK = "msgpack"
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
class EmitTarget(str, Enum):
    """What ``--emit`` asks the tool to produce.

    Members:
        JSON: ``json`` — the canonical ``analysis.json`` (symbol table + call graph).
        NEO4J: ``neo4j`` — the analysis projected into a labeled property
            graph: a ``graph.cypher`` snapshot, or a live Bolt push when
            ``--neo4j-uri`` is given.
        SCHEMA: ``schema`` — the machine-readable, version-stamped Neo4j
            schema contract.
    """

    JSON = "json"
    NEO4J = "neo4j"
    SCHEMA = "schema"
|
|
24
|
+
|
|
25
|
+
|
|
12
26
|
@dataclass
|
|
13
27
|
class AnalysisOptions:
|
|
14
28
|
input: Path
|
|
15
29
|
output: Optional[Path] = None
|
|
16
30
|
format: OutputFormat = OutputFormat.JSON
|
|
17
|
-
|
|
31
|
+
emit: EmitTarget = EmitTarget.JSON
|
|
32
|
+
app_name: Optional[str] = None
|
|
33
|
+
neo4j_uri: Optional[str] = None
|
|
34
|
+
neo4j_user: str = "neo4j"
|
|
35
|
+
neo4j_password: str = "neo4j"
|
|
36
|
+
neo4j_database: Optional[str] = None
|
|
18
37
|
using_codeql: bool = False
|
|
19
38
|
using_ray: bool = False
|
|
20
39
|
rebuild_analysis: bool = False
|
codeanalyzer/schema/py_schema.py
CHANGED
|
@@ -339,9 +339,29 @@ class PyModule(BaseModel):
|
|
|
339
339
|
file_size: Optional[int] = None
|
|
340
340
|
|
|
341
341
|
|
|
342
|
+
@builder
@msgpk
class PyCallEdge(BaseModel):
    """Identity-only call-graph edge with weight.

    Mirrors Java's ``CallDependency``. ``source`` and ``target`` are
    ``PyCallable.signature`` strings — nodes of the graph are the existing
    ``PyCallable`` entries in the symbol table, not a separate vertex type.
    Rich per-call metadata (receiver, arguments, location, ...) lives on
    ``PyCallsite`` inside the source ``PyCallable.call_sites``.
    """

    source: str  # caller's PyCallable.signature
    target: str  # callee's PyCallable.signature
    # Fixed tag: "CALL_DEP" is the only value the annotation admits.
    type: Literal["CALL_DEP"] = "CALL_DEP"
    # Defaults to 1. NOTE(review): presumably the number of call sites folded
    # into this edge — confirm against the call-graph builder.
    weight: int = 1
    # Restricted to "jedi", "codeql" and "joern"; presumably the analyses that
    # reported the edge — TODO confirm.
    provenance: List[Literal["jedi", "codeql", "joern"]] = []
|
|
359
|
+
|
|
360
|
+
|
|
342
361
|
@builder
|
|
343
362
|
@msgpk
|
|
344
363
|
class PyApplication(BaseModel):
|
|
345
364
|
"""Represents a Python application."""
|
|
346
365
|
|
|
347
366
|
symbol_table: Dict[str, PyModule]
|
|
367
|
+
call_graph: List[PyCallEdge] = []
|