codeanalyzer-python 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeanalyzer/__main__.py +99 -6
- codeanalyzer/neo4j/__init__.py +46 -0
- codeanalyzer/neo4j/bolt.py +223 -0
- codeanalyzer/neo4j/catalog.py +245 -0
- codeanalyzer/neo4j/cypher.py +138 -0
- codeanalyzer/neo4j/emit.py +74 -0
- codeanalyzer/neo4j/project.py +322 -0
- codeanalyzer/neo4j/rows.py +176 -0
- codeanalyzer/neo4j/schema.py +39 -0
- codeanalyzer/options/__init__.py +2 -2
- codeanalyzer/options/options.py +20 -0
- codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +109 -27
- codeanalyzer_python-0.2.0.dist-info/METADATA +393 -0
- {codeanalyzer_python-0.1.14.dist-info → codeanalyzer_python-0.2.0.dist-info}/RECORD +18 -10
- {codeanalyzer_python-0.1.14.dist-info → codeanalyzer_python-0.2.0.dist-info}/WHEEL +1 -1
- codeanalyzer_python-0.2.0.dist-info/entry_points.txt +3 -0
- codeanalyzer_python-0.1.14.dist-info/METADATA +0 -392
- codeanalyzer_python-0.1.14.dist-info/entry_points.txt +0 -2
- {codeanalyzer_python-0.1.14.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {codeanalyzer_python-0.1.14.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/NOTICE +0 -0
codeanalyzer/__main__.py
CHANGED
|
@@ -7,13 +7,18 @@ from codeanalyzer.core import Codeanalyzer
|
|
|
7
7
|
from codeanalyzer.utils import _set_log_level, logger
|
|
8
8
|
from codeanalyzer.config import OutputFormat
|
|
9
9
|
from codeanalyzer.schema import model_dump_json
|
|
10
|
-
from codeanalyzer.options import AnalysisOptions
|
|
10
|
+
from codeanalyzer.options import AnalysisOptions, EmitTarget
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def main(
|
|
14
14
|
input: Annotated[
|
|
15
|
-
Path,
|
|
16
|
-
|
|
15
|
+
Optional[Path],
|
|
16
|
+
typer.Option(
|
|
17
|
+
"-i",
|
|
18
|
+
"--input",
|
|
19
|
+
help="Path to the project root directory (not required for --emit schema).",
|
|
20
|
+
),
|
|
21
|
+
] = None,
|
|
17
22
|
output: Annotated[
|
|
18
23
|
Optional[Path],
|
|
19
24
|
typer.Option("-o", "--output", help="Output directory for artifacts."),
|
|
@@ -23,10 +28,61 @@ def main(
|
|
|
23
28
|
typer.Option(
|
|
24
29
|
"-f",
|
|
25
30
|
"--format",
|
|
26
|
-
help="Output format: json or msgpack.",
|
|
31
|
+
help="Output format for --emit json: json or msgpack.",
|
|
27
32
|
case_sensitive=False,
|
|
28
33
|
),
|
|
29
34
|
] = OutputFormat.JSON,
|
|
35
|
+
emit: Annotated[
|
|
36
|
+
EmitTarget,
|
|
37
|
+
typer.Option(
|
|
38
|
+
"--emit",
|
|
39
|
+
help="Output target: json (analysis.json, default) | neo4j (graph.cypher or live "
|
|
40
|
+
"Bolt push) | schema (the Neo4j schema.json contract).",
|
|
41
|
+
case_sensitive=False,
|
|
42
|
+
),
|
|
43
|
+
] = EmitTarget.JSON,
|
|
44
|
+
app_name: Annotated[
|
|
45
|
+
Optional[str],
|
|
46
|
+
typer.Option(
|
|
47
|
+
"--app-name",
|
|
48
|
+
help="Logical application name for the graph :PyApplication anchor "
|
|
49
|
+
"(default: input dir name).",
|
|
50
|
+
),
|
|
51
|
+
] = None,
|
|
52
|
+
neo4j_uri: Annotated[
|
|
53
|
+
Optional[str],
|
|
54
|
+
typer.Option(
|
|
55
|
+
"--neo4j-uri",
|
|
56
|
+
envvar="NEO4J_URI",
|
|
57
|
+
help="Push the graph to a live Neo4j over Bolt (incremental); omit to write "
|
|
58
|
+
"graph.cypher. [env: NEO4J_URI]",
|
|
59
|
+
),
|
|
60
|
+
] = None,
|
|
61
|
+
neo4j_user: Annotated[
|
|
62
|
+
str,
|
|
63
|
+
typer.Option(
|
|
64
|
+
"--neo4j-user",
|
|
65
|
+
envvar="NEO4J_USERNAME",
|
|
66
|
+
help="Neo4j username. [env: NEO4J_USERNAME]",
|
|
67
|
+
),
|
|
68
|
+
] = "neo4j",
|
|
69
|
+
neo4j_password: Annotated[
|
|
70
|
+
str,
|
|
71
|
+
typer.Option(
|
|
72
|
+
"--neo4j-password",
|
|
73
|
+
envvar="NEO4J_PASSWORD",
|
|
74
|
+
help="Neo4j password. Prefer the env var over the flag (the flag is visible in shell "
|
|
75
|
+
"history / process list). [env: NEO4J_PASSWORD]",
|
|
76
|
+
),
|
|
77
|
+
] = "neo4j",
|
|
78
|
+
neo4j_database: Annotated[
|
|
79
|
+
Optional[str],
|
|
80
|
+
typer.Option(
|
|
81
|
+
"--neo4j-database",
|
|
82
|
+
envvar="NEO4J_DATABASE",
|
|
83
|
+
help="Neo4j database name (default: server default). [env: NEO4J_DATABASE]",
|
|
84
|
+
),
|
|
85
|
+
] = None,
|
|
30
86
|
using_codeql: Annotated[
|
|
31
87
|
bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.")
|
|
32
88
|
] = False,
|
|
@@ -78,6 +134,12 @@ def main(
|
|
|
78
134
|
input=input,
|
|
79
135
|
output=output,
|
|
80
136
|
format=format,
|
|
137
|
+
emit=emit,
|
|
138
|
+
app_name=app_name,
|
|
139
|
+
neo4j_uri=neo4j_uri,
|
|
140
|
+
neo4j_user=neo4j_user,
|
|
141
|
+
neo4j_password=neo4j_password,
|
|
142
|
+
neo4j_database=neo4j_database,
|
|
81
143
|
using_codeql=using_codeql,
|
|
82
144
|
using_ray=using_ray,
|
|
83
145
|
rebuild_analysis=rebuild_analysis,
|
|
@@ -89,6 +151,18 @@ def main(
|
|
|
89
151
|
)
|
|
90
152
|
|
|
91
153
|
_set_log_level(options.verbosity)
|
|
154
|
+
|
|
155
|
+
# The schema contract is a static artifact — no project analysis required.
|
|
156
|
+
if options.emit == EmitTarget.SCHEMA:
|
|
157
|
+
from codeanalyzer.neo4j.emit import emit_schema
|
|
158
|
+
|
|
159
|
+
emit_schema(options.output)
|
|
160
|
+
return
|
|
161
|
+
|
|
162
|
+
# Every other target requires an input project.
|
|
163
|
+
if options.input is None:
|
|
164
|
+
logger.error("Missing option '-i' / '--input' (required for --emit json | neo4j).")
|
|
165
|
+
raise typer.Exit(code=1)
|
|
92
166
|
if not options.input.exists():
|
|
93
167
|
logger.error(f"Input path '{options.input}' does not exist.")
|
|
94
168
|
raise typer.Exit(code=1)
|
|
@@ -112,7 +186,11 @@ def main(
|
|
|
112
186
|
with Codeanalyzer(options) as analyzer:
|
|
113
187
|
artifacts = analyzer.analyze()
|
|
114
188
|
|
|
115
|
-
if options.
|
|
189
|
+
if options.emit == EmitTarget.NEO4J:
|
|
190
|
+
from codeanalyzer.neo4j.emit import emit_neo4j
|
|
191
|
+
|
|
192
|
+
emit_neo4j(artifacts, options)
|
|
193
|
+
elif options.output is None:
|
|
116
194
|
print(model_dump_json(artifacts, separators=(",", ":")))
|
|
117
195
|
else:
|
|
118
196
|
options.output.mkdir(parents=True, exist_ok=True)
|
|
@@ -142,7 +220,7 @@ def _write_output(artifacts, output_dir: Path, format: OutputFormat):
|
|
|
142
220
|
|
|
143
221
|
app = typer.Typer(
|
|
144
222
|
callback=main,
|
|
145
|
-
name="
|
|
223
|
+
name="canpy",
|
|
146
224
|
help="Static Analysis on Python source code using Jedi, CodeQL and Tree sitter.",
|
|
147
225
|
invoke_without_command=True,
|
|
148
226
|
no_args_is_help=True,
|
|
@@ -151,5 +229,20 @@ app = typer.Typer(
|
|
|
151
229
|
pretty_exceptions_show_locals=False,
|
|
152
230
|
)
|
|
153
231
|
|
|
232
|
+
def deprecated_main() -> None:
    """Run the CLI on behalf of the legacy ``codeanalyzer`` command.

    A single deprecation line is written to stderr first — never stdout, so
    output that callers pipe (for example ``--emit schema``) stays clean — and
    then the module-level ``app`` is called with no arguments. The alias exists
    for backwards compatibility and is slated for removal in a future release.
    """
    import sys

    notice = (
        "codeanalyzer: this command has been renamed to `canpy`. The `codeanalyzer` "
        "alias is deprecated and will be removed in a future release — please use `canpy`."
    )
    print(notice, file=sys.stderr)
    app()
|
|
245
|
+
|
|
246
|
+
|
|
154
247
|
if __name__ == "__main__":
|
|
155
248
|
app()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""Neo4j output: a pure projection of the :class:`PyApplication` IR to graph rows,
|
|
18
|
+
plus the two writers (cypher snapshot / bolt incremental). Nothing here runs
|
|
19
|
+
unless ``--emit neo4j`` (or ``--emit schema``) is selected.
|
|
20
|
+
"""
|
|
21
|
+
from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer
|
|
22
|
+
from codeanalyzer.neo4j.catalog import (
|
|
23
|
+
MARKER_LABELS,
|
|
24
|
+
NODE_LABELS,
|
|
25
|
+
REL_TYPES,
|
|
26
|
+
SCHEMA_VERSION,
|
|
27
|
+
build_schema_document,
|
|
28
|
+
)
|
|
29
|
+
from codeanalyzer.neo4j.cypher import render_cypher
|
|
30
|
+
from codeanalyzer.neo4j.project import project
|
|
31
|
+
from codeanalyzer.neo4j.rows import EdgeRow, GraphRows, NodeRow
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"project",
|
|
35
|
+
"render_cypher",
|
|
36
|
+
"bolt_writer",
|
|
37
|
+
"BoltConfig",
|
|
38
|
+
"build_schema_document",
|
|
39
|
+
"SCHEMA_VERSION",
|
|
40
|
+
"NODE_LABELS",
|
|
41
|
+
"REL_TYPES",
|
|
42
|
+
"MARKER_LABELS",
|
|
43
|
+
"GraphRows",
|
|
44
|
+
"NodeRow",
|
|
45
|
+
"EdgeRow",
|
|
46
|
+
]
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""The incremental writer: push :class:`GraphRows` into a live Neo4j over Bolt.
|
|
18
|
+
Unlike the snapshot writer, this one reads the DB's current state and updates
|
|
19
|
+
only what changed.
|
|
20
|
+
|
|
21
|
+
Algorithm (the module subgraph is the unit of idempotent replacement):
|
|
22
|
+
1. ensure constraints + indexes.
|
|
23
|
+
2. diff each module's ``content_hash`` against the DB → the set of changed modules.
|
|
24
|
+
3. per changed module, in a transaction: delete the edges it owned (edges out of
|
|
25
|
+
its nodes), detach-delete the declarations it no longer emits, then upsert
|
|
26
|
+
its current nodes.
|
|
27
|
+
4. upsert edges owned by changed modules (+ the shared edges).
|
|
28
|
+
5. on a FULL run only, prune modules whose source file vanished.
|
|
29
|
+
|
|
30
|
+
Nodes are MERGE-upserted, never blindly deleted, so a declaration another
|
|
31
|
+
(unchanged) module still references survives and its incoming edges stay valid.
|
|
32
|
+
``:PyExternal`` / ``:PyPackage`` / ``:PyDecorator`` are shared (no ``_module``) and are
|
|
33
|
+
MERGE-only.
|
|
34
|
+
|
|
35
|
+
The ``neo4j`` driver is imported lazily so it stays an optional dependency and
|
|
36
|
+
off the default (json) output path entirely.
|
|
37
|
+
"""
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
from dataclasses import dataclass
|
|
41
|
+
from typing import Dict, List, Optional
|
|
42
|
+
|
|
43
|
+
from codeanalyzer.neo4j.rows import EdgeRow, GraphRows, NodeRow, chunk
|
|
44
|
+
from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
|
|
45
|
+
from codeanalyzer.utils import logger
|
|
46
|
+
|
|
47
|
+
# Relationship types followed (in the outgoing direction) from a :PyModule when
# collecting the nodes to delete together with a vanished module.
_DESCENDANT_REL_TYPES = (
    "PY_DECLARES",
    "PY_HAS_METHOD",
    "PY_HAS_ATTRIBUTE",
    "PY_DECLARES_VAR",
    "PY_HAS_CALLSITE",
)
# Variable-length (one or more hops) relationship pattern spliced into the prune query.
DESCENDANTS = "[:" + "|".join(_DESCENDANT_REL_TYPES) + "*1..]"
# Size passed to ``chunk`` when splitting rows for the UNWIND upserts.
BATCH = 1000
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class BoltConfig:
    """Connection settings for the Bolt (live Neo4j) writer.

    The dataclass-generated ``__repr__`` is replaced by one that masks
    ``password``, so the credential cannot end up in a log line or traceback
    that happens to print the config. Field order, defaults and equality are
    the ones ``@dataclass`` generates.
    """

    uri: str  # handed to ``GraphDatabase.driver`` as the connection URI
    user: str  # first element of the driver's ``auth`` tuple
    password: str  # second element of the ``auth`` tuple; never shown by ``repr``
    database: Optional[str] = None  # falsy → sessions are opened without a ``database`` argument

    def __repr__(self) -> str:
        """Return a debug representation with the password masked."""
        return (
            f"{type(self).__name__}(uri={self.uri!r}, user={self.user!r}, "
            f"password='***', database={self.database!r})"
        )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def bolt_writer(rows: GraphRows, cfg: BoltConfig, full_run: bool) -> None:
    """Incrementally push ``rows`` into a live Neo4j database over Bolt.

    Steps, in order:
      1. run every statement in ``CONSTRAINTS`` and ``INDEXES``;
      2. compare each module's ``content_hash`` with the value stored in the
         database to find the changed modules;
      3. MERGE-upsert the shared nodes (those without a string ``_module`` prop);
      4. for each changed module, delete the edges going out of its nodes and
         the nodes it no longer emits, then upsert its current nodes;
      5. upsert the edges whose source node is shared/unknown or belongs to a
         changed module;
      6. when ``full_run`` is true, delete ``:PyModule`` nodes (and their
         descendants) that are absent from this run.

    Args:
        rows: The projected graph; ``rows.nodes`` and ``rows.edges`` are read.
        cfg: Connection settings (URI, credentials, optional database name).
        full_run: Whether the whole project was analyzed. Orphan pruning is
            only performed when this is true.

    Raises:
        RuntimeError: If the optional ``neo4j`` driver package is not installed.
    """
    try:
        import neo4j  # noqa: WPS433 (lazy, optional dependency)
    except ImportError as exc:  # pragma: no cover - exercised only without the extra
        raise RuntimeError(
            "The 'neo4j' driver is required for '--emit neo4j --neo4j-uri'. "
            "Install it with: pip install 'codeanalyzer-python[neo4j]'"
        ) from exc

    driver = neo4j.GraphDatabase.driver(cfg.uri, auth=(cfg.user, cfg.password))
    # Only pass ``database`` when one was configured; otherwise the server default is used.
    session_kwargs = {"database": cfg.database} if cfg.database else {}

    def session():
        return driver.session(**session_kwargs)

    try:
        # 1. schema (DDL runs in its own autocommit transactions).
        with session() as s:
            for stmt in [*CONSTRAINTS, *INDEXES]:
                s.run(stmt)

        # Partition nodes by owning module; shared nodes have no _module.
        by_module: Dict[str, List[NodeRow]] = {}
        shared: List[NodeRow] = []
        module_of: Dict[str, str] = {}  # node value → owning module
        for n in rows.nodes:
            m = n.props.get("_module")
            if isinstance(m, str):
                by_module.setdefault(m, []).append(n)
                module_of[n.value] = m
            else:
                shared.append(n)

        # 2. diff content_hash.
        db_hash: Dict[str, Optional[str]] = {}
        with session() as s:
            res = s.run("MATCH (m:PyModule) RETURN m.file_key AS k, m.content_hash AS h")
            for rec in res:
                db_hash[rec["k"]] = rec["h"]
        changed = set()
        for m, nodes in by_module.items():
            row_hash = _hash_of(nodes, m)
            # A module is "changed" when the DB has never seen it, when this run
            # carries no usable hash for it, or when the two hashes differ.
            if m not in db_hash or row_hash is None or row_hash != db_hash.get(m):
                changed.add(m)
        logger.info(
            f"neo4j(bolt): {len(by_module)} modules ({len(changed)} changed), "
            f"{len(shared)} shared nodes, {len(rows.edges)} edges"
        )

        # 3. shared nodes are always upserted (MERGE-only).
        _upsert_nodes(session, neo4j, shared)

        # 4. per changed module: purge owned edges + vanished decls, then upsert its nodes.
        # NOTE(review): the module node upserted here already carries the new
        # content_hash. If the run fails before step 5 finishes, the next run
        # would see the module as unchanged and not restore its purged edges —
        # confirm whether that window is acceptable.
        for m in changed:
            nodes = by_module[m]
            keys = [n.value for n in nodes]
            with session() as s:

                def _purge(tx, module=m, node_keys=keys):
                    # Edges are owned by their source node's module.
                    tx.run("MATCH (x {_module: $m})-[r]->() DELETE r", m=module)
                    # Remove nodes of this module whose key (signature / id /
                    # file_key, whichever is set) is not emitted any more.
                    tx.run(
                        "MATCH (x {_module: $m}) "
                        "WHERE NOT coalesce(x.signature, x.id, x.file_key) IN $keys "
                        "DETACH DELETE x",
                        m=module,
                        keys=node_keys,
                    )

                s.execute_write(_purge)
            _upsert_nodes(session, neo4j, nodes)

        # 5. upsert edges owned by a changed module (owner = source node's module) or shared.
        edges: List[EdgeRow] = []
        for e in rows.edges:
            owner = module_of.get(e.from_ref.value)
            if owner is None or owner in changed:
                edges.append(e)
        _upsert_edges(session, neo4j, edges)

        # 6. orphan prune — only safe on a full run (a targeted run can't tell deleted from untargeted).
        if full_run:
            present = list(by_module.keys())
            with session() as s:
                # The OPTIONAL MATCH yields one row per (module, descendant path),
                # so the modules must be counted DISTINCT to report how many
                # modules — not how many rows — were pruned.
                res = s.run(
                    "MATCH (m:PyModule) WHERE NOT m.file_key IN $present "
                    f"OPTIONAL MATCH (m)-{DESCENDANTS}->(x) DETACH DELETE x, m "
                    "RETURN count(DISTINCT m) AS pruned",
                    present=present,
                )
                pruned = res.single()
                pruned_count = pruned["pruned"] if pruned else 0
                logger.info(f"neo4j(bolt): pruned {pruned_count} vanished module(s)")
        else:
            logger.info(
                "neo4j(bolt): targeted run — orphan pruning skipped (deleted files not removed)"
            )
    finally:
        # Release the connection pool whether or not the push succeeded.
        driver.close()
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# ----------------------------------------------------------------------------------------------
|
|
159
|
+
# Batched upserts
|
|
160
|
+
# ----------------------------------------------------------------------------------------------
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _upsert_nodes(session, neo4j, nodes: List[NodeRow]) -> None:
    """MERGE-upsert ``nodes``, one Cypher statement per (labels, key property) bucket.

    Args:
        session: Zero-argument callable returning a driver session usable as a
            context manager.
        neo4j: The imported driver module; forwarded to ``_to_params``.
        nodes: Node rows to write. Each row supplies ``labels``, ``key_prop``,
            ``value`` and ``props``.
    """
    # Rows sharing the same label list and key property can share one statement.
    buckets: Dict[str, List[NodeRow]] = {}
    for node in nodes:
        bucket_id = f"{':'.join(node.labels)}|{node.key_prop}"
        if bucket_id not in buckets:
            buckets[bucket_id] = []
        buckets[bucket_id].append(node)

    for members in buckets.values():
        head = members[0]
        merge_label = head.labels[0]
        extra_labels = head.labels[1:]
        # MERGE on the first label + key, then overlay the properties ...
        statement = (
            f"UNWIND $rows AS row MERGE (n:{merge_label} {{{head.key_prop}: row.k}}) "
            "SET n += row.p"
        )
        # ... and attach any remaining labels in the same SET clause.
        if len(head.labels) > 1:
            statement += f", n:{':'.join(extra_labels)}"

        for piece in chunk(members, BATCH):
            payload = []
            for node in piece:
                payload.append({"k": node.value, "p": _to_params(node.props, neo4j)})
            with session() as s:
                s.run(statement, rows=payload)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _upsert_edges(session, neo4j, edges: List[EdgeRow]) -> None:
    """MERGE-upsert ``edges``, one Cypher statement per (type, endpoint shape) bucket.

    Both endpoints are looked up with ``MATCH``, so an edge whose source or
    target node does not exist in the database produces no relationship.

    Args:
        session: Zero-argument callable returning a driver session usable as a
            context manager.
        neo4j: The imported driver module; forwarded to ``_to_params``.
        edges: Edge rows to write. Each row supplies ``type``, ``from_ref``,
            ``to_ref`` and ``props``.
    """
    # Edges with the same type and the same (label, key property) on both ends
    # can share one statement.
    buckets: Dict[str, List[EdgeRow]] = {}
    for edge in edges:
        src, dst = edge.from_ref, edge.to_ref
        bucket_id = f"{edge.type}|{src.label}.{src.key_prop}|{dst.label}.{dst.key_prop}"
        if bucket_id not in buckets:
            buckets[bucket_id] = []
        buckets[bucket_id].append(edge)

    for members in buckets.values():
        head = members[0]
        src, dst = head.from_ref, head.to_ref
        clauses = [
            "UNWIND $rows AS row",
            f"MATCH (a:{src.label} {{{src.key_prop}: row.f}})",
            f"MATCH (b:{dst.label} {{{dst.key_prop}: row.t}})",
            f"MERGE (a)-[r:{head.type}]->(b) SET r += row.p",
        ]
        statement = " ".join(clauses)

        for piece in chunk(members, BATCH):
            payload = []
            for edge in piece:
                payload.append(
                    {
                        "f": edge.from_ref.value,
                        "t": edge.to_ref.value,
                        "p": _to_params(edge.props, neo4j),
                    }
                )
            with session() as s:
                s.run(statement, rows=payload)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ----------------------------------------------------------------------------------------------
|
|
207
|
+
# Helpers
|
|
208
|
+
# ----------------------------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _hash_of(nodes: List[NodeRow], file_key: str) -> Optional[str]:
|
|
212
|
+
for n in nodes:
|
|
213
|
+
if n.labels[0] == "PyModule" and n.value == file_key:
|
|
214
|
+
h = n.props.get("content_hash")
|
|
215
|
+
return h if isinstance(h, str) else None
|
|
216
|
+
return None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _to_params(props, neo4j) -> dict:
|
|
220
|
+
"""Map props to driver params. The Python driver already distinguishes int
|
|
221
|
+
from float, so unlike the JS driver no integer coercion is needed — this is a
|
|
222
|
+
straight passthrough kept symmetric with the snapshot writer's shape."""
|
|
223
|
+
return dict(props)
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""The declarative Neo4j schema catalog — the single in-repo source of truth for
|
|
18
|
+
the graph contract (node labels, their keys and typed properties, relationship
|
|
19
|
+
types and their endpoints). ``--emit schema`` serializes this (with the DDL from
|
|
20
|
+
:mod:`codeanalyzer.neo4j.schema`) to a machine-readable ``schema.json``, and the
|
|
21
|
+
conformance test (``test/test_neo4j_schema.py``) asserts the real emitter never
|
|
22
|
+
produces a label / relationship / property that isn't declared here — so this
|
|
23
|
+
file cannot silently drift from :mod:`codeanalyzer.neo4j.project`.
|
|
24
|
+
|
|
25
|
+
SCHEMA_VERSION is the contract version: bump MAJOR on a breaking change
|
|
26
|
+
(renamed/removed label, relationship or key), MINOR on an additive change (new
|
|
27
|
+
label/rel/property). It is stamped onto the ``:PyApplication`` node of every
|
|
28
|
+
emitted graph so any consumer can detect a producer/consumer mismatch at runtime.
|
|
29
|
+
"""
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
from dataclasses import dataclass, field
|
|
33
|
+
from typing import Dict, List
|
|
34
|
+
|
|
35
|
+
from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
|
|
36
|
+
|
|
37
|
+
SCHEMA_VERSION = "1.0.0"
|
|
38
|
+
|
|
39
|
+
# PropType ∈ {"string", "integer", "float", "boolean", "string[]", "integer[]"}.
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class NodeLabel:
    """One node label of the graph contract, as listed in ``NODE_LABELS``."""

    # The specific label; doubles as the catalog key.
    label: str
    # The label that carries the uniqueness constraint and is used for MERGE.
    merge_label: str
    # Name of the key property.
    key: str
    # Property name → property type string (e.g. "string", "integer", "string[]").
    properties: Dict[str, str]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class RelType:
    """One relationship type of the graph contract, as listed in ``REL_TYPES``."""

    # The relationship type name.
    type: str
    # Node labels allowed at the start of the relationship.
    from_labels: List[str]
    # Node labels allowed at the end of the relationship.
    to_labels: List[str]
    # Property name → property type string; a fresh empty dict per instance by default.
    properties: Dict[str, str] = field(default_factory=dict)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Labels layered onto a node in addition to its primary/specific label.
|
|
59
|
+
# No marker labels are declared at present.
MARKER_LABELS: List[str] = []

# Line-span properties shared by several declaration labels.
_SPAN = {"start_line": "integer", "end_line": "integer"}


NODE_LABELS: List[NodeLabel] = [
    # Application anchor.
    NodeLabel(
        label="PyApplication",
        merge_label="PyApplication",
        key="name",
        properties={"name": "string", "schema_version": "string"},
    ),
    # One per source file.
    NodeLabel(
        label="PyModule",
        merge_label="PyModule",
        key="file_key",
        properties={
            "file_key": "string",
            "module_name": "string",
            "content_hash": "string",
            "last_modified": "float",
            "file_size": "integer",
            "_module": "string",
        },
    ),
    # Symbols: classes, callables and externals all merge on :PySymbol(signature).
    NodeLabel(
        label="PyClass",
        merge_label="PySymbol",
        key="signature",
        properties={
            "signature": "string",
            "name": "string",
            "code": "string",
            "base_classes": "string[]",
            "docstring": "string",
            **_SPAN,
            "_module": "string",
        },
    ),
    NodeLabel(
        label="PyCallable",
        merge_label="PySymbol",
        key="signature",
        properties={
            "signature": "string",
            "name": "string",
            "path": "string",
            "return_type": "string",
            "cyclomatic_complexity": "integer",
            "code": "string",
            "code_start_line": "integer",
            **_SPAN,
            "docstring": "string",
            "decorators": "string[]",
            "parameters_json": "string",
            "accessed_symbols_json": "string",
            "_module": "string",
        },
    ),
    NodeLabel(
        label="PyExternal",
        merge_label="PySymbol",
        key="signature",
        properties={"signature": "string", "name": "string"},
    ),
    # Shared, name-keyed nodes.
    NodeLabel(
        label="PyPackage",
        merge_label="PyPackage",
        key="name",
        properties={"name": "string"},
    ),
    NodeLabel(
        label="PyDecorator",
        merge_label="PyDecorator",
        key="name",
        properties={"name": "string"},
    ),
    # Id-keyed nodes owned by a module.
    NodeLabel(
        label="PyCallSite",
        merge_label="PyCallSite",
        key="id",
        properties={
            "id": "string",
            "method_name": "string",
            "receiver_expr": "string",
            "receiver_type": "string",
            "argument_types": "string[]",
            "return_type": "string",
            "callee_signature": "string",
            "is_constructor_call": "boolean",
            "start_line": "integer",
            "start_column": "integer",
            "end_line": "integer",
            "end_column": "integer",
            "_module": "string",
        },
    ),
    NodeLabel(
        label="PyAttribute",
        merge_label="PyAttribute",
        key="id",
        properties={
            "id": "string",
            "name": "string",
            "type": "string",
            "docstring": "string",
            **_SPAN,
            "_module": "string",
        },
    ),
    NodeLabel(
        label="PyVariable",
        merge_label="PyVariable",
        key="id",
        properties={
            "id": "string",
            "name": "string",
            "type": "string",
            "initializer": "string",
            "scope": "string",
            **_SPAN,
            "_module": "string",
        },
    ),
]
|
|
179
|
+
|
|
180
|
+
# Labels a PY_DECLARES relationship may point at.
_DECL_TARGETS = ["PyClass", "PyCallable"]


REL_TYPES: List[RelType] = [
    # Containment.
    RelType(type="PY_HAS_MODULE", from_labels=["PyApplication"], to_labels=["PyModule"]),
    RelType(
        type="PY_DECLARES",
        from_labels=["PyModule", "PyClass", "PyCallable"],
        to_labels=_DECL_TARGETS,
    ),
    RelType(type="PY_HAS_METHOD", from_labels=["PyClass"], to_labels=["PyCallable"]),
    RelType(type="PY_HAS_ATTRIBUTE", from_labels=["PyClass"], to_labels=["PyAttribute"]),
    RelType(
        type="PY_DECLARES_VAR",
        from_labels=["PyModule", "PyCallable"],
        to_labels=["PyVariable"],
    ),
    RelType(type="PY_HAS_CALLSITE", from_labels=["PyCallable"], to_labels=["PyCallSite"]),
    # Calls.
    RelType(
        type="PY_RESOLVES_TO",
        from_labels=["PyCallSite"],
        to_labels=["PyCallable", "PyExternal"],
    ),
    RelType(
        type="PY_CALLS",
        from_labels=["PyCallable", "PyExternal"],
        to_labels=["PyCallable", "PyExternal"],
        properties={"weight": "integer", "provenance": "string[]"},
    ),
    # Inheritance, imports, decorators.
    RelType(type="PY_EXTENDS", from_labels=["PyClass"], to_labels=["PyClass"]),
    RelType(
        type="PY_IMPORTS",
        from_labels=["PyModule"],
        to_labels=["PyPackage"],
        properties={"imported_names": "string[]", "aliases": "string[]"},
    ),
    RelType(type="PY_DECORATED_BY", from_labels=["PyCallable"], to_labels=["PyDecorator"]),
]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@dataclass
class SchemaDocument:
    """Typed shape of the schema document: the same seven fields, in the same
    order, as the keys of the dict returned by ``build_schema_document``."""

    # Contract version string.
    schema_version: str
    # Name of the tool that produced the document.
    generator: str
    # Labels layered onto nodes in addition to their primary label.
    marker_labels: List[str]
    # Declared node labels.
    node_labels: List[NodeLabel]
    # Declared relationship types.
    relationship_types: List[RelType]
    # Constraint DDL statements.
    constraints: List[str]
    # Index DDL statements.
    indexes: List[str]
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def build_schema_document() -> dict:
    """Build the full machine-readable schema document emitted by ``--emit schema``.

    Every list and dict in the result is a fresh copy, so a caller may mutate
    the returned document without altering the module-level catalog
    (``NODE_LABELS``, ``REL_TYPES``, ``MARKER_LABELS``) or the DDL lists.

    Returns:
        A dict with the keys ``schema_version``, ``generator``,
        ``marker_labels``, ``node_labels``, ``relationship_types``,
        ``constraints`` and ``indexes``.
    """
    return {
        "schema_version": SCHEMA_VERSION,
        "generator": "codeanalyzer-python",
        "marker_labels": list(MARKER_LABELS),
        "node_labels": [
            {
                "label": n.label,
                "merge_label": n.merge_label,
                "key": n.key,
                # Copied: the catalog entry's dict must not be aliased into the output.
                "properties": dict(n.properties),
            }
            for n in NODE_LABELS
        ],
        "relationship_types": [
            {
                "type": r.type,
                # Copied for the same reason; ``_DECL_TARGETS`` in particular is
                # a list object shared with the catalog.
                "from": list(r.from_labels),
                "to": list(r.to_labels),
                "properties": dict(r.properties),
            }
            for r in REL_TYPES
        ],
        "constraints": list(CONSTRAINTS),
        "indexes": list(INDEXES),
    }
|