codeanalyzer-python 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,138 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """The snapshot writer: render :class:`GraphRows` to a self-contained ``.cypher``
18
+ script. Running it (e.g. ``cypher-shell < graph.cypher``) rebuilds this project's
19
+ subgraph from scratch — constraints, a scoped wipe of the prior version, then
20
+ batched ``UNWIND … MERGE`` for nodes and edges.
21
+
22
+ This artifact is intentionally NOT incremental: a static script has no view of
23
+ the live DB, so it expresses the full truth. Incremental updates are the bolt
24
+ writer's job.
25
+ """
26
+ from __future__ import annotations
27
+
28
+ from typing import Dict, List
29
+
30
+ from codeanalyzer.neo4j.rows import (
31
+ EdgeRow,
32
+ GraphRows,
33
+ NodeRow,
34
+ chunk,
35
+ cypher_map,
36
+ cypher_value,
37
+ )
38
+ from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
39
+
40
# Size argument handed to ``chunk`` when node and edge groups are split into
# separate ``UNWIND`` statements (presumably the max rows per statement —
# confirm against ``rows.chunk``).
BATCH = 500
41
+
42
+
43
def render_cypher(rows: GraphRows, app_name: str) -> str:
    """Render ``rows`` as one Cypher script and return it as text.

    The script is laid out in four sections, each introduced by a ``//``
    comment header and separated by a blank line:

    1. every entry of ``CONSTRAINTS`` followed by every entry of ``INDEXES``,
       each terminated with ``;``
    2. the wipe statement for ``app_name``
    3. the node statements built from ``rows.nodes``
    4. the relationship statements built from ``rows.edges``

    The returned text ends with a newline.
    """
    script: List[str] = [
        "// ── constraints & indexes ──",
        *[f"{ddl};" for ddl in (*CONSTRAINTS, *INDEXES)],
        "",
        "// ── wipe this project's prior subgraph (externals/packages/decorators are shared) ──",
        _wipe(app_name),
        "",
        "// ── nodes ──",
        *_node_statements(rows.nodes),
        "",
        "// ── relationships ──",
        *_edge_statements(rows.edges),
        # Trailing empty entry so the joined script ends with a newline.
        "",
    ]
    return "\n".join(script)
66
+
67
+
68
def _wipe(app_name: str) -> str:
    """Return the Cypher statement that removes the subgraph of ``app_name``.

    The statement matches the ``PyApplication`` node with that name, optionally
    its modules, and optionally everything reachable from those modules over
    the declaration/method/attribute/variable/call-site relationships, then
    ``DETACH DELETE``s all of them. The name is rendered through
    ``cypher_value``.
    """
    app_literal = cypher_value(app_name)
    return (
        f"MATCH (a:PyApplication {{name: {app_literal}}})\n"
        "OPTIONAL MATCH (a)-[:PY_HAS_MODULE]->(m:PyModule)\n"
        "OPTIONAL MATCH (m)-[:PY_DECLARES|PY_HAS_METHOD|PY_HAS_ATTRIBUTE|PY_DECLARES_VAR|PY_HAS_CALLSITE*1..]->(x)\n"
        "DETACH DELETE x, m, a;"
    )
78
+
79
+
80
+ # ----------------------------------------------------------------------------------------------
81
+ # Nodes — grouped by their full label set + key property, batched into UNWIND lists.
82
+ # ----------------------------------------------------------------------------------------------
83
+
84
+
85
def _node_statements(nodes: List[NodeRow]) -> List[str]:
    """Build the ``UNWIND … MERGE`` statements that create ``nodes``.

    Nodes are grouped by their joined label list plus key property, in
    first-seen order. Each group is split with ``chunk(group, BATCH)`` and
    every chunk becomes one statement: the first label and the key property
    form the ``MERGE`` pattern, any remaining labels are added in the ``SET``
    clause together with the row's property map.

    Returns one statement string per chunk; an empty input yields ``[]``.
    """
    by_shape: Dict[str, List[NodeRow]] = {}
    for node in nodes:
        shape = f"{':'.join(node.labels)}|{node.key_prop}"
        by_shape.setdefault(shape, []).append(node)

    statements: List[str] = []
    for members in by_shape.values():
        # All members share labels and key property, so the first one is
        # representative for the clauses that do not vary per row.
        exemplar = members[0]
        head_label = exemplar.labels[0]
        other_labels = exemplar.labels[1:]
        if other_labels:
            label_clause = ", n:" + ":".join(other_labels)
        else:
            label_clause = ""
        merge_clause = f"MERGE (n:{head_label} {{{exemplar.key_prop}: row.k}})\n"
        set_clause = f"SET n += row.p{label_clause};"

        for batch in chunk(members, BATCH):
            literals = [
                f" {{k: {cypher_value(item.value)}, p: {cypher_map(item.props)}}}"
                for item in batch
            ]
            body = ",\n".join(literals)
            statements.append(f"UNWIND [\n{body}\n] AS row\n" + merge_clause + set_clause)
    return statements
108
+
109
+
110
+ # ----------------------------------------------------------------------------------------------
111
+ # Edges — grouped by (type, endpoint labels + key props), batched.
112
+ # ----------------------------------------------------------------------------------------------
113
+
114
+
115
def _edge_statements(edges: List[EdgeRow]) -> List[str]:
    """Build the ``UNWIND … MATCH … MERGE`` statements that create ``edges``.

    Edges are grouped by relationship type together with the label and key
    property of both endpoints, in first-seen order. Each group is split with
    ``chunk(group, BATCH)``; every chunk becomes one statement that matches
    both endpoints by key, merges the relationship and applies the row's
    property map.

    Returns one statement string per chunk; an empty input yields ``[]``.
    """
    buckets: Dict[str, List[EdgeRow]] = {}
    for edge in edges:
        src, dst = edge.from_ref, edge.to_ref
        shape = f"{edge.type}|{src.label}.{src.key_prop}|{dst.label}.{dst.key_prop}"
        buckets.setdefault(shape, []).append(edge)

    statements: List[str] = []
    for members in buckets.values():
        # Every member shares type and endpoint shape; use the first as template.
        lead = members[0]
        src, dst = lead.from_ref, lead.to_ref
        trailer = (
            f"MATCH (a:{src.label} {{{src.key_prop}: row.f}})\n"
            f"MATCH (b:{dst.label} {{{dst.key_prop}: row.t}})\n"
            f"MERGE (a)-[r:{lead.type}]->(b)\n"
            f"SET r += row.p;"
        )
        for batch in chunk(members, BATCH):
            literals: List[str] = []
            for edge in batch:
                literals.append(
                    f" {{f: {cypher_value(edge.from_ref.value)}, t: {cypher_value(edge.to_ref.value)}, "
                    f"p: {cypher_map(edge.props)}}}"
                )
            statements.append("UNWIND [\n" + ",\n".join(literals) + "\n] AS row\n" + trailer)
    return statements
@@ -0,0 +1,74 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """The facade between the CLI and the Neo4j backend. Two entry points:
18
+
19
+ - :func:`emit_schema` — serialize the static, version-stamped schema contract
20
+ (``schema.json``). Needs no analyzed project.
21
+ - :func:`emit_neo4j` — project a :class:`PyApplication` to a graph and either
22
+ write a ``graph.cypher`` snapshot or push it to a live Neo4j over Bolt.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ from pathlib import Path
28
+ from typing import Optional
29
+
30
+ from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer
31
+ from codeanalyzer.neo4j.catalog import build_schema_document
32
+ from codeanalyzer.neo4j.cypher import render_cypher
33
+ from codeanalyzer.neo4j.project import project
34
+ from codeanalyzer.options import AnalysisOptions
35
+ from codeanalyzer.schema import PyApplication
36
+ from codeanalyzer.utils import logger
37
+
38
+
39
def emit_schema(output: Optional[Path]) -> None:
    """Serialize the Neo4j schema document as indented JSON.

    Args:
        output: Directory to write ``schema.json`` into (created if missing),
            or ``None`` to print the document to stdout instead.
    """
    payload = json.dumps(build_schema_document(), indent=2) + "\n"
    if output is not None:
        output.mkdir(parents=True, exist_ok=True)
        destination = output / "schema.json"
        destination.write_text(payload)
        logger.info(f"Neo4j schema written to {destination}")
        return
    # The payload already ends with a newline, so suppress print's own.
    print(payload, end="")
50
+
51
+
52
def emit_neo4j(app: PyApplication, options: AnalysisOptions) -> None:
    """Project the analysis to a graph and write it out.

    When ``options.neo4j_uri`` is set (``--neo4j-uri``) the rows are pushed to
    a live database through ``bolt_writer``; otherwise a self-contained
    ``graph.cypher`` snapshot is written to ``options.output`` (or the current
    working directory when no output directory is given).

    Args:
        app: The analyzed application to project.
        options: Analysis options. ``app_name`` overrides the graph's
            application name, which otherwise defaults to the resolved name of
            ``options.input``.
    """
    app_name = options.app_name or Path(options.input).resolve().name
    rows = project(app, app_name)

    if options.neo4j_uri:
        cfg = BoltConfig(
            uri=options.neo4j_uri,
            user=options.neo4j_user,
            password=options.neo4j_password,
            database=options.neo4j_database,
        )
        # A full run (no single-file restriction) makes orphan pruning safe.
        full_run = options.file_name is None
        bolt_writer(rows, cfg, full_run)
        return

    out_dir = options.output if options.output is not None else Path.cwd()
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / "graph.cypher"
    # The script always contains non-ASCII text (the "──" section headers, and
    # possibly source snippets in node properties). Pin UTF-8 so the write does
    # not depend on the platform's locale encoding.
    target.write_text(render_cypher(rows, app_name), encoding="utf-8")
    logger.info(f"Neo4j graph written to {target}")
@@ -0,0 +1,322 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """``project()`` — the pure projection from the canonical :class:`PyApplication`
18
+ IR to graph rows. It walks the same recursive symbol table the call-graph builder
19
+ walks, but instead of collecting callables it emits nodes + edges. No I/O: the
20
+ writers (cypher snapshot / bolt incremental) consume the returned
21
+ :class:`GraphRows`.
22
+
23
+ Modelling decisions (mirror of the TypeScript backend):
24
+ - signature-keyed declarations (PyClass, PyCallable) carry a shared ``:PySymbol``
25
+ label (the global-identity / MERGE key).
26
+ - call sites, decorators, class attributes and variables are first-class nodes.
27
+ - call-graph endpoints absent from the symbol table become ``:PyExternal`` ghost
28
+ nodes, so RPC / third-party / framework edges are preserved (matching the
29
+ analyzer's own ghost-node behaviour).
30
+ - every project-owned node carries an internal ``_module`` provenance prop, so
31
+ the incremental writer can delete exactly what a re-analyzed module emitted.
32
+ """
33
+ from __future__ import annotations
34
+
35
+ import json
36
+ from pathlib import Path
37
+ from typing import Any, List, Optional
38
+
39
+ from codeanalyzer.neo4j.catalog import SCHEMA_VERSION
40
+ from codeanalyzer.neo4j.rows import GraphRows, NodeRef, Props, RowBuilder, prune
41
+ from codeanalyzer.schema import (
42
+ PyApplication,
43
+ PyCallable,
44
+ PyClass,
45
+ PyClassAttribute,
46
+ PyComment,
47
+ PyModule,
48
+ PyVariableDeclaration,
49
+ )
50
+ from codeanalyzer.schema.py_schema import PyCallsite
51
+
52
+
53
def project(app: PyApplication, app_name: str) -> GraphRows:
    """Project ``app`` into graph rows rooted at a ``PyApplication`` node.

    Emits the application node (keyed by ``app_name`` and stamped with
    ``SCHEMA_VERSION``), one ``PyModule`` node per symbol-table entry linked
    via ``PY_HAS_MODULE`` together with that module's body, and one
    ``PY_CALLS`` edge per call-graph entry.

    Returns whatever the row builder's ``finish()`` produces.
    """
    builder = RowBuilder()
    application = builder.node(
        ["PyApplication"], "name", app_name, {"schema_version": SCHEMA_VERSION}
    )

    for file_key, module in app.symbol_table.items():
        module_ref = builder.node(
            ["PyModule"], "file_key", file_key, _module_props(module, file_key)
        )
        builder.edge("PY_HAS_MODULE", application, module_ref)
        _project_module_body(builder, file_key, module_ref, module)

    # The aggregated :PY_CALLS twin. Endpoints the builder has not seen are
    # materialized as :PyExternal ghost nodes by _call_endpoint.
    for call in app.call_graph:
        caller = _call_endpoint(builder, call.source)
        callee = _call_endpoint(builder, call.target)
        provenance = list(call.provenance or [])
        builder.edge("PY_CALLS", caller, callee, _call_edge_props(call.weight, provenance))

    return builder.finish()
71
+
72
+
73
def _sym(signature: str) -> NodeRef:
    """Return a reference to the ``PySymbol`` node keyed by ``signature``."""
    symbol_ref = NodeRef("PySymbol", "signature", signature)
    return symbol_ref
75
+
76
+
77
def _call_endpoint(b: RowBuilder, signature: str) -> NodeRef:
    """Resolve one end of a call-graph edge to a node reference.

    If the builder already has ``signature`` as a key, a plain symbol
    reference is returned. Otherwise a ``:PySymbol:PyExternal`` node is
    emitted on demand, named after the last dot-separated segment of the
    signature (or the whole signature when it contains no dot).
    """
    if not b.has_key(signature):
        # rsplit yields the whole string when there is no "." to split on.
        short_name = signature.rsplit(".", 1)[-1]
        return b.node(["PySymbol", "PyExternal"], "signature", signature, {"name": short_name})
    return _sym(signature)
84
+
85
+
86
+ # ----------------------------------------------------------------------------------------------
87
+ # Module body
88
+ # ----------------------------------------------------------------------------------------------
89
+
90
+
91
def _project_module_body(b: RowBuilder, file_key: str, mod_ref: NodeRef, mod: PyModule) -> None:
    """Emit everything declared directly in a module.

    Functions and classes hang off ``mod_ref`` via ``PY_DECLARES``;
    module-level variables use ``file_key`` as their owner id; imports are
    projected last.
    """
    for function in (mod.functions or {}).values():
        _project_callable(b, file_key, mod_ref, "PY_DECLARES", function)

    for klass in (mod.classes or {}).values():
        _project_class(b, file_key, mod_ref, "PY_DECLARES", klass)

    for variable in mod.variables or []:
        _project_variable(b, file_key, mod_ref, file_key, variable)

    _project_imports(b, mod_ref, mod)
99
+
100
+
101
def _project_imports(b: RowBuilder, mod_ref: NodeRef, mod: PyModule) -> None:
    """Emit one ``PY_IMPORTS`` edge per imported module.

    All bindings for the same target module are collapsed: their names and
    aliases are collected into sets and emitted as sorted lists on a single
    edge to a ``:PyPackage`` node keyed by the module name. Imports without a
    module (relative ``from . import x``) are skipped.
    """
    # target module -> (imported names, aliases), in first-seen order.
    bindings: dict = {}
    for imp in mod.imports or []:
        target = imp.module
        if target:
            names, aliases = bindings.setdefault(target, (set(), set()))
            if imp.name:
                names.add(imp.name)
            if imp.alias:
                aliases.add(imp.alias)

    for target, (names, aliases) in bindings.items():
        package = b.node(["PyPackage"], "name", target, {})
        # Empty collections become None before being handed to prune.
        edge_props = prune(
            {
                "imported_names": sorted(names) or None,
                "aliases": sorted(aliases) or None,
            }
        )
        b.edge("PY_IMPORTS", mod_ref, package, edge_props)
126
+
127
+
128
+ # ----------------------------------------------------------------------------------------------
129
+ # Declarations
130
+ # ----------------------------------------------------------------------------------------------
131
+
132
+
133
def _project_class(
    b: RowBuilder, file_key: str, parent: NodeRef, parent_rel: str, cl: PyClass
) -> None:
    """Emit a class node, link it to ``parent`` and recurse into its members.

    The class is a ``:PySymbol:PyClass`` node keyed by its signature. Each base
    class gets a ``PY_EXTENDS`` symbol edge, methods are attached with
    ``PY_HAS_METHOD``, attributes are projected against the class signature,
    and inner classes are attached with ``PY_DECLARES``.
    """
    class_ref = b.node(
        ["PySymbol", "PyClass"], "signature", cl.signature, _class_props(cl, file_key)
    )
    b.edge(parent_rel, parent, class_ref)

    for parent_class in cl.base_classes or []:
        b.edge_to_symbol("PY_EXTENDS", class_ref, parent_class)

    for method in (cl.methods or {}).values():
        _project_callable(b, file_key, class_ref, "PY_HAS_METHOD", method)

    for attribute in (cl.attributes or {}).values():
        _project_attribute(b, file_key, class_ref, cl.signature, attribute)

    for nested in (cl.inner_classes or {}).values():
        _project_class(b, file_key, class_ref, "PY_DECLARES", nested)
148
+
149
+
150
def _project_callable(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_rel: str, c: PyCallable
) -> None:
    """Emit a callable node, link it to ``owner`` and recurse into its body.

    The callable is a ``:PySymbol:PyCallable`` node keyed by its signature.
    Its decorators, call sites, local variables, inner callables and inner
    classes are projected in that order. Each call site becomes a
    ``PyCallSite`` node attached via ``PY_HAS_CALLSITE``, with a
    ``PY_RESOLVES_TO`` symbol edge when a callee signature is present.
    """
    callable_ref = b.node(
        ["PySymbol", "PyCallable"], "signature", c.signature, _callable_props(c, file_key)
    )
    b.edge(owner_rel, owner, callable_ref)

    for decorator in c.decorators or []:
        _project_decorator(b, callable_ref, decorator)

    for site in c.call_sites or []:
        # Key off the relative file (a call site lives in its callable's file)
        # so ids stay portable.
        span = f"{site.start_line}:{site.start_column}-{site.end_line}:{site.end_column}"
        site_ref = b.node(
            ["PyCallSite"], "id", f"{file_key}#{span}", _call_site_props(site, file_key)
        )
        b.edge("PY_HAS_CALLSITE", callable_ref, site_ref)
        callee = site.callee_signature
        if callee:
            b.edge_to_symbol("PY_RESOLVES_TO", site_ref, callee)

    for local in c.local_variables or []:
        _project_variable(b, file_key, callable_ref, c.signature, local)

    for nested_callable in (c.inner_callables or {}).values():
        _project_callable(b, file_key, callable_ref, "PY_DECLARES", nested_callable)

    for nested_class in (c.inner_classes or {}).values():
        _project_class(b, file_key, callable_ref, "PY_DECLARES", nested_class)
173
+
174
+
175
def _project_attribute(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_sig: str, a: PyClassAttribute
) -> None:
    """Emit a ``PyAttribute`` node and attach it to ``owner``.

    The node id is ``<owner_sig>.<attribute name>``; the edge type is
    ``PY_HAS_ATTRIBUTE``.
    """
    qualified_id = f"{owner_sig}.{a.name}"
    attribute_props = _attribute_props(a, qualified_id, file_key)
    attribute_ref = b.node(["PyAttribute"], "id", qualified_id, attribute_props)
    b.edge("PY_HAS_ATTRIBUTE", owner, attribute_ref)
181
+
182
+
183
def _project_variable(
    b: RowBuilder, file_key: str, owner: NodeRef, owner_id: str, v: PyVariableDeclaration
) -> None:
    """Emit a ``PyVariable`` node and attach it to ``owner``.

    The node id is ``<owner_id>#<variable name>@<start line>``; the edge type
    is ``PY_DECLARES_VAR``.
    """
    declaration_id = f"{owner_id}#{v.name}@{v.start_line}"
    variable_props = _variable_props(v, declaration_id, file_key)
    variable_ref = b.node(["PyVariable"], "id", declaration_id, variable_props)
    b.edge("PY_DECLARES_VAR", owner, variable_ref)
189
+
190
+
191
def _project_decorator(b: RowBuilder, on: NodeRef, decorator: str) -> None:
    """Emit a ``PyDecorator`` node keyed by name and link ``on`` to it.

    The decorator text is used both as the node key and as its ``name``
    property; the edge type is ``PY_DECORATED_BY``.
    """
    decorator_props = {"name": decorator}
    decorator_ref = b.node(["PyDecorator"], "name", decorator, decorator_props)
    b.edge("PY_DECORATED_BY", on, decorator_ref)
194
+
195
+
196
+ # ----------------------------------------------------------------------------------------------
197
+ # Property flattening
198
+ # ----------------------------------------------------------------------------------------------
199
+
200
+
201
def _module_props(mod: PyModule, file_key: str) -> Props:
    """Flatten a module's metadata into node properties.

    ``file_key`` is stored under the internal ``_module`` provenance key. The
    mapping is passed through ``prune`` before being returned (presumably to
    drop empty values — confirm against ``rows.prune``).
    """
    raw: dict = {
        "module_name": mod.module_name,
        "content_hash": mod.content_hash,
        "last_modified": mod.last_modified,
        "file_size": mod.file_size,
        "_module": file_key,
    }
    return prune(raw)
211
+
212
+
213
def _class_props(cl: PyClass, file_key: str) -> Props:
    """Flatten a class into node properties.

    Base classes are copied into a fresh list, the docstring is derived from
    the class's comments, and ``file_key`` is stored under ``_module``. The
    mapping is passed through ``prune`` before being returned.
    """
    bases = list(cl.base_classes or [])
    docstring = _docstring_of(cl.comments)
    raw: dict = {
        "name": cl.name,
        "code": cl.code,
        "base_classes": bases,
        "docstring": docstring,
        "start_line": cl.start_line,
        "end_line": cl.end_line,
        "_module": file_key,
    }
    return prune(raw)
225
+
226
+
227
def _callable_props(c: PyCallable, file_key: str) -> Props:
    """Flatten a callable into node properties.

    Decorators are copied into a fresh list; parameters and accessed symbols
    are JSON-encoded via ``_stringify_if``; the docstring is derived from the
    callable's comments; ``file_key`` is stored under ``_module``. The mapping
    is passed through ``prune`` before being returned.
    """
    docstring = _docstring_of(c.comments)
    decorators = list(c.decorators or [])
    parameters_json = _stringify_if(c.parameters)
    accessed_symbols_json = _stringify_if(c.accessed_symbols)
    raw: dict = {
        "name": c.name,
        "path": c.path,
        "return_type": c.return_type,
        "cyclomatic_complexity": c.cyclomatic_complexity,
        "code": c.code,
        "code_start_line": c.code_start_line,
        "start_line": c.start_line,
        "end_line": c.end_line,
        "docstring": docstring,
        "decorators": decorators,
        "parameters_json": parameters_json,
        "accessed_symbols_json": accessed_symbols_json,
        "_module": file_key,
    }
    return prune(raw)
245
+
246
+
247
def _attribute_props(a: PyClassAttribute, attr_id: str, file_key: str) -> Props:
    """Flatten a class attribute into node properties.

    ``attr_id`` is stored under ``id`` and ``file_key`` under ``_module``; the
    docstring is derived from the attribute's comments. The mapping is passed
    through ``prune`` before being returned.
    """
    docstring = _docstring_of(a.comments)
    raw: dict = {
        "id": attr_id,
        "name": a.name,
        "type": a.type,
        "docstring": docstring,
        "start_line": a.start_line,
        "end_line": a.end_line,
        "_module": file_key,
    }
    return prune(raw)
259
+
260
+
261
def _variable_props(v: PyVariableDeclaration, var_id: str, file_key: str) -> Props:
    """Flatten a variable declaration into node properties.

    ``var_id`` is stored under ``id`` and ``file_key`` under ``_module``. The
    mapping is passed through ``prune`` before being returned.
    """
    raw: dict = {
        "id": var_id,
        "name": v.name,
        "type": v.type,
        "initializer": v.initializer,
        "scope": v.scope,
        "start_line": v.start_line,
        "end_line": v.end_line,
        "_module": file_key,
    }
    return prune(raw)
274
+
275
+
276
def _call_site_props(s: PyCallsite, file_key: str) -> Props:
    """Flatten a call site into node properties.

    The ``id`` is ``<file_key>#<start line>:<start col>-<end line>:<end col>``.
    Argument types are copied into a fresh list and ``file_key`` is stored
    under ``_module``. The mapping is passed through ``prune`` before being
    returned.
    """
    span = f"{s.start_line}:{s.start_column}-{s.end_line}:{s.end_column}"
    argument_types = list(s.argument_types or [])
    raw: dict = {
        "id": f"{file_key}#{span}",
        "method_name": s.method_name,
        "receiver_expr": s.receiver_expr,
        "receiver_type": s.receiver_type,
        "argument_types": argument_types,
        "return_type": s.return_type,
        "callee_signature": s.callee_signature,
        "is_constructor_call": s.is_constructor_call,
        "start_line": s.start_line,
        "start_column": s.start_column,
        "end_line": s.end_line,
        "end_column": s.end_column,
        "_module": file_key,
    }
    return prune(raw)
295
+
296
+
297
def _call_edge_props(weight: int, provenance: List[str]) -> Props:
    """Build the property map of a ``PY_CALLS`` edge.

    ``provenance`` is copied into a fresh list; the mapping is passed through
    ``prune`` before being returned.
    """
    raw: dict = {"weight": weight, "provenance": list(provenance)}
    return prune(raw)
299
+
300
+
301
def _docstring_of(comments: Optional[List[PyComment]]) -> Optional[str]:
    """Join the content of all docstring comments with newlines.

    Returns ``None`` when ``comments`` is empty/``None`` or when none of the
    comments is flagged ``is_docstring``.
    """
    if not comments:
        return None
    pieces: List[str] = []
    for comment in comments:
        if comment.is_docstring:
            pieces.append(comment.content)
    if not pieces:
        return None
    return "\n".join(pieces)
304
+
305
+
306
def _stringify_if(value: Any) -> Optional[str]:
    """JSON-encode ``value``, or return ``None`` when there is nothing to encode.

    ``None`` and empty lists/dicts yield ``None``. Anything else is dumped
    with sorted keys, using ``_jsonable`` as the fallback encoder for objects
    the ``json`` module cannot serialize natively.
    """
    nothing_to_encode = value is None or (
        isinstance(value, (list, dict)) and len(value) == 0
    )
    if nothing_to_encode:
        return None
    return json.dumps(value, default=_jsonable, sort_keys=True)
313
+
314
+
315
def _jsonable(o: Any) -> Any:
    """Fallback encoder for ``json.dumps``.

    Objects exposing ``model_dump`` are converted with it; failing that,
    objects exposing ``dict`` are converted with that. Everything else —
    including ``Path`` instances — is rendered with ``str``.
    """
    # Order matters: ``model_dump`` takes precedence over ``dict``.
    for dumper_name in ("model_dump", "dict"):
        if hasattr(o, dumper_name):
            return getattr(o, dumper_name)()
    return str(o)