codeanalyzer-python 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeanalyzer/__main__.py CHANGED
@@ -7,13 +7,18 @@ from codeanalyzer.core import Codeanalyzer
7
7
  from codeanalyzer.utils import _set_log_level, logger
8
8
  from codeanalyzer.config import OutputFormat
9
9
  from codeanalyzer.schema import model_dump_json
10
- from codeanalyzer.options import AnalysisOptions
10
+ from codeanalyzer.options import AnalysisOptions, EmitTarget
11
11
 
12
12
 
13
13
  def main(
14
14
  input: Annotated[
15
- Path, typer.Option("-i", "--input", help="Path to the project root directory.")
16
- ],
15
+ Optional[Path],
16
+ typer.Option(
17
+ "-i",
18
+ "--input",
19
+ help="Path to the project root directory (not required for --emit schema).",
20
+ ),
21
+ ] = None,
17
22
  output: Annotated[
18
23
  Optional[Path],
19
24
  typer.Option("-o", "--output", help="Output directory for artifacts."),
@@ -23,14 +28,61 @@ def main(
23
28
  typer.Option(
24
29
  "-f",
25
30
  "--format",
26
- help="Output format: json or msgpack.",
31
+ help="Output format for --emit json: json or msgpack.",
27
32
  case_sensitive=False,
28
33
  ),
29
34
  ] = OutputFormat.JSON,
30
- analysis_level: Annotated[
31
- int,
32
- typer.Option("-a", "--analysis-level", help="1: symbol table, 2: call graph."),
33
- ] = 1,
35
+ emit: Annotated[
36
+ EmitTarget,
37
+ typer.Option(
38
+ "--emit",
39
+ help="Output target: json (analysis.json, default) | neo4j (graph.cypher or live "
40
+ "Bolt push) | schema (the Neo4j schema.json contract).",
41
+ case_sensitive=False,
42
+ ),
43
+ ] = EmitTarget.JSON,
44
+ app_name: Annotated[
45
+ Optional[str],
46
+ typer.Option(
47
+ "--app-name",
48
+ help="Logical application name for the graph :PyApplication anchor "
49
+ "(default: input dir name).",
50
+ ),
51
+ ] = None,
52
+ neo4j_uri: Annotated[
53
+ Optional[str],
54
+ typer.Option(
55
+ "--neo4j-uri",
56
+ envvar="NEO4J_URI",
57
+ help="Push the graph to a live Neo4j over Bolt (incremental); omit to write "
58
+ "graph.cypher. [env: NEO4J_URI]",
59
+ ),
60
+ ] = None,
61
+ neo4j_user: Annotated[
62
+ str,
63
+ typer.Option(
64
+ "--neo4j-user",
65
+ envvar="NEO4J_USERNAME",
66
+ help="Neo4j username. [env: NEO4J_USERNAME]",
67
+ ),
68
+ ] = "neo4j",
69
+ neo4j_password: Annotated[
70
+ str,
71
+ typer.Option(
72
+ "--neo4j-password",
73
+ envvar="NEO4J_PASSWORD",
74
+ help="Neo4j password. Prefer the env var over the flag (the flag is visible in shell "
75
+ "history / process list). [env: NEO4J_PASSWORD]",
76
+ ),
77
+ ] = "neo4j",
78
+ neo4j_database: Annotated[
79
+ Optional[str],
80
+ typer.Option(
81
+ "--neo4j-database",
82
+ envvar="NEO4J_DATABASE",
83
+ help="Neo4j database name (default: server default). [env: NEO4J_DATABASE]",
84
+ ),
85
+ ] = None,
34
86
  using_codeql: Annotated[
35
87
  bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.")
36
88
  ] = False,
@@ -82,7 +134,12 @@ def main(
82
134
  input=input,
83
135
  output=output,
84
136
  format=format,
85
- analysis_level=analysis_level,
137
+ emit=emit,
138
+ app_name=app_name,
139
+ neo4j_uri=neo4j_uri,
140
+ neo4j_user=neo4j_user,
141
+ neo4j_password=neo4j_password,
142
+ neo4j_database=neo4j_database,
86
143
  using_codeql=using_codeql,
87
144
  using_ray=using_ray,
88
145
  rebuild_analysis=rebuild_analysis,
@@ -94,6 +151,18 @@ def main(
94
151
  )
95
152
 
96
153
  _set_log_level(options.verbosity)
154
+
155
+ # The schema contract is a static artifact — no project analysis required.
156
+ if options.emit == EmitTarget.SCHEMA:
157
+ from codeanalyzer.neo4j.emit import emit_schema
158
+
159
+ emit_schema(options.output)
160
+ return
161
+
162
+ # Every other target requires an input project.
163
+ if options.input is None:
164
+ logger.error("Missing option '-i' / '--input' (required for --emit json | neo4j).")
165
+ raise typer.Exit(code=1)
97
166
  if not options.input.exists():
98
167
  logger.error(f"Input path '{options.input}' does not exist.")
99
168
  raise typer.Exit(code=1)
@@ -117,7 +186,11 @@ def main(
117
186
  with Codeanalyzer(options) as analyzer:
118
187
  artifacts = analyzer.analyze()
119
188
 
120
- if options.output is None:
189
+ if options.emit == EmitTarget.NEO4J:
190
+ from codeanalyzer.neo4j.emit import emit_neo4j
191
+
192
+ emit_neo4j(artifacts, options)
193
+ elif options.output is None:
121
194
  print(model_dump_json(artifacts, separators=(",", ":")))
122
195
  else:
123
196
  options.output.mkdir(parents=True, exist_ok=True)
@@ -147,7 +220,7 @@ def _write_output(artifacts, output_dir: Path, format: OutputFormat):
147
220
 
148
221
  app = typer.Typer(
149
222
  callback=main,
150
- name="codeanalyzer",
223
+ name="canpy",
151
224
  help="Static Analysis on Python source code using Jedi, CodeQL and Tree sitter.",
152
225
  invoke_without_command=True,
153
226
  no_args_is_help=True,
@@ -156,5 +229,20 @@ app = typer.Typer(
156
229
  pretty_exceptions_show_locals=False,
157
230
  )
158
231
 
232
def deprecated_main() -> None:
    """Run the CLI under its legacy ``codeanalyzer`` command name.

    A single deprecation notice is written to stderr first; stdout is left
    untouched so that piped output (e.g. ``--emit schema``) stays clean. The
    Typer ``app`` is then invoked exactly as the new entry point would. This
    alias exists for backwards compatibility only and is slated for removal
    in a future release.
    """
    import sys

    notice = (
        "codeanalyzer: this command has been renamed to `canpy`. The `codeanalyzer` "
        "alias is deprecated and will be removed in a future release — please use `canpy`."
    )
    print(notice, file=sys.stderr)
    app()
245
+
246
+
159
247
  if __name__ == "__main__":
160
248
  app()
codeanalyzer/core.py CHANGED
@@ -9,7 +9,14 @@ from typing import Any, Dict, Optional, Union, List
9
9
  import ray
10
10
  from codeanalyzer.utils import logger
11
11
  from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json
12
+ from codeanalyzer.schema.py_schema import PyCallEdge
13
+ from codeanalyzer.semantic_analysis.call_graph import (
14
+ jedi_call_graph_edges,
15
+ merge_edges,
16
+ resolve_unresolved_constructors,
17
+ )
12
18
  from codeanalyzer.semantic_analysis.codeql import CodeQLLoader
19
+ from codeanalyzer.semantic_analysis.codeql.codeql_analysis import CodeQL
13
20
  from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import CodeQLExceptions
14
21
  from codeanalyzer.syntactic_analysis.exceptions import SymbolTableBuilderRayError
15
22
  from codeanalyzer.syntactic_analysis.symbol_table_builder import SymbolTableBuilder
@@ -49,7 +56,6 @@ class Codeanalyzer:
49
56
 
50
57
  def __init__(self, options: AnalysisOptions) -> None:
51
58
  self.options = options
52
- self.analysis_depth = options.analysis_level
53
59
  self.project_dir = Path(options.input).resolve()
54
60
  self.skip_tests = options.skip_tests
55
61
  self.using_codeql = options.using_codeql
@@ -60,6 +66,7 @@ class Codeanalyzer:
60
66
  self.clear_cache = options.clear_cache
61
67
  self.db_path: Optional[Path] = None
62
68
  self.codeql_bin: Optional[Path] = None
69
+ self.codeql_packs_dir: Optional[Path] = None
63
70
  self.virtualenv: Optional[Path] = None
64
71
  self.using_ray: bool = options.using_ray
65
72
  self.file_name: Optional[Path] = options.file_name
@@ -292,6 +299,15 @@ class Codeanalyzer:
292
299
 
293
300
  if self.using_codeql:
294
301
  logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}")
302
+
303
+ # Resolve the CLI binary before anything else uses it: DB build
304
+ # below needs it, and so does every subsequent query run.
305
+ self.codeql_bin = self._ensure_codeql_bin()
306
+ # Download the standard query library pack (idempotent). The
307
+ # CLI install ships only the language extractors; the
308
+ # ``codeql/python-all`` library pack must be fetched separately.
309
+ self.codeql_packs_dir = self._ensure_codeql_packs(self.codeql_bin)
310
+
295
311
  cache_root = self.cache_dir / "codeql"
296
312
  cache_root.mkdir(parents=True, exist_ok=True)
297
313
  self.db_path = cache_root / f"{self.project_dir.name}-db"
@@ -310,19 +326,6 @@ class Codeanalyzer:
310
326
  if self.rebuild_analysis or not is_cache_valid():
311
327
  logger.info("Creating new CodeQL database...")
312
328
 
313
- codeql_in_path = shutil.which("codeql")
314
- if codeql_in_path:
315
- self.codeql_bin = Path(codeql_in_path)
316
- else:
317
- self.codeql_bin = CodeQLLoader.download_and_extract_codeql(
318
- self.cache_dir / "codeql" / "bin"
319
- )
320
-
321
- if not shutil.which(str(self.codeql_bin)):
322
- raise FileNotFoundError(
323
- f"CodeQL binary not executable: {self.codeql_bin}"
324
- )
325
-
326
329
  cmd = [
327
330
  str(self.codeql_bin),
328
331
  "database",
@@ -375,8 +378,27 @@ class Codeanalyzer:
375
378
  # Build symbol table from cached application if available (if no available, the build a new one)
376
379
  symbol_table = self._build_symbol_table(cached_pyapplication.symbol_table if cached_pyapplication else {})
377
380
 
381
+ # Build the call graph in four steps:
382
+ # 1. Run CodeQL (when enabled). Produces resolved edges with
383
+ # ``provenance=["codeql"]`` and augments ``PyCallsite``s
384
+ # in-place — filling ``callee_signature`` for sites Jedi
385
+ # couldn't resolve.
386
+ # 2. Heuristic fallback for constructor calls neither Jedi nor
387
+ # CodeQL could resolve (commonly classes nested inside
388
+ # functions). Walks the symbol table by class short-name +
389
+ # scope and writes ``<class>.__init__`` into the site.
390
+ # 3. Derive Jedi edges from the now-fully-augmented symbol
391
+ # table — these reflect every resolution the symbol table
392
+ # contains, regardless of which pass put it there.
393
+ # 4. Merge with CodeQL edges; provenance unions for edges both
394
+ # backends saw.
395
+ codeql_edges = self._get_call_graph(symbol_table, augment_sites=True)
396
+ resolve_unresolved_constructors(symbol_table)
397
+ jedi_edges = jedi_call_graph_edges(symbol_table)
398
+ call_graph = merge_edges(jedi_edges, codeql_edges)
399
+
378
400
  # Recreate pyapplication
379
- app = PyApplication.builder().symbol_table(symbol_table).build()
401
+ app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
380
402
 
381
403
  # Save to cache
382
404
  self._save_analysis_cache(app, cache_file)
@@ -579,7 +601,120 @@ class Codeanalyzer:
579
601
  logger.info("✅ Symbol table generation complete.")
580
602
  return symbol_table
581
603
 
582
- def _get_call_graph(self) -> Dict[str, Any]:
583
- """Retrieve call graph from CodeQL database."""
584
- logger.warning("Call graph extraction not yet implemented.")
585
- return {}
604
def _ensure_codeql_packs(self, codeql_bin: Path) -> Path:
    """Create (once) a qlpack that depends on ``codeql/python-all``.

    A ``qlpack.yml`` declaring the dependency is written under
    ``<cache_dir>/codeql/qlpack/`` if it is not already there, and
    ``codeql pack install`` is run in that directory unless a
    ``codeql-pack.lock.yml`` is already present (its presence is taken as
    "dependencies already installed").

    Args:
        codeql_bin: Path to the CodeQL CLI executable used for the install.

    Returns:
        The pack directory. Callers are expected to place their queries
        inside it so that ``import python`` resolves against the installed
        dependencies.

    Raises:
        CodeQLExceptions.CodeQLDatabaseBuildException: ``codeql pack install``
            exited with a non-zero status; the message carries its stderr.
    """
    pack_dir = self.cache_dir / "codeql" / "qlpack"
    pack_dir.mkdir(parents=True, exist_ok=True)

    manifest = pack_dir / "qlpack.yml"
    if not manifest.exists():
        manifest_lines = (
            "name: codeanalyzer-deps\n",
            "version: 1.0.0\n",
            "dependencies:\n",
            ' codeql/python-all: "*"\n',
        )
        manifest.write_text("".join(manifest_lines))

    # The lock file is only written by a successful install, so it doubles
    # as the "nothing to do" marker.
    if (pack_dir / "codeql-pack.lock.yml").exists():
        logger.debug(f"CodeQL pack dependencies already installed in {pack_dir}")
        return pack_dir

    logger.info(f"Installing CodeQL pack dependencies in {pack_dir}.")
    completed = subprocess.run(
        [str(codeql_bin), "pack", "install", str(pack_dir)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if completed.returncode != 0:
        stderr_text = (completed.stderr or b"").decode(errors="replace")
        raise CodeQLExceptions.CodeQLDatabaseBuildException(
            f"Failed to install CodeQL pack dependencies:\n{stderr_text}"
        )
    return pack_dir
650
+
651
def _ensure_codeql_bin(self) -> Path:
    """Locate (or download) the CodeQL CLI binary for this project.

    Resolution order:
      1. An executable file named ``codeql`` anywhere under
         ``<cache_dir>/codeql/bin/`` -- reused across runs.
      2. ``codeql`` on the user's PATH -- used verbatim.
      3. Otherwise, download into ``<cache_dir>/codeql/bin/``.

    The project-local cache is consulted before PATH so that a previously
    installed copy wins over whatever the OS ships.

    Returns:
        Path to a CodeQL executable.

    Raises:
        FileNotFoundError: the downloaded binary is missing or not executable.
    """
    bin_root = self.cache_dir / "codeql" / "bin"
    bin_root.mkdir(parents=True, exist_ok=True)

    # Filter on executability *inside* the search: a non-executable file that
    # merely happens to be named ``codeql`` must not mask a usable binary
    # elsewhere in the tree. Sorting makes the pick stable across runs, since
    # rglob's traversal order is not specified.
    cached = next(
        (
            candidate
            for candidate in sorted(bin_root.rglob("codeql"))
            if candidate.is_file() and os.access(candidate, os.X_OK)
        ),
        None,
    )
    if cached is not None:
        logger.debug(f"Reusing cached CodeQL CLI at {cached}")
        return cached.resolve()

    on_path = shutil.which("codeql")
    if on_path:
        logger.debug(f"Using CodeQL CLI from PATH at {on_path}")
        return Path(on_path)

    logger.info(f"CodeQL CLI not found; downloading into {bin_root}.")
    downloaded = CodeQLLoader.download_and_extract_codeql(bin_root)
    if not downloaded.exists() or not os.access(downloaded, os.X_OK):
        raise FileNotFoundError(
            f"CodeQL binary not executable after download: {downloaded}"
        )
    return downloaded
687
+
688
+ def _get_call_graph(
689
+ self,
690
+ symbol_table: Dict[str, PyModule],
691
+ augment_sites: bool = False,
692
+ ) -> List[PyCallEdge]:
693
+ """Build CodeQL-resolved call edges and optionally augment sites.
694
+
695
+ Returns an empty list when CodeQL isn't enabled or the database
696
+ isn't available. Edges carry ``provenance=["codeql"]`` — merge
697
+ with Jedi-derived edges via ``call_graph.merge_edges``.
698
+
699
+ When ``augment_sites`` is True, also mutates
700
+ ``PyCallable.call_sites`` in the symbol table to backfill
701
+ ``callee_signature`` for sites Jedi couldn't resolve. The single
702
+ CodeQL query is shared (cached on the ``CodeQL`` instance) so
703
+ this costs no extra DB work.
704
+ """
705
+ if not self.using_codeql or self.db_path is None:
706
+ return []
707
+ try:
708
+ cq = CodeQL(
709
+ self.project_dir,
710
+ self.db_path,
711
+ codeql_bin=self.codeql_bin,
712
+ codeql_packs_dir=self.codeql_packs_dir,
713
+ )
714
+ edges = cq.build_call_graph_edges(symbol_table)
715
+ if augment_sites:
716
+ cq.augment_call_sites(symbol_table)
717
+ return edges
718
+ except Exception as exc:
719
+ logger.warning(f"CodeQL call-graph extraction failed: {exc}")
720
+ return []
@@ -0,0 +1,46 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """Neo4j output: a pure projection of the :class:`PyApplication` IR to graph rows,
18
+ plus the two writers (cypher snapshot / bolt incremental). Nothing here runs
19
+ unless ``--emit neo4j`` (or ``--emit schema``) is selected.
20
+ """
21
+ from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer
22
+ from codeanalyzer.neo4j.catalog import (
23
+ MARKER_LABELS,
24
+ NODE_LABELS,
25
+ REL_TYPES,
26
+ SCHEMA_VERSION,
27
+ build_schema_document,
28
+ )
29
+ from codeanalyzer.neo4j.cypher import render_cypher
30
+ from codeanalyzer.neo4j.project import project
31
+ from codeanalyzer.neo4j.rows import EdgeRow, GraphRows, NodeRow
32
+
33
+ __all__ = [
34
+ "project",
35
+ "render_cypher",
36
+ "bolt_writer",
37
+ "BoltConfig",
38
+ "build_schema_document",
39
+ "SCHEMA_VERSION",
40
+ "NODE_LABELS",
41
+ "REL_TYPES",
42
+ "MARKER_LABELS",
43
+ "GraphRows",
44
+ "NodeRow",
45
+ "EdgeRow",
46
+ ]
@@ -0,0 +1,223 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """The incremental writer: push :class:`GraphRows` into a live Neo4j over Bolt.
18
+ Unlike the snapshot writer, this one reads the DB's current state and updates
19
+ only what changed.
20
+
21
+ Algorithm (the module subgraph is the unit of idempotent replacement):
22
+ 1. ensure constraints + indexes.
23
+ 2. diff each module's ``content_hash`` against the DB → the set of changed modules.
24
+ 3. per changed module, in a transaction: delete the edges it owned (edges out of
25
+ its nodes), detach-delete the declarations it no longer emits, then upsert
26
+ its current nodes.
27
+ 4. upsert edges owned by changed modules (+ the shared edges).
28
+ 5. on a FULL run only, prune modules whose source file vanished.
29
+
30
+ Nodes are MERGE-upserted, never blindly deleted, so a declaration another
31
+ (unchanged) module still references survives and its incoming edges stay valid.
32
+ ``:PyExternal`` / ``:PyPackage`` / ``:PyDecorator`` are shared (no ``_module``) and are
33
+ MERGE-only.
34
+
35
+ The ``neo4j`` driver is imported lazily so it stays an optional dependency and
36
+ off the default (json) output path entirely.
37
+ """
38
+ from __future__ import annotations
39
+
40
+ from dataclasses import dataclass
41
+ from typing import Dict, List, Optional
42
+
43
+ from codeanalyzer.neo4j.rows import EdgeRow, GraphRows, NodeRow, chunk
44
+ from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
45
+ from codeanalyzer.utils import logger
46
+
47
# Variable-length relationship pattern (one or more hops over the listed
# relationship types). It is interpolated into the orphan-prune query to reach
# every node hanging below a module.
DESCENDANTS = "[:PY_DECLARES|PY_HAS_METHOD|PY_HAS_ATTRIBUTE|PY_DECLARES_VAR|PY_HAS_CALLSITE*1..]"
# Number of rows sent per UNWIND batch by the node/edge upsert helpers.
BATCH = 1000
49
+
50
+
51
@dataclass
class BoltConfig:
    """Connection settings for pushing the graph to Neo4j over Bolt."""

    # Bolt URI handed to ``GraphDatabase.driver``.
    uri: str
    # Username half of the basic-auth pair.
    user: str
    # Password half of the basic-auth pair; never shown by ``repr``.
    password: str
    # Target database name; ``None`` lets the driver/server pick its default.
    database: Optional[str] = None

    def __repr__(self) -> str:
        """Return a debug representation with the password masked.

        The dataclass-generated ``__repr__`` would print the password in
        clear text, which leaks the credential as soon as a config object is
        logged or shown in a traceback. ``@dataclass`` keeps an explicitly
        defined ``__repr__``, so only the representation changes; fields,
        equality and the constructor are untouched.
        """
        return (
            f"{type(self).__name__}(uri={self.uri!r}, user={self.user!r}, "
            f"password='***', database={self.database!r})"
        )
57
+
58
+
59
def bolt_writer(rows: GraphRows, cfg: BoltConfig, full_run: bool) -> None:
    """Incrementally push ``rows`` into a live Neo4j database over Bolt.

    Steps, in order:
      1. Run every statement in ``CONSTRAINTS`` and ``INDEXES``.
      2. Partition nodes by their ``_module`` prop (nodes without a string
         ``_module`` are "shared") and diff each module's ``content_hash``
         against the value stored on the matching ``:PyModule`` node.
      3. Upsert all shared nodes.
      4. For each changed module: delete its outgoing edges and any of its
         nodes that are no longer emitted (one write transaction), then
         upsert its current nodes.
      5. Upsert edges whose source node is shared/unknown or belongs to a
         changed module.
      6. On a full run only, delete ``:PyModule`` nodes (and their
         descendants) whose ``file_key`` is absent from ``rows``.

    Args:
        rows: Projected graph rows (nodes and edges) to write.
        cfg: Bolt connection settings.
        full_run: True when ``rows`` covers the whole project; enables the
            orphan prune in step 6. A targeted run cannot tell a deleted file
            from one that simply was not analysed, so pruning is skipped.

    Raises:
        RuntimeError: the optional ``neo4j`` driver is not installed.
    """
    try:
        import neo4j  # noqa: WPS433 (lazy, optional dependency)
    except ImportError as exc:  # pragma: no cover - exercised only without the extra
        raise RuntimeError(
            "The 'neo4j' driver is required for '--emit neo4j --neo4j-uri'. "
            "Install it with: pip install 'codeanalyzer-python[neo4j]'"
        ) from exc

    driver = neo4j.GraphDatabase.driver(cfg.uri, auth=(cfg.user, cfg.password))
    session_kwargs = {"database": cfg.database} if cfg.database else {}

    def session():
        return driver.session(**session_kwargs)

    try:
        # 1. schema (DDL runs in its own autocommit transactions).
        with session() as s:
            for stmt in [*CONSTRAINTS, *INDEXES]:
                s.run(stmt)

        # Partition nodes by owning module; shared nodes have no _module.
        by_module: Dict[str, List[NodeRow]] = {}
        shared: List[NodeRow] = []
        module_of: Dict[str, str] = {}  # node value → owning module
        for n in rows.nodes:
            m = n.props.get("_module")
            if isinstance(m, str):
                by_module.setdefault(m, []).append(n)
                module_of[n.value] = m
            else:
                shared.append(n)

        # 2. diff content_hash.
        db_hash: Dict[str, Optional[str]] = {}
        with session() as s:
            res = s.run("MATCH (m:PyModule) RETURN m.file_key AS k, m.content_hash AS h")
            for rec in res:
                db_hash[rec["k"]] = rec["h"]
        changed = set()
        for m, nodes in by_module.items():
            row_hash = _hash_of(nodes, m)
            # A module is "changed" when it is new to the DB, carries no usable
            # hash (cannot prove it is unchanged), or its hash differs.
            if m not in db_hash or row_hash is None or row_hash != db_hash.get(m):
                changed.add(m)
        logger.info(
            f"neo4j(bolt): {len(by_module)} modules ({len(changed)} changed), "
            f"{len(shared)} shared nodes, {len(rows.edges)} edges"
        )

        # 3. shared nodes are always upserted (MERGE-only).
        _upsert_nodes(session, neo4j, shared)

        # 4. per changed module: purge owned edges + vanished decls, then upsert its nodes.
        # NOTE(review): nodes are written here but edges only in step 5. If the
        # module's content_hash travels on its :PyModule node (as step 2
        # suggests) and the run dies in between, the next run would see an
        # unchanged hash and not re-emit the purged edges -- confirm whether
        # that window matters.
        for m in changed:
            nodes = by_module[m]
            keys = [n.value for n in nodes]
            with session() as s:
                # Defaults bind the current loop values so a driver-side retry
                # of the transaction function sees the right module.
                def _purge(tx, module=m, node_keys=keys):
                    tx.run("MATCH (x {_module: $m})-[r]->() DELETE r", m=module)
                    tx.run(
                        "MATCH (x {_module: $m}) "
                        "WHERE NOT coalesce(x.signature, x.id, x.file_key) IN $keys "
                        "DETACH DELETE x",
                        m=module,
                        keys=node_keys,
                    )

                s.execute_write(_purge)
            _upsert_nodes(session, neo4j, nodes)

        # 5. upsert edges owned by a changed module (owner = source node's module) or shared.
        edges = [
            e
            for e in rows.edges
            if module_of.get(e.from_ref.value) is None or module_of.get(e.from_ref.value) in changed
        ]
        _upsert_edges(session, neo4j, edges)

        # 6. orphan prune — only safe on a full run (a targeted run can't tell deleted from untargeted).
        if full_run:
            present = list(by_module.keys())
            with session() as s:
                # count(DISTINCT m): the OPTIONAL MATCH fans each module out to
                # one row per descendant path, so a plain count(m) would report
                # rows rather than modules.
                res = s.run(
                    "MATCH (m:PyModule) WHERE NOT m.file_key IN $present "
                    f"OPTIONAL MATCH (m)-{DESCENDANTS}->(x) DETACH DELETE x, m "
                    "RETURN count(DISTINCT m) AS pruned",
                    present=present,
                )
                pruned = res.single()
            pruned_count = pruned["pruned"] if pruned else 0
            logger.info(f"neo4j(bolt): pruned {pruned_count} vanished module(s)")
        else:
            logger.info(
                "neo4j(bolt): targeted run — orphan pruning skipped (deleted files not removed)"
            )
    finally:
        driver.close()
156
+
157
+
158
+ # ----------------------------------------------------------------------------------------------
159
+ # Batched upserts
160
+ # ----------------------------------------------------------------------------------------------
161
+
162
+
163
+ def _upsert_nodes(session, neo4j, nodes: List[NodeRow]) -> None:
164
+ groups: Dict[str, List[NodeRow]] = {}
165
+ for n in nodes:
166
+ groups.setdefault(f"{':'.join(n.labels)}|{n.key_prop}", []).append(n)
167
+
168
+ for group in groups.values():
169
+ labels = group[0].labels
170
+ key_prop = group[0].key_prop
171
+ set_labels = f", n:{':'.join(labels[1:])}" if len(labels) > 1 else ""
172
+ cypher = (
173
+ f"UNWIND $rows AS row MERGE (n:{labels[0]} {{{key_prop}: row.k}}) "
174
+ f"SET n += row.p{set_labels}"
175
+ )
176
+ for batch in chunk(group, BATCH):
177
+ payload = [{"k": n.value, "p": _to_params(n.props, neo4j)} for n in batch]
178
+ with session() as s:
179
+ s.run(cypher, rows=payload)
180
+
181
+
182
+ def _upsert_edges(session, neo4j, edges: List[EdgeRow]) -> None:
183
+ groups: Dict[str, List[EdgeRow]] = {}
184
+ for e in edges:
185
+ key = f"{e.type}|{e.from_ref.label}.{e.from_ref.key_prop}|{e.to_ref.label}.{e.to_ref.key_prop}"
186
+ groups.setdefault(key, []).append(e)
187
+
188
+ for group in groups.values():
189
+ first = group[0]
190
+ from_ref, to_ref = first.from_ref, first.to_ref
191
+ cypher = (
192
+ f"UNWIND $rows AS row "
193
+ f"MATCH (a:{from_ref.label} {{{from_ref.key_prop}: row.f}}) "
194
+ f"MATCH (b:{to_ref.label} {{{to_ref.key_prop}: row.t}}) "
195
+ f"MERGE (a)-[r:{first.type}]->(b) SET r += row.p"
196
+ )
197
+ for batch in chunk(group, BATCH):
198
+ payload = [
199
+ {"f": e.from_ref.value, "t": e.to_ref.value, "p": _to_params(e.props, neo4j)}
200
+ for e in batch
201
+ ]
202
+ with session() as s:
203
+ s.run(cypher, rows=payload)
204
+
205
+
206
+ # ----------------------------------------------------------------------------------------------
207
+ # Helpers
208
+ # ----------------------------------------------------------------------------------------------
209
+
210
+
211
+ def _hash_of(nodes: List[NodeRow], file_key: str) -> Optional[str]:
212
+ for n in nodes:
213
+ if n.labels[0] == "PyModule" and n.value == file_key:
214
+ h = n.props.get("content_hash")
215
+ return h if isinstance(h, str) else None
216
+ return None
217
+
218
+
219
+ def _to_params(props, neo4j) -> dict:
220
+ """Map props to driver params. The Python driver already distinguishes int
221
+ from float, so unlike the JS driver no integer coercion is needed — this is a
222
+ straight passthrough kept symmetric with the snapshot writer's shape."""
223
+ return dict(props)