codeanalyzer-python 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeanalyzer/__main__.py +99 -11
- codeanalyzer/core.py +154 -19
- codeanalyzer/neo4j/__init__.py +46 -0
- codeanalyzer/neo4j/bolt.py +223 -0
- codeanalyzer/neo4j/catalog.py +245 -0
- codeanalyzer/neo4j/cypher.py +138 -0
- codeanalyzer/neo4j/emit.py +74 -0
- codeanalyzer/neo4j/project.py +322 -0
- codeanalyzer/neo4j/rows.py +176 -0
- codeanalyzer/neo4j/schema.py +39 -0
- codeanalyzer/options/__init__.py +2 -2
- codeanalyzer/options/options.py +20 -1
- codeanalyzer/schema/py_schema.py +20 -0
- codeanalyzer/semantic_analysis/call_graph.py +266 -0
- codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +318 -69
- codeanalyzer/semantic_analysis/codeql/codeql_loader.py +32 -4
- codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +51 -31
- codeanalyzer/syntactic_analysis/symbol_table_builder.py +87 -4
- codeanalyzer_python-0.2.0.dist-info/METADATA +393 -0
- codeanalyzer_python-0.2.0.dist-info/RECORD +39 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/WHEEL +1 -1
- codeanalyzer_python-0.2.0.dist-info/entry_points.txt +3 -0
- codeanalyzer/semantic_analysis/wala/__init__.py +0 -15
- codeanalyzer_python-0.1.13.dist-info/METADATA +0 -414
- codeanalyzer_python-0.1.13.dist-info/RECORD +0 -31
- codeanalyzer_python-0.1.13.dist-info/entry_points.txt +0 -2
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/NOTICE +0 -0
codeanalyzer/__main__.py
CHANGED
|
@@ -7,13 +7,18 @@ from codeanalyzer.core import Codeanalyzer
|
|
|
7
7
|
from codeanalyzer.utils import _set_log_level, logger
|
|
8
8
|
from codeanalyzer.config import OutputFormat
|
|
9
9
|
from codeanalyzer.schema import model_dump_json
|
|
10
|
-
from codeanalyzer.options import AnalysisOptions
|
|
10
|
+
from codeanalyzer.options import AnalysisOptions, EmitTarget
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def main(
|
|
14
14
|
input: Annotated[
|
|
15
|
-
Path,
|
|
16
|
-
|
|
15
|
+
Optional[Path],
|
|
16
|
+
typer.Option(
|
|
17
|
+
"-i",
|
|
18
|
+
"--input",
|
|
19
|
+
help="Path to the project root directory (not required for --emit schema).",
|
|
20
|
+
),
|
|
21
|
+
] = None,
|
|
17
22
|
output: Annotated[
|
|
18
23
|
Optional[Path],
|
|
19
24
|
typer.Option("-o", "--output", help="Output directory for artifacts."),
|
|
@@ -23,14 +28,61 @@ def main(
|
|
|
23
28
|
typer.Option(
|
|
24
29
|
"-f",
|
|
25
30
|
"--format",
|
|
26
|
-
help="Output format: json or msgpack.",
|
|
31
|
+
help="Output format for --emit json: json or msgpack.",
|
|
27
32
|
case_sensitive=False,
|
|
28
33
|
),
|
|
29
34
|
] = OutputFormat.JSON,
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
typer.Option(
|
|
33
|
-
|
|
35
|
+
emit: Annotated[
|
|
36
|
+
EmitTarget,
|
|
37
|
+
typer.Option(
|
|
38
|
+
"--emit",
|
|
39
|
+
help="Output target: json (analysis.json, default) | neo4j (graph.cypher or live "
|
|
40
|
+
"Bolt push) | schema (the Neo4j schema.json contract).",
|
|
41
|
+
case_sensitive=False,
|
|
42
|
+
),
|
|
43
|
+
] = EmitTarget.JSON,
|
|
44
|
+
app_name: Annotated[
|
|
45
|
+
Optional[str],
|
|
46
|
+
typer.Option(
|
|
47
|
+
"--app-name",
|
|
48
|
+
help="Logical application name for the graph :PyApplication anchor "
|
|
49
|
+
"(default: input dir name).",
|
|
50
|
+
),
|
|
51
|
+
] = None,
|
|
52
|
+
neo4j_uri: Annotated[
|
|
53
|
+
Optional[str],
|
|
54
|
+
typer.Option(
|
|
55
|
+
"--neo4j-uri",
|
|
56
|
+
envvar="NEO4J_URI",
|
|
57
|
+
help="Push the graph to a live Neo4j over Bolt (incremental); omit to write "
|
|
58
|
+
"graph.cypher. [env: NEO4J_URI]",
|
|
59
|
+
),
|
|
60
|
+
] = None,
|
|
61
|
+
neo4j_user: Annotated[
|
|
62
|
+
str,
|
|
63
|
+
typer.Option(
|
|
64
|
+
"--neo4j-user",
|
|
65
|
+
envvar="NEO4J_USERNAME",
|
|
66
|
+
help="Neo4j username. [env: NEO4J_USERNAME]",
|
|
67
|
+
),
|
|
68
|
+
] = "neo4j",
|
|
69
|
+
neo4j_password: Annotated[
|
|
70
|
+
str,
|
|
71
|
+
typer.Option(
|
|
72
|
+
"--neo4j-password",
|
|
73
|
+
envvar="NEO4J_PASSWORD",
|
|
74
|
+
help="Neo4j password. Prefer the env var over the flag (the flag is visible in shell "
|
|
75
|
+
"history / process list). [env: NEO4J_PASSWORD]",
|
|
76
|
+
),
|
|
77
|
+
] = "neo4j",
|
|
78
|
+
neo4j_database: Annotated[
|
|
79
|
+
Optional[str],
|
|
80
|
+
typer.Option(
|
|
81
|
+
"--neo4j-database",
|
|
82
|
+
envvar="NEO4J_DATABASE",
|
|
83
|
+
help="Neo4j database name (default: server default). [env: NEO4J_DATABASE]",
|
|
84
|
+
),
|
|
85
|
+
] = None,
|
|
34
86
|
using_codeql: Annotated[
|
|
35
87
|
bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.")
|
|
36
88
|
] = False,
|
|
@@ -82,7 +134,12 @@ def main(
|
|
|
82
134
|
input=input,
|
|
83
135
|
output=output,
|
|
84
136
|
format=format,
|
|
85
|
-
|
|
137
|
+
emit=emit,
|
|
138
|
+
app_name=app_name,
|
|
139
|
+
neo4j_uri=neo4j_uri,
|
|
140
|
+
neo4j_user=neo4j_user,
|
|
141
|
+
neo4j_password=neo4j_password,
|
|
142
|
+
neo4j_database=neo4j_database,
|
|
86
143
|
using_codeql=using_codeql,
|
|
87
144
|
using_ray=using_ray,
|
|
88
145
|
rebuild_analysis=rebuild_analysis,
|
|
@@ -94,6 +151,18 @@ def main(
|
|
|
94
151
|
)
|
|
95
152
|
|
|
96
153
|
_set_log_level(options.verbosity)
|
|
154
|
+
|
|
155
|
+
# The schema contract is a static artifact — no project analysis required.
|
|
156
|
+
if options.emit == EmitTarget.SCHEMA:
|
|
157
|
+
from codeanalyzer.neo4j.emit import emit_schema
|
|
158
|
+
|
|
159
|
+
emit_schema(options.output)
|
|
160
|
+
return
|
|
161
|
+
|
|
162
|
+
# Every other target requires an input project.
|
|
163
|
+
if options.input is None:
|
|
164
|
+
logger.error("Missing option '-i' / '--input' (required for --emit json | neo4j).")
|
|
165
|
+
raise typer.Exit(code=1)
|
|
97
166
|
if not options.input.exists():
|
|
98
167
|
logger.error(f"Input path '{options.input}' does not exist.")
|
|
99
168
|
raise typer.Exit(code=1)
|
|
@@ -117,7 +186,11 @@ def main(
|
|
|
117
186
|
with Codeanalyzer(options) as analyzer:
|
|
118
187
|
artifacts = analyzer.analyze()
|
|
119
188
|
|
|
120
|
-
if options.
|
|
189
|
+
if options.emit == EmitTarget.NEO4J:
|
|
190
|
+
from codeanalyzer.neo4j.emit import emit_neo4j
|
|
191
|
+
|
|
192
|
+
emit_neo4j(artifacts, options)
|
|
193
|
+
elif options.output is None:
|
|
121
194
|
print(model_dump_json(artifacts, separators=(",", ":")))
|
|
122
195
|
else:
|
|
123
196
|
options.output.mkdir(parents=True, exist_ok=True)
|
|
@@ -147,7 +220,7 @@ def _write_output(artifacts, output_dir: Path, format: OutputFormat):
|
|
|
147
220
|
|
|
148
221
|
app = typer.Typer(
|
|
149
222
|
callback=main,
|
|
150
|
-
name="
|
|
223
|
+
name="canpy",
|
|
151
224
|
help="Static Analysis on Python source code using Jedi, CodeQL and Tree sitter.",
|
|
152
225
|
invoke_without_command=True,
|
|
153
226
|
no_args_is_help=True,
|
|
@@ -156,5 +229,20 @@ app = typer.Typer(
|
|
|
156
229
|
pretty_exceptions_show_locals=False,
|
|
157
230
|
)
|
|
158
231
|
|
|
232
|
+
def deprecated_main() -> None:
|
|
233
|
+
"""Entry point for the legacy ``codeanalyzer`` command. Prints a one-line
|
|
234
|
+
deprecation notice to stderr (so piped stdout — e.g. ``--emit schema`` — stays
|
|
235
|
+
clean) and then runs the CLI unchanged. Kept for backwards compatibility; will
|
|
236
|
+
be removed in a future release."""
|
|
237
|
+
import sys
|
|
238
|
+
|
|
239
|
+
print(
|
|
240
|
+
"codeanalyzer: this command has been renamed to `canpy`. The `codeanalyzer` "
|
|
241
|
+
"alias is deprecated and will be removed in a future release — please use `canpy`.",
|
|
242
|
+
file=sys.stderr,
|
|
243
|
+
)
|
|
244
|
+
app()
|
|
245
|
+
|
|
246
|
+
|
|
159
247
|
if __name__ == "__main__":
|
|
160
248
|
app()
|
codeanalyzer/core.py
CHANGED
|
@@ -9,7 +9,14 @@ from typing import Any, Dict, Optional, Union, List
|
|
|
9
9
|
import ray
|
|
10
10
|
from codeanalyzer.utils import logger
|
|
11
11
|
from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json
|
|
12
|
+
from codeanalyzer.schema.py_schema import PyCallEdge
|
|
13
|
+
from codeanalyzer.semantic_analysis.call_graph import (
|
|
14
|
+
jedi_call_graph_edges,
|
|
15
|
+
merge_edges,
|
|
16
|
+
resolve_unresolved_constructors,
|
|
17
|
+
)
|
|
12
18
|
from codeanalyzer.semantic_analysis.codeql import CodeQLLoader
|
|
19
|
+
from codeanalyzer.semantic_analysis.codeql.codeql_analysis import CodeQL
|
|
13
20
|
from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import CodeQLExceptions
|
|
14
21
|
from codeanalyzer.syntactic_analysis.exceptions import SymbolTableBuilderRayError
|
|
15
22
|
from codeanalyzer.syntactic_analysis.symbol_table_builder import SymbolTableBuilder
|
|
@@ -49,7 +56,6 @@ class Codeanalyzer:
|
|
|
49
56
|
|
|
50
57
|
def __init__(self, options: AnalysisOptions) -> None:
|
|
51
58
|
self.options = options
|
|
52
|
-
self.analysis_depth = options.analysis_level
|
|
53
59
|
self.project_dir = Path(options.input).resolve()
|
|
54
60
|
self.skip_tests = options.skip_tests
|
|
55
61
|
self.using_codeql = options.using_codeql
|
|
@@ -60,6 +66,7 @@ class Codeanalyzer:
|
|
|
60
66
|
self.clear_cache = options.clear_cache
|
|
61
67
|
self.db_path: Optional[Path] = None
|
|
62
68
|
self.codeql_bin: Optional[Path] = None
|
|
69
|
+
self.codeql_packs_dir: Optional[Path] = None
|
|
63
70
|
self.virtualenv: Optional[Path] = None
|
|
64
71
|
self.using_ray: bool = options.using_ray
|
|
65
72
|
self.file_name: Optional[Path] = options.file_name
|
|
@@ -292,6 +299,15 @@ class Codeanalyzer:
|
|
|
292
299
|
|
|
293
300
|
if self.using_codeql:
|
|
294
301
|
logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}")
|
|
302
|
+
|
|
303
|
+
# Resolve the CLI binary before anything else uses it: DB build
|
|
304
|
+
# below needs it, and so does every subsequent query run.
|
|
305
|
+
self.codeql_bin = self._ensure_codeql_bin()
|
|
306
|
+
# Download the standard query library pack (idempotent). The
|
|
307
|
+
# CLI install ships only the language extractors; the
|
|
308
|
+
# ``codeql/python-all`` library pack must be fetched separately.
|
|
309
|
+
self.codeql_packs_dir = self._ensure_codeql_packs(self.codeql_bin)
|
|
310
|
+
|
|
295
311
|
cache_root = self.cache_dir / "codeql"
|
|
296
312
|
cache_root.mkdir(parents=True, exist_ok=True)
|
|
297
313
|
self.db_path = cache_root / f"{self.project_dir.name}-db"
|
|
@@ -310,19 +326,6 @@ class Codeanalyzer:
|
|
|
310
326
|
if self.rebuild_analysis or not is_cache_valid():
|
|
311
327
|
logger.info("Creating new CodeQL database...")
|
|
312
328
|
|
|
313
|
-
codeql_in_path = shutil.which("codeql")
|
|
314
|
-
if codeql_in_path:
|
|
315
|
-
self.codeql_bin = Path(codeql_in_path)
|
|
316
|
-
else:
|
|
317
|
-
self.codeql_bin = CodeQLLoader.download_and_extract_codeql(
|
|
318
|
-
self.cache_dir / "codeql" / "bin"
|
|
319
|
-
)
|
|
320
|
-
|
|
321
|
-
if not shutil.which(str(self.codeql_bin)):
|
|
322
|
-
raise FileNotFoundError(
|
|
323
|
-
f"CodeQL binary not executable: {self.codeql_bin}"
|
|
324
|
-
)
|
|
325
|
-
|
|
326
329
|
cmd = [
|
|
327
330
|
str(self.codeql_bin),
|
|
328
331
|
"database",
|
|
@@ -375,8 +378,27 @@ class Codeanalyzer:
|
|
|
375
378
|
# Build symbol table from the cached application if available (if not available, then build a new one)
|
|
376
379
|
symbol_table = self._build_symbol_table(cached_pyapplication.symbol_table if cached_pyapplication else {})
|
|
377
380
|
|
|
381
|
+
# Build the call graph in four steps:
|
|
382
|
+
# 1. Run CodeQL (when enabled). Produces resolved edges with
|
|
383
|
+
# ``provenance=["codeql"]`` and augments ``PyCallsite``s
|
|
384
|
+
# in-place — filling ``callee_signature`` for sites Jedi
|
|
385
|
+
# couldn't resolve.
|
|
386
|
+
# 2. Heuristic fallback for constructor calls neither Jedi nor
|
|
387
|
+
# CodeQL could resolve (commonly classes nested inside
|
|
388
|
+
# functions). Walks the symbol table by class short-name +
|
|
389
|
+
# scope and writes ``<class>.__init__`` into the site.
|
|
390
|
+
# 3. Derive Jedi edges from the now-fully-augmented symbol
|
|
391
|
+
# table — these reflect every resolution the symbol table
|
|
392
|
+
# contains, regardless of which pass put it there.
|
|
393
|
+
# 4. Merge with CodeQL edges; provenance unions for edges both
|
|
394
|
+
# backends saw.
|
|
395
|
+
codeql_edges = self._get_call_graph(symbol_table, augment_sites=True)
|
|
396
|
+
resolve_unresolved_constructors(symbol_table)
|
|
397
|
+
jedi_edges = jedi_call_graph_edges(symbol_table)
|
|
398
|
+
call_graph = merge_edges(jedi_edges, codeql_edges)
|
|
399
|
+
|
|
378
400
|
# Recreate pyapplication
|
|
379
|
-
app = PyApplication.builder().symbol_table(symbol_table).build()
|
|
401
|
+
app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
|
|
380
402
|
|
|
381
403
|
# Save to cache
|
|
382
404
|
self._save_analysis_cache(app, cache_file)
|
|
@@ -579,7 +601,120 @@ class Codeanalyzer:
|
|
|
579
601
|
logger.info("✅ Symbol table generation complete.")
|
|
580
602
|
return symbol_table
|
|
581
603
|
|
|
582
|
-
def
|
|
583
|
-
"""
|
|
584
|
-
|
|
585
|
-
|
|
604
|
+
def _ensure_codeql_packs(self, codeql_bin: Path) -> Path:
|
|
605
|
+
"""Materialize a qlpack that depends on ``codeql/python-all``.
|
|
606
|
+
|
|
607
|
+
The CodeQL CLI install ships only the language extractors — query
|
|
608
|
+
library packs (and their transitive dependencies like
|
|
609
|
+
``codeql/concepts``) must be resolved separately. The canonical
|
|
610
|
+
way is to declare the dependency in a ``qlpack.yml`` and run
|
|
611
|
+
``codeql pack install`` in that directory; CodeQL writes a
|
|
612
|
+
``codeql-pack.lock.yml`` and downloads everything needed.
|
|
613
|
+
|
|
614
|
+
We do this once per project under ``<cache_dir>/codeql/qlpack/``
|
|
615
|
+
and return that directory. The query runner then writes its
|
|
616
|
+
temporary ``.ql`` file inside this pack — colocation makes
|
|
617
|
+
``import python`` resolve without any ``--additional-packs`` or
|
|
618
|
+
``--search-path`` gymnastics.
|
|
619
|
+
"""
|
|
620
|
+
pack_dir = self.cache_dir / "codeql" / "qlpack"
|
|
621
|
+
pack_dir.mkdir(parents=True, exist_ok=True)
|
|
622
|
+
qlpack_yml = pack_dir / "qlpack.yml"
|
|
623
|
+
lock_file = pack_dir / "codeql-pack.lock.yml"
|
|
624
|
+
|
|
625
|
+
if not qlpack_yml.exists():
|
|
626
|
+
qlpack_yml.write_text(
|
|
627
|
+
"name: codeanalyzer-deps\n"
|
|
628
|
+
"version: 1.0.0\n"
|
|
629
|
+
"dependencies:\n"
|
|
630
|
+
' codeql/python-all: "*"\n'
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
if lock_file.exists():
|
|
634
|
+
logger.debug(f"CodeQL pack dependencies already installed in {pack_dir}")
|
|
635
|
+
return pack_dir
|
|
636
|
+
|
|
637
|
+
logger.info(f"Installing CodeQL pack dependencies in {pack_dir}.")
|
|
638
|
+
proc = subprocess.Popen(
|
|
639
|
+
[str(codeql_bin), "pack", "install", str(pack_dir)],
|
|
640
|
+
stdout=subprocess.PIPE,
|
|
641
|
+
stderr=subprocess.PIPE,
|
|
642
|
+
)
|
|
643
|
+
_, err = proc.communicate()
|
|
644
|
+
if proc.returncode != 0:
|
|
645
|
+
raise CodeQLExceptions.CodeQLDatabaseBuildException(
|
|
646
|
+
f"Failed to install CodeQL pack dependencies:\n"
|
|
647
|
+
f"{(err or b'').decode(errors='replace')}"
|
|
648
|
+
)
|
|
649
|
+
return pack_dir
|
|
650
|
+
|
|
651
|
+
def _ensure_codeql_bin(self) -> Path:
|
|
652
|
+
"""Locate (or download) the CodeQL CLI binary into the project cache.
|
|
653
|
+
|
|
654
|
+
Resolution order:
|
|
655
|
+
1. An existing binary inside ``<cache_dir>/codeql/bin/`` —
|
|
656
|
+
reused across runs on the same project.
|
|
657
|
+
2. ``codeql`` already on the user's PATH — picked up verbatim.
|
|
658
|
+
3. Otherwise, download into ``<cache_dir>/codeql/bin/``.
|
|
659
|
+
|
|
660
|
+
The project-local cache is preferred over PATH so the version we
|
|
661
|
+
installed earlier wins over whatever the OS ships — keeps behavior
|
|
662
|
+
deterministic when the user has both.
|
|
663
|
+
"""
|
|
664
|
+
bin_root = self.cache_dir / "codeql" / "bin"
|
|
665
|
+
bin_root.mkdir(parents=True, exist_ok=True)
|
|
666
|
+
|
|
667
|
+
existing = next(
|
|
668
|
+
(p for p in bin_root.rglob("codeql") if p.is_file()),
|
|
669
|
+
None,
|
|
670
|
+
)
|
|
671
|
+
if existing and os.access(existing, os.X_OK):
|
|
672
|
+
logger.debug(f"Reusing cached CodeQL CLI at {existing}")
|
|
673
|
+
return existing.resolve()
|
|
674
|
+
|
|
675
|
+
on_path = shutil.which("codeql")
|
|
676
|
+
if on_path:
|
|
677
|
+
logger.debug(f"Using CodeQL CLI from PATH at {on_path}")
|
|
678
|
+
return Path(on_path)
|
|
679
|
+
|
|
680
|
+
logger.info(f"CodeQL CLI not found; downloading into {bin_root}.")
|
|
681
|
+
downloaded = CodeQLLoader.download_and_extract_codeql(bin_root)
|
|
682
|
+
if not downloaded.exists() or not os.access(downloaded, os.X_OK):
|
|
683
|
+
raise FileNotFoundError(
|
|
684
|
+
f"CodeQL binary not executable after download: {downloaded}"
|
|
685
|
+
)
|
|
686
|
+
return downloaded
|
|
687
|
+
|
|
688
|
+
def _get_call_graph(
|
|
689
|
+
self,
|
|
690
|
+
symbol_table: Dict[str, PyModule],
|
|
691
|
+
augment_sites: bool = False,
|
|
692
|
+
) -> List[PyCallEdge]:
|
|
693
|
+
"""Build CodeQL-resolved call edges and optionally augment sites.
|
|
694
|
+
|
|
695
|
+
Returns an empty list when CodeQL isn't enabled or the database
|
|
696
|
+
isn't available. Edges carry ``provenance=["codeql"]`` — merge
|
|
697
|
+
with Jedi-derived edges via ``call_graph.merge_edges``.
|
|
698
|
+
|
|
699
|
+
When ``augment_sites`` is True, also mutates
|
|
700
|
+
``PyCallable.call_sites`` in the symbol table to backfill
|
|
701
|
+
``callee_signature`` for sites Jedi couldn't resolve. The single
|
|
702
|
+
CodeQL query is shared (cached on the ``CodeQL`` instance) so
|
|
703
|
+
this costs no extra DB work.
|
|
704
|
+
"""
|
|
705
|
+
if not self.using_codeql or self.db_path is None:
|
|
706
|
+
return []
|
|
707
|
+
try:
|
|
708
|
+
cq = CodeQL(
|
|
709
|
+
self.project_dir,
|
|
710
|
+
self.db_path,
|
|
711
|
+
codeql_bin=self.codeql_bin,
|
|
712
|
+
codeql_packs_dir=self.codeql_packs_dir,
|
|
713
|
+
)
|
|
714
|
+
edges = cq.build_call_graph_edges(symbol_table)
|
|
715
|
+
if augment_sites:
|
|
716
|
+
cq.augment_call_sites(symbol_table)
|
|
717
|
+
return edges
|
|
718
|
+
except Exception as exc:
|
|
719
|
+
logger.warning(f"CodeQL call-graph extraction failed: {exc}")
|
|
720
|
+
return []
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""Neo4j output: a pure projection of the :class:`PyApplication` IR to graph rows,
|
|
18
|
+
plus the two writers (cypher snapshot / bolt incremental). Nothing here runs
|
|
19
|
+
unless ``--emit neo4j`` (or ``--emit schema``) is selected.
|
|
20
|
+
"""
|
|
21
|
+
from codeanalyzer.neo4j.bolt import BoltConfig, bolt_writer
|
|
22
|
+
from codeanalyzer.neo4j.catalog import (
|
|
23
|
+
MARKER_LABELS,
|
|
24
|
+
NODE_LABELS,
|
|
25
|
+
REL_TYPES,
|
|
26
|
+
SCHEMA_VERSION,
|
|
27
|
+
build_schema_document,
|
|
28
|
+
)
|
|
29
|
+
from codeanalyzer.neo4j.cypher import render_cypher
|
|
30
|
+
from codeanalyzer.neo4j.project import project
|
|
31
|
+
from codeanalyzer.neo4j.rows import EdgeRow, GraphRows, NodeRow
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"project",
|
|
35
|
+
"render_cypher",
|
|
36
|
+
"bolt_writer",
|
|
37
|
+
"BoltConfig",
|
|
38
|
+
"build_schema_document",
|
|
39
|
+
"SCHEMA_VERSION",
|
|
40
|
+
"NODE_LABELS",
|
|
41
|
+
"REL_TYPES",
|
|
42
|
+
"MARKER_LABELS",
|
|
43
|
+
"GraphRows",
|
|
44
|
+
"NodeRow",
|
|
45
|
+
"EdgeRow",
|
|
46
|
+
]
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""The incremental writer: push :class:`GraphRows` into a live Neo4j over Bolt.
|
|
18
|
+
Unlike the snapshot writer, this one reads the DB's current state and updates
|
|
19
|
+
only what changed.
|
|
20
|
+
|
|
21
|
+
Algorithm (the module subgraph is the unit of idempotent replacement):
|
|
22
|
+
1. ensure constraints + indexes.
|
|
23
|
+
2. diff each module's ``content_hash`` against the DB → the set of changed modules.
|
|
24
|
+
3. per changed module, in a transaction: delete the edges it owned (edges out of
|
|
25
|
+
its nodes), detach-delete the declarations it no longer emits, then upsert
|
|
26
|
+
its current nodes.
|
|
27
|
+
4. upsert edges owned by changed modules (+ the shared edges).
|
|
28
|
+
5. on a FULL run only, prune modules whose source file vanished.
|
|
29
|
+
|
|
30
|
+
Nodes are MERGE-upserted, never blindly deleted, so a declaration another
|
|
31
|
+
(unchanged) module still references survives and its incoming edges stay valid.
|
|
32
|
+
``:PyExternal`` / ``:PyPackage`` / ``:PyDecorator`` are shared (no ``_module``) and are
|
|
33
|
+
MERGE-only.
|
|
34
|
+
|
|
35
|
+
The ``neo4j`` driver is imported lazily so it stays an optional dependency and
|
|
36
|
+
off the default (json) output path entirely.
|
|
37
|
+
"""
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
from dataclasses import dataclass
|
|
41
|
+
from typing import Dict, List, Optional
|
|
42
|
+
|
|
43
|
+
from codeanalyzer.neo4j.rows import EdgeRow, GraphRows, NodeRow, chunk
|
|
44
|
+
from codeanalyzer.neo4j.schema import CONSTRAINTS, INDEXES
|
|
45
|
+
from codeanalyzer.utils import logger
|
|
46
|
+
|
|
47
|
+
DESCENDANTS = "[:PY_DECLARES|PY_HAS_METHOD|PY_HAS_ATTRIBUTE|PY_DECLARES_VAR|PY_HAS_CALLSITE*1..]"
|
|
48
|
+
BATCH = 1000
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
|
|
52
|
+
class BoltConfig:
|
|
53
|
+
uri: str
|
|
54
|
+
user: str
|
|
55
|
+
password: str
|
|
56
|
+
database: Optional[str] = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def bolt_writer(rows: GraphRows, cfg: BoltConfig, full_run: bool) -> None:
|
|
60
|
+
try:
|
|
61
|
+
import neo4j # noqa: WPS433 (lazy, optional dependency)
|
|
62
|
+
except ImportError as exc: # pragma: no cover - exercised only without the extra
|
|
63
|
+
raise RuntimeError(
|
|
64
|
+
"The 'neo4j' driver is required for '--emit neo4j --neo4j-uri'. "
|
|
65
|
+
"Install it with: pip install 'codeanalyzer-python[neo4j]'"
|
|
66
|
+
) from exc
|
|
67
|
+
|
|
68
|
+
driver = neo4j.GraphDatabase.driver(cfg.uri, auth=(cfg.user, cfg.password))
|
|
69
|
+
session_kwargs = {"database": cfg.database} if cfg.database else {}
|
|
70
|
+
|
|
71
|
+
def session():
|
|
72
|
+
return driver.session(**session_kwargs)
|
|
73
|
+
|
|
74
|
+
try:
|
|
75
|
+
# 1. schema (DDL runs in its own autocommit transactions).
|
|
76
|
+
with session() as s:
|
|
77
|
+
for stmt in [*CONSTRAINTS, *INDEXES]:
|
|
78
|
+
s.run(stmt)
|
|
79
|
+
|
|
80
|
+
# Partition nodes by owning module; shared nodes have no _module.
|
|
81
|
+
by_module: Dict[str, List[NodeRow]] = {}
|
|
82
|
+
shared: List[NodeRow] = []
|
|
83
|
+
module_of: Dict[str, str] = {} # node value → owning module
|
|
84
|
+
for n in rows.nodes:
|
|
85
|
+
m = n.props.get("_module")
|
|
86
|
+
if isinstance(m, str):
|
|
87
|
+
by_module.setdefault(m, []).append(n)
|
|
88
|
+
module_of[n.value] = m
|
|
89
|
+
else:
|
|
90
|
+
shared.append(n)
|
|
91
|
+
|
|
92
|
+
# 2. diff content_hash.
|
|
93
|
+
db_hash: Dict[str, Optional[str]] = {}
|
|
94
|
+
with session() as s:
|
|
95
|
+
res = s.run("MATCH (m:PyModule) RETURN m.file_key AS k, m.content_hash AS h")
|
|
96
|
+
for rec in res:
|
|
97
|
+
db_hash[rec["k"]] = rec["h"]
|
|
98
|
+
changed = set()
|
|
99
|
+
for m, nodes in by_module.items():
|
|
100
|
+
row_hash = _hash_of(nodes, m)
|
|
101
|
+
if m not in db_hash or row_hash is None or row_hash != db_hash.get(m):
|
|
102
|
+
changed.add(m)
|
|
103
|
+
logger.info(
|
|
104
|
+
f"neo4j(bolt): {len(by_module)} modules ({len(changed)} changed), "
|
|
105
|
+
f"{len(shared)} shared nodes, {len(rows.edges)} edges"
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
# 3. shared nodes are always upserted (MERGE-only).
|
|
109
|
+
_upsert_nodes(session, neo4j, shared)
|
|
110
|
+
|
|
111
|
+
# 4. per changed module: purge owned edges + vanished decls, then upsert its nodes.
|
|
112
|
+
for m in changed:
|
|
113
|
+
nodes = by_module[m]
|
|
114
|
+
keys = [n.value for n in nodes]
|
|
115
|
+
with session() as s:
|
|
116
|
+
def _purge(tx, module=m, node_keys=keys):
|
|
117
|
+
tx.run("MATCH (x {_module: $m})-[r]->() DELETE r", m=module)
|
|
118
|
+
tx.run(
|
|
119
|
+
"MATCH (x {_module: $m}) "
|
|
120
|
+
"WHERE NOT coalesce(x.signature, x.id, x.file_key) IN $keys "
|
|
121
|
+
"DETACH DELETE x",
|
|
122
|
+
m=module,
|
|
123
|
+
keys=node_keys,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
s.execute_write(_purge)
|
|
127
|
+
_upsert_nodes(session, neo4j, nodes)
|
|
128
|
+
|
|
129
|
+
# 5. upsert edges owned by a changed module (owner = source node's module) or shared.
|
|
130
|
+
edges = [
|
|
131
|
+
e
|
|
132
|
+
for e in rows.edges
|
|
133
|
+
if module_of.get(e.from_ref.value) is None or module_of.get(e.from_ref.value) in changed
|
|
134
|
+
]
|
|
135
|
+
_upsert_edges(session, neo4j, edges)
|
|
136
|
+
|
|
137
|
+
# 6. orphan prune — only safe on a full run (a targeted run can't tell deleted from untargeted).
|
|
138
|
+
if full_run:
|
|
139
|
+
present = list(by_module.keys())
|
|
140
|
+
with session() as s:
|
|
141
|
+
res = s.run(
|
|
142
|
+
"MATCH (m:PyModule) WHERE NOT m.file_key IN $present "
|
|
143
|
+
f"OPTIONAL MATCH (m)-{DESCENDANTS}->(x) DETACH DELETE x, m "
|
|
144
|
+
"RETURN count(m) AS pruned",
|
|
145
|
+
present=present,
|
|
146
|
+
)
|
|
147
|
+
pruned = res.single()
|
|
148
|
+
pruned_count = pruned["pruned"] if pruned else 0
|
|
149
|
+
logger.info(f"neo4j(bolt): pruned {pruned_count} vanished module(s)")
|
|
150
|
+
else:
|
|
151
|
+
logger.info(
|
|
152
|
+
"neo4j(bolt): targeted run — orphan pruning skipped (deleted files not removed)"
|
|
153
|
+
)
|
|
154
|
+
finally:
|
|
155
|
+
driver.close()
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# ----------------------------------------------------------------------------------------------
|
|
159
|
+
# Batched upserts
|
|
160
|
+
# ----------------------------------------------------------------------------------------------
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _upsert_nodes(session, neo4j, nodes: List[NodeRow]) -> None:
|
|
164
|
+
groups: Dict[str, List[NodeRow]] = {}
|
|
165
|
+
for n in nodes:
|
|
166
|
+
groups.setdefault(f"{':'.join(n.labels)}|{n.key_prop}", []).append(n)
|
|
167
|
+
|
|
168
|
+
for group in groups.values():
|
|
169
|
+
labels = group[0].labels
|
|
170
|
+
key_prop = group[0].key_prop
|
|
171
|
+
set_labels = f", n:{':'.join(labels[1:])}" if len(labels) > 1 else ""
|
|
172
|
+
cypher = (
|
|
173
|
+
f"UNWIND $rows AS row MERGE (n:{labels[0]} {{{key_prop}: row.k}}) "
|
|
174
|
+
f"SET n += row.p{set_labels}"
|
|
175
|
+
)
|
|
176
|
+
for batch in chunk(group, BATCH):
|
|
177
|
+
payload = [{"k": n.value, "p": _to_params(n.props, neo4j)} for n in batch]
|
|
178
|
+
with session() as s:
|
|
179
|
+
s.run(cypher, rows=payload)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _upsert_edges(session, neo4j, edges: List[EdgeRow]) -> None:
|
|
183
|
+
groups: Dict[str, List[EdgeRow]] = {}
|
|
184
|
+
for e in edges:
|
|
185
|
+
key = f"{e.type}|{e.from_ref.label}.{e.from_ref.key_prop}|{e.to_ref.label}.{e.to_ref.key_prop}"
|
|
186
|
+
groups.setdefault(key, []).append(e)
|
|
187
|
+
|
|
188
|
+
for group in groups.values():
|
|
189
|
+
first = group[0]
|
|
190
|
+
from_ref, to_ref = first.from_ref, first.to_ref
|
|
191
|
+
cypher = (
|
|
192
|
+
f"UNWIND $rows AS row "
|
|
193
|
+
f"MATCH (a:{from_ref.label} {{{from_ref.key_prop}: row.f}}) "
|
|
194
|
+
f"MATCH (b:{to_ref.label} {{{to_ref.key_prop}: row.t}}) "
|
|
195
|
+
f"MERGE (a)-[r:{first.type}]->(b) SET r += row.p"
|
|
196
|
+
)
|
|
197
|
+
for batch in chunk(group, BATCH):
|
|
198
|
+
payload = [
|
|
199
|
+
{"f": e.from_ref.value, "t": e.to_ref.value, "p": _to_params(e.props, neo4j)}
|
|
200
|
+
for e in batch
|
|
201
|
+
]
|
|
202
|
+
with session() as s:
|
|
203
|
+
s.run(cypher, rows=payload)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ----------------------------------------------------------------------------------------------
|
|
207
|
+
# Helpers
|
|
208
|
+
# ----------------------------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _hash_of(nodes: List[NodeRow], file_key: str) -> Optional[str]:
|
|
212
|
+
for n in nodes:
|
|
213
|
+
if n.labels[0] == "PyModule" and n.value == file_key:
|
|
214
|
+
h = n.props.get("content_hash")
|
|
215
|
+
return h if isinstance(h, str) else None
|
|
216
|
+
return None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _to_params(props, neo4j) -> dict:
|
|
220
|
+
"""Map props to driver params. The Python driver already distinguishes int
|
|
221
|
+
from float, so unlike the JS driver no integer coercion is needed — this is a
|
|
222
|
+
straight passthrough kept symmetric with the snapshot writer's shape."""
|
|
223
|
+
return dict(props)
|