codeanalyzer-python 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeanalyzer/__main__.py +0 -5
- codeanalyzer/core.py +154 -19
- codeanalyzer/options/options.py +0 -1
- codeanalyzer/schema/py_schema.py +20 -0
- codeanalyzer/semantic_analysis/call_graph.py +266 -0
- codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +236 -69
- codeanalyzer/semantic_analysis/codeql/codeql_loader.py +32 -4
- codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +51 -31
- codeanalyzer/syntactic_analysis/symbol_table_builder.py +87 -4
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/METADATA +20 -42
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/RECORD +15 -15
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/WHEEL +1 -1
- codeanalyzer/semantic_analysis/wala/__init__.py +0 -15
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/entry_points.txt +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/licenses/NOTICE +0 -0
codeanalyzer/__main__.py
CHANGED
|
@@ -27,10 +27,6 @@ def main(
|
|
|
27
27
|
case_sensitive=False,
|
|
28
28
|
),
|
|
29
29
|
] = OutputFormat.JSON,
|
|
30
|
-
analysis_level: Annotated[
|
|
31
|
-
int,
|
|
32
|
-
typer.Option("-a", "--analysis-level", help="1: symbol table, 2: call graph."),
|
|
33
|
-
] = 1,
|
|
34
30
|
using_codeql: Annotated[
|
|
35
31
|
bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.")
|
|
36
32
|
] = False,
|
|
@@ -82,7 +78,6 @@ def main(
|
|
|
82
78
|
input=input,
|
|
83
79
|
output=output,
|
|
84
80
|
format=format,
|
|
85
|
-
analysis_level=analysis_level,
|
|
86
81
|
using_codeql=using_codeql,
|
|
87
82
|
using_ray=using_ray,
|
|
88
83
|
rebuild_analysis=rebuild_analysis,
|
codeanalyzer/core.py
CHANGED
|
@@ -9,7 +9,14 @@ from typing import Any, Dict, Optional, Union, List
|
|
|
9
9
|
import ray
|
|
10
10
|
from codeanalyzer.utils import logger
|
|
11
11
|
from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json
|
|
12
|
+
from codeanalyzer.schema.py_schema import PyCallEdge
|
|
13
|
+
from codeanalyzer.semantic_analysis.call_graph import (
|
|
14
|
+
jedi_call_graph_edges,
|
|
15
|
+
merge_edges,
|
|
16
|
+
resolve_unresolved_constructors,
|
|
17
|
+
)
|
|
12
18
|
from codeanalyzer.semantic_analysis.codeql import CodeQLLoader
|
|
19
|
+
from codeanalyzer.semantic_analysis.codeql.codeql_analysis import CodeQL
|
|
13
20
|
from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import CodeQLExceptions
|
|
14
21
|
from codeanalyzer.syntactic_analysis.exceptions import SymbolTableBuilderRayError
|
|
15
22
|
from codeanalyzer.syntactic_analysis.symbol_table_builder import SymbolTableBuilder
|
|
@@ -49,7 +56,6 @@ class Codeanalyzer:
|
|
|
49
56
|
|
|
50
57
|
def __init__(self, options: AnalysisOptions) -> None:
|
|
51
58
|
self.options = options
|
|
52
|
-
self.analysis_depth = options.analysis_level
|
|
53
59
|
self.project_dir = Path(options.input).resolve()
|
|
54
60
|
self.skip_tests = options.skip_tests
|
|
55
61
|
self.using_codeql = options.using_codeql
|
|
@@ -60,6 +66,7 @@ class Codeanalyzer:
|
|
|
60
66
|
self.clear_cache = options.clear_cache
|
|
61
67
|
self.db_path: Optional[Path] = None
|
|
62
68
|
self.codeql_bin: Optional[Path] = None
|
|
69
|
+
self.codeql_packs_dir: Optional[Path] = None
|
|
63
70
|
self.virtualenv: Optional[Path] = None
|
|
64
71
|
self.using_ray: bool = options.using_ray
|
|
65
72
|
self.file_name: Optional[Path] = options.file_name
|
|
@@ -292,6 +299,15 @@ class Codeanalyzer:
|
|
|
292
299
|
|
|
293
300
|
if self.using_codeql:
|
|
294
301
|
logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}")
|
|
302
|
+
|
|
303
|
+
# Resolve the CLI binary before anything else uses it: DB build
|
|
304
|
+
# below needs it, and so does every subsequent query run.
|
|
305
|
+
self.codeql_bin = self._ensure_codeql_bin()
|
|
306
|
+
# Download the standard query library pack (idempotent). The
|
|
307
|
+
# CLI install ships only the language extractors; the
|
|
308
|
+
# ``codeql/python-all`` library pack must be fetched separately.
|
|
309
|
+
self.codeql_packs_dir = self._ensure_codeql_packs(self.codeql_bin)
|
|
310
|
+
|
|
295
311
|
cache_root = self.cache_dir / "codeql"
|
|
296
312
|
cache_root.mkdir(parents=True, exist_ok=True)
|
|
297
313
|
self.db_path = cache_root / f"{self.project_dir.name}-db"
|
|
@@ -310,19 +326,6 @@ class Codeanalyzer:
|
|
|
310
326
|
if self.rebuild_analysis or not is_cache_valid():
|
|
311
327
|
logger.info("Creating new CodeQL database...")
|
|
312
328
|
|
|
313
|
-
codeql_in_path = shutil.which("codeql")
|
|
314
|
-
if codeql_in_path:
|
|
315
|
-
self.codeql_bin = Path(codeql_in_path)
|
|
316
|
-
else:
|
|
317
|
-
self.codeql_bin = CodeQLLoader.download_and_extract_codeql(
|
|
318
|
-
self.cache_dir / "codeql" / "bin"
|
|
319
|
-
)
|
|
320
|
-
|
|
321
|
-
if not shutil.which(str(self.codeql_bin)):
|
|
322
|
-
raise FileNotFoundError(
|
|
323
|
-
f"CodeQL binary not executable: {self.codeql_bin}"
|
|
324
|
-
)
|
|
325
|
-
|
|
326
329
|
cmd = [
|
|
327
330
|
str(self.codeql_bin),
|
|
328
331
|
"database",
|
|
@@ -375,8 +378,27 @@ class Codeanalyzer:
|
|
|
375
378
|
# Build the symbol table from the cached application if available (otherwise build a new one)
|
|
376
379
|
symbol_table = self._build_symbol_table(cached_pyapplication.symbol_table if cached_pyapplication else {})
|
|
377
380
|
|
|
381
|
+
# Build the call graph in four steps:
|
|
382
|
+
# 1. Run CodeQL (when enabled). Produces resolved edges with
|
|
383
|
+
# ``provenance=["codeql"]`` and augments ``PyCallsite``s
|
|
384
|
+
# in-place — filling ``callee_signature`` for sites Jedi
|
|
385
|
+
# couldn't resolve.
|
|
386
|
+
# 2. Heuristic fallback for constructor calls neither Jedi nor
|
|
387
|
+
# CodeQL could resolve (commonly classes nested inside
|
|
388
|
+
# functions). Walks the symbol table by class short-name +
|
|
389
|
+
# scope and writes ``<class>.__init__`` into the site.
|
|
390
|
+
# 3. Derive Jedi edges from the now-fully-augmented symbol
|
|
391
|
+
# table — these reflect every resolution the symbol table
|
|
392
|
+
# contains, regardless of which pass put it there.
|
|
393
|
+
# 4. Merge with CodeQL edges; provenance unions for edges both
|
|
394
|
+
# backends saw.
|
|
395
|
+
codeql_edges = self._get_call_graph(symbol_table, augment_sites=True)
|
|
396
|
+
resolve_unresolved_constructors(symbol_table)
|
|
397
|
+
jedi_edges = jedi_call_graph_edges(symbol_table)
|
|
398
|
+
call_graph = merge_edges(jedi_edges, codeql_edges)
|
|
399
|
+
|
|
378
400
|
# Recreate pyapplication
|
|
379
|
-
app = PyApplication.builder().symbol_table(symbol_table).build()
|
|
401
|
+
app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
|
|
380
402
|
|
|
381
403
|
# Save to cache
|
|
382
404
|
self._save_analysis_cache(app, cache_file)
|
|
@@ -579,7 +601,120 @@ class Codeanalyzer:
|
|
|
579
601
|
logger.info("✅ Symbol table generation complete.")
|
|
580
602
|
return symbol_table
|
|
581
603
|
|
|
582
|
-
def
|
|
583
|
-
"""
|
|
584
|
-
|
|
585
|
-
|
|
604
|
+
def _ensure_codeql_packs(self, codeql_bin: Path) -> Path:
    """Prepare the helper qlpack that pulls in ``codeql/python-all``.

    A small pack directory is kept at ``<cache_dir>/codeql/qlpack/``. Its
    ``qlpack.yml`` declares a dependency on ``codeql/python-all`` and is
    written only when missing. If ``codeql-pack.lock.yml`` is already
    present in that directory the dependencies are treated as installed
    and nothing is executed; otherwise ``codeql pack install`` is run on
    the directory.

    Args:
        codeql_bin: Path to the CodeQL CLI used to run ``pack install``.

    Returns:
        The pack directory.

    Raises:
        CodeQLExceptions.CodeQLDatabaseBuildException: If
            ``codeql pack install`` exits with a non-zero status; the
            message carries the command's decoded stderr.
    """
    deps_pack = self.cache_dir / "codeql" / "qlpack"
    deps_pack.mkdir(parents=True, exist_ok=True)

    manifest = deps_pack / "qlpack.yml"
    if not manifest.exists():
        manifest_lines = (
            "name: codeanalyzer-deps",
            "version: 1.0.0",
            "dependencies:",
            ' codeql/python-all: "*"',
        )
        manifest.write_text("\n".join(manifest_lines) + "\n")

    # The lock file is used as the "already installed" marker.
    if (deps_pack / "codeql-pack.lock.yml").exists():
        logger.debug(f"CodeQL pack dependencies already installed in {deps_pack}")
        return deps_pack

    logger.info(f"Installing CodeQL pack dependencies in {deps_pack}.")
    install = subprocess.run(
        [str(codeql_bin), "pack", "install", str(deps_pack)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if install.returncode == 0:
        return deps_pack

    stderr_text = (install.stderr or b"").decode(errors="replace")
    raise CodeQLExceptions.CodeQLDatabaseBuildException(
        f"Failed to install CodeQL pack dependencies:\n{stderr_text}"
    )
|
|
650
|
+
|
|
651
|
+
def _ensure_codeql_bin(self) -> Path:
    """Locate (or download) the CodeQL CLI binary.

    Resolution order:

    1. An executable file named ``codeql`` anywhere under
       ``<cache_dir>/codeql/bin/``.
    2. ``codeql`` found on ``PATH`` via ``shutil.which``.
    3. A fresh download into ``<cache_dir>/codeql/bin/``.

    The cache is consulted before ``PATH`` so that a previously
    downloaded CLI wins over a system-wide install.

    Returns:
        Path to the CodeQL CLI executable.

    Raises:
        FileNotFoundError: If the downloaded binary is missing or not
            executable.
    """
    bin_root = self.cache_dir / "codeql" / "bin"
    bin_root.mkdir(parents=True, exist_ok=True)

    # Executability is part of the search itself: a non-executable file
    # that merely happens to be named ``codeql`` must not hide a usable
    # binary elsewhere in the cache. Sorting makes the pick stable, since
    # ``rglob`` yields entries in no guaranteed order.
    cached = next(
        (
            candidate
            for candidate in sorted(bin_root.rglob("codeql"))
            if candidate.is_file() and os.access(candidate, os.X_OK)
        ),
        None,
    )
    if cached is not None:
        logger.debug(f"Reusing cached CodeQL CLI at {cached}")
        return cached.resolve()

    on_path = shutil.which("codeql")
    if on_path:
        logger.debug(f"Using CodeQL CLI from PATH at {on_path}")
        return Path(on_path)

    logger.info(f"CodeQL CLI not found; downloading into {bin_root}.")
    downloaded = CodeQLLoader.download_and_extract_codeql(bin_root)
    if not downloaded.exists() or not os.access(downloaded, os.X_OK):
        raise FileNotFoundError(
            f"CodeQL binary not executable after download: {downloaded}"
        )
    return downloaded
|
|
687
|
+
|
|
688
|
+
def _get_call_graph(
    self,
    symbol_table: Dict[str, PyModule],
    augment_sites: bool = False,
) -> List[PyCallEdge]:
    """Collect call edges from CodeQL, optionally augmenting call sites.

    Args:
        symbol_table: Symbol table handed to the CodeQL analysis. When
            ``augment_sites`` is true it is also passed to
            ``CodeQL.augment_call_sites``, which is expected to update
            it in place.
        augment_sites: Whether to run the call-site augmentation pass
            after the edges have been built.

    Returns:
        The edges produced by ``CodeQL.build_call_graph_edges``. An empty
        list is returned when CodeQL is disabled, when no database path
        is set, or when any step of the CodeQL analysis raises (the
        failure is logged as a warning, not propagated).
    """
    codeql_ready = self.using_codeql and self.db_path is not None
    if not codeql_ready:
        return []

    try:
        analysis = CodeQL(
            self.project_dir,
            self.db_path,
            codeql_bin=self.codeql_bin,
            codeql_packs_dir=self.codeql_packs_dir,
        )
        # Edges are built first; augmentation reuses the same instance.
        codeql_edges = analysis.build_call_graph_edges(symbol_table)
        if augment_sites:
            analysis.augment_call_sites(symbol_table)
    except Exception as exc:
        # Best-effort: a CodeQL failure must not abort the whole analysis.
        logger.warning(f"CodeQL call-graph extraction failed: {exc}")
        return []
    return codeql_edges
|
codeanalyzer/options/options.py
CHANGED
codeanalyzer/schema/py_schema.py
CHANGED
|
@@ -339,9 +339,29 @@ class PyModule(BaseModel):
|
|
|
339
339
|
file_size: Optional[int] = None
|
|
340
340
|
|
|
341
341
|
|
|
342
|
+
@builder
@msgpk
class PyCallEdge(BaseModel):
    """Identity-only call-graph edge with weight.

    Mirrors Java's ``CallDependency``. ``source`` and ``target`` are
    ``PyCallable.signature`` strings — nodes of the graph are the existing
    ``PyCallable`` entries in the symbol table, not a separate vertex type.
    Rich per-call metadata (receiver, arguments, location, ...) lives on
    ``PyCallsite`` inside the source ``PyCallable.call_sites``.
    """

    source: str  # caller's PyCallable.signature
    target: str  # callee's PyCallable.signature
    # Edge kind; "CALL_DEP" is the only value the schema accepts.
    type: Literal["CALL_DEP"] = "CALL_DEP"
    # Number of call sites folded into this (source, target) pair.
    weight: int = 1
    # Names of the analysis backends that reported this edge.
    provenance: List[Literal["jedi", "codeql", "joern"]] = []
|
|
359
|
+
|
|
360
|
+
|
|
342
361
|
@builder
@msgpk
class PyApplication(BaseModel):
    """Represents a Python application."""

    # Modules of the application; presumably keyed by module path — TODO confirm.
    symbol_table: Dict[str, PyModule]
    # Call-graph edges between callables; empty when no call graph was built.
    call_graph: List[PyCallEdge] = []
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
# Copyright IBM Corporation 2025
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
################################################################################
|
|
16
|
+
|
|
17
|
+
"""Adapters between the persisted call-graph schema and ``networkx``.
|
|
18
|
+
|
|
19
|
+
The schema persists the call graph as ``List[PyCallEdge]`` with signatures
|
|
20
|
+
referencing ``PyCallable`` entries already in the symbol table. These
|
|
21
|
+
helpers rehydrate it into a ``networkx.DiGraph`` for in-process queries
|
|
22
|
+
(paths, callers, callees) and reduce a built ``DiGraph`` back to the
|
|
23
|
+
serializable edge list.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from collections import Counter
|
|
27
|
+
from typing import Dict, Iterator, List, Tuple
|
|
28
|
+
|
|
29
|
+
import networkx as nx
|
|
30
|
+
|
|
31
|
+
from codeanalyzer.schema.py_schema import (
|
|
32
|
+
PyApplication,
|
|
33
|
+
PyCallable,
|
|
34
|
+
PyCallEdge,
|
|
35
|
+
PyClass,
|
|
36
|
+
PyModule,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _walk_class_callables(cls: PyClass) -> Iterator[PyCallable]:
    """Yield every callable reachable from ``cls``.

    Each method's subtree is visited first, followed by the callables of
    each inner class, recursively.
    """
    for member in cls.methods.values():
        for found in _walk_callable(member):
            yield found
    for nested_cls in cls.inner_classes.values():
        for found in _walk_class_callables(nested_cls):
            yield found
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _walk_callable(c: PyCallable) -> Iterator[PyCallable]:
    """Yield ``c`` and then everything nested inside it.

    After ``c`` itself come its inner callables (recursively), then the
    callables of the classes defined inside it.
    """
    yield c
    for nested_fn in c.inner_callables.values():
        for found in _walk_callable(nested_fn):
            yield found
    for nested_cls in c.inner_classes.values():
        for found in _walk_class_callables(nested_cls):
            yield found
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _walk_module_callables(module: PyModule) -> Iterator[PyCallable]:
    """Yield every callable in ``module``: functions first, then classes."""
    for top_level_fn in module.functions.values():
        for found in _walk_callable(top_level_fn):
            yield found
    for top_level_cls in module.classes.values():
        for found in _walk_class_callables(top_level_cls):
            yield found
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def iter_callables_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyCallable]:
    """Yield every ``PyCallable`` in a symbol table, recursively.

    Modules are visited in the mapping's iteration order.
    """
    for mod in symbol_table.values():
        for found in _walk_module_callables(mod):
            yield found
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _walk_classes_in_class(cls: PyClass) -> Iterator[PyClass]:
    """Yield ``cls`` and every class nested beneath it.

    Nested classes are found both directly (``inner_classes``) and
    inside the callable subtree of each method.
    """
    yield cls
    for nested_cls in cls.inner_classes.values():
        for found in _walk_classes_in_class(nested_cls):
            yield found
    # A method may define its own helper classes (e.g. a factory method),
    # so each method's callable subtree is searched as well.
    for member in cls.methods.values():
        for found in _walk_classes_in_callable(member):
            yield found
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _walk_classes_in_callable(c: PyCallable) -> Iterator[PyClass]:
    """Yield every class defined inside ``c``.

    Classes declared directly in ``c`` (with their own nested classes)
    come first, then classes found inside ``c``'s inner callables.
    """
    for nested_cls in c.inner_classes.values():
        for found in _walk_classes_in_class(nested_cls):
            yield found
    for nested_fn in c.inner_callables.values():
        for found in _walk_classes_in_callable(nested_fn):
            yield found
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def iter_classes_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyClass]:
    """Yield every ``PyClass`` in a symbol table, recursively.

    This covers module-level classes, inner classes, classes nested in
    functions, and classes nested in class methods. Within each module,
    the classes reachable from ``module.classes`` are yielded before
    those reachable from ``module.functions``.
    """
    for mod in symbol_table.values():
        for top_level_cls in mod.classes.values():
            for found in _walk_classes_in_class(top_level_cls):
                yield found
        for top_level_fn in mod.functions.values():
            for found in _walk_classes_in_callable(top_level_fn):
                yield found
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def iter_callables(app: PyApplication) -> Iterator[PyCallable]:
    """Yield every ``PyCallable`` in the application's symbol table."""
    for found in iter_callables_in_symbol_table(app.symbol_table):
        yield found
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def callables_by_signature(app: PyApplication) -> Dict[str, PyCallable]:
    """Index every callable in ``app`` by its ``signature``.

    If two callables share a signature, the one visited later replaces
    the earlier entry.
    """
    index: Dict[str, PyCallable] = {}
    for fn in iter_callables(app):
        index[fn.signature] = fn
    return index
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def to_digraph(app: PyApplication) -> nx.DiGraph:
    """Rehydrate ``app`` into a ``networkx.DiGraph``.

    Every callable in the symbol table becomes a node keyed by its
    signature, with attributes ``callable=<PyCallable>`` and
    ``ghost=False``. An edge endpoint that has no such node is added as
    a ghost node (``callable=None``, ``ghost=True``) so the edge is still
    kept.

    Each edge carries ``type``, ``weight`` and ``provenance`` attributes;
    ``provenance`` is a fresh list copied from the edge.
    """
    graph = nx.DiGraph()

    for signature, fn in callables_by_signature(app).items():
        graph.add_node(signature, callable=fn, ghost=False)

    for edge in app.call_graph:
        caller_sig, callee_sig = edge.source, edge.target
        for endpoint in (caller_sig, callee_sig):
            if endpoint not in graph:
                graph.add_node(endpoint, callable=None, ghost=True)
        edge_attrs = {
            "type": edge.type,
            "weight": edge.weight,
            "provenance": list(edge.provenance),
        }
        graph.add_edge(caller_sig, callee_sig, **edge_attrs)

    return graph
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def from_digraph(g: nx.DiGraph) -> list:
    """Convert a ``DiGraph`` back into a list of ``PyCallEdge`` objects.

    Only edges are converted; nodes are ignored. An edge attribute that
    is missing falls back to ``"CALL_DEP"`` for ``type``, ``1`` for
    ``weight`` and an empty list for ``provenance``. ``weight`` is
    coerced with ``int``.
    """
    return [
        PyCallEdge(
            source=caller_sig,
            target=callee_sig,
            type=attrs.get("type", "CALL_DEP"),
            weight=int(attrs.get("weight", 1)),
            provenance=list(attrs.get("provenance", [])),
        )
        for caller_sig, callee_sig, attrs in g.edges(data=True)
    ]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def jedi_call_graph_edges(
    symbol_table: Dict[str, PyModule],
) -> List[PyCallEdge]:
    """Build call edges from the ``call_sites`` stored on each callable.

    Every call site with a truthy ``callee_signature`` contributes to the
    edge ``caller.signature -> site.callee_signature``. Sites whose
    ``callee_signature`` is ``None`` or empty are skipped.

    Sites sharing the same ``(source, target)`` pair collapse into one
    edge whose ``weight`` is the number of such sites. Every edge is
    created with ``provenance=["jedi"]``.
    """
    pair_counts: Counter = Counter(
        (caller.signature, site.callee_signature)
        for caller in iter_callables_in_symbol_table(symbol_table)
        for site in caller.call_sites
        if site.callee_signature
    )

    edges: List[PyCallEdge] = []
    for (caller_sig, callee_sig), hits in pair_counts.items():
        edges.append(
            PyCallEdge(
                source=caller_sig,
                target=callee_sig,
                weight=hits,
                provenance=["jedi"],
            )
        )
    return edges
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def resolve_unresolved_constructors(symbol_table: Dict[str, PyModule]) -> int:
    """Fill in ``callee_signature`` for unresolved constructor call sites.

    A site is considered when ``is_constructor_call`` is true and
    ``callee_signature`` is still empty. Candidate classes are looked up
    by ``site.method_name`` in a ``name -> [PyClass]`` index built from
    every class in the symbol table. Among the candidates, the class
    whose enclosing scope (its signature minus the last dotted segment)
    is the longest dotted ancestor of the caller's signature wins — an
    approximation of Python's scoping for nested classes. The site is
    then updated in place with ``f"{class.signature}.__init__"``.

    Sites with no candidate, or whose candidates all sit outside the
    caller's scope chain, are left untouched.

    Args:
        symbol_table: Symbol table whose call sites are updated in place.

    Returns:
        The number of call sites that were resolved.
    """
    by_name: Dict[str, List[PyClass]] = {}
    for cls in iter_classes_in_symbol_table(symbol_table):
        by_name.setdefault(cls.name, []).append(cls)

    def scope_score(candidate: PyClass, caller_sig: str) -> int:
        """Rank ``candidate`` by how closely its scope encloses the caller."""
        cls_sig = candidate.signature
        parent_sig = cls_sig.rsplit(".", 1)[0] if "." in cls_sig else ""
        if not parent_sig:
            # Signature without any enclosing scope: base score, so it
            # still beats "not in scope".
            return 0
        # The scope must match on a dotted-segment boundary; a raw string
        # prefix would wrongly treat ``pkg.mod.foo`` as enclosing
        # ``pkg.mod.foobar``.
        if caller_sig == parent_sig or caller_sig.startswith(parent_sig + "."):
            return len(parent_sig)
        return -1

    resolved = 0
    for caller in iter_callables_in_symbol_table(symbol_table):
        caller_sig = caller.signature
        for site in caller.call_sites:
            if not site.is_constructor_call or site.callee_signature:
                continue
            candidates = by_name.get(site.method_name)
            if not candidates:
                continue

            best = max(candidates, key=lambda c: scope_score(c, caller_sig))
            if scope_score(best, caller_sig) < 0:
                # No candidate is reachable from the caller's scope.
                continue

            site.callee_signature = f"{best.signature}.__init__"
            resolved += 1

    return resolved
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def merge_edges(*edge_lists: list) -> list:
    """Combine several lists of ``PyCallEdge`` into one.

    Edges are keyed by ``(source, target)``. The first edge seen for a
    key is copied with ``model_copy()``. Each later edge with the same
    key adds its ``weight`` to that copy and replaces the copy's
    ``provenance`` with the sorted union of both provenance lists. The
    input edges are never reassigned.

    Returns:
        The merged edges, in order of first appearance of each key.
    """
    merged: Dict[Tuple[str, str], PyCallEdge] = {}
    for batch in edge_lists:
        for edge in batch:
            pair = (edge.source, edge.target)
            existing = merged.get(pair)
            if existing is None:
                merged[pair] = edge.model_copy()
                continue
            existing.weight += edge.weight
            existing.provenance = sorted({*existing.provenance, *edge.provenance})
    return list(merged.values())
|