codeanalyzer-python 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeanalyzer/__main__.py CHANGED
@@ -9,25 +9,75 @@ from codeanalyzer.config import OutputFormat
9
9
  from codeanalyzer.schema import model_dump_json
10
10
  from codeanalyzer.options import AnalysisOptions
11
11
 
12
+
12
13
  def main(
13
- input: Annotated[Path, typer.Option("-i", "--input", help="Path to the project root directory.")],
14
- output: Optional[Path] = typer.Option(None, "-o", "--output"),
15
- format: OutputFormat = typer.Option(OutputFormat.JSON, "-f", "--format"),
16
- analysis_level: int = typer.Option(1, "-a", "--analysis-level"),
17
- using_codeql: bool = typer.Option(False, "--codeql/--no-codeql"),
18
- using_ray: bool = typer.Option(False, "--ray/--no-ray"),
19
- rebuild_analysis: bool = typer.Option(False, "--eager/--lazy"),
20
- skip_tests: bool = typer.Option(True, "--skip-tests/--include-tests"),
21
- file_name: Optional[Path] = typer.Option(None, "--file-name"),
22
- cache_dir: Optional[Path] = typer.Option(None, "-c", "--cache-dir"),
23
- clear_cache: bool = typer.Option(False, "--clear-cache/--keep-cache"),
24
- verbosity: int = typer.Option(0, "-v", count=True),
14
+ input: Annotated[
15
+ Path, typer.Option("-i", "--input", help="Path to the project root directory.")
16
+ ],
17
+ output: Annotated[
18
+ Optional[Path],
19
+ typer.Option("-o", "--output", help="Output directory for artifacts."),
20
+ ] = None,
21
+ format: Annotated[
22
+ OutputFormat,
23
+ typer.Option(
24
+ "-f",
25
+ "--format",
26
+ help="Output format: json or msgpack.",
27
+ case_sensitive=False,
28
+ ),
29
+ ] = OutputFormat.JSON,
30
+ using_codeql: Annotated[
31
+ bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.")
32
+ ] = False,
33
+ using_ray: Annotated[
34
+ bool,
35
+ typer.Option("--ray/--no-ray", help="Enable Ray for distributed analysis."),
36
+ ] = False,
37
+ rebuild_analysis: Annotated[
38
+ bool,
39
+ typer.Option(
40
+ "--eager/--lazy",
41
+ help="Enable eager or lazy analysis. Defaults to lazy.",
42
+ ),
43
+ ] = False,
44
+ skip_tests: Annotated[
45
+ bool,
46
+ typer.Option(
47
+ "--skip-tests/--include-tests",
48
+ help="Skip test files in analysis.",
49
+ ),
50
+ ] = True,
51
+ file_name: Annotated[
52
+ Optional[Path],
53
+ typer.Option(
54
+ "--file-name",
55
+ help="Analyze only the specified file (relative to input directory).",
56
+ ),
57
+ ] = None,
58
+ cache_dir: Annotated[
59
+ Optional[Path],
60
+ typer.Option(
61
+ "-c",
62
+ "--cache-dir",
63
+ help="Directory to store analysis cache. Defaults to '.codeanalyzer' in the input directory.",
64
+ ),
65
+ ] = None,
66
+ clear_cache: Annotated[
67
+ bool,
68
+ typer.Option(
69
+ "--clear-cache/--keep-cache",
70
+ help="Clear cache after analysis. By default, cache is retained.",
71
+ ),
72
+ ] = False,
73
+ verbosity: Annotated[
74
+ int, typer.Option("-v", count=True, help="Increase verbosity: -v, -vv, -vvv")
75
+ ] = 0,
25
76
  ):
26
77
  options = AnalysisOptions(
27
78
  input=input,
28
79
  output=output,
29
80
  format=format,
30
- analysis_level=analysis_level,
31
81
  using_codeql=using_codeql,
32
82
  using_ray=using_ray,
33
83
  rebuild_analysis=rebuild_analysis,
@@ -46,13 +96,17 @@ def main(
46
96
  if options.file_name is not None:
47
97
  full_file_path = options.input / options.file_name
48
98
  if not full_file_path.exists():
49
- logger.error(f"Specified file '{options.file_name}' does not exist in '{options.input}'.")
99
+ logger.error(
100
+ f"Specified file '{options.file_name}' does not exist in '{options.input}'."
101
+ )
50
102
  raise typer.Exit(code=1)
51
103
  if not full_file_path.is_file():
52
104
  logger.error(f"Specified path '{options.file_name}' is not a file.")
53
105
  raise typer.Exit(code=1)
54
- if not str(options.file_name).endswith('.py'):
55
- logger.error(f"Specified file '{options.file_name}' is not a Python file (.py).")
106
+ if not str(options.file_name).endswith(".py"):
107
+ logger.error(
108
+ f"Specified file '{options.file_name}' is not a Python file (.py)."
109
+ )
56
110
  raise typer.Exit(code=1)
57
111
 
58
112
  with Codeanalyzer(options) as analyzer:
@@ -85,6 +139,7 @@ def _write_output(artifacts, output_dir: Path, format: OutputFormat):
85
139
  f"Compression ratio: {artifacts.get_compression_ratio():.1%} of JSON size"
86
140
  )
87
141
 
142
+
88
143
  app = typer.Typer(
89
144
  callback=main,
90
145
  name="codeanalyzer",
codeanalyzer/core.py CHANGED
@@ -9,7 +9,14 @@ from typing import Any, Dict, Optional, Union, List
9
9
  import ray
10
10
  from codeanalyzer.utils import logger
11
11
  from codeanalyzer.schema import PyApplication, PyModule, model_dump_json, model_validate_json
12
+ from codeanalyzer.schema.py_schema import PyCallEdge
13
+ from codeanalyzer.semantic_analysis.call_graph import (
14
+ jedi_call_graph_edges,
15
+ merge_edges,
16
+ resolve_unresolved_constructors,
17
+ )
12
18
  from codeanalyzer.semantic_analysis.codeql import CodeQLLoader
19
+ from codeanalyzer.semantic_analysis.codeql.codeql_analysis import CodeQL
13
20
  from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import CodeQLExceptions
14
21
  from codeanalyzer.syntactic_analysis.exceptions import SymbolTableBuilderRayError
15
22
  from codeanalyzer.syntactic_analysis.symbol_table_builder import SymbolTableBuilder
@@ -49,7 +56,6 @@ class Codeanalyzer:
49
56
 
50
57
  def __init__(self, options: AnalysisOptions) -> None:
51
58
  self.options = options
52
- self.analysis_depth = options.analysis_level
53
59
  self.project_dir = Path(options.input).resolve()
54
60
  self.skip_tests = options.skip_tests
55
61
  self.using_codeql = options.using_codeql
@@ -60,6 +66,7 @@ class Codeanalyzer:
60
66
  self.clear_cache = options.clear_cache
61
67
  self.db_path: Optional[Path] = None
62
68
  self.codeql_bin: Optional[Path] = None
69
+ self.codeql_packs_dir: Optional[Path] = None
63
70
  self.virtualenv: Optional[Path] = None
64
71
  self.using_ray: bool = options.using_ray
65
72
  self.file_name: Optional[Path] = options.file_name
@@ -292,6 +299,15 @@ class Codeanalyzer:
292
299
 
293
300
  if self.using_codeql:
294
301
  logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}")
302
+
303
+ # Resolve the CLI binary before anything else uses it: DB build
304
+ # below needs it, and so does every subsequent query run.
305
+ self.codeql_bin = self._ensure_codeql_bin()
306
+ # Download the standard query library pack (idempotent). The
307
+ # CLI install ships only the language extractors; the
308
+ # ``codeql/python-all`` library pack must be fetched separately.
309
+ self.codeql_packs_dir = self._ensure_codeql_packs(self.codeql_bin)
310
+
295
311
  cache_root = self.cache_dir / "codeql"
296
312
  cache_root.mkdir(parents=True, exist_ok=True)
297
313
  self.db_path = cache_root / f"{self.project_dir.name}-db"
@@ -310,19 +326,6 @@ class Codeanalyzer:
310
326
  if self.rebuild_analysis or not is_cache_valid():
311
327
  logger.info("Creating new CodeQL database...")
312
328
 
313
- codeql_in_path = shutil.which("codeql")
314
- if codeql_in_path:
315
- self.codeql_bin = Path(codeql_in_path)
316
- else:
317
- self.codeql_bin = CodeQLLoader.download_and_extract_codeql(
318
- self.cache_dir / "codeql" / "bin"
319
- )
320
-
321
- if not shutil.which(str(self.codeql_bin)):
322
- raise FileNotFoundError(
323
- f"CodeQL binary not executable: {self.codeql_bin}"
324
- )
325
-
326
329
  cmd = [
327
330
  str(self.codeql_bin),
328
331
  "database",
@@ -375,8 +378,27 @@ class Codeanalyzer:
375
378
  # Build symbol table from cached application if available (if not available, then build a new one)
376
379
  symbol_table = self._build_symbol_table(cached_pyapplication.symbol_table if cached_pyapplication else {})
377
380
 
381
+ # Build the call graph in four steps:
382
+ # 1. Run CodeQL (when enabled). Produces resolved edges with
383
+ # ``provenance=["codeql"]`` and augments ``PyCallsite``s
384
+ # in-place — filling ``callee_signature`` for sites Jedi
385
+ # couldn't resolve.
386
+ # 2. Heuristic fallback for constructor calls neither Jedi nor
387
+ # CodeQL could resolve (commonly classes nested inside
388
+ # functions). Walks the symbol table by class short-name +
389
+ # scope and writes ``<class>.__init__`` into the site.
390
+ # 3. Derive Jedi edges from the now-fully-augmented symbol
391
+ # table — these reflect every resolution the symbol table
392
+ # contains, regardless of which pass put it there.
393
+ # 4. Merge with CodeQL edges; provenance unions for edges both
394
+ # backends saw.
395
+ codeql_edges = self._get_call_graph(symbol_table, augment_sites=True)
396
+ resolve_unresolved_constructors(symbol_table)
397
+ jedi_edges = jedi_call_graph_edges(symbol_table)
398
+ call_graph = merge_edges(jedi_edges, codeql_edges)
399
+
378
400
  # Recreate pyapplication
379
- app = PyApplication.builder().symbol_table(symbol_table).build()
401
+ app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
380
402
 
381
403
  # Save to cache
382
404
  self._save_analysis_cache(app, cache_file)
@@ -579,7 +601,120 @@ class Codeanalyzer:
579
601
  logger.info("✅ Symbol table generation complete.")
580
602
  return symbol_table
581
603
 
582
- def _get_call_graph(self) -> Dict[str, Any]:
583
- """Retrieve call graph from CodeQL database."""
584
- logger.warning("Call graph extraction not yet implemented.")
585
- return {}
604
def _ensure_codeql_packs(self, codeql_bin: Path) -> Path:
    """Prepare a qlpack directory that declares ``codeql/python-all``.

    The pack lives at ``<cache_dir>/codeql/qlpack/``. A minimal
    ``qlpack.yml`` is written there when one does not exist yet, and
    ``codeql pack install`` is then run against the directory so the
    CLI can fetch the declared dependency.

    The presence of ``codeql-pack.lock.yml`` in the pack directory is
    treated as proof that a previous install succeeded; in that case no
    subprocess is started.

    Args:
        codeql_bin: Path to the CodeQL CLI executable to invoke.

    Returns:
        The pack directory.

    Raises:
        CodeQLExceptions.CodeQLDatabaseBuildException: If
            ``codeql pack install`` exits with a non-zero status. The
            captured stderr is included in the message.
    """
    qlpack_root = self.cache_dir / "codeql" / "qlpack"
    qlpack_root.mkdir(parents=True, exist_ok=True)
    manifest = qlpack_root / "qlpack.yml"

    if not manifest.exists():
        manifest_lines = (
            "name: codeanalyzer-deps\n",
            "version: 1.0.0\n",
            "dependencies:\n",
            ' codeql/python-all: "*"\n',
        )
        manifest.write_text("".join(manifest_lines))

    # A lock file means the dependencies were resolved on an earlier run.
    if (qlpack_root / "codeql-pack.lock.yml").exists():
        logger.debug(f"CodeQL pack dependencies already installed in {qlpack_root}")
        return qlpack_root

    logger.info(f"Installing CodeQL pack dependencies in {qlpack_root}.")
    install_cmd = [str(codeql_bin), "pack", "install", str(qlpack_root)]
    installer = subprocess.Popen(
        install_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    _stdout, stderr_bytes = installer.communicate()
    if installer.returncode == 0:
        return qlpack_root

    stderr_text = (stderr_bytes or b"").decode(errors="replace")
    raise CodeQLExceptions.CodeQLDatabaseBuildException(
        f"Failed to install CodeQL pack dependencies:\n{stderr_text}"
    )
650
+
651
def _ensure_codeql_bin(self) -> Path:
    """Find the CodeQL CLI executable, downloading it as a last resort.

    Lookup order:
      1. The first regular file named ``codeql`` found anywhere under
         ``<cache_dir>/codeql/bin/``, provided it is executable.
      2. Whatever ``shutil.which("codeql")`` finds on ``PATH``.
      3. A fresh download into ``<cache_dir>/codeql/bin/`` via
         ``CodeQLLoader.download_and_extract_codeql``.

    The cache directory is consulted before ``PATH`` so that a copy
    installed by an earlier run takes precedence over a system install.

    Returns:
        Path to the CodeQL executable.

    Raises:
        FileNotFoundError: If the downloaded binary is missing or not
            executable.
    """
    install_dir = self.cache_dir / "codeql" / "bin"
    install_dir.mkdir(parents=True, exist_ok=True)

    # Only the first matching file is considered; if it is not
    # executable we fall through to PATH rather than keep searching.
    cached = None
    for candidate in install_dir.rglob("codeql"):
        if candidate.is_file():
            cached = candidate
            break
    if cached is not None and os.access(cached, os.X_OK):
        logger.debug(f"Reusing cached CodeQL CLI at {cached}")
        return cached.resolve()

    system_codeql = shutil.which("codeql")
    if system_codeql:
        logger.debug(f"Using CodeQL CLI from PATH at {system_codeql}")
        return Path(system_codeql)

    logger.info(f"CodeQL CLI not found; downloading into {install_dir}.")
    fetched = CodeQLLoader.download_and_extract_codeql(install_dir)
    usable = fetched.exists() and os.access(fetched, os.X_OK)
    if not usable:
        raise FileNotFoundError(
            f"CodeQL binary not executable after download: {fetched}"
        )
    return fetched
687
+
688
def _get_call_graph(
    self,
    symbol_table: Dict[str, PyModule],
    augment_sites: bool = False,
) -> List[PyCallEdge]:
    """Collect call edges from CodeQL, optionally augmenting call sites.

    Args:
        symbol_table: Module symbol table handed to the CodeQL wrapper.
        augment_sites: When True, ``CodeQL.augment_call_sites`` is also
            invoked on ``symbol_table`` after the edges are built
            (presumably mutating it in place to backfill unresolved
            ``callee_signature`` values — confirm against ``CodeQL``).

    Returns:
        The edges produced by ``CodeQL.build_call_graph_edges``. An
        empty list is returned when CodeQL is disabled, when no database
        path is set, or when any step of the extraction raises.
    """
    codeql_ready = self.using_codeql and self.db_path is not None
    if not codeql_ready:
        return []

    try:
        analyzer = CodeQL(
            self.project_dir,
            self.db_path,
            codeql_bin=self.codeql_bin,
            codeql_packs_dir=self.codeql_packs_dir,
        )
        resolved_edges = analyzer.build_call_graph_edges(symbol_table)
        if augment_sites:
            analyzer.augment_call_sites(symbol_table)
    except Exception as exc:
        # Best-effort: a CodeQL failure must not abort the analysis, so
        # the error is logged and no CodeQL edges are contributed.
        logger.warning(f"CodeQL call-graph extraction failed: {exc}")
        return []
    return resolved_edges
@@ -14,7 +14,6 @@ class AnalysisOptions:
14
14
  input: Path
15
15
  output: Optional[Path] = None
16
16
  format: OutputFormat = OutputFormat.JSON
17
- analysis_level: int = 1
18
17
  using_codeql: bool = False
19
18
  using_ray: bool = False
20
19
  rebuild_analysis: bool = False
@@ -339,9 +339,29 @@ class PyModule(BaseModel):
339
339
  file_size: Optional[int] = None
340
340
 
341
341
 
342
@builder
@msgpk
class PyCallEdge(BaseModel):
    """Identity-only call-graph edge with weight.

    Mirrors Java's ``CallDependency``. ``source`` and ``target`` are
    ``PyCallable.signature`` strings — nodes of the graph are the existing
    ``PyCallable`` entries in the symbol table, not a separate vertex type.
    Rich per-call metadata (receiver, arguments, location, ...) lives on
    ``PyCallsite`` inside the source ``PyCallable.call_sites``.
    """

    source: str  # caller's PyCallable.signature
    target: str  # callee's PyCallable.signature
    # Edge kind discriminator; "CALL_DEP" is the only value the type allows.
    type: Literal["CALL_DEP"] = "CALL_DEP"
    # Edge multiplicity; defaults to 1 for a single observed call.
    weight: int = 1
    # Names of the analysis backends that reported this edge.
    # NOTE(review): a bare ``[]`` default relies on BaseModel copying
    # defaults per instance (true for pydantic) — confirm.
    provenance: List[Literal["jedi", "codeql", "joern"]] = []
359
+
360
+
342
361
  @builder
343
362
  @msgpk
344
363
  class PyApplication(BaseModel):
345
364
  """Represents a Python application."""
346
365
 
347
366
  symbol_table: Dict[str, PyModule]
367
+ call_graph: List[PyCallEdge] = []
@@ -0,0 +1,266 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """Adapters between the persisted call-graph schema and ``networkx``.
18
+
19
+ The schema persists the call graph as ``List[PyCallEdge]`` with signatures
20
+ referencing ``PyCallable`` entries already in the symbol table. These
21
+ helpers rehydrate it into a ``networkx.DiGraph`` for in-process queries
22
+ (paths, callers, callees) and reduce a built ``DiGraph`` back to the
23
+ serializable edge list.
24
+ """
25
+
26
+ from collections import Counter
27
+ from typing import Dict, Iterator, List, Tuple
28
+
29
+ import networkx as nx
30
+
31
+ from codeanalyzer.schema.py_schema import (
32
+ PyApplication,
33
+ PyCallable,
34
+ PyCallEdge,
35
+ PyClass,
36
+ PyModule,
37
+ )
38
+
39
+
40
def _walk_class_callables(cls: PyClass) -> Iterator[PyCallable]:
    """Yield every callable reachable from ``cls``.

    Methods (each with its nested callables) come first, followed by the
    callables of every inner class, recursively.
    """
    for method in cls.methods.values():
        for found in _walk_callable(method):
            yield found
    for nested_cls in cls.inner_classes.values():
        for found in _walk_class_callables(nested_cls):
            yield found
45
+
46
+
47
def _walk_callable(c: PyCallable) -> Iterator[PyCallable]:
    """Yield ``c`` itself, then everything nested inside it.

    Nested callables are visited before the callables of nested classes;
    both are expanded recursively.
    """
    yield c
    for nested_fn in c.inner_callables.values():
        for found in _walk_callable(nested_fn):
            yield found
    for nested_cls in c.inner_classes.values():
        for found in _walk_class_callables(nested_cls):
            yield found
53
+
54
+
55
def _walk_module_callables(module: PyModule) -> Iterator[PyCallable]:
    """Yield every callable in ``module``: functions first, then classes."""
    for top_level_fn in module.functions.values():
        for found in _walk_callable(top_level_fn):
            yield found
    for top_level_cls in module.classes.values():
        for found in _walk_class_callables(top_level_cls):
            yield found
60
+
61
+
62
def iter_callables_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyCallable]:
    """Recursively yield each ``PyCallable`` held by the symbol table.

    Modules are visited in the dictionary's iteration order.
    """
    for py_module in symbol_table.values():
        for found in _walk_module_callables(py_module):
            yield found
68
+
69
+
70
def _walk_classes_in_class(cls: PyClass) -> Iterator[PyClass]:
    """Yield ``cls`` and every class nested anywhere beneath it.

    Inner classes are expanded first. Methods are then searched as well,
    because a class may be defined inside a method body (for example a
    factory method that declares a helper class).
    """
    yield cls
    for nested_cls in cls.inner_classes.values():
        for found in _walk_classes_in_class(nested_cls):
            yield found
    for method in cls.methods.values():
        for found in _walk_classes_in_callable(method):
            yield found
78
+
79
+
80
def _walk_classes_in_callable(c: PyCallable) -> Iterator[PyClass]:
    """Yield every class defined inside ``c`` or inside its nested callables.

    Classes declared directly in ``c`` (and their own nested classes) are
    yielded before those found in nested callables.
    """
    for nested_cls in c.inner_classes.values():
        for found in _walk_classes_in_class(nested_cls):
            yield found
    for nested_fn in c.inner_callables.values():
        for found in _walk_classes_in_callable(nested_fn):
            yield found
85
+
86
+
87
def iter_classes_in_symbol_table(
    symbol_table: Dict[str, PyModule],
) -> Iterator[PyClass]:
    """Recursively yield each ``PyClass`` held by the symbol table.

    Covers module-level classes, inner classes, classes declared inside
    functions, and classes declared inside methods. Within a module, the
    class hierarchy is walked before the module's functions.
    """
    for py_module in symbol_table.values():
        for top_level_cls in py_module.classes.values():
            for found in _walk_classes_in_class(top_level_cls):
                yield found
        for top_level_fn in py_module.functions.values():
            for found in _walk_classes_in_callable(top_level_fn):
                yield found
98
+
99
+
100
def iter_callables(app: PyApplication) -> Iterator[PyCallable]:
    """Recursively yield each ``PyCallable`` in ``app.symbol_table``."""
    for found in iter_callables_in_symbol_table(app.symbol_table):
        yield found
103
+
104
+
105
def callables_by_signature(app: PyApplication) -> Dict[str, PyCallable]:
    """Index the application's callables by their ``signature``.

    If two callables share a signature, the one visited later replaces
    the earlier entry.
    """
    index: Dict[str, PyCallable] = {}
    for py_callable in iter_callables(app):
        index[py_callable.signature] = py_callable
    return index
108
+
109
+
110
def to_digraph(app: PyApplication) -> nx.DiGraph:
    """Rehydrate a ``PyApplication`` into a ``networkx.DiGraph``.

    Every callable in the symbol table becomes a node keyed by its
    signature, with attributes ``callable=<PyCallable>`` and
    ``ghost=False``. An edge endpoint that has no such node (for example
    a third-party or dynamically resolved callee) is added as a ghost
    node with ``callable=None`` and ``ghost=True`` so the edge is kept.

    Each edge carries ``type``, ``weight`` and a copy of the
    ``provenance`` list.
    """
    graph = nx.DiGraph()

    for signature, py_callable in callables_by_signature(app).items():
        graph.add_node(signature, callable=py_callable, ghost=False)

    for edge in app.call_graph:
        # Register unknown endpoints explicitly so they carry the ghost
        # attributes instead of being created bare by ``add_edge``.
        for endpoint in (edge.source, edge.target):
            if endpoint not in graph.nodes:
                graph.add_node(endpoint, callable=None, ghost=True)
        edge_attrs = {
            "type": edge.type,
            "weight": edge.weight,
            "provenance": list(edge.provenance),
        }
        graph.add_edge(edge.source, edge.target, **edge_attrs)

    return graph
139
+
140
+
141
def from_digraph(g: nx.DiGraph) -> list:
    """Flatten a ``DiGraph`` into a list of ``PyCallEdge`` objects.

    Nodes are ignored; only edges are converted. Missing edge attributes
    fall back to ``type="CALL_DEP"``, ``weight=1`` and an empty
    ``provenance`` list. ``weight`` is coerced with ``int``.
    """
    return [
        PyCallEdge(
            source=caller_sig,
            target=callee_sig,
            type=attrs.get("type", "CALL_DEP"),
            weight=int(attrs.get("weight", 1)),
            provenance=list(attrs.get("provenance", [])),
        )
        for caller_sig, callee_sig, attrs in g.edges(data=True)
    ]
161
+
162
+
163
def jedi_call_graph_edges(
    symbol_table: Dict[str, PyModule],
) -> List[PyCallEdge]:
    """Turn resolved call sites in the symbol table into ``PyCallEdge`` objects.

    Each call site with a truthy ``callee_signature`` contributes one
    occurrence of the pair ``(caller.signature, callee_signature)``.
    Sites whose ``callee_signature`` is ``None`` or empty are skipped.

    Occurrences are coalesced per pair: the resulting edge's ``weight``
    is the number of matching sites and its ``provenance`` is always
    ``["jedi"]``.
    """
    tally: Counter = Counter()
    for caller in iter_callables_in_symbol_table(symbol_table):
        tally.update(
            (caller.signature, site.callee_signature)
            for site in caller.call_sites
            if site.callee_signature
        )

    edges: List[PyCallEdge] = []
    for (caller_sig, callee_sig), hits in tally.items():
        edges.append(
            PyCallEdge(
                source=caller_sig,
                target=callee_sig,
                weight=hits,
                provenance=["jedi"],
            )
        )
    return edges
189
+
190
+
191
def _scope_score(cls_sig: str, caller_sig: str) -> int:
    """Rank how closely the scope that declares a class encloses a caller.

    Args:
        cls_sig: Dotted signature of the candidate class.
        caller_sig: Dotted signature of the calling callable.

    Returns:
        ``len(parent)`` when the class's parent scope (``cls_sig`` minus
        its last dotted component) is the caller itself or a dotted
        ancestor of it; ``0`` when the class signature has no parent
        component; ``-1`` when the parent scope does not enclose the
        caller.
    """
    parent_sig = cls_sig.rpartition(".")[0]
    if not parent_sig:
        # No enclosing scope recorded: weakest acceptable match.
        return 0
    # Match on whole dotted components only. A bare ``startswith`` would
    # treat ``pkg.mod`` as enclosing ``pkg.mod_utils.run``.
    if caller_sig == parent_sig or caller_sig.startswith(parent_sig + "."):
        return len(parent_sig)
    return -1


def resolve_unresolved_constructors(symbol_table: Dict[str, PyModule]) -> int:
    """Fill in ``PyCallsite.callee_signature`` for unresolved constructor sites.

    Targets call sites flagged ``is_constructor_call`` whose
    ``callee_signature`` is still empty, where ``method_name`` holds the
    class's short name. Resolution is heuristic:

    1. Build a ``short_name -> [PyClass]`` index from all classes in the
       symbol table.
    2. For each such site under a caller ``C``, look up candidates by
       ``site.method_name`` and pick the class whose parent scope is the
       longest dotted ancestor of ``C.signature`` — an approximation of
       Python's scoping for nested classes. Candidates whose parent
       scope does not enclose the caller are rejected, so a class
       declared in a different module or function is never chosen.
    3. Set ``callee_signature = f"{class.signature}.__init__"``.

    When several candidates tie, the first one encountered wins.

    Args:
        symbol_table: Symbol table whose call sites are updated in place.

    Returns:
        The number of call sites that were resolved.
    """
    by_name: Dict[str, List[PyClass]] = {}
    for cls in iter_classes_in_symbol_table(symbol_table):
        by_name.setdefault(cls.name, []).append(cls)

    resolved = 0
    for caller in iter_callables_in_symbol_table(symbol_table):
        caller_sig = caller.signature
        for site in caller.call_sites:
            if not site.is_constructor_call or site.callee_signature:
                continue
            candidates = by_name.get(site.method_name)
            if not candidates:
                continue

            # Keep the first candidate with the highest non-negative
            # score; a negative score means "not reachable from caller".
            best = None
            best_score = -1
            for candidate in candidates:
                score = _scope_score(candidate.signature, caller_sig)
                if score > best_score:
                    best, best_score = candidate, score
            if best is None:
                continue

            site.callee_signature = f"{best.signature}.__init__"
            resolved += 1

    return resolved
247
+
248
+
249
def merge_edges(*edge_lists: list) -> list:
    """Merge multiple ``List[PyCallEdge]`` into one.

    Edges sharing the same ``(source, target)`` are coalesced: weights
    are summed and provenance becomes the sorted union. Edges seen only
    once are returned as copies with their provenance unchanged. Output
    order follows the first appearance of each ``(source, target)`` pair.

    The input edges are never modified, and the returned edges share no
    mutable state with them.

    Args:
        *edge_lists: Any number of edge lists, e.g. one per backend.

    Returns:
        A new list with one edge per distinct ``(source, target)`` pair.
    """
    by_key: Dict[Tuple[str, str], PyCallEdge] = {}
    for edges in edge_lists:
        for e in edges:
            k = (e.source, e.target)
            cur = by_key.get(k)
            if cur is None:
                # Deep copy: the default shallow ``model_copy`` would
                # leave the result sharing its ``provenance`` list with
                # the caller's edge.
                by_key[k] = e.model_copy(deep=True)
                continue
            cur.weight += e.weight
            cur.provenance = sorted(set(cur.provenance) | set(e.provenance))
    return list(by_key.values())