codeanalyzer-python 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,13 +20,16 @@ This module provides functionality to create and manage CodeQL databases
20
20
  for Python projects and execute queries against them.
21
21
  """
22
22
 
23
+ from collections import Counter
23
24
  from pathlib import Path
24
- from typing import Union
25
+ from typing import Any, Dict, Iterator, List, Tuple, Union
25
26
 
26
- from networkx import DiGraph
27
27
  from pandas import DataFrame
28
28
 
29
+ from codeanalyzer.schema.py_schema import PyCallEdge, PyModule
30
+ from codeanalyzer.semantic_analysis.call_graph import iter_callables_in_symbol_table
29
31
  from codeanalyzer.semantic_analysis.codeql.codeql_query_runner import CodeQLQueryRunner
32
+ from codeanalyzer.utils import logger
30
33
 
31
34
 
32
35
  class CodeQL:
@@ -40,94 +43,258 @@ class CodeQL:
40
43
  temp_db (TemporaryDirectory or None): The temporary directory object if a temporary database was created.
41
44
  """
42
45
 
43
- def __init__(self, project_dir: Union[str, Path], db_path: Path) -> None:
46
def __init__(
    self,
    project_dir: Union[str, Path],
    db_path: Path,
    codeql_bin: Union[str, Path, None] = None,
    codeql_packs_dir: Union[str, Path, None] = None,
) -> None:
    """Remember where the project, its CodeQL database and the tooling live.

    All arguments are stored exactly as given; nothing is validated,
    resolved or touched on disk here.

    Args:
        project_dir: Root directory of the project under analysis.
        db_path: Location of the CodeQL database.
        codeql_bin: CodeQL CLI binary, forwarded to ``CodeQLQueryRunner``
            when a query is run. ``None`` is forwarded unchanged.
        codeql_packs_dir: Query-pack directory, forwarded to
            ``CodeQLQueryRunner`` when a query is run.
    """
    # No query has run yet: ``_query_call_edges`` fills this slot after its
    # first execution and returns it on every later call.
    self._cached_df: "DataFrame | None" = None

    # Tooling locations, handed to the query runner on demand.
    self.codeql_packs_dir, self.codeql_bin = codeql_packs_dir, codeql_bin

    # What is being analysed and where its database lives.
    self.db_path, self.project_dir = db_path, project_dir
46
58
 
47
- def _build_call_graph(self) -> DiGraph:
48
- """Builds the call graph of the application.
59
def _query_call_edges(self) -> DataFrame:
    """Run (once) the CodeQL query that emits one row per resolved call site.

    The query is written against CodeQL's Python library
    (``import python``). It returns physical location handles for both
    endpoints so the downstream post-processor can join into Jedi's
    existing ``PyCallable.signature`` space via ``(file_path, start_line)``.

    Filters:
        * The caller must be a ``Function``: the query requires
          ``call.getScope() = caller``, so module-level and class-body
          calls produce no row.
        * The callee is whatever ``FunctionValue`` the call resolves to,
          either directly or, for a call on a class, through that class's
          ``__init__`` lookup.

    The result is cached on the instance. The cache is checked before any
    other work so repeat calls neither rebuild the query text nor start
    CodeQL again.

    Returns:
        DataFrame: one row per resolved (caller, callee, call-site) triple
        with the columns ``caller_file``, ``caller_start_line``,
        ``caller_qname``, ``callee_file``, ``callee_start_line``,
        ``callee_qname``, ``call_start_line``, ``call_start_column``,
        ``call_end_line`` and ``call_end_column``.
    """
    # Cheap exit first: a cached frame makes everything below redundant.
    if self._cached_df is not None:
        return self._cached_df

    query = [
        "/**",
        " * @name Python call-graph edges",
        " * @description One row per resolved call site: caller, callee,",
        " * and the call-expression location.",
        " * @kind table",
        " * @id py/codeanalyzer/call-graph-edges",
        " */",
        "import python",
        # ``FunctionValue`` / ``ClassValue`` live in ObjectAPI, which
        # ``import python`` does not re-export; pull it in explicitly.
        "import semmle.python.objects.ObjectAPI",
        "",
        # ``Value.getACall()`` yields the ``CallNode`` whose target was
        # resolved to that ``Value``.
        "from CallNode call, Function caller, FunctionValue calleeVal",
        "where",
        " call.getScope() = caller and",
        " (",
        # Direct function / bound-method call: foo() or obj.foo()
        " call = calleeVal.getACall()",
        " or",
        # Constructor call: A(...) resolves to a ClassValue; the callee is
        # whatever ``__init__`` the class lookup finds.
        " exists(ClassValue clsVal |",
        " call = clsVal.getACall() and",
        ' clsVal.lookup("__init__") = calleeVal',
        " )",
        " )",
        "select",
        # --- Caller endpoint --- (joins to PyCallable via file + start_line)
        " caller.getLocation().getFile().getAbsolutePath(),",
        " caller.getLocation().getStartLine(),",
        " caller.getQualifiedName(),",
        # --- Callee endpoint --- (classified in-source vs ghost downstream)
        " calleeVal.getScope().getLocation().getFile().getAbsolutePath(),",
        " calleeVal.getScope().getLocation().getStartLine(),",
        " calleeVal.getQualifiedName(),",
        # --- Call-site location --- (for PyCallsite augmentation)
        " call.getLocation().getStartLine(),",
        " call.getLocation().getStartColumn(),",
        " call.getLocation().getEndLine(),",
        " call.getLocation().getEndColumn()",
    ]
    query_string = "\n".join(query)

    with CodeQLQueryRunner(
        self.db_path,
        codeql_bin=self.codeql_bin,
        codeql_packs_dir=self.codeql_packs_dir,
    ) as runner:
        df: DataFrame = runner.execute(
            query_string,
            # Order must match the ``select`` clause above.
            column_names=[
                "caller_file",
                "caller_start_line",
                "caller_qname",
                "callee_file",
                "callee_start_line",
                "callee_qname",
                "call_start_line",
                "call_start_column",
                "call_end_line",
                "call_end_column",
            ],
        )

    self._cached_df = df
    return df
123
163
 
124
164
  @staticmethod
125
- def __process_call_edges_to_callgraph(query_result: DataFrame) -> DiGraph:
126
- """Processes call edges from query results into a call graph.
165
def _build_callable_location_index(
    symbol_table: Dict[str, PyModule],
) -> Dict[Tuple[str, int], "PyCallable"]:
    """Map ``(file_path, start_line)`` to the ``PyCallable`` defined there.

    Every callable produced by ``iter_callables_in_symbol_table`` is keyed
    by its path and ``start_line``. The path is passed through
    ``Path.resolve()`` so it is absolute and symlink-free; when resolution
    fails with ``OSError`` or ``RuntimeError`` the callable's original
    ``path`` is used as the key instead. If two callables share a key, the
    one yielded later wins.

    Args:
        symbol_table: Symbol table to scan for callables.

    Returns:
        The location-keyed lookup table.
    """
    from codeanalyzer.schema.py_schema import PyCallable  # local to avoid cycle

    def canonical(raw_path):
        # Resolution can fail on odd paths; fall back to the raw value
        # rather than dropping the callable from the index.
        try:
            return str(Path(raw_path).resolve())
        except (OSError, RuntimeError):
            return raw_path

    by_location: Dict[Tuple[str, int], PyCallable] = {
        (canonical(fn.path), fn.start_line): fn
        for fn in iter_callables_in_symbol_table(symbol_table)
    }
    return by_location
183
+
184
def _iter_resolved_rows(
    self, symbol_table: "Dict[str, PyModule]"
) -> "Iterator[Tuple[str, str, Any]]":
    """Yield ``(source_sig, target_sig, row)`` for each usable CodeQL row.

    Each row from ``_query_call_edges`` is joined to the location index
    built from ``symbol_table``:

    * a row whose caller location is not in the index is dropped;
    * a callee found in the index contributes its ``signature``;
    * a callee missing from the index falls back to ``row.callee_qname``
      (a "ghost" target).

    Nothing is yielded, and no index is built, when the query result is
    empty. The two summary ``debug`` messages are emitted only after the
    rows have been fully consumed.

    Args:
        symbol_table: Symbol table used to build the location index.

    Yields:
        ``(caller signature, target signature or qualified name, row)``.
    """
    edges = self._query_call_edges()
    if edges.empty:
        return

    by_location = self._build_callable_location_index(symbol_table)
    unmatched_callers = 0
    external_callees = 0

    for row in edges.itertuples(index=False):
        source = by_location.get((row.caller_file, int(row.caller_start_line)))
        if source is None:
            # No PyCallable to anchor the edge to.
            unmatched_callers += 1
            continue

        target = by_location.get((row.callee_file, int(row.callee_start_line)))
        if target is None:
            # Callee is outside the symbol table: keep CodeQL's name for it.
            target_sig = row.callee_qname
            external_callees += 1
        else:
            target_sig = target.signature

        yield source.signature, target_sig, row

    if unmatched_callers:
        logger.debug(
            f"CodeQL: skipped {unmatched_callers} rows whose caller "
            f"was not in Jedi's symbol table."
        )
    if external_callees:
        logger.debug(
            f"CodeQL: {external_callees} rows resolved to ghost (external) callees."
        )
227
+
228
def build_call_graph_edges(
    self, symbol_table: "Dict[str, PyModule]"
) -> "List[PyCallEdge]":
    """Collapse the resolved CodeQL rows into weighted ``PyCallEdge`` objects.

    Rows from ``_iter_resolved_rows`` are grouped by their
    ``(source, target)`` pair. Each distinct pair becomes one edge whose
    ``weight`` is the number of rows in the group and whose ``provenance``
    is ``["codeql"]``. Edges come out in first-seen order of their pair.

    Args:
        symbol_table: Symbol table forwarded to ``_iter_resolved_rows``.

    Returns:
        One ``PyCallEdge`` per distinct ``(source, target)`` pair; an empty
        list when no row resolves.
    """
    # Counter over the pairs does the coalescing: value == number of rows.
    call_site_tally = Counter(
        (caller_sig, callee_sig)
        for caller_sig, callee_sig, _ in self._iter_resolved_rows(symbol_table)
    )

    return [
        PyCallEdge(
            source=caller_sig,
            target=callee_sig,
            weight=n_sites,
            provenance=["codeql"],
        )
        for (caller_sig, callee_sig), n_sites in call_site_tally.items()
    ]
251
+
252
def augment_call_sites(self, symbol_table: "Dict[str, PyModule]") -> int:
    """Fill empty ``PyCallsite.callee_signature`` fields from CodeQL rows.

    For every CodeQL row whose caller is found in the location index, the
    caller's ``call_sites`` are searched for the first entry whose
    ``(start_line, end_line)`` equals the row's call-expression line range.
    That entry receives the resolved signature only if its
    ``callee_signature`` is currently falsy; a signature that is already
    set is never overwritten. Whether or not the first match was filled,
    no further call site is examined for that row.

    The resolved signature is the indexed callee's ``signature`` when the
    callee location is in the index, otherwise ``row.callee_qname``.

    Matching uses line ranges only, so several calls on the same line
    (e.g. ``a.b().c()``) all map to the first call site on that line.

    Args:
        symbol_table: Symbol table used to build the location index.

    Returns:
        Number of ``PyCallsite`` entries that were filled in.
    """
    by_location = self._build_callable_location_index(symbol_table)
    edges = self._query_call_edges()
    if edges.empty:
        return 0

    filled = 0
    for row in edges.itertuples(index=False):
        owner = by_location.get((row.caller_file, int(row.caller_start_line)))
        if owner is None:
            continue

        target = by_location.get((row.callee_file, int(row.callee_start_line)))
        signature = row.callee_qname if target is None else target.signature

        span = (int(row.call_start_line), int(row.call_end_line))
        # First call site covering exactly these lines, if any.
        site = next(
            (s for s in owner.call_sites if (s.start_line, s.end_line) == span),
            None,
        )
        if site is not None and not site.callee_signature:
            site.callee_signature = signature
            filled += 1

    if filled:
        logger.debug(
            f"CodeQL: augmented {filled} PyCallsite.callee_signature entries."
        )
    return filled
@@ -1,4 +1,6 @@
1
+ import os
1
2
  import platform
3
+ import stat
2
4
  import zipfile
3
5
  from pathlib import Path
4
6
 
@@ -52,12 +54,38 @@ class CodeQLLoader:
52
54
  extract_dir = temp_dir / filename.replace(".zip", "")
53
55
  extract_dir.mkdir(exist_ok=True)
54
56
 
55
- print(f"Extracting CodeQL CLI to {extract_dir}")
57
+ logger.info(f"Extracting CodeQL CLI to {extract_dir}")
58
+ # zipfile.extractall drops Unix permissions (the executable bit), so
59
+ # we extract entries manually and copy each one's stored mode onto
60
+ # the file system. Without this, the CodeQL launcher script can't
61
+ # be executed and the next subprocess.Popen raises PermissionError.
56
62
  with zipfile.ZipFile(archive_path, "r") as zip_ref:
57
- zip_ref.extractall(extract_dir)
63
+ for info in zip_ref.infolist():
64
+ extracted_path = zip_ref.extract(info, extract_dir)
65
+ stored_mode = info.external_attr >> 16
66
+ if stored_mode:
67
+ os.chmod(extracted_path, stored_mode)
58
68
 
59
- codeql_bin = next(extract_dir.rglob("codeql"), None)
60
- if not codeql_bin or not codeql_bin.exists():
69
+ # Archive is no longer needed once extracted.
70
+ try:
71
+ archive_path.unlink()
72
+ except OSError as exc:
73
+ logger.warning(f"Could not remove CodeQL archive {archive_path}: {exc}")
74
+
75
+ # rglob("codeql") returns both the launcher file *and* an internal
76
+ # directory of the same name (CodeQL ships its own runtime under
77
+ # ``codeql/codeql/``); insist on a regular file so we never bind to
78
+ # the directory.
79
+ codeql_bin = next(
80
+ (p for p in extract_dir.rglob("codeql") if p.is_file()),
81
+ None,
82
+ )
83
+ if not codeql_bin:
61
84
  raise FileNotFoundError("CodeQL binary not found in extracted contents.")
62
85
 
86
+ # Belt-and-suspenders: ensure the binary is executable even if the
87
+ # archive entry's mode was zero (some older zip producers omit it).
88
+ st = codeql_bin.stat()
89
+ codeql_bin.chmod(st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
90
+
63
91
  return codeql_bin.resolve()
@@ -40,9 +40,13 @@ class CodeQLQueryRunner:
40
40
 
41
41
  Args:
42
42
  database_path (str): The path to the CodeQL database.
43
+ codeql_bin (str | Path | None): Absolute path to the CodeQL CLI
44
+ binary. When ``None``, falls back to whatever ``codeql`` is on
45
+ ``PATH``.
43
46
 
44
47
  Attributes:
45
48
  database_path (Path): The path to the CodeQL database.
49
+ codeql_bin (str): Resolved binary path or the literal ``"codeql"``.
46
50
  temp_file_path (Path): The path to the temporary query file.
47
51
  csv_output_file (Path): The path to the CSV output file.
48
52
  temp_bqrs_file_path (Path): The path to the temporary bqrs file.
@@ -52,39 +56,46 @@ class CodeQLQueryRunner:
52
56
  CodeQLQueryExecutionException: If there is an error executing the query.
53
57
  """
54
58
 
55
- def __init__(self, database_path: str):
59
def __init__(self, database_path: str, codeql_bin=None, codeql_packs_dir=None):
    """Configure a runner for one CodeQL database.

    Args:
        database_path: Path to the CodeQL database; stored as a ``Path``.
        codeql_bin: CodeQL CLI binary. Any truthy value is stored as
            ``str(codeql_bin)``; a falsy value selects the literal
            ``"codeql"``.
        codeql_packs_dir: Directory in which ``__enter__`` creates the
            ``.ql`` file. Stored as a ``Path``, or ``None`` when omitted.
    """
    self.database_path: Path = Path(database_path)
    self.codeql_bin: str = str(codeql_bin) if codeql_bin else "codeql"
    self.codeql_packs_dir = (
        Path(codeql_packs_dir) if codeql_packs_dir is not None else None
    )

    # Per-query scratch files are assigned by ``__enter__``. All three are
    # declared here so the ``if self.<attr> and self.<attr>.exists()``
    # guards in ``__exit__`` see ``None`` instead of raising
    # ``AttributeError`` when ``__enter__`` never completed.
    self.temp_file_path: "Path | None" = None
    self.csv_output_file: "Path | None" = None
    self.temp_bqrs_file_path: "Path | None" = None
58
66
 
59
67
def __enter__(self):
    """Context entry that creates the files needed to run one CodeQL query.

    Three empty files are created and their paths stored on the instance:

    * ``temp_file_path`` – the ``.ql`` query file, created inside
      ``codeql_packs_dir`` so the query sits within that pack directory;
    * ``csv_output_file`` – a ``.csv`` file in the default temp directory;
    * ``temp_bqrs_file_path`` – a ``.bqrs`` file in the default temp
      directory.

    ``codeql_packs_dir`` is validated before anything is created, and the
    ``.ql`` file (the only creation that depends on caller-supplied input)
    is made first. If any creation fails, files already created by this
    call are removed again, so a failed entry leaves nothing behind.

    Returns:
        CodeQLQueryRunner: this instance.

    Raises:
        RuntimeError: If ``codeql_packs_dir`` is ``None``.
        OSError: If a file cannot be created, e.g. the pack directory does
            not exist.
    """
    if self.codeql_packs_dir is None:
        raise RuntimeError(
            "CodeQLQueryRunner requires codeql_packs_dir — the directory "
            "of an installed qlpack that depends on codeql/python-all."
        )

    # (suffix, directory); ``None`` means the default temp directory.
    wanted = (
        (".ql", str(self.codeql_packs_dir)),
        (".csv", None),
        (".bqrs", None),
    )

    created = []
    try:
        for suffix, directory in wanted:
            # Only the path is needed now; the files are reopened by path
            # later, so each handle is closed straight away.
            with tempfile.NamedTemporaryFile(
                "w", delete=False, suffix=suffix, dir=directory
            ) as handle:
                created.append(Path(handle.name))
    except BaseException:
        # ``__exit__`` is not invoked when ``__enter__`` raises, so undo
        # the partial work here. Removal is best-effort: the original
        # error is the one the caller needs to see.
        for leftover in created:
            try:
                leftover.unlink()
            except OSError:
                pass
        raise

    self.temp_file_path, self.csv_output_file, self.temp_bqrs_file_path = created
    return self
90
101
 
@@ -108,32 +119,41 @@ class CodeQLQueryRunner:
108
119
  # Write the query to the temp file so we can execute it.
109
120
  self.temp_file_path.write_text(query_string)
110
121
 
111
- # Construct and execute the CodeQL CLI command asking for a JSON output.
122
+ # The .ql file sits inside the qlpack directory whose lock file
123
+ # already resolves ``codeql/python-all`` and its transitive
124
+ # dependencies. ``codeql query run`` auto-discovers the enclosing
125
+ # qlpack — no extra flags required.
112
126
  codeql_query_cmd = shlex.split(
113
- f"codeql query run {self.temp_file_path} --database={self.database_path} --output={self.temp_bqrs_file_path}",
127
+ f"{shlex.quote(self.codeql_bin)} query run {self.temp_file_path} "
128
+ f"--database={self.database_path} "
129
+ f"--output={self.temp_bqrs_file_path}",
114
130
  posix=False,
115
131
  )
116
132
 
117
- call = subprocess.Popen(codeql_query_cmd, stdout=None, stderr=None)
133
+ call = subprocess.Popen(
134
+ codeql_query_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
135
+ )
118
136
  _, err = call.communicate()
119
137
  if call.returncode != 0:
120
138
  raise CodeQLExceptions.CodeQLQueryExecutionException(
121
- f"Error executing query: {err.stderr}"
139
+ f"Error executing query: {(err or b'').decode(errors='replace')}"
122
140
  )
123
141
 
124
142
  # Convert the bqrs file to a CSV file
125
143
  bqrs2csv_command = shlex.split(
126
- f"codeql bqrs decode --format=csv --output={self.csv_output_file} {self.temp_bqrs_file_path}",
144
+ f"{shlex.quote(self.codeql_bin)} bqrs decode --format=csv --output={self.csv_output_file} {self.temp_bqrs_file_path}",
127
145
  posix=False,
128
146
  )
129
147
 
130
148
  # Read the CSV file content and cast it to a DataFrame
131
149
 
132
- call = subprocess.Popen(bqrs2csv_command, stdout=None, stderr=None)
150
+ call = subprocess.Popen(
151
+ bqrs2csv_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
152
+ )
133
153
  _, err = call.communicate()
134
154
  if call.returncode != 0:
135
155
  raise CodeQLExceptions.CodeQLQueryExecutionException(
136
- f"Error executing query: {err.stderr}"
156
+ f"Error decoding bqrs: {(err or b'').decode(errors='replace')}"
137
157
  )
138
158
  else:
139
159
  return pd.read_csv(
@@ -161,5 +181,5 @@ class CodeQLQueryRunner:
161
181
  if self.csv_output_file and self.csv_output_file.exists():
162
182
  self.csv_output_file.unlink()
163
183
 
164
- if self.temp_qlpack_file and self.temp_qlpack_file.exists():
165
- self.temp_qlpack_file.unlink()
184
+ if self.temp_bqrs_file_path and self.temp_bqrs_file_path.exists():
185
+ self.temp_bqrs_file_path.unlink()
@@ -4,7 +4,7 @@ import tokenize
4
4
  from ast import AST, ClassDef
5
5
  from io import StringIO
6
6
  from pathlib import Path
7
- from typing import Dict, List, Optional, Union
7
+ from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
9
  import jedi
10
10
  from jedi.api import Script
@@ -71,6 +71,32 @@ class SymbolTableBuilder:
71
71
  pass
72
72
  return None
73
73
 
74
+ @staticmethod
75
def _infer_callee(
    script: "Script", line: int, column: int
) -> Tuple[Optional[str], bool]:
    """Resolve the callee at a source position to ``(qualified_name, is_class)``.

    Only the first result of ``script.infer`` is considered. When that
    result's ``type`` is ``"class"`` and it has a non-empty ``full_name``,
    ``.__init__`` is appended so the name refers to the constructor rather
    than the class. A class without a ``full_name`` is returned with its
    name unchanged and ``is_class`` still ``True``.

    Args:
        script: Object whose ``infer(line=..., column=...)`` is queried.
        line: Line of the call expression.
        column: Column of the call expression.

    Returns:
        ``(qualified_name, is_class)``. ``(None, False)`` when inference
        yields nothing or raises anything.
    """
    try:
        candidates = script.infer(line=line, column=column)
        if not candidates:
            return None, False

        best = candidates[0]
        kind = best.type
        qualified = best.full_name
        if kind != "class":
            return qualified, False
        # Constructor call: point at ``__init__`` when there is a name to
        # extend; otherwise pass the missing name through untouched.
        return (f"{qualified}.__init__" if qualified else qualified), True
    except Exception:
        # Inference is best-effort; any failure means "unresolved".
        return None, False
99
+
74
100
  def build_pymodule_from_file(self, py_file: Path) -> PyModule:
75
101
  """Builds a PyModule from a Python file.
76
102
 
@@ -485,6 +511,63 @@ class SymbolTableBuilder:
485
511
  symbols.append(symbol)
486
512
  return symbols
487
513
 
514
+ @staticmethod
515
def _iter_calls_in_scope(fn_node: ast.AST):
    """Yield the ``ast.Call`` nodes that belong to ``fn_node``'s own scope.

    The statements in ``fn_node.body`` are traversed depth-first in source
    order (a call is yielded before the calls nested inside it). The
    traversal stops at scope boundaries:

    * For a nested ``FunctionDef`` / ``AsyncFunctionDef`` only its
      decorators, positional defaults, keyword-only defaults and return
      annotation are searched; its body is skipped.
    * For a nested ``ClassDef`` only its decorators, base classes and
      class keyword values are searched; its body is skipped.
    * Every other node, including lambdas and comprehensions, is searched
      in full, so calls inside them are attributed to ``fn_node``.

    ``fn_node``'s own decorators, defaults and return annotation are not
    searched; only its ``body`` is. A node without a ``body`` attribute
    yields nothing.

    The traversal uses an explicit stack instead of recursion, so deeply
    nested expressions (for example a very long ``a + b + c + ...`` chain)
    cannot exhaust the interpreter's recursion limit.

    Args:
        fn_node: The function (or any node with a ``body`` list) to scan.

    Yields:
        ``ast.Call`` nodes, in depth-first pre-order.
    """

    def visible_children(node: ast.AST):
        # Parts of a nested definition that execute in the enclosing scope.
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            parts = list(node.decorator_list)
            parts.extend(node.args.defaults)
            # ``kw_defaults`` holds ``None`` for keyword-only arguments
            # that have no default.
            parts.extend(d for d in node.args.kw_defaults if d is not None)
            if node.returns is not None:
                parts.append(node.returns)
            return parts
        if isinstance(node, ast.ClassDef):
            parts = list(node.decorator_list)
            parts.extend(node.bases)
            parts.extend(kw.value for kw in node.keywords)
            return parts
        return list(ast.iter_child_nodes(node))

    # LIFO stack; children are pushed reversed so they pop in source order.
    pending = list(reversed(getattr(fn_node, "body", [])))
    while pending:
        node = pending.pop()
        if isinstance(node, ast.Call):
            yield node
        pending.extend(reversed(visible_children(node)))
570
+
488
571
  def _call_sites(self, fn_node: ast.FunctionDef, script: Script) -> List[PyCallsite]:
489
572
  """
490
573
  Finds all call sites made from within the function using Jedi for type inference.
@@ -498,14 +581,14 @@ class SymbolTableBuilder:
498
581
  """
499
582
  call_sites: List[PyCallsite] = []
500
583
 
501
- for node in ast.walk(fn_node):
584
+ for node in self._iter_calls_in_scope(fn_node):
502
585
  if not isinstance(node, ast.Call):
503
586
  continue
504
587
 
505
588
  func_expr = node.func
506
589
 
507
590
  method_name = "<unknown>"
508
- callee_signature = self._infer_qualified_name(
591
+ callee_signature, is_constructor = self._infer_callee(
509
592
  script, node.lineno, node.col_offset
510
593
  )
511
594
  return_type = self._infer_type(script, node.lineno, node.col_offset)
@@ -535,7 +618,7 @@ class SymbolTableBuilder:
535
618
  .argument_types(argument_types)
536
619
  .return_type(return_type)
537
620
  .callee_signature(callee_signature)
538
- .is_constructor_call(method_name == "__init__")
621
+ .is_constructor_call(is_constructor)
539
622
  .start_line(getattr(node, "lineno", -1))
540
623
  .start_column(getattr(node, "col_offset", -1))
541
624
  .end_line(getattr(node, "end_lineno", -1))