codeanalyzer-python 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeanalyzer/__main__.py +0 -5
- codeanalyzer/core.py +154 -19
- codeanalyzer/options/options.py +0 -1
- codeanalyzer/schema/py_schema.py +20 -0
- codeanalyzer/semantic_analysis/call_graph.py +266 -0
- codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +236 -69
- codeanalyzer/semantic_analysis/codeql/codeql_loader.py +32 -4
- codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +51 -31
- codeanalyzer/syntactic_analysis/symbol_table_builder.py +87 -4
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/METADATA +20 -42
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/RECORD +15 -15
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/WHEEL +1 -1
- codeanalyzer/semantic_analysis/wala/__init__.py +0 -15
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/entry_points.txt +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.1.14.dist-info}/licenses/NOTICE +0 -0
|
@@ -20,13 +20,16 @@ This module provides functionality to create and manage CodeQL databases
|
|
|
20
20
|
for Python projects and execute queries against them.
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
+
from collections import Counter
|
|
23
24
|
from pathlib import Path
|
|
24
|
-
from typing import Union
|
|
25
|
+
from typing import Any, Dict, Iterator, List, Tuple, Union
|
|
25
26
|
|
|
26
|
-
from networkx import DiGraph
|
|
27
27
|
from pandas import DataFrame
|
|
28
28
|
|
|
29
|
+
from codeanalyzer.schema.py_schema import PyCallEdge, PyModule
|
|
30
|
+
from codeanalyzer.semantic_analysis.call_graph import iter_callables_in_symbol_table
|
|
29
31
|
from codeanalyzer.semantic_analysis.codeql.codeql_query_runner import CodeQLQueryRunner
|
|
32
|
+
from codeanalyzer.utils import logger
|
|
30
33
|
|
|
31
34
|
|
|
32
35
|
class CodeQL:
|
|
@@ -40,94 +43,258 @@ class CodeQL:
|
|
|
40
43
|
temp_db (TemporaryDirectory or None): The temporary directory object if a temporary database was created.
|
|
41
44
|
"""
|
|
42
45
|
|
|
43
|
-
def __init__(
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
project_dir: Union[str, Path],
|
|
49
|
+
db_path: Path,
|
|
50
|
+
codeql_bin: Union[str, Path, None] = None,
|
|
51
|
+
codeql_packs_dir: Union[str, Path, None] = None,
|
|
52
|
+
) -> None:
|
|
44
53
|
self.project_dir = project_dir
|
|
45
54
|
self.db_path = db_path
|
|
55
|
+
self.codeql_bin = codeql_bin
|
|
56
|
+
self.codeql_packs_dir = codeql_packs_dir
|
|
57
|
+
self._cached_df: "DataFrame | None" = None
|
|
46
58
|
|
|
47
|
-
def
|
|
48
|
-
"""
|
|
59
|
+
def _query_call_edges(self) -> DataFrame:
|
|
60
|
+
"""Runs the CodeQL query that emits one row per resolved call site.
|
|
49
61
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
62
|
+
The query is written against CodeQL's Python library (``import python``).
|
|
63
|
+
It returns physical location handles for both endpoints so the
|
|
64
|
+
downstream post-processor can join into Jedi's existing
|
|
65
|
+
``PyCallable.signature`` space via ``(file_path, start_line)`` —
|
|
66
|
+
no signature normalization required.
|
|
54
67
|
|
|
55
|
-
|
|
56
|
-
|
|
68
|
+
Filters:
|
|
69
|
+
* Caller must be a ``Function`` (skip module-level / class-body
|
|
70
|
+
calls — they have no ``PyCallable`` to anchor to).
|
|
71
|
+
* Callee may resolve to anything (in-source or library stub);
|
|
72
|
+
non-application callees become **ghost** nodes downstream so
|
|
73
|
+
RPC / third-party / framework edges are preserved.
|
|
57
74
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
75
|
+
Returns:
|
|
76
|
+
DataFrame: one row per resolved (caller, callee, call-site)
|
|
77
|
+
triple. Duplicate ``(caller_file, caller_start_line,
|
|
78
|
+
callee_file, callee_start_line)`` tuples represent multiple
|
|
79
|
+
call sites in the same caller targeting the same callee and
|
|
80
|
+
are coalesced into a single ``PyCallEdge`` (weight = count)
|
|
81
|
+
by the post-processor.
|
|
82
|
+
"""
|
|
83
|
+
query = [
|
|
84
|
+
"/**",
|
|
85
|
+
" * @name Python call-graph edges",
|
|
86
|
+
" * @description One row per resolved call site: caller, callee,",
|
|
87
|
+
" * and the call-expression location.",
|
|
88
|
+
" * @kind table",
|
|
89
|
+
" * @id py/codeanalyzer/call-graph-edges",
|
|
90
|
+
" */",
|
|
91
|
+
"import python",
|
|
92
|
+
# ``FunctionValue`` / ``ClassValue`` / the ``pointsTo`` predicate
|
|
93
|
+
# live in ObjectAPI, which ``import python`` only brings in as a
|
|
94
|
+
# private import — they aren't re-exported. Pull them in
|
|
95
|
+
# explicitly.
|
|
96
|
+
"import semmle.python.objects.ObjectAPI",
|
|
97
|
+
"",
|
|
98
|
+
# ``Value.getACall()`` is the modern call-resolution API in
|
|
99
|
+
# codeql/python-all 7.x — it returns the ``CallNode`` (CFG)
|
|
100
|
+
# whose target was resolved to that ``Value``. Cleaner than
|
|
101
|
+
# poking at ``pointsTo`` directly.
|
|
102
|
+
"from CallNode call, Function caller, FunctionValue calleeVal",
|
|
61
103
|
"where",
|
|
62
|
-
"
|
|
63
|
-
"
|
|
64
|
-
|
|
104
|
+
" call.getScope() = caller and",
|
|
105
|
+
" (",
|
|
106
|
+
# Direct function / bound-method call: foo() or obj.foo()
|
|
107
|
+
" call = calleeVal.getACall()",
|
|
108
|
+
" or",
|
|
109
|
+
# Constructor call: A(...) resolves to a ClassValue; the actual
|
|
110
|
+
# callee is the class's __init__ (via MRO lookup so subclasses
|
|
111
|
+
# without an explicit __init__ still resolve to the inherited one).
|
|
112
|
+
" exists(ClassValue clsVal |",
|
|
113
|
+
" call = clsVal.getACall() and",
|
|
114
|
+
' clsVal.lookup("__init__") = calleeVal',
|
|
115
|
+
" )",
|
|
116
|
+
" )",
|
|
65
117
|
"select",
|
|
118
|
+
# --- Caller endpoint --- (joins to PyCallable via file + start_line)
|
|
119
|
+
" caller.getLocation().getFile().getAbsolutePath(),",
|
|
120
|
+
" caller.getLocation().getStartLine(),",
|
|
121
|
+
" caller.getQualifiedName(),",
|
|
122
|
+
# --- Callee endpoint --- (file/line may live in a library stub;
|
|
123
|
+
# post-processor classifies as in-source or ghost)
|
|
124
|
+
" calleeVal.getScope().getLocation().getFile().getAbsolutePath(),",
|
|
125
|
+
" calleeVal.getScope().getLocation().getStartLine(),",
|
|
126
|
+
" calleeVal.getQualifiedName(),",
|
|
127
|
+
# --- Call-site location --- (for PyCallsite augmentation)
|
|
128
|
+
" call.getLocation().getStartLine(),",
|
|
129
|
+
" call.getLocation().getStartColumn(),",
|
|
130
|
+
" call.getLocation().getEndLine(),",
|
|
131
|
+
" call.getLocation().getEndColumn()",
|
|
132
|
+
# ``is_constructor`` is derived in the post-processor by
|
|
133
|
+
# checking whether ``callee_qname`` ends in ``.__init__``;
|
|
134
|
+
# avoids QL's restrictive ``if-then-else`` typing here.
|
|
66
135
|
]
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
query += [
|
|
70
|
-
"caller.getFile().getAbsolutePath(),",
|
|
71
|
-
'"[" + caller.getBody().getLocation().getStartLine() + ", " + caller.getBody().getLocation().getEndLine() + "]", //Caller body slice indices',
|
|
72
|
-
"caller.getQualifiedName(), // Caller's fullsignature",
|
|
73
|
-
"caller.getAModifier(), // caller's method modifier",
|
|
74
|
-
"caller.paramsString(), // caller's method parameter types",
|
|
75
|
-
"caller.getReturnType().toString(), // Caller's return type",
|
|
76
|
-
"caller.getDeclaringType().getQualifiedName(), // Caller's class",
|
|
77
|
-
"caller.getDeclaringType().getAModifier(), // Caller's class modifier",
|
|
78
|
-
]
|
|
79
|
-
|
|
80
|
-
# Callee metadata
|
|
81
|
-
query += [
|
|
82
|
-
"callee.getFile().getAbsolutePath(),",
|
|
83
|
-
'"[" + callee.getBody().getLocation().getStartLine() + ", " + callee.getBody().getLocation().getEndLine() + "]", //Caller body slice indices',
|
|
84
|
-
"callee.getQualifiedName(), // Caller's fullsignature",
|
|
85
|
-
"callee.getAModifier(), // callee's method modifier",
|
|
86
|
-
"callee.paramsString(), // callee's method parameter types",
|
|
87
|
-
"callee.getReturnType().toString(), // Caller's return type",
|
|
88
|
-
"callee.getDeclaringType().getQualifiedName(), // Caller's class",
|
|
89
|
-
"callee.getDeclaringType().getAModifier() // Caller's class modifier",
|
|
90
|
-
]
|
|
136
|
+
if self._cached_df is not None:
|
|
137
|
+
return self._cached_df
|
|
91
138
|
|
|
92
139
|
query_string = "\n".join(query)
|
|
93
140
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
141
|
+
with CodeQLQueryRunner(
|
|
142
|
+
self.db_path,
|
|
143
|
+
codeql_bin=self.codeql_bin,
|
|
144
|
+
codeql_packs_dir=self.codeql_packs_dir,
|
|
145
|
+
) as runner:
|
|
146
|
+
df: DataFrame = runner.execute(
|
|
97
147
|
query_string,
|
|
98
148
|
column_names=[
|
|
99
|
-
# Caller Columns
|
|
100
149
|
"caller_file",
|
|
101
|
-
"
|
|
102
|
-
"
|
|
103
|
-
"caller_modifier",
|
|
104
|
-
"caller_params",
|
|
105
|
-
"caller_return_type",
|
|
106
|
-
"caller_class_signature",
|
|
107
|
-
"caller_class_modifier",
|
|
108
|
-
# Callee Columns
|
|
150
|
+
"caller_start_line",
|
|
151
|
+
"caller_qname",
|
|
109
152
|
"callee_file",
|
|
110
|
-
"
|
|
111
|
-
"
|
|
112
|
-
"
|
|
113
|
-
"
|
|
114
|
-
"
|
|
115
|
-
"
|
|
116
|
-
"callee_class_modifier",
|
|
153
|
+
"callee_start_line",
|
|
154
|
+
"callee_qname",
|
|
155
|
+
"call_start_line",
|
|
156
|
+
"call_start_column",
|
|
157
|
+
"call_end_line",
|
|
158
|
+
"call_end_column",
|
|
117
159
|
],
|
|
118
160
|
)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
callgraph: DiGraph = self.__process_call_edges_to_callgraph(query_result)
|
|
122
|
-
return callgraph
|
|
161
|
+
self._cached_df = df
|
|
162
|
+
return df
|
|
123
163
|
|
|
124
164
|
@staticmethod
def _build_callable_location_index(
    symbol_table: Dict[str, PyModule],
) -> Dict[Tuple[str, int], "PyCallable"]:
    """Map ``(absolute_file_path, start_line)`` to each Jedi ``PyCallable``.

    Every callable produced by ``iter_callables_in_symbol_table`` is keyed
    by its resolved file path and its ``start_line``. Resolving the path is
    intended to make the key line up with CodeQL's ``getAbsolutePath()``
    regardless of symlinks or the current working directory. When two
    callables share a key, the one seen last wins.
    """
    from codeanalyzer.schema.py_schema import PyCallable  # local to avoid cycle

    def _absolute(raw_path):
        # Fall back to the path as recorded when it cannot be resolved.
        try:
            return str(Path(raw_path).resolve())
        except (OSError, RuntimeError):
            return raw_path

    by_location: Dict[Tuple[str, int], PyCallable] = {}
    for entry in iter_callables_in_symbol_table(symbol_table):
        by_location[(_absolute(entry.path), entry.start_line)] = entry
    return by_location
|
|
183
|
+
|
|
184
|
+
def _iter_resolved_rows(
|
|
185
|
+
self, symbol_table: Dict[str, PyModule]
|
|
186
|
+
) -> "Iterator[Tuple[str, str, Any]]":
|
|
187
|
+
"""Yield ``(source_sig, target_sig, row)`` for every CodeQL row.
|
|
188
|
+
|
|
189
|
+
Rows whose caller can't be matched to a ``PyCallable`` in the
|
|
190
|
+
symbol table are skipped. Callee misses fall back to
|
|
191
|
+
``row.callee_qname`` (ghost). Used by both edge construction and
|
|
192
|
+
call-site augmentation so a single CodeQL query feeds both.
|
|
193
|
+
"""
|
|
194
|
+
df = self._query_call_edges()
|
|
195
|
+
if df.empty:
|
|
196
|
+
return
|
|
197
|
+
location_index = self._build_callable_location_index(symbol_table)
|
|
198
|
+
|
|
199
|
+
skipped_unknown_caller = 0
|
|
200
|
+
ghost_callees = 0
|
|
201
|
+
for row in df.itertuples(index=False):
|
|
202
|
+
caller_key = (row.caller_file, int(row.caller_start_line))
|
|
203
|
+
caller = location_index.get(caller_key)
|
|
204
|
+
if caller is None:
|
|
205
|
+
skipped_unknown_caller += 1
|
|
206
|
+
continue
|
|
207
|
+
|
|
208
|
+
callee_key = (row.callee_file, int(row.callee_start_line))
|
|
209
|
+
callee = location_index.get(callee_key)
|
|
210
|
+
if callee is not None:
|
|
211
|
+
target_sig = callee.signature
|
|
212
|
+
else:
|
|
213
|
+
target_sig = row.callee_qname
|
|
214
|
+
ghost_callees += 1
|
|
127
215
|
|
|
128
|
-
|
|
129
|
-
|
|
216
|
+
yield caller.signature, target_sig, row
|
|
217
|
+
|
|
218
|
+
if skipped_unknown_caller:
|
|
219
|
+
logger.debug(
|
|
220
|
+
f"CodeQL: skipped {skipped_unknown_caller} rows whose caller "
|
|
221
|
+
f"was not in Jedi's symbol table."
|
|
222
|
+
)
|
|
223
|
+
if ghost_callees:
|
|
224
|
+
logger.debug(
|
|
225
|
+
f"CodeQL: {ghost_callees} rows resolved to ghost (external) callees."
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
def build_call_graph_edges(
|
|
229
|
+
self, symbol_table: Dict[str, PyModule]
|
|
230
|
+
) -> List[PyCallEdge]:
|
|
231
|
+
"""Run the CodeQL query and turn each row into a ``PyCallEdge``.
|
|
232
|
+
|
|
233
|
+
Edges are coalesced on ``(source, target)`` — ``weight`` is the
|
|
234
|
+
number of distinct call sites in the caller targeting the callee.
|
|
235
|
+
Provenance is always ``["codeql"]``; combine with Jedi-derived
|
|
236
|
+
edges via ``call_graph.merge_edges``.
|
|
237
|
+
"""
|
|
238
|
+
edge_counts: Counter = Counter()
|
|
239
|
+
for source_sig, target_sig, _row in self._iter_resolved_rows(symbol_table):
|
|
240
|
+
edge_counts[(source_sig, target_sig)] += 1
|
|
241
|
+
|
|
242
|
+
return [
|
|
243
|
+
PyCallEdge(
|
|
244
|
+
source=src,
|
|
245
|
+
target=dst,
|
|
246
|
+
weight=count,
|
|
247
|
+
provenance=["codeql"],
|
|
248
|
+
)
|
|
249
|
+
for (src, dst), count in edge_counts.items()
|
|
250
|
+
]
|
|
251
|
+
|
|
252
|
+
def augment_call_sites(self, symbol_table: Dict[str, PyModule]) -> int:
|
|
253
|
+
"""Backfill ``PyCallsite.callee_signature`` using CodeQL resolution.
|
|
254
|
+
|
|
255
|
+
Walks every CodeQL row, locates the matching ``PyCallsite`` inside
|
|
256
|
+
the caller's ``PyCallable.call_sites`` by call-expression line range
|
|
257
|
+
(``start_line``, ``end_line``), and fills in ``callee_signature``
|
|
258
|
+
**only when Jedi left it empty**. Existing Jedi-resolved signatures
|
|
259
|
+
are kept (Jedi sees lexical context CodeQL can't, e.g. closures).
|
|
260
|
+
|
|
261
|
+
Match is by line range — column matching is brittle across the two
|
|
262
|
+
tools' 0- vs 1-based conventions. Ambiguity on a single line
|
|
263
|
+
(e.g. ``a.b().c()``) resolves to the first matching site, which is
|
|
264
|
+
an acceptable approximation given how rarely Jedi misses callees
|
|
265
|
+
on chained call lines.
|
|
130
266
|
|
|
131
267
|
Returns:
|
|
132
|
-
|
|
268
|
+
Number of ``PyCallsite`` entries augmented.
|
|
133
269
|
"""
|
|
270
|
+
location_index = self._build_callable_location_index(symbol_table)
|
|
271
|
+
df = self._query_call_edges()
|
|
272
|
+
if df.empty:
|
|
273
|
+
return 0
|
|
274
|
+
|
|
275
|
+
augmented = 0
|
|
276
|
+
for row in df.itertuples(index=False):
|
|
277
|
+
caller_key = (row.caller_file, int(row.caller_start_line))
|
|
278
|
+
caller = location_index.get(caller_key)
|
|
279
|
+
if caller is None:
|
|
280
|
+
continue
|
|
281
|
+
|
|
282
|
+
callee_key = (row.callee_file, int(row.callee_start_line))
|
|
283
|
+
callee = location_index.get(callee_key)
|
|
284
|
+
resolved_sig = callee.signature if callee is not None else row.callee_qname
|
|
285
|
+
|
|
286
|
+
call_start = int(row.call_start_line)
|
|
287
|
+
call_end = int(row.call_end_line)
|
|
288
|
+
for site in caller.call_sites:
|
|
289
|
+
if site.start_line != call_start or site.end_line != call_end:
|
|
290
|
+
continue
|
|
291
|
+
if not site.callee_signature:
|
|
292
|
+
site.callee_signature = resolved_sig
|
|
293
|
+
augmented += 1
|
|
294
|
+
break
|
|
295
|
+
|
|
296
|
+
if augmented:
|
|
297
|
+
logger.debug(
|
|
298
|
+
f"CodeQL: augmented {augmented} PyCallsite.callee_signature entries."
|
|
299
|
+
)
|
|
300
|
+
return augmented
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import platform
|
|
3
|
+
import stat
|
|
2
4
|
import zipfile
|
|
3
5
|
from pathlib import Path
|
|
4
6
|
|
|
@@ -52,12 +54,38 @@ class CodeQLLoader:
|
|
|
52
54
|
extract_dir = temp_dir / filename.replace(".zip", "")
|
|
53
55
|
extract_dir.mkdir(exist_ok=True)
|
|
54
56
|
|
|
55
|
-
|
|
57
|
+
logger.info(f"Extracting CodeQL CLI to {extract_dir}")
|
|
58
|
+
# zipfile.extractall drops Unix permissions (the executable bit), so
|
|
59
|
+
# we extract entries manually and copy each one's stored mode onto
|
|
60
|
+
# the file system. Without this, the CodeQL launcher script can't
|
|
61
|
+
# be executed and the next subprocess.Popen raises PermissionError.
|
|
56
62
|
with zipfile.ZipFile(archive_path, "r") as zip_ref:
|
|
57
|
-
zip_ref.
|
|
63
|
+
for info in zip_ref.infolist():
|
|
64
|
+
extracted_path = zip_ref.extract(info, extract_dir)
|
|
65
|
+
stored_mode = info.external_attr >> 16
|
|
66
|
+
if stored_mode:
|
|
67
|
+
os.chmod(extracted_path, stored_mode)
|
|
58
68
|
|
|
59
|
-
|
|
60
|
-
|
|
69
|
+
# Archive is no longer needed once extracted.
|
|
70
|
+
try:
|
|
71
|
+
archive_path.unlink()
|
|
72
|
+
except OSError as exc:
|
|
73
|
+
logger.warning(f"Could not remove CodeQL archive {archive_path}: {exc}")
|
|
74
|
+
|
|
75
|
+
# rglob("codeql") returns both the launcher file *and* an internal
|
|
76
|
+
# directory of the same name (CodeQL ships its own runtime under
|
|
77
|
+
# ``codeql/codeql/``); insist on a regular file so we never bind to
|
|
78
|
+
# the directory.
|
|
79
|
+
codeql_bin = next(
|
|
80
|
+
(p for p in extract_dir.rglob("codeql") if p.is_file()),
|
|
81
|
+
None,
|
|
82
|
+
)
|
|
83
|
+
if not codeql_bin:
|
|
61
84
|
raise FileNotFoundError("CodeQL binary not found in extracted contents.")
|
|
62
85
|
|
|
86
|
+
# Belt-and-suspenders: ensure the binary is executable even if the
|
|
87
|
+
# archive entry's mode was zero (some older zip producers omit it).
|
|
88
|
+
st = codeql_bin.stat()
|
|
89
|
+
codeql_bin.chmod(st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
|
|
90
|
+
|
|
63
91
|
return codeql_bin.resolve()
|
|
@@ -40,9 +40,13 @@ class CodeQLQueryRunner:
|
|
|
40
40
|
|
|
41
41
|
Args:
|
|
42
42
|
database_path (str): The path to the CodeQL database.
|
|
43
|
+
codeql_bin (str | Path | None): Absolute path to the CodeQL CLI
|
|
44
|
+
binary. When ``None``, falls back to whatever ``codeql`` is on
|
|
45
|
+
``PATH``.
|
|
43
46
|
|
|
44
47
|
Attributes:
|
|
45
48
|
database_path (Path): The path to the CodeQL database.
|
|
49
|
+
codeql_bin (str): Resolved binary path or the literal ``"codeql"``.
|
|
46
50
|
temp_file_path (Path): The path to the temporary query file.
|
|
47
51
|
csv_output_file (Path): The path to the CSV output file.
|
|
48
52
|
temp_bqrs_file_path (Path): The path to the temporary bqrs file.
|
|
@@ -52,39 +56,46 @@ class CodeQLQueryRunner:
|
|
|
52
56
|
CodeQLQueryExecutionException: If there is an error executing the query.
|
|
53
57
|
"""
|
|
54
58
|
|
|
55
|
-
def __init__(self, database_path: str):
|
|
59
|
+
def __init__(self, database_path: str, codeql_bin=None, codeql_packs_dir=None):
|
|
56
60
|
self.database_path: Path = Path(database_path)
|
|
61
|
+
self.codeql_bin: str = str(codeql_bin) if codeql_bin else "codeql"
|
|
62
|
+
self.codeql_packs_dir = (
|
|
63
|
+
Path(codeql_packs_dir) if codeql_packs_dir is not None else None
|
|
64
|
+
)
|
|
57
65
|
self.temp_file_path: Path = None
|
|
58
66
|
|
|
59
67
|
def __enter__(self):
|
|
60
|
-
"""Context entry that
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
68
|
+
"""Context entry that prepares paths to execute a CodeQL query.
|
|
69
|
+
|
|
70
|
+
The ``.ql`` file is written **inside the prepared qlpack
|
|
71
|
+
directory** (``codeql_packs_dir``) so ``import python`` resolves
|
|
72
|
+
against that pack's installed dependencies — no
|
|
73
|
+
``--additional-packs`` or ``--search-path`` needed. The CSV /
|
|
74
|
+
BQRS output files live in ``tempfile`` because they're transient
|
|
75
|
+
per-query artifacts.
|
|
67
76
|
"""
|
|
68
|
-
|
|
69
|
-
# Create a temporary file to hold the query and store its path
|
|
70
|
-
temp_file = tempfile.NamedTemporaryFile("w", delete=False, suffix=".ql")
|
|
77
|
+
# CSV and BQRS files are transient per-query — fine in /tmp.
|
|
71
78
|
csv_file = tempfile.NamedTemporaryFile("w", delete=False, suffix=".csv")
|
|
72
79
|
bqrs_file = tempfile.NamedTemporaryFile("w", delete=False, suffix=".bqrs")
|
|
73
|
-
self.temp_file_path = Path(temp_file.name)
|
|
74
80
|
self.csv_output_file = Path(csv_file.name)
|
|
75
81
|
self.temp_bqrs_file_path = Path(bqrs_file.name)
|
|
76
|
-
|
|
77
|
-
# Let's close the files, we'll reopen them by path when needed.
|
|
78
|
-
temp_file.close()
|
|
79
|
-
bqrs_file.close()
|
|
80
82
|
csv_file.close()
|
|
83
|
+
bqrs_file.close()
|
|
81
84
|
|
|
82
|
-
#
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
85
|
+
# The .ql file MUST live inside the prepared qlpack so its
|
|
86
|
+
# ``import python`` resolves via that pack's lock file. Writing
|
|
87
|
+
# outside the pack means CodeQL falls back to a default
|
|
88
|
+
# search-path that doesn't include downloaded library packs.
|
|
89
|
+
if self.codeql_packs_dir is None:
|
|
90
|
+
raise RuntimeError(
|
|
91
|
+
"CodeQLQueryRunner requires codeql_packs_dir — the directory "
|
|
92
|
+
"of an installed qlpack that depends on codeql/python-all."
|
|
93
|
+
)
|
|
94
|
+
ql_file = tempfile.NamedTemporaryFile(
|
|
95
|
+
"w", delete=False, suffix=".ql", dir=str(self.codeql_packs_dir)
|
|
96
|
+
)
|
|
97
|
+
self.temp_file_path = Path(ql_file.name)
|
|
98
|
+
ql_file.close()
|
|
88
99
|
|
|
89
100
|
return self
|
|
90
101
|
|
|
@@ -108,32 +119,41 @@ class CodeQLQueryRunner:
|
|
|
108
119
|
# Write the query to the temp file so we can execute it.
|
|
109
120
|
self.temp_file_path.write_text(query_string)
|
|
110
121
|
|
|
111
|
-
#
|
|
122
|
+
# The .ql file sits inside the qlpack directory whose lock file
|
|
123
|
+
# already resolves ``codeql/python-all`` and its transitive
|
|
124
|
+
# dependencies. ``codeql query run`` auto-discovers the enclosing
|
|
125
|
+
# qlpack — no extra flags required.
|
|
112
126
|
codeql_query_cmd = shlex.split(
|
|
113
|
-
f"
|
|
127
|
+
f"{shlex.quote(self.codeql_bin)} query run {self.temp_file_path} "
|
|
128
|
+
f"--database={self.database_path} "
|
|
129
|
+
f"--output={self.temp_bqrs_file_path}",
|
|
114
130
|
posix=False,
|
|
115
131
|
)
|
|
116
132
|
|
|
117
|
-
call = subprocess.Popen(
|
|
133
|
+
call = subprocess.Popen(
|
|
134
|
+
codeql_query_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
135
|
+
)
|
|
118
136
|
_, err = call.communicate()
|
|
119
137
|
if call.returncode != 0:
|
|
120
138
|
raise CodeQLExceptions.CodeQLQueryExecutionException(
|
|
121
|
-
f"Error executing query: {err.
|
|
139
|
+
f"Error executing query: {(err or b'').decode(errors='replace')}"
|
|
122
140
|
)
|
|
123
141
|
|
|
124
142
|
# Convert the bqrs file to a CSV file
|
|
125
143
|
bqrs2csv_command = shlex.split(
|
|
126
|
-
f"
|
|
144
|
+
f"{shlex.quote(self.codeql_bin)} bqrs decode --format=csv --output={self.csv_output_file} {self.temp_bqrs_file_path}",
|
|
127
145
|
posix=False,
|
|
128
146
|
)
|
|
129
147
|
|
|
130
148
|
# Read the CSV file content and cast it to a DataFrame
|
|
131
149
|
|
|
132
|
-
call = subprocess.Popen(
|
|
150
|
+
call = subprocess.Popen(
|
|
151
|
+
bqrs2csv_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
152
|
+
)
|
|
133
153
|
_, err = call.communicate()
|
|
134
154
|
if call.returncode != 0:
|
|
135
155
|
raise CodeQLExceptions.CodeQLQueryExecutionException(
|
|
136
|
-
f"Error
|
|
156
|
+
f"Error decoding bqrs: {(err or b'').decode(errors='replace')}"
|
|
137
157
|
)
|
|
138
158
|
else:
|
|
139
159
|
return pd.read_csv(
|
|
@@ -161,5 +181,5 @@ class CodeQLQueryRunner:
|
|
|
161
181
|
if self.csv_output_file and self.csv_output_file.exists():
|
|
162
182
|
self.csv_output_file.unlink()
|
|
163
183
|
|
|
164
|
-
if self.
|
|
165
|
-
self.
|
|
184
|
+
if self.temp_bqrs_file_path and self.temp_bqrs_file_path.exists():
|
|
185
|
+
self.temp_bqrs_file_path.unlink()
|
|
@@ -4,7 +4,7 @@ import tokenize
|
|
|
4
4
|
from ast import AST, ClassDef
|
|
5
5
|
from io import StringIO
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Dict, List, Optional, Union
|
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
9
|
import jedi
|
|
10
10
|
from jedi.api import Script
|
|
@@ -71,6 +71,32 @@ class SymbolTableBuilder:
|
|
|
71
71
|
pass
|
|
72
72
|
return None
|
|
73
73
|
|
|
74
|
+
@staticmethod
|
|
75
|
+
def _infer_callee(
|
|
76
|
+
script: Script, line: int, column: int
|
|
77
|
+
) -> Tuple[Optional[str], bool]:
|
|
78
|
+
"""Infer ``(qualified_name, is_class)`` at a call expression.
|
|
79
|
+
|
|
80
|
+
When the callee resolves to a class (e.g. ``A()``), the qualified
|
|
81
|
+
name is normalized to ``<class>.__init__`` so it joins to the
|
|
82
|
+
``PyCallable`` entry for the constructor in the symbol table —
|
|
83
|
+
classes themselves are not ``PyCallable``s, so without this
|
|
84
|
+
rewrite every constructor call would surface as a ghost node in
|
|
85
|
+
the call graph.
|
|
86
|
+
"""
|
|
87
|
+
try:
|
|
88
|
+
definitions = script.infer(line=line, column=column)
|
|
89
|
+
if not definitions:
|
|
90
|
+
return None, False
|
|
91
|
+
d = definitions[0]
|
|
92
|
+
is_class = (d.type == "class")
|
|
93
|
+
full = d.full_name
|
|
94
|
+
if is_class and full:
|
|
95
|
+
full = f"{full}.__init__"
|
|
96
|
+
return full, is_class
|
|
97
|
+
except Exception:
|
|
98
|
+
return None, False
|
|
99
|
+
|
|
74
100
|
def build_pymodule_from_file(self, py_file: Path) -> PyModule:
|
|
75
101
|
"""Builds a PyModule from a Python file.
|
|
76
102
|
|
|
@@ -485,6 +511,63 @@ class SymbolTableBuilder:
|
|
|
485
511
|
symbols.append(symbol)
|
|
486
512
|
return symbols
|
|
487
513
|
|
|
514
|
+
@staticmethod
|
|
515
|
+
def _iter_calls_in_scope(fn_node: ast.AST):
|
|
516
|
+
"""Yield ``ast.Call`` nodes belonging to ``fn_node``'s own scope.
|
|
517
|
+
|
|
518
|
+
Naive ``ast.walk`` descends into nested ``FunctionDef`` / ``ClassDef``
|
|
519
|
+
bodies, attributing their calls to the outer function — wrong, since
|
|
520
|
+
those nested definitions have their own ``PyCallable`` entries
|
|
521
|
+
(built recursively by ``_callables``/``_add_class``) and own
|
|
522
|
+
``call_sites`` lists.
|
|
523
|
+
|
|
524
|
+
Decorators, default arguments, return-type annotations, base
|
|
525
|
+
classes and class-level keyword args ARE evaluated in the
|
|
526
|
+
enclosing scope, so calls in those subtrees stay attributed to
|
|
527
|
+
``fn_node``. Bodies of nested defs/classes are skipped. Lambdas,
|
|
528
|
+
comprehensions and inline conditionals don't get their own
|
|
529
|
+
``PyCallable`` so their internals stay attributed to the enclosing
|
|
530
|
+
function.
|
|
531
|
+
"""
|
|
532
|
+
|
|
533
|
+
def walk(node: ast.AST):
|
|
534
|
+
if isinstance(node, ast.Call):
|
|
535
|
+
yield node
|
|
536
|
+
|
|
537
|
+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
538
|
+
# Decorators, defaults, return annotations run in
|
|
539
|
+
# enclosing scope. Body and arg names run in inner scope.
|
|
540
|
+
for dec in node.decorator_list:
|
|
541
|
+
yield from walk(dec)
|
|
542
|
+
for default in node.args.defaults:
|
|
543
|
+
yield from walk(default)
|
|
544
|
+
for default in node.args.kw_defaults:
|
|
545
|
+
if default is not None:
|
|
546
|
+
yield from walk(default)
|
|
547
|
+
if node.returns is not None:
|
|
548
|
+
yield from walk(node.returns)
|
|
549
|
+
return
|
|
550
|
+
|
|
551
|
+
if isinstance(node, ast.ClassDef):
|
|
552
|
+
# Decorators, bases, and keyword args run in enclosing scope.
|
|
553
|
+
# Body runs in class scope.
|
|
554
|
+
for dec in node.decorator_list:
|
|
555
|
+
yield from walk(dec)
|
|
556
|
+
for base in node.bases:
|
|
557
|
+
yield from walk(base)
|
|
558
|
+
for kw in node.keywords:
|
|
559
|
+
yield from walk(kw.value)
|
|
560
|
+
return
|
|
561
|
+
|
|
562
|
+
for child in ast.iter_child_nodes(node):
|
|
563
|
+
yield from walk(child)
|
|
564
|
+
|
|
565
|
+
for stmt in getattr(fn_node, "body", []):
|
|
566
|
+
yield from walk(stmt)
|
|
567
|
+
# Decorators / defaults / returns of fn_node itself are evaluated
|
|
568
|
+
# in the ENCLOSING scope, so they belong to fn_node's parent, not
|
|
569
|
+
# fn_node. Don't yield them here.
|
|
570
|
+
|
|
488
571
|
def _call_sites(self, fn_node: ast.FunctionDef, script: Script) -> List[PyCallsite]:
|
|
489
572
|
"""
|
|
490
573
|
Finds all call sites made from within the function using Jedi for type inference.
|
|
@@ -498,14 +581,14 @@ class SymbolTableBuilder:
|
|
|
498
581
|
"""
|
|
499
582
|
call_sites: List[PyCallsite] = []
|
|
500
583
|
|
|
501
|
-
for node in
|
|
584
|
+
for node in self._iter_calls_in_scope(fn_node):
|
|
502
585
|
if not isinstance(node, ast.Call):
|
|
503
586
|
continue
|
|
504
587
|
|
|
505
588
|
func_expr = node.func
|
|
506
589
|
|
|
507
590
|
method_name = "<unknown>"
|
|
508
|
-
callee_signature = self.
|
|
591
|
+
callee_signature, is_constructor = self._infer_callee(
|
|
509
592
|
script, node.lineno, node.col_offset
|
|
510
593
|
)
|
|
511
594
|
return_type = self._infer_type(script, node.lineno, node.col_offset)
|
|
@@ -535,7 +618,7 @@ class SymbolTableBuilder:
|
|
|
535
618
|
.argument_types(argument_types)
|
|
536
619
|
.return_type(return_type)
|
|
537
620
|
.callee_signature(callee_signature)
|
|
538
|
-
.is_constructor_call(
|
|
621
|
+
.is_constructor_call(is_constructor)
|
|
539
622
|
.start_line(getattr(node, "lineno", -1))
|
|
540
623
|
.start_column(getattr(node, "col_offset", -1))
|
|
541
624
|
.end_line(getattr(node, "end_lineno", -1))
|