apisec-code-bolt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apisec_code_bolt/__init__.py +42 -0
- apisec_code_bolt/__main__.py +11 -0
- apisec_code_bolt/analysis/__init__.py +96 -0
- apisec_code_bolt/analysis/analyzer.py +2309 -0
- apisec_code_bolt/analysis/binding_tracker.py +341 -0
- apisec_code_bolt/analysis/call_graph.py +1197 -0
- apisec_code_bolt/analysis/call_graph_types.py +332 -0
- apisec_code_bolt/analysis/call_resolver.py +988 -0
- apisec_code_bolt/analysis/capability_tagger.py +322 -0
- apisec_code_bolt/analysis/config_scanner.py +197 -0
- apisec_code_bolt/analysis/data_flow.py +1883 -0
- apisec_code_bolt/analysis/dependency_extractor.py +959 -0
- apisec_code_bolt/analysis/flow_analysis.py +1406 -0
- apisec_code_bolt/analysis/hof_catalog.py +61 -0
- apisec_code_bolt/analysis/integration_detector.py +1399 -0
- apisec_code_bolt/analysis/literal_scanner.py +300 -0
- apisec_code_bolt/analysis/path_normalizer.py +55 -0
- apisec_code_bolt/analysis/read_site_detector.py +310 -0
- apisec_code_bolt/analysis/request_patterns.py +162 -0
- apisec_code_bolt/analysis/sensitivity_classifier.py +224 -0
- apisec_code_bolt/analysis/sink_evidence.py +333 -0
- apisec_code_bolt/analysis/url_prefix_resolver.py +338 -0
- apisec_code_bolt/cli/__init__.py +5 -0
- apisec_code_bolt/cli/exit_codes.py +17 -0
- apisec_code_bolt/cli/main.py +1069 -0
- apisec_code_bolt/cloud/__init__.py +1 -0
- apisec_code_bolt/cloud/apisec_client.py +118 -0
- apisec_code_bolt/cloud/client.py +255 -0
- apisec_code_bolt/core/__init__.py +75 -0
- apisec_code_bolt/core/config.py +528 -0
- apisec_code_bolt/core/credentials.py +65 -0
- apisec_code_bolt/core/discovery.py +433 -0
- apisec_code_bolt/core/log_format.py +115 -0
- apisec_code_bolt/core/manifest.py +1009 -0
- apisec_code_bolt/core/repo.py +280 -0
- apisec_code_bolt/core/state.py +59 -0
- apisec_code_bolt/core/telemetry.py +451 -0
- apisec_code_bolt/core/types.py +587 -0
- apisec_code_bolt/fingerprinting/__init__.py +1 -0
- apisec_code_bolt/frameworks/__init__.py +29 -0
- apisec_code_bolt/frameworks/_jwt_common.py +50 -0
- apisec_code_bolt/frameworks/auth_helpers.py +437 -0
- apisec_code_bolt/frameworks/base.py +608 -0
- apisec_code_bolt/frameworks/dotnet/__init__.py +17 -0
- apisec_code_bolt/frameworks/dotnet/_path_helpers.py +43 -0
- apisec_code_bolt/frameworks/dotnet/aspnet_plugin.py +2546 -0
- apisec_code_bolt/frameworks/dotnet/grpc_plugin.py +559 -0
- apisec_code_bolt/frameworks/dotnet/jwt_config_extractor.py +545 -0
- apisec_code_bolt/frameworks/dotnet/legacy_aspnet_plugin.py +732 -0
- apisec_code_bolt/frameworks/dotnet/refit_plugin.py +374 -0
- apisec_code_bolt/frameworks/dotnet/wcf_plugin.py +1239 -0
- apisec_code_bolt/frameworks/java/__init__.py +6 -0
- apisec_code_bolt/frameworks/java/_annotations.py +167 -0
- apisec_code_bolt/frameworks/java/_constraints.py +128 -0
- apisec_code_bolt/frameworks/java/graphql_plugin.py +287 -0
- apisec_code_bolt/frameworks/java/jaxrs_plugin.py +748 -0
- apisec_code_bolt/frameworks/java/jwt_config_extractor.py +361 -0
- apisec_code_bolt/frameworks/java/micronaut_plugin.py +1059 -0
- apisec_code_bolt/frameworks/java/spring_plugin.py +1293 -0
- apisec_code_bolt/frameworks/js/__init__.py +8 -0
- apisec_code_bolt/frameworks/js/express_plugin.py +391 -0
- apisec_code_bolt/frameworks/js/fastify_plugin.py +381 -0
- apisec_code_bolt/frameworks/js/graphql_plugin.py +198 -0
- apisec_code_bolt/frameworks/js/nestjs_plugin.py +423 -0
- apisec_code_bolt/frameworks/python/__init__.py +19 -0
- apisec_code_bolt/frameworks/python/celery_plugin.py +393 -0
- apisec_code_bolt/frameworks/python/click_plugin.py +427 -0
- apisec_code_bolt/frameworks/python/django_plugin.py +867 -0
- apisec_code_bolt/frameworks/python/fastapi/__init__.py +28 -0
- apisec_code_bolt/frameworks/python/fastapi/plugin.py +1390 -0
- apisec_code_bolt/frameworks/python/flask_plugin.py +205 -0
- apisec_code_bolt/frameworks/python/graphql_plugin.py +274 -0
- apisec_code_bolt/frameworks/python/prefect_plugin.py +251 -0
- apisec_code_bolt/frameworks/python/webhook_plugin.py +255 -0
- apisec_code_bolt/parsing/__init__.py +62 -0
- apisec_code_bolt/parsing/base.py +554 -0
- apisec_code_bolt/parsing/csharp/__init__.py +5 -0
- apisec_code_bolt/parsing/csharp/language_services.py +203 -0
- apisec_code_bolt/parsing/csharp/literals.py +72 -0
- apisec_code_bolt/parsing/csharp/parser.py +1158 -0
- apisec_code_bolt/parsing/csharp/type_resolver.py +568 -0
- apisec_code_bolt/parsing/js/__init__.py +5 -0
- apisec_code_bolt/parsing/js/language_services.py +118 -0
- apisec_code_bolt/parsing/js/parser.py +622 -0
- apisec_code_bolt/parsing/jvm/__init__.py +7 -0
- apisec_code_bolt/parsing/jvm/language_services.py +270 -0
- apisec_code_bolt/parsing/jvm/parser.py +774 -0
- apisec_code_bolt/parsing/jvm/type_resolver.py +422 -0
- apisec_code_bolt/parsing/python/__init__.py +150 -0
- apisec_code_bolt/parsing/python/cbv_extractor.py +606 -0
- apisec_code_bolt/parsing/python/constant_resolver.py +500 -0
- apisec_code_bolt/parsing/python/cross_file_resolver.py +1054 -0
- apisec_code_bolt/parsing/python/dynamic_route_detector.py +532 -0
- apisec_code_bolt/parsing/python/expression_utils.py +221 -0
- apisec_code_bolt/parsing/python/extraction_types.py +271 -0
- apisec_code_bolt/parsing/python/language_services.py +487 -0
- apisec_code_bolt/parsing/python/parameter_analyzer.py +789 -0
- apisec_code_bolt/parsing/python/parser.py +719 -0
- apisec_code_bolt/parsing/python/path_resolver.py +576 -0
- apisec_code_bolt/parsing/python/router_registry.py +806 -0
- apisec_code_bolt/parsing/python/type_resolver.py +730 -0
- apisec_code_bolt/parsing/python/visitors.py +1544 -0
- apisec_code_bolt/parsing/services.py +544 -0
- apisec_code_bolt/query/__init__.py +1 -0
- apisec_code_bolt/query/ast_cache.py +182 -0
- apisec_code_bolt/query/executor.py +283 -0
- apisec_code_bolt/query/handlers.py +832 -0
- apisec_code_bolt-0.1.0.dist-info/METADATA +230 -0
- apisec_code_bolt-0.1.0.dist-info/RECORD +111 -0
- apisec_code_bolt-0.1.0.dist-info/WHEEL +4 -0
- apisec_code_bolt-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,1883 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Inter-procedural data flow (taint) tracking.
|
|
3
|
+
|
|
4
|
+
This module implements taint analysis to track how data flows from HTTP inputs
|
|
5
|
+
(sources) through the program to function calls (potential sinks). The probe
|
|
6
|
+
does NOT decide what constitutes a vulnerability - it only tracks data flow
|
|
7
|
+
paths. The cloud service applies sink/source rules to identify issues.
|
|
8
|
+
|
|
9
|
+
Key concepts:
|
|
10
|
+
- **Origin**: Where data enters (HTTP params, body, headers, etc.)
|
|
11
|
+
- **Taint**: A marker indicating data came from an origin
|
|
12
|
+
- **Propagation**: How taint spreads through assignments and calls
|
|
13
|
+
- **Path**: The sequence of function calls data flows through
|
|
14
|
+
- **Sink**: Any function call that receives tainted data (cloud decides danger)
|
|
15
|
+
|
|
16
|
+
Algorithm:
|
|
17
|
+
1. Identify all data origins from route handlers (entry points)
|
|
18
|
+
2. Mark function parameters that receive origin data as tainted
|
|
19
|
+
3. Propagate taint through:
|
|
20
|
+
- Assignments: x = y spreads taint from y to x
|
|
21
|
+
- Returns: return tainted_var taints the call result
|
|
22
|
+
- Arguments: func(tainted) taints the parameter in callee
|
|
23
|
+
4. Record the full path when tainted data reaches a function call
|
|
24
|
+
5. Track transformations (string operations, encoding, etc.)
|
|
25
|
+
|
|
26
|
+
Depth limit: We stop propagation at a configurable max depth (default 10)
|
|
27
|
+
to prevent infinite recursion and manage memory.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import logging
|
|
33
|
+
from collections import defaultdict
|
|
34
|
+
from dataclasses import dataclass, field
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import TYPE_CHECKING, Any
|
|
37
|
+
|
|
38
|
+
from ..core.manifest import stable_id
|
|
39
|
+
from ..core.types import OriginType
|
|
40
|
+
|
|
41
|
+
if TYPE_CHECKING:
|
|
42
|
+
from ..parsing.base import ParsedCallSite, ParsedFile, ParsedFunction
|
|
43
|
+
from .call_graph import CallGraph, CallGraphEdge
|
|
44
|
+
from .flow_analysis import FlowSensitiveBindings
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# =============================================================================
|
|
51
|
+
# Core Data Structures
|
|
52
|
+
# =============================================================================
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class TaintedValue:
    """
    Represents a tainted value being tracked.

    A taint propagates from an origin and carries information about
    where the data came from and how it was transformed.

    The fields fall into three groups: where the data entered (``origin_*``,
    ``entry_point_id``), where the tainted data currently lives
    (``current_*``), and the history gathered so far (``depth``, ``path``,
    ``transformations``). Only the last group and ``metadata`` have defaults,
    so every other field must be supplied when constructing an instance.
    """

    # Unique ID for this taint instance
    taint_id: str

    # Origin information
    origin_type: OriginType
    origin_name: str  # Parameter name, header name, etc.
    origin_location: tuple[Path, int]  # (file, line)
    entry_point_id: str | None  # Reference to entry point (route)

    # Current state
    current_variable: str  # Variable name holding tainted data
    current_function: str  # Function where taint currently is
    current_file: Path

    # Path tracking
    # Number of inter-procedural hops taken so far (0 for a fresh origin).
    depth: int = 0
    # default_factory gives every instance its own list.
    path: list[FlowStep] = field(default_factory=list)

    # Transformations applied
    transformations: list[Transformation] = field(default_factory=list)

    # Metadata
    # Free-form extra data; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class FlowStep:
    """
    A step in the data flow path.

    One step describes a single call through which tainted data was handed
    from ``caller`` to ``callee``, together with the argument that carried it
    and the source location of the call.
    """

    depth: int
    caller: str  # Calling function
    callee: str  # Called function
    argument_index: int | None  # Which argument carried the taint
    argument_name: str | None  # Parameter name in callee
    location: tuple[Path, int]  # (file, line)

    # Mapping: how caller's variable maps to callee's parameter
    # (default_factory gives every step its own dict)
    variable_mapping: dict[str, str] = field(default_factory=dict)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class TransformationType:
    """
    Well-known transformation type constants (cloud-interpretable).

    This is a plain namespace of string constants, not an ``Enum``: the
    values are ordinary ``str`` objects and are what ``Transformation.type``
    stores. The trailing comment on each constant shows the source pattern
    it stands for.
    """

    FUNCTION_CALL = "function_call"  # x = func(y)
    METHOD_CALL = "method_call"  # x = y.method()
    STRING_FORMAT = "string_format"  # x = f"...{y}..."
    BINARY_OP = "binary_op"  # x = y + z
    TYPE_CONSTRUCTOR = "type_constructor"  # x = int(y), x = str(y)
    ATTRIBUTE_ACCESS = "attribute_access"  # x = y.attr
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class Transformation:
    """
    A transformation applied to tainted data.

    Records one operation (a call, a string interpolation, ...) that the
    tainted value passed through, where it happened, and optionally the
    structural evidence collected for the call site.
    """

    type: str  # See TransformationType constants
    description: str | None
    location: tuple[Path, int]  # (file, line)
    function: str | None  # Function / method that performed the transform

    # The depth in the inter-procedural path where this transformation occurs.
    # -1 means intra-procedural (within the same function as the source).
    # NOTE(review): -1 is only the default; callers that pass depth explicitly
    # override it - confirm consumers treat both forms consistently.
    depth: int = -1

    # Structural evidence about the transformation call site (co-arguments, kwargs).
    # Typed as Any; None when no evidence was collected.
    call_evidence: Any = None  # CallSiteEvidence from sink_evidence module
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass
class DataFlow:
    """
    A complete data flow from origin to a function call.

    This is the output of taint analysis - one flow per origin-to-call path.

    The record pairs the origin of the data (``origin_*``, ``entry_point_id``)
    with the call that received it (``sink_*``) and the route taken between
    them (``path``, ``depth``, ``transformations``). Whether the receiving
    call is actually dangerous is not decided here.
    """

    id: str  # Unique ID (flow-001, etc.)

    # Origin (source)
    origin_type: OriginType
    origin_name: str
    origin_location: tuple[Path, int]  # (file, line)
    entry_point_id: str | None

    # Sink (destination function call)
    sink_function: str  # Qualified name of called function
    sink_argument_index: int | None
    sink_argument_name: str | None
    sink_location: tuple[Path, int]  # (file, line)
    sink_call_id: str | None  # Reference to call in call graph

    # Path through code
    path: list[FlowStep]
    depth: int
    truncated: bool = False  # True if cut off at max depth

    # Transformations
    # (default_factory gives every flow its own list)
    transformations: list[Transformation] = field(default_factory=list)

    # Sink evidence (structural facts about the call site)
    # Typed as Any; None when no evidence was collected.
    sink_evidence: Any = None  # CallSiteEvidence from sink_evidence module

    # Context at sink
    sink_context: dict[str, Any] = field(default_factory=dict)

    # Confidence
    confidence: str = "HIGH"  # HIGH, MEDIUM, LOW
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# =============================================================================
|
|
174
|
+
# Taint Propagator
|
|
175
|
+
# =============================================================================
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class TaintPropagator:
|
|
179
|
+
"""
|
|
180
|
+
Propagates taint through the program using the call graph.
|
|
181
|
+
|
|
182
|
+
This is the core engine that tracks how data flows through function
|
|
183
|
+
calls. It uses the call graph for inter-procedural tracking and
|
|
184
|
+
flow-sensitive bindings for intra-procedural tracking.
|
|
185
|
+
|
|
186
|
+
Uses actual call-site argument analysis when parsed_files are provided
|
|
187
|
+
for accurate taint propagation (which variable flows to which parameter).
|
|
188
|
+
"""
|
|
189
|
+
|
|
190
|
+
def __init__(
|
|
191
|
+
self,
|
|
192
|
+
call_graph: CallGraph,
|
|
193
|
+
flow_bindings: FlowSensitiveBindings | None = None,
|
|
194
|
+
max_depth: int = 10,
|
|
195
|
+
parsed_files: list[ParsedFile] | None = None,
|
|
196
|
+
):
|
|
197
|
+
self._call_graph = call_graph
|
|
198
|
+
self._flow_bindings = flow_bindings
|
|
199
|
+
self._max_depth = max_depth
|
|
200
|
+
self._parsed_files = {pf.path: pf for pf in (parsed_files or []) if pf.success}
|
|
201
|
+
|
|
202
|
+
# Taint state: function -> variable -> set of taints
|
|
203
|
+
self._taints: dict[str, dict[str, set[str]]] = defaultdict(lambda: defaultdict(set))
|
|
204
|
+
|
|
205
|
+
# Active taints being propagated
|
|
206
|
+
self._active_taints: dict[str, TaintedValue] = {}
|
|
207
|
+
|
|
208
|
+
# Completed flows
|
|
209
|
+
self._flows: list[DataFlow] = []
|
|
210
|
+
|
|
211
|
+
# Next IDs (taint IDs are internal-only, stay sequential)
|
|
212
|
+
self._next_taint_id = 0
|
|
213
|
+
|
|
214
|
+
# Visited states (to prevent infinite loops)
|
|
215
|
+
self._visited: set[tuple[str, str, int]] = set()
|
|
216
|
+
|
|
217
|
+
# Statistics
|
|
218
|
+
self._stats = {
|
|
219
|
+
"taints_created": 0,
|
|
220
|
+
"flows_found": 0,
|
|
221
|
+
"max_depth_reached": 0,
|
|
222
|
+
"cycles_avoided": 0,
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
def add_origin(
    self,
    origin_type: OriginType,
    origin_name: str,
    origin_location: tuple[Path, int],
    entry_point_id: str | None,
    function_qname: str,
    parameter_name: str,
) -> str:
    """
    Register a data origin and create the taint that represents it.

    The new taint starts at depth 0 with an empty path, sits in
    ``parameter_name`` of ``function_qname``, and uses the file part of
    ``origin_location`` as its current file.

    Args:
        origin_type: Type of origin (HTTP_PATH_PARAM, etc.)
        origin_name: Name of the origin (parameter name, header name, etc.)
        origin_location: ``(file, line)`` of the origin in code
        entry_point_id: Reference to the entry point (route)
        function_qname: Function where the origin enters
        parameter_name: Parameter that receives the origin data

    Returns:
        The ID of the new taint, of the form ``taint-NNNN``.
    """
    # Reserve the next sequence number before building the record.
    sequence = self._next_taint_id
    self._next_taint_id = sequence + 1
    new_id = f"taint-{sequence:04d}"

    record = TaintedValue(
        taint_id=new_id,
        origin_type=origin_type,
        origin_name=origin_name,
        origin_location=origin_location,
        entry_point_id=entry_point_id,
        current_variable=parameter_name,
        current_function=function_qname,
        current_file=origin_location[0],
        depth=0,
        path=[],
    )

    # Make the taint findable by ID and by (function, variable).
    self._active_taints[new_id] = record
    self._taints[function_qname][parameter_name].add(new_id)
    self._stats["taints_created"] += 1

    return new_id
|
|
271
|
+
|
|
272
|
+
def propagate_all(self) -> list[DataFlow]:
|
|
273
|
+
"""
|
|
274
|
+
Propagate all taints through the call graph.
|
|
275
|
+
|
|
276
|
+
This is the main entry point for taint analysis. It propagates
|
|
277
|
+
all registered origins until they either:
|
|
278
|
+
- Reach max depth
|
|
279
|
+
- Hit a cycle
|
|
280
|
+
- Reach a leaf function (no more calls)
|
|
281
|
+
|
|
282
|
+
Returns:
|
|
283
|
+
List of discovered data flows
|
|
284
|
+
"""
|
|
285
|
+
# Process each function in topological order (if possible)
|
|
286
|
+
# For now, process all functions that have taints
|
|
287
|
+
worklist = list(self._active_taints.keys())
|
|
288
|
+
|
|
289
|
+
while worklist:
|
|
290
|
+
taint_id = worklist.pop(0)
|
|
291
|
+
|
|
292
|
+
if taint_id not in self._active_taints:
|
|
293
|
+
continue
|
|
294
|
+
|
|
295
|
+
taint = self._active_taints[taint_id]
|
|
296
|
+
|
|
297
|
+
# Check depth limit
|
|
298
|
+
if taint.depth >= self._max_depth:
|
|
299
|
+
self._stats["max_depth_reached"] += 1
|
|
300
|
+
continue
|
|
301
|
+
|
|
302
|
+
# Check for cycles
|
|
303
|
+
state = (taint.current_function, taint.current_variable, taint.depth)
|
|
304
|
+
if state in self._visited:
|
|
305
|
+
self._stats["cycles_avoided"] += 1
|
|
306
|
+
continue
|
|
307
|
+
self._visited.add(state)
|
|
308
|
+
|
|
309
|
+
# Propagate through calls from this function
|
|
310
|
+
new_taints = self._propagate_function(taint)
|
|
311
|
+
worklist.extend(new_taints)
|
|
312
|
+
|
|
313
|
+
return self._flows
|
|
314
|
+
|
|
315
|
+
def _propagate_function(self, taint: TaintedValue) -> list[str]:
    """
    Propagate taint through calls made by a function.

    For every call edge leaving ``taint.current_function`` that receives the
    tainted data, this method:

    1. records a flow (appended to ``self._flows``) carrying the
       transformations gathered so far plus those found at this call;
    2. creates a taint inside the callee when propagation into it is allowed;
    3. creates a taint for the assignment target in the caller when the
       callee returns the tainted parameter;
    4. creates a taint inside the callback when the callee is a known
       higher-order function and the tainted argument is its ``"iterable"``.

    Returns list of new taint IDs created.
    """
    new_taint_ids: list[str] = []

    # Get all calls from this function
    callees = self._call_graph.get_callees(taint.current_function)

    for callee_qname in callees:
        # Get the edges (call sites) for this caller-callee pair
        # Filter edges from caller to this specific callee
        edges = [
            e
            for e in self._call_graph.get_edges_from(taint.current_function)
            if e.callee == callee_qname
        ]

        for edge in edges:
            # Check if the tainted variable is passed to this call
            # (a falsy result means it is not; otherwise a pair is returned)
            taint_passes = self._check_taint_passes(taint, edge, callee_qname)

            if taint_passes:
                arg_index, arg_name = taint_passes

                # Extract transformations along the derivation chain
                # from the tainted variable to the actual argument.
                # When arg_name is the "iterable" marker, the taint's own
                # variable is used, which makes the chain lookup return [].
                chain_transforms = self._extract_transformations_in_chain(
                    taint,
                    arg_name if arg_name != "iterable" else taint.current_variable,
                    edge.line,
                )
                inline_transforms = self._extract_arg_transformations(taint, edge, callee_qname)
                # list(...) copies, so the taint's own list is never mutated.
                all_transforms = (
                    list(taint.transformations) + chain_transforms + inline_transforms
                )

                # Record this as a flow to a function call (potential sink)
                flow = self._create_flow(
                    taint,
                    edge,
                    callee_qname,
                    arg_index,
                    arg_name,
                    extra_transformations=all_transforms,
                )
                self._flows.append(flow)
                self._stats["flows_found"] += 1

                # Continue propagation into the callee, carrying
                # accumulated transformations forward.
                if self._should_propagate_into(callee_qname):
                    new_taint = self._create_propagated_taint(
                        taint,
                        edge,
                        callee_qname,
                        arg_name,
                        accumulated_transforms=all_transforms,
                    )
                    if new_taint:
                        new_taint_ids.append(new_taint.taint_id)

                # Return value propagation: if callee returns the tainted param,
                # the call result (if assigned) is tainted in the caller
                if self._flow_bindings and self._flow_bindings.function_returns_parameter(
                    callee_qname, arg_name
                ):
                    lhs = self._get_assignment_target_for_call(
                        taint.current_function,
                        edge.file_path,
                        edge.line,
                        callee_qname,
                    )
                    # Only create the taint while one more level of depth is available.
                    if lhs and taint.depth < self._max_depth - 1:
                        return_taint = self._create_return_propagated_taint(
                            taint,
                            edge,
                            lhs,
                            accumulated_transforms=all_transforms,
                        )
                        if return_taint:
                            new_taint_ids.append(return_taint.taint_id)

                # HOF: propagate taint into callback when data arg is tainted
                from .hof_catalog import is_known_hof

                if is_known_hof(callee_qname) and arg_name == "iterable":
                    callback_qname = self._resolve_hof_callback(
                        taint.current_function,
                        edge.file_path,
                        edge.line,
                        callee_qname,
                    )
                    if (
                        callback_qname
                        and self._should_propagate_into(callback_qname)
                        and taint.depth < self._max_depth - 1
                    ):
                        callback_node = self._call_graph.get_node(callback_qname)
                        # Taint the callback's first parameter; fall back to
                        # the placeholder "x" when the node or its parameter
                        # list is unavailable.
                        first_param = (
                            callback_node.parameters[0]
                            if callback_node and callback_node.parameters
                            else "x"
                        )
                        # NOTE(review): unlike the propagation calls above,
                        # all_transforms is not forwarded here - confirm
                        # whether the callback taint should carry it.
                        cb_taint = self._create_propagated_taint(
                            taint, edge, callback_qname, first_param
                        )
                        if cb_taint:
                            new_taint_ids.append(cb_taint.taint_id)

    return new_taint_ids
|
|
428
|
+
|
|
429
|
+
def _get_assignment_target_for_call(
|
|
430
|
+
self,
|
|
431
|
+
caller: str,
|
|
432
|
+
file_path: Path,
|
|
433
|
+
line: int,
|
|
434
|
+
callee: str,
|
|
435
|
+
) -> str | None:
|
|
436
|
+
"""
|
|
437
|
+
Find if the call result is assigned: x = callee(...).
|
|
438
|
+
|
|
439
|
+
Returns the target variable name if found, None otherwise.
|
|
440
|
+
Used for return value taint propagation.
|
|
441
|
+
"""
|
|
442
|
+
parsed = self._parsed_files.get(file_path)
|
|
443
|
+
if not parsed or not hasattr(parsed, "assignments"):
|
|
444
|
+
return None
|
|
445
|
+
for assign in parsed.assignments:
|
|
446
|
+
if assign.location.line != line:
|
|
447
|
+
continue
|
|
448
|
+
if assign.source_type != "call" or not assign.source_call:
|
|
449
|
+
continue
|
|
450
|
+
assign_func = assign.in_function or ""
|
|
451
|
+
if assign_func != caller and not (caller and caller.endswith(f".{assign_func}")):
|
|
452
|
+
continue
|
|
453
|
+
# Match callee: source_call might be "get_id", callee might be "main.get_id"
|
|
454
|
+
sc = assign.source_call or ""
|
|
455
|
+
if (
|
|
456
|
+
sc == callee
|
|
457
|
+
or callee.endswith(f".{sc}")
|
|
458
|
+
or sc.endswith(f".{callee.split('.')[-1]}")
|
|
459
|
+
):
|
|
460
|
+
return assign.target
|
|
461
|
+
return None
|
|
462
|
+
|
|
463
|
+
def _extract_transformations_in_chain(
|
|
464
|
+
self,
|
|
465
|
+
taint: TaintedValue,
|
|
466
|
+
arg_variable: str,
|
|
467
|
+
call_line: int,
|
|
468
|
+
) -> list[Transformation]:
|
|
469
|
+
"""
|
|
470
|
+
Walk the derivation chain from the taint's original variable to the
|
|
471
|
+
argument variable and record every transformation encountered.
|
|
472
|
+
|
|
473
|
+
For example:
|
|
474
|
+
user_id (tainted) → clean = int(user_id) → safe = html.escape(clean)
|
|
475
|
+
→ foo(safe)
|
|
476
|
+
|
|
477
|
+
Returns:
|
|
478
|
+
[Transformation(type=function_call, function="int", ...),
|
|
479
|
+
Transformation(type=function_call, function="html.escape", ...)]
|
|
480
|
+
"""
|
|
481
|
+
if not self._flow_bindings:
|
|
482
|
+
return []
|
|
483
|
+
if arg_variable == taint.current_variable:
|
|
484
|
+
return []
|
|
485
|
+
|
|
486
|
+
chain = self._flow_bindings.get_derivation_chain(
|
|
487
|
+
taint.current_function,
|
|
488
|
+
arg_variable,
|
|
489
|
+
taint.current_variable,
|
|
490
|
+
call_line,
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
if not chain:
|
|
494
|
+
return []
|
|
495
|
+
|
|
496
|
+
transforms: list[Transformation] = []
|
|
497
|
+
for defn in chain:
|
|
498
|
+
if defn.assigned_from_call:
|
|
499
|
+
if defn.is_method_call:
|
|
500
|
+
ttype = TransformationType.METHOD_CALL
|
|
501
|
+
elif defn.is_string_interpolation:
|
|
502
|
+
ttype = TransformationType.STRING_FORMAT
|
|
503
|
+
elif defn.assigned_from_call in (
|
|
504
|
+
"int",
|
|
505
|
+
"str",
|
|
506
|
+
"float",
|
|
507
|
+
"bool",
|
|
508
|
+
"bytes",
|
|
509
|
+
"complex",
|
|
510
|
+
):
|
|
511
|
+
ttype = TransformationType.TYPE_CONSTRUCTOR
|
|
512
|
+
else:
|
|
513
|
+
ttype = TransformationType.FUNCTION_CALL
|
|
514
|
+
|
|
515
|
+
# Try to find the call site for this transformation to attach evidence
|
|
516
|
+
call_ev = self._get_transformation_call_evidence(
|
|
517
|
+
taint.current_function,
|
|
518
|
+
taint.current_file,
|
|
519
|
+
defn.line,
|
|
520
|
+
defn.variable,
|
|
521
|
+
)
|
|
522
|
+
|
|
523
|
+
transforms.append(
|
|
524
|
+
Transformation(
|
|
525
|
+
type=ttype,
|
|
526
|
+
description=None,
|
|
527
|
+
location=(taint.current_file, defn.line),
|
|
528
|
+
function=defn.assigned_from_call,
|
|
529
|
+
depth=taint.depth,
|
|
530
|
+
call_evidence=call_ev,
|
|
531
|
+
)
|
|
532
|
+
)
|
|
533
|
+
elif defn.is_string_interpolation:
|
|
534
|
+
transforms.append(
|
|
535
|
+
Transformation(
|
|
536
|
+
type=TransformationType.STRING_FORMAT,
|
|
537
|
+
description=None,
|
|
538
|
+
location=(taint.current_file, defn.line),
|
|
539
|
+
function=None,
|
|
540
|
+
depth=taint.depth,
|
|
541
|
+
)
|
|
542
|
+
)
|
|
543
|
+
|
|
544
|
+
return transforms
|
|
545
|
+
|
|
546
|
+
def _get_transformation_call_evidence(
    self,
    function_qname: str,
    file_path: Path,
    line: int,
    tainted_var: str | None,
) -> Any:
    """
    Build call-site evidence for a transformation assignment.

    Looks up the call site at ``(function_qname, file_path, line)`` and hands
    it to ``build_evidence_for_call_site``. When ``tainted_var`` is given,
    the first argument that either is that variable or lists it among its
    source variables is reported as the tainted argument; otherwise the
    tainted index and name are passed as None.

    Returns None when no call site is found.
    """
    from .sink_evidence import build_evidence_for_call_site

    site = self._get_call_site(function_qname, file_path, line)
    if site is None:
        return None

    # For a transformation like `clean = int(uid)`, the call site is
    # `int(uid)`; find which of its arguments carries the tainted variable.
    position: int | None = None
    keyword: str | None = None
    if tainted_var:
        for argument in site.arguments:
            is_the_variable = argument.is_variable and argument.variable_name == tainted_var
            if is_the_variable or tainted_var in argument.source_variables:
                position, keyword = argument.position, argument.name
                break

    return build_evidence_for_call_site(
        site,
        tainted_arg_index=position,
        tainted_arg_name=keyword,
    )
|
|
579
|
+
|
|
580
|
+
def _extract_arg_transformations(
|
|
581
|
+
self,
|
|
582
|
+
taint: TaintedValue,
|
|
583
|
+
edge: CallGraphEdge,
|
|
584
|
+
callee_qname: str,
|
|
585
|
+
) -> list[Transformation]:
|
|
586
|
+
"""
|
|
587
|
+
Check whether the call argument itself is a transformation (inline).
|
|
588
|
+
|
|
589
|
+
Handles the pattern: foo(int(user_id)) where int() wraps the tainted
|
|
590
|
+
variable directly in the call argument.
|
|
591
|
+
"""
|
|
592
|
+
call_site = self._get_call_site(taint.current_function, edge.file_path, edge.line)
|
|
593
|
+
if not call_site or not call_site.arguments:
|
|
594
|
+
return []
|
|
595
|
+
|
|
596
|
+
transforms: list[Transformation] = []
|
|
597
|
+
for arg in call_site.arguments:
|
|
598
|
+
if arg.is_call_result and arg.called_function:
|
|
599
|
+
# The argument is the result of a call — check if the
|
|
600
|
+
# called function wraps the tainted variable.
|
|
601
|
+
# E.g. foo(int(user_id)) — arg is int(user_id).
|
|
602
|
+
if arg.expression_text and taint.current_variable in (arg.expression_text):
|
|
603
|
+
ttype = (
|
|
604
|
+
TransformationType.TYPE_CONSTRUCTOR
|
|
605
|
+
if (
|
|
606
|
+
arg.called_function
|
|
607
|
+
in (
|
|
608
|
+
"int",
|
|
609
|
+
"str",
|
|
610
|
+
"float",
|
|
611
|
+
"bool",
|
|
612
|
+
"bytes",
|
|
613
|
+
)
|
|
614
|
+
)
|
|
615
|
+
else TransformationType.FUNCTION_CALL
|
|
616
|
+
)
|
|
617
|
+
transforms.append(
|
|
618
|
+
Transformation(
|
|
619
|
+
type=ttype,
|
|
620
|
+
description=None,
|
|
621
|
+
location=(edge.file_path, edge.line),
|
|
622
|
+
function=arg.called_function,
|
|
623
|
+
depth=taint.depth,
|
|
624
|
+
)
|
|
625
|
+
)
|
|
626
|
+
return transforms
|
|
627
|
+
|
|
628
|
+
def _resolve_hof_callback(
    self,
    caller: str,
    file_path: Path,
    line: int,
    hof_callee: str,
) -> str | None:
    """
    Resolve the callback argument of a HOF call to a function in the call graph.
    E.g. map(sanitize, data) -> resolve "sanitize" to "main.sanitize" if in graph.

    The callback position comes from the HOF catalog. Only a callback passed
    as a plain variable name is resolved. The name is tried, in order, as a
    graph node as written, prefixed with the caller's enclosing scope, and
    finally as the first known symbol equal to it or ending in ``.name``.

    Returns:
        The qualified name of the callback, or None when the HOF has no
        known callback position, the call site is not found, the callback
        is not a plain variable, or nothing in the graph matches.
    """
    from .hof_catalog import get_hof_callback_position

    callback_pos = get_hof_callback_position(hof_callee)
    if callback_pos is None:
        return None

    # Several calls can share a line, e.g. list(map(sanitize, data)). Pass
    # the HOF as a hint so the arguments inspected are map's, not list's;
    # without a match the lookup falls back to the first call on the line.
    call_site = self._get_call_site(caller, file_path, line, callee_hint=hof_callee)
    if not call_site or not call_site.arguments:
        return None

    for arg in call_site.arguments:
        if arg.position != callback_pos:
            continue
        name = arg.variable_name if (arg.is_variable and arg.variable_name) else None
        if not name:
            continue
        # Try as qualified name
        if self._call_graph.get_node(name):
            return name
        # Try with caller's module prefix
        if "." in caller:
            prefix = caller.rsplit(".", 1)[0]
            candidate = f"{prefix}.{name}"
            if self._call_graph.get_node(candidate):
                return candidate
        # Try any symbol that ends with .name (first match wins)
        for sym in self._call_graph._symbols:
            if sym == name or sym.endswith(f".{name}"):
                return sym
        # A named callback was found but is not in the graph; stop here.
        return None
    return None
|
|
668
|
+
|
|
669
|
+
def _get_call_site(
|
|
670
|
+
self,
|
|
671
|
+
caller: str,
|
|
672
|
+
file_path: Path,
|
|
673
|
+
line: int,
|
|
674
|
+
callee_hint: str | None = None,
|
|
675
|
+
) -> ParsedCallSite | None:
|
|
676
|
+
"""
|
|
677
|
+
Look up the ParsedCallSite for a call at (caller, file_path, line).
|
|
678
|
+
|
|
679
|
+
When multiple calls share a line (e.g. ``Template(content).render()``),
|
|
680
|
+
*callee_hint* disambiguates: it prefers the call site whose callee_name
|
|
681
|
+
matches the hint.
|
|
682
|
+
"""
|
|
683
|
+
parsed = self._parsed_files.get(file_path)
|
|
684
|
+
if not parsed or not hasattr(parsed, "call_sites"):
|
|
685
|
+
return None
|
|
686
|
+
|
|
687
|
+
candidates: list[ParsedCallSite] = []
|
|
688
|
+
for call in parsed.call_sites:
|
|
689
|
+
if call.location.line != line:
|
|
690
|
+
continue
|
|
691
|
+
caller_qn = call.caller_function
|
|
692
|
+
caller_str = (
|
|
693
|
+
caller_qn.full
|
|
694
|
+
if caller_qn and hasattr(caller_qn, "full")
|
|
695
|
+
else str(caller_qn)
|
|
696
|
+
if caller_qn
|
|
697
|
+
else ""
|
|
698
|
+
)
|
|
699
|
+
if (
|
|
700
|
+
caller_str == caller
|
|
701
|
+
or caller_str
|
|
702
|
+
and caller.endswith(f".{caller_str}")
|
|
703
|
+
or caller_str
|
|
704
|
+
and caller_str.endswith(f".{caller.split('.')[-1]}")
|
|
705
|
+
):
|
|
706
|
+
candidates.append(call)
|
|
707
|
+
|
|
708
|
+
if not candidates:
|
|
709
|
+
return None
|
|
710
|
+
if len(candidates) == 1 or not callee_hint:
|
|
711
|
+
return candidates[0]
|
|
712
|
+
|
|
713
|
+
callee_leaf = callee_hint.rsplit(".", 1)[-1]
|
|
714
|
+
for c in candidates:
|
|
715
|
+
cname = c.callee_name or ""
|
|
716
|
+
if cname in (callee_hint, callee_leaf) or cname.endswith(f".{callee_leaf}"):
|
|
717
|
+
return c
|
|
718
|
+
return candidates[0]
|
|
719
|
+
|
|
720
|
+
def _check_taint_passes(
|
|
721
|
+
self,
|
|
722
|
+
taint: TaintedValue,
|
|
723
|
+
edge: CallGraphEdge,
|
|
724
|
+
callee_qname: str,
|
|
725
|
+
) -> tuple[int, str] | None:
|
|
726
|
+
"""
|
|
727
|
+
Check if tainted variable is passed to this call.
|
|
728
|
+
|
|
729
|
+
Uses actual call-site argument analysis when available for accuracy.
|
|
730
|
+
Falls back to heuristics when call site cannot be resolved.
|
|
731
|
+
|
|
732
|
+
Returns (argument_index, parameter_name) if taint passes, None otherwise.
|
|
733
|
+
"""
|
|
734
|
+
tainted_var = taint.current_variable
|
|
735
|
+
callee_node = self._call_graph.get_node(callee_qname)
|
|
736
|
+
callee_params = callee_node.parameters if callee_node else []
|
|
737
|
+
|
|
738
|
+
call_site = self._get_call_site(
|
|
739
|
+
taint.current_function,
|
|
740
|
+
edge.file_path,
|
|
741
|
+
edge.line,
|
|
742
|
+
callee_hint=callee_qname,
|
|
743
|
+
)
|
|
744
|
+
|
|
745
|
+
if call_site and call_site.arguments:
|
|
746
|
+
# Precise: check which argument passes the tainted variable
|
|
747
|
+
# Use value-flow (variable_derives_from) for renaming: uid = user_id;
|
|
748
|
+
# foo(uid) should propagate taint
|
|
749
|
+
# Handle *args and **kwargs: tainted *var or **var spreads to callee
|
|
750
|
+
for arg in call_site.arguments:
|
|
751
|
+
arg_var = None
|
|
752
|
+
if arg.is_variable and arg.variable_name or arg.is_spread or arg.is_keyword_spread:
|
|
753
|
+
arg_var = arg.variable_name
|
|
754
|
+
if not arg_var:
|
|
755
|
+
continue
|
|
756
|
+
# Direct match or value-flow (variable renaming): does arg_var
|
|
757
|
+
# derive from tainted_var? e.g. uid = user_id; foo(uid)
|
|
758
|
+
if arg_var != tainted_var:
|
|
759
|
+
if self._flow_bindings:
|
|
760
|
+
if not self._flow_bindings.variable_derives_from(
|
|
761
|
+
taint.current_function,
|
|
762
|
+
arg_var,
|
|
763
|
+
tainted_var,
|
|
764
|
+
edge.line,
|
|
765
|
+
):
|
|
766
|
+
continue
|
|
767
|
+
else:
|
|
768
|
+
continue
|
|
769
|
+
# This argument passes the tainted variable (or derives from it)
|
|
770
|
+
param_index = arg.position
|
|
771
|
+
param_name = None
|
|
772
|
+
if arg.name and callee_params:
|
|
773
|
+
# Keyword argument: find param by name
|
|
774
|
+
for i, p in enumerate(callee_params):
|
|
775
|
+
if p == arg.name:
|
|
776
|
+
param_index = i
|
|
777
|
+
param_name = p
|
|
778
|
+
break
|
|
779
|
+
elif param_index is not None and param_index < len(callee_params):
|
|
780
|
+
param_name = callee_params[param_index]
|
|
781
|
+
else:
|
|
782
|
+
param_name = tainted_var
|
|
783
|
+
return (param_index if param_index is not None else 0, param_name or tainted_var)
|
|
784
|
+
# Precise analysis had call-site arguments but none carried taint;
|
|
785
|
+
# trust the verdict and skip heuristic fallback.
|
|
786
|
+
return None
|
|
787
|
+
|
|
788
|
+
# Fallback: heuristic-based matching (only when call-site is unresolved)
|
|
789
|
+
if callee_params:
|
|
790
|
+
for i, param in enumerate(callee_params):
|
|
791
|
+
if param == tainted_var or tainted_var.endswith(f".{param}"):
|
|
792
|
+
return (i, param)
|
|
793
|
+
if param in ("data", "value", "input", "content", "payload"):
|
|
794
|
+
return (i, param)
|
|
795
|
+
|
|
796
|
+
if self._is_likely_data_variable(tainted_var):
|
|
797
|
+
return (0, callee_params[0] if callee_params else tainted_var)
|
|
798
|
+
|
|
799
|
+
# HOF: map(f, data), filter(f, data) - taint in data arg flows to callback
|
|
800
|
+
from .hof_catalog import get_hof_data_position, is_known_hof
|
|
801
|
+
|
|
802
|
+
if is_known_hof(callee_qname):
|
|
803
|
+
data_pos = get_hof_data_position(callee_qname)
|
|
804
|
+
if data_pos is not None and call_site and call_site.arguments:
|
|
805
|
+
for arg in call_site.arguments:
|
|
806
|
+
if arg.position != data_pos:
|
|
807
|
+
continue
|
|
808
|
+
arg_var = arg.variable_name if (arg.is_variable and arg.variable_name) else None
|
|
809
|
+
if not arg_var:
|
|
810
|
+
continue
|
|
811
|
+
if arg_var == tainted_var:
|
|
812
|
+
return (data_pos, "iterable")
|
|
813
|
+
if self._flow_bindings and self._flow_bindings.variable_derives_from(
|
|
814
|
+
taint.current_function, arg_var, tainted_var, edge.line
|
|
815
|
+
):
|
|
816
|
+
return (data_pos, "iterable")
|
|
817
|
+
|
|
818
|
+
return None
|
|
819
|
+
|
|
820
|
+
def _is_likely_data_variable(self, var_name: str) -> bool:
|
|
821
|
+
"""Check if variable name strongly suggests it carries user data.
|
|
822
|
+
|
|
823
|
+
Only matches terms that almost always indicate untrusted input.
|
|
824
|
+
Generic terms like "name", "id", "value", "result", "response"
|
|
825
|
+
are excluded because they appear frequently in non-tainted contexts.
|
|
826
|
+
"""
|
|
827
|
+
import re as _re
|
|
828
|
+
|
|
829
|
+
_STRONG_DATA_PATTERNS = {
|
|
830
|
+
"data",
|
|
831
|
+
"input",
|
|
832
|
+
"body",
|
|
833
|
+
"payload",
|
|
834
|
+
"content",
|
|
835
|
+
"request",
|
|
836
|
+
"params",
|
|
837
|
+
"query",
|
|
838
|
+
"form",
|
|
839
|
+
"json",
|
|
840
|
+
"user_input",
|
|
841
|
+
"user_data",
|
|
842
|
+
"raw_input",
|
|
843
|
+
"email",
|
|
844
|
+
"password",
|
|
845
|
+
"token",
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
var_lower = var_name.lower()
|
|
849
|
+
# Exact match on the full variable name or the last dotted segment
|
|
850
|
+
leaf = var_lower.rsplit(".", 1)[-1]
|
|
851
|
+
if leaf in _STRONG_DATA_PATTERNS:
|
|
852
|
+
return True
|
|
853
|
+
|
|
854
|
+
# Word-boundary match so "user_payload" hits but "response" does not
|
|
855
|
+
for pattern in _STRONG_DATA_PATTERNS:
|
|
856
|
+
if _re.search(rf"(?:^|_){_re.escape(pattern)}(?:_|$)", leaf):
|
|
857
|
+
return True
|
|
858
|
+
|
|
859
|
+
return False
|
|
860
|
+
|
|
861
|
+
def _create_flow(
    self,
    taint: TaintedValue,
    edge: CallGraphEdge,
    callee_qname: str,
    arg_index: int,
    arg_name: str,
    extra_transformations: list[Transformation] | None = None,
) -> DataFlow:
    """Build the DataFlow record for *taint* reaching the call on *edge*.

    The flow's path is the taint's path plus one final step into
    *callee_qname*. Its id is derived from the origin type, entry point,
    callee, argument index/name and the call location.

    Transformations come from *extra_transformations* when that list is
    non-empty, otherwise from the taint itself (so an empty list behaves like
    None). Entries sharing the same ``(function, line)`` are reported once,
    keeping the first occurrence.
    """
    sink_location = (edge.file_path, edge.line)

    flow_id = stable_id(
        "flow",
        taint.origin_type.name,
        taint.entry_point_id or "",
        callee_qname,
        str(arg_index),
        arg_name,
        str(edge.file_path),
        str(edge.line),
    )

    # Path so far, extended with the hop into the sink call.
    final_step = FlowStep(
        depth=taint.depth + 1,
        caller=taint.current_function,
        callee=callee_qname,
        argument_index=arg_index,
        argument_name=arg_name,
        location=sink_location,
        variable_mapping={taint.current_variable: arg_name},
    )
    path = [*taint.path, final_step]

    # Control-flow context of the sink call, copied from the edge.
    sink_context = {
        "in_loop": edge.in_loop,
        "in_conditional": edge.in_conditional,
        "in_try_block": edge.in_try_block,
        "in_except_handler": edge.in_except_handler,
    }

    if extra_transformations:
        candidates = extra_transformations
    else:
        candidates = list(taint.transformations)

    # Insertion-ordered dict keeps the first transformation per (function, line).
    first_by_key: dict[tuple[str | None, int], Transformation] = {}
    for transform in candidates:
        first_by_key.setdefault((transform.function, transform.location[1]), transform)
    deduped: list[Transformation] = list(first_by_key.values())

    return DataFlow(
        id=flow_id,
        origin_type=taint.origin_type,
        origin_name=taint.origin_name,
        origin_location=taint.origin_location,
        entry_point_id=taint.entry_point_id,
        sink_function=callee_qname,
        sink_argument_index=arg_index,
        sink_argument_name=arg_name,
        sink_location=sink_location,
        sink_call_id=None,
        path=path,
        depth=len(path),
        # Flagged once the taint is within one hop of the depth limit.
        truncated=taint.depth >= self._max_depth - 1,
        transformations=deduped,
        sink_context=sink_context,
        confidence="HIGH" if taint.depth < 3 else "MEDIUM",
    )
|
|
933
|
+
|
|
934
|
+
def _should_propagate_into(self, callee_qname: str) -> bool:
|
|
935
|
+
"""Check if we should continue propagation into this function."""
|
|
936
|
+
# Don't propagate into external/library functions
|
|
937
|
+
node = self._call_graph.get_node(callee_qname)
|
|
938
|
+
if node is None:
|
|
939
|
+
return False
|
|
940
|
+
|
|
941
|
+
# Don't propagate into builtins
|
|
942
|
+
if callee_qname.startswith("builtins."):
|
|
943
|
+
return False
|
|
944
|
+
|
|
945
|
+
# Don't propagate into known library functions
|
|
946
|
+
library_prefixes = {
|
|
947
|
+
# Python
|
|
948
|
+
"sqlalchemy.",
|
|
949
|
+
"django.",
|
|
950
|
+
"flask.",
|
|
951
|
+
"fastapi.",
|
|
952
|
+
"pydantic.",
|
|
953
|
+
"requests.",
|
|
954
|
+
"httpx.",
|
|
955
|
+
"aiohttp.",
|
|
956
|
+
"json.",
|
|
957
|
+
"os.",
|
|
958
|
+
"sys.",
|
|
959
|
+
"subprocess.",
|
|
960
|
+
"logging.",
|
|
961
|
+
# Java standard library and major frameworks
|
|
962
|
+
"java.",
|
|
963
|
+
"javax.",
|
|
964
|
+
"jakarta.",
|
|
965
|
+
"org.springframework.",
|
|
966
|
+
"io.micronaut.",
|
|
967
|
+
"com.google.",
|
|
968
|
+
"com.fasterxml.", # Jackson
|
|
969
|
+
"org.apache.",
|
|
970
|
+
"org.slf4j.",
|
|
971
|
+
"org.hibernate.",
|
|
972
|
+
"io.jsonwebtoken.",
|
|
973
|
+
# .NET standard library and ASP.NET Core
|
|
974
|
+
"System.",
|
|
975
|
+
"Microsoft.",
|
|
976
|
+
"Newtonsoft.",
|
|
977
|
+
"AutoMapper.",
|
|
978
|
+
"FluentValidation.",
|
|
979
|
+
}
|
|
980
|
+
return all(not callee_qname.startswith(prefix) for prefix in library_prefixes)
|
|
981
|
+
|
|
982
|
+
def _create_propagated_taint(
    self,
    source_taint: TaintedValue,
    edge: CallGraphEdge,
    callee_qname: str,
    param_name: str,
    accumulated_transforms: list[Transformation] | None = None,
) -> TaintedValue | None:
    """Register and return the taint that continues *source_taint* inside a callee.

    The new taint keeps the source's origin and metadata, lives in
    *callee_qname* under the variable *param_name*, is one level deeper, and
    has the call on *edge* appended to its path. *accumulated_transforms* is
    used as given when it is not None (including an empty list); otherwise the
    source's transformations are copied.
    """
    # Ids are sequential; the counter advances before anything else is built.
    taint_id = f"taint-{self._next_taint_id:04d}"
    self._next_taint_id += 1

    next_depth = source_taint.depth + 1
    hop = FlowStep(
        depth=next_depth,
        caller=source_taint.current_function,
        callee=callee_qname,
        # The step always records index 0; the real argument position is not
        # passed into this method.
        argument_index=0,
        argument_name=param_name,
        location=(edge.file_path, edge.line),
        variable_mapping={source_taint.current_variable: param_name},
    )

    if accumulated_transforms is None:
        transforms = list(source_taint.transformations)
    else:
        transforms = accumulated_transforms

    propagated = TaintedValue(
        taint_id=taint_id,
        origin_type=source_taint.origin_type,
        origin_name=source_taint.origin_name,
        origin_location=source_taint.origin_location,
        entry_point_id=source_taint.entry_point_id,
        current_variable=param_name,
        current_function=callee_qname,
        current_file=edge.file_path,
        depth=next_depth,
        path=[*source_taint.path, hop],
        transformations=transforms,
        metadata=dict(source_taint.metadata),
    )

    # Bookkeeping: by id, by (function, variable), and the creation counter.
    self._active_taints[taint_id] = propagated
    self._taints[callee_qname][param_name].add(taint_id)
    self._stats["taints_created"] += 1

    return propagated
|
|
1030
|
+
|
|
1031
|
+
def _create_return_propagated_taint(
    self,
    source_taint: TaintedValue,
    edge: CallGraphEdge,
    lhs_variable: str,
    accumulated_transforms: list[Transformation] | None = None,
) -> TaintedValue | None:
    """
    Register and return a taint on the variable that receives a tainted return value.

    E.g. user_id = get_id() where get_id returns request.args.get("id"): the
    taint moves onto ``lhs_variable`` in the same function, file and depth as
    *source_taint*, with a copy of its path and metadata.

    *accumulated_transforms* is used as given when it is not None; otherwise
    the source's transformations are copied. *edge* is accepted but not read
    by this method.
    """
    taint_id = f"taint-{self._next_taint_id:04d}"
    self._next_taint_id += 1

    owner_function = source_taint.current_function

    if accumulated_transforms is None:
        transforms = list(source_taint.transformations)
    else:
        transforms = accumulated_transforms

    returned = TaintedValue(
        taint_id=taint_id,
        origin_type=source_taint.origin_type,
        origin_name=source_taint.origin_name,
        origin_location=source_taint.origin_location,
        entry_point_id=source_taint.entry_point_id,
        current_variable=lhs_variable,
        current_function=owner_function,
        current_file=source_taint.current_file,
        depth=source_taint.depth,
        path=list(source_taint.path),
        transformations=transforms,
        metadata=dict(source_taint.metadata),
    )

    # Bookkeeping: by id, by (function, variable), and the creation counter.
    self._active_taints[taint_id] = returned
    self._taints[owner_function][lhs_variable].add(taint_id)
    self._stats["taints_created"] += 1

    return returned
|
|
1070
|
+
|
|
1071
|
+
def get_statistics(self) -> dict[str, int]:
    """Return a fresh dict copy of the propagation counters."""
    snapshot: dict[str, int] = {}
    snapshot.update(self._stats)
    return snapshot
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
# =============================================================================
|
|
1077
|
+
# Source Identifier
|
|
1078
|
+
# =============================================================================
|
|
1079
|
+
|
|
1080
|
+
|
|
1081
|
+
class SourceIdentifier:
    """
    Identifies data origins from entry points.

    This analyzes entry-point handlers to find where untrusted data enters:
    - HTTP: path/query/body/header/cookie parameters
    - CLI: arguments and options (user input)
    - Task/Consumer/Scheduled: message-broker parameters
    - Webhook/Event: external system callback parameters
    """

    # FastAPI parameter patterns
    FASTAPI_PATH_PARAM = {"Path"}
    FASTAPI_QUERY_PARAM = {"Query"}
    FASTAPI_BODY_PARAM = {"Body", "Form", "File"}
    FASTAPI_HEADER_PARAM = {"Header"}
    FASTAPI_COOKIE_PARAM = {"Cookie"}

    # Default OriginType per non-HTTP entry-point kind
    _KIND_ORIGIN_MAP: dict[str, OriginType] = {
        "cli": OriginType.USER_INPUT,
        "task": OriginType.MESSAGE_QUEUE,
        "consumer": OriginType.MESSAGE_QUEUE,
        "scheduled": OriginType.MESSAGE_QUEUE,
        "webhook": OriginType.EXTERNAL_API,
        "event": OriginType.EXTERNAL_API,
    }

    # CLI-specific location values → OriginType
    _CLI_LOCATION_ORIGINS: dict[str, OriginType] = {
        "cli_argument": OriginType.USER_INPUT,
        "cli_option": OriginType.USER_INPUT,
    }

    # Primitive annotations that the FastAPI fallback treats as query params.
    _SIMPLE_TYPES = frozenset(
        {
            "str",
            "int",
            "float",
            "bool",
            "bytes",
            "Optional[str]",
            "Optional[int]",
            "Optional[float]",
            "Optional[bool]",
            "str | None",
            "int | None",
            "float | None",
            "bool | None",
        }
    )

    # Lower-cased annotation keys that mark a parameter as dependency-injected
    # rather than user-supplied.
    _DI_ANNOTATIONS = frozenset({"fromservices", "inject", "fromkeyedservices"})

    def __init__(self):
        self._origins: list[dict[str, Any]] = []

    def identify_from_function(
        self,
        func: ParsedFunction,
        entry_point_id: str | None = None,
        route_path: str | None = None,
        param_locations: dict[str, str] | None = None,
        entry_point_kind: str | None = None,
    ) -> list[dict[str, Any]]:
        """
        Identify data origins from a function (entry-point handler).

        Args:
            func: The parsed handler function.
            entry_point_id: Stable ID of the entry point.
            route_path: URL path template (HTTP only).
            param_locations: Pre-classified param name → location string.
            entry_point_kind: Entry-point kind (``"http"``, ``"cli"``,
                ``"task"``, ``"consumer"``, ``"scheduled"``, ``"webhook"``,
                ``"event"``). ``None`` is treated as ``"http"`` for
                backwards compatibility.

        Returns:
            List of origin info dicts, one per parameter that was classified.
            Each carries ``entry_point_id`` and ``function_qname`` in addition
            to the classification fields.
        """
        origins: list[dict[str, Any]] = []

        for param in func.parameters:
            origin = self._classify_parameter(
                param,
                func,
                route_path,
                param_locations,
                entry_point_kind,
            )
            if origin:
                origin["entry_point_id"] = entry_point_id
                origin["function_qname"] = func.qualified_name.full
                origins.append(origin)

        return origins

    @staticmethod
    def _make_origin(
        origin_type: Any,
        param_name: str,
        func: ParsedFunction,
        inferred_from: str,
    ) -> dict[str, Any]:
        """Build the origin dict shared by every classification branch."""
        return {
            "type": origin_type,
            "name": param_name,
            "location": (func.location.file, func.location.line),
            "parameter_name": param_name,
            "inferred_from": inferred_from,
        }

    def _is_injected_dependency(self, metadata: dict[str, Any]) -> bool:
        """Return True when the annotation metadata marks a DI-injected parameter."""
        return not self._DI_ANNOTATIONS.isdisjoint(key.lower() for key in metadata)

    def _classify_parameter(
        self,
        param: Any,  # ParsedParameter
        func: ParsedFunction,
        route_path: str | None,
        param_locations: dict[str, str] | None = None,
        entry_point_kind: str | None = None,
    ) -> dict[str, Any] | None:
        """Classify a parameter as a data origin.

        Returns an origin dict, or None when the parameter is ``self``/``cls``,
        is dependency-injected, or matches none of the rules. For HTTP entry
        points the rules are tried in this order: framework annotations,
        pre-classified location, route-path placeholder, body-like type,
        FastAPI default, well-known query name, primitive type.
        """
        param_name = param.name
        param_type = param.type_annotation
        default_value = param.default_value

        # Skip self/cls
        if param_name in ("self", "cls"):
            return None

        # Non-HTTP entry points use kind-based classification
        if entry_point_kind and entry_point_kind != "http":
            return self._classify_non_http_parameter(
                param,
                func,
                entry_point_kind,
                param_locations,
            )

        # Java/Spring & .NET: annotations stored in param.metadata by the language parser.
        param_metadata = getattr(param, "metadata", {}) or {}
        if param_metadata:
            spring_origin = self._classify_spring_param(param_name, param_metadata)
            if spring_origin:
                return self._make_origin(spring_origin, param_name, func, "spring_annotation")
            dotnet_origin = self._classify_dotnet_param(param_name, param_metadata)
            if dotnet_origin:
                return self._make_origin(dotnet_origin, param_name, func, "dotnet_annotation")
            # A DI attribute yields no origin above. Stop here so the type
            # heuristics below do not report an injected service as a body.
            if self._is_injected_dependency(param_metadata):
                return None

        # Use pre-classified location from the entry_point if available
        # (single source of truth from ParameterAnalyzer).
        if param_locations and param_name in param_locations:
            try:
                located_type = OriginType[param_locations[param_name]]
            except KeyError:
                # Not an OriginType member name: fall through to the
                # heuristics instead of aborting the whole analysis.
                located_type = None
            if located_type is not None:
                return self._make_origin(
                    located_type, param_name, func, "entry_point_classification"
                )

        # Check for path parameter (appears in route path) — checked before
        # body-type heuristic so that a param named e.g. "code" in the URL
        # template is not misclassified as body due to its type annotation.
        if route_path and f"{{{param_name}}}" in route_path:
            return self._make_origin(
                OriginType.HTTP_PATH_PARAM, param_name, func, "route_path"
            )

        # Check type annotation for Pydantic models (likely body)
        if param_type and self._is_likely_body_type(param_type):
            return self._make_origin(OriginType.HTTP_BODY, param_name, func, "type_annotation")

        # Check default value for FastAPI parameter declarations
        if default_value:
            origin_type = self._classify_fastapi_default(default_value)
            if origin_type:
                return self._make_origin(origin_type, param_name, func, "default_value")

        # Check for common parameter names
        if self._is_likely_query_param_name(param_name):
            return self._make_origin(
                OriginType.HTTP_QUERY_PARAM, param_name, func, "name_heuristic"
            )

        # FastAPI fallback: simple-typed params not in path are query params;
        # anything left with a primitive type annotation is treated as
        # user-supplied input.
        if param_type and param_type in self._SIMPLE_TYPES:
            return self._make_origin(
                OriginType.HTTP_QUERY_PARAM,
                param_name,
                func,
                "fastapi_simple_type_fallback",
            )

        return None

    def _is_likely_body_type(self, type_annotation: str) -> bool:
        """Check if type annotation suggests a request body.

        Primitive annotations listed in ``_SIMPLE_TYPES`` (including the
        ``Optional[...]`` and ``... | None`` spellings) are never body types;
        without this check ``Optional[str]`` would count as a body because it
        starts with an uppercase letter, and the simple-type fallback in
        ``_classify_parameter`` could never see it.
        """
        if type_annotation in self._SIMPLE_TYPES:
            return False

        # Pydantic models typically end with Model, Schema, etc.
        body_patterns = (
            "Model",
            "Schema",
            "Request",
            "Input",
            "Create",
            "Update",
            "Payload",
            "Body",
            "Data",
            "DTO",
        )
        if any(pattern in type_annotation for pattern in body_patterns):
            return True

        # Check for common base types that are NOT body
        non_body = {"str", "int", "float", "bool", "list", "dict", "Optional", "None"}
        if type_annotation in non_body:
            return False

        # If it's a custom type (starts with uppercase), likely a model
        return bool(type_annotation and type_annotation[0].isupper())

    def _classify_fastapi_default(self, default_value: str) -> OriginType | None:
        """Classify FastAPI parameter defaults.

        The default's source text is lower-cased and searched for the
        lower-cased marker names as substrings; the first group that matches
        (path, query, body, header, cookie — in that order) decides.
        """
        default_lower = default_value.lower()

        marker_groups = (
            (self.FASTAPI_PATH_PARAM, OriginType.HTTP_PATH_PARAM),
            (self.FASTAPI_QUERY_PARAM, OriginType.HTTP_QUERY_PARAM),
            (self.FASTAPI_BODY_PARAM, OriginType.HTTP_BODY),
            (self.FASTAPI_HEADER_PARAM, OriginType.HTTP_HEADER),
            (self.FASTAPI_COOKIE_PARAM, OriginType.HTTP_COOKIE),
        )
        for markers, origin_type in marker_groups:
            if any(marker.lower() in default_lower for marker in markers):
                return origin_type

        return None

    def _classify_spring_param(
        self, param_name: str, metadata: dict[str, Any]
    ) -> OriginType | None:
        """Classify a Spring / Micronaut method parameter from its annotation metadata.

        The JVM parser stores param annotations as:
            {"PathVariable": "slug"}       → HTTP_PATH_PARAM
            {"RequestParam": "page"}       → HTTP_QUERY_PARAM
            {"RequestBody": None}          → HTTP_BODY
            {"RequestHeader": "X-API-Key"} → HTTP_HEADER
            {"CookieValue": "session"}     → HTTP_COOKIE
            {"QueryValue": "limit"}        → HTTP_QUERY_PARAM (Micronaut)
            {"Body": None}                 → HTTP_BODY (Micronaut)
            {"Header": "X-Token"}          → HTTP_HEADER (Micronaut)

        Keys are compared case-insensitively; only the keys are inspected.
        Returns None when no known annotation is present.
        """
        ann_keys = {k.lower() for k in metadata}
        rules = (
            (("pathvariable", "uriinfo"), OriginType.HTTP_PATH_PARAM),
            (("requestparam", "queryvalue", "queryparam"), OriginType.HTTP_QUERY_PARAM),
            (("requestbody", "body"), OriginType.HTTP_BODY),
            (("requestheader", "header"), OriginType.HTTP_HEADER),
            (("cookievalue", "cookieparam"), OriginType.HTTP_COOKIE),
            (("matrixparam",), OriginType.HTTP_QUERY_PARAM),
        )
        for names, origin_type in rules:
            if not ann_keys.isdisjoint(names):
                return origin_type
        return None

    def _classify_dotnet_param(
        self, param_name: str, metadata: dict[str, Any]
    ) -> OriginType | None:
        """Classify an ASP.NET Core / Minimal API parameter from its attribute metadata.

        The C# parser stores binding attributes as metadata keys (value is True or a
        string for named params):
            {"FromRoute": True}    → HTTP_PATH_PARAM
            {"FromQuery": True}    → HTTP_QUERY_PARAM
            {"FromBody": True}     → HTTP_BODY
            {"FromForm": True}     → HTTP_FORM
            {"FromHeader": True}   → HTTP_HEADER
            {"FromServices": True} → None  (DI, not user input)

        Keys are compared case-insensitively. None is returned both for DI
        attributes and when no binding attribute is present.
        """
        ann_keys = {k.lower() for k in metadata}
        rules = (
            (("fromroute", "fromuriattribute"), OriginType.HTTP_PATH_PARAM),
            (("fromquery", "fromuniformresourceidentifier"), OriginType.HTTP_QUERY_PARAM),
            (("frombody",), OriginType.HTTP_BODY),
            (("fromform",), OriginType.HTTP_FORM),
            (("fromheader",), OriginType.HTTP_HEADER),
        )
        for names, origin_type in rules:
            if not ann_keys.isdisjoint(names):
                return origin_type
        # DI injection (FromServices / Inject / FromKeyedServices) or no
        # binding attribute at all — neither is user input from here.
        return None

    def _is_likely_query_param_name(self, param_name: str) -> bool:
        """Check if parameter name suggests a query parameter."""
        query_patterns = {
            "page",
            "limit",
            "offset",
            "skip",
            "take",
            "sort",
            "order",
            "filter",
            "search",
            "query",
            "q",
            "start",
            "end",
            "from",
            "to",
        }

        return param_name.lower() in query_patterns

    # -----------------------------------------------------------------
    # Non-HTTP entry-point classification
    # -----------------------------------------------------------------

    def _classify_non_http_parameter(
        self,
        param: Any,  # ParsedParameter
        func: ParsedFunction,
        kind: str,
        param_locations: dict[str, str] | None = None,
    ) -> dict[str, Any] | None:
        """Classify a parameter from a non-HTTP entry point.

        For CLI entry points parameters are treated as direct user input.
        For task / consumer / scheduled entry points parameters arrive via
        a message broker (deserialized payloads).  For webhook / event entry
        points parameters originate from an external system callback.

        Pre-classified *param_locations* are honoured first; if a location
        value is not a valid ``OriginType`` name it is checked against the
        CLI-specific location table (``cli_argument``, ``cli_option``), and
        failing that the default origin for *kind* is used.
        """
        param_name = param.name
        kind_default = self._KIND_ORIGIN_MAP.get(kind, OriginType.UNKNOWN)

        # Check pre-classified location metadata from the entry point
        if param_locations and param_name in param_locations:
            location_value = param_locations[param_name]

            # Try as a canonical OriginType enum name (e.g. "USER_INPUT")
            try:
                origin_type = OriginType[location_value]
            except KeyError:
                # CLI-specific location descriptors
                origin_type = self._CLI_LOCATION_ORIGINS.get(location_value, kind_default)

            return self._make_origin(
                origin_type, param_name, func, "entry_point_classification"
            )

        # Fall back to the default origin for the entry-point kind
        return self._make_origin(kind_default, param_name, func, f"{kind}_default")
|
|
1476
|
+
|
|
1477
|
+
|
|
1478
|
+
# =============================================================================
|
|
1479
|
+
# Data Flow Analyzer
|
|
1480
|
+
# =============================================================================
|
|
1481
|
+
|
|
1482
|
+
|
|
1483
|
+
class DataFlowAnalyzer:
    """
    Main class for inter-procedural data flow analysis.

    For every entry point the analyzer looks up the handler function,
    registers taint origins (handler parameters plus framework "read sites"
    found in the handler) with a ``TaintPropagator``, runs the propagation
    and attaches sink evidence to each resulting flow.

    Usage:
        analyzer = DataFlowAnalyzer(
            call_graph=call_graph,
            parsed_files=parsed_files,
            entry_points=entry_points,
            max_depth=10,
        )

        flows = analyzer.analyze()
    """

    # Entry-point keys that hold pre-classified HTTP parameters, paired with
    # the location label recorded for every parameter listed under that key.
    # Order matters: when the same name appears in several groups, the later
    # group wins (body fields are applied after all of these).
    _HTTP_PARAM_GROUPS = (
        ("path_params", "HTTP_PATH_PARAM"),
        ("query_params", "HTTP_QUERY_PARAM"),
        ("header_params", "HTTP_HEADER"),
        ("cookie_params", "HTTP_COOKIE"),
    )

    def __init__(
        self,
        call_graph: CallGraph,
        parsed_files: list[ParsedFile],
        entry_points: list[dict[str, Any]],
        flow_bindings: FlowSensitiveBindings | None = None,
        max_depth: int = 10,
    ):
        """
        Store the analysis inputs and index every function by qualified name.

        Args:
            call_graph: Call graph handed to the taint propagator.
            parsed_files: Parsed source files; entries whose ``success`` flag
                is falsy are dropped.
            entry_points: Entry-point dictionaries (read via ``.get``).
            flow_bindings: Optional flow-sensitive bindings forwarded to the
                taint propagator.
            max_depth: Depth limit forwarded to the taint propagator.
        """
        self._call_graph = call_graph
        # Only successfully parsed files take part; keyed by their path so a
        # function's location can be mapped back to its file.
        self._parsed_files = {pf.path: pf for pf in parsed_files if pf.success}
        self._entry_points = entry_points
        self._flow_bindings = flow_bindings
        self._max_depth = max_depth

        # Function lookup
        self._functions: dict[str, ParsedFunction] = {}
        self._build_function_index()

        # Results
        self._flows: list[DataFlow] = []

        # Statistics
        self._stats = {
            "entry_points_analyzed": 0,
            "origins_identified": 0,
            "flows_discovered": 0,
            "truncated_flows": 0,
        }

    def _build_function_index(self) -> None:
        """Build index of functions and class methods by qualified name."""
        for parsed in self._parsed_files.values():
            for func in parsed.functions:
                self._functions[func.qualified_name.full] = func

            for cls in parsed.classes:
                for method in cls.methods:
                    self._functions[method.qualified_name.full] = method

    def _get_parsed_file_for_function(self, func: ParsedFunction) -> ParsedFile | None:
        """
        Get the ParsedFile that contains this function.

        Returns ``None`` when the function has no location, no file path can
        be determined, or the path is not among the successfully parsed files.
        """
        if not func.location:
            return None
        # Prefer the location's file; fall back to a ``file_path`` attribute
        # on the function itself when the location does not carry one.
        file_path = getattr(func.location, "file", None) or getattr(func, "file_path", None)
        if not file_path:
            return None
        path = file_path if isinstance(file_path, Path) else Path(file_path)
        return self._parsed_files.get(path)

    @staticmethod
    def _collect_param_locations(ep: dict[str, Any]) -> dict[str, str]:
        """
        Map parameter names of an entry point to their location labels.

        HTTP parameter groups (path, query, header, cookie) are applied first,
        then the fields of the request body model. Only when none of those
        produced an entry is the generic ``"parameters"`` list consulted,
        taking each parameter's own ``location`` value (e.g.
        ``"cli_argument"``, ``"cli_option"``).

        Items may be dicts or objects exposing the same attribute names; body
        fields may additionally be plain strings. Keys that are missing or
        explicitly ``None`` are treated as empty. Items without a truthy name
        are skipped.
        """

        def read(item: Any, key: str) -> Any:
            # Entry-point metadata may arrive as dicts or as model objects.
            return item.get(key) if isinstance(item, dict) else getattr(item, key, None)

        locations: dict[str, str] = {}

        for group_key, label in DataFlowAnalyzer._HTTP_PARAM_GROUPS:
            for param in ep.get(group_key) or ():
                name = read(param, "name")
                if name:
                    locations[name] = label

        body = ep.get("body")
        if body:
            for field in read(body, "model_fields") or ():
                name = field if isinstance(field, str) else read(field, "name")
                if name:
                    locations[name] = "HTTP_BODY"

        # Non-HTTP entry points may carry parameter metadata in a generic
        # "parameters" list with per-param "location" values.
        if not locations:
            for param in ep.get("parameters") or ():
                name = read(param, "name")
                location = read(param, "location")
                if name and location:
                    locations[name] = location

        return locations

    def analyze(self) -> list[DataFlow]:
        """
        Run data flow analysis on all entry points.

        Entry points without a ``handler_qualified_name``, or whose handler is
        not in the function index, contribute no origins (they are still
        counted in ``entry_points_analyzed``).

        Returns list of discovered data flows.
        """
        logger.info(
            f"Starting data flow analysis: "
            f"{len(self._entry_points)} entry points, "
            f"max depth {self._max_depth}"
        )

        # Local import, as in the rest of this class; hoisted out of the loop
        # because it does not depend on the entry point.
        from .read_site_detector import ReadSiteDetector

        # Per-run counters start from zero so that calling analyze() again
        # reports figures consistent with the flows of that run.
        self._stats["entry_points_analyzed"] = 0
        self._stats["origins_identified"] = 0

        # Initialize taint propagator (with parsed files for call-site analysis)
        propagator = TaintPropagator(
            self._call_graph,
            self._flow_bindings,
            self._max_depth,
            parsed_files=list(self._parsed_files.values()),
        )

        # Source identifier (parameter-based origins)
        source_id = SourceIdentifier()

        # Process each entry point (framework from route for read-site detection)
        for ep in self._entry_points:
            framework = ep.get("framework") or "fastapi"
            ep_kind = ep.get("kind")  # "http", "cli", "task", …
            self._stats["entry_points_analyzed"] += 1

            # Get the handler function
            handler_qname = ep.get("handler_qualified_name")
            if not handler_qname:
                continue

            func = self._functions.get(handler_qname)
            if not func:
                continue

            # Build param location lookup from the entry_point's classified params
            # so data flow origin types stay consistent with entry_point classification.
            param_locations = self._collect_param_locations(ep)

            origins = source_id.identify_from_function(
                func,
                entry_point_id=ep.get("id"),
                route_path=ep.get("path", ""),
                param_locations=param_locations or None,
                entry_point_kind=ep_kind,
            )

            # Register parameter-based origins
            for origin in origins:
                self._stats["origins_identified"] += 1
                propagator.add_origin(
                    origin_type=origin["type"],
                    origin_name=origin["name"],
                    origin_location=origin["location"],
                    entry_point_id=origin.get("entry_point_id"),
                    function_qname=origin["function_qname"],
                    parameter_name=origin["parameter_name"],
                )

            # Read-site origins: request.path_params["id"], request.args.get("x"), etc.
            parsed_file = self._get_parsed_file_for_function(func)
            if parsed_file:
                read_site_detector = ReadSiteDetector(framework=framework)
                for ro in read_site_detector.detect_origins(func, parsed_file):
                    self._stats["origins_identified"] += 1
                    # Unrecognised origin type names fall back to USER_INPUT.
                    try:
                        ro_origin_type = OriginType[ro.origin_type]
                    except KeyError:
                        ro_origin_type = OriginType.USER_INPUT
                    propagator.add_origin(
                        origin_type=ro_origin_type,
                        origin_name=ro.param_name or ro.target_variable,
                        origin_location=(ro.file_path, ro.line),
                        entry_point_id=ep.get("id"),
                        function_qname=handler_qname,
                        parameter_name=ro.target_variable,
                    )

        # Run propagation
        self._flows = propagator.propagate_all()

        # Enrich each flow with sink-evidence
        from .sink_evidence import SinkEvidenceCollector

        evidence_collector = SinkEvidenceCollector(list(self._parsed_files.values()))
        for flow in self._flows:
            ev = evidence_collector.collect_for_flow(
                sink_file=flow.sink_location[0],
                sink_line=flow.sink_location[1],
                sink_arg_index=flow.sink_argument_index,
                sink_arg_name=flow.sink_argument_name,
            )
            # Keep whatever evidence the flow already has when none is found.
            if ev is not None:
                flow.sink_evidence = ev

        # Update statistics
        self._stats["flows_discovered"] = len(self._flows)
        self._stats["truncated_flows"] = sum(1 for f in self._flows if f.truncated)

        # Propagator statistics are merged in last, so they take precedence
        # on any key that is present in both dictionaries.
        prop_stats = propagator.get_statistics()
        self._stats.update(prop_stats)

        logger.info(
            f"Data flow analysis complete: "
            f"{self._stats['origins_identified']} origins, "
            f"{self._stats['flows_discovered']} flows"
        )

        return self._flows

    def get_statistics(self) -> dict[str, int]:
        """Get analysis statistics as a copy of the internal counters."""
        return dict(self._stats)

    def to_manifest_flows(self) -> list[dict[str, Any]]:
        """
        Convert flows to manifest format.

        Each flow found by the last ``analyze()`` run is converted into a
        ``DataFlowModel`` and returned as the result of its ``model_dump()``.
        The list is empty when no flows are stored.
        """
        from ..core.manifest import (
            CallContextModel,
            DataFlowModel,
            DataOriginModel,
            DataSinkModel,
            FlowStepModel,
            LocationModel,
            TransformationModel,
        )

        result = []

        for flow in self._flows:
            path_models = [
                FlowStepModel(
                    depth=step.depth,
                    caller=step.caller,
                    callee=step.callee,
                    argument_mapping=step.variable_mapping,
                    location=LocationModel(
                        file=str(step.location[0]),
                        line=step.location[1],
                    ),
                )
                for step in flow.path
            ]

            transform_models = [
                TransformationModel(
                    # Negative depths are reported as 0 in the manifest.
                    depth=t.depth if t.depth >= 0 else 0,
                    location=LocationModel(
                        file=str(t.location[0]),
                        line=t.location[1],
                    ),
                    type=t.type,
                    description=t.description,
                    function=t.function,
                    call_evidence=self._evidence_to_model(t.call_evidence),
                )
                for t in flow.transformations
            ]

            # A missing sink context is reported as "not in try/conditional/loop".
            ctx = flow.sink_context or {}
            context_model = CallContextModel(
                in_try_block=ctx.get("in_try_block", False),
                in_conditional=ctx.get("in_conditional", False),
                in_loop=ctx.get("in_loop", False),
            )

            sink_evidence_model = self._evidence_to_model(flow.sink_evidence)

            flow_model = DataFlowModel(
                id=flow.id,
                origin=DataOriginModel(
                    type=flow.origin_type.name,
                    name=flow.origin_name,
                    location=LocationModel(
                        file=str(flow.origin_location[0]),
                        line=flow.origin_location[1],
                    ),
                    entry_point_ref=flow.entry_point_id,
                ),
                sink=DataSinkModel(
                    function=flow.sink_function,
                    argument_index=flow.sink_argument_index,
                    argument_name=flow.sink_argument_name,
                    location=LocationModel(
                        file=str(flow.sink_location[0]),
                        line=flow.sink_location[1],
                    ),
                    call_ref=flow.sink_call_id,
                ),
                path=path_models,
                depth=flow.depth,
                truncated=flow.truncated,
                transformations=transform_models,
                sink_evidence=sink_evidence_model,
                context=context_model,
                confidence=flow.confidence,
            )

            result.append(flow_model.model_dump())

        return result

    @staticmethod
    def _evidence_to_model(evidence: Any) -> CallSiteEvidenceModel | None:  # noqa: F821
        """
        Convert internal CallSiteEvidence to manifest Pydantic model.

        Returns ``None`` when *evidence* is ``None``; otherwise copies the
        tainted-argument summary, every argument and every string pattern
        into their manifest counterparts.
        """
        if evidence is None:
            return None
        from ..core.manifest import (
            ArgumentEvidenceModel,
            CallSiteEvidenceModel,
            StringPatternModel,
        )

        arg_models = [
            ArgumentEvidenceModel(
                position=a.position,
                name=a.name,
                is_literal=a.is_literal,
                literal_value=a.literal_value,
                literal_type=a.literal_type,
                is_variable=a.is_variable,
                variable_name=a.variable_name,
                is_call_result=a.is_call_result,
                called_function=a.called_function,
                construction=a.construction,
                container_type=a.container_type,
                # Copied into a fresh list for the manifest model.
                source_variables=list(a.source_variables),
                expression_text=a.expression_text,
                is_tainted=a.is_tainted,
            )
            for a in evidence.all_arguments
        ]
        pattern_models = [
            StringPatternModel(
                type=p.pattern_type,
                pattern=p.matched,
                argument_position=p.argument_position,
            )
            for p in evidence.string_patterns
        ]
        return CallSiteEvidenceModel(
            tainted_argument_position=evidence.tainted_argument_position,
            tainted_argument_name=evidence.tainted_argument_name,
            tainted_argument_construction=evidence.tainted_argument_construction,
            all_arguments=arg_models,
            string_patterns=pattern_models,
        )
|
|
1845
|
+
|
|
1846
|
+
|
|
1847
|
+
# =============================================================================
|
|
1848
|
+
# Convenience Functions
|
|
1849
|
+
# =============================================================================
|
|
1850
|
+
|
|
1851
|
+
|
|
1852
|
+
def analyze_data_flow(
    call_graph: CallGraph,
    parsed_files: list[ParsedFile],
    entry_points: list[dict[str, Any]],
    flow_bindings: FlowSensitiveBindings | None = None,
    max_depth: int = 10,
) -> tuple[list[DataFlow], dict[str, int]]:
    """
    Run a complete data flow analysis in a single call.

    Builds a ``DataFlowAnalyzer`` from the given inputs, runs it, and hands
    back both the flows and the analyzer's statistics.

    Args:
        call_graph: Pre-built call graph
        parsed_files: Parsed source files
        entry_points: List of entry points (routes)
        flow_bindings: Optional flow-sensitive bindings
        max_depth: Maximum propagation depth

    Returns:
        Tuple of (flows, statistics)
    """
    runner = DataFlowAnalyzer(
        call_graph,
        parsed_files,
        entry_points,
        flow_bindings=flow_bindings,
        max_depth=max_depth,
    )

    # The statistics are read only after the run has finished.
    discovered = runner.analyze()
    return discovered, runner.get_statistics()
|