apisec-code-bolt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. apisec_code_bolt/__init__.py +42 -0
  2. apisec_code_bolt/__main__.py +11 -0
  3. apisec_code_bolt/analysis/__init__.py +96 -0
  4. apisec_code_bolt/analysis/analyzer.py +2309 -0
  5. apisec_code_bolt/analysis/binding_tracker.py +341 -0
  6. apisec_code_bolt/analysis/call_graph.py +1197 -0
  7. apisec_code_bolt/analysis/call_graph_types.py +332 -0
  8. apisec_code_bolt/analysis/call_resolver.py +988 -0
  9. apisec_code_bolt/analysis/capability_tagger.py +322 -0
  10. apisec_code_bolt/analysis/config_scanner.py +197 -0
  11. apisec_code_bolt/analysis/data_flow.py +1883 -0
  12. apisec_code_bolt/analysis/dependency_extractor.py +959 -0
  13. apisec_code_bolt/analysis/flow_analysis.py +1406 -0
  14. apisec_code_bolt/analysis/hof_catalog.py +61 -0
  15. apisec_code_bolt/analysis/integration_detector.py +1399 -0
  16. apisec_code_bolt/analysis/literal_scanner.py +300 -0
  17. apisec_code_bolt/analysis/path_normalizer.py +55 -0
  18. apisec_code_bolt/analysis/read_site_detector.py +310 -0
  19. apisec_code_bolt/analysis/request_patterns.py +162 -0
  20. apisec_code_bolt/analysis/sensitivity_classifier.py +224 -0
  21. apisec_code_bolt/analysis/sink_evidence.py +333 -0
  22. apisec_code_bolt/analysis/url_prefix_resolver.py +338 -0
  23. apisec_code_bolt/cli/__init__.py +5 -0
  24. apisec_code_bolt/cli/exit_codes.py +17 -0
  25. apisec_code_bolt/cli/main.py +1069 -0
  26. apisec_code_bolt/cloud/__init__.py +1 -0
  27. apisec_code_bolt/cloud/apisec_client.py +118 -0
  28. apisec_code_bolt/cloud/client.py +255 -0
  29. apisec_code_bolt/core/__init__.py +75 -0
  30. apisec_code_bolt/core/config.py +528 -0
  31. apisec_code_bolt/core/credentials.py +65 -0
  32. apisec_code_bolt/core/discovery.py +433 -0
  33. apisec_code_bolt/core/log_format.py +115 -0
  34. apisec_code_bolt/core/manifest.py +1009 -0
  35. apisec_code_bolt/core/repo.py +280 -0
  36. apisec_code_bolt/core/state.py +59 -0
  37. apisec_code_bolt/core/telemetry.py +451 -0
  38. apisec_code_bolt/core/types.py +587 -0
  39. apisec_code_bolt/fingerprinting/__init__.py +1 -0
  40. apisec_code_bolt/frameworks/__init__.py +29 -0
  41. apisec_code_bolt/frameworks/_jwt_common.py +50 -0
  42. apisec_code_bolt/frameworks/auth_helpers.py +437 -0
  43. apisec_code_bolt/frameworks/base.py +608 -0
  44. apisec_code_bolt/frameworks/dotnet/__init__.py +17 -0
  45. apisec_code_bolt/frameworks/dotnet/_path_helpers.py +43 -0
  46. apisec_code_bolt/frameworks/dotnet/aspnet_plugin.py +2546 -0
  47. apisec_code_bolt/frameworks/dotnet/grpc_plugin.py +559 -0
  48. apisec_code_bolt/frameworks/dotnet/jwt_config_extractor.py +545 -0
  49. apisec_code_bolt/frameworks/dotnet/legacy_aspnet_plugin.py +732 -0
  50. apisec_code_bolt/frameworks/dotnet/refit_plugin.py +374 -0
  51. apisec_code_bolt/frameworks/dotnet/wcf_plugin.py +1239 -0
  52. apisec_code_bolt/frameworks/java/__init__.py +6 -0
  53. apisec_code_bolt/frameworks/java/_annotations.py +167 -0
  54. apisec_code_bolt/frameworks/java/_constraints.py +128 -0
  55. apisec_code_bolt/frameworks/java/graphql_plugin.py +287 -0
  56. apisec_code_bolt/frameworks/java/jaxrs_plugin.py +748 -0
  57. apisec_code_bolt/frameworks/java/jwt_config_extractor.py +361 -0
  58. apisec_code_bolt/frameworks/java/micronaut_plugin.py +1059 -0
  59. apisec_code_bolt/frameworks/java/spring_plugin.py +1293 -0
  60. apisec_code_bolt/frameworks/js/__init__.py +8 -0
  61. apisec_code_bolt/frameworks/js/express_plugin.py +391 -0
  62. apisec_code_bolt/frameworks/js/fastify_plugin.py +381 -0
  63. apisec_code_bolt/frameworks/js/graphql_plugin.py +198 -0
  64. apisec_code_bolt/frameworks/js/nestjs_plugin.py +423 -0
  65. apisec_code_bolt/frameworks/python/__init__.py +19 -0
  66. apisec_code_bolt/frameworks/python/celery_plugin.py +393 -0
  67. apisec_code_bolt/frameworks/python/click_plugin.py +427 -0
  68. apisec_code_bolt/frameworks/python/django_plugin.py +867 -0
  69. apisec_code_bolt/frameworks/python/fastapi/__init__.py +28 -0
  70. apisec_code_bolt/frameworks/python/fastapi/plugin.py +1390 -0
  71. apisec_code_bolt/frameworks/python/flask_plugin.py +205 -0
  72. apisec_code_bolt/frameworks/python/graphql_plugin.py +274 -0
  73. apisec_code_bolt/frameworks/python/prefect_plugin.py +251 -0
  74. apisec_code_bolt/frameworks/python/webhook_plugin.py +255 -0
  75. apisec_code_bolt/parsing/__init__.py +62 -0
  76. apisec_code_bolt/parsing/base.py +554 -0
  77. apisec_code_bolt/parsing/csharp/__init__.py +5 -0
  78. apisec_code_bolt/parsing/csharp/language_services.py +203 -0
  79. apisec_code_bolt/parsing/csharp/literals.py +72 -0
  80. apisec_code_bolt/parsing/csharp/parser.py +1158 -0
  81. apisec_code_bolt/parsing/csharp/type_resolver.py +568 -0
  82. apisec_code_bolt/parsing/js/__init__.py +5 -0
  83. apisec_code_bolt/parsing/js/language_services.py +118 -0
  84. apisec_code_bolt/parsing/js/parser.py +622 -0
  85. apisec_code_bolt/parsing/jvm/__init__.py +7 -0
  86. apisec_code_bolt/parsing/jvm/language_services.py +270 -0
  87. apisec_code_bolt/parsing/jvm/parser.py +774 -0
  88. apisec_code_bolt/parsing/jvm/type_resolver.py +422 -0
  89. apisec_code_bolt/parsing/python/__init__.py +150 -0
  90. apisec_code_bolt/parsing/python/cbv_extractor.py +606 -0
  91. apisec_code_bolt/parsing/python/constant_resolver.py +500 -0
  92. apisec_code_bolt/parsing/python/cross_file_resolver.py +1054 -0
  93. apisec_code_bolt/parsing/python/dynamic_route_detector.py +532 -0
  94. apisec_code_bolt/parsing/python/expression_utils.py +221 -0
  95. apisec_code_bolt/parsing/python/extraction_types.py +271 -0
  96. apisec_code_bolt/parsing/python/language_services.py +487 -0
  97. apisec_code_bolt/parsing/python/parameter_analyzer.py +789 -0
  98. apisec_code_bolt/parsing/python/parser.py +719 -0
  99. apisec_code_bolt/parsing/python/path_resolver.py +576 -0
  100. apisec_code_bolt/parsing/python/router_registry.py +806 -0
  101. apisec_code_bolt/parsing/python/type_resolver.py +730 -0
  102. apisec_code_bolt/parsing/python/visitors.py +1544 -0
  103. apisec_code_bolt/parsing/services.py +544 -0
  104. apisec_code_bolt/query/__init__.py +1 -0
  105. apisec_code_bolt/query/ast_cache.py +182 -0
  106. apisec_code_bolt/query/executor.py +283 -0
  107. apisec_code_bolt/query/handlers.py +832 -0
  108. apisec_code_bolt-0.1.0.dist-info/METADATA +230 -0
  109. apisec_code_bolt-0.1.0.dist-info/RECORD +111 -0
  110. apisec_code_bolt-0.1.0.dist-info/WHEEL +4 -0
  111. apisec_code_bolt-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1883 @@
1
+ """
2
+ Inter-procedural data flow (taint) tracking.
3
+
4
+ This module implements taint analysis to track how data flows from HTTP inputs
5
+ (sources) through the program to function calls (potential sinks). The probe
6
+ does NOT decide what constitutes a vulnerability - it only tracks data flow
7
+ paths. The cloud service applies sink/source rules to identify issues.
8
+
9
+ Key concepts:
10
+ - **Origin**: Where data enters (HTTP params, body, headers, etc.)
11
+ - **Taint**: A marker indicating data came from an origin
12
+ - **Propagation**: How taint spreads through assignments and calls
13
+ - **Path**: The sequence of function calls data flows through
14
+ - **Sink**: Any function call that receives tainted data (cloud decides danger)
15
+
16
+ Algorithm:
17
+ 1. Identify all data origins from route handlers (entry points)
18
+ 2. Mark function parameters that receive origin data as tainted
19
+ 3. Propagate taint through:
20
+ - Assignments: x = y spreads taint from y to x
21
+ - Returns: return tainted_var taints the call result
22
+ - Arguments: func(tainted) taints the parameter in callee
23
+ 4. Record the full path when tainted data reaches a function call
24
+ 5. Track transformations (string operations, encoding, etc.)
25
+
26
+ Depth limit: We stop propagation at a configurable max depth (default 10)
27
+ to prevent infinite recursion and manage memory.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import logging
33
+ from collections import defaultdict
34
+ from dataclasses import dataclass, field
35
+ from pathlib import Path
36
+ from typing import TYPE_CHECKING, Any
37
+
38
+ from ..core.manifest import stable_id
39
+ from ..core.types import OriginType
40
+
41
+ if TYPE_CHECKING:
42
+ from ..parsing.base import ParsedCallSite, ParsedFile, ParsedFunction
43
+ from .call_graph import CallGraph, CallGraphEdge
44
+ from .flow_analysis import FlowSensitiveBindings
45
+
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
+ # =============================================================================
51
+ # Core Data Structures
52
+ # =============================================================================
53
+
54
+
55
@dataclass
class TaintedValue:
    """
    One taint being tracked: where the data originated and where it is now.

    Attributes:
        taint_id: Unique ID for this taint instance.
        origin_type: Kind of origin the data entered through.
        origin_name: Parameter name, header name, etc.
        origin_location: ``(file, line)`` of the origin.
        entry_point_id: Reference to the entry point (route), or ``None``.
        current_variable: Variable name currently holding the tainted data.
        current_function: Function the taint is currently in.
        current_file: File recorded alongside the current position.
        depth: Path-tracking depth; defaults to 0.
        path: Flow steps recorded so far (fresh list per instance).
        transformations: Transformations applied so far (fresh list per instance).
        metadata: Free-form extra data (fresh dict per instance).
    """

    # --- identity ---
    taint_id: str

    # --- origin ---
    origin_type: OriginType
    origin_name: str
    origin_location: tuple[Path, int]
    entry_point_id: str | None

    # --- current position ---
    current_variable: str
    current_function: str
    current_file: Path

    # --- history ---
    depth: int = 0
    path: list[FlowStep] = field(default_factory=list)
    transformations: list[Transformation] = field(default_factory=list)

    # --- extras ---
    metadata: dict[str, Any] = field(default_factory=dict)
87
+
88
+
89
@dataclass
class FlowStep:
    """
    A single hop in a data flow path.

    Attributes:
        depth: Depth value recorded for this step.
        caller: Calling function.
        callee: Called function.
        argument_index: Which argument carried the taint, if known.
        argument_name: Parameter name in the callee, if known.
        location: ``(file, line)`` of the step.
        variable_mapping: How the caller's variable maps to the callee's
            parameter (fresh dict per instance).
    """

    depth: int
    caller: str
    callee: str
    argument_index: int | None
    argument_name: str | None
    location: tuple[Path, int]
    variable_mapping: dict[str, str] = field(default_factory=dict)
102
+
103
+
104
class TransformationType:
    """
    Well-known transformation type constants (cloud-interpretable).

    Shapes each constant stands for:
        FUNCTION_CALL     x = func(y)
        METHOD_CALL       x = y.method()
        STRING_FORMAT     x = f"...{y}..."
        BINARY_OP         x = y + z
        TYPE_CONSTRUCTOR  x = int(y), x = str(y)
        ATTRIBUTE_ACCESS  x = y.attr
    """

    FUNCTION_CALL = "function_call"
    METHOD_CALL = "method_call"
    STRING_FORMAT = "string_format"
    BINARY_OP = "binary_op"
    TYPE_CONSTRUCTOR = "type_constructor"
    ATTRIBUTE_ACCESS = "attribute_access"
113
+
114
+
115
@dataclass
class Transformation:
    """
    A transformation applied to tainted data.

    Attributes:
        type: One of the ``TransformationType`` constants.
        description: Optional free-text description.
        location: ``(file, line)`` where the transformation happens.
        function: Function / method that performed the transform, if any.
        depth: Depth in the inter-procedural path where the transformation
            occurs; -1 means intra-procedural (within the same function as
            the source).
        call_evidence: Structural evidence about the transformation call
            site (co-arguments, kwargs); a ``CallSiteEvidence`` from the
            sink_evidence module, or ``None``.
    """

    type: str
    description: str | None
    location: tuple[Path, int]
    function: str | None
    depth: int = -1
    call_evidence: Any = None
130
+
131
+
132
@dataclass
class DataFlow:
    """
    A complete data flow from an origin to a function call.

    This is the output of taint analysis: one flow per origin-to-call path.

    Attributes:
        id: Unique ID (flow-001, etc.).
        origin_type: Kind of origin (source).
        origin_name: Name of the origin.
        origin_location: ``(file, line)`` of the origin.
        entry_point_id: Reference to the entry point, or ``None``.
        sink_function: Qualified name of the called function.
        sink_argument_index: Index of the argument at the sink, if known.
        sink_argument_name: Name of the argument at the sink, if known.
        sink_location: ``(file, line)`` of the sink call.
        sink_call_id: Reference to the call in the call graph, or ``None``.
        path: Steps through the code.
        depth: Depth recorded for the flow.
        truncated: True if cut off at max depth.
        transformations: Transformations along the flow.
        sink_evidence: Structural facts about the call site; a
            ``CallSiteEvidence`` from the sink_evidence module, or ``None``.
        sink_context: Context at the sink.
        confidence: "HIGH", "MEDIUM" or "LOW".
    """

    id: str

    # Source side
    origin_type: OriginType
    origin_name: str
    origin_location: tuple[Path, int]
    entry_point_id: str | None

    # Destination side
    sink_function: str
    sink_argument_index: int | None
    sink_argument_name: str | None
    sink_location: tuple[Path, int]
    sink_call_id: str | None

    # Route taken
    path: list[FlowStep]
    depth: int
    truncated: bool = False

    transformations: list[Transformation] = field(default_factory=list)
    sink_evidence: Any = None
    sink_context: dict[str, Any] = field(default_factory=dict)
    confidence: str = "HIGH"
171
+
172
+
173
+ # =============================================================================
174
+ # Taint Propagator
175
+ # =============================================================================
176
+
177
+
178
+ class TaintPropagator:
179
+ """
180
+ Propagates taint through the program using the call graph.
181
+
182
+ This is the core engine that tracks how data flows through function
183
+ calls. It uses the call graph for inter-procedural tracking and
184
+ flow-sensitive bindings for intra-procedural tracking.
185
+
186
+ Uses actual call-site argument analysis when parsed_files are provided
187
+ for accurate taint propagation (which variable flows to which parameter).
188
+ """
189
+
190
def __init__(
    self,
    call_graph: CallGraph,
    flow_bindings: FlowSensitiveBindings | None = None,
    max_depth: int = 10,
    parsed_files: list[ParsedFile] | None = None,
):
    """
    Set up an empty propagation state.

    Args:
        call_graph: Call graph used for inter-procedural tracking.
        flow_bindings: Optional flow-sensitive bindings.
        max_depth: Depth at which propagation stops.
        parsed_files: Optional parsed files; only those whose ``success``
            is truthy are kept, indexed by their ``path``.
    """
    self._call_graph = call_graph
    self._flow_bindings = flow_bindings
    self._max_depth = max_depth

    # Index successfully parsed files by path (later entries win on clashes).
    usable_files = {}
    for parsed in parsed_files or []:
        if parsed.success:
            usable_files[parsed.path] = parsed
    self._parsed_files = usable_files

    # function -> variable -> set of taint IDs
    self._taints: dict[str, dict[str, set[str]]] = defaultdict(lambda: defaultdict(set))

    # Taints currently being propagated, keyed by taint ID
    self._active_taints: dict[str, TaintedValue] = {}

    # Flows completed so far
    self._flows: list[DataFlow] = []

    # Counter behind the sequential, internal-only taint IDs
    self._next_taint_id = 0

    # (function, variable, depth) states already processed, to stop loops
    self._visited: set[tuple[str, str, int]] = set()

    # Running counters, all starting at zero
    self._stats = dict.fromkeys(
        ("taints_created", "flows_found", "max_depth_reached", "cycles_avoided"),
        0,
    )
224
+
225
def add_origin(
    self,
    origin_type: OriginType,
    origin_name: str,
    origin_location: tuple[Path, int],
    entry_point_id: str | None,
    function_qname: str,
    parameter_name: str,
) -> str:
    """
    Register a new data origin and create the taint that tracks it.

    Args:
        origin_type: Type of origin (HTTP_PATH_PARAM, etc.)
        origin_name: Name of the origin (parameter name, header name, etc.)
        origin_location: Where the origin is in code, as ``(file, line)``
        entry_point_id: Reference to the entry point (route)
        function_qname: Function where the origin enters
        parameter_name: Parameter that receives the origin data

    Returns:
        The ID of the newly created taint.
    """
    # IDs are sequential and zero-padded to four digits: taint-0000, taint-0001, ...
    new_id = f"taint-{self._next_taint_id:04d}"
    self._next_taint_id += 1

    origin_file = origin_location[0]
    self._active_taints[new_id] = TaintedValue(
        taint_id=new_id,
        origin_type=origin_type,
        origin_name=origin_name,
        origin_location=origin_location,
        entry_point_id=entry_point_id,
        current_variable=parameter_name,
        current_function=function_qname,
        current_file=origin_file,
        depth=0,
        path=[],
    )

    # Mark the receiving parameter as tainted and count the new taint.
    self._taints[function_qname][parameter_name].add(new_id)
    self._stats["taints_created"] += 1

    return new_id
271
+
272
def propagate_all(self) -> list[DataFlow]:
    """
    Propagate all registered taints through the call graph.

    This is the main entry point for taint analysis. Taints are processed
    first-in, first-out; each one is dropped when it:
    - is no longer present in the active-taint table,
    - has reached the max depth (counted in ``max_depth_reached``), or
    - repeats an already-seen ``(function, variable, depth)`` state
      (counted in ``cycles_avoided``).

    Otherwise it is handed to ``_propagate_function`` and any taint IDs
    that call returns are queued behind the existing work.

    Returns:
        The list of data flows accumulated on this propagator.
    """
    # A deque gives O(1) pops from the front; list.pop(0) shifts every
    # remaining element on each iteration.
    from collections import deque

    worklist = deque(self._active_taints)

    while worklist:
        taint_id = worklist.popleft()

        if taint_id not in self._active_taints:
            continue

        taint = self._active_taints[taint_id]

        # Depth limit
        if taint.depth >= self._max_depth:
            self._stats["max_depth_reached"] += 1
            continue

        # Cycle guard
        state = (taint.current_function, taint.current_variable, taint.depth)
        if state in self._visited:
            self._stats["cycles_avoided"] += 1
            continue
        self._visited.add(state)

        # Follow the calls made from the taint's current function
        worklist.extend(self._propagate_function(taint))

    return self._flows
314
+
315
def _propagate_function(self, taint: TaintedValue) -> list[str]:
    """
    Propagate taint through the calls made by the taint's current function.

    For every call edge that ``_check_taint_passes`` says carries the
    taint, this records a flow, and may create follow-up taints: into the
    callee, onto the variable assigned from the call result, and into the
    callback of a known higher-order function.

    NOTE(review): the nesting of the return-value and HOF sections was
    reconstructed from a rendering with indentation stripped; they are
    placed directly under the "taint passes" branch — confirm against the
    original file.

    Returns:
        IDs of the new taints created.
    """
    new_taint_ids: list[str] = []
    caller = taint.current_function

    callees = self._call_graph.get_callees(caller)

    # Query the caller's outgoing edges once and bucket them by callee,
    # instead of re-fetching and re-filtering the full list per callee.
    edges_by_callee: dict[str, list[CallGraphEdge]] = {}
    for out_edge in self._call_graph.get_edges_from(caller):
        edges_by_callee.setdefault(out_edge.callee, []).append(out_edge)

    for callee_qname in callees:
        for edge in edges_by_callee.get(callee_qname, ()):
            # Does the tainted variable reach this call?
            taint_passes = self._check_taint_passes(taint, edge, callee_qname)
            if not taint_passes:
                continue

            arg_index, arg_name = taint_passes

            # Transformations along the derivation chain from the tainted
            # variable to the actual argument. The "iterable" marker is not
            # a variable name, so the tainted variable itself is used then.
            chain_transforms = self._extract_transformations_in_chain(
                taint,
                arg_name if arg_name != "iterable" else taint.current_variable,
                edge.line,
            )
            inline_transforms = self._extract_arg_transformations(taint, edge, callee_qname)
            all_transforms = list(taint.transformations) + chain_transforms + inline_transforms

            # Record this as a flow to a function call (potential sink)
            flow = self._create_flow(
                taint,
                edge,
                callee_qname,
                arg_index,
                arg_name,
                extra_transformations=all_transforms,
            )
            self._flows.append(flow)
            self._stats["flows_found"] += 1

            # Continue into the callee, carrying accumulated transformations.
            if self._should_propagate_into(callee_qname):
                new_taint = self._create_propagated_taint(
                    taint,
                    edge,
                    callee_qname,
                    arg_name,
                    accumulated_transforms=all_transforms,
                )
                if new_taint:
                    new_taint_ids.append(new_taint.taint_id)

            # Return value propagation: if the callee returns the tainted
            # parameter, the call result (if assigned) is tainted in the caller.
            if self._flow_bindings and self._flow_bindings.function_returns_parameter(
                callee_qname, arg_name
            ):
                lhs = self._get_assignment_target_for_call(
                    caller,
                    edge.file_path,
                    edge.line,
                    callee_qname,
                )
                if lhs and taint.depth < self._max_depth - 1:
                    return_taint = self._create_return_propagated_taint(
                        taint,
                        edge,
                        lhs,
                        accumulated_transforms=all_transforms,
                    )
                    if return_taint:
                        new_taint_ids.append(return_taint.taint_id)

            # HOF: propagate taint into the callback when the data argument
            # is tainted. Imported lazily, only once a taint actually passes.
            from .hof_catalog import is_known_hof

            if is_known_hof(callee_qname) and arg_name == "iterable":
                callback_qname = self._resolve_hof_callback(
                    caller,
                    edge.file_path,
                    edge.line,
                    callee_qname,
                )
                if (
                    callback_qname
                    and self._should_propagate_into(callback_qname)
                    and taint.depth < self._max_depth - 1
                ):
                    callback_node = self._call_graph.get_node(callback_qname)
                    # Fall back to "x" when the callback's parameters are unknown.
                    first_param = (
                        callback_node.parameters[0]
                        if callback_node and callback_node.parameters
                        else "x"
                    )
                    cb_taint = self._create_propagated_taint(
                        taint, edge, callback_qname, first_param
                    )
                    if cb_taint:
                        new_taint_ids.append(cb_taint.taint_id)

    return new_taint_ids
428
+
429
def _get_assignment_target_for_call(
    self,
    caller: str,
    file_path: Path,
    line: int,
    callee: str,
) -> str | None:
    """
    Return the variable a call result is assigned to (``x = callee(...)``).

    Scans the parsed file's assignments for one on ``line`` whose source is
    a call, that sits in ``caller``, and whose called name matches
    ``callee``. Used for return value taint propagation.

    Returns:
        The assignment target of the first match, or None when the file is
        not parsed, exposes no assignments, or nothing matches.
    """
    parsed = self._parsed_files.get(file_path)
    if not (parsed and hasattr(parsed, "assignments")):
        return None

    def sits_in_caller(func_name: str) -> bool:
        # Exact name, or caller is the qualified form ending in ".<func_name>".
        if func_name == caller:
            return True
        return bool(caller and caller.endswith(f".{func_name}"))

    def names_callee(call_name: str) -> bool:
        # source_call might be "get_id" while callee is "main.get_id";
        # also accept a dotted source_call sharing the callee's last segment.
        if call_name == callee or callee.endswith(f".{call_name}"):
            return True
        return call_name.endswith(f".{callee.split('.')[-1]}")

    for stmt in parsed.assignments:
        on_line_from_call = (
            stmt.location.line == line
            and stmt.source_type == "call"
            and bool(stmt.source_call)
        )
        if not on_line_from_call:
            continue
        if sits_in_caller(stmt.in_function or "") and names_callee(stmt.source_call):
            return stmt.target
    return None
462
+
463
def _extract_transformations_in_chain(
    self,
    taint: TaintedValue,
    arg_variable: str,
    call_line: int,
) -> list[Transformation]:
    """
    Collect the transformations on the derivation chain that leads from
    the taint's current variable to ``arg_variable``.

    For example:
        user_id (tainted) → clean = int(user_id) → safe = html.escape(clean)
        → foo(safe)

    yields one Transformation for ``int`` and one for ``html.escape``.

    Returns an empty list when there are no flow bindings, when
    ``arg_variable`` is the tainted variable itself, or when the bindings
    report no chain.
    """
    bindings = self._flow_bindings
    if not bindings or arg_variable == taint.current_variable:
        return []

    chain = bindings.get_derivation_chain(
        taint.current_function,
        arg_variable,
        taint.current_variable,
        call_line,
    )
    if not chain:
        return []

    constructor_names = ("int", "str", "float", "bool", "bytes", "complex")
    collected: list[Transformation] = []

    for step in chain:
        where = (taint.current_file, step.line)
        producer = step.assigned_from_call

        if not producer:
            # No call involved: only string interpolation counts as a transform.
            if step.is_string_interpolation:
                collected.append(
                    Transformation(
                        type=TransformationType.STRING_FORMAT,
                        description=None,
                        location=where,
                        function=None,
                        depth=taint.depth,
                    )
                )
            continue

        # Classify the call; method call wins over interpolation, which
        # wins over the type-constructor check.
        if step.is_method_call:
            kind = TransformationType.METHOD_CALL
        elif step.is_string_interpolation:
            kind = TransformationType.STRING_FORMAT
        elif producer in constructor_names:
            kind = TransformationType.TYPE_CONSTRUCTOR
        else:
            kind = TransformationType.FUNCTION_CALL

        # Attach call-site evidence for this transformation when available.
        evidence = self._get_transformation_call_evidence(
            taint.current_function,
            taint.current_file,
            step.line,
            step.variable,
        )
        collected.append(
            Transformation(
                type=kind,
                description=None,
                location=where,
                function=producer,
                depth=taint.depth,
                call_evidence=evidence,
            )
        )

    return collected
545
+
546
def _get_transformation_call_evidence(
    self,
    function_qname: str,
    file_path: Path,
    line: int,
    tainted_var: str | None,
) -> Any:
    """
    Build evidence for the call site behind a transformation assignment.

    For a transformation like ``clean = int(uid)`` the call site is
    ``int(uid)``. The first argument that either is the variable
    ``tainted_var`` or lists it among its ``source_variables`` is reported
    as the tainted argument; when none does (or ``tainted_var`` is empty),
    index and name are passed as None.

    Returns:
        Whatever ``build_evidence_for_call_site`` returns, or None when no
        call site is found at the given location.
    """
    from .sink_evidence import build_evidence_for_call_site

    site = self._get_call_site(function_qname, file_path, line)
    if site is None:
        return None

    carrier = None
    if tainted_var:
        carrier = next(
            (
                candidate
                for candidate in site.arguments
                if (candidate.is_variable and candidate.variable_name == tainted_var)
                or tainted_var in candidate.source_variables
            ),
            None,
        )

    if carrier is None:
        index, name = None, None
    else:
        index, name = carrier.position, carrier.name

    return build_evidence_for_call_site(
        site,
        tainted_arg_index=index,
        tainted_arg_name=name,
    )
579
+
580
def _extract_arg_transformations(
    self,
    taint: TaintedValue,
    edge: CallGraphEdge,
    callee_qname: str,
) -> list[Transformation]:
    """
    Report transformations applied inline in the call's own arguments.

    Handles the pattern ``foo(int(user_id))``, where a call wraps the
    tainted variable directly inside the argument list.

    Args:
        taint: The taint whose current variable is looked for.
        edge: Call edge giving the file and line of the call.
        callee_qname: Callee of that edge; used to pick the right call site
            when several calls share the line.

    Returns:
        One Transformation per call-result argument whose expression text
        mentions the tainted variable as a whole identifier; empty when the
        call site is unknown, has no arguments, or nothing matches.
    """
    import re

    tainted_var = taint.current_variable
    if not tainted_var:
        # An empty name would "match" every expression.
        return []

    # Pass the callee as a hint so that, with several calls on one line,
    # the arguments inspected belong to the call this edge represents
    # (same lookup as _check_taint_passes).
    call_site = self._get_call_site(
        taint.current_function,
        edge.file_path,
        edge.line,
        callee_hint=callee_qname,
    )
    if not call_site or not call_site.arguments:
        return []

    # Match the variable only as a standalone identifier: a plain substring
    # test would find "id" inside "validate(x)" or "user_id".
    mentions_taint = re.compile(rf"(?<!\w){re.escape(tainted_var)}(?!\w)").search

    # Kept in sync with the classifier in _extract_transformations_in_chain.
    type_constructors = ("int", "str", "float", "bool", "bytes", "complex")

    transforms: list[Transformation] = []
    for arg in call_site.arguments:
        # Only arguments that are themselves the result of a call,
        # e.g. foo(int(user_id)) — arg is int(user_id).
        if not (arg.is_call_result and arg.called_function):
            continue
        if not (arg.expression_text and mentions_taint(arg.expression_text)):
            continue

        ttype = (
            TransformationType.TYPE_CONSTRUCTOR
            if arg.called_function in type_constructors
            else TransformationType.FUNCTION_CALL
        )
        transforms.append(
            Transformation(
                type=ttype,
                description=None,
                location=(edge.file_path, edge.line),
                function=arg.called_function,
                depth=taint.depth,
            )
        )
    return transforms
627
+
628
def _resolve_hof_callback(
    self,
    caller: str,
    file_path: Path,
    line: int,
    hof_callee: str,
) -> str | None:
    """
    Resolve the callback argument of a HOF call to a function in the call graph.

    E.g. map(sanitize, data) -> resolve "sanitize" to "main.sanitize" if in graph.

    Resolution order for the callback's variable name:
    1. the name itself, if the call graph has a node for it;
    2. the name prefixed with the caller's qualifier (everything before the
       caller's last dot);
    3. the first call-graph symbol equal to the name or ending in ".<name>".

    Returns:
        The resolved qualified name, or None when the callee has no known
        callback position, the call site is unknown, the callback argument
        is not a plain variable, or nothing in the graph matches.
    """
    from .hof_catalog import get_hof_callback_position

    callback_pos = get_hof_callback_position(hof_callee)
    if callback_pos is None:
        return None

    # Pass the HOF as a hint: on a line such as list(map(f, data)) several
    # call sites share the line and the first one is not necessarily the HOF.
    call_site = self._get_call_site(caller, file_path, line, callee_hint=hof_callee)
    if not call_site or not call_site.arguments:
        return None

    for arg in call_site.arguments:
        if arg.position != callback_pos:
            continue
        name = arg.variable_name if (arg.is_variable and arg.variable_name) else None
        if not name:
            continue

        # 1. Already a qualified name known to the graph
        if self._call_graph.get_node(name):
            return name

        # 2. Qualified with the caller's prefix
        if "." in caller:
            prefix = caller.rsplit(".", 1)[0]
            candidate = f"{prefix}.{name}"
            if self._call_graph.get_node(candidate):
                return candidate

        # 3. Any symbol whose last segment is the name
        for sym in self._call_graph._symbols:
            if sym == name or sym.endswith(f".{name}"):
                return sym

        # Only the first usable callback argument is considered.
        return None
    return None
668
+
669
def _get_call_site(
    self,
    caller: str,
    file_path: Path,
    line: int,
    callee_hint: str | None = None,
) -> ParsedCallSite | None:
    """
    Find the ParsedCallSite for a call at (caller, file_path, line).

    A call site qualifies when it is on ``line`` and its caller name equals
    ``caller``, is a dotted suffix of it, or ends with ``caller``'s last
    segment. When several qualify (e.g. ``Template(content).render()``),
    *callee_hint* picks the one whose callee_name equals the hint, equals
    the hint's last segment, or ends with that segment; otherwise the first
    qualifying site is returned.

    Returns:
        The chosen call site, or None if the file is not parsed, exposes no
        call sites, or none qualifies.
    """
    parsed = self._parsed_files.get(file_path)
    if not (parsed and hasattr(parsed, "call_sites")):
        return None

    def owner_name(site) -> str:
        # Caller may be a qualified-name object exposing .full, a plain
        # value, or missing altogether.
        owner = site.caller_function
        if not owner:
            return ""
        if hasattr(owner, "full"):
            return owner.full
        return str(owner)

    def belongs_to_caller(owner: str) -> bool:
        if owner == caller:
            return True
        if not owner:
            return False
        if caller.endswith(f".{owner}"):
            return True
        return owner.endswith(f".{caller.split('.')[-1]}")

    matches: list[ParsedCallSite] = [
        site
        for site in parsed.call_sites
        if site.location.line == line and belongs_to_caller(owner_name(site))
    ]

    if not matches:
        return None

    first = matches[0]
    if len(matches) == 1 or not callee_hint:
        return first

    leaf = callee_hint.rsplit(".", 1)[-1]
    dotted_leaf = f".{leaf}"
    for site in matches:
        callee_name = site.callee_name or ""
        if callee_name == callee_hint or callee_name == leaf or callee_name.endswith(dotted_leaf):
            return site
    return first
719
+
720
def _check_taint_passes(
    self,
    taint: TaintedValue,
    edge: CallGraphEdge,
    callee_qname: str,
) -> tuple[int, str] | None:
    """
    Check whether the tainted variable is passed to this call.

    With a resolved call site that has arguments, the decision is precise:
    1. If the callee is a known higher-order function and the argument in
       its data position is (or derives from) the tainted variable, the
       result is ``(data_position, "iterable")`` so the caller can
       propagate into the callback.
    2. Otherwise the first argument (plain variable, ``*spread`` or
       ``**spread``) that is or derives from the tainted variable is mapped
       to the callee parameter by keyword name or position.
    3. If no argument carries the taint the answer is None; heuristics are
       not consulted.

    Without call-site arguments, name-based heuristics on the callee's
    parameters and on the tainted variable's name are used instead.

    Returns:
        ``(argument_index, parameter_name)`` if taint passes, None otherwise.
    """
    tainted_var = taint.current_variable
    callee_node = self._call_graph.get_node(callee_qname)
    callee_params = callee_node.parameters if callee_node else []

    call_site = self._get_call_site(
        taint.current_function,
        edge.file_path,
        edge.line,
        callee_hint=callee_qname,
    )

    def carries_taint(arg_var: str) -> bool:
        # Direct match, or value-flow through renaming: uid = user_id; foo(uid)
        if arg_var == tainted_var:
            return True
        return bool(
            self._flow_bindings
            and self._flow_bindings.variable_derives_from(
                taint.current_function,
                arg_var,
                tainted_var,
                edge.line,
            )
        )

    if call_site and call_site.arguments:
        # HOF: map(f, data), filter(f, data) - taint in the data argument
        # flows to the callback. This check needs the call-site arguments,
        # so it has to run here: once the precise branch returns, nothing
        # after it can see them.
        from .hof_catalog import get_hof_data_position, is_known_hof

        if is_known_hof(callee_qname):
            data_pos = get_hof_data_position(callee_qname)
            if data_pos is not None:
                for arg in call_site.arguments:
                    if arg.position != data_pos:
                        continue
                    if arg.is_variable and arg.variable_name and carries_taint(arg.variable_name):
                        return (data_pos, "iterable")

        # Precise: find which argument passes the tainted variable.
        # *args and **kwargs count too: a tainted *var or **var spreads
        # into the callee.
        for arg in call_site.arguments:
            arg_var = None
            if (arg.is_variable and arg.variable_name) or arg.is_spread or arg.is_keyword_spread:
                arg_var = arg.variable_name
            if not arg_var or not carries_taint(arg_var):
                continue

            param_index = arg.position
            param_name = None
            if arg.name and callee_params:
                # Keyword argument: find the parameter by name
                for i, p in enumerate(callee_params):
                    if p == arg.name:
                        param_index = i
                        param_name = p
                        break
            elif param_index is not None and param_index < len(callee_params):
                param_name = callee_params[param_index]
            else:
                param_name = tainted_var
            return (param_index if param_index is not None else 0, param_name or tainted_var)

        # Precise analysis had call-site arguments but none carried taint;
        # trust the verdict and skip heuristic fallback.
        return None

    # Fallback: heuristic-based matching (only when call-site is unresolved)
    if callee_params:
        for i, param in enumerate(callee_params):
            if param == tainted_var or tainted_var.endswith(f".{param}"):
                return (i, param)
            if param in ("data", "value", "input", "content", "payload"):
                return (i, param)

    if self._is_likely_data_variable(tainted_var):
        return (0, callee_params[0] if callee_params else tainted_var)

    return None
819
+
820
def _is_likely_data_variable(self, var_name: str) -> bool:
    """Check if a variable name strongly suggests it carries user data.

    Only the last dotted segment of ``var_name`` is inspected, compared
    case-insensitively.  The segment matches when it equals one of the
    strong terms below or contains one as a whole ``_``-delimited word, so
    ``user_payload`` and ``req.body`` hit while ``response`` and
    ``metadata`` do not.

    Generic terms like "name", "id", "value", "result", "response" are
    deliberately absent because they appear frequently in non-tainted
    contexts.

    Args:
        var_name: Variable name, optionally dotted (e.g. ``request.form``).

    Returns:
        True if the name matches a strong data term; False otherwise,
        including for an empty name.
    """
    if not var_name:
        return False

    strong_terms = frozenset(
        {
            "data",
            "input",
            "body",
            "payload",
            "content",
            "request",
            "params",
            "query",
            "form",
            "json",
            "user_input",
            "user_data",
            "raw_input",
            "email",
            "password",
            "token",
        }
    )

    # Last dotted segment only: "request.args.token" -> "token".
    leaf = var_name.lower().rsplit(".", 1)[-1]
    if leaf in strong_terms:
        return True

    # Whole-word match on "_" boundaries.  The multi-word terms above all end
    # in a single-word term, so checking individual tokens covers them too.
    return not strong_terms.isdisjoint(leaf.split("_"))
860
+
861
def _create_flow(
    self,
    taint: TaintedValue,
    edge: CallGraphEdge,
    callee_qname: str,
    arg_index: int,
    arg_name: str,
    extra_transformations: list[Transformation] | None = None,
) -> DataFlow:
    """Create a DataFlow record for a taint reaching a function call.

    Args:
        taint: The tainted value that reaches the call.
        edge: Call-graph edge for the call; supplies the sink location and
            the control-flow context flags.
        callee_qname: Qualified name of the called (sink) function.
        arg_index: Position of the argument that carries the taint.
        arg_name: Name recorded for that argument / parameter.
        extra_transformations: Transformations to report instead of
            ``taint.transformations``.  ``None`` or an empty list falls
            back to the taint's own list.

    Returns:
        A ``DataFlow`` whose path is the taint's path plus one final step
        describing this call.
    """
    sink_location = (edge.file_path, edge.line)

    flow_id = stable_id(
        "flow",
        taint.origin_type.name,
        taint.entry_point_id or "",
        callee_qname,
        str(arg_index),
        arg_name,
        str(edge.file_path),
        str(edge.line),
    )

    # Build the path: everything the taint travelled so far, plus this call.
    path = list(taint.path)
    path.append(
        FlowStep(
            depth=taint.depth + 1,
            caller=taint.current_function,
            callee=callee_qname,
            argument_index=arg_index,
            argument_name=arg_name,
            location=sink_location,
            variable_mapping={taint.current_variable: arg_name},
        )
    )

    # Control-flow context of the call site, copied from the edge.
    sink_context = {
        "in_loop": edge.in_loop,
        "in_conditional": edge.in_conditional,
        "in_try_block": edge.in_try_block,
        "in_except_handler": edge.in_except_handler,
    }

    # Use the explicitly supplied transformations when there are any,
    # otherwise the ones accumulated on the taint itself.
    all_transforms = extra_transformations or list(taint.transformations)

    # Deduplicate by (function, file, line).  The file is part of the key so
    # that transformations sharing a function name and line number in
    # different files are not collapsed into one.
    seen_keys: set[tuple[str | None, str, int]] = set()
    deduped: list[Transformation] = []
    for t in all_transforms:
        key = (t.function, str(t.location[0]), t.location[1])
        if key not in seen_keys:
            seen_keys.add(key)
            deduped.append(t)

    return DataFlow(
        id=flow_id,
        origin_type=taint.origin_type,
        origin_name=taint.origin_name,
        origin_location=taint.origin_location,
        entry_point_id=taint.entry_point_id,
        sink_function=callee_qname,
        sink_argument_index=arg_index,
        sink_argument_name=arg_name,
        sink_location=sink_location,
        sink_call_id=None,
        path=path,
        depth=len(path),
        # Flagged once the taint is within one hop of the depth limit.
        truncated=taint.depth >= self._max_depth - 1,
        transformations=deduped,
        sink_context=sink_context,
        # Shallow taints (depth below 3) are rated HIGH, deeper ones MEDIUM.
        confidence="HIGH" if taint.depth < 3 else "MEDIUM",
    )
933
+
934
def _should_propagate_into(self, callee_qname: str) -> bool:
    """Check if we should continue propagation into this function.

    Propagation stops when the callee has no node in the call graph, or
    when its qualified name starts with a builtin / known-library prefix.

    Args:
        callee_qname: Qualified name of the called function.

    Returns:
        True if taint should be followed into the callee, False otherwise.
    """
    # Callees without a call-graph node are treated as external code.
    if self._call_graph.get_node(callee_qname) is None:
        return False

    # Prefix match is case-sensitive: "System." (.NET) does not cover a
    # project package that happens to be called "system.".
    excluded_prefixes = (
        "builtins.",
        # Python
        "sqlalchemy.",
        "django.",
        "flask.",
        "fastapi.",
        "pydantic.",
        "requests.",
        "httpx.",
        "aiohttp.",
        "json.",
        "os.",
        "sys.",
        "subprocess.",
        "logging.",
        # Java standard library and major frameworks
        "java.",
        "javax.",
        "jakarta.",
        "org.springframework.",
        "io.micronaut.",
        "com.google.",
        "com.fasterxml.",  # Jackson
        "org.apache.",
        "org.slf4j.",
        "org.hibernate.",
        "io.jsonwebtoken.",
        # .NET standard library and ASP.NET Core
        "System.",
        "Microsoft.",
        "Newtonsoft.",
        "AutoMapper.",
        "FluentValidation.",
    )
    # str.startswith accepts a tuple and tests every prefix in one call.
    return not callee_qname.startswith(excluded_prefixes)
981
+
982
def _create_propagated_taint(
    self,
    source_taint: TaintedValue,
    edge: CallGraphEdge,
    callee_qname: str,
    param_name: str,
    accumulated_transforms: list[Transformation] | None = None,
    arg_index: int = 0,
) -> TaintedValue | None:
    """Create a new taint for propagation into a callee.

    The new taint lives in ``callee_qname`` under ``param_name``, one level
    deeper than ``source_taint``, and is registered in the propagator's
    active-taint table, per-function variable index and statistics.

    Args:
        source_taint: Taint in the caller that is passed to the callee.
        edge: Call-graph edge of the call; supplies file and line.
        callee_qname: Qualified name of the function receiving the taint.
        param_name: Callee parameter that receives the tainted value.
        accumulated_transforms: Transformations to carry on the new taint.
            ``None`` copies the source taint's list; an empty list is kept
            as given.
        arg_index: Argument position recorded in the path step.  Defaults
            to 0, which is what was always recorded before this parameter
            existed.

    Returns:
        The newly registered ``TaintedValue``.
    """
    taint_id = f"taint-{self._next_taint_id:04d}"
    self._next_taint_id += 1

    # Extend a copy of the path so the source taint's own path is untouched.
    new_path = list(source_taint.path)
    new_path.append(
        FlowStep(
            depth=source_taint.depth + 1,
            caller=source_taint.current_function,
            callee=callee_qname,
            argument_index=arg_index,
            argument_name=param_name,
            location=(edge.file_path, edge.line),
            variable_mapping={source_taint.current_variable: param_name},
        )
    )

    new_taint = TaintedValue(
        taint_id=taint_id,
        origin_type=source_taint.origin_type,
        origin_name=source_taint.origin_name,
        origin_location=source_taint.origin_location,
        entry_point_id=source_taint.entry_point_id,
        current_variable=param_name,
        current_function=callee_qname,
        current_file=edge.file_path,
        depth=source_taint.depth + 1,
        path=new_path,
        transformations=accumulated_transforms
        if accumulated_transforms is not None
        else list(source_taint.transformations),
        metadata=dict(source_taint.metadata),
    )

    self._active_taints[taint_id] = new_taint
    self._taints[callee_qname][param_name].add(taint_id)
    self._stats["taints_created"] += 1

    return new_taint
1030
+
1031
def _create_return_propagated_taint(
    self,
    source_taint: TaintedValue,
    edge: CallGraphEdge,
    lhs_variable: str,
    accumulated_transforms: list[Transformation] | None = None,
) -> TaintedValue | None:
    """
    Taint the assignment target of a call whose return value is tainted.

    E.g. user_id = get_id() where get_id returns request.args.get("id").
    The new taint stays in the source taint's function, at the same depth
    and with a copy of the same path; only the variable changes to
    ``lhs_variable``.  ``edge`` is accepted but not read here.
    """
    caller = source_taint.current_function

    taint_id = f"taint-{self._next_taint_id:04d}"
    self._next_taint_id += 1

    # None means "inherit"; an explicitly passed list is used as given.
    if accumulated_transforms is None:
        transforms = list(source_taint.transformations)
    else:
        transforms = accumulated_transforms

    returned_taint = TaintedValue(
        taint_id=taint_id,
        origin_type=source_taint.origin_type,
        origin_name=source_taint.origin_name,
        origin_location=source_taint.origin_location,
        entry_point_id=source_taint.entry_point_id,
        current_variable=lhs_variable,
        current_function=caller,
        current_file=source_taint.current_file,
        depth=source_taint.depth,
        path=list(source_taint.path),
        transformations=transforms,
        metadata=dict(source_taint.metadata),
    )

    # Register under the caller's variable so later propagation picks it up.
    self._active_taints[taint_id] = returned_taint
    self._taints[caller][lhs_variable].add(taint_id)
    self._stats["taints_created"] += 1

    return returned_taint
1070
+
1071
def get_statistics(self) -> dict[str, int]:
    """Return a shallow copy of the propagation statistics."""
    snapshot: dict[str, int] = {}
    snapshot.update(self._stats)
    return snapshot
1074
+
1075
+
1076
+ # =============================================================================
1077
+ # Source Identifier
1078
+ # =============================================================================
1079
+
1080
+
1081
class SourceIdentifier:
    """
    Identifies data origins from entry points.

    This analyzes entry-point handlers to find where untrusted data enters:
    - HTTP: path/query/body/header/cookie parameters
    - CLI: arguments and options (user input)
    - Task/Consumer/Scheduled: message-broker parameters
    - Webhook/Event: external system callback parameters
    """

    # FastAPI parameter marker names, looked for as identifiers in a
    # parameter's default-value source text.
    FASTAPI_PATH_PARAM = {"Path"}
    FASTAPI_QUERY_PARAM = {"Query"}
    FASTAPI_BODY_PARAM = {"Body", "Form", "File"}
    FASTAPI_HEADER_PARAM = {"Header"}
    FASTAPI_COOKIE_PARAM = {"Cookie"}

    # Default OriginType per non-HTTP entry-point kind
    _KIND_ORIGIN_MAP: dict[str, OriginType] = {
        "cli": OriginType.USER_INPUT,
        "task": OriginType.MESSAGE_QUEUE,
        "consumer": OriginType.MESSAGE_QUEUE,
        "scheduled": OriginType.MESSAGE_QUEUE,
        "webhook": OriginType.EXTERNAL_API,
        "event": OriginType.EXTERNAL_API,
    }

    # CLI-specific location values → OriginType
    _CLI_LOCATION_ORIGINS: dict[str, OriginType] = {
        "cli_argument": OriginType.USER_INPUT,
        "cli_option": OriginType.USER_INPUT,
    }

    # Primitive annotations that are never treated as a request body.
    _SIMPLE_TYPES = frozenset(
        {
            "str",
            "int",
            "float",
            "bool",
            "bytes",
            "Optional[str]",
            "Optional[int]",
            "Optional[float]",
            "Optional[bool]",
            "str | None",
            "int | None",
            "float | None",
            "bool | None",
        }
    )

    # Substrings of a type annotation that suggest a request-body model.
    _BODY_TYPE_MARKERS = (
        "Model",
        "Schema",
        "Request",
        "Input",
        "Create",
        "Update",
        "Payload",
        "Body",
        "Data",
        "DTO",
    )

    # Bare annotations that are not request bodies.
    _NON_BODY_TYPES = frozenset(
        {"str", "int", "float", "bool", "list", "dict", "Optional", "None"}
    )

    # Parameter names that conventionally denote query parameters.
    _QUERY_PARAM_NAMES = frozenset(
        {
            "page",
            "limit",
            "offset",
            "skip",
            "take",
            "sort",
            "order",
            "filter",
            "search",
            "query",
            "q",
            "start",
            "end",
            "from",
            "to",
        }
    )

    def __init__(self):
        self._origins: list[dict[str, Any]] = []

    def identify_from_function(
        self,
        func: ParsedFunction,
        entry_point_id: str | None = None,
        route_path: str | None = None,
        param_locations: dict[str, str] | None = None,
        entry_point_kind: str | None = None,
    ) -> list[dict[str, Any]]:
        """
        Identify data origins from a function (entry-point handler).

        Args:
            func: The parsed handler function.
            entry_point_id: Stable ID of the entry point.
            route_path: URL path template (HTTP only).
            param_locations: Pre-classified param name → location string.
            entry_point_kind: Entry-point kind (``"http"``, ``"cli"``,
                ``"task"``, ``"consumer"``, ``"scheduled"``, ``"webhook"``,
                ``"event"``).  ``None`` is treated as ``"http"`` for
                backwards compatibility.

        Returns:
            List of origin info dicts, one per parameter that was
            classified, each tagged with ``entry_point_id`` and
            ``function_qname``.
        """
        origins: list[dict[str, Any]] = []

        for param in func.parameters:
            origin = self._classify_parameter(
                param,
                func,
                route_path,
                param_locations,
                entry_point_kind,
            )
            if origin:
                origin["entry_point_id"] = entry_point_id
                origin["function_qname"] = func.qualified_name.full
                origins.append(origin)

        return origins

    @staticmethod
    def _origin_record(
        origin_type: OriginType,
        param_name: str,
        func: ParsedFunction,
        inferred_from: str,
    ) -> dict[str, Any]:
        """Build the origin info dict shared by every classification path."""
        return {
            "type": origin_type,
            "name": param_name,
            "location": (func.location.file, func.location.line),
            "parameter_name": param_name,
            "inferred_from": inferred_from,
        }

    def _origin_from_location(self, location_value: str) -> OriginType | None:
        """Resolve a pre-classified location string to an OriginType.

        Accepts a canonical ``OriginType`` member name (e.g. "USER_INPUT")
        or a CLI location descriptor (``cli_argument``, ``cli_option``).
        Returns None for anything else.
        """
        try:
            return OriginType[location_value]
        except KeyError:
            return self._CLI_LOCATION_ORIGINS.get(location_value)

    def _classify_parameter(
        self,
        param: Any,  # ParsedParameter
        func: ParsedFunction,
        route_path: str | None,
        param_locations: dict[str, str] | None = None,
        entry_point_kind: str | None = None,
    ) -> dict[str, Any] | None:
        """Classify a parameter as a data origin.

        Checks run in this order and the first hit wins: non-HTTP kind,
        Spring annotation, .NET attribute, pre-classified location, route
        path placeholder, body-like type annotation, FastAPI default marker,
        query-like name, simple-type fallback.

        Returns:
            An origin info dict, or None for ``self`` / ``cls`` and for
            parameters no rule recognises.
        """
        param_name = param.name
        param_type = param.type_annotation
        default_value = param.default_value

        # Skip self/cls
        if param_name in ("self", "cls"):
            return None

        # Non-HTTP entry points use kind-based classification
        if entry_point_kind and entry_point_kind != "http":
            return self._classify_non_http_parameter(
                param,
                func,
                entry_point_kind,
                param_locations,
            )

        # Java/Spring & .NET: annotations stored in param.metadata by the
        # language parser.
        param_metadata = getattr(param, "metadata", {}) or {}
        if param_metadata:
            spring_origin = self._classify_spring_param(param_name, param_metadata)
            if spring_origin:
                return self._origin_record(
                    spring_origin, param_name, func, "spring_annotation"
                )
            dotnet_origin = self._classify_dotnet_param(param_name, param_metadata)
            if dotnet_origin:
                return self._origin_record(
                    dotnet_origin, param_name, func, "dotnet_annotation"
                )

        # Use pre-classified location from the entry_point if available.
        # A value that is neither an OriginType name nor a CLI descriptor is
        # ignored and the heuristics below decide, instead of raising
        # KeyError and aborting the whole analysis.
        if param_locations and param_name in param_locations:
            located = self._origin_from_location(param_locations[param_name])
            if located is not None:
                return self._origin_record(
                    located, param_name, func, "entry_point_classification"
                )

        # Check for path parameter (appears in route path) — checked before
        # body-type heuristic so that a param named e.g. "code" in the URL
        # template is not misclassified as body due to its type annotation.
        if route_path and f"{{{param_name}}}" in route_path:
            return self._origin_record(
                OriginType.HTTP_PATH_PARAM, param_name, func, "route_path"
            )

        # Check type annotation for Pydantic models (likely body)
        if param_type and self._is_likely_body_type(param_type):
            return self._origin_record(
                OriginType.HTTP_BODY, param_name, func, "type_annotation"
            )

        # Check default value for FastAPI parameter markers
        if default_value:
            origin_type = self._classify_fastapi_default(default_value)
            if origin_type:
                return self._origin_record(
                    origin_type, param_name, func, "default_value"
                )

        # Check for common parameter names
        if self._is_likely_query_param_name(param_name):
            return self._origin_record(
                OriginType.HTTP_QUERY_PARAM, param_name, func, "name_heuristic"
            )

        # FastAPI fallback: simple-typed params not in path are query params.
        if param_type and param_type in self._SIMPLE_TYPES:
            return self._origin_record(
                OriginType.HTTP_QUERY_PARAM,
                param_name,
                func,
                "fastapi_simple_type_fallback",
            )

        return None

    def _is_likely_body_type(self, type_annotation: str) -> bool:
        """Check if type annotation suggests a request body.

        Primitive annotations (including ``Optional[str]`` and
        ``str | None`` forms listed in ``_SIMPLE_TYPES``) are rejected
        first; otherwise an annotation counts as a body when it contains a
        body marker substring or starts with an uppercase letter.
        """
        # Must come first: "Optional[str]" starts with an uppercase letter
        # and would otherwise be mistaken for a custom model type.
        if type_annotation in self._SIMPLE_TYPES or type_annotation in self._NON_BODY_TYPES:
            return False

        # Pydantic models typically end with Model, Schema, etc.
        if any(marker in type_annotation for marker in self._BODY_TYPE_MARKERS):
            return True

        # If it's a custom type (starts with uppercase), likely a model
        return bool(type_annotation and type_annotation[0].isupper())

    def _classify_fastapi_default(self, default_value: str) -> OriginType | None:
        """Classify FastAPI parameter defaults.

        A marker such as ``Query`` must appear as a whole, case-sensitive
        identifier in the default expression (``Query(None)``,
        ``fastapi.Query(...)``).  Substrings do not count, so a default like
        ``"information"`` or ``os.path.sep`` is not mistaken for ``Form`` or
        ``Path``.

        Returns:
            The matching OriginType, or None when no marker is present.
        """
        import re

        # Identifier tokens in the expression, e.g. {"fastapi", "Query",
        # "None"} for "fastapi.Query(None)".
        identifiers = set(re.findall(r"[A-Za-z_]\w*", default_value))

        marker_groups = (
            (self.FASTAPI_PATH_PARAM, OriginType.HTTP_PATH_PARAM),
            (self.FASTAPI_QUERY_PARAM, OriginType.HTTP_QUERY_PARAM),
            (self.FASTAPI_BODY_PARAM, OriginType.HTTP_BODY),
            (self.FASTAPI_HEADER_PARAM, OriginType.HTTP_HEADER),
            (self.FASTAPI_COOKIE_PARAM, OriginType.HTTP_COOKIE),
        )
        for markers, origin_type in marker_groups:
            if not identifiers.isdisjoint(markers):
                return origin_type

        return None

    def _classify_spring_param(
        self, param_name: str, metadata: dict[str, Any]
    ) -> OriginType | None:
        """Classify a Spring / Micronaut method parameter from its annotation metadata.

        The JVM parser stores param annotations as:
            {"PathVariable": "slug"}        → HTTP_PATH_PARAM
            {"RequestParam": "page"}        → HTTP_QUERY_PARAM
            {"RequestBody": None}           → HTTP_BODY
            {"RequestHeader": "X-API-Key"}  → HTTP_HEADER
            {"CookieValue": "session"}      → HTTP_COOKIE
            {"QueryValue": "limit"}         → HTTP_QUERY_PARAM (Micronaut)
            {"Body": None}                  → HTTP_BODY (Micronaut)
            {"Header": "X-Token"}           → HTTP_HEADER (Micronaut)

        Keys are compared case-insensitively; ``param_name`` is not used.
        """
        ann_keys = {k.lower() for k in metadata}
        # Ordered: the first group with a matching key wins.
        rules = (
            (("pathvariable", "uriinfo"), OriginType.HTTP_PATH_PARAM),
            (("requestparam", "queryvalue", "queryparam"), OriginType.HTTP_QUERY_PARAM),
            (("requestbody", "body"), OriginType.HTTP_BODY),
            (("requestheader", "header"), OriginType.HTTP_HEADER),
            (("cookievalue", "cookieparam"), OriginType.HTTP_COOKIE),
            (("matrixparam",), OriginType.HTTP_QUERY_PARAM),
        )
        for keys, origin_type in rules:
            if not ann_keys.isdisjoint(keys):
                return origin_type
        return None

    def _classify_dotnet_param(
        self, param_name: str, metadata: dict[str, Any]
    ) -> OriginType | None:
        """Classify an ASP.NET Core / Minimal API parameter from its attribute metadata.

        The C# parser stores binding attributes as metadata keys (value is True or a
        string for named params):
            {"FromRoute": True}    → HTTP_PATH_PARAM
            {"FromQuery": True}    → HTTP_QUERY_PARAM
            {"FromBody": True}     → HTTP_BODY
            {"FromForm": True}     → HTTP_FORM
            {"FromHeader": True}   → HTTP_HEADER
            {"FromServices": True} → None  (DI, not user input)

        Keys are compared case-insensitively; ``param_name`` is not used.
        """
        ann_keys = {k.lower() for k in metadata}
        # Ordered: the first group with a matching key wins.
        rules = (
            (("fromroute", "fromuriattribute"), OriginType.HTTP_PATH_PARAM),
            (("fromquery", "fromuniformresourceidentifier"), OriginType.HTTP_QUERY_PARAM),
            (("frombody",), OriginType.HTTP_BODY),
            (("fromform",), OriginType.HTTP_FORM),
            (("fromheader",), OriginType.HTTP_HEADER),
        )
        for keys, origin_type in rules:
            if not ann_keys.isdisjoint(keys):
                return origin_type
        # FromServices / Inject / FromKeyedServices are DI injection, not user
        # input; they and any unrecognised attribute yield no origin.
        return None

    def _is_likely_query_param_name(self, param_name: str) -> bool:
        """Check if parameter name suggests a query parameter."""
        return param_name.lower() in self._QUERY_PARAM_NAMES

    # -----------------------------------------------------------------
    # Non-HTTP entry-point classification
    # -----------------------------------------------------------------

    def _classify_non_http_parameter(
        self,
        param: Any,  # ParsedParameter
        func: ParsedFunction,
        kind: str,
        param_locations: dict[str, str] | None = None,
    ) -> dict[str, Any] | None:
        """Classify a parameter from a non-HTTP entry point.

        For CLI entry points parameters are treated as direct user input.
        For task / consumer / scheduled entry points parameters arrive via
        a message broker (deserialized payloads).  For webhook / event entry
        points parameters originate from an external system callback.

        Pre-classified *param_locations* are honoured first; if a location
        value is not a valid ``OriginType`` name it is checked against the
        CLI-specific location table (``cli_argument``, ``cli_option``), and
        failing that the default origin for *kind* is used
        (``OriginType.UNKNOWN`` for an unrecognised kind).
        """
        param_name = param.name
        kind_default = self._KIND_ORIGIN_MAP.get(kind, OriginType.UNKNOWN)

        # Check pre-classified location metadata from the entry point
        if param_locations and param_name in param_locations:
            located = self._origin_from_location(param_locations[param_name])
            origin_type = located if located is not None else kind_default
            return self._origin_record(
                origin_type, param_name, func, "entry_point_classification"
            )

        # Fall back to the default origin for the entry-point kind
        return self._origin_record(kind_default, param_name, func, f"{kind}_default")
1476
+
1477
+
1478
+ # =============================================================================
1479
+ # Data Flow Analyzer
1480
+ # =============================================================================
1481
+
1482
+
1483
+ class DataFlowAnalyzer:
1484
+ """
1485
+ Main class for inter-procedural data flow analysis.
1486
+
1487
+ Usage:
1488
+ analyzer = DataFlowAnalyzer(
1489
+ call_graph=call_graph,
1490
+ parsed_files=parsed_files,
1491
+ entry_points=entry_points,
1492
+ max_depth=10,
1493
+ )
1494
+
1495
+ flows = analyzer.analyze()
1496
+ """
1497
+
1498
def __init__(
    self,
    call_graph: CallGraph,
    parsed_files: list[ParsedFile],
    entry_points: list[dict[str, Any]],
    flow_bindings: FlowSensitiveBindings | None = None,
    max_depth: int = 10,
):
    """Store the analysis inputs and index every parsed function.

    Only parsed files whose ``success`` flag is truthy are kept; they are
    keyed by their ``path``.
    """
    self._call_graph = call_graph

    usable_files = {}
    for parsed in parsed_files:
        if parsed.success:
            usable_files[parsed.path] = parsed
    self._parsed_files = usable_files

    self._entry_points = entry_points
    self._flow_bindings = flow_bindings
    self._max_depth = max_depth

    # Qualified name -> function/method, filled by _build_function_index.
    self._functions: dict[str, ParsedFunction] = {}
    self._build_function_index()

    # Flows produced by the most recent analyze() call.
    self._flows: list[DataFlow] = []

    # Counters, all starting at zero.
    self._stats = dict.fromkeys(
        (
            "entry_points_analyzed",
            "origins_identified",
            "flows_discovered",
            "truncated_flows",
        ),
        0,
    )
1526
+
1527
def _build_function_index(self) -> None:
    """Index every function and class method by its full qualified name.

    Later entries overwrite earlier ones that share a qualified name.
    """
    index = self._functions
    for parsed_file in self._parsed_files.values():
        # Module-level functions first, then the methods of each class.
        index.update((fn.qualified_name.full, fn) for fn in parsed_file.functions)
        for klass in parsed_file.classes:
            index.update((m.qualified_name.full, m) for m in klass.methods)
1538
+
1539
def _get_parsed_file_for_function(self, func: ParsedFunction) -> ParsedFile | None:
    """Look up the ParsedFile that contains ``func``.

    The path comes from ``func.location.file`` or, when that is missing or
    empty, from ``func.file_path``.  Returns None when the function has no
    location, no path, or the path is not among the parsed files.
    """
    location = func.location
    if not location:
        return None

    candidate = getattr(location, "file", None)
    if not candidate:
        candidate = getattr(func, "file_path", None)
    if not candidate:
        return None

    # The lookup key is always a Path, whatever form the location used.
    key = candidate if isinstance(candidate, Path) else Path(candidate)
    return self._parsed_files.get(key)
1548
+
1549
def analyze(self) -> list[DataFlow]:
    """
    Run data flow analysis on all entry points.

    For each entry point whose handler function is known, parameter-based
    and read-site origins are registered with a taint propagator.  After
    propagation every flow is enriched with sink evidence where available
    and the statistics are updated.

    Returns list of discovered data flows.
    """
    logger.info(
        f"Starting data flow analysis: "
        f"{len(self._entry_points)} entry points, "
        f"max depth {self._max_depth}"
    )

    # Initialize taint propagator (with parsed files for call-site analysis)
    propagator = TaintPropagator(
        self._call_graph,
        self._flow_bindings,
        self._max_depth,
        parsed_files=list(self._parsed_files.values()),
    )

    # Source identifier (parameter-based origins)
    source_id = SourceIdentifier()

    # Process each entry point (framework from route for read-site detection)
    for ep in self._entry_points:
        framework = ep.get("framework") or "fastapi"
        ep_kind = ep.get("kind")  # "http", "cli", "task", …
        # Counted even when the handler cannot be resolved below.
        self._stats["entry_points_analyzed"] += 1

        # Get the handler function
        handler_qname = ep.get("handler_qualified_name")
        if not handler_qname:
            continue

        func = self._functions.get(handler_qname)
        if not func:
            continue

        # Param location lookup from the entry_point's classified params,
        # so data flow origin types stay consistent with entry_point
        # classification.
        param_locations = self._collect_param_locations(ep)

        route_path = ep.get("path", "")
        origins = source_id.identify_from_function(
            func,
            entry_point_id=ep.get("id"),
            route_path=route_path,
            param_locations=param_locations if param_locations else None,
            entry_point_kind=ep_kind,
        )

        # Register parameter-based origins
        for origin in origins:
            self._stats["origins_identified"] += 1
            propagator.add_origin(
                origin_type=origin["type"],
                origin_name=origin["name"],
                origin_location=origin["location"],
                entry_point_id=origin.get("entry_point_id"),
                function_qname=origin["function_qname"],
                parameter_name=origin["parameter_name"],
            )

        # Read-site origins: request.path_params["id"], request.args.get("x"), etc.
        parsed_file = self._get_parsed_file_for_function(func)
        if parsed_file:
            from .read_site_detector import ReadSiteDetector

            read_site_detector = ReadSiteDetector(framework=framework)
            read_origins = read_site_detector.detect_origins(func, parsed_file)
            for ro in read_origins:
                self._stats["origins_identified"] += 1
                try:
                    ro_origin_type = OriginType[ro.origin_type]
                except KeyError:
                    # Unrecognised origin names are recorded as user input.
                    ro_origin_type = OriginType.USER_INPUT
                propagator.add_origin(
                    origin_type=ro_origin_type,
                    origin_name=ro.param_name or ro.target_variable,
                    origin_location=(ro.file_path, ro.line),
                    entry_point_id=ep.get("id"),
                    function_qname=handler_qname,
                    parameter_name=ro.target_variable,
                )

    # Run propagation
    self._flows = propagator.propagate_all()

    # Enrich each flow with sink-evidence
    from .sink_evidence import SinkEvidenceCollector

    evidence_collector = SinkEvidenceCollector(list(self._parsed_files.values()))
    for flow in self._flows:
        ev = evidence_collector.collect_for_flow(
            sink_file=flow.sink_location[0],
            sink_line=flow.sink_location[1],
            sink_arg_index=flow.sink_argument_index,
            sink_arg_name=flow.sink_argument_name,
        )
        if ev is not None:
            flow.sink_evidence = ev

    # Update statistics
    self._stats["flows_discovered"] = len(self._flows)
    self._stats["truncated_flows"] = sum(1 for f in self._flows if f.truncated)

    prop_stats = propagator.get_statistics()
    self._stats.update(prop_stats)

    logger.info(
        f"Data flow analysis complete: "
        f"{self._stats['origins_identified']} origins, "
        f"{self._stats['flows_discovered']} flows"
    )

    return self._flows


def _collect_param_locations(self, ep: dict[str, Any]) -> dict[str, str]:
    """Map parameter names to location strings for one entry point.

    HTTP parameter lists are read in the order path, query, header, cookie,
    body; a name appearing in several lists keeps the last location.  Only
    when none of those yields a name is the generic ``"parameters"`` list
    consulted, using each item's own ``location`` value.  Items may be dicts
    or objects with attributes; a key that is present but ``None`` is
    treated as an empty list.
    """

    def read(item: Any, attr: str) -> Any:
        # Entry-point params may be plain dicts or model objects.
        return item.get(attr) if isinstance(item, dict) else getattr(item, attr, None)

    locations: dict[str, str] = {}

    http_lists = (
        ("path_params", "HTTP_PATH_PARAM"),
        ("query_params", "HTTP_QUERY_PARAM"),
        ("header_params", "HTTP_HEADER"),
        ("cookie_params", "HTTP_COOKIE"),
    )
    for key, location in http_lists:
        for p in ep.get(key) or []:
            pname = read(p, "name")
            if pname:
                locations[pname] = location

    ep_body = ep.get("body")
    if ep_body:
        for field in read(ep_body, "model_fields") or []:
            # Body fields may additionally be given as bare name strings.
            fname = field if isinstance(field, str) else read(field, "name")
            if fname:
                locations[fname] = "HTTP_BODY"

    # Non-HTTP entry points may carry parameter metadata in a generic
    # "parameters" list with per-param "location" values
    # (e.g. "cli_argument", "cli_option").
    if not locations:
        for p in ep.get("parameters") or []:
            pname = read(p, "name")
            ploc = read(p, "location")
            if pname and ploc:
                locations[pname] = ploc

    return locations
1708
+
1709
def get_statistics(self) -> dict[str, int]:
    """Return a snapshot of the counters collected during analysis.

    A new plain ``dict`` is built on every call, so changing the returned
    mapping does not modify the analyzer's internal ``_stats``.
    """
    snapshot: dict[str, int] = {}
    snapshot.update(self._stats)
    return snapshot
1712
+
1713
def to_manifest_flows(self) -> list[dict[str, Any]]:
    """Serialise every flow in ``self._flows`` to its manifest dictionary.

    Each flow is mapped onto the manifest models imported from
    ``..core.manifest`` (steps, transformations, sink context, sink
    evidence, origin and sink) and the resulting ``DataFlowModel`` is
    dumped with ``model_dump()``.

    Returns:
        One dictionary per flow, in the same order as ``self._flows``.
    """
    # Imported inside the method, exactly as the original does.
    from ..core.manifest import (
        CallContextModel,
        DataFlowModel,
        DataOriginModel,
        DataSinkModel,
        FlowStepModel,
        LocationModel,
        TransformationModel,
    )

    def _location(pair: Any) -> LocationModel:
        """Build a ``LocationModel`` from an indexable ``(file, line)`` pair."""
        return LocationModel(file=str(pair[0]), line=pair[1])

    manifest_entries = []

    for flow in self._flows:
        # Call path: one FlowStepModel per hop.
        steps = []
        for hop in flow.path:
            steps.append(
                FlowStepModel(
                    depth=hop.depth,
                    caller=hop.caller,
                    callee=hop.callee,
                    argument_mapping=hop.variable_mapping,
                    location=_location(hop.location),
                )
            )

        # Transformations applied along the way; a negative depth is
        # reported as 0 in the manifest.
        transforms = []
        for change in flow.transformations:
            transforms.append(
                TransformationModel(
                    depth=change.depth if change.depth >= 0 else 0,
                    location=_location(change.location),
                    type=change.type,
                    description=change.description,
                    function=change.function,
                    call_evidence=self._evidence_to_model(change.call_evidence),
                )
            )

        # A missing/empty sink context means every flag defaults to False.
        sink_ctx = flow.sink_context or {}
        call_context = CallContextModel(
            in_try_block=sink_ctx.get("in_try_block", False),
            in_conditional=sink_ctx.get("in_conditional", False),
            in_loop=sink_ctx.get("in_loop", False),
        )

        evidence_model = self._evidence_to_model(flow.sink_evidence)

        origin_model = DataOriginModel(
            type=flow.origin_type.name,
            name=flow.origin_name,
            location=_location(flow.origin_location),
            entry_point_ref=flow.entry_point_id,
        )
        sink_model = DataSinkModel(
            function=flow.sink_function,
            argument_index=flow.sink_argument_index,
            argument_name=flow.sink_argument_name,
            location=_location(flow.sink_location),
            call_ref=flow.sink_call_id,
        )

        entry = DataFlowModel(
            id=flow.id,
            origin=origin_model,
            sink=sink_model,
            path=steps,
            depth=flow.depth,
            truncated=flow.truncated,
            transformations=transforms,
            sink_evidence=evidence_model,
            context=call_context,
            confidence=flow.confidence,
        )
        manifest_entries.append(entry.model_dump())

    return manifest_entries
1799
+
1800
@staticmethod
def _evidence_to_model(evidence: Any) -> CallSiteEvidenceModel | None:  # noqa: F821
    """Translate internal call-site evidence into its manifest Pydantic model.

    Args:
        evidence: Internal evidence object, or ``None``.

    Returns:
        ``None`` when ``evidence`` is ``None``; otherwise a
        ``CallSiteEvidenceModel`` carrying the tainted-argument details,
        one ``ArgumentEvidenceModel`` per entry of ``all_arguments`` and one
        ``StringPatternModel`` per entry of ``string_patterns``.
    """
    # Guard first so the None case never touches the manifest import.
    if evidence is None:
        return None

    from ..core.manifest import (
        ArgumentEvidenceModel,
        CallSiteEvidenceModel,
        StringPatternModel,
    )

    arguments = []
    for arg in evidence.all_arguments:
        arguments.append(
            ArgumentEvidenceModel(
                position=arg.position,
                name=arg.name,
                is_literal=arg.is_literal,
                literal_value=arg.literal_value,
                literal_type=arg.literal_type,
                is_variable=arg.is_variable,
                variable_name=arg.variable_name,
                is_call_result=arg.is_call_result,
                called_function=arg.called_function,
                construction=arg.construction,
                container_type=arg.container_type,
                # Copy into a new list rather than passing the original through.
                source_variables=list(arg.source_variables),
                expression_text=arg.expression_text,
                is_tainted=arg.is_tainted,
            )
        )

    patterns = []
    for hit in evidence.string_patterns:
        patterns.append(
            StringPatternModel(
                type=hit.pattern_type,
                pattern=hit.matched,
                argument_position=hit.argument_position,
            )
        )

    return CallSiteEvidenceModel(
        tainted_argument_position=evidence.tainted_argument_position,
        tainted_argument_name=evidence.tainted_argument_name,
        tainted_argument_construction=evidence.tainted_argument_construction,
        all_arguments=arguments,
        string_patterns=patterns,
    )
1845
+
1846
+
1847
+ # =============================================================================
1848
+ # Convenience Functions
1849
+ # =============================================================================
1850
+
1851
+
1852
def analyze_data_flow(
    call_graph: CallGraph,
    parsed_files: list[ParsedFile],
    entry_points: list[dict[str, Any]],
    flow_bindings: FlowSensitiveBindings | None = None,
    max_depth: int = 10,
) -> tuple[list[DataFlow], dict[str, int]]:
    """
    Run a complete data flow analysis in a single call.

    Builds a ``DataFlowAnalyzer`` from the given inputs, runs ``analyze()``
    and then reads the statistics it collected.

    Args:
        call_graph: Pre-built call graph
        parsed_files: Parsed source files
        entry_points: List of entry points (routes)
        flow_bindings: Optional flow-sensitive bindings
        max_depth: Maximum propagation depth

    Returns:
        Tuple of (flows, statistics)
    """
    runner = DataFlowAnalyzer(
        call_graph=call_graph,
        parsed_files=parsed_files,
        entry_points=entry_points,
        flow_bindings=flow_bindings,
        max_depth=max_depth,
    )
    # Tuple elements evaluate left to right: the analysis runs before the
    # statistics are read.
    return runner.analyze(), runner.get_statistics()