cisco-ai-skill-scanner 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. cisco_ai_skill_scanner-1.0.0.dist-info/METADATA +253 -0
  2. cisco_ai_skill_scanner-1.0.0.dist-info/RECORD +100 -0
  3. cisco_ai_skill_scanner-1.0.0.dist-info/WHEEL +4 -0
  4. cisco_ai_skill_scanner-1.0.0.dist-info/entry_points.txt +4 -0
  5. cisco_ai_skill_scanner-1.0.0.dist-info/licenses/LICENSE +17 -0
  6. skillanalyzer/__init__.py +45 -0
  7. skillanalyzer/_version.py +34 -0
  8. skillanalyzer/api/__init__.py +25 -0
  9. skillanalyzer/api/api.py +34 -0
  10. skillanalyzer/api/api_cli.py +78 -0
  11. skillanalyzer/api/api_server.py +634 -0
  12. skillanalyzer/api/router.py +527 -0
  13. skillanalyzer/cli/__init__.py +25 -0
  14. skillanalyzer/cli/cli.py +816 -0
  15. skillanalyzer/config/__init__.py +26 -0
  16. skillanalyzer/config/config.py +149 -0
  17. skillanalyzer/config/config_parser.py +122 -0
  18. skillanalyzer/config/constants.py +85 -0
  19. skillanalyzer/core/__init__.py +24 -0
  20. skillanalyzer/core/analyzers/__init__.py +75 -0
  21. skillanalyzer/core/analyzers/aidefense_analyzer.py +872 -0
  22. skillanalyzer/core/analyzers/base.py +53 -0
  23. skillanalyzer/core/analyzers/behavioral/__init__.py +30 -0
  24. skillanalyzer/core/analyzers/behavioral/alignment/__init__.py +45 -0
  25. skillanalyzer/core/analyzers/behavioral/alignment/alignment_llm_client.py +240 -0
  26. skillanalyzer/core/analyzers/behavioral/alignment/alignment_orchestrator.py +216 -0
  27. skillanalyzer/core/analyzers/behavioral/alignment/alignment_prompt_builder.py +422 -0
  28. skillanalyzer/core/analyzers/behavioral/alignment/alignment_response_validator.py +136 -0
  29. skillanalyzer/core/analyzers/behavioral/alignment/threat_vulnerability_classifier.py +198 -0
  30. skillanalyzer/core/analyzers/behavioral_analyzer.py +453 -0
  31. skillanalyzer/core/analyzers/cross_skill_analyzer.py +490 -0
  32. skillanalyzer/core/analyzers/llm_analyzer.py +440 -0
  33. skillanalyzer/core/analyzers/llm_prompt_builder.py +270 -0
  34. skillanalyzer/core/analyzers/llm_provider_config.py +215 -0
  35. skillanalyzer/core/analyzers/llm_request_handler.py +284 -0
  36. skillanalyzer/core/analyzers/llm_response_parser.py +81 -0
  37. skillanalyzer/core/analyzers/meta_analyzer.py +845 -0
  38. skillanalyzer/core/analyzers/static.py +1105 -0
  39. skillanalyzer/core/analyzers/trigger_analyzer.py +341 -0
  40. skillanalyzer/core/analyzers/virustotal_analyzer.py +463 -0
  41. skillanalyzer/core/exceptions.py +77 -0
  42. skillanalyzer/core/loader.py +377 -0
  43. skillanalyzer/core/models.py +300 -0
  44. skillanalyzer/core/reporters/__init__.py +26 -0
  45. skillanalyzer/core/reporters/json_reporter.py +65 -0
  46. skillanalyzer/core/reporters/markdown_reporter.py +209 -0
  47. skillanalyzer/core/reporters/sarif_reporter.py +246 -0
  48. skillanalyzer/core/reporters/table_reporter.py +195 -0
  49. skillanalyzer/core/rules/__init__.py +19 -0
  50. skillanalyzer/core/rules/patterns.py +165 -0
  51. skillanalyzer/core/rules/yara_scanner.py +157 -0
  52. skillanalyzer/core/scanner.py +437 -0
  53. skillanalyzer/core/static_analysis/__init__.py +27 -0
  54. skillanalyzer/core/static_analysis/cfg/__init__.py +21 -0
  55. skillanalyzer/core/static_analysis/cfg/builder.py +439 -0
  56. skillanalyzer/core/static_analysis/context_extractor.py +742 -0
  57. skillanalyzer/core/static_analysis/dataflow/__init__.py +25 -0
  58. skillanalyzer/core/static_analysis/dataflow/forward_analysis.py +715 -0
  59. skillanalyzer/core/static_analysis/interprocedural/__init__.py +21 -0
  60. skillanalyzer/core/static_analysis/interprocedural/call_graph_analyzer.py +406 -0
  61. skillanalyzer/core/static_analysis/interprocedural/cross_file_analyzer.py +190 -0
  62. skillanalyzer/core/static_analysis/parser/__init__.py +21 -0
  63. skillanalyzer/core/static_analysis/parser/python_parser.py +380 -0
  64. skillanalyzer/core/static_analysis/semantic/__init__.py +28 -0
  65. skillanalyzer/core/static_analysis/semantic/name_resolver.py +206 -0
  66. skillanalyzer/core/static_analysis/semantic/type_analyzer.py +200 -0
  67. skillanalyzer/core/static_analysis/taint/__init__.py +21 -0
  68. skillanalyzer/core/static_analysis/taint/tracker.py +252 -0
  69. skillanalyzer/core/static_analysis/types/__init__.py +36 -0
  70. skillanalyzer/data/__init__.py +30 -0
  71. skillanalyzer/data/prompts/boilerplate_protection_rule_prompt.md +26 -0
  72. skillanalyzer/data/prompts/code_alignment_threat_analysis_prompt.md +901 -0
  73. skillanalyzer/data/prompts/llm_response_schema.json +71 -0
  74. skillanalyzer/data/prompts/skill_meta_analysis_prompt.md +303 -0
  75. skillanalyzer/data/prompts/skill_threat_analysis_prompt.md +263 -0
  76. skillanalyzer/data/prompts/unified_response_schema.md +97 -0
  77. skillanalyzer/data/rules/signatures.yaml +440 -0
  78. skillanalyzer/data/yara_rules/autonomy_abuse.yara +66 -0
  79. skillanalyzer/data/yara_rules/code_execution.yara +61 -0
  80. skillanalyzer/data/yara_rules/coercive_injection.yara +115 -0
  81. skillanalyzer/data/yara_rules/command_injection.yara +54 -0
  82. skillanalyzer/data/yara_rules/credential_harvesting.yara +115 -0
  83. skillanalyzer/data/yara_rules/prompt_injection.yara +71 -0
  84. skillanalyzer/data/yara_rules/script_injection.yara +83 -0
  85. skillanalyzer/data/yara_rules/skill_discovery_abuse.yara +57 -0
  86. skillanalyzer/data/yara_rules/sql_injection.yara +73 -0
  87. skillanalyzer/data/yara_rules/system_manipulation.yara +65 -0
  88. skillanalyzer/data/yara_rules/tool_chaining_abuse.yara +60 -0
  89. skillanalyzer/data/yara_rules/transitive_trust_abuse.yara +73 -0
  90. skillanalyzer/data/yara_rules/unicode_steganography.yara +65 -0
  91. skillanalyzer/hooks/__init__.py +21 -0
  92. skillanalyzer/hooks/pre_commit.py +450 -0
  93. skillanalyzer/threats/__init__.py +25 -0
  94. skillanalyzer/threats/threats.py +480 -0
  95. skillanalyzer/utils/__init__.py +28 -0
  96. skillanalyzer/utils/command_utils.py +129 -0
  97. skillanalyzer/utils/di_container.py +154 -0
  98. skillanalyzer/utils/file_utils.py +86 -0
  99. skillanalyzer/utils/logging_config.py +96 -0
  100. skillanalyzer/utils/logging_utils.py +71 -0
@@ -0,0 +1,715 @@
1
+ # Copyright 2026 Cisco Systems, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ """Forward dataflow analysis using Control Flow Graph.
18
+
19
+ Tracks parameter flows from function entry points through all control structures
20
+ using proper CFG-based fixpoint analysis. This provides accurate flow tracking
21
+ through branches, loops, and function calls.
22
+
23
+ Replaces the simple AST walker approach with proper dataflow analysis.
24
+ """
25
+
26
+ import ast
27
+ from dataclasses import dataclass, field
28
+ from typing import Any
29
+
30
+ from ..cfg.builder import CFGNode, DataFlowAnalyzer
31
+ from ..parser.python_parser import PythonParser
32
+ from ..taint.tracker import ShapeEnvironment, Taint, TaintStatus
33
+
34
+
35
@dataclass
class FlowPath:
    """Everything one tracked parameter or script-level source was seen to reach.

    A flow path is keyed by the name of the tracked value and accumulates the
    assignments, calls and returns the value takes part in while the dataflow
    analysis runs.
    """

    # Name of the tracked parameter, or "<type>:<name>" for a script-level source.
    parameter_name: str
    # Operation records in the order they were recorded. Each record is a dict
    # with a "type" key ("assignment", "function_call" or "return"), a "line"
    # key, and type-specific keys ("target"/"value" or "function"/"argument").
    operations: list[dict[str, Any]] = field(default_factory=list)
    # Names of the calls the tracked value was passed to or assigned from.
    reaches_calls: list[str] = field(default_factory=list)
    # Rendered "<target> = <expression>" strings the tracked value flowed into.
    reaches_assignments: list[str] = field(default_factory=list)
    # True once the tracked value appears in a return expression.
    reaches_returns: bool = False
    reaches_external: bool = False  # Network, file, subprocess

    def copy(self) -> "FlowPath":
        """Return an independent copy of this flow path.

        The lists are duplicated and so is every operation record, so that
        mutating the copy (or one of its records) never leaks back into the
        path it was copied from.

        Returns:
            A new FlowPath that shares no mutable state with this one.
        """
        return FlowPath(
            parameter_name=self.parameter_name,
            # Records are flat dicts, so a per-record dict() copy is a full copy.
            operations=[dict(op) for op in self.operations],
            reaches_calls=list(self.reaches_calls),
            reaches_assignments=list(self.reaches_assignments),
            reaches_returns=self.reaches_returns,
            reaches_external=self.reaches_external,
        )
56
+
57
+
58
@dataclass
class ForwardFlowFact:
    """State carried along CFG edges by the forward flow analysis.

    Combines the taint/shape environment of all variables with one FlowPath
    per tracked parameter or source.
    """

    # Taint and shape information for every variable seen so far.
    shape_env: ShapeEnvironment = field(default_factory=ShapeEnvironment)
    # Flow path per tracked name (parameter name or "<type>:<name>" source).
    parameter_flows: dict[str, FlowPath] = field(default_factory=dict)

    def copy(self) -> "ForwardFlowFact":
        """Return a fact with a copied environment and copied flow paths."""
        cloned_flows = {}
        for name, path in self.parameter_flows.items():
            cloned_flows[name] = path.copy()
        return ForwardFlowFact(shape_env=self.shape_env.copy(), parameter_flows=cloned_flows)

    def __eq__(self, other: object) -> bool:
        """Compare two facts for fixpoint detection.

        Two facts are equal when their environments are equal, they track the
        same names, and each pair of flow paths agrees on the number of
        operations, the set of reached calls and both boolean flags.
        Assignment lists and the content of the operations are not compared.
        """
        if not isinstance(other, ForwardFlowFact):
            return False
        if self.shape_env != other.shape_env:
            return False
        # dict key views compare like sets, so ordering is irrelevant here.
        if self.parameter_flows.keys() != other.parameter_flows.keys():
            return False

        pairs = ((mine, other.parameter_flows[name]) for name, mine in self.parameter_flows.items())
        return all(
            len(mine.operations) == len(theirs.operations)
            and set(mine.reaches_calls) == set(theirs.reaches_calls)
            and mine.reaches_returns == theirs.reaches_returns
            and mine.reaches_external == theirs.reaches_external
            for mine, theirs in pairs
        )
97
+
98
+
99
+ class ForwardDataflowAnalysis(DataFlowAnalyzer[ForwardFlowFact]):
100
+ """Track all forward flows from function parameters and script-level sources using CFG.
101
+
102
+ Uses proper control flow graph and fixpoint analysis to accurately
103
+ track how parameters flow through branches, loops, and function calls.
104
+ Also detects script-level sources (credential files, env vars) and tracks
105
+ their flows to sinks (network, eval, subprocess).
106
+ """
107
+
108
def __init__(self, parser: PythonParser, parameter_names: list[str] | None = None, detect_sources: bool = True):
    """Set up a forward flow analysis over the code held by ``parser``.

    Args:
        parser: Python parser instance handed on to the base analyzer
        parameter_names: Function parameters to track; None or an empty list
            means only script-level sources are tracked
        detect_sources: When True, script-level sources (credential files,
            environment variables) are discovered before the analysis runs
    """
    super().__init__(parser)
    self.detect_sources = detect_sources
    self.parameter_names = parameter_names if parameter_names else []
    # Both containers are refilled on every analyze_forward_flows() run.
    self.script_sources: list[str] = []  # Detected script-level sources
    self.all_flows: list[FlowPath] = []
121
+
122
def analyze_forward_flows(self) -> list[FlowPath]:
    """Run forward flow analysis from parameters and script-level sources.

    Builds the CFG, optionally discovers script-level sources, seeds an
    initial fact in which every tracked parameter and source is tainted,
    runs the fixpoint analysis and gathers the flow paths found at the CFG
    exit.

    Returns:
        List of all flow paths from parameters and sources. This is the
        analyzer's own ``all_flows`` list, which is reset on every call.
    """
    # Reset state so repeated calls on one instance do not accumulate results.
    self.all_flows.clear()
    self.script_sources.clear()

    self.build_cfg()

    if self.detect_sources:
        self._detect_script_sources()

    initial_fact = ForwardFlowFact()

    # Function parameters are tainted under their own name.
    for param_name in self.parameter_names:
        taint = Taint(status=TaintStatus.TAINTED)
        taint.add_label(f"param:{param_name}")
        initial_fact.shape_env.set_taint(param_name, taint)
        initial_fact.parameter_flows[param_name] = FlowPath(parameter_name=param_name)

    # Script-level sources have no variable of their own, so each one gets a
    # synthetic variable. The index keeps the names unique; the previous
    # len(self.all_flows) suffix was always 0 here because all_flows has just
    # been cleared, so sources of the same type overwrote each other.
    for index, source_name in enumerate(self.script_sources):
        source_type = self._get_source_type(source_name)
        taint = Taint(status=TaintStatus.TAINTED)
        # source_name already has the form "<type>:<name>", so this produces
        # the same "source:<type>:<name>" label that _transfer_python attaches
        # when it sees an assignment from a source call.
        taint.add_label(f"source:{source_name}")
        var_name = f"__source_{source_type}_{index}"
        initial_fact.shape_env.set_taint(var_name, taint)
        initial_fact.parameter_flows[source_name] = FlowPath(parameter_name=source_name)

    self.analyze(initial_fact, forward=True)

    self._collect_flows()

    return self.all_flows
165
+
166
def _detect_script_sources(self) -> None:
    """Detect script-level sources (credential files, env vars).

    Walks the parser's AST and appends a "<type>:<name>" entry to
    ``self.script_sources`` (without duplicates) for:

    * any call with a string argument containing a known credential path,
    * ``os.getenv`` / ``os.environ.get`` / ``getenv`` calls whose string
      argument looks like a secret name (case-insensitive substring match),
    * ``for ... in os.environ.items()`` loops,
    * assignments whose right-hand side is ``os.environ``.

    Does nothing when the parser exposes no ``tree``.
    """
    tree = getattr(self.parser, "tree", None)
    if not tree:
        return

    credential_markers = (".aws/credentials", ".ssh/id_rsa", ".ssh/id_dsa", ".kube/config", ".netrc")
    env_var_markers = ("API_KEY", "SECRET", "TOKEN", "PASSWORD", "CREDENTIAL")
    env_getters = ("os.getenv", "os.environ.get", "getenv")

    def remember(source_name: str) -> None:
        # Keep first-seen order while avoiding duplicate entries.
        if source_name not in self.script_sources:
            self.script_sources.append(source_name)

    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            string_args = [
                arg.value
                for arg in node.args
                if isinstance(arg, ast.Constant) and isinstance(arg.value, str)
            ]

            # Credential paths count regardless of which function receives
            # them, which already covers os.path.expanduser(...).
            for text in string_args:
                if any(marker in text for marker in credential_markers):
                    remember(f"credential_file:{text}")

            if self._get_call_name(node) in env_getters:
                for text in string_args:
                    if any(marker in text.upper() for marker in env_var_markers):
                        remember(f"env_var:{text}")

        elif isinstance(node, ast.For):
            iterated = node.iter
            if (
                isinstance(iterated, ast.Call)
                and isinstance(iterated.func, ast.Attribute)
                and iterated.func.attr == "items"
                and self._get_attribute_name(iterated.func.value) == "os.environ"
            ):
                remember("env_var:os.environ (all)")

        elif isinstance(node, ast.Assign):
            if isinstance(node.value, ast.Attribute) and self._get_attribute_name(node.value) == "os.environ":
                remember("env_var:os.environ (assignment)")
227
+
228
+ def _get_source_type(self, source_name: str) -> str:
229
+ """Get source type from source name."""
230
+ if source_name.startswith("credential_file:"):
231
+ return "credential_file"
232
+ elif source_name.startswith("env_var:"):
233
+ return "env_var"
234
+ return "unknown"
235
+
236
+ def _get_attribute_name(self, node: ast.Attribute) -> str:
237
+ """Get full attribute name like 'os.environ'."""
238
+ parts = []
239
+ current = node
240
+
241
+ while isinstance(current, ast.Attribute):
242
+ parts.append(current.attr)
243
+ current = current.value
244
+
245
+ if isinstance(current, ast.Name):
246
+ parts.append(current.id)
247
+
248
+ return ".".join(reversed(parts))
249
+
250
def transfer(self, node: CFGNode, in_fact: ForwardFlowFact) -> ForwardFlowFact:
    """Apply the effect of one CFG node to an incoming fact.

    The incoming fact is never modified: a copy is updated and returned.

    Args:
        node: CFG node whose ``ast_node`` is interpreted
        in_fact: Flow fact holding before the node

    Returns:
        Flow fact holding after the node
    """
    result = in_fact.copy()
    self._transfer_python(node.ast_node, result)
    return result
265
+
266
def _transfer_python(self, node: ast.AST, fact: ForwardFlowFact) -> None:
    """Update ``fact`` in place with the effect of one Python AST node.

    Three node kinds are interpreted; every other node leaves ``fact``
    untouched:

    * ``ast.Assign`` with plain-name targets: the target takes the taint of
      the right-hand side (or a fresh source taint when the right-hand side
      is a source call, which is also registered as a tracked source), and
      the assignment is recorded on every tracked flow it involves. An
      untainted right-hand side clears the target's taint.
    * ``ast.Call``: every tainted positional or keyword argument is recorded
      on the flows it involves.
    * ``ast.Return``: a tainted return value marks the involved flows.

    Operations are deduplicated on (type, target, value, function, argument,
    line), so re-running the transfer during fixpoint iteration does not grow
    the flow paths.

    Args:
        node: Python AST node
        fact: Flow fact to update
    """
    line = getattr(node, "lineno", 0)

    def op_identity(op: dict) -> tuple:
        # Missing keys compare as None, so records of different kinds that
        # carry different keys can share one key shape.
        return tuple(op.get(key) for key in ("type", "target", "value", "function", "argument", "line"))

    def record_operation(flow: FlowPath, op: dict) -> None:
        identity = op_identity(op)
        if all(op_identity(seen) != identity for seen in flow.operations):
            flow.operations.append(op)

    if isinstance(node, ast.Assign):
        for target in node.targets:
            # Only simple names are tracked; tuple/attribute/subscript
            # targets are ignored.
            if not isinstance(target, ast.Name):
                continue

            rhs_taint = self._eval_expr_taint(node.value, fact)
            rhs_is_call = isinstance(node.value, ast.Call)

            # A source call (os.getenv, credential file access, ...) taints
            # the target no matter what its arguments are.
            if rhs_is_call:
                source_info = self._check_source_call(node.value)
                if source_info:
                    source_type, source_name = source_info
                    rhs_taint = Taint(status=TaintStatus.TAINTED)
                    rhs_taint.add_label(f"source:{source_type}:{source_name}")
                    full_source_name = f"{source_type}:{source_name}"
                    if full_source_name not in self.script_sources:
                        self.script_sources.append(full_source_name)
                    if full_source_name not in fact.parameter_flows:
                        fact.parameter_flows[full_source_name] = FlowPath(parameter_name=full_source_name)

            if not rhs_taint.is_tainted():
                # Overwriting with a clean value removes earlier taint.
                fact.shape_env.set_taint(target.id, Taint(status=TaintStatus.UNTAINTED))
                continue

            fact.shape_env.set_taint(target.id, rhs_taint)

            rhs_text = self._unparse_safe(node.value)
            assignment_str = f"{target.id} = {rhs_text}"
            call_name = self._get_call_name(node.value) if rhs_is_call else None

            for tracked_name in self.parameter_names + self.script_sources:
                involved = self._expr_uses_var(node.value, tracked_name, fact) or self._is_source_assignment(
                    node.value, tracked_name
                )
                if not involved:
                    continue
                flow = fact.parameter_flows.get(tracked_name)
                if flow is None:
                    continue

                if assignment_str not in flow.reaches_assignments:
                    flow.reaches_assignments.append(assignment_str)

                record_operation(
                    flow,
                    {
                        "type": "assignment",
                        "target": target.id,
                        "value": rhs_text,
                        "line": line,
                    },
                )

                # The right-hand side call itself counts as reached.
                if call_name is not None:
                    if call_name not in flow.reaches_calls:
                        flow.reaches_calls.append(call_name)
                    if self._is_external_operation(call_name):
                        flow.reaches_external = True

    elif isinstance(node, ast.Call):
        call_name = self._get_call_name(node)
        is_external = self._is_external_operation(call_name)

        # Keyword values carry data into the call just like positional
        # arguments do (e.g. requests.post(url, data=secret)).
        arguments = list(node.args) + [keyword.value for keyword in node.keywords]

        for arg in arguments:
            if not self._eval_expr_taint(arg, fact).is_tainted():
                continue
            arg_text = self._unparse_safe(arg)

            for tracked_name in self.parameter_names + self.script_sources:
                if not self._expr_uses_var(arg, tracked_name, fact):
                    continue
                flow = fact.parameter_flows.get(tracked_name)
                if flow is None:
                    continue

                if call_name not in flow.reaches_calls:
                    flow.reaches_calls.append(call_name)

                record_operation(
                    flow,
                    {
                        "type": "function_call",
                        "function": call_name,
                        "argument": arg_text,
                        "line": line,
                    },
                )

                if is_external:
                    flow.reaches_external = True

    elif isinstance(node, ast.Return):
        if not node.value:
            return
        if not self._eval_expr_taint(node.value, fact).is_tainted():
            return
        value_text = self._unparse_safe(node.value)

        for tracked_name in self.parameter_names + self.script_sources:
            if not self._expr_uses_var(node.value, tracked_name, fact):
                continue
            flow = fact.parameter_flows.get(tracked_name)
            if flow is None:
                continue
            flow.reaches_returns = True
            # Deduplicated like the other operation kinds.
            record_operation(flow, {"type": "return", "value": value_text, "line": line})
419
+
420
def _eval_expr_taint(self, expr: ast.AST, fact: ForwardFlowFact) -> Taint:
    """Evaluate taint of an expression.

    Rules by expression kind:

    * name: taint stored for that variable
    * attribute / subscript on a plain name: field / element taint of that
      variable's shape; on anything else, the taint of the inner expression
    * call: merge of all positional arguments, all keyword argument values
      and, for method calls, the receiver (so ``secret.strip()`` and
      ``f(data=secret)`` stay tainted)
    * binary operation: merge of both operands
    * f-string: merge of all interpolated values
    * list / tuple / set literal: merge of all elements
    * anything else: untainted

    Args:
        expr: Expression node
        fact: Current flow fact

    Returns:
        Taint of the expression
    """
    if isinstance(expr, ast.Name):
        return fact.shape_env.get_taint(expr.id)

    if isinstance(expr, ast.Attribute):
        if isinstance(expr.value, ast.Name):
            return fact.shape_env.get(expr.value.id).get_field(expr.attr)
        return self._eval_expr_taint(expr.value, fact)

    if isinstance(expr, ast.Subscript):
        if isinstance(expr.value, ast.Name):
            return fact.shape_env.get(expr.value.id).get_element()
        return self._eval_expr_taint(expr.value, fact)

    if isinstance(expr, ast.BinOp):
        left_taint = self._eval_expr_taint(expr.left, fact)
        right_taint = self._eval_expr_taint(expr.right, fact)
        return left_taint.merge(right_taint)

    # The remaining kinds all reduce to "merge the taint of these parts".
    if isinstance(expr, ast.Call):
        parts = list(expr.args) + [keyword.value for keyword in expr.keywords]
        if isinstance(expr.func, ast.Attribute):
            # Method call: the object the method is invoked on contributes.
            parts.append(expr.func.value)
    elif isinstance(expr, ast.JoinedStr):
        parts = [value.value for value in expr.values if isinstance(value, ast.FormattedValue)]
    elif isinstance(expr, (ast.List, ast.Tuple, ast.Set)):
        parts = expr.elts
    else:
        return Taint(status=TaintStatus.UNTAINTED)

    result = Taint(status=TaintStatus.UNTAINTED)
    for part in parts:
        result = result.merge(self._eval_expr_taint(part, fact))
    return result
480
+
481
def _expr_uses_var(self, expr: ast.AST, var_name: str, fact: ForwardFlowFact) -> bool:
    """Check if expression uses a tracked value (directly or transitively).

    A name inside ``expr`` counts as a use when any of these holds:

    * it is literally ``var_name``;
    * its taint carries the label of the tracked value. Parameters are
      labelled ``param:<name>``; script-level sources are tracked under
      ``<type>:<name>`` and labelled ``source:<type>:<name>``, so both
      ``param:<var_name>`` and ``source:<var_name>`` are accepted;
    * its taint shares a label with the current taint of ``var_name``;
    * one of its object fields, or its array element shape, carries the
      label of the tracked value.

    Args:
        expr: Expression node
        var_name: Tracked parameter name or "<type>:<name>" source name
        fact: Current flow fact

    Returns:
        True if expression uses the tracked value
    """
    target_taint = fact.shape_env.get(var_name).get_taint()
    target_labels = target_taint.labels if target_taint.is_tainted() else set()
    expected_labels = {f"param:{var_name}", f"source:{var_name}"}

    for node in ast.walk(expr):
        if not isinstance(node, ast.Name):
            continue
        if node.id == var_name:
            return True

        node_shape = fact.shape_env.get(node.id)
        node_taint = node_shape.get_taint()

        if node_taint.is_tainted():
            if node_taint.labels & expected_labels:
                return True
            if target_labels and node_taint.labels & target_labels:
                return True

        # Taint may live on a field or on the elements rather than on the
        # variable as a whole.
        if node_shape.is_object:
            for field_shape in node_shape.fields.values():
                if field_shape.get_taint().labels & expected_labels:
                    return True

        if node_shape.is_array and node_shape.element_shape:
            if node_shape.element_shape.get_taint().labels & expected_labels:
                return True

    return False
528
+
529
+ def _get_call_name(self, node: ast.Call) -> str:
530
+ """Get function call name.
531
+
532
+ Args:
533
+ node: Call node
534
+
535
+ Returns:
536
+ Function name
537
+ """
538
+ if isinstance(node.func, ast.Name):
539
+ return node.func.id
540
+ elif isinstance(node.func, ast.Attribute):
541
+ parts = []
542
+ current = node.func
543
+ while isinstance(current, ast.Attribute):
544
+ parts.append(current.attr)
545
+ current = current.value
546
+ if isinstance(current, ast.Name):
547
+ parts.append(current.id)
548
+ return ".".join(reversed(parts))
549
+ try:
550
+ return ast.unparse(node.func) if hasattr(ast, "unparse") else str(node.func)
551
+ except (AttributeError, TypeError, ValueError):
552
+ return "<unknown>"
553
+
554
+ def _unparse_safe(self, node: ast.AST) -> str:
555
+ """Safely unparse AST node."""
556
+ try:
557
+ if hasattr(ast, "unparse"):
558
+ return ast.unparse(node)
559
+ return str(node)
560
+ except (AttributeError, TypeError, ValueError):
561
+ return "<unparseable>"
562
+
563
+ def _is_external_operation(self, call_name: str) -> bool:
564
+ """Check if call is an external operation (network, file, subprocess).
565
+
566
+ Args:
567
+ call_name: Function call name
568
+
569
+ Returns:
570
+ True if external operation
571
+ """
572
+ external_patterns = [
573
+ "requests",
574
+ "urllib",
575
+ "http",
576
+ "socket",
577
+ "post",
578
+ "get",
579
+ "fetch",
580
+ "open",
581
+ "read",
582
+ "write",
583
+ "file",
584
+ "subprocess",
585
+ "os.system",
586
+ "os.popen",
587
+ "exec",
588
+ "eval",
589
+ ]
590
+ call_lower = call_name.lower()
591
+ return any(pattern in call_lower for pattern in external_patterns)
592
+
593
+ def _check_source_call(self, call_node: ast.Call) -> tuple[str, str] | None:
594
+ """Check if a call is a source (credential file, env var).
595
+
596
+ Returns:
597
+ (source_type, source_name) if source, None otherwise
598
+ """
599
+ call_name = self._get_call_name(call_node)
600
+ CREDENTIAL_FILES = [".aws/credentials", ".ssh/id_rsa", ".ssh/id_dsa", ".kube/config", ".netrc"]
601
+ ENV_VAR_PATTERNS = ["API_KEY", "SECRET", "TOKEN", "PASSWORD", "CREDENTIAL"]
602
+
603
+ # Check for env var access
604
+ if call_name in ["os.getenv", "os.environ.get", "getenv"]:
605
+ for arg in call_node.args:
606
+ if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
607
+ if any(pattern in arg.value.upper() for pattern in ENV_VAR_PATTERNS):
608
+ return ("env_var", arg.value)
609
+
610
+ # Check for credential file access
611
+ for arg in call_node.args:
612
+ if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
613
+ if any(cred in arg.value for cred in CREDENTIAL_FILES):
614
+ return ("credential_file", arg.value)
615
+
616
+ # Check for os.path.expanduser with credential paths
617
+ if call_name == "os.path.expanduser":
618
+ for arg in call_node.args:
619
+ if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
620
+ if any(cred in arg.value for cred in CREDENTIAL_FILES):
621
+ return ("credential_file", arg.value)
622
+
623
+ return None
624
+
625
+ def _is_source_assignment(self, expr: ast.AST, source_name: str) -> bool:
626
+ """Check if expression is an assignment from a source."""
627
+ if isinstance(expr, ast.Call):
628
+ source_info = self._check_source_call(expr)
629
+ if source_info:
630
+ source_type, name = source_info
631
+ full_name = f"{source_type}:{name}"
632
+ return full_name == source_name
633
+ return False
634
+
635
+ def _collect_flows(self) -> None:
636
+ """Collect all flows from analysis results."""
637
+ if not self.cfg or not self.cfg.exit:
638
+ return
639
+
640
+ # Get flows at exit node
641
+ exit_fact = self.out_facts.get(self.cfg.exit.id)
642
+ if exit_fact:
643
+ for param_name, flow in exit_fact.parameter_flows.items():
644
+ self.all_flows.append(flow)
645
+
646
def merge(self, facts: list[ForwardFlowFact]) -> ForwardFlowFact:
    """Merge multiple flow facts.

    The shape environments are merged pairwise. Flow paths tracked under the
    same name are unioned: operations are deduplicated on (type, target,
    value, function, argument, line), reached calls and assignments are
    deduplicated keeping first-seen order, and the boolean flags are OR-ed.

    Args:
        facts: List of facts to merge

    Returns:
        Merged fact. An empty list yields a fresh empty fact and a single
        fact is returned as is (not copied); in every other case the result
        is a new fact that shares no flow paths with the inputs.
    """
    if not facts:
        return ForwardFlowFact()

    if len(facts) == 1:
        return facts[0]

    def op_identity(op: dict) -> tuple:
        return tuple(op.get(key) for key in ("type", "target", "value", "function", "argument", "line"))

    result = facts[0].copy()

    for fact in facts[1:]:
        result.shape_env = result.shape_env.merge(fact.shape_env)

        for param_name, flow in fact.parameter_flows.items():
            merged = result.parameter_flows.get(param_name)
            if merged is None:
                # Copy so that later merges into the result cannot mutate
                # the flow path owned by the input fact.
                result.parameter_flows[param_name] = flow.copy()
                continue

            seen = {op_identity(op) for op in merged.operations}
            for op in flow.operations:
                identity = op_identity(op)
                if identity not in seen:
                    merged.operations.append(op)
                    seen.add(identity)

            # dict.fromkeys removes duplicates while keeping first-seen
            # order, so the merged lists are stable from run to run.
            merged.reaches_calls = list(dict.fromkeys(merged.reaches_calls + flow.reaches_calls))
            merged.reaches_assignments = list(
                dict.fromkeys(merged.reaches_assignments + flow.reaches_assignments)
            )

            # Boolean flags use OR (idempotent)
            merged.reaches_returns = merged.reaches_returns or flow.reaches_returns
            merged.reaches_external = merged.reaches_external or flow.reaches_external

    return result