cisco-ai-skill-scanner 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cisco_ai_skill_scanner-1.0.0.dist-info/METADATA +253 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/RECORD +100 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/WHEEL +4 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/entry_points.txt +4 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/licenses/LICENSE +17 -0
- skillanalyzer/__init__.py +45 -0
- skillanalyzer/_version.py +34 -0
- skillanalyzer/api/__init__.py +25 -0
- skillanalyzer/api/api.py +34 -0
- skillanalyzer/api/api_cli.py +78 -0
- skillanalyzer/api/api_server.py +634 -0
- skillanalyzer/api/router.py +527 -0
- skillanalyzer/cli/__init__.py +25 -0
- skillanalyzer/cli/cli.py +816 -0
- skillanalyzer/config/__init__.py +26 -0
- skillanalyzer/config/config.py +149 -0
- skillanalyzer/config/config_parser.py +122 -0
- skillanalyzer/config/constants.py +85 -0
- skillanalyzer/core/__init__.py +24 -0
- skillanalyzer/core/analyzers/__init__.py +75 -0
- skillanalyzer/core/analyzers/aidefense_analyzer.py +872 -0
- skillanalyzer/core/analyzers/base.py +53 -0
- skillanalyzer/core/analyzers/behavioral/__init__.py +30 -0
- skillanalyzer/core/analyzers/behavioral/alignment/__init__.py +45 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_llm_client.py +240 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_orchestrator.py +216 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_prompt_builder.py +422 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_response_validator.py +136 -0
- skillanalyzer/core/analyzers/behavioral/alignment/threat_vulnerability_classifier.py +198 -0
- skillanalyzer/core/analyzers/behavioral_analyzer.py +453 -0
- skillanalyzer/core/analyzers/cross_skill_analyzer.py +490 -0
- skillanalyzer/core/analyzers/llm_analyzer.py +440 -0
- skillanalyzer/core/analyzers/llm_prompt_builder.py +270 -0
- skillanalyzer/core/analyzers/llm_provider_config.py +215 -0
- skillanalyzer/core/analyzers/llm_request_handler.py +284 -0
- skillanalyzer/core/analyzers/llm_response_parser.py +81 -0
- skillanalyzer/core/analyzers/meta_analyzer.py +845 -0
- skillanalyzer/core/analyzers/static.py +1105 -0
- skillanalyzer/core/analyzers/trigger_analyzer.py +341 -0
- skillanalyzer/core/analyzers/virustotal_analyzer.py +463 -0
- skillanalyzer/core/exceptions.py +77 -0
- skillanalyzer/core/loader.py +377 -0
- skillanalyzer/core/models.py +300 -0
- skillanalyzer/core/reporters/__init__.py +26 -0
- skillanalyzer/core/reporters/json_reporter.py +65 -0
- skillanalyzer/core/reporters/markdown_reporter.py +209 -0
- skillanalyzer/core/reporters/sarif_reporter.py +246 -0
- skillanalyzer/core/reporters/table_reporter.py +195 -0
- skillanalyzer/core/rules/__init__.py +19 -0
- skillanalyzer/core/rules/patterns.py +165 -0
- skillanalyzer/core/rules/yara_scanner.py +157 -0
- skillanalyzer/core/scanner.py +437 -0
- skillanalyzer/core/static_analysis/__init__.py +27 -0
- skillanalyzer/core/static_analysis/cfg/__init__.py +21 -0
- skillanalyzer/core/static_analysis/cfg/builder.py +439 -0
- skillanalyzer/core/static_analysis/context_extractor.py +742 -0
- skillanalyzer/core/static_analysis/dataflow/__init__.py +25 -0
- skillanalyzer/core/static_analysis/dataflow/forward_analysis.py +715 -0
- skillanalyzer/core/static_analysis/interprocedural/__init__.py +21 -0
- skillanalyzer/core/static_analysis/interprocedural/call_graph_analyzer.py +406 -0
- skillanalyzer/core/static_analysis/interprocedural/cross_file_analyzer.py +190 -0
- skillanalyzer/core/static_analysis/parser/__init__.py +21 -0
- skillanalyzer/core/static_analysis/parser/python_parser.py +380 -0
- skillanalyzer/core/static_analysis/semantic/__init__.py +28 -0
- skillanalyzer/core/static_analysis/semantic/name_resolver.py +206 -0
- skillanalyzer/core/static_analysis/semantic/type_analyzer.py +200 -0
- skillanalyzer/core/static_analysis/taint/__init__.py +21 -0
- skillanalyzer/core/static_analysis/taint/tracker.py +252 -0
- skillanalyzer/core/static_analysis/types/__init__.py +36 -0
- skillanalyzer/data/__init__.py +30 -0
- skillanalyzer/data/prompts/boilerplate_protection_rule_prompt.md +26 -0
- skillanalyzer/data/prompts/code_alignment_threat_analysis_prompt.md +901 -0
- skillanalyzer/data/prompts/llm_response_schema.json +71 -0
- skillanalyzer/data/prompts/skill_meta_analysis_prompt.md +303 -0
- skillanalyzer/data/prompts/skill_threat_analysis_prompt.md +263 -0
- skillanalyzer/data/prompts/unified_response_schema.md +97 -0
- skillanalyzer/data/rules/signatures.yaml +440 -0
- skillanalyzer/data/yara_rules/autonomy_abuse.yara +66 -0
- skillanalyzer/data/yara_rules/code_execution.yara +61 -0
- skillanalyzer/data/yara_rules/coercive_injection.yara +115 -0
- skillanalyzer/data/yara_rules/command_injection.yara +54 -0
- skillanalyzer/data/yara_rules/credential_harvesting.yara +115 -0
- skillanalyzer/data/yara_rules/prompt_injection.yara +71 -0
- skillanalyzer/data/yara_rules/script_injection.yara +83 -0
- skillanalyzer/data/yara_rules/skill_discovery_abuse.yara +57 -0
- skillanalyzer/data/yara_rules/sql_injection.yara +73 -0
- skillanalyzer/data/yara_rules/system_manipulation.yara +65 -0
- skillanalyzer/data/yara_rules/tool_chaining_abuse.yara +60 -0
- skillanalyzer/data/yara_rules/transitive_trust_abuse.yara +73 -0
- skillanalyzer/data/yara_rules/unicode_steganography.yara +65 -0
- skillanalyzer/hooks/__init__.py +21 -0
- skillanalyzer/hooks/pre_commit.py +450 -0
- skillanalyzer/threats/__init__.py +25 -0
- skillanalyzer/threats/threats.py +480 -0
- skillanalyzer/utils/__init__.py +28 -0
- skillanalyzer/utils/command_utils.py +129 -0
- skillanalyzer/utils/di_container.py +154 -0
- skillanalyzer/utils/file_utils.py +86 -0
- skillanalyzer/utils/logging_config.py +96 -0
- skillanalyzer/utils/logging_utils.py +71 -0
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
# Copyright 2026 Cisco Systems, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
16
|
+
|
|
17
|
+
"""Forward dataflow analysis using Control Flow Graph.
|
|
18
|
+
|
|
19
|
+
Tracks parameter flows from function entry points through all control structures
|
|
20
|
+
using proper CFG-based fixpoint analysis. This provides accurate flow tracking
|
|
21
|
+
through branches, loops, and function calls.
|
|
22
|
+
|
|
23
|
+
Replaces the simple AST walker approach with proper dataflow analysis.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import ast
|
|
27
|
+
from dataclasses import dataclass, field
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
from ..cfg.builder import CFGNode, DataFlowAnalyzer
|
|
31
|
+
from ..parser.python_parser import PythonParser
|
|
32
|
+
from ..taint.tracker import ShapeEnvironment, Taint, TaintStatus
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class FlowPath:
    """Everything recorded about where one tracked parameter or source flows."""

    # Name of the tracked parameter (or script-level source) this path describes.
    parameter_name: str
    # Operation records, each a dict carrying at least a "type" and a "line" key.
    operations: list[dict[str, Any]] = field(default_factory=list)
    # Names of calls the tracked value was observed flowing into.
    reaches_calls: list[str] = field(default_factory=list)
    # Textual "target = value" forms of assignments fed by the tracked value.
    reaches_assignments: list[str] = field(default_factory=list)
    # True once the tracked value has been seen in a return statement.
    reaches_returns: bool = False
    reaches_external: bool = False  # Network, file, subprocess

    def copy(self) -> "FlowPath":
        """Create an independent copy of the flow path.

        The lists are duplicated and every operation record is copied as well,
        so changing a record on the copy cannot alter the original (the
        previous implementation shared the record dicts between both objects).

        Returns:
            A new FlowPath with equal content and no shared containers.
        """
        return FlowPath(
            parameter_name=self.parameter_name,
            operations=[dict(op) for op in self.operations],
            reaches_calls=list(self.reaches_calls),
            reaches_assignments=list(self.reaches_assignments),
            reaches_returns=self.reaches_returns,
            reaches_external=self.reaches_external,
        )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class ForwardFlowFact:
    """Dataflow fact: a shape/taint environment plus one FlowPath per tracked name."""

    # Taint and shape information for the variables seen so far.
    shape_env: ShapeEnvironment = field(default_factory=ShapeEnvironment)
    # Flow record for every tracked parameter or source, keyed by its name.
    parameter_flows: dict[str, FlowPath] = field(default_factory=dict)

    def copy(self) -> "ForwardFlowFact":
        """Return a new fact holding copies of the environment and of every flow path."""
        cloned_flows: dict[str, FlowPath] = {}
        for name, path in self.parameter_flows.items():
            cloned_flows[name] = path.copy()
        return ForwardFlowFact(shape_env=self.shape_env.copy(), parameter_flows=cloned_flows)

    def __eq__(self, other: object) -> bool:
        """Compare two facts for fixpoint detection.

        Facts are equal when their environments compare equal, they track the
        same names, and each pair of flow paths agrees on the number of
        operations, the set of reached calls and both boolean flags. The
        recorded assignments are not part of the comparison.
        """
        if not isinstance(other, ForwardFlowFact):
            return False
        if self.shape_env != other.shape_env:
            return False
        if set(self.parameter_flows.keys()) != set(other.parameter_flows.keys()):
            return False

        for name, mine in self.parameter_flows.items():
            theirs = other.parameter_flows[name]
            unchanged = (
                len(mine.operations) == len(theirs.operations)
                and set(mine.reaches_calls) == set(theirs.reaches_calls)
                and mine.reaches_returns == theirs.reaches_returns
                and mine.reaches_external == theirs.reaches_external
            )
            if not unchanged:
                return False
        return True
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class ForwardDataflowAnalysis(DataFlowAnalyzer[ForwardFlowFact]):
|
|
100
|
+
"""Track all forward flows from function parameters and script-level sources using CFG.
|
|
101
|
+
|
|
102
|
+
Uses proper control flow graph and fixpoint analysis to accurately
|
|
103
|
+
track how parameters flow through branches, loops, and function calls.
|
|
104
|
+
Also detects script-level sources (credential files, env vars) and tracks
|
|
105
|
+
their flows to sinks (network, eval, subprocess).
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
def __init__(self, parser: PythonParser, parameter_names: list[str] | None = None, detect_sources: bool = True):
    """Set up the analysis state.

    Args:
        parser: Python parser instance, forwarded to the base-class initialiser
        parameter_names: Function parameters to track; None (or an empty list) tracks none
        detect_sources: When true, script-level sources (credential files, env vars) are detected too
    """
    super().__init__(parser)
    self.detect_sources = detect_sources
    self.parameter_names = parameter_names if parameter_names else []
    # Names of the script-level sources found so far.
    self.script_sources: list[str] = []
    # Result list filled at the end of analyze_forward_flows().
    self.all_flows: list[FlowPath] = []
|
|
121
|
+
|
|
122
|
+
def analyze_forward_flows(self) -> list[FlowPath]:
    """Run forward flow analysis from parameters and script-level sources.

    Builds the CFG, optionally detects script-level sources, seeds an initial
    fact in which every tracked parameter and source is tainted, runs the
    forward analysis and collects the flow paths found at the exit node.

    Returns:
        List of all flow paths from parameters and sources
    """
    # Start from a clean slate so a reused instance does not accumulate results.
    self.all_flows.clear()
    self.script_sources.clear()

    self.build_cfg()

    if self.detect_sources:
        self._detect_script_sources()

    initial_fact = ForwardFlowFact()

    # Every tracked parameter starts tainted with its own "param:<name>" label.
    for param_name in self.parameter_names:
        taint = Taint(status=TaintStatus.TAINTED)
        taint.add_label(f"param:{param_name}")
        initial_fact.shape_env.set_taint(param_name, taint)
        initial_fact.parameter_flows[param_name] = FlowPath(parameter_name=param_name)

    # Script-level sources (credential files, env vars) get a synthetic variable each.
    for index, source_name in enumerate(self.script_sources):
        source_type = self._get_source_type(source_name)
        taint = Taint(status=TaintStatus.TAINTED)
        # source_name already has the form "<type>:<detail>", so the label matches
        # the "source:<type>:<detail>" labels attached during transfer.
        taint.add_label(f"source:{source_name}")
        # The running index keeps synthetic names distinct. The former suffix,
        # len(self.all_flows), is always 0 here because all_flows was just
        # cleared, so sources of one type all shared a single variable name.
        var_name = f"__source_{source_type}_{index}"
        initial_fact.shape_env.set_taint(var_name, taint)
        initial_fact.parameter_flows[source_name] = FlowPath(parameter_name=source_name)

    self.analyze(initial_fact, forward=True)

    self._collect_flows()

    return self.all_flows
|
|
165
|
+
|
|
166
|
+
def _detect_script_sources(self) -> None:
|
|
167
|
+
"""Detect script-level sources (credential files, env vars)."""
|
|
168
|
+
tree = getattr(self.parser, "tree", None)
|
|
169
|
+
if not tree:
|
|
170
|
+
return
|
|
171
|
+
|
|
172
|
+
CREDENTIAL_FILES = [".aws/credentials", ".ssh/id_rsa", ".ssh/id_dsa", ".kube/config", ".netrc"]
|
|
173
|
+
ENV_VAR_PATTERNS = ["API_KEY", "SECRET", "TOKEN", "PASSWORD", "CREDENTIAL"]
|
|
174
|
+
|
|
175
|
+
tree = getattr(self.parser, "tree", None)
|
|
176
|
+
if not tree:
|
|
177
|
+
return
|
|
178
|
+
|
|
179
|
+
for node in ast.walk(tree):
|
|
180
|
+
if isinstance(node, ast.Call):
|
|
181
|
+
call_name = self._get_call_name(node)
|
|
182
|
+
|
|
183
|
+
# Check for credential file access
|
|
184
|
+
for arg in node.args:
|
|
185
|
+
if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
|
|
186
|
+
# Credential files
|
|
187
|
+
if any(cred in arg.value for cred in CREDENTIAL_FILES):
|
|
188
|
+
source_name = f"credential_file:{arg.value}"
|
|
189
|
+
if source_name not in self.script_sources:
|
|
190
|
+
self.script_sources.append(source_name)
|
|
191
|
+
|
|
192
|
+
# os.path.expanduser with credential paths
|
|
193
|
+
if call_name == "os.path.expanduser":
|
|
194
|
+
if any(cred in arg.value for cred in CREDENTIAL_FILES):
|
|
195
|
+
source_name = f"credential_file:{arg.value}"
|
|
196
|
+
if source_name not in self.script_sources:
|
|
197
|
+
self.script_sources.append(source_name)
|
|
198
|
+
|
|
199
|
+
# Check for env var access
|
|
200
|
+
if call_name in ["os.getenv", "os.environ.get", "getenv"]:
|
|
201
|
+
for arg in node.args:
|
|
202
|
+
if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
|
|
203
|
+
if any(pattern in arg.value.upper() for pattern in ENV_VAR_PATTERNS):
|
|
204
|
+
source_name = f"env_var:{arg.value}"
|
|
205
|
+
if source_name not in self.script_sources:
|
|
206
|
+
self.script_sources.append(source_name)
|
|
207
|
+
|
|
208
|
+
# Check for os.environ.items() iteration
|
|
209
|
+
elif isinstance(node, ast.For):
|
|
210
|
+
if isinstance(node.iter, ast.Call):
|
|
211
|
+
if isinstance(node.iter.func, ast.Attribute):
|
|
212
|
+
if node.iter.func.attr == "items":
|
|
213
|
+
attr_name = self._get_attribute_name(node.iter.func.value)
|
|
214
|
+
if attr_name == "os.environ":
|
|
215
|
+
source_name = "env_var:os.environ (all)"
|
|
216
|
+
if source_name not in self.script_sources:
|
|
217
|
+
self.script_sources.append(source_name)
|
|
218
|
+
|
|
219
|
+
# Check for os.environ assignment
|
|
220
|
+
elif isinstance(node, ast.Assign):
|
|
221
|
+
if isinstance(node.value, ast.Attribute):
|
|
222
|
+
attr_name = self._get_attribute_name(node.value)
|
|
223
|
+
if attr_name == "os.environ":
|
|
224
|
+
source_name = "env_var:os.environ (assignment)"
|
|
225
|
+
if source_name not in self.script_sources:
|
|
226
|
+
self.script_sources.append(source_name)
|
|
227
|
+
|
|
228
|
+
def _get_source_type(self, source_name: str) -> str:
|
|
229
|
+
"""Get source type from source name."""
|
|
230
|
+
if source_name.startswith("credential_file:"):
|
|
231
|
+
return "credential_file"
|
|
232
|
+
elif source_name.startswith("env_var:"):
|
|
233
|
+
return "env_var"
|
|
234
|
+
return "unknown"
|
|
235
|
+
|
|
236
|
+
def _get_attribute_name(self, node: ast.Attribute) -> str:
|
|
237
|
+
"""Get full attribute name like 'os.environ'."""
|
|
238
|
+
parts = []
|
|
239
|
+
current = node
|
|
240
|
+
|
|
241
|
+
while isinstance(current, ast.Attribute):
|
|
242
|
+
parts.append(current.attr)
|
|
243
|
+
current = current.value
|
|
244
|
+
|
|
245
|
+
if isinstance(current, ast.Name):
|
|
246
|
+
parts.append(current.id)
|
|
247
|
+
|
|
248
|
+
return ".".join(reversed(parts))
|
|
249
|
+
|
|
250
|
+
def transfer(self, node: CFGNode, in_fact: ForwardFlowFact) -> ForwardFlowFact:
|
|
251
|
+
"""Transfer function tracking parameter flows.
|
|
252
|
+
|
|
253
|
+
Args:
|
|
254
|
+
node: CFG node
|
|
255
|
+
in_fact: Input flow fact
|
|
256
|
+
|
|
257
|
+
Returns:
|
|
258
|
+
Output flow fact
|
|
259
|
+
"""
|
|
260
|
+
out_fact = in_fact.copy()
|
|
261
|
+
ast_node = node.ast_node
|
|
262
|
+
|
|
263
|
+
self._transfer_python(ast_node, out_fact)
|
|
264
|
+
return out_fact
|
|
265
|
+
|
|
266
|
+
def _transfer_python(self, node: ast.AST, fact: ForwardFlowFact) -> None:
|
|
267
|
+
"""Transfer function for Python nodes.
|
|
268
|
+
|
|
269
|
+
Args:
|
|
270
|
+
node: Python AST node
|
|
271
|
+
fact: Flow fact to update
|
|
272
|
+
"""
|
|
273
|
+
# Track assignments
|
|
274
|
+
if isinstance(node, ast.Assign):
|
|
275
|
+
for target in node.targets:
|
|
276
|
+
if isinstance(target, ast.Name):
|
|
277
|
+
rhs_taint = self._eval_expr_taint(node.value, fact)
|
|
278
|
+
|
|
279
|
+
# Check if RHS is a source call (os.getenv, open with credential file, etc.)
|
|
280
|
+
if isinstance(node.value, ast.Call):
|
|
281
|
+
source_info = self._check_source_call(node.value)
|
|
282
|
+
if source_info:
|
|
283
|
+
source_type, source_name = source_info
|
|
284
|
+
rhs_taint = Taint(status=TaintStatus.TAINTED)
|
|
285
|
+
rhs_taint.add_label(f"source:{source_type}:{source_name}")
|
|
286
|
+
# Add to script sources if not already there
|
|
287
|
+
full_source_name = f"{source_type}:{source_name}"
|
|
288
|
+
if full_source_name not in self.script_sources:
|
|
289
|
+
self.script_sources.append(full_source_name)
|
|
290
|
+
if full_source_name not in fact.parameter_flows:
|
|
291
|
+
fact.parameter_flows[full_source_name] = FlowPath(parameter_name=full_source_name)
|
|
292
|
+
|
|
293
|
+
if rhs_taint.is_tainted():
|
|
294
|
+
fact.shape_env.set_taint(target.id, rhs_taint)
|
|
295
|
+
|
|
296
|
+
# Track which parameters/sources flow here
|
|
297
|
+
all_tracked = self.parameter_names + self.script_sources
|
|
298
|
+
for tracked_name in all_tracked:
|
|
299
|
+
if self._expr_uses_var(node.value, tracked_name, fact) or self._is_source_assignment(
|
|
300
|
+
node.value, tracked_name
|
|
301
|
+
):
|
|
302
|
+
if tracked_name in fact.parameter_flows:
|
|
303
|
+
flow = fact.parameter_flows[tracked_name]
|
|
304
|
+
|
|
305
|
+
# Deduplicate: Check if this assignment was already recorded
|
|
306
|
+
assignment_str = f"{target.id} = {self._unparse_safe(node.value)}"
|
|
307
|
+
if assignment_str not in flow.reaches_assignments:
|
|
308
|
+
flow.reaches_assignments.append(assignment_str)
|
|
309
|
+
|
|
310
|
+
# Deduplicate operations by creating a key
|
|
311
|
+
op_key = (
|
|
312
|
+
"assignment",
|
|
313
|
+
target.id,
|
|
314
|
+
self._unparse_safe(node.value),
|
|
315
|
+
None, # function
|
|
316
|
+
None, # argument
|
|
317
|
+
node.lineno if hasattr(node, "lineno") else 0,
|
|
318
|
+
)
|
|
319
|
+
existing_op_keys = {
|
|
320
|
+
(
|
|
321
|
+
op.get("type"),
|
|
322
|
+
op.get("target"),
|
|
323
|
+
op.get("value"),
|
|
324
|
+
op.get("function"),
|
|
325
|
+
op.get("argument"),
|
|
326
|
+
op.get("line"),
|
|
327
|
+
)
|
|
328
|
+
for op in flow.operations
|
|
329
|
+
}
|
|
330
|
+
if op_key not in existing_op_keys:
|
|
331
|
+
flow.operations.append(
|
|
332
|
+
{
|
|
333
|
+
"type": "assignment",
|
|
334
|
+
"target": target.id,
|
|
335
|
+
"value": self._unparse_safe(node.value),
|
|
336
|
+
"line": node.lineno if hasattr(node, "lineno") else 0,
|
|
337
|
+
}
|
|
338
|
+
)
|
|
339
|
+
|
|
340
|
+
# Check if RHS is a call to external operation
|
|
341
|
+
if isinstance(node.value, ast.Call):
|
|
342
|
+
call_name = self._get_call_name(node.value)
|
|
343
|
+
if call_name not in flow.reaches_calls:
|
|
344
|
+
flow.reaches_calls.append(call_name)
|
|
345
|
+
if self._is_external_operation(call_name):
|
|
346
|
+
flow.reaches_external = True
|
|
347
|
+
else:
|
|
348
|
+
# Clear taint if RHS is not tainted
|
|
349
|
+
fact.shape_env.set_taint(target.id, Taint(status=TaintStatus.UNTAINTED))
|
|
350
|
+
|
|
351
|
+
# Track function calls
|
|
352
|
+
elif isinstance(node, ast.Call):
|
|
353
|
+
call_name = self._get_call_name(node)
|
|
354
|
+
|
|
355
|
+
# Check if any arguments contain tracked parameters/sources
|
|
356
|
+
for arg in node.args:
|
|
357
|
+
arg_taint = self._eval_expr_taint(arg, fact)
|
|
358
|
+
if arg_taint.is_tainted():
|
|
359
|
+
all_tracked = self.parameter_names + self.script_sources
|
|
360
|
+
for tracked_name in all_tracked:
|
|
361
|
+
if self._expr_uses_var(arg, tracked_name, fact):
|
|
362
|
+
if tracked_name in fact.parameter_flows:
|
|
363
|
+
flow = fact.parameter_flows[tracked_name]
|
|
364
|
+
|
|
365
|
+
# Deduplicate: Check if this call was already recorded
|
|
366
|
+
if call_name not in flow.reaches_calls:
|
|
367
|
+
flow.reaches_calls.append(call_name)
|
|
368
|
+
|
|
369
|
+
# Deduplicate operations
|
|
370
|
+
op_key = (
|
|
371
|
+
"function_call",
|
|
372
|
+
None, # target
|
|
373
|
+
None, # value
|
|
374
|
+
call_name,
|
|
375
|
+
self._unparse_safe(arg),
|
|
376
|
+
node.lineno if hasattr(node, "lineno") else 0,
|
|
377
|
+
)
|
|
378
|
+
existing_op_keys = {
|
|
379
|
+
(
|
|
380
|
+
op.get("type"),
|
|
381
|
+
op.get("target"),
|
|
382
|
+
op.get("value"),
|
|
383
|
+
op.get("function"),
|
|
384
|
+
op.get("argument"),
|
|
385
|
+
op.get("line"),
|
|
386
|
+
)
|
|
387
|
+
for op in flow.operations
|
|
388
|
+
}
|
|
389
|
+
if op_key not in existing_op_keys:
|
|
390
|
+
flow.operations.append(
|
|
391
|
+
{
|
|
392
|
+
"type": "function_call",
|
|
393
|
+
"function": call_name,
|
|
394
|
+
"argument": self._unparse_safe(arg),
|
|
395
|
+
"line": node.lineno if hasattr(node, "lineno") else 0,
|
|
396
|
+
}
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
if self._is_external_operation(call_name):
|
|
400
|
+
flow.reaches_external = True
|
|
401
|
+
|
|
402
|
+
# Track returns
|
|
403
|
+
elif isinstance(node, ast.Return):
|
|
404
|
+
if node.value:
|
|
405
|
+
ret_taint = self._eval_expr_taint(node.value, fact)
|
|
406
|
+
if ret_taint.is_tainted():
|
|
407
|
+
all_tracked = self.parameter_names + self.script_sources
|
|
408
|
+
for tracked_name in all_tracked:
|
|
409
|
+
if self._expr_uses_var(node.value, tracked_name, fact):
|
|
410
|
+
if tracked_name in fact.parameter_flows:
|
|
411
|
+
fact.parameter_flows[tracked_name].reaches_returns = True
|
|
412
|
+
fact.parameter_flows[tracked_name].operations.append(
|
|
413
|
+
{
|
|
414
|
+
"type": "return",
|
|
415
|
+
"value": self._unparse_safe(node.value),
|
|
416
|
+
"line": node.lineno if hasattr(node, "lineno") else 0,
|
|
417
|
+
}
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
def _eval_expr_taint(self, expr: ast.AST, fact: ForwardFlowFact) -> Taint:
|
|
421
|
+
"""Evaluate taint of an expression.
|
|
422
|
+
|
|
423
|
+
Args:
|
|
424
|
+
expr: Expression node
|
|
425
|
+
fact: Current flow fact
|
|
426
|
+
|
|
427
|
+
Returns:
|
|
428
|
+
Taint of the expression
|
|
429
|
+
"""
|
|
430
|
+
if isinstance(expr, ast.Name):
|
|
431
|
+
return fact.shape_env.get_taint(expr.id)
|
|
432
|
+
|
|
433
|
+
elif isinstance(expr, ast.Attribute):
|
|
434
|
+
if isinstance(expr.value, ast.Name):
|
|
435
|
+
obj_name = expr.value.id
|
|
436
|
+
field_name = expr.attr
|
|
437
|
+
shape = fact.shape_env.get(obj_name)
|
|
438
|
+
return shape.get_field(field_name)
|
|
439
|
+
else:
|
|
440
|
+
return self._eval_expr_taint(expr.value, fact)
|
|
441
|
+
|
|
442
|
+
elif isinstance(expr, ast.Subscript):
|
|
443
|
+
if isinstance(expr.value, ast.Name):
|
|
444
|
+
arr_name = expr.value.id
|
|
445
|
+
shape = fact.shape_env.get(arr_name)
|
|
446
|
+
return shape.get_element()
|
|
447
|
+
else:
|
|
448
|
+
return self._eval_expr_taint(expr.value, fact)
|
|
449
|
+
|
|
450
|
+
elif isinstance(expr, ast.Call):
|
|
451
|
+
# Merge taint from all arguments
|
|
452
|
+
result = Taint(status=TaintStatus.UNTAINTED)
|
|
453
|
+
for arg in expr.args:
|
|
454
|
+
arg_taint = self._eval_expr_taint(arg, fact)
|
|
455
|
+
result = result.merge(arg_taint)
|
|
456
|
+
return result
|
|
457
|
+
|
|
458
|
+
elif isinstance(expr, ast.BinOp):
|
|
459
|
+
left_taint = self._eval_expr_taint(expr.left, fact)
|
|
460
|
+
right_taint = self._eval_expr_taint(expr.right, fact)
|
|
461
|
+
return left_taint.merge(right_taint)
|
|
462
|
+
|
|
463
|
+
elif isinstance(expr, ast.JoinedStr):
|
|
464
|
+
result = Taint(status=TaintStatus.UNTAINTED)
|
|
465
|
+
for value in expr.values:
|
|
466
|
+
if isinstance(value, ast.FormattedValue):
|
|
467
|
+
taint = self._eval_expr_taint(value.value, fact)
|
|
468
|
+
result = result.merge(taint)
|
|
469
|
+
return result
|
|
470
|
+
|
|
471
|
+
elif isinstance(expr, (ast.List, ast.Tuple, ast.Set)):
|
|
472
|
+
result = Taint(status=TaintStatus.UNTAINTED)
|
|
473
|
+
for elt in expr.elts:
|
|
474
|
+
taint = self._eval_expr_taint(elt, fact)
|
|
475
|
+
result = result.merge(taint)
|
|
476
|
+
return result
|
|
477
|
+
|
|
478
|
+
else:
|
|
479
|
+
return Taint(status=TaintStatus.UNTAINTED)
|
|
480
|
+
|
|
481
|
+
def _expr_uses_var(self, expr: ast.AST, var_name: str, fact: ForwardFlowFact) -> bool:
|
|
482
|
+
"""Check if expression uses a variable (directly or transitively).
|
|
483
|
+
|
|
484
|
+
Uses source-sensitive tracking via taint labels.
|
|
485
|
+
|
|
486
|
+
Args:
|
|
487
|
+
expr: Expression node
|
|
488
|
+
var_name: Variable name to check
|
|
489
|
+
fact: Current flow fact
|
|
490
|
+
|
|
491
|
+
Returns:
|
|
492
|
+
True if expression uses the variable
|
|
493
|
+
"""
|
|
494
|
+
target_shape = fact.shape_env.get(var_name)
|
|
495
|
+
target_taint = target_shape.get_taint()
|
|
496
|
+
target_labels = target_taint.labels if target_taint.is_tainted() else set()
|
|
497
|
+
expected_label = f"param:{var_name}"
|
|
498
|
+
|
|
499
|
+
for node in ast.walk(expr):
|
|
500
|
+
if isinstance(node, ast.Name):
|
|
501
|
+
if node.id == var_name:
|
|
502
|
+
return True
|
|
503
|
+
|
|
504
|
+
# Check transitive dependencies with source sensitivity
|
|
505
|
+
node_shape = fact.shape_env.get(node.id)
|
|
506
|
+
node_taint = node_shape.get_taint()
|
|
507
|
+
|
|
508
|
+
if node_taint.is_tainted():
|
|
509
|
+
if expected_label in node_taint.labels:
|
|
510
|
+
return True
|
|
511
|
+
|
|
512
|
+
if target_labels and node_taint.labels & target_labels:
|
|
513
|
+
return True
|
|
514
|
+
|
|
515
|
+
# Check structural shapes
|
|
516
|
+
if node_shape.is_object:
|
|
517
|
+
for field_name, field_shape in node_shape.fields.items():
|
|
518
|
+
field_taint = field_shape.get_taint()
|
|
519
|
+
if expected_label in field_taint.labels:
|
|
520
|
+
return True
|
|
521
|
+
|
|
522
|
+
if node_shape.is_array and node_shape.element_shape:
|
|
523
|
+
elem_taint = node_shape.element_shape.get_taint()
|
|
524
|
+
if expected_label in elem_taint.labels:
|
|
525
|
+
return True
|
|
526
|
+
|
|
527
|
+
return False
|
|
528
|
+
|
|
529
|
+
def _get_call_name(self, node: ast.Call) -> str:
|
|
530
|
+
"""Get function call name.
|
|
531
|
+
|
|
532
|
+
Args:
|
|
533
|
+
node: Call node
|
|
534
|
+
|
|
535
|
+
Returns:
|
|
536
|
+
Function name
|
|
537
|
+
"""
|
|
538
|
+
if isinstance(node.func, ast.Name):
|
|
539
|
+
return node.func.id
|
|
540
|
+
elif isinstance(node.func, ast.Attribute):
|
|
541
|
+
parts = []
|
|
542
|
+
current = node.func
|
|
543
|
+
while isinstance(current, ast.Attribute):
|
|
544
|
+
parts.append(current.attr)
|
|
545
|
+
current = current.value
|
|
546
|
+
if isinstance(current, ast.Name):
|
|
547
|
+
parts.append(current.id)
|
|
548
|
+
return ".".join(reversed(parts))
|
|
549
|
+
try:
|
|
550
|
+
return ast.unparse(node.func) if hasattr(ast, "unparse") else str(node.func)
|
|
551
|
+
except (AttributeError, TypeError, ValueError):
|
|
552
|
+
return "<unknown>"
|
|
553
|
+
|
|
554
|
+
def _unparse_safe(self, node: ast.AST) -> str:
|
|
555
|
+
"""Safely unparse AST node."""
|
|
556
|
+
try:
|
|
557
|
+
if hasattr(ast, "unparse"):
|
|
558
|
+
return ast.unparse(node)
|
|
559
|
+
return str(node)
|
|
560
|
+
except (AttributeError, TypeError, ValueError):
|
|
561
|
+
return "<unparseable>"
|
|
562
|
+
|
|
563
|
+
def _is_external_operation(self, call_name: str) -> bool:
|
|
564
|
+
"""Check if call is an external operation (network, file, subprocess).
|
|
565
|
+
|
|
566
|
+
Args:
|
|
567
|
+
call_name: Function call name
|
|
568
|
+
|
|
569
|
+
Returns:
|
|
570
|
+
True if external operation
|
|
571
|
+
"""
|
|
572
|
+
external_patterns = [
|
|
573
|
+
"requests",
|
|
574
|
+
"urllib",
|
|
575
|
+
"http",
|
|
576
|
+
"socket",
|
|
577
|
+
"post",
|
|
578
|
+
"get",
|
|
579
|
+
"fetch",
|
|
580
|
+
"open",
|
|
581
|
+
"read",
|
|
582
|
+
"write",
|
|
583
|
+
"file",
|
|
584
|
+
"subprocess",
|
|
585
|
+
"os.system",
|
|
586
|
+
"os.popen",
|
|
587
|
+
"exec",
|
|
588
|
+
"eval",
|
|
589
|
+
]
|
|
590
|
+
call_lower = call_name.lower()
|
|
591
|
+
return any(pattern in call_lower for pattern in external_patterns)
|
|
592
|
+
|
|
593
|
+
def _check_source_call(self, call_node: ast.Call) -> tuple[str, str] | None:
|
|
594
|
+
"""Check if a call is a source (credential file, env var).
|
|
595
|
+
|
|
596
|
+
Returns:
|
|
597
|
+
(source_type, source_name) if source, None otherwise
|
|
598
|
+
"""
|
|
599
|
+
call_name = self._get_call_name(call_node)
|
|
600
|
+
CREDENTIAL_FILES = [".aws/credentials", ".ssh/id_rsa", ".ssh/id_dsa", ".kube/config", ".netrc"]
|
|
601
|
+
ENV_VAR_PATTERNS = ["API_KEY", "SECRET", "TOKEN", "PASSWORD", "CREDENTIAL"]
|
|
602
|
+
|
|
603
|
+
# Check for env var access
|
|
604
|
+
if call_name in ["os.getenv", "os.environ.get", "getenv"]:
|
|
605
|
+
for arg in call_node.args:
|
|
606
|
+
if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
|
|
607
|
+
if any(pattern in arg.value.upper() for pattern in ENV_VAR_PATTERNS):
|
|
608
|
+
return ("env_var", arg.value)
|
|
609
|
+
|
|
610
|
+
# Check for credential file access
|
|
611
|
+
for arg in call_node.args:
|
|
612
|
+
if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
|
|
613
|
+
if any(cred in arg.value for cred in CREDENTIAL_FILES):
|
|
614
|
+
return ("credential_file", arg.value)
|
|
615
|
+
|
|
616
|
+
# Check for os.path.expanduser with credential paths
|
|
617
|
+
if call_name == "os.path.expanduser":
|
|
618
|
+
for arg in call_node.args:
|
|
619
|
+
if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
|
|
620
|
+
if any(cred in arg.value for cred in CREDENTIAL_FILES):
|
|
621
|
+
return ("credential_file", arg.value)
|
|
622
|
+
|
|
623
|
+
return None
|
|
624
|
+
|
|
625
|
+
def _is_source_assignment(self, expr: ast.AST, source_name: str) -> bool:
|
|
626
|
+
"""Check if expression is an assignment from a source."""
|
|
627
|
+
if isinstance(expr, ast.Call):
|
|
628
|
+
source_info = self._check_source_call(expr)
|
|
629
|
+
if source_info:
|
|
630
|
+
source_type, name = source_info
|
|
631
|
+
full_name = f"{source_type}:{name}"
|
|
632
|
+
return full_name == source_name
|
|
633
|
+
return False
|
|
634
|
+
|
|
635
|
+
def _collect_flows(self) -> None:
|
|
636
|
+
"""Collect all flows from analysis results."""
|
|
637
|
+
if not self.cfg or not self.cfg.exit:
|
|
638
|
+
return
|
|
639
|
+
|
|
640
|
+
# Get flows at exit node
|
|
641
|
+
exit_fact = self.out_facts.get(self.cfg.exit.id)
|
|
642
|
+
if exit_fact:
|
|
643
|
+
for param_name, flow in exit_fact.parameter_flows.items():
|
|
644
|
+
self.all_flows.append(flow)
|
|
645
|
+
|
|
646
|
+
def merge(self, facts: list[ForwardFlowFact]) -> ForwardFlowFact:
|
|
647
|
+
"""Merge multiple flow facts.
|
|
648
|
+
|
|
649
|
+
Args:
|
|
650
|
+
facts: List of facts to merge
|
|
651
|
+
|
|
652
|
+
Returns:
|
|
653
|
+
Merged fact
|
|
654
|
+
"""
|
|
655
|
+
if not facts:
|
|
656
|
+
return ForwardFlowFact()
|
|
657
|
+
|
|
658
|
+
if len(facts) == 1:
|
|
659
|
+
return facts[0]
|
|
660
|
+
|
|
661
|
+
result = facts[0].copy()
|
|
662
|
+
|
|
663
|
+
for fact in facts[1:]:
|
|
664
|
+
result.shape_env = result.shape_env.merge(fact.shape_env)
|
|
665
|
+
|
|
666
|
+
# Merge parameter flows
|
|
667
|
+
for param_name, flow in fact.parameter_flows.items():
|
|
668
|
+
if param_name in result.parameter_flows:
|
|
669
|
+
# Deduplicate operations by checking if already present
|
|
670
|
+
# Operations are dicts, so we compare by content
|
|
671
|
+
existing_ops = result.parameter_flows[param_name].operations
|
|
672
|
+
existing_ops_set = {
|
|
673
|
+
(
|
|
674
|
+
op.get("type"),
|
|
675
|
+
op.get("target"),
|
|
676
|
+
op.get("value"),
|
|
677
|
+
op.get("function"),
|
|
678
|
+
op.get("argument"),
|
|
679
|
+
op.get("line"),
|
|
680
|
+
)
|
|
681
|
+
for op in existing_ops
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
for op in flow.operations:
|
|
685
|
+
op_key = (
|
|
686
|
+
op.get("type"),
|
|
687
|
+
op.get("target"),
|
|
688
|
+
op.get("value"),
|
|
689
|
+
op.get("function"),
|
|
690
|
+
op.get("argument"),
|
|
691
|
+
op.get("line"),
|
|
692
|
+
)
|
|
693
|
+
if op_key not in existing_ops_set:
|
|
694
|
+
existing_ops.append(op)
|
|
695
|
+
existing_ops_set.add(op_key)
|
|
696
|
+
|
|
697
|
+
# Deduplicate reaches_calls and reaches_assignments using sets
|
|
698
|
+
result.parameter_flows[param_name].reaches_calls = list(
|
|
699
|
+
set(result.parameter_flows[param_name].reaches_calls + flow.reaches_calls)
|
|
700
|
+
)
|
|
701
|
+
result.parameter_flows[param_name].reaches_assignments = list(
|
|
702
|
+
set(result.parameter_flows[param_name].reaches_assignments + flow.reaches_assignments)
|
|
703
|
+
)
|
|
704
|
+
|
|
705
|
+
# Boolean flags use OR (idempotent)
|
|
706
|
+
result.parameter_flows[param_name].reaches_returns = (
|
|
707
|
+
result.parameter_flows[param_name].reaches_returns or flow.reaches_returns
|
|
708
|
+
)
|
|
709
|
+
result.parameter_flows[param_name].reaches_external = (
|
|
710
|
+
result.parameter_flows[param_name].reaches_external or flow.reaches_external
|
|
711
|
+
)
|
|
712
|
+
else:
|
|
713
|
+
result.parameter_flows[param_name] = flow
|
|
714
|
+
|
|
715
|
+
return result
|