cisco-ai-skill-scanner 1.0.0 (cisco_ai_skill_scanner-1.0.0-py3-none-any.whl)
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cisco_ai_skill_scanner-1.0.0.dist-info/METADATA +253 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/RECORD +100 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/WHEEL +4 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/entry_points.txt +4 -0
- cisco_ai_skill_scanner-1.0.0.dist-info/licenses/LICENSE +17 -0
- skillanalyzer/__init__.py +45 -0
- skillanalyzer/_version.py +34 -0
- skillanalyzer/api/__init__.py +25 -0
- skillanalyzer/api/api.py +34 -0
- skillanalyzer/api/api_cli.py +78 -0
- skillanalyzer/api/api_server.py +634 -0
- skillanalyzer/api/router.py +527 -0
- skillanalyzer/cli/__init__.py +25 -0
- skillanalyzer/cli/cli.py +816 -0
- skillanalyzer/config/__init__.py +26 -0
- skillanalyzer/config/config.py +149 -0
- skillanalyzer/config/config_parser.py +122 -0
- skillanalyzer/config/constants.py +85 -0
- skillanalyzer/core/__init__.py +24 -0
- skillanalyzer/core/analyzers/__init__.py +75 -0
- skillanalyzer/core/analyzers/aidefense_analyzer.py +872 -0
- skillanalyzer/core/analyzers/base.py +53 -0
- skillanalyzer/core/analyzers/behavioral/__init__.py +30 -0
- skillanalyzer/core/analyzers/behavioral/alignment/__init__.py +45 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_llm_client.py +240 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_orchestrator.py +216 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_prompt_builder.py +422 -0
- skillanalyzer/core/analyzers/behavioral/alignment/alignment_response_validator.py +136 -0
- skillanalyzer/core/analyzers/behavioral/alignment/threat_vulnerability_classifier.py +198 -0
- skillanalyzer/core/analyzers/behavioral_analyzer.py +453 -0
- skillanalyzer/core/analyzers/cross_skill_analyzer.py +490 -0
- skillanalyzer/core/analyzers/llm_analyzer.py +440 -0
- skillanalyzer/core/analyzers/llm_prompt_builder.py +270 -0
- skillanalyzer/core/analyzers/llm_provider_config.py +215 -0
- skillanalyzer/core/analyzers/llm_request_handler.py +284 -0
- skillanalyzer/core/analyzers/llm_response_parser.py +81 -0
- skillanalyzer/core/analyzers/meta_analyzer.py +845 -0
- skillanalyzer/core/analyzers/static.py +1105 -0
- skillanalyzer/core/analyzers/trigger_analyzer.py +341 -0
- skillanalyzer/core/analyzers/virustotal_analyzer.py +463 -0
- skillanalyzer/core/exceptions.py +77 -0
- skillanalyzer/core/loader.py +377 -0
- skillanalyzer/core/models.py +300 -0
- skillanalyzer/core/reporters/__init__.py +26 -0
- skillanalyzer/core/reporters/json_reporter.py +65 -0
- skillanalyzer/core/reporters/markdown_reporter.py +209 -0
- skillanalyzer/core/reporters/sarif_reporter.py +246 -0
- skillanalyzer/core/reporters/table_reporter.py +195 -0
- skillanalyzer/core/rules/__init__.py +19 -0
- skillanalyzer/core/rules/patterns.py +165 -0
- skillanalyzer/core/rules/yara_scanner.py +157 -0
- skillanalyzer/core/scanner.py +437 -0
- skillanalyzer/core/static_analysis/__init__.py +27 -0
- skillanalyzer/core/static_analysis/cfg/__init__.py +21 -0
- skillanalyzer/core/static_analysis/cfg/builder.py +439 -0
- skillanalyzer/core/static_analysis/context_extractor.py +742 -0
- skillanalyzer/core/static_analysis/dataflow/__init__.py +25 -0
- skillanalyzer/core/static_analysis/dataflow/forward_analysis.py +715 -0
- skillanalyzer/core/static_analysis/interprocedural/__init__.py +21 -0
- skillanalyzer/core/static_analysis/interprocedural/call_graph_analyzer.py +406 -0
- skillanalyzer/core/static_analysis/interprocedural/cross_file_analyzer.py +190 -0
- skillanalyzer/core/static_analysis/parser/__init__.py +21 -0
- skillanalyzer/core/static_analysis/parser/python_parser.py +380 -0
- skillanalyzer/core/static_analysis/semantic/__init__.py +28 -0
- skillanalyzer/core/static_analysis/semantic/name_resolver.py +206 -0
- skillanalyzer/core/static_analysis/semantic/type_analyzer.py +200 -0
- skillanalyzer/core/static_analysis/taint/__init__.py +21 -0
- skillanalyzer/core/static_analysis/taint/tracker.py +252 -0
- skillanalyzer/core/static_analysis/types/__init__.py +36 -0
- skillanalyzer/data/__init__.py +30 -0
- skillanalyzer/data/prompts/boilerplate_protection_rule_prompt.md +26 -0
- skillanalyzer/data/prompts/code_alignment_threat_analysis_prompt.md +901 -0
- skillanalyzer/data/prompts/llm_response_schema.json +71 -0
- skillanalyzer/data/prompts/skill_meta_analysis_prompt.md +303 -0
- skillanalyzer/data/prompts/skill_threat_analysis_prompt.md +263 -0
- skillanalyzer/data/prompts/unified_response_schema.md +97 -0
- skillanalyzer/data/rules/signatures.yaml +440 -0
- skillanalyzer/data/yara_rules/autonomy_abuse.yara +66 -0
- skillanalyzer/data/yara_rules/code_execution.yara +61 -0
- skillanalyzer/data/yara_rules/coercive_injection.yara +115 -0
- skillanalyzer/data/yara_rules/command_injection.yara +54 -0
- skillanalyzer/data/yara_rules/credential_harvesting.yara +115 -0
- skillanalyzer/data/yara_rules/prompt_injection.yara +71 -0
- skillanalyzer/data/yara_rules/script_injection.yara +83 -0
- skillanalyzer/data/yara_rules/skill_discovery_abuse.yara +57 -0
- skillanalyzer/data/yara_rules/sql_injection.yara +73 -0
- skillanalyzer/data/yara_rules/system_manipulation.yara +65 -0
- skillanalyzer/data/yara_rules/tool_chaining_abuse.yara +60 -0
- skillanalyzer/data/yara_rules/transitive_trust_abuse.yara +73 -0
- skillanalyzer/data/yara_rules/unicode_steganography.yara +65 -0
- skillanalyzer/hooks/__init__.py +21 -0
- skillanalyzer/hooks/pre_commit.py +450 -0
- skillanalyzer/threats/__init__.py +25 -0
- skillanalyzer/threats/threats.py +480 -0
- skillanalyzer/utils/__init__.py +28 -0
- skillanalyzer/utils/command_utils.py +129 -0
- skillanalyzer/utils/di_container.py +154 -0
- skillanalyzer/utils/file_utils.py +86 -0
- skillanalyzer/utils/logging_config.py +96 -0
- skillanalyzer/utils/logging_utils.py +71 -0
@@ -0,0 +1,742 @@
# Copyright 2026 Cisco Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

"""
Context extractor for Claude Skills behavioral analysis.

Extracts comprehensive security context from skill scripts for LLM analysis.
"""

import ast
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from .dataflow.forward_analysis import ForwardDataflowAnalysis
from .parser.python_parser import FunctionInfo, PythonParser


@dataclass
class SkillScriptContext:
    """Complete security context for a skill script."""

    file_path: str
    functions: list[FunctionInfo]
    imports: list[str]
    dataflows: list[dict[str, Any]] = field(default_factory=list)  # Empty - pattern detection used instead

    # Security indicators (aggregated from all functions)
    has_network: bool = False
    has_file_ops: bool = False
    has_subprocess: bool = False
    has_eval_exec: bool = False
    has_credential_access: bool = False
    has_env_var_access: bool = False

    # Dangerous patterns (simple pattern matching results)
    dangerous_flows: list[dict[str, Any]] = field(default_factory=list)
    has_exfiltration_chain: bool = False
    has_injection_chain: bool = False

    # Evidence for LLM
    all_function_calls: list[str] = field(default_factory=list)
    all_string_literals: list[str] = field(default_factory=list)
    suspicious_urls: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for LLM prompt."""
        return {
            "file_path": self.file_path,
            "function_count": len(self.functions),
            "imports": self.imports,
            "security_indicators": {
                "has_network": self.has_network,
                "has_file_ops": self.has_file_ops,
                "has_subprocess": self.has_subprocess,
                "has_eval_exec": self.has_eval_exec,
                "has_credential_access": self.has_credential_access,
                "has_env_var_access": self.has_env_var_access,
            },
            "dangerous_patterns": {
                "exfiltration_chain": self.has_exfiltration_chain,
                "injection_chain": self.has_injection_chain,
                "dangerous_flow_count": len(self.dangerous_flows),
            },
            "functions": [
                {
                    "name": f.name,
                    "parameters": f.parameters,
                    "has_network": f.has_network_calls,
                    "has_file_ops": f.has_file_operations,
                    "has_subprocess": f.has_subprocess,
                    "has_eval_exec": f.has_eval_exec,
                    "calls": f.function_calls[:10],  # First 10
                }
                for f in self.functions
            ],
            "suspicious_urls": self.suspicious_urls,
        }


@dataclass
class SkillFunctionContext:
    """Complete context for a single function (for alignment verification).

    This dataclass contains rich analysis data for a single function,
    including dataflow analysis, parameter tracking, and behavioral patterns.
    Used by the alignment verification layer to detect description/code mismatches.
    """

    # Required fields (no defaults)
    name: str
    imports: list[str]
    function_calls: list[dict[str, Any]]
    assignments: list[dict[str, Any]]
    control_flow: dict[str, Any]
    parameter_flows: list[dict[str, Any]]  # All paths from parameters
    constants: dict[str, Any]
    variable_dependencies: dict[str, list[str]]
    has_file_operations: bool
    has_network_operations: bool
    has_subprocess_calls: bool
    has_eval_exec: bool

    # Optional fields (with defaults)
    docstring: str | None = None
    parameters: list[dict[str, Any]] = field(default_factory=list)
    return_type: str | None = None
    line_number: int = 0

    # Cross-file analysis
    cross_file_calls: list[dict[str, Any]] = field(default_factory=list)
    reachable_functions: list[str] = field(default_factory=list)

    # High-value security indicators
    string_literals: list[str] = field(default_factory=list)
    return_expressions: list[str] = field(default_factory=list)
    exception_handlers: list[dict[str, Any]] = field(default_factory=list)
    env_var_access: list[str] = field(default_factory=list)

    # State manipulation
    global_writes: list[dict[str, Any]] = field(default_factory=list)
    attribute_access: list[dict[str, Any]] = field(default_factory=list)

    # Dataflow facts
    dataflow_summary: dict[str, Any] = field(default_factory=dict)


class ContextExtractor:
    """Extract comprehensive security context from skill scripts."""

    SUSPICIOUS_DOMAINS = ["attacker.com", "evil.com", "malicious.com", "pastebin.com"]

    # Legitimate domains that should NOT be flagged as suspicious
    LEGITIMATE_DOMAINS = [
        # Claude/Anthropic services
        "api.anthropic.com",
        "statsig.anthropic.com",
        # Code repositories
        "github.com",
        "gitlab.com",
        "bitbucket.org",
        # Package registries
        "registry.npmjs.org",
        "npmjs.com",
        "npmjs.org",
        "yarnpkg.com",
        "registry.yarnpkg.com",
        "pypi.org",
        "files.pythonhosted.org",
        "pythonhosted.org",
        # System packages
        "archive.ubuntu.com",
        "security.ubuntu.com",
        # XML schemas (for OOXML document processing)
        "schemas.microsoft.com",
        "schemas.openxmlformats.org",
        "www.w3.org",
        "purl.org",
        # Localhost and development
        "localhost",
        "127.0.0.1",
        "0.0.0.0",
    ]

    def extract_context(self, file_path: Path, source_code: str) -> SkillScriptContext:
        """
        Extract complete security context from a script.

        Args:
            file_path: Path to the script file
            source_code: Python source code

        Returns:
            SkillScriptContext with extracted information
        """
        # Parse with AST parser
        parser = PythonParser(source_code)
        if not parser.parse():
            # Return empty context if parsing fails
            return SkillScriptContext(file_path=str(file_path), functions=[], imports=[], dataflows=[])

        # Aggregate security indicators
        has_network = any(f.has_network_calls for f in parser.functions)
        has_file_ops = any(f.has_file_operations for f in parser.functions)
        has_subprocess = any(f.has_subprocess for f in parser.functions)
        has_eval_exec = any(f.has_eval_exec for f in parser.functions)

        # Use CFG-based ForwardDataflowAnalysis for script-level source detection and flow tracking
        try:
            forward_analyzer = ForwardDataflowAnalysis(parser, parameter_names=[], detect_sources=True)
            script_flows = forward_analyzer.analyze_forward_flows()
        except Exception as e:
            import logging

            logging.getLogger(__name__).warning(f"CFG-based script-level analysis failed: {e}")
            script_flows = []

        # Extract credential/env access from detected sources
        has_credential_access = any(flow.parameter_name.startswith("credential_file:") for flow in script_flows)
        has_env_var_access = any(flow.parameter_name.startswith("env_var:") for flow in script_flows)

        # Extract dangerous flows
        dangerous_flows = []
        for flow in script_flows:
            if flow.reaches_external:
                source_type = "parameter"
                if flow.parameter_name.startswith("credential_file:"):
                    source_type = "credential_file"
                elif flow.parameter_name.startswith("env_var:"):
                    source_type = "env_var"

                # Determine sink type from calls
                sink_type = "external"
                network_calls = ["requests", "urllib", "http", "socket", "post", "get"]
                eval_calls = ["eval", "exec", "compile"]
                if any(any(nc in call.lower() for nc in network_calls) for call in flow.reaches_calls):
                    sink_type = "network"
                elif any(any(ec in call.lower() for ec in eval_calls) for call in flow.reaches_calls):
                    sink_type = "eval"

                dangerous_flows.append(
                    {
                        "source_type": source_type,
                        "source_name": flow.parameter_name,
                        "sink_type": sink_type,
                        "sink_operation": ", ".join(flow.reaches_calls),
                        "is_dangerous": True,
                    }
                )

        has_exfiltration_chain = any(
            flow.get("source_type") in ["credential_file", "env_var"] and flow.get("sink_type") == "network"
            for flow in dangerous_flows
        )
        has_injection_chain = any(
            flow.get("source_type") == "parameter" and flow.get("sink_type") == "eval" for flow in dangerous_flows
        )

        # Collect all function calls and strings
        all_calls = []
        all_strings = []
        for func in parser.functions:
            all_calls.extend(func.function_calls)
            all_strings.extend(func.string_literals)

        # Also collect module-level strings (class attributes, etc.)
        all_strings.extend(parser.module_strings)

        # Find suspicious URLs (filter out legitimate domains and docstrings)
        suspicious_urls = []
        for s in all_strings:
            # Skip if not URL-like or contains newlines (docstrings)
            if "\n" in s or not s.startswith("http"):
                continue
            # Skip if too long (likely docstring) or too short
            if len(s) > 200 or len(s) < 10:
                continue
            # Skip if contains legitimate domain
            if any(domain in s for domain in self.LEGITIMATE_DOMAINS):
                continue
            # Flag if contains known suspicious domain OR is generic http URL
            if any(domain in s for domain in self.SUSPICIOUS_DOMAINS):
                suspicious_urls.append(s)
            # Generic URLs only if they look suspicious (not just schema URLs)
            elif not any(schema in s for schema in ["schemas.", "www.w3.org", "xmlns"]):
                suspicious_urls.append(s)

        # Create context
        context = SkillScriptContext(
            file_path=str(file_path),
            functions=parser.functions,
            imports=parser.imports,
            dataflows=[],  # Empty - using pattern detection instead
            has_network=has_network,
            has_file_ops=has_file_ops,
            has_subprocess=has_subprocess,
            has_eval_exec=has_eval_exec,
            has_credential_access=has_credential_access,
            has_env_var_access=has_env_var_access,
            dangerous_flows=dangerous_flows,
            has_exfiltration_chain=has_exfiltration_chain,
            has_injection_chain=has_injection_chain,
            all_function_calls=list(set(all_calls)),
            all_string_literals=all_strings,
            suspicious_urls=suspicious_urls,
        )

        return context

    def extract_function_contexts(self, file_path: Path, source_code: str) -> list[SkillFunctionContext]:
        """Extract detailed context for each function in the source code.

        Used by the alignment verification layer to analyze individual functions.

        Args:
            file_path: Path to the script file
            source_code: Python source code

        Returns:
            List of SkillFunctionContext for each function
        """
        contexts = []

        try:
            tree = ast.parse(source_code)
        except SyntaxError:
            return contexts

        # Parse with AST parser
        parser = PythonParser(source_code)
        if not parser.parse():
            return contexts

        # Extract module-level imports
        imports = parser.imports

        # Process each function
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                context = self._extract_function_context(node, imports, source_code, file_path)
                if context:
                    contexts.append(context)

        return contexts

    def _extract_function_context(
        self, node: ast.FunctionDef, imports: list[str], source_code: str, file_path: Path
    ) -> SkillFunctionContext:
        """Extract detailed context for a single function.

        Args:
            node: Function AST node
            imports: Module-level imports
            source_code: Full source code
            file_path: Path to the file

        Returns:
            SkillFunctionContext with extracted information
        """
        # Basic info
        name = node.name
        docstring = ast.get_docstring(node)
        parameters = self._extract_parameters(node)
        return_type = self._extract_return_type(node)
        line_number = node.lineno

        # Code structure
        function_calls = self._extract_function_calls(node)
        assignments = self._extract_assignments(node)
        control_flow = self._analyze_control_flow(node)

        # Parameter flow analysis
        parameter_flows = self._analyze_parameter_flows(node, parameters)

        # Constants
        constants = self._extract_constants(node)

        # Variable dependencies
        var_deps = self._analyze_variable_dependencies(node)

        # Behavioral patterns
        has_file_ops = self._has_file_operations(node)
        has_network_ops = self._has_network_operations(node)
        has_subprocess = self._has_subprocess_calls(node)
        has_eval_exec = self._has_eval_exec(node)

        # High-value security indicators
        string_literals = self._extract_string_literals(node)
        return_expressions = self._extract_return_expressions(node)
        exception_handlers = self._extract_exception_handlers(node)
        env_var_access = self._extract_env_var_access(node)

        # State manipulation
        global_writes = self._extract_global_writes(node)
        attribute_access = self._extract_attribute_access(node)

        # Dataflow summary
        dataflow_summary = self._create_dataflow_summary(node)

        return SkillFunctionContext(
            name=name,
            docstring=docstring,
            parameters=parameters,
            return_type=return_type,
            line_number=line_number,
            imports=imports,
            function_calls=function_calls,
            assignments=assignments,
            control_flow=control_flow,
            parameter_flows=parameter_flows,
            constants=constants,
            variable_dependencies=var_deps,
            has_file_operations=has_file_ops,
            has_network_operations=has_network_ops,
            has_subprocess_calls=has_subprocess,
            has_eval_exec=has_eval_exec,
            string_literals=string_literals,
            return_expressions=return_expressions,
            exception_handlers=exception_handlers,
            env_var_access=env_var_access,
            global_writes=global_writes,
            attribute_access=attribute_access,
            dataflow_summary=dataflow_summary,
        )

    def _extract_parameters(self, node: ast.FunctionDef) -> list[dict[str, Any]]:
        """Extract function parameters with type hints."""
        params = []
        for arg in node.args.args:
            param_info = {"name": arg.arg}
            if arg.annotation:
                try:
                    param_info["type"] = ast.unparse(arg.annotation)
                except (AttributeError, TypeError, ValueError):
                    param_info["type"] = "<unknown>"
            params.append(param_info)
        return params

    def _extract_return_type(self, node: ast.FunctionDef) -> str | None:
        """Extract return type annotation."""
        if node.returns:
            try:
                return ast.unparse(node.returns)
            except (AttributeError, TypeError, ValueError):
                return "<unknown>"
        return None

    def _extract_function_calls(self, node: ast.FunctionDef) -> list[dict[str, Any]]:
        """Extract all function calls with arguments."""
        calls = []
        for child in ast.walk(node):
            if isinstance(child, ast.Call):
                args_list = []
                for arg in child.args:
                    try:
                        args_list.append(ast.unparse(arg))
                    except (AttributeError, TypeError, ValueError):
                        args_list.append("<complex>")

                call_info = {
                    "name": self._get_call_name(child),
                    "args": args_list,
                    "line": child.lineno if hasattr(child, "lineno") else 0,
                }
                calls.append(call_info)
        return calls

    def _get_call_name(self, node: ast.Call) -> str:
        """Get function call name."""
        if isinstance(node.func, ast.Name):
            return node.func.id
        elif isinstance(node.func, ast.Attribute):
            parts = []
            current = node.func
            while isinstance(current, ast.Attribute):
                parts.append(current.attr)
                current = current.value
            if isinstance(current, ast.Name):
                parts.append(current.id)
            return ".".join(reversed(parts))
        try:
            return ast.unparse(node.func)
        except (AttributeError, TypeError, ValueError):
            return "<unknown>"

    def _extract_assignments(self, node: ast.FunctionDef) -> list[dict[str, Any]]:
        """Extract all assignments."""
        assignments = []
        for child in ast.walk(node):
            if isinstance(child, ast.Assign):
                for target in child.targets:
                    if isinstance(target, ast.Name):
                        try:
                            value_str = ast.unparse(child.value)
                        except (AttributeError, TypeError, ValueError):
                            value_str = "<complex>"
                        assignments.append(
                            {
                                "variable": target.id,
                                "value": value_str,
                                "line": child.lineno if hasattr(child, "lineno") else 0,
                            }
                        )
        return assignments

    def _analyze_control_flow(self, node: ast.FunctionDef) -> dict[str, Any]:
        """Analyze control flow structure."""
        has_if = any(isinstance(n, ast.If) for n in ast.walk(node))
        has_for = any(isinstance(n, (ast.For, ast.AsyncFor)) for n in ast.walk(node))
        has_while = any(isinstance(n, ast.While) for n in ast.walk(node))
        has_try = any(isinstance(n, ast.Try) for n in ast.walk(node))

        return {
            "has_conditionals": has_if,
            "has_loops": has_for or has_while,
            "has_exception_handling": has_try,
        }

    def _analyze_parameter_flows(self, node: ast.FunctionDef, parameters: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Analyze how parameters flow through the function using CFG-based analysis.

        Uses proper control flow graph and fixpoint analysis for accurate tracking
        through branches, loops, and function calls.
        """
        flows = []
        param_names = [p["name"] for p in parameters]

        if not param_names:
            return flows

        # Extract function source code for parser
        try:
            func_source = ast.unparse(node) if hasattr(ast, "unparse") else None
        except (AttributeError, TypeError, ValueError):
            # Reconstruct from AST if unparse fails

            param_str = ", ".join(p["name"] for p in parameters)
            func_source = f"def {node.name}({param_str}):\n    pass"

        if not func_source:
            return flows

        # Create parser and run CFG-based forward analysis
        parser = PythonParser(func_source)
        if not parser.parse():
            return flows

        try:
            forward_analyzer = ForwardDataflowAnalysis(parser, param_names)
            flow_paths = forward_analyzer.analyze_forward_flows()

            # Convert FlowPath objects to dict format
            for flow_path in flow_paths:
                flows.append(
                    {
                        "parameter": flow_path.parameter_name,
                        "operations": flow_path.operations,
                        "reaches_calls": flow_path.reaches_calls,
                        "reaches_assignments": flow_path.reaches_assignments,
                        "reaches_returns": flow_path.reaches_returns,
                        "reaches_external": flow_path.reaches_external,
                    }
                )
        except Exception as e:
            # Log error but return empty flows (no fallback)
            import logging

            logging.getLogger(__name__).warning(f"CFG-based parameter flow analysis failed: {e}")
            return flows

        return flows

    def _extract_constants(self, node: ast.FunctionDef) -> dict[str, Any]:
        """Extract constant values."""
        constants = {}
        for child in ast.walk(node):
            if isinstance(child, ast.Assign):
                for target in child.targets:
                    if isinstance(target, ast.Name) and isinstance(child.value, ast.Constant):
                        constants[target.id] = child.value.value
        return constants

    def _analyze_variable_dependencies(self, node: ast.FunctionDef) -> dict[str, list[str]]:
        """Analyze variable dependencies."""
        dependencies = {}
        for child in ast.walk(node):
            if isinstance(child, ast.Assign):
                for target in child.targets:
                    if isinstance(target, ast.Name):
                        deps = []
                        for name_node in ast.walk(child.value):
                            if isinstance(name_node, ast.Name):
                                deps.append(name_node.id)
                        dependencies[target.id] = deps
        return dependencies

    def _has_file_operations(self, node: ast.FunctionDef) -> bool:
        """Check for file operations."""
        file_patterns = ["open", "read", "write", "path", "file", "os.remove", "shutil"]
        for child in ast.walk(node):
            if isinstance(child, ast.Call):
                call_name = self._get_call_name(child).lower()
                if any(pattern in call_name for pattern in file_patterns):
                    return True
        return False

    def _has_network_operations(self, node: ast.FunctionDef) -> bool:
        """Check for network operations."""
        network_patterns = ["requests", "urllib", "http", "socket", "post", "get", "fetch"]
        for child in ast.walk(node):
            if isinstance(child, ast.Call):
                call_name = self._get_call_name(child).lower()
                if any(pattern in call_name for pattern in network_patterns):
                    return True
        return False

    def _has_subprocess_calls(self, node: ast.FunctionDef) -> bool:
        """Check for subprocess calls."""
        subprocess_patterns = ["subprocess", "os.system", "os.popen", "shell", "exec"]
        for child in ast.walk(node):
            if isinstance(child, ast.Call):
                call_name = self._get_call_name(child).lower()
                if any(pattern in call_name for pattern in subprocess_patterns):
                    return True
        return False

    def _has_eval_exec(self, node: ast.FunctionDef) -> bool:
        """Check for eval/exec calls."""
        for child in ast.walk(node):
            if isinstance(child, ast.Call):
                call_name = self._get_call_name(child)
                if call_name in ["eval", "exec", "compile", "__import__"]:
                    return True
        return False

    def _extract_string_literals(self, node: ast.FunctionDef) -> list[str]:
        """Extract all string literals from function."""
        literals = []
        for child in ast.walk(node):
            if isinstance(child, ast.Constant) and isinstance(child.value, str):
                literal = child.value[:200]
                if literal and literal not in literals:
                    literals.append(literal)
        return literals[:20]

    def _extract_return_expressions(self, node: ast.FunctionDef) -> list[str]:
        """Extract return expressions from function."""
        returns = []
        for child in ast.walk(node):
            if isinstance(child, ast.Return) and child.value:
                try:
                    return_expr = ast.unparse(child.value)[:100]
                    returns.append(return_expr)
                except (AttributeError, TypeError, ValueError):
                    returns.append("<unparseable>")
        return returns

    def _extract_exception_handlers(self, node: ast.FunctionDef) -> list[dict[str, Any]]:
        """Extract exception handling details."""
        handlers = []
        for child in ast.walk(node):
            if isinstance(child, ast.ExceptHandler):
                handler_info = {
                    "line": child.lineno,
                    "exception_type": ast.unparse(child.type) if child.type else "Exception",
                    "is_silent": len(child.body) == 1 and isinstance(child.body[0], ast.Pass),
                }
                handlers.append(handler_info)
        return handlers

    def _extract_env_var_access(self, node: ast.FunctionDef) -> list[str]:
        """Extract environment variable accesses."""
        env_accesses = []
        for child in ast.walk(node):
            if isinstance(child, ast.Call):
                call_name = self._get_call_name(child)
                if "environ" in call_name or "getenv" in call_name:
                    if child.args and isinstance(child.args[0], ast.Constant):
                        key = child.args[0].value
                        env_accesses.append(f"{call_name}('{key}')")
                    else:
                        env_accesses.append(call_name)
        return env_accesses

    def _extract_global_writes(self, node: ast.FunctionDef) -> list[dict[str, Any]]:
        """Extract global variable writes."""
        global_writes = []
        global_vars = set()

        for child in ast.walk(node):
            if isinstance(child, ast.Global):
                global_vars.update(child.names)

        for child in ast.walk(node):
            if isinstance(child, ast.Assign):
                for target in child.targets:
                    if isinstance(target, ast.Name) and target.id in global_vars:
                        try:
                            value_str = ast.unparse(child.value)[:100]
                        except (AttributeError, TypeError, ValueError):
                            value_str = "<complex>"
                        global_writes.append({"variable": target.id, "value": value_str, "line": child.lineno})

        return global_writes

    def _extract_attribute_access(self, node: ast.FunctionDef) -> list[dict[str, Any]]:
        """Extract attribute access patterns."""
        attribute_ops = []

        for child in ast.walk(node):
            if isinstance(child, ast.Assign):
                for target in child.targets:
                    if isinstance(target, ast.Attribute):
                        obj_name = ""
                        if isinstance(target.value, ast.Name):
                            obj_name = target.value.id
                        try:
                            value_str = ast.unparse(child.value)[:100]
                        except (AttributeError, TypeError, ValueError):
                            value_str = "<complex>"
                        attribute_ops.append(
                            {
                                "type": "write",
                                "object": obj_name,
                                "attribute": target.attr,
                                "value": value_str,
                                "line": child.lineno,
                            }
                        )

        return attribute_ops[:20]

    def _create_dataflow_summary(self, node: ast.FunctionDef) -> dict[str, Any]:
        """Create dataflow summary."""
        return {
            "total_statements": len([n for n in ast.walk(node) if isinstance(n, ast.stmt)]),
            "total_expressions": len([n for n in ast.walk(node) if isinstance(n, ast.expr)]),
            "complexity": self._calculate_complexity(node),
        }

    def _calculate_complexity(self, node: ast.FunctionDef) -> int:
        """Calculate cyclomatic complexity."""
        complexity = 1
        for child in ast.walk(node):
            if isinstance(child, (ast.If, ast.For, ast.While, ast.ExceptHandler)):
                complexity += 1
            elif isinstance(child, ast.BoolOp):
                complexity += len(child.values) - 1
        return complexity