prooflayer-runtime 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. prooflayer/__init__.py +50 -0
  2. prooflayer/cli.py +362 -0
  3. prooflayer/config/__init__.py +6 -0
  4. prooflayer/config/allowlist.py +138 -0
  5. prooflayer/config/loader.py +29 -0
  6. prooflayer/detection/__init__.py +21 -0
  7. prooflayer/detection/engine.py +783 -0
  8. prooflayer/detection/models.py +49 -0
  9. prooflayer/detection/normalizer.py +245 -0
  10. prooflayer/detection/rules.py +104 -0
  11. prooflayer/detection/scanner.py +160 -0
  12. prooflayer/detection/scorer.py +65 -0
  13. prooflayer/detection/semantic.py +73 -0
  14. prooflayer/metrics.py +266 -0
  15. prooflayer/reporting/__init__.py +5 -0
  16. prooflayer/reporting/reporter.py +190 -0
  17. prooflayer/response/__init__.py +6 -0
  18. prooflayer/response/actions.py +152 -0
  19. prooflayer/response/killer.py +73 -0
  20. prooflayer/rules/command-injection.yaml +123 -0
  21. prooflayer/rules/data-exfiltration.yaml +83 -0
  22. prooflayer/rules/jailbreaks.yaml +67 -0
  23. prooflayer/rules/prompt-injection.yaml +99 -0
  24. prooflayer/rules/role-manipulation.yaml +60 -0
  25. prooflayer/rules/sql-injection.yaml +51 -0
  26. prooflayer/rules/ssrf-xxe.yaml +51 -0
  27. prooflayer/rules/tool-poisoning.yaml +46 -0
  28. prooflayer/runtime/__init__.py +21 -0
  29. prooflayer/runtime/interceptor.py +91 -0
  30. prooflayer/runtime/mcp_wrapper.py +395 -0
  31. prooflayer/runtime/middleware.py +86 -0
  32. prooflayer/runtime/transport.py +306 -0
  33. prooflayer/runtime/wrapper.py +265 -0
  34. prooflayer/utils/__init__.py +21 -0
  35. prooflayer/utils/encoding.py +87 -0
  36. prooflayer/utils/entropy.py +51 -0
  37. prooflayer/utils/logging.py +86 -0
  38. prooflayer/utils/masking.py +72 -0
  39. prooflayer/version.py +6 -0
  40. prooflayer_runtime-0.1.0.dist-info/METADATA +266 -0
  41. prooflayer_runtime-0.1.0.dist-info/RECORD +45 -0
  42. prooflayer_runtime-0.1.0.dist-info/WHEEL +5 -0
  43. prooflayer_runtime-0.1.0.dist-info/entry_points.txt +2 -0
  44. prooflayer_runtime-0.1.0.dist-info/licenses/LICENSE +4 -0
  45. prooflayer_runtime-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,49 @@
1
+ """Detection models and dataclasses."""
2
+
3
+ import re
4
+ import logging
5
+ from typing import Optional, List, Dict, Any, Iterator
6
+ from dataclasses import dataclass, field
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
@dataclass
class DetectionRule:
    """One regex-based detection rule.

    ``pattern`` holds the raw regular expression text. It is compiled in
    ``__post_init__``; if compilation fails a warning is logged and
    ``compiled_pattern`` ends up as ``None``.
    """

    id: str
    severity: str  # "low", "medium", "high", "critical"
    message: str
    pattern: str
    score: int
    category: str
    owasp: list = field(default_factory=list)
    compiled_pattern: Optional[re.Pattern] = None

    def __post_init__(self):
        """Compile ``pattern`` case-insensitively with ``.`` matching newlines."""
        regex_flags = re.IGNORECASE | re.DOTALL
        try:
            compiled = re.compile(self.pattern, regex_flags)
        except re.error as exc:
            logger.warning(f"Failed to compile pattern for rule {self.id}: {exc}")
            compiled = None
        # Whatever was passed for compiled_pattern is always replaced here.
        self.compiled_pattern = compiled
30
+
31
+
32
@dataclass
class ScanResult:
    """Outcome of one detection engine scan."""

    score: int
    level: str  # "SAFE", "SUSPICIOUS", "THREAT"
    action: str  # "ALLOW", "WARN", "BLOCK"
    matched_rules: List[DetectionRule] = field(default_factory=list)
    scoring_breakdown: Dict[str, int] = field(default_factory=dict)
    tool_name: str = ""
    arguments: Dict[str, Any] = field(default_factory=dict)
    timestamp: str = ""
    latency_ms: float = 0.0
    owasp_mapping: List[str] = field(default_factory=list)

    def __iter__(self) -> Iterator[Any]:
        """Yield ``score`` then ``matched_rules``.

        Kept for backwards compatibility so that
        ``score, rules = engine.scan(...)`` still unpacks.
        """
        yield from (self.score, self.matched_rules)
@@ -0,0 +1,245 @@
1
+ """
2
+ Input Normalization and Decoding Layer
3
+ =======================================
4
+
5
+ Pre-processes input text before regex matching to defeat evasion techniques:
6
+ - Case normalization
7
+ - Unicode homoglyph normalization
8
+ - Encoding decoding (hex, octal, unicode, URL, base64)
9
+ - Whitespace normalization
10
+ - Nested object flattening
11
+ """
12
+
13
+ import re
14
+ import base64
15
+ import logging
16
+ import unicodedata
17
+ from typing import Any, Dict, List
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Lookup table from visually confusable Unicode characters (Cyrillic, Greek
# and fullwidth forms) to the ASCII letter they imitate. Used to undo
# lookalike substitutions before regex matching.
HOMOGLYPH_MAP = {
    # Cyrillic capitals -> Latin
    "\u0410": "A", "\u0412": "B", "\u0421": "C", "\u0415": "E",
    "\u041d": "H", "\u041a": "K", "\u041c": "M", "\u041e": "O",
    "\u0420": "P", "\u0422": "T", "\u0425": "X",
    # Cyrillic small letters -> Latin
    "\u0430": "a", "\u0435": "e", "\u043e": "o", "\u0440": "p",
    "\u0441": "c", "\u0443": "y", "\u0445": "x",
    "\u0455": "s",  # Cyrillic small letter dze
    "\u0456": "i",  # Cyrillic small letter byelorussian-ukrainian i
    "\u0458": "j",  # Cyrillic small letter je
    "\u04bb": "h",  # Cyrillic small letter shha
    "\u04c0": "l",  # Cyrillic letter palochka
    # Greek capitals -> Latin
    "\u0391": "A", "\u0392": "B", "\u0395": "E", "\u0397": "H",
    "\u0399": "I", "\u039a": "K", "\u039c": "M", "\u039d": "N",
    "\u039f": "O", "\u03a1": "P", "\u03a4": "T", "\u03a5": "Y",
    "\u03a7": "X",
    # Greek small letters -> Latin
    "\u03b1": "a",  # alpha (only when used as lookalike)
    "\u03bf": "o",  # omicron
    # Fullwidth A-Z (U+FF21..U+FF3A) and a-z (U+FF41..U+FF5A) -> ASCII
    **{chr(0xFF21 + offset): chr(ord("A") + offset) for offset in range(26)},
    **{chr(0xFF41 + offset): chr(ord("a") + offset) for offset in range(26)},
}
118
+
119
+ # Import decode functions from encoding module (shared with utils.encoding)
120
+ from ..utils.encoding import (
121
+ HEX_ESCAPE_RE as _HEX_ESCAPE_RE,
122
+ OCTAL_ESCAPE_RE as _OCTAL_ESCAPE_RE,
123
+ UNICODE_ESCAPE_RE as _UNICODE_ESCAPE_RE,
124
+ URL_ENCODE_RE as _URL_ENCODE_RE,
125
+ BASE64_RE as _BASE64_RE,
126
+ decode_hex_escapes,
127
+ decode_octal_escapes,
128
+ decode_unicode_escapes,
129
+ decode_url_encoding,
130
+ decode_base64_payloads,
131
+ )
132
+
133
+ _WHITESPACE_RE = re.compile(r"[\s\t\n\r]+")
134
+
135
+ # Zero-width and bidirectional override characters to strip
136
+ _ZERO_WIDTH_RE = re.compile(
137
+ "[\u200b\u200c\u200d\u2060\ufeff" # ZWS, ZWNJ, ZWJ, word joiner, BOM
138
+ "\u202a\u202b\u202c\u202d\u202e" # bidi overrides
139
+ "]+"
140
+ )
141
+
142
+
143
def strip_zero_width(text: str) -> str:
    """Return ``text`` with every character matched by ``_ZERO_WIDTH_RE`` removed."""
    cleaned = _ZERO_WIDTH_RE.sub("", text)
    return cleaned
146
+
147
+
148
def normalize_path(text: str) -> str:
    """
    Normalize path-like sequences in text.

    Resolves /./ → / (including repeated segments such as /././)
    Resolves // → / (protocol separators like http:// are preserved)
    Strips trailing slashes from path-like segments.
    Does NOT resolve ../ (that changes semantics).

    Args:
        text: Arbitrary text that may contain path-like sequences.

    Returns:
        The text with the rewrites above applied.
    """
    # Collapse a whole run of "./" segments per match. Substituting a plain
    # "/./" misses overlapping occurrences: in "/a/././b" the slash that ends
    # the first match is the one that would start the second, so "/a/./b"
    # was left behind.
    text = re.sub(r"/(?:\./)+", "/", text)
    # Collapse repeated slashes unless they directly follow ':' (protocol://).
    text = re.sub(r"(?<!:)//+", "/", text)
    # Drop trailing slashes after a named segment (a standalone "/" is kept).
    text = re.sub(r"(/[a-zA-Z0-9._-]+)/+(?=\s|$)", r"\1", text)
    return text
164
+
165
+
166
def normalize_unicode(text: str) -> str:
    """
    Normalize Unicode homoglyphs to ASCII equivalents.

    Applies NFKD decomposition first (handles fullwidth and other
    compatibility forms), drops the combining marks that decomposition
    leaves behind, then applies the explicit HOMOGLYPH_MAP for lookalikes
    that survive NFKD.

    Args:
        text: Text to normalize.

    Returns:
        The normalized text.
    """
    decomposed = unicodedata.normalize("NFKD", text)

    # NFKD turns an accented letter into its base letter followed by a
    # combining mark (e.g. "ú" -> "u" + U+0301). Left in place, that mark sits
    # between two letters and stops an ASCII pattern such as "curl" from
    # matching "cúrl", so characters with a non-zero combining class are
    # discarded.
    return "".join(
        HOMOGLYPH_MAP.get(char, char)
        for char in decomposed
        if not unicodedata.combining(char)
    )
185
+
186
+
187
def normalize_whitespace(text: str) -> str:
    """Collapse each whitespace run to one space and trim both ends."""
    collapsed = _WHITESPACE_RE.sub(" ", text)
    return collapsed.strip()
190
+
191
+
192
def flatten_value(value: Any) -> List[str]:
    """
    Collect every leaf of a nested dict/list/tuple structure as a string.

    Strings are kept as-is, dict values and list/tuple items are walked
    recursively (dict keys are not included), ``None`` contributes nothing,
    and any other leaf is converted with ``str()``. This avoids matching
    against the Python repr of a container (e.g. "{'key': 'value'}").
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [value]

    if isinstance(value, dict):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        # Scalar leaf (int, float, bool, ...).
        return [str(value)]

    collected: List[str] = []
    for child in children:
        collected += flatten_value(child)
    return collected
211
+
212
+
213
def flatten_arguments(arguments: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Map each parameter name to the strings ``flatten_value`` extracts
    from that parameter's value.
    """
    return {name: flatten_value(raw) for name, raw in arguments.items()}
222
+
223
+
224
def normalize_text(text: str) -> str:
    """
    Run the complete normalization pipeline over one string.

    Order matters; the stages run exactly as listed:
    1. Zero-width / bidi control character stripping
    2. Base64 decoding (base64 is case-sensitive, must decode before lowering)
    3. Unicode homoglyph normalization (before lowering, to map correctly)
    4. Escape decoding (hex, octal, unicode, URL)
    5. Path normalization
    6. Case normalization (lowercase)
    7. Whitespace normalization
    """
    pipeline = (
        strip_zero_width,
        decode_base64_payloads,
        normalize_unicode,
        decode_hex_escapes,
        decode_octal_escapes,
        decode_unicode_escapes,
        decode_url_encoding,
        normalize_path,
        str.lower,
        normalize_whitespace,
    )
    for stage in pipeline:
        text = stage(text)
    return text
@@ -0,0 +1,104 @@
1
+ """Rule loader for YAML detection rules."""
2
+
3
+ import yaml # type: ignore[import-untyped]
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ from .models import DetectionRule
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class RuleLoadError(Exception):
    """Detection rules could not be loaded.

    This is a security-critical error.
    """
16
+
17
+
18
class RuleLoader:
    """Load detection rules from YAML files."""

    @staticmethod
    def load_from_directory(rules_dir: str) -> List[DetectionRule]:
        """
        Load all YAML rule files from directory.

        Files matching ``*.yaml`` are read in sorted order so the resulting
        rule order is the same on every run. A file that fails to load is
        logged and skipped; loading only fails as a whole when no rule at all
        could be loaded.

        Args:
            rules_dir: Directory containing YAML rule files

        Returns:
            List of DetectionRule objects

        Raises:
            RuleLoadError: If the directory does not exist or no rules are loaded
        """
        rules_path = Path(rules_dir)
        if not rules_path.exists():
            raise RuleLoadError(
                f"Rules directory not found: {rules_dir}. "
                "ProofLayer cannot start without detection rules."
            )

        rules: List[DetectionRule] = []
        load_errors: List[str] = []

        # glob() yields entries in filesystem order; sort for determinism.
        for yaml_file in sorted(rules_path.glob("*.yaml")):
            try:
                file_rules = RuleLoader.load_from_file(str(yaml_file))
            except Exception as e:
                # Best effort: one broken file must not hide the others.
                load_errors.append(f"{yaml_file}: {e}")
                logger.error(f"Failed to load rules from {yaml_file}: {e}")
            else:
                rules.extend(file_rules)
                logger.info(f"Loaded {len(file_rules)} rules from {yaml_file.name}")

        if not rules:
            error_detail = ""
            if load_errors:
                error_detail = " Errors: " + "; ".join(load_errors)
            raise RuleLoadError(
                f"No detection rules loaded from {rules_dir}.{error_detail} "
                "ProofLayer cannot start without detection rules."
            )

        if load_errors:
            logger.warning(
                f"Some rule files failed to load ({len(load_errors)} errors), "
                f"but {len(rules)} rules were loaded successfully."
            )

        return rules

    @staticmethod
    def load_from_file(file_path: str) -> List[DetectionRule]:
        """
        Load detection rules from a single YAML file.

        Entries that are not mappings, or that lack one of the required keys
        (``id``, ``message``, ``pattern``), are logged and skipped so that a
        single malformed entry does not discard the rest of the file.

        Args:
            file_path: Path to YAML file

        Returns:
            List of DetectionRule objects (empty when the file has no
            ``rules`` key or the key is empty)
        """
        # Decode explicitly as UTF-8 instead of the platform default encoding,
        # so rule patterns read the same on every system.
        with open(file_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

        if not data or "rules" not in data:
            return []

        rules = []
        # A bare "rules:" key parses to None; treat it as an empty list.
        for rule_data in data["rules"] or []:
            if not isinstance(rule_data, dict):
                logger.warning(
                    f"Skipping invalid rule in {file_path}: "
                    f"expected a mapping, got {type(rule_data).__name__}"
                )
                continue
            try:
                rule = DetectionRule(
                    id=rule_data["id"],
                    severity=rule_data.get("severity", "medium"),
                    message=rule_data["message"],
                    pattern=rule_data["pattern"],
                    score=rule_data.get("score", 10),
                    category=rule_data.get("category", "unknown"),
                    owasp=rule_data.get("owasp", []),
                )
            except KeyError as e:
                logger.warning(f"Skipping invalid rule in {file_path}: missing {e}")
            else:
                rules.append(rule)

        return rules
@@ -0,0 +1,160 @@
1
+ """
2
+ Pattern Scanner
3
+ ===============
4
+
5
+ Regex-based pattern matching with ReDoS protection for threat detection.
6
+ """
7
+
8
+ import logging
9
+ import threading
10
+ from typing import List, Set, Tuple
11
+
12
+ from .models import DetectionRule
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# ReDoS protection constants
REGEX_TIMEOUT_SECONDS = 0.1  # 100ms
REGEX_CIRCUIT_BREAKER_THRESHOLD = 3


class PatternScanner:
    """
    Runs detection rules against text while bounding the wait per regex.

    Every search is started on its own daemon thread and waited on for at
    most REGEX_TIMEOUT_SECONDS. ``scan_text`` stops early once the number of
    consecutive timeouts reaches REGEX_CIRCUIT_BREAKER_THRESHOLD.
    """

    def match_rule(
        self, rule: DetectionRule, text: str
    ) -> Tuple[bool, bool]:
        """
        Search ``text`` with one rule, giving up after the timeout.

        Args:
            rule: Detection rule; rules without a compiled pattern never match.
            text: Text to scan.

        Returns:
            (matched, timed_out)
        """
        if not rule.compiled_pattern:
            return False, False

        outcome = {}

        def _worker():
            outcome["hit"] = rule.compiled_pattern.search(text) is not None

        worker = threading.Thread(target=_worker, daemon=True)
        worker.start()
        worker.join(timeout=REGEX_TIMEOUT_SECONDS)

        # NOTE(review): a worker that is still alive is not stopped here; it
        # keeps running in the background. Whether join() can return on time
        # while the regex engine is busy depends on the interpreter — confirm.
        if worker.is_alive():
            return False, True  # timed out

        # "hit" is absent if the worker ended without storing a result.
        return bool(outcome.get("hit")), False

    def scan_text(
        self,
        rules: List[DetectionRule],
        text: str,
        consecutive_timeouts: int,
        lock: threading.Lock,
    ) -> Tuple[List[DetectionRule], int, Set[str], int]:
        """
        Evaluate every rule against ``text``.

        Args:
            rules: List of detection rules to match.
            text: Normalized search text.
            consecutive_timeouts: Current consecutive timeout count.
            lock: Threading lock held while the timeout counter is updated.

        Returns:
            (matched_rules, pattern_score, matched_rule_ids, updated_consecutive_timeouts)
            If the circuit breaker trips, the results gathered so far are
            returned immediately together with the counter that tripped it.
        """
        hits: List[DetectionRule] = []
        hit_ids: Set[str] = set()
        score_total = 0

        for rule in rules:
            matched, timed_out = self.match_rule(rule, text)

            if timed_out:
                with lock:
                    consecutive_timeouts += 1
                logger.warning(
                    "Regex timeout for rule %s (consecutive: %d)",
                    rule.id,
                    consecutive_timeouts,
                )
                with lock:
                    tripped = consecutive_timeouts >= REGEX_CIRCUIT_BREAKER_THRESHOLD
                if tripped:
                    return hits, score_total, hit_ids, consecutive_timeouts
                continue

            # Any completed search breaks the streak of timeouts.
            with lock:
                consecutive_timeouts = 0

            if not matched:
                continue

            hits.append(rule)
            hit_ids.add(rule.id)
            score_total += rule.score
            logger.debug("Rule matched: %s (+%d points)", rule.id, rule.score)

        return hits, score_total, hit_ids, consecutive_timeouts

    def scan_cross_param(
        self,
        rules: List[DetectionRule],
        combined_variants: List[str],
        matched_rule_ids: Set[str],
        consecutive_timeouts: int,
        lock: threading.Lock,
    ) -> Tuple[List[DetectionRule], int, Set[str], int]:
        """
        Cross-parameter correlation scan.

        Runs every not-yet-matched rule against each combined text variant.
        ``matched_rule_ids`` is updated in place and also returned.

        Args:
            rules: List of detection rules.
            combined_variants: List of combined text variants to scan.
            matched_rule_ids: Already-matched rule IDs to skip.
            consecutive_timeouts: Current consecutive timeout count.
            lock: Threading lock held while the timeout counter is updated.

        Returns:
            (new_matched_rules, additional_score, updated_matched_rule_ids, updated_consecutive_timeouts)
        """
        fresh_hits: List[DetectionRule] = []
        extra_score = 0

        # NOTE(review): unlike scan_text, this loop never compares the counter
        # with REGEX_CIRCUIT_BREAKER_THRESHOLD; presumably the caller inspects
        # the returned count — confirm.
        for combined_text in combined_variants:
            for rule in rules:
                if rule.id in matched_rule_ids or not rule.compiled_pattern:
                    continue

                matched, timed_out = self.match_rule(rule, combined_text)

                if timed_out:
                    with lock:
                        consecutive_timeouts += 1
                    logger.warning(
                        "Regex timeout for rule %s in cross-param (consecutive: %d)",
                        rule.id,
                        consecutive_timeouts,
                    )
                    continue

                with lock:
                    consecutive_timeouts = 0

                if not matched:
                    continue

                fresh_hits.append(rule)
                matched_rule_ids.add(rule.id)
                extra_score += rule.score
                logger.debug(
                    "Rule matched via cross-parameter correlation: "
                    "%s (+%d points)",
                    rule.id,
                    rule.score,
                )

        return fresh_hits, extra_score, matched_rule_ids, consecutive_timeouts
@@ -0,0 +1,65 @@
1
+ """
2
+ Risk Scorer
3
+ ============
4
+
5
+ Calculates composite risk scores from multiple detection signals.
6
+ """
7
+
8
+ import logging
9
+ from typing import Dict
10
+
11
+ from ..utils.entropy import calculate_shannon_entropy
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# Shell metacharacters that indicate potential command injection
DANGEROUS_CHARS = [';', '|', '&&', '||', '`', '$', '>', '<']


class RiskScorer:
    """
    Combines four signals into one risk score: pattern matches, shell
    metacharacters, entropy, and semantic analysis.
    """

    def calculate_risk(
        self,
        pattern_score: int,
        search_text: str,
        semantic_score: int,
    ) -> Dict[str, int]:
        """
        Calculate composite risk score.

        Each entry of DANGEROUS_CHARS found in ``search_text`` adds 10 points;
        a Shannon entropy above 4.5 adds 20 points.

        Args:
            pattern_score: Sum of matched rule scores.
            search_text: Normalized text for metachar/entropy analysis.
            semantic_score: Score from semantic analysis.

        Returns:
            Dict with pattern_score, metachar_score, entropy_score,
            semantic_score, total_score, and risk_score (capped at 100).
        """
        present = [token for token in DANGEROUS_CHARS if token in search_text]
        for token in present:
            logger.debug("Dangerous char '%s' detected (+10 points)", token)
        metachar_score = 10 * len(present)

        entropy = calculate_shannon_entropy(search_text)
        is_high_entropy = entropy > 4.5
        entropy_score = 20 if is_high_entropy else 0
        if is_high_entropy:
            logger.debug("High entropy detected: %.2f (+20 points)", entropy)

        total_score = pattern_score + metachar_score + entropy_score + semantic_score

        breakdown = {
            "pattern_score": pattern_score,
            "metachar_score": metachar_score,
            "entropy_score": entropy_score,
            "semantic_score": semantic_score,
            "total_score": total_score,
            "risk_score": min(total_score, 100),
        }
        return breakdown
@@ -0,0 +1,73 @@
1
+ """
2
+ Semantic Analyzer
3
+ =================
4
+
5
+ Parameter-level semantic validation for MCP tool calls.
6
+ """
7
+
8
+ import logging
9
+ from typing import Dict, Any
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class SemanticAnalyzer:
    """
    Detects semantic mismatches in tool call arguments.

    For example, a "hostname" parameter should not contain URLs or shell commands.
    """

    # Fragments of a (lowercased) parameter name that select each check.
    _HOST_KEYWORDS = ("hostname", "server", "host", "endpoint", "target", "address")
    _NUMERIC_KEYWORDS = ("port", "timeout", "count", "limit")
    _PATH_KEYWORDS = ("path", "file")

    # URL scheme prefixes that do not belong in a host-like parameter.
    _URL_SCHEMES = (
        "http://", "https://", "ftp://",
        "ssh://", "file://", "data://", "gopher://",
    )

    # Command names looked for as whole words. The shells beyond "bash"/"sh"
    # (and "ssh") are listed explicitly because they used to be caught only as
    # a side effect of the substring "sh".
    _COMMAND_WORDS = frozenset({
        "curl", "wget",
        "bash", "sh", "zsh", "ksh", "csh", "dash", "pwsh", "powershell", "ssh",
        "nc", "ncat", "netcat",
        "python", "perl", "ruby",
    })

    @classmethod
    def _mentions_command(cls, value: str) -> bool:
        """Return True if ``value`` contains a known command as a whole word."""
        # Split on everything that is not a letter or digit, so "/bin/sh",
        # "$(curl" and "nc;" each yield the bare command word.
        words = "".join(ch if ch.isalnum() else " " for ch in value).split()
        # Trailing digits are ignored so versioned names ("python3") count.
        return any(word.rstrip("0123456789") in cls._COMMAND_WORDS for word in words)

    def analyze(self, tool_name: str, arguments: Dict[str, Any]) -> int:
        """
        Run semantic analysis on tool call arguments.

        Command names are matched as whole words rather than substrings, so
        benign values such as "josh" (contains "sh") or "once" (contains "nc")
        no longer raise the score.

        Args:
            tool_name: Name of the MCP tool being called (not used by the
                current checks).
            arguments: Tool call arguments dict.

        Returns:
            Semantic risk score (0+).
        """
        score = 0

        for param_name, param_value in arguments.items():
            param_lower = param_name.lower()
            param_value_str = str(param_value).lower()
            has_command = self._mentions_command(param_value_str)

            # Hostname/server/endpoint should not contain URLs
            if any(kw in param_lower for kw in self._HOST_KEYWORDS):
                if any(proto in param_value_str for proto in self._URL_SCHEMES):
                    score += 15
                    logger.debug("Semantic mismatch: hostname/server contains URL (+15 points)")

            # IDs should be numeric or alphanumeric, not commands.
            # NOTE(review): "id" is a substring test, so names such as "video"
            # or "width" are treated as ID parameters too — confirm intended.
            if "id" in param_lower:
                if has_command:
                    score += 20
                    logger.debug("Semantic mismatch: ID contains command (+20 points)")

            # Numeric params containing non-numeric content with commands
            if any(kw in param_lower for kw in self._NUMERIC_KEYWORDS):
                if has_command:
                    score += 15
                    logger.debug("Semantic mismatch: numeric param contains command (+15 points)")

            # Path/file params containing pipe or semicolon
            if any(kw in param_lower for kw in self._PATH_KEYWORDS):
                if "|" in param_value_str or ";" in param_value_str:
                    score += 20
                    logger.debug("Semantic mismatch: path/file param contains pipe or semicolon (+20 points)")

        return score