prooflayer-runtime 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prooflayer/__init__.py +50 -0
- prooflayer/cli.py +362 -0
- prooflayer/config/__init__.py +6 -0
- prooflayer/config/allowlist.py +138 -0
- prooflayer/config/loader.py +29 -0
- prooflayer/detection/__init__.py +21 -0
- prooflayer/detection/engine.py +783 -0
- prooflayer/detection/models.py +49 -0
- prooflayer/detection/normalizer.py +245 -0
- prooflayer/detection/rules.py +104 -0
- prooflayer/detection/scanner.py +160 -0
- prooflayer/detection/scorer.py +65 -0
- prooflayer/detection/semantic.py +73 -0
- prooflayer/metrics.py +266 -0
- prooflayer/reporting/__init__.py +5 -0
- prooflayer/reporting/reporter.py +190 -0
- prooflayer/response/__init__.py +6 -0
- prooflayer/response/actions.py +152 -0
- prooflayer/response/killer.py +73 -0
- prooflayer/rules/command-injection.yaml +123 -0
- prooflayer/rules/data-exfiltration.yaml +83 -0
- prooflayer/rules/jailbreaks.yaml +67 -0
- prooflayer/rules/prompt-injection.yaml +99 -0
- prooflayer/rules/role-manipulation.yaml +60 -0
- prooflayer/rules/sql-injection.yaml +51 -0
- prooflayer/rules/ssrf-xxe.yaml +51 -0
- prooflayer/rules/tool-poisoning.yaml +46 -0
- prooflayer/runtime/__init__.py +21 -0
- prooflayer/runtime/interceptor.py +91 -0
- prooflayer/runtime/mcp_wrapper.py +395 -0
- prooflayer/runtime/middleware.py +86 -0
- prooflayer/runtime/transport.py +306 -0
- prooflayer/runtime/wrapper.py +265 -0
- prooflayer/utils/__init__.py +21 -0
- prooflayer/utils/encoding.py +87 -0
- prooflayer/utils/entropy.py +51 -0
- prooflayer/utils/logging.py +86 -0
- prooflayer/utils/masking.py +72 -0
- prooflayer/version.py +6 -0
- prooflayer_runtime-0.1.0.dist-info/METADATA +266 -0
- prooflayer_runtime-0.1.0.dist-info/RECORD +45 -0
- prooflayer_runtime-0.1.0.dist-info/WHEEL +5 -0
- prooflayer_runtime-0.1.0.dist-info/entry_points.txt +2 -0
- prooflayer_runtime-0.1.0.dist-info/licenses/LICENSE +4 -0
- prooflayer_runtime-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Detection models and dataclasses."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Optional, List, Dict, Any, Iterator
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class DetectionRule:
    """A single regex-based detection rule.

    The raw ``pattern`` is compiled once, case-insensitively and with ``.``
    matching newlines, when the instance is created. A rule whose pattern
    cannot be compiled is kept, but its ``compiled_pattern`` is ``None``.
    """

    id: str
    severity: str  # "low", "medium", "high", "critical"
    message: str
    pattern: str  # regex source text
    score: int
    category: str
    owasp: list = field(default_factory=list)
    # Always overwritten from ``pattern`` in __post_init__.
    compiled_pattern: Optional[re.Pattern] = None

    def __post_init__(self):
        """Compile ``pattern``; on failure log a warning and store ``None``."""
        try:
            self.compiled_pattern = re.compile(self.pattern, re.IGNORECASE | re.DOTALL)
        except (re.error, TypeError) as e:
            # re.error: syntactically invalid regex.
            # TypeError: ``pattern`` is not a string at all (for example a
            # number or null coming from a rule file). Both leave the rule
            # disabled instead of aborting construction.
            logger.warning("Failed to compile pattern for rule %s: %s", self.id, e)
            self.compiled_pattern = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class ScanResult:
    """Outcome of one detection-engine scan."""

    score: int
    level: str  # "SAFE", "SUSPICIOUS", "THREAT"
    action: str  # "ALLOW", "WARN", "BLOCK"
    matched_rules: List[DetectionRule] = field(default_factory=list)
    scoring_breakdown: Dict[str, int] = field(default_factory=dict)
    tool_name: str = ""
    arguments: Dict[str, Any] = field(default_factory=dict)
    timestamp: str = ""
    latency_ms: float = 0.0
    owasp_mapping: List[str] = field(default_factory=list)

    def __iter__(self) -> Iterator[Any]:
        """Yield ``score`` and then ``matched_rules``.

        Kept for backwards compatibility, so that older call sites can still
        unpack a result as ``score, rules = engine.scan(...)``.
        """
        for attr_name in ("score", "matched_rules"):
            yield getattr(self, attr_name)
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Input Normalization and Decoding Layer
|
|
3
|
+
=======================================
|
|
4
|
+
|
|
5
|
+
Pre-processes input text before regex matching to defeat evasion techniques:
|
|
6
|
+
- Case normalization
|
|
7
|
+
- Unicode homoglyph normalization
|
|
8
|
+
- Encoding decoding (hex, octal, unicode, URL, base64)
|
|
9
|
+
- Whitespace normalization
|
|
10
|
+
- Nested object flattening
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
import base64
|
|
15
|
+
import logging
|
|
16
|
+
import unicodedata
|
|
17
|
+
from typing import Any, Dict, List
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Lookup table from visually confusable Unicode characters (Cyrillic, Greek
# and fullwidth forms) to the ASCII letters they imitate, so that lookalike
# glyphs cannot be used to slip past the regex rules.
HOMOGLYPH_MAP = {
    # Cyrillic → Latin
    "\u0410": "A",  # А
    "\u0412": "B",  # В
    "\u0421": "C",  # С
    "\u0415": "E",  # Е
    "\u041d": "H",  # Н
    "\u041a": "K",  # К
    "\u041c": "M",  # М
    "\u041e": "O",  # О
    "\u0420": "P",  # Р
    "\u0422": "T",  # Т
    "\u0425": "X",  # Х
    "\u0430": "a",  # а
    "\u0435": "e",  # е
    "\u043e": "o",  # о
    "\u0440": "p",  # р
    "\u0441": "c",  # с
    "\u0443": "y",  # у
    "\u0445": "x",  # х
    "\u0455": "s",  # ѕ (Cyrillic small letter dze)
    "\u0456": "i",  # і (Cyrillic small letter byelorussian-ukrainian i)
    "\u0458": "j",  # ј
    "\u04bb": "h",  # һ
    "\u04c0": "l",  # Ӏ (Cyrillic letter palochka)
    # Greek → Latin
    "\u0391": "A",  # Α
    "\u0392": "B",  # Β
    "\u0395": "E",  # Ε
    "\u0397": "H",  # Η
    "\u0399": "I",  # Ι
    "\u039a": "K",  # Κ
    "\u039c": "M",  # Μ
    "\u039d": "N",  # Ν
    "\u039f": "O",  # Ο
    "\u03a1": "P",  # Ρ
    "\u03a4": "T",  # Τ
    "\u03a5": "Y",  # Υ
    "\u03a7": "X",  # Χ
    "\u03b1": "a",  # α (only when used as lookalike)
    "\u03bf": "o",  # ο
    # Fullwidth → ASCII. U+FF21..U+FF3A are the fullwidth capitals A-Z and
    # U+FF41..U+FF5A the fullwidth small letters a-z, each in alphabetical
    # order, so the entries are generated from the offset.
    **{chr(0xFF21 + offset): chr(ord("A") + offset) for offset in range(26)},
    **{chr(0xFF41 + offset): chr(ord("a") + offset) for offset in range(26)},
}
|
|
118
|
+
|
|
119
|
+
# Import decode functions from encoding module (shared with utils.encoding)
|
|
120
|
+
from ..utils.encoding import (
|
|
121
|
+
HEX_ESCAPE_RE as _HEX_ESCAPE_RE,
|
|
122
|
+
OCTAL_ESCAPE_RE as _OCTAL_ESCAPE_RE,
|
|
123
|
+
UNICODE_ESCAPE_RE as _UNICODE_ESCAPE_RE,
|
|
124
|
+
URL_ENCODE_RE as _URL_ENCODE_RE,
|
|
125
|
+
BASE64_RE as _BASE64_RE,
|
|
126
|
+
decode_hex_escapes,
|
|
127
|
+
decode_octal_escapes,
|
|
128
|
+
decode_unicode_escapes,
|
|
129
|
+
decode_url_encoding,
|
|
130
|
+
decode_base64_payloads,
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# Matches any run of one or more whitespace characters. \s already includes
# tab, newline and carriage return, so the explicit \t\n\r are redundant.
_WHITESPACE_RE = re.compile(r"[\s\t\n\r]+")
|
|
134
|
+
|
|
135
|
+
# Invisible formatting characters to strip before pattern matching. None of
# them renders as a glyph, so any of them can be placed inside a keyword to
# break a regex match without changing what a reader sees.
_ZERO_WIDTH_RE = re.compile(
    "["
    "\u00ad"  # soft hyphen
    "\u200b\u200c\u200d\u2060\ufeff"  # ZWS, ZWNJ, ZWJ, word joiner, BOM
    "\u200e\u200f"  # left-to-right / right-to-left marks
    "\u202a\u202b\u202c\u202d\u202e"  # bidi embeddings, overrides and pop
    "\u2066\u2067\u2068\u2069"  # bidi isolates and pop-isolate
    "]+"
)


def strip_zero_width(text: str) -> str:
    """Remove zero-width characters and bidirectional control characters.

    Args:
        text: Input string.

    Returns:
        ``text`` with every character matched by ``_ZERO_WIDTH_RE`` deleted;
        all other characters are left untouched.
    """
    return _ZERO_WIDTH_RE.sub("", text)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def normalize_path(text: str) -> str:
    """
    Normalize path-like sequences in text.

    - Collapses any run of ``/./`` segments to a single ``/``
      (``/a/././b`` becomes ``/a/b``).
    - Collapses repeated slashes to one, except directly after ``:`` so that
      ``protocol://`` is preserved.
    - Strips trailing slashes from path-like segments that are followed by
      whitespace or the end of the text (a standalone ``/`` is kept).
    - Does NOT resolve ``../`` (that changes semantics).

    Args:
        text: Text that may contain path-like sequences.

    Returns:
        The text with the rewrites above applied, in that order.
    """
    # Consume a whole "/././..." run in one match. With a plain "/\./"
    # pattern the slash that ends one "/./" is also the slash that would
    # start the next; re.sub only replaces non-overlapping matches, so every
    # second "./" would be left behind.
    text = re.sub(r"/(?:\./)+", "/", text)
    # Resolve // → / (but preserve protocol://)
    text = re.sub(r"(?<!:)//+", "/", text)
    # Strip trailing slashes from path-like segments (but not standalone /)
    text = re.sub(r"(/[a-zA-Z0-9._-]+)/+(?=\s|$)", r"\1", text)
    return text
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def normalize_unicode(text: str) -> str:
    """
    Map Unicode lookalike characters to their ASCII equivalents.

    The text is first put through NFKD normalization, which decomposes
    compatibility characters. Every resulting character that has an entry in
    ``HOMOGLYPH_MAP`` is then replaced by its mapped value; all other
    characters are kept as they are.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(HOMOGLYPH_MAP.get(ch, ch) for ch in decomposed)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def normalize_whitespace(text: str) -> str:
    """Collapse each whitespace run to one space and trim both ends."""
    collapsed = _WHITESPACE_RE.sub(" ", text)
    return collapsed.strip()
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def flatten_value(value: Any) -> List[str]:
    """
    Extract all string values from arbitrarily nested dicts/lists/tuples.

    Instead of str() which produces Python repr format (e.g. "{'key': 'value'}"),
    this extracts the actual string values for proper pattern matching.

    Strings are returned as-is, dict values (not keys) and list/tuple items
    are descended into in their natural order, ``None`` is dropped, and any
    other leaf is converted with ``str()``.

    The traversal uses an explicit stack rather than recursion, so nesting
    deeper than the interpreter's recursion limit is handled, and a container
    that (directly or indirectly) contains itself is visited only once per
    path instead of recursing without end.

    Args:
        value: Any value, typically one tool-call argument.

    Returns:
        The extracted strings, in depth-first, left-to-right order.
    """
    strings: List[str] = []
    # Each frame pairs an iterator over a container's children with the id()
    # of that container (None for the synthetic root frame).
    frames = [(iter((value,)), None)]
    # ids of the containers currently being walked, i.e. on the path from
    # the root to the current position; used to detect cycles.
    on_path = set()

    while frames:
        children, _ = frames[-1]
        for child in children:
            if isinstance(child, str):
                strings.append(child)
            elif isinstance(child, (dict, list, tuple)):
                if id(child) in on_path:
                    continue  # self-referential container: do not re-enter
                on_path.add(id(child))
                nested = child.values() if isinstance(child, dict) else child
                frames.append((iter(nested), id(child)))
                # Descend now; the parent's iterator keeps its position and
                # is resumed once this child frame is exhausted.
                break
            elif child is not None:
                strings.append(str(child))
        else:
            # Iterator exhausted without descending: this container is done.
            _, owner = frames.pop()
            on_path.discard(owner)

    return strings
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def flatten_arguments(arguments: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Flatten every argument value.

    Returns a new dict with the same keys as ``arguments``, in the same
    order, where each value is the list of strings that ``flatten_value``
    extracts from the corresponding argument.
    """
    return {name: flatten_value(raw) for name, raw in arguments.items()}
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def normalize_text(text: str) -> str:
    """
    Apply the full normalization pipeline to a single text string.

    Order matters:
    1. Strip zero-width / bidi control characters
    2. Base64 decoding (before lowercasing — base64 is case-sensitive)
    3. Unicode homoglyph normalization (before lowering, to map correctly)
    4. Encoding decoding (hex, octal, unicode escapes, URL)
    5. Second zero-width strip and homoglyph pass over the decoded text
    6. Path normalization
    7. Case normalization (lowercase)
    8. Whitespace normalization

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text.
    """
    text = strip_zero_width(text)
    text = decode_base64_payloads(text)
    text = normalize_unicode(text)
    text = decode_hex_escapes(text)
    text = decode_octal_escapes(text)
    text = decode_unicode_escapes(text)
    text = decode_url_encoding(text)
    # The decoders can introduce characters that did not exist when the first
    # two passes ran: an escaped or percent-encoded homoglyph or zero-width
    # character only becomes a literal character here. Re-running both passes
    # closes that gap; both are no-ops on text that is already clean.
    # NOTE(review): the decoders live in utils.encoding and are not visible
    # here — presumably they turn sequences such as "\\u0430" or "%D0%B0"
    # into the literal character; confirm against that module.
    text = strip_zero_width(text)
    text = normalize_unicode(text)
    text = normalize_path(text)
    text = text.lower()
    text = normalize_whitespace(text)
    return text
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Rule loader for YAML detection rules."""
|
|
2
|
+
|
|
3
|
+
import yaml # type: ignore[import-untyped]
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from .models import DetectionRule
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RuleLoadError(Exception):
    """Signals that detection rules could not be loaded.

    This is treated as a security-critical failure.
    """
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RuleLoader:
    """Load detection rules from YAML files."""

    @staticmethod
    def load_from_directory(rules_dir: str) -> List[DetectionRule]:
        """
        Load all ``*.yaml`` rule files directly inside a directory.

        Files are processed in sorted name order so that the resulting rule
        order is the same on every run and every filesystem. A file that
        fails to load is logged and skipped; loading continues with the
        remaining files.

        Args:
            rules_dir: Directory containing YAML rule files

        Returns:
            List of DetectionRule objects

        Raises:
            RuleLoadError: If the directory does not exist or no rules are loaded
        """
        rules: List[DetectionRule] = []
        rules_path = Path(rules_dir)
        load_errors: List[str] = []

        if not rules_path.exists():
            raise RuleLoadError(
                f"Rules directory not found: {rules_dir}. "
                "ProofLayer cannot start without detection rules."
            )

        # Path.glob yields entries in arbitrary order; sort for determinism.
        for yaml_file in sorted(rules_path.glob("*.yaml")):
            try:
                file_rules = RuleLoader.load_from_file(str(yaml_file))
            except Exception as e:
                # Deliberately broad: one unreadable or malformed file must
                # not stop the remaining rule files from loading. The error
                # is recorded and reported below.
                load_errors.append(f"{yaml_file}: {e}")
                logger.error("Failed to load rules from %s: %s", yaml_file, e)
            else:
                rules.extend(file_rules)
                logger.info("Loaded %d rules from %s", len(file_rules), yaml_file.name)

        if not rules:
            error_detail = ""
            if load_errors:
                error_detail = " Errors: " + "; ".join(load_errors)
            raise RuleLoadError(
                f"No detection rules loaded from {rules_dir}.{error_detail} "
                "ProofLayer cannot start without detection rules."
            )

        if load_errors:
            logger.warning(
                "Some rule files failed to load (%d errors), "
                "but %d rules were loaded successfully.",
                len(load_errors),
                len(rules),
            )

        return rules

    @staticmethod
    def load_from_file(file_path: str) -> List[DetectionRule]:
        """
        Load detection rules from a single YAML file.

        The file must contain a mapping with a ``rules`` list. Each entry
        needs ``id``, ``message`` and ``pattern``; ``severity``, ``score``,
        ``category`` and ``owasp`` fall back to defaults. An invalid entry is
        logged and skipped without affecting the other entries in the file.

        Args:
            file_path: Path to YAML file

        Returns:
            List of DetectionRule objects; empty if the file is empty or has
            no ``rules`` key.

        Raises:
            TypeError: If ``rules`` is present but is not a list.
            OSError: If the file cannot be opened.
        """
        # Explicit UTF-8: without it open() uses the platform's locale
        # encoding, which can fail on, or silently mis-decode, non-ASCII
        # characters in rule patterns.
        with open(file_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

        if not isinstance(data, dict) or "rules" not in data:
            return []

        entries = data["rules"]
        if not isinstance(entries, list):
            raise TypeError(
                f"'rules' in {file_path} must be a list, "
                f"got {type(entries).__name__}"
            )

        rules: List[DetectionRule] = []
        for rule_data in entries:
            if not isinstance(rule_data, dict):
                logger.warning(
                    "Skipping invalid rule in %s: expected a mapping, got %s",
                    file_path,
                    type(rule_data).__name__,
                )
                continue
            try:
                rule = DetectionRule(
                    id=rule_data["id"],
                    severity=rule_data.get("severity", "medium"),
                    message=rule_data["message"],
                    pattern=rule_data["pattern"],
                    score=rule_data.get("score", 10),
                    category=rule_data.get("category", "unknown"),
                    owasp=rule_data.get("owasp", []),
                )
            except KeyError as e:
                logger.warning("Skipping invalid rule in %s: missing %s", file_path, e)
            except TypeError as e:
                # For example a pattern that is not a string.
                logger.warning("Skipping invalid rule in %s: %s", file_path, e)
            else:
                rules.append(rule)

        return rules
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pattern Scanner
|
|
3
|
+
===============
|
|
4
|
+
|
|
5
|
+
Regex-based pattern matching with ReDoS protection for threat detection.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
from typing import List, Set, Tuple
|
|
11
|
+
|
|
12
|
+
from .models import DetectionRule
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# ReDoS protection constants
|
|
17
|
+
# Longest time, in seconds, that PatternScanner.match_rule waits for a single
# regex search before reporting it as timed out.
REGEX_TIMEOUT_SECONDS = 0.1 # 100ms
|
|
18
|
+
# Number of consecutive regex timeouts at which PatternScanner.scan_text
# stops scanning and returns what it has gathered so far.
REGEX_CIRCUIT_BREAKER_THRESHOLD = 3
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PatternScanner:
    """
    Scans text against detection rules using regex with ReDoS protection.

    Each regex search runs in a daemon thread that the caller waits on for at
    most REGEX_TIMEOUT_SECONDS. Both scan methods stop early once
    REGEX_CIRCUIT_BREAKER_THRESHOLD consecutive searches have timed out; the
    caller sees this through the returned timeout counter.
    """

    def match_rule(
        self, rule: DetectionRule, text: str
    ) -> Tuple[bool, bool]:
        """
        Match a single rule against text with ReDoS protection.

        Args:
            rule: Detection rule with a compiled regex pattern.
            text: Text to scan.

        Returns:
            (matched, timed_out). ``matched`` is False whenever the rule has
            no compiled pattern or the search timed out.
        """
        if not rule.compiled_pattern:
            return False, False

        # One-element list so the worker thread can hand its result back.
        matched_flag = [None]

        def _search():
            matched_flag[0] = rule.compiled_pattern.search(text) is not None

        t = threading.Thread(target=_search, daemon=True)
        t.start()
        t.join(timeout=REGEX_TIMEOUT_SECONDS)

        if t.is_alive():
            # The worker cannot be cancelled; it is abandoned and keeps
            # running in the background until the search finishes.
            # NOTE(review): CPython's re engine appears not to release the
            # GIL while matching, in which case this join may not return
            # until the search itself is done — confirm the timeout is
            # effective on the target interpreter.
            return False, True  # timed out

        return bool(matched_flag[0]), False

    def scan_text(
        self,
        rules: List[DetectionRule],
        text: str,
        consecutive_timeouts: int,
        lock: threading.Lock,
    ) -> Tuple[List[DetectionRule], int, Set[str], int]:
        """
        Scan text against all rules.

        Args:
            rules: List of detection rules to match.
            text: Normalized search text.
            consecutive_timeouts: Current consecutive timeout count.
            lock: Threading lock for timeout counter.

        Returns:
            (matched_rules, pattern_score, matched_rule_ids, updated_consecutive_timeouts)
            If the circuit breaker trips, scanning stops and the results
            gathered so far are returned; the returned counter is then at
            least REGEX_CIRCUIT_BREAKER_THRESHOLD.
        """
        matched_rules: List[DetectionRule] = []
        pattern_score = 0
        matched_rule_ids: Set[str] = set()

        for rule in rules:
            matched, timed_out = self.match_rule(rule, text)
            if timed_out:
                # The counter is a local int: the updated value reaches the
                # caller only through the return value.
                with lock:
                    consecutive_timeouts += 1
                    tripped = consecutive_timeouts >= REGEX_CIRCUIT_BREAKER_THRESHOLD
                logger.warning(
                    "Regex timeout for rule %s (consecutive: %d)",
                    rule.id,
                    consecutive_timeouts,
                )
                if tripped:
                    return matched_rules, pattern_score, matched_rule_ids, consecutive_timeouts
                continue

            # Any search that completes resets the streak.
            with lock:
                consecutive_timeouts = 0

            if matched:
                matched_rules.append(rule)
                matched_rule_ids.add(rule.id)
                pattern_score += rule.score
                logger.debug("Rule matched: %s (+%d points)", rule.id, rule.score)

        return matched_rules, pattern_score, matched_rule_ids, consecutive_timeouts

    def scan_cross_param(
        self,
        rules: List[DetectionRule],
        combined_variants: List[str],
        matched_rule_ids: Set[str],
        consecutive_timeouts: int,
        lock: threading.Lock,
    ) -> Tuple[List[DetectionRule], int, Set[str], int]:
        """
        Cross-parameter correlation scan.

        Runs every not-yet-matched rule against each combined text variant.
        As in ``scan_text``, scanning stops as soon as
        REGEX_CIRCUIT_BREAKER_THRESHOLD consecutive searches have timed out,
        so a hostile input cannot keep spawning search threads for every
        remaining rule and variant.

        Args:
            rules: List of detection rules.
            combined_variants: List of combined text variants to scan.
            matched_rule_ids: Already-matched rule IDs to skip. This set is
                updated in place with newly matched IDs.
            consecutive_timeouts: Current consecutive timeout count.
            lock: Threading lock for timeout counter.

        Returns:
            (new_matched_rules, additional_score, updated_matched_rule_ids, updated_consecutive_timeouts)
        """
        new_matched_rules: List[DetectionRule] = []
        additional_score = 0

        for combined_text in combined_variants:
            for rule in rules:
                if rule.id in matched_rule_ids or not rule.compiled_pattern:
                    continue

                matched, timed_out = self.match_rule(rule, combined_text)
                if timed_out:
                    with lock:
                        consecutive_timeouts += 1
                        tripped = consecutive_timeouts >= REGEX_CIRCUIT_BREAKER_THRESHOLD
                    logger.warning(
                        "Regex timeout for rule %s in cross-param (consecutive: %d)",
                        rule.id,
                        consecutive_timeouts,
                    )
                    if tripped:
                        return (
                            new_matched_rules,
                            additional_score,
                            matched_rule_ids,
                            consecutive_timeouts,
                        )
                    continue

                with lock:
                    consecutive_timeouts = 0

                if matched:
                    new_matched_rules.append(rule)
                    matched_rule_ids.add(rule.id)
                    additional_score += rule.score
                    logger.debug(
                        "Rule matched via cross-parameter correlation: "
                        "%s (+%d points)",
                        rule.id,
                        rule.score,
                    )

        return new_matched_rules, additional_score, matched_rule_ids, consecutive_timeouts
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Risk Scorer
|
|
3
|
+
============
|
|
4
|
+
|
|
5
|
+
Calculates composite risk scores from multiple detection signals.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Dict
|
|
10
|
+
|
|
11
|
+
from ..utils.entropy import calculate_shannon_entropy
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# Shell metacharacters that indicate potential command injection
|
|
16
|
+
# RiskScorer.calculate_risk adds 10 points for every entry found in the text.
# Entries are tested independently by substring, so text containing '||'
# also contains '|' and is counted for both.
DANGEROUS_CHARS = [';', '|', '&&', '||', '`', '$', '>', '<']
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class RiskScorer:
    """
    Combines four signals into one risk score:
    pattern matching, shell metacharacters, entropy, and semantic analysis.
    """

    def calculate_risk(
        self,
        pattern_score: int,
        search_text: str,
        semantic_score: int,
    ) -> Dict[str, int]:
        """
        Calculate composite risk score.

        Every entry of DANGEROUS_CHARS present in ``search_text`` adds 10
        points, and a Shannon entropy above 4.5 adds a flat 20 points. These
        are summed with the two scores passed in.

        Args:
            pattern_score: Sum of matched rule scores.
            search_text: Normalized text for metachar/entropy analysis.
            semantic_score: Score from semantic analysis.

        Returns:
            Dict with pattern_score, metachar_score, entropy_score,
            semantic_score, total_score, and risk_score (capped at 100).
        """
        found = [token for token in DANGEROUS_CHARS if token in search_text]
        for token in found:
            logger.debug("Dangerous char '%s' detected (+10 points)", token)
        metachar_score = 10 * len(found)

        entropy = calculate_shannon_entropy(search_text)
        if entropy > 4.5:
            entropy_score = 20
            logger.debug("High entropy detected: %.2f (+20 points)", entropy)
        else:
            entropy_score = 0

        total_score = sum((pattern_score, metachar_score, entropy_score, semantic_score))

        breakdown = {
            "pattern_score": pattern_score,
            "metachar_score": metachar_score,
            "entropy_score": entropy_score,
            "semantic_score": semantic_score,
            "total_score": total_score,
        }
        breakdown["risk_score"] = total_score if total_score < 100 else 100
        return breakdown
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic Analyzer
|
|
3
|
+
=================
|
|
4
|
+
|
|
5
|
+
Parameter-level semantic validation for MCP tool calls.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Dict, Any
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SemanticAnalyzer:
    """
    Detects semantic mismatches in tool call arguments.

    For example, a "hostname" parameter should not contain URLs or shell commands.
    """

    # Parameter-name fragments (matched by substring on the lowercased name).
    _HOST_KEYWORDS = ("hostname", "server", "host", "endpoint", "target", "address")
    _NUMERIC_KEYWORDS = ("port", "timeout", "count", "limit")
    _PATH_KEYWORDS = ("path", "file")

    # Value fragments (matched by substring on the lowercased value).
    _URL_SCHEMES = (
        "http://", "https://", "ftp://",
        "ssh://", "file://", "data://", "gopher://",
    )

    # Command names, matched as whole words (see _contains_command).
    _COMMANDS = frozenset({
        "curl", "wget", "bash", "sh",
        "nc", "netcat", "python", "perl", "ruby", "ncat",
    })

    @staticmethod
    def _contains_command(value: str) -> bool:
        """Return True if ``value`` contains a command name as a whole word.

        The value is split on every character that is neither alphanumeric
        nor an underscore, and trailing ASCII digits are dropped from each
        word so that versioned names such as ``python3`` still count. Plain
        substring matching is avoided on purpose: short names like ``sh`` and
        ``nc`` occur inside ordinary words ("dashboard", "sync").
        """
        separated = "".join(
            ch if (ch.isalnum() or ch == "_") else " " for ch in value
        )
        return any(
            word.rstrip("0123456789") in SemanticAnalyzer._COMMANDS
            for word in separated.split()
        )

    def analyze(self, tool_name: str, arguments: Dict[str, Any]) -> int:
        """
        Run semantic analysis on tool call arguments.

        Each parameter is checked against four independent rules, selected
        by fragments of its lowercased name; one parameter can trigger
        several of them:

        - host-like name with a URL scheme in the value: +15
        - name containing "id" with a command word in the value: +20
        - numeric-like name with a command word in the value: +15
        - path/file-like name with ``|`` or ``;`` in the value: +20

        Args:
            tool_name: Name of the MCP tool being called (not used by the
                current checks).
            arguments: Tool call arguments dict.

        Returns:
            Semantic risk score (0+).
        """
        score = 0

        for param_name, param_value in arguments.items():
            # str(): tolerate non-string keys instead of failing on .lower().
            param_lower = str(param_name).lower()
            param_value_str = str(param_value).lower()
            has_command = self._contains_command(param_value_str)

            # Hostname/server/endpoint should not contain URLs
            if any(kw in param_lower for kw in self._HOST_KEYWORDS):
                if any(proto in param_value_str for proto in self._URL_SCHEMES):
                    score += 15
                    logger.debug("Semantic mismatch: hostname/server contains URL (+15 points)")

            # IDs should be numeric or alphanumeric, not commands.
            # NOTE: the name test is a plain substring test, so it also
            # selects names such as "width" or "video".
            if "id" in param_lower and has_command:
                score += 20
                logger.debug("Semantic mismatch: ID contains command (+20 points)")

            # Numeric params containing non-numeric content with commands
            if has_command and any(kw in param_lower for kw in self._NUMERIC_KEYWORDS):
                score += 15
                logger.debug("Semantic mismatch: numeric param contains command (+15 points)")

            # Path/file params containing pipe or semicolon
            if any(kw in param_lower for kw in self._PATH_KEYWORDS):
                if "|" in param_value_str or ";" in param_value_str:
                    score += 20
                    logger.debug("Semantic mismatch: path/file param contains pipe or semicolon (+20 points)")

        return score
|