skeleton-key-http 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skeleton_key/__init__.py +92 -0
- skeleton_key/__main__.py +105 -0
- skeleton_key/_paths.py +45 -0
- skeleton_key/detect.py +313 -0
- skeleton_key/engine.py +399 -0
- skeleton_key/handlers.py +183 -0
- skeleton_key/registry/__init__.py +99 -0
- skeleton_key/registry/data/contrib_example.yaml +36 -0
- skeleton_key/registry/data/picks.yaml +65 -0
- skeleton_key/registry/data/schema.json +110 -0
- skeleton_key/registry/data/tumblers.yaml +180 -0
- skeleton_key/resilience/__init__.py +634 -0
- skeleton_key/schema.py +155 -0
- skeleton_key/shims/__init__.py +535 -0
- skeleton_key/template_pick.py +71 -0
- skeleton_key/transport/__init__.py +629 -0
- skeleton_key_http-0.1.0.dist-info/METADATA +120 -0
- skeleton_key_http-0.1.0.dist-info/RECORD +21 -0
- skeleton_key_http-0.1.0.dist-info/WHEEL +4 -0
- skeleton_key_http-0.1.0.dist-info/entry_points.txt +2 -0
- skeleton_key_http-0.1.0.dist-info/licenses/LICENSE +21 -0
skeleton_key/__init__.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""skeleton_key — the canonical, declarative registry that formalizes the lockpicking model.
|
|
2
|
+
|
|
3
|
+
Layered architecture (each layer's responsibility starts where the one below stops):
|
|
4
|
+
|
|
5
|
+
skeleton_key.transport MECHANISM the KEY: impersonate-only transport (TLS/JA3), cookie cache
|
|
6
|
+
resilient_fetch RESILIENCE identity rotation, egress Pool, browser fallback
|
|
7
|
+
skeleton_key (this) REGISTRY the FORMAT: tumblers (detection rules) + picks (manifests)
|
|
8
|
+
+ the derived tumbler->pick matrix + the drive loop
|
|
9
|
+
|
|
10
|
+
The registry is language-agnostic YAML (registry/*.yaml, contract in registry/schema.json);
|
|
11
|
+
this Python package is one engine for it. Detection is a Sigma-style match-DSL; picks are
|
|
12
|
+
manifests bound to per-language handlers. The matrix is derived, never hand-written.
|
|
13
|
+
|
|
14
|
+
Quick start:
|
|
15
|
+
from skeleton_key import open_door
|
|
16
|
+
result = open_door("https://www.cloudflare.com/cdn-cgi/trace")
|
|
17
|
+
print(result.opened, result.opened_by, result.resp.status_code)
|
|
18
|
+
|
|
19
|
+
# introspect the registry / matrix:
|
|
20
|
+
from skeleton_key import get_registry
|
|
21
|
+
reg = get_registry()
|
|
22
|
+
for tid, picks in reg.matrix.items():
|
|
23
|
+
print(tid, "->", [p.id for p in picks])
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from .detect import (
|
|
29
|
+
KNOWN_OPS,
|
|
30
|
+
FormatError,
|
|
31
|
+
View,
|
|
32
|
+
detect,
|
|
33
|
+
evaluate,
|
|
34
|
+
is_blocked,
|
|
35
|
+
native_matcher,
|
|
36
|
+
validate_node,
|
|
37
|
+
)
|
|
38
|
+
from .engine import (
|
|
39
|
+
OpenResult,
|
|
40
|
+
Registry,
|
|
41
|
+
get_registry,
|
|
42
|
+
load_registry,
|
|
43
|
+
open_door,
|
|
44
|
+
resolve,
|
|
45
|
+
)
|
|
46
|
+
from .handlers import HANDLERS, handler
|
|
47
|
+
from .schema import (
|
|
48
|
+
Attempt,
|
|
49
|
+
Category,
|
|
50
|
+
Cost,
|
|
51
|
+
Kind,
|
|
52
|
+
Pick,
|
|
53
|
+
PickContext,
|
|
54
|
+
Severity,
|
|
55
|
+
Tumbler,
|
|
56
|
+
TumblerHit,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
__version__ = "0.1.0"
|
|
60
|
+
|
|
61
|
+
__all__ = [
|
|
62
|
+
# format types
|
|
63
|
+
"Category",
|
|
64
|
+
"Severity",
|
|
65
|
+
"Cost",
|
|
66
|
+
"Kind",
|
|
67
|
+
"Tumbler",
|
|
68
|
+
"Pick",
|
|
69
|
+
"TumblerHit",
|
|
70
|
+
"PickContext",
|
|
71
|
+
"Attempt",
|
|
72
|
+
# detection
|
|
73
|
+
"detect",
|
|
74
|
+
"is_blocked",
|
|
75
|
+
"evaluate",
|
|
76
|
+
"validate_node",
|
|
77
|
+
"View",
|
|
78
|
+
"native_matcher",
|
|
79
|
+
"KNOWN_OPS",
|
|
80
|
+
# engine
|
|
81
|
+
"Registry",
|
|
82
|
+
"OpenResult",
|
|
83
|
+
"load_registry",
|
|
84
|
+
"get_registry",
|
|
85
|
+
"resolve",
|
|
86
|
+
"open_door",
|
|
87
|
+
# handler binding
|
|
88
|
+
"HANDLERS",
|
|
89
|
+
"handler",
|
|
90
|
+
# errors
|
|
91
|
+
"FormatError",
|
|
92
|
+
]
|
skeleton_key/__main__.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""skeleton_key.__main__ — a thin command-line front door.
|
|
2
|
+
|
|
3
|
+
python -m skeleton_key <url> [--matrix] [--no-shims] [--no-browser] [-v]
|
|
4
|
+
|
|
5
|
+
Drives the full honest ladder (the shim-aware ``open_door``: Key -> wrenches ->
|
|
6
|
+
picks -> identity rotation -> browser, then alternate-route shims ONLY if the
|
|
7
|
+
front door is confirmed stuck) and prints whether the door opened, what opened
|
|
8
|
+
it, the HTTP status, and — when an alternate route was used — its fidelity
|
|
9
|
+
label (source + fidelity class + freshness).
|
|
10
|
+
|
|
11
|
+
Exit code is 0 when the door opened, non-zero when it is still locked, so the
|
|
12
|
+
command composes in shell pipelines.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _run(args: argparse.Namespace) -> int:
    """Open ``args.url``, print a short report, and return the exit code.

    When ``args.matrix`` is set, the ``open_door`` exported by ``skeleton_key``
    is used. Otherwise the ``open_door`` from ``skeleton_key.shims`` is used,
    with ``args.no_browser`` / ``args.no_shims`` inverted into its
    ``use_browser`` / ``allow_shims`` keywords.

    Returns:
        0 when the door opened, 2 when it did not.
    """
    target = args.url

    if args.matrix:
        # Advanced: the registry's matrix-only loop (no shims, no rotation).
        from skeleton_key import open_door as registry_open_door  # registry variant

        outcome = registry_open_door(target, verbose=args.verbose)
        http_status = getattr(outcome.resp, "status_code", None)
        if not outcome.opened:
            print("opened: no")
            print("opened_by: -")
            print(f"status: {http_status}")
            return 2
        print("opened: yes")
        print(f"opened_by: {outcome.opened_by or 'front door (already open)'}")
        print(f"status: {http_status}")
        return 0

    # Default: the canonical shim-aware ladder.
    from skeleton_key.shims import AllDoorsStuck, open_door

    ladder_options = {
        "use_browser": not args.no_browser,
        "allow_shims": not args.no_shims,
        "verbose": args.verbose,
    }
    try:
        door = open_door(target, **ladder_options)
    except AllDoorsStuck as stuck:
        print("opened: no")
        print("opened_by: -")
        print(f"reason: {stuck}")
        return 2

    http_status = getattr(door.resp, "status_code", None)

    # A shim only counts when the door says it was used AND a result is attached.
    shim = door.shim_result if door.via_shim else None
    if shim is not None:
        opener = f"shim:{shim.shim}"
    else:
        opener = door.label()  # "front door (picks, <backend>)"

    print("opened: yes")
    print(f"opened_by: {opener}")
    print(f"status: {http_status}")
    if shim is not None:
        # Honest fidelity label: source + fidelity class + freshness.
        print(f"fidelity: {shim.label()}")
    return 0
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments and run the door-opening routine.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` lets argparse
            read the process arguments.

    Returns:
        The exit code from ``_run``, or 130 when interrupted from the keyboard.
    """
    cli = argparse.ArgumentParser(
        prog="Skeleton Key",
        description="Open a door (URL) with an authentic browser key, honestly.",
    )
    cli.add_argument("url", help="the URL / door to open")

    # The three long-only switches share one shape; declare them from a table
    # (order is preserved so the generated help text is unchanged).
    switches = (
        ("--matrix", "use the registry's matrix-only loop (advanced; no rotation or shims)"),
        ("--no-shims", "disable last-resort alternate-route shims (front door only)"),
        ("--no-browser", "disable the browser fallback pass"),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    cli.add_argument(
        "-v", "--verbose", action="store_true", help="print ladder progress to stderr"
    )
    parsed = cli.parse_args(argv)

    try:
        exit_code = _run(parsed)
    except KeyboardInterrupt:  # pragma: no cover
        print("interrupted", file=sys.stderr)
        return 130
    return exit_code
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__": # pragma: no cover
|
|
105
|
+
raise SystemExit(main())
|
skeleton_key/_paths.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""skeleton_key._paths — cache-path parameterization.
|
|
2
|
+
|
|
3
|
+
The engine persists a little state to disk: a cookie cache (so one solved
|
|
4
|
+
challenge survives across CLI invocations and threads) and a persistent
|
|
5
|
+
browser-profile dir (so a warmed identity sticks across browser fallbacks).
|
|
6
|
+
|
|
7
|
+
Both are resolved through one helper so the package writes nowhere
|
|
8
|
+
surprising and the location stays overridable:
|
|
9
|
+
|
|
10
|
+
cache_dir() -> os.environ["SKELETON_KEY_CACHE"] if set,
|
|
11
|
+
else platformdirs.user_cache_dir("skeleton_key").
|
|
12
|
+
|
|
13
|
+
The directory is created on demand (exist_ok).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
|
|
20
|
+
import platformdirs
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def cache_dir() -> str:
    """Return the base directory for on-disk state, creating it if needed.

    The ``SKELETON_KEY_CACHE`` environment variable wins when it is set to a
    non-empty value; otherwise ``platformdirs.user_cache_dir("skeleton_key")``
    is used.
    """
    root = os.environ.get("SKELETON_KEY_CACHE") or platformdirs.user_cache_dir("skeleton_key")
    os.makedirs(root, exist_ok=True)
    return root
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def cookie_cache_path() -> str:
    """Return the location of the cookie cache JSON file inside ``cache_dir()``."""
    base = cache_dir()
    return os.path.join(base, "skeleton_cookies.json")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def browser_profile_dir() -> str:
    """Return the browser profile directory under ``cache_dir()``, creating it if absent."""
    profile = os.path.join(cache_dir(), "patchright-profile")
    os.makedirs(profile, exist_ok=True)
    return profile
|
skeleton_key/detect.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""skeleton_key.detect — the predicate-DSL evaluator + unified tumbler detector.
|
|
2
|
+
|
|
3
|
+
A tumbler's `match:` is a tree of primitive predicates combined by any/all/not. This is
|
|
4
|
+
the SAME idea as Sigma / YARA: a language-neutral signature format with an engine in each
|
|
5
|
+
language. The semantics below ARE the contract — any reimplementation (Rust, Go, JS) that
|
|
6
|
+
evaluates a match-tree the same way is interoperable with the YAML registry verbatim.
|
|
7
|
+
|
|
8
|
+
A match-node is a dict carrying exactly ONE key (an operator or a primitive):
|
|
9
|
+
|
|
10
|
+
Combinators
|
|
11
|
+
all: [node, node, ...] every child must hold (AND)
|
|
12
|
+
any: [node, node, ...] at least one child holds (OR)
|
|
13
|
+
not: node the child must NOT hold (NOT)
|
|
14
|
+
|
|
15
|
+
Primitives (evaluated against a response View — status, headers, body)
|
|
16
|
+
status: [403, 429] status code in this set (int or list)
|
|
17
|
+
status_range: [500, 599] low <= status <= high (inclusive)
|
|
18
|
+
header_present: ["cf-mitigated"] any of these header names present
|
|
19
|
+
header_absent: ["x-foo"] none of these header names present
|
|
20
|
+
header_equals: {server: cloudflare} header value == (case-insensitive)
|
|
21
|
+
header_contains: {set-cookie: "datadome="} header value contains substring (ci)
|
|
22
|
+
body_contains: ["just a moment"] any substring present in body (case-insensitive)
|
|
23
|
+
body_regex: "ray id:\\s*[0-9a-f]" regex search over body (re.I)
|
|
24
|
+
body_len_lt: 1500 len(body bytes) < N
|
|
25
|
+
body_len_gte: 1500 len(body bytes) >= N
|
|
26
|
+
content_truncated: 0.6 body shorter than content-length * ratio
|
|
27
|
+
host_suffix: ["reddit.com"] response URL's host == / endswith any of these
|
|
28
|
+
url_contains: ["/api/"] response URL contains any of these substrings
|
|
29
|
+
native: "fn_name" ESCAPE HATCH -> a registered Python matcher.
|
|
30
|
+
Non-portable; use only for irreducible heuristics.
|
|
31
|
+
|
|
32
|
+
Header names are matched case-insensitively (the View lowercases them). Body text is the
|
|
33
|
+
first 64 KiB decoded utf-8 (replace); body_contains and body_regex are case-insensitive.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import re
|
|
39
|
+
from collections.abc import Callable
|
|
40
|
+
from urllib.parse import urlparse
|
|
41
|
+
|
|
42
|
+
from .schema import Tumbler, TumblerHit
|
|
43
|
+
|
|
44
|
+
_BODY_CAP = 65536 # first 64 KiB is plenty for any wall's interstitial/markers
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class FormatError(ValueError):
    """Raised when a registry entry or match-tree does not conform to the format."""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# native matcher escape hatch — register a Python predicate by name
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
# Name -> Python predicate, looked up by the `native` primitive.
NATIVE_MATCHERS: dict[str, Callable[[View], bool]] = {}


def native_matcher(name: str):
    """Return a decorator that registers its target under `name` in NATIVE_MATCHERS.

    The decorated function is stored as-is and returned unchanged, so it stays
    directly callable. Registering the same name again replaces the earlier entry.
    """

    def register(fn: Callable[[View], bool]):
        NATIVE_MATCHERS.update({name: fn})
        return fn

    return register
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
# View — a normalized, cheap-to-query snapshot of a Resp
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
class View:
    """Normalized response facts the predicates read. Built once per detect() call.

    Attributes (all set in ``__init__``):
        status: the status code coerced with ``int()``.
        headers: the given headers with every key lowercased via ``str(k).lower()``.
        raw_len: ``len()`` of the full body bytes (not capped).
        content_length: the ``content-length`` header as an int, or None when it
            is missing, empty, or not a plain ASCII digit string.
        url: the response URL ("" when absent).
        host: lowercased hostname parsed from ``url`` ("" when there is none).
    """

    __slots__ = ("status", "headers", "raw_len", "_text", "_lower", "content_length", "url", "host")

    def __init__(self, status: int, headers: dict, content: bytes, url: str = ""):
        self.status = int(status)
        # headers arrive lowercased from skeleton_key.transport; normalize defensively anyway.
        self.headers = {str(k).lower(): v for k, v in (headers or {}).items()}
        body = content or b""
        self.raw_len = len(body)
        # Only the first _BODY_CAP bytes are decoded; invalid utf-8 is replaced, never raised.
        self._text = body[:_BODY_CAP].decode("utf-8", "replace")
        self._lower = self._text.lower()
        self.content_length = self._parse_content_length(self.headers.get("content-length"))
        self.url = url or ""
        self.host = (urlparse(self.url).hostname or "").lower()

    @staticmethod
    def _parse_content_length(raw) -> int | None:
        """Parse a Content-Length header value; return None when it is unusable.

        ``str.isdigit()`` alone is true for characters such as superscript
        digits ("²") that ``int()`` rejects with ValueError, which would abort
        View construction on a malformed header. Requiring ASCII as well keeps
        the check and the conversion in agreement.
        """
        if not raw:
            return None
        text = str(raw)
        if text.isascii() and text.isdigit():
            return int(text)
        return None

    @property
    def text(self) -> str:
        """The decoded (capped) body text, original case."""
        return self._text

    @property
    def lower(self) -> str:
        """The decoded (capped) body text, lowercased once at construction."""
        return self._lower

    @classmethod
    def of(cls, resp) -> View:
        """Build a View from a response object.

        Reads ``resp.status_code`` and ``resp.content`` directly; ``headers``
        and ``url`` are read with ``getattr`` and default to empty.
        """
        return cls(
            resp.status_code,
            dict(getattr(resp, "headers", {}) or {}),
            resp.content or b"",
            getattr(resp, "url", "") or "",
        )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
# primitive evaluators — each takes (value_from_yaml, View) -> bool
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
def _as_list(v):
    """Return `v` unchanged if it is a list or tuple; otherwise wrap it in a one-item list."""
    if isinstance(v, (list, tuple)):
        return v
    return [v]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _p_status(v, view: View) -> bool:
    """`status`: true when the view's status equals any of the given codes (int or list)."""
    current = view.status
    accepted = {int(code) for code in _as_list(v)}
    return current in accepted
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _p_status_range(v, view: View) -> bool:
    """`status_range`: true when v[0] <= status <= v[1] (both bounds inclusive)."""
    low = int(v[0])
    high = int(v[1])
    code = view.status
    return not (code < low or code > high)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _p_header_present(v, view: View) -> bool:
    """`header_present`: true when at least one named header exists (names are lowercased)."""
    for name in _as_list(v):
        if name.lower() in view.headers:
            return True
    return False
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _p_header_absent(v, view: View) -> bool:
    """`header_absent`: true when none of the named headers exist (names are lowercased)."""
    return not any(name.lower() in view.headers for name in _as_list(v))
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _p_header_equals(v, view: View) -> bool:
    """`header_equals`: every listed header must equal its expected value, ignoring case.

    A missing or empty header is compared as the empty string.
    """
    for name, expected in v.items():
        actual = (view.headers.get(name.lower(), "") or "").lower()
        if actual != str(expected).lower():
            return False
    return True
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _p_header_contains(v, view: View) -> bool:
    """`header_contains`: every listed header must contain its substring, ignoring case.

    A missing or empty header is searched as the empty string.
    """
    for name, fragment in v.items():
        needle = str(fragment).lower()
        haystack = (view.headers.get(name.lower(), "") or "").lower()
        if needle not in haystack:
            return False
    return True
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _p_body_contains(v, view: View) -> bool:
    """`body_contains`: true when any given substring occurs in the lowercased body text."""
    for fragment in _as_list(v):
        if str(fragment).lower() in view.lower:
            return True
    return False
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Pattern source -> compiled (case-insensitive) pattern, filled lazily.
_regex_cache: dict[str, re.Pattern] = {}


def _p_body_regex(v, view: View) -> bool:
    """`body_regex`: true when the pattern (compiled with re.I) is found in the body text."""
    try:
        compiled = _regex_cache[v]
    except KeyError:
        compiled = re.compile(v, re.I)
        _regex_cache[v] = compiled
    return compiled.search(view.text) is not None
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _p_body_len_lt(v, view: View) -> bool:
    """`body_len_lt`: true when the raw body length is strictly below the threshold."""
    size = view.raw_len
    limit = int(v)
    return size < limit
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _p_body_len_gte(v, view: View) -> bool:
    """`body_len_gte`: true when the raw body length is at or above the threshold."""
    size = view.raw_len
    limit = int(v)
    return size >= limit
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _p_content_truncated(v, view: View) -> bool:
    """`content_truncated`: true when the body is shorter than content-length * ratio.

    Always false when the view has no usable content-length.
    """
    declared = view.content_length
    if declared is None:
        return False
    return view.raw_len < declared * float(v)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _p_host_suffix(v, view: View) -> bool:
    """`host_suffix`: true when the host equals a given name or ends with "." + that name."""
    for candidate in _as_list(v):
        host = view.host
        suffix = candidate.lower()
        if host == suffix or host.endswith("." + suffix):
            return True
    return False
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _p_url_contains(v, view: View) -> bool:
    """`url_contains`: true when any given substring occurs in the URL, ignoring case."""
    for fragment in _as_list(v):
        if str(fragment).lower() in view.url.lower():
            return True
    return False
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _p_native(v, view: View) -> bool:
    """`native`: call the matcher registered under `v` and coerce its result to bool.

    Raises:
        FormatError: no matcher is registered under that name.
    """
    matcher = NATIVE_MATCHERS.get(v)
    if matcher is not None:
        return bool(matcher(view))
    raise FormatError(f"native matcher {v!r} is not registered")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
_PRIMITIVES: dict[str, Callable] = {
|
|
184
|
+
"status": _p_status,
|
|
185
|
+
"status_range": _p_status_range,
|
|
186
|
+
"header_present": _p_header_present,
|
|
187
|
+
"header_absent": _p_header_absent,
|
|
188
|
+
"header_equals": _p_header_equals,
|
|
189
|
+
"header_contains": _p_header_contains,
|
|
190
|
+
"body_contains": _p_body_contains,
|
|
191
|
+
"body_regex": _p_body_regex,
|
|
192
|
+
"body_len_lt": _p_body_len_lt,
|
|
193
|
+
"body_len_gte": _p_body_len_gte,
|
|
194
|
+
"content_truncated": _p_content_truncated,
|
|
195
|
+
"host_suffix": _p_host_suffix,
|
|
196
|
+
"url_contains": _p_url_contains,
|
|
197
|
+
"native": _p_native,
|
|
198
|
+
}
|
|
199
|
+
_COMBINATORS = {"all", "any", "not"}
|
|
200
|
+
KNOWN_OPS = set(_PRIMITIVES) | _COMBINATORS
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def evaluate(node, view: View) -> bool:
    """Evaluate one match-node against a View.

    Combinators (`all`, `any`, `not`) recurse into their children; any other
    key is looked up in the primitive table and called with (value, view).

    Raises:
        FormatError: `node` is not a single-key dict, or its key is not a
            known combinator or primitive.
    """
    if not (isinstance(node, dict) and len(node) == 1):
        raise FormatError(f"match-node must be a dict with exactly one key, got: {node!r}")
    ((op, val),) = node.items()

    if op in ("all", "any"):
        combine = all if op == "all" else any
        return combine(evaluate(child, view) for child in val)
    if op == "not":
        return not evaluate(val, view)

    predicate = _PRIMITIVES.get(op)
    if predicate is not None:
        return predicate(val, view)
    raise FormatError(f"unknown predicate {op!r} (known: {sorted(KNOWN_OPS)})")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _check(cond, msg) -> None:
    """Raise FormatError(msg) unless `cond` is truthy."""
    if cond:
        return
    raise FormatError(msg)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _str_or_strlist(v) -> bool:
    """True for a string, or for a non-empty list/tuple whose items are all strings."""
    if isinstance(v, str):
        return True
    if not isinstance(v, (list, tuple)) or not v:
        return False
    return all(isinstance(item, str) for item in v)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _validate_primitive(op, val) -> None:
    """Type-check a primitive's value SHAPE so a malformed rule fails at LOAD, not mid-detect.

    Args:
        op: the primitive name (the single key of a match-node).
        val: the value that key carries in the registry.

    Raises:
        FormatError: `val` does not have the shape `op` requires.

    An `op` with no branch below is accepted without checks.
    """
    if op == "status":
        # bool is a subclass of int; `status: true` must not pass as a status code.
        _check(
            _is_plain_int(val)
            or (
                isinstance(val, (list, tuple))
                and bool(val)
                and all(_is_plain_int(x) for x in val)
            ),
            f"{op!r} needs an int or non-empty list of ints, got {val!r}",
        )
    elif op == "status_range":
        _check(
            isinstance(val, (list, tuple))
            and len(val) == 2
            and all(_is_plain_int(x) for x in val),
            f"status_range needs [low, high] ints, got {val!r}",
        )
    elif op in ("header_present", "header_absent", "body_contains", "host_suffix", "url_contains"):
        _check(
            _str_or_strlist(val), f"{op!r} needs a string or non-empty list of strings, got {val!r}"
        )
    elif op in ("header_equals", "header_contains"):
        _check(
            isinstance(val, dict)
            and val
            and all(isinstance(k, str) and isinstance(v, str) for k, v in val.items()),
            f"{op!r} needs a non-empty string->string mapping, got {val!r}",
        )
    elif op in ("body_regex", "native"):
        _check(isinstance(val, str) and val, f"{op!r} needs a non-empty string, got {val!r}")
        if op == "body_regex":
            try:
                # Compile with the same flag the body_regex evaluator uses (re.I),
                # so load-time validation checks the pattern actually run at detect time.
                re.compile(val, re.I)
            except re.error as e:
                raise FormatError(f"body_regex is not a valid regex ({e}): {val!r}") from e
    elif op in ("body_len_lt", "body_len_gte"):
        _check(_is_plain_int(val), f"{op!r} needs an int, got {val!r}")
    elif op == "content_truncated":
        _check(
            isinstance(val, (int, float)) and not isinstance(val, bool) and 0 < float(val) <= 1,
            f"content_truncated needs a number in (0, 1], got {val!r}",
        )


def _is_plain_int(x) -> bool:
    """True for an int that is not a bool (bool is a subclass of int)."""
    return isinstance(x, int) and not isinstance(x, bool)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def validate_node(node) -> None:
    """Statically check a match-tree; no View is needed.

    Verifies the one-key-dict structure at every level, that `all`/`any` carry
    a non-empty list of nodes, and that each primitive's value has the right
    shape.

    Raises:
        FormatError: the tree is malformed or names an unknown predicate.
    """
    if not (isinstance(node, dict) and len(node) == 1):
        raise FormatError(f"match-node must be a dict with exactly one key, got: {node!r}")
    ((op, val),) = node.items()

    if op in ("all", "any"):
        if not (isinstance(val, list) and val):
            raise FormatError(f"{op!r} needs a non-empty list of nodes")
        for child in val:
            validate_node(child)
        return

    if op == "not":
        validate_node(val)
        return

    if op not in _PRIMITIVES:
        raise FormatError(f"unknown predicate {op!r} (known: {sorted(KNOWN_OPS)})")
    _validate_primitive(op, val)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# ---------------------------------------------------------------------------
|
|
295
|
+
# detection — run tumblers (severity-ordered) against a response
|
|
296
|
+
# ---------------------------------------------------------------------------
|
|
297
|
+
def detect(resp, tumblers: list[Tumbler]) -> TumblerHit | None:
    """Return a TumblerHit for the first tumbler whose match fires on `resp`, else None.

    The tumblers are tried in the order given and the first match wins, so
    callers are expected to pass them already severity-ordered; that is what
    lets a specific high-severity wall take priority over a generic rule that
    would also match. The hit's reason is "<category value>:<tumbler id>".
    """
    view = View.of(resp)
    winner = next((t for t in tumblers if evaluate(t.match, view)), None)
    if winner is None:
        return None
    return TumblerHit(tumbler=winner, reason=f"{winner.category.value}:{winner.id}")
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def is_blocked(resp, tumblers: list[Tumbler]) -> tuple[bool, str]:
    """Compatibility wrapper over detect(): (True, reason) on a hit, else (False, "ok")."""
    hit = detect(resp, tumblers)
    if hit:
        return (True, hit.reason)
    return (False, "ok")
|