skeleton-key-http 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,92 @@
1
+ """skeleton_key — the canonical, declarative registry that formalizes the lockpicking model.
2
+
3
+ Layered architecture (each layer's responsibility starts where the one below stops):
4
+
5
+ skeleton_key.transport MECHANISM the KEY: impersonate-only transport (TLS/JA3), cookie cache
6
+ resilient_fetch RESILIENCE identity rotation, egress Pool, browser fallback
7
+ skeleton_key (this) REGISTRY the FORMAT: tumblers (detection rules) + picks (manifests)
8
+ + the derived tumbler->pick matrix + the drive loop
9
+
10
+ The registry is language-agnostic YAML (registry/*.yaml, contract in registry/schema.json);
11
+ this Python package is one engine for it. Detection is a Sigma-style match-DSL; picks are
12
+ manifests bound to per-language handlers. The matrix is derived, never hand-written.
13
+
14
+ Quick start:
15
+ from skeleton_key import open_door
16
+ result = open_door("https://www.cloudflare.com/cdn-cgi/trace")
17
+ print(result.opened, result.opened_by, result.resp.status_code)
18
+
19
+ # introspect the registry / matrix:
20
+ from skeleton_key import get_registry
21
+ reg = get_registry()
22
+ for tid, picks in reg.matrix.items():
23
+ print(tid, "->", [p.id for p in picks])
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from .detect import (
29
+ KNOWN_OPS,
30
+ FormatError,
31
+ View,
32
+ detect,
33
+ evaluate,
34
+ is_blocked,
35
+ native_matcher,
36
+ validate_node,
37
+ )
38
+ from .engine import (
39
+ OpenResult,
40
+ Registry,
41
+ get_registry,
42
+ load_registry,
43
+ open_door,
44
+ resolve,
45
+ )
46
+ from .handlers import HANDLERS, handler
47
+ from .schema import (
48
+ Attempt,
49
+ Category,
50
+ Cost,
51
+ Kind,
52
+ Pick,
53
+ PickContext,
54
+ Severity,
55
+ Tumbler,
56
+ TumblerHit,
57
+ )
58
+
59
# Package version string.
__version__ = "0.1.0"

# Explicit public API. Every name listed here is imported at the top of this
# module from .detect, .engine, .handlers or .schema; the inline comments group
# the names by role.
__all__ = [
    # format types
    "Category",
    "Severity",
    "Cost",
    "Kind",
    "Tumbler",
    "Pick",
    "TumblerHit",
    "PickContext",
    "Attempt",
    # detection
    "detect",
    "is_blocked",
    "evaluate",
    "validate_node",
    "View",
    "native_matcher",
    "KNOWN_OPS",
    # engine
    "Registry",
    "OpenResult",
    "load_registry",
    "get_registry",
    "resolve",
    "open_door",
    # handler binding
    "HANDLERS",
    "handler",
    # errors
    "FormatError",
]
@@ -0,0 +1,105 @@
1
+ """skeleton_key.__main__ — a thin command-line front door.
2
+
3
+ python -m skeleton_key <url> [--matrix] [--no-shims] [--no-browser] [-v]
4
+
5
+ Drives the full honest ladder (the shim-aware ``open_door``: Key -> wrenches ->
6
+ picks -> identity rotation -> browser, then alternate-route shims ONLY if the
7
+ front door is confirmed stuck) and prints whether the door opened, what opened
8
+ it, the HTTP status, and — when an alternate route was used — its fidelity
9
+ label (source + fidelity class + freshness).
10
+
11
+ Exit code is 0 when the door opened, non-zero when it is still locked, so the
12
+ command composes in shell pipelines.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import sys
19
+
20
+
21
def _run(args: argparse.Namespace) -> int:
    """Open ``args.url`` and print the outcome to stdout.

    With ``args.matrix`` set, the registry's matrix-only ``open_door`` is used;
    otherwise the shim-aware ``open_door`` from ``skeleton_key.shims`` runs with
    the browser fallback and shims toggled by ``args.no_browser`` /
    ``args.no_shims``.

    Returns:
        0 when the door opened, 2 when it is still locked.
    """
    target = args.url

    def report(opened: bool, opened_by: object, detail: str) -> int:
        # The three lines every outcome prints, in a fixed order.
        print(f"opened: {'yes' if opened else 'no'}")
        print(f"opened_by: {opened_by}")
        print(detail)
        return 0 if opened else 2

    if args.matrix:
        # Advanced: the registry's matrix-only loop (no shims, no rotation).
        from skeleton_key import open_door as matrix_open_door  # registry variant

        outcome = matrix_open_door(target, verbose=args.verbose)
        code = getattr(outcome.resp, "status_code", None)
        if not outcome.opened:
            return report(False, "-", f"status: {code}")
        return report(
            True, outcome.opened_by or 'front door (already open)', f"status: {code}"
        )

    # Default: the canonical shim-aware ladder.
    from skeleton_key.shims import AllDoorsStuck, open_door

    try:
        door = open_door(
            target,
            use_browser=not args.no_browser,
            allow_shims=not args.no_shims,
            verbose=args.verbose,
        )
    except AllDoorsStuck as stuck:
        return report(False, "-", f"reason: {stuck}")

    code = getattr(door.resp, "status_code", None)
    # Only treat the door as shim-opened when a shim result is actually attached.
    shim = door.shim_result if door.via_shim else None
    if shim is None:
        who = door.label()  # "front door (picks, <backend>)"
    else:
        who = f"shim:{shim.shim}"

    report(True, who, f"status: {code}")
    if shim is not None:
        # Honest fidelity label: source + fidelity class + freshness.
        print(f"fidelity: {shim.label()}")
    return 0
69
+
70
+
71
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments and run the door-opening ladder.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv[1:]``.

    Returns:
        The exit status from ``_run`` (0 opened, non-zero still locked), or 130
        when interrupted with Ctrl-C.

    Raises:
        SystemExit: Raised by argparse itself for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        # The usage line must be a command the user can actually type; this is
        # the invocation documented in the module docstring.
        prog="python -m skeleton_key",
        description="Open a door (URL) with an authentic browser key, honestly.",
    )
    parser.add_argument("url", help="the URL / door to open")
    parser.add_argument(
        "--matrix",
        action="store_true",
        help="use the registry's matrix-only loop (advanced; no rotation or shims)",
    )
    parser.add_argument(
        "--no-shims",
        action="store_true",
        help="disable last-resort alternate-route shims (front door only)",
    )
    parser.add_argument(
        "--no-browser",
        action="store_true",
        help="disable the browser fallback pass",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print ladder progress to stderr"
    )
    args = parser.parse_args(argv)

    try:
        return _run(args)
    except KeyboardInterrupt:  # pragma: no cover
        # 130 = 128 + SIGINT, the conventional shell status for Ctrl-C.
        print("interrupted", file=sys.stderr)
        return 130
102
+
103
+
104
# Script entry point: main()'s integer return value becomes the process exit status.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
skeleton_key/_paths.py ADDED
@@ -0,0 +1,45 @@
1
+ """skeleton_key._paths — cache-path parameterization.
2
+
3
+ The engine persists a little state to disk: a cookie cache (so one solved
4
+ challenge survives across CLI invocations and threads) and a persistent
5
+ browser-profile dir (so a warmed identity sticks across browser fallbacks).
6
+
7
+ Both are resolved through one helper so the package writes nowhere
8
+ surprising and the location stays overridable:
9
+
10
+ cache_dir() -> os.environ["SKELETON_KEY_CACHE"] if set,
11
+ else platformdirs.user_cache_dir("skeleton_key").
12
+
13
+ The directory is created on demand (exist_ok).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+
20
+ import platformdirs
21
+
22
+
23
def cache_dir() -> str:
    """Return the base cache directory for all on-disk state, creating it if needed.

    Honors the ``SKELETON_KEY_CACHE`` environment variable when it is set to a
    non-empty value; otherwise uses the per-user OS cache location from
    ``platformdirs``.

    A leading ``~`` in the override is expanded to the user's home directory.
    Values that come from config files or service units never pass through a
    shell, so without this step ``~/cache`` would create a literal ``~``
    directory under the current working directory.

    Returns:
        The directory path (it exists when this function returns).
    """
    override = os.environ.get("SKELETON_KEY_CACHE")
    if override:
        base = os.path.expanduser(override)
    else:
        base = platformdirs.user_cache_dir("skeleton_key")
    os.makedirs(base, exist_ok=True)
    return base
34
+
35
+
36
def cookie_cache_path() -> str:
    """Return the location of the cookie cache JSON file inside ``cache_dir()``."""
    base = cache_dir()
    return os.path.join(base, "skeleton_cookies.json")
39
+
40
+
41
def browser_profile_dir() -> str:
    """Return the persistent browser profile directory, creating it if it is missing."""
    profile = os.path.join(cache_dir(), "patchright-profile")
    os.makedirs(profile, exist_ok=True)
    return profile
skeleton_key/detect.py ADDED
@@ -0,0 +1,313 @@
1
+ """skeleton_key.detect — the predicate-DSL evaluator + unified tumbler detector.
2
+
3
+ A tumbler's `match:` is a tree of primitive predicates combined by any/all/not. This is
4
+ the SAME idea as Sigma / YARA: a language-neutral signature format with an engine in each
5
+ language. The semantics below ARE the contract — any reimplementation (Rust, Go, JS) that
6
+ evaluates a match-tree the same way is interoperable with the YAML registry verbatim.
7
+
8
+ A match-node is a dict carrying exactly ONE key (an operator or a primitive):
9
+
10
+ Combinators
11
+ all: [node, node, ...] every child must hold (AND)
12
+ any: [node, node, ...] at least one child holds (OR)
13
+ not: node the child must NOT hold (NOT)
14
+
15
+ Primitives (evaluated against a response View — status, headers, body)
16
+ status: [403, 429] status code in this set (int or list)
17
+ status_range: [500, 599] low <= status <= high (inclusive)
18
+ header_present: ["cf-mitigated"] any of these header names present
19
+ header_absent: ["x-foo"] none of these header names present
20
+ header_equals: {server: cloudflare} header value == (case-insensitive)
21
+ header_contains: {set-cookie: "datadome="} header value contains substring (ci)
22
+ body_contains: ["just a moment"] any substring present in body (case-insensitive)
23
+ body_regex: "ray id:\\s*[0-9a-f]" regex search over body (re.I)
24
+ body_len_lt: 1500 len(body bytes) < N
25
+ body_len_gte: 1500 len(body bytes) >= N
26
+ content_truncated: 0.6 body shorter than content-length * ratio
27
+ host_suffix: ["reddit.com"] response URL's host == / endswith any of these
28
+ url_contains: ["/api/"] response URL contains any of these substrings
29
+ native: "fn_name" ESCAPE HATCH -> a registered Python matcher.
30
+ Non-portable; use only for irreducible heuristics.
31
+
32
+ Header names are matched case-insensitively (the View lowercases them). Body text is the
33
+ first 64 KiB decoded utf-8 (replace); body_contains and body_regex are case-insensitive.
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import re
39
+ from collections.abc import Callable
40
+ from urllib.parse import urlparse
41
+
42
+ from .schema import Tumbler, TumblerHit
43
+
44
+ _BODY_CAP = 65536 # first 64 KiB is plenty for any wall's interstitial/markers
45
+
46
+
47
class FormatError(ValueError):
    """A registry/match-tree that does not conform to the format. Raised at load time.

    Subclasses ValueError, so callers that already catch ValueError also catch it.
    In this module it is raised by the static validators (validate_node and its
    helpers) and also by evaluate() for a malformed node, an unknown predicate,
    or an unregistered native matcher.
    """
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # native matcher escape hatch — register a Python predicate by name
53
+ # ---------------------------------------------------------------------------
54
# Name -> predicate table consulted by the `native` primitive.
NATIVE_MATCHERS: dict[str, Callable[[View], bool]] = {}


def native_matcher(name: str):
    """Decorator factory: bind a Python predicate to `match: {native: name}`.

    The decorated function is stored in NATIVE_MATCHERS under `name` (replacing
    any earlier entry with that name) and handed back unchanged.
    """

    def register(predicate: Callable[[View], bool]):
        NATIVE_MATCHERS[name] = predicate
        return predicate

    return register
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # View — a normalized, cheap-to-query snapshot of a Resp
69
+ # ---------------------------------------------------------------------------
70
class View:
    """Normalized response facts the predicates read. Built once per detect() call.

    Attributes:
        status: HTTP status code coerced to int.
        headers: Header mapping with names lowercased; values are kept as given.
        raw_len: Length of the full body in bytes (not capped).
        content_length: Declared Content-Length as an int, or None when the header
            is missing, empty, or not a plain ASCII decimal number.
        url: Response URL ("" when unknown).
        host: Lowercased hostname parsed from `url` ("" when absent or unparseable).
        text / lower: First `_BODY_CAP` bytes of the body decoded as utf-8 with
            replacement, and that same text lowercased.
    """

    __slots__ = ("status", "headers", "raw_len", "_text", "_lower", "content_length", "url", "host")

    def __init__(self, status: int, headers: dict, content: bytes, url: str = ""):
        self.status = int(status)
        # headers arrive lowercased from skeleton_key.transport; normalize defensively anyway.
        self.headers = {str(k).lower(): v for k, v in (headers or {}).items()}
        body = content or b""
        self.raw_len = len(body)
        self._text = body[:_BODY_CAP].decode("utf-8", "replace")
        self._lower = self._text.lower()
        self.content_length = self._parse_content_length(self.headers.get("content-length"))
        self.url = url or ""
        self.host = self._parse_host(self.url)

    @staticmethod
    def _parse_content_length(raw) -> int | None:
        """Parse a Content-Length header value; None when it is not a usable number."""
        if not raw:
            return None
        text = str(raw)
        # str.isdigit() is also True for characters such as "²" that int() rejects,
        # so require ASCII: a hostile header must not raise while building the View.
        if text.isascii() and text.isdigit():
            return int(text)
        return None

    @staticmethod
    def _parse_host(url: str) -> str:
        """Lowercased hostname of `url`, or "" when there is none or the URL is malformed."""
        try:
            return (urlparse(url).hostname or "").lower()
        except ValueError:
            # urlparse raises on e.g. an unbalanced "[" in the netloc; a response
            # with a broken URL should simply have no host rather than abort detection.
            return ""

    @property
    def text(self) -> str:
        """Decoded (capped) body text, original case."""
        return self._text

    @property
    def lower(self) -> str:
        """Decoded (capped) body text, lowercased for case-insensitive matching."""
        return self._lower

    @classmethod
    def of(cls, resp) -> View:
        """Build a View from a response object.

        Reads `resp.status_code` and `resp.content`; `headers` and `url` are
        optional attributes and default to empty.
        """
        return cls(
            resp.status_code,
            dict(getattr(resp, "headers", {}) or {}),
            resp.content or b"",
            getattr(resp, "url", "") or "",
        )
104
+
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # primitive evaluators — each takes (value_from_yaml, View) -> bool
108
+ # ---------------------------------------------------------------------------
109
+ def _as_list(v):
110
+ return v if isinstance(v, (list, tuple)) else [v]
111
+
112
+
113
def _p_status(v, view: View) -> bool:
    """`status`: the response status equals one of the configured codes (int or list)."""
    wanted = {int(code) for code in _as_list(v)}
    return view.status in wanted
115
+
116
+
117
+ def _p_status_range(v, view: View) -> bool:
118
+ lo, hi = int(v[0]), int(v[1])
119
+ return lo <= view.status <= hi
120
+
121
+
122
def _p_header_present(v, view: View) -> bool:
    """`header_present`: at least one of the named headers is on the response."""
    for name in _as_list(v):
        if name.lower() in view.headers:
            return True
    return False
124
+
125
+
126
def _p_header_absent(v, view: View) -> bool:
    """`header_absent`: none of the named headers is on the response."""
    return not any(name.lower() in view.headers for name in _as_list(v))
128
+
129
+
130
+ def _p_header_equals(v, view: View) -> bool:
131
+ return all(
132
+ (view.headers.get(k.lower(), "") or "").lower() == str(val).lower() for k, val in v.items()
133
+ )
134
+
135
+
136
+ def _p_header_contains(v, view: View) -> bool:
137
+ return all(
138
+ str(val).lower() in (view.headers.get(k.lower(), "") or "").lower() for k, val in v.items()
139
+ )
140
+
141
+
142
def _p_body_contains(v, view: View) -> bool:
    """`body_contains`: any of the substrings occurs in the lowercased body text."""
    for needle in _as_list(v):
        if str(needle).lower() in view.lower:
            return True
    return False
144
+
145
+
146
+ _regex_cache: dict[str, re.Pattern] = {}
147
+
148
+
149
+ def _p_body_regex(v, view: View) -> bool:
150
+ pat = _regex_cache.get(v)
151
+ if pat is None:
152
+ pat = _regex_cache[v] = re.compile(v, re.I)
153
+ return bool(pat.search(view.text))
154
+
155
+
156
+ def _p_body_len_lt(v, view: View) -> bool:
157
+ return view.raw_len < int(v)
158
+
159
+
160
+ def _p_body_len_gte(v, view: View) -> bool:
161
+ return view.raw_len >= int(v)
162
+
163
+
164
+ def _p_content_truncated(v, view: View) -> bool:
165
+ return view.content_length is not None and view.raw_len < view.content_length * float(v)
166
+
167
+
168
def _p_host_suffix(v, view: View) -> bool:
    """`host_suffix`: the host equals a listed domain or is a subdomain of one."""
    host = view.host
    for suffix in _as_list(v):
        domain = suffix.lower()
        # The "." prefix keeps "notreddit.com" from matching "reddit.com".
        if host == domain or host.endswith("." + domain):
            return True
    return False
170
+
171
+
172
def _p_url_contains(v, view: View) -> bool:
    """`url_contains`: any listed substring occurs in the response URL, ignoring case."""
    for fragment in _as_list(v):
        if str(fragment).lower() in view.url.lower():
            return True
    return False
174
+
175
+
176
def _p_native(v, view: View) -> bool:
    """`native`: delegate to the Python predicate registered under this name.

    Raises:
        FormatError: no matcher has been registered under `v`.
    """
    matcher = NATIVE_MATCHERS.get(v)
    if matcher is not None:
        return bool(matcher(view))
    raise FormatError(f"native matcher {v!r} is not registered")
181
+
182
+
183
# Dispatch table: primitive name (the single key of a match-node) -> evaluator.
# Every evaluator is called as fn(value_from_yaml, view) and returns a bool.
_PRIMITIVES: dict[str, Callable] = {
    "status": _p_status,
    "status_range": _p_status_range,
    "header_present": _p_header_present,
    "header_absent": _p_header_absent,
    "header_equals": _p_header_equals,
    "header_contains": _p_header_contains,
    "body_contains": _p_body_contains,
    "body_regex": _p_body_regex,
    "body_len_lt": _p_body_len_lt,
    "body_len_gte": _p_body_len_gte,
    "content_truncated": _p_content_truncated,
    "host_suffix": _p_host_suffix,
    "url_contains": _p_url_contains,
    "native": _p_native,
}
# Boolean operators; evaluate() and validate_node() handle these inline rather
# than through the table above.
_COMBINATORS = {"all", "any", "not"}
# Every key a match-node may carry; also used in "unknown predicate" error messages.
KNOWN_OPS = set(_PRIMITIVES) | _COMBINATORS
201
+
202
+
203
def evaluate(node, view: View) -> bool:
    """Evaluate one match-node against a View.

    Args:
        node: A dict with exactly one key: a combinator (`all`, `any`, `not`) or
            a primitive name from `_PRIMITIVES`.
        view: The normalized response the primitives read.

    Returns:
        True when the node holds for `view`. `all` and `any` short-circuit.

    Raises:
        FormatError: `node` is not a single-key dict, `all`/`any` does not carry
            a list of nodes, or the key is not a known operator.
    """
    if not isinstance(node, dict) or len(node) != 1:
        raise FormatError(f"match-node must be a dict with exactly one key, got: {node!r}")
    op, val = next(iter(node.items()))
    if op in ("all", "any"):
        # Without this guard a scalar such as `all: 5` escapes as a bare TypeError
        # from iteration instead of the FormatError this function promises.
        if not isinstance(val, (list, tuple)):
            raise FormatError(f"{op!r} needs a list of nodes, got: {val!r}")
        combine = all if op == "all" else any
        return combine(evaluate(child, view) for child in val)
    if op == "not":
        return not evaluate(val, view)
    fn = _PRIMITIVES.get(op)
    if fn is None:
        raise FormatError(f"unknown predicate {op!r} (known: {sorted(KNOWN_OPS)})")
    return fn(val, view)
218
+
219
+
220
def _check(cond, msg) -> None:
    """Raise FormatError(msg) unless `cond` is truthy."""
    if cond:
        return
    raise FormatError(msg)
223
+
224
+
225
+ def _str_or_strlist(v) -> bool:
226
+ return isinstance(v, str) or (
227
+ isinstance(v, (list, tuple)) and bool(v) and all(isinstance(x, str) for x in v)
228
+ )
229
+
230
+
231
def _is_plain_int(x) -> bool:
    """True for real ints only: bool subclasses int, and a YAML `true` must not pass as 1."""
    return isinstance(x, int) and not isinstance(x, bool)


def _validate_primitive(op, val) -> None:
    """Type-check a primitive's value SHAPE so a malformed rule fails at LOAD, not mid-detect.

    Args:
        op: Primitive name. A name with no branch below is accepted without any
            check; the caller only passes names found in `_PRIMITIVES`.
        val: The YAML value attached to that primitive.

    Raises:
        FormatError: `val` does not have the shape `op` expects.

    Booleans are rejected wherever a number is expected: the status primitives
    follow the same rule as body_len_* and content_truncated. For `native` only
    the name's shape is checked here; whether a matcher is registered under it
    is checked when the rule is evaluated.
    """
    if op == "status":
        _check(
            _is_plain_int(val)
            or (isinstance(val, (list, tuple)) and bool(val) and all(_is_plain_int(x) for x in val)),
            f"{op!r} needs an int or non-empty list of ints, got {val!r}",
        )
    elif op == "status_range":
        _check(
            isinstance(val, (list, tuple))
            and len(val) == 2
            and all(_is_plain_int(x) for x in val),
            f"status_range needs [low, high] ints, got {val!r}",
        )
    elif op in ("header_present", "header_absent", "body_contains", "host_suffix", "url_contains"):
        _check(
            _str_or_strlist(val), f"{op!r} needs a string or non-empty list of strings, got {val!r}"
        )
    elif op in ("header_equals", "header_contains"):
        _check(
            isinstance(val, dict)
            and val
            and all(isinstance(k, str) and isinstance(v, str) for k, v in val.items()),
            f"{op!r} needs a non-empty string->string mapping, got {val!r}",
        )
    elif op in ("body_regex", "native"):
        _check(isinstance(val, str) and val, f"{op!r} needs a non-empty string, got {val!r}")
        if op == "body_regex":
            # Compile now so a broken pattern surfaces at load with a clear message.
            try:
                re.compile(val)
            except re.error as e:
                raise FormatError(f"body_regex is not a valid regex ({e}): {val!r}") from e
    elif op in ("body_len_lt", "body_len_gte"):
        _check(_is_plain_int(val), f"{op!r} needs an int, got {val!r}")
    elif op == "content_truncated":
        _check(
            isinstance(val, (int, float)) and not isinstance(val, bool) and 0 < float(val) <= 1,
            f"content_truncated needs a number in (0, 1], got {val!r}",
        )
273
+
274
+
275
def validate_node(node) -> None:
    """Statically check a match-tree; no View is required.

    Walks the tree recursively, verifying both its structure and the value shape
    of every primitive, so a malformed registry raises FormatError at load time
    instead of crashing at detect time.
    """
    if not (isinstance(node, dict) and len(node) == 1):
        raise FormatError(f"match-node must be a dict with exactly one key, got: {node!r}")
    ((op, val),) = node.items()

    if op == "not":
        validate_node(val)
        return

    if op in ("all", "any"):
        if not (isinstance(val, list) and val):
            raise FormatError(f"{op!r} needs a non-empty list of nodes")
        for child in val:
            validate_node(child)
        return

    if op in _PRIMITIVES:
        _validate_primitive(op, val)
        return

    raise FormatError(f"unknown predicate {op!r} (known: {sorted(KNOWN_OPS)})")
292
+
293
+
294
+ # ---------------------------------------------------------------------------
295
+ # detection — run tumblers (severity-ordered) against a response
296
+ # ---------------------------------------------------------------------------
297
def detect(resp, tumblers: list[Tumbler]) -> TumblerHit | None:
    """Return a TumblerHit for the first tumbler that fires on `resp`, or None if open.

    `tumblers` is expected to arrive severity-ordered (the engine sorts at load),
    so "first match" means "highest severity": a specific wall such as a
    Cloudflare challenge 403 wins over a generic bare-403 reputation rule when
    both would match. The View is built once and shared by every rule.
    """
    view = View.of(resp)
    fired = next((t for t in tumblers if evaluate(t.match, view)), None)
    if fired is None:
        return None
    return TumblerHit(tumbler=fired, reason=f"{fired.category.value}:{fired.id}")
308
+
309
+
310
def is_blocked(resp, tumblers: list[Tumbler]) -> tuple[bool, str]:
    """Registry-driven stand-in for the legacy is_blocked() API.

    Returns (True, <hit reason>) when a tumbler fires, otherwise (False, "ok").
    """
    hit = detect(resp, tumblers)
    if hit:
        return True, hit.reason
    return False, "ok"