@gajae-code/coding-agent 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/dist/types/cli/notify-cli.d.ts +2 -0
- package/dist/types/config/settings-schema.d.ts +39 -2
- package/dist/types/extensibility/shared-events.d.ts +1 -0
- package/dist/types/gjc-runtime/ralplan-runtime.d.ts +1 -1
- package/dist/types/lsp/types.d.ts +2 -0
- package/dist/types/notifications/attachment-registry.d.ts +17 -0
- package/dist/types/notifications/chat-adapters.d.ts +9 -0
- package/dist/types/notifications/config.d.ts +9 -1
- package/dist/types/notifications/engine.d.ts +59 -0
- package/dist/types/notifications/managed-daemon.d.ts +48 -0
- package/dist/types/notifications/telegram-daemon.d.ts +19 -0
- package/dist/types/notifications/threaded-inbound.d.ts +19 -0
- package/dist/types/notifications/threaded-render.d.ts +6 -1
- package/dist/types/session/agent-session.d.ts +2 -0
- package/dist/types/tools/fetch.d.ts +23 -0
- package/dist/types/tools/index.d.ts +1 -0
- package/dist/types/tools/telegram-send.d.ts +32 -0
- package/dist/types/web/insane/bridge.d.ts +103 -0
- package/dist/types/web/insane/url-guard.d.ts +22 -0
- package/dist/types/web/search/provider.d.ts +18 -1
- package/dist/types/web/search/providers/insane.d.ts +53 -0
- package/dist/types/web/search/providers/text-citations.d.ts +23 -0
- package/dist/types/web/search/types.d.ts +12 -4
- package/package.json +10 -8
- package/scripts/verify-insane-vendor.ts +132 -0
- package/src/cli/args.ts +1 -1
- package/src/cli/fast-help.ts +1 -1
- package/src/cli/notify-cli.ts +152 -5
- package/src/commands/team.ts +1 -1
- package/src/config/settings-schema.ts +30 -1
- package/src/defaults/gjc/skills/ralplan/SKILL.md +11 -4
- package/src/extensibility/shared-events.ts +1 -0
- package/src/gjc-runtime/launch-tmux.ts +17 -3
- package/src/gjc-runtime/ledger-event-renderer.ts +1 -0
- package/src/gjc-runtime/ralplan-runtime.ts +2 -2
- package/src/gjc-runtime/workflow-manifest.generated.json +29 -0
- package/src/gjc-runtime/workflow-manifest.ts +7 -2
- package/src/internal-urls/docs-index.generated.ts +7 -7
- package/src/lsp/config.ts +16 -3
- package/src/lsp/defaults.json +7 -0
- package/src/lsp/types.ts +2 -0
- package/src/modes/controllers/event-controller.ts +15 -0
- package/src/modes/interactive-mode.ts +46 -2
- package/src/modes/utils/context-usage.ts +2 -2
- package/src/notifications/attachment-registry.ts +23 -0
- package/src/notifications/chat-adapters.ts +147 -0
- package/src/notifications/config.ts +23 -2
- package/src/notifications/engine.ts +100 -0
- package/src/notifications/index.ts +180 -38
- package/src/notifications/managed-daemon.ts +163 -0
- package/src/notifications/telegram-daemon.ts +235 -14
- package/src/notifications/threaded-inbound.ts +60 -4
- package/src/notifications/threaded-render.ts +20 -2
- package/src/session/agent-session.ts +82 -51
- package/src/tools/fetch.ts +78 -1
- package/src/tools/index.ts +3 -0
- package/src/tools/telegram-send.ts +137 -0
- package/src/web/insane/bridge.ts +350 -0
- package/src/web/insane/url-guard.ts +155 -0
- package/src/web/search/provider.ts +77 -18
- package/src/web/search/providers/anthropic.ts +70 -3
- package/src/web/search/providers/codex.ts +1 -119
- package/src/web/search/providers/gemini.ts +99 -0
- package/src/web/search/providers/insane.ts +551 -0
- package/src/web/search/providers/openai-compatible.ts +66 -32
- package/src/web/search/providers/text-citations.ts +111 -0
- package/src/web/search/types.ts +13 -2
- package/vendor/insane-search/LICENSE +21 -0
- package/vendor/insane-search/MANIFEST.json +24 -0
- package/vendor/insane-search/engine/__init__.py +23 -0
- package/vendor/insane-search/engine/__main__.py +128 -0
- package/vendor/insane-search/engine/bias_check.py +183 -0
- package/vendor/insane-search/engine/executor.py +254 -0
- package/vendor/insane-search/engine/fetch_chain.py +725 -0
- package/vendor/insane-search/engine/learning.py +175 -0
- package/vendor/insane-search/engine/phase0.py +214 -0
- package/vendor/insane-search/engine/safety.py +91 -0
- package/vendor/insane-search/engine/templates/package.json +11 -0
- package/vendor/insane-search/engine/templates/playwright_mobile_chrome.js +188 -0
- package/vendor/insane-search/engine/templates/playwright_real_chrome.js +243 -0
- package/vendor/insane-search/engine/tests/test_hardening.py +57 -0
- package/vendor/insane-search/engine/tests/test_smoke.py +152 -0
- package/vendor/insane-search/engine/tests/test_u1.py +200 -0
- package/vendor/insane-search/engine/tests/test_u4.py +131 -0
- package/vendor/insane-search/engine/tests/test_u5.py +163 -0
- package/vendor/insane-search/engine/tests/test_u7.py +124 -0
- package/vendor/insane-search/engine/transport.py +211 -0
- package/vendor/insane-search/engine/url_transforms.py +98 -0
- package/vendor/insane-search/engine/validators.py +331 -0
- package/vendor/insane-search/engine/waf_detector.py +214 -0
- package/vendor/insane-search/engine/waf_profiles.yaml +162 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Generic URL transforms for the fetch grid.
|
|
2
|
+
|
|
3
|
+
Transforms are domain-agnostic *rules*. They never reference a specific
|
|
4
|
+
site by name. A transform either applies (returns a new URL) or is skipped
|
|
5
|
+
(returns None). Callers iterate transforms in order.
|
|
6
|
+
|
|
7
|
+
Empirically useful transforms (see observations/):
|
|
8
|
+
* mobile_subdomain — `www.example.com` → `m.example.com`
|
|
9
|
+
Strong win on SSR sites with mobile-first serving. Loss on SPA shells
|
|
10
|
+
(some mobile sites return tiny bootstrap HTML).
|
|
11
|
+
* am_prefix — `example.com` (no www) → `m.example.com`
|
|
12
|
+
* drop_www — occasionally unblocks hosts that gate www but not apex.
|
|
13
|
+
|
|
14
|
+
Adding new transforms: prove they help on ≥2 unrelated sites first
|
|
15
|
+
(cross-site validation — bias check).
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import Callable, Optional
|
|
20
|
+
from urllib.parse import urlsplit, urlunsplit
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _replace_host(url: str, new_host: str) -> str:
    """Return *url* with its network location replaced by *new_host*.

    *new_host* is used verbatim as the netloc, so a port must already be part
    of it. Userinfo (``user:pass@``) present in *url* is not carried over.
    """
    parts = urlsplit(url)
    return urlunsplit(parts._replace(netloc=new_host))


def _host_and_port(url: str) -> tuple[str, Optional[int]]:
    """Return ``(hostname, port)`` for *url*, or ``("", None)`` if unusable.

    ``urlsplit`` lower-cases the hostname. A malformed URL, or a port that is
    not a valid integer, raises ``ValueError`` inside ``urllib``; that is
    reported as "no host" so the calling transform is skipped instead of
    crashing the whole transform loop.
    """
    try:
        parts = urlsplit(url)
        return parts.hostname or "", parts.port
    except ValueError:
        return "", None


def _rehost(url: str, hostname: str, port: Optional[int]) -> str:
    """Swap the host of *url* for *hostname*, keeping an explicit *port*."""
    netloc = hostname if port is None else f"{hostname}:{port}"
    return _replace_host(url, netloc)


def _original(url: str) -> Optional[str]:
    """Identity transform: the URL exactly as given."""
    return url


def _mobile_subdomain(url: str) -> Optional[str]:
    """`https://www.example.com/a` → `https://m.example.com/a`.

    Applies only when the host starts with ``www.``; otherwise returns None.
    An explicit port is preserved.
    """
    host, port = _host_and_port(url)
    if not host.startswith("www."):
        return None
    return _rehost(url, "m." + host[len("www."):], port)


def _am_prefix(url: str) -> Optional[str]:
    """`https://example.com/a` → `https://m.example.com/a`.

    Applies only to apex-like hosts (at most two dot-separated labels).
    Hosts already starting with ``m.`` are skipped, and ``www.`` hosts are
    left to ``mobile_subdomain``. An explicit port is preserved.
    """
    host, port = _host_and_port(url)
    if not host or host.startswith(("m.", "www.")):
        return None
    # IPv6 literals (hostname has the brackets stripped) cannot take a label.
    if ":" in host:
        return None
    # Three or more labels means the host already has a subdomain.
    if host.count(".") >= 2:
        return None
    return _rehost(url, "m." + host, port)


def _drop_www(url: str) -> Optional[str]:
    """`https://www.example.com/a` → `https://example.com/a`.

    Returns None when the host does not start with ``www.``. An explicit
    port is preserved.
    """
    host, port = _host_and_port(url)
    if not host.startswith("www."):
        return None
    return _rehost(url, host[len("www."):], port)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Registry of transform name → callable. Each callable takes a URL and
# returns the rewritten URL, or None when the transform does not apply.
TRANSFORMS: dict[str, Callable[[str], Optional[str]]] = dict(
    original=_original,
    mobile_subdomain=_mobile_subdomain,
    am_prefix=_am_prefix,
    drop_www=_drop_www,
)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def apply_transform(name: str, url: str) -> Optional[str]:
    """Run the transform registered under *name* on *url*.

    Returns the transformed URL, or None when the transform is skipped.
    Raises ValueError when *name* is not a key of ``TRANSFORMS``.
    """
    transform = TRANSFORMS.get(name)
    if transform is not None:
        return transform(url)
    raise ValueError(f"Unknown transform: {name!r}. Known: {list(TRANSFORMS)}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def iter_transformed(url: str, order: list[str]) -> list[tuple[str, str]]:
    """Return (transform_name, transformed_url) pairs for the given order.

    Transforms that return None (not applicable) are skipped. When two
    transforms produce the same URL only the first one, in *order*, is kept,
    so the same URL is never fetched twice.
    """
    # Insertion-ordered map: transformed URL → name of the first producer.
    first_producer: dict[str, str] = {}
    for name in order:
        candidate = apply_transform(name, url)
        if candidate is None or candidate in first_producer:
            continue
        first_producer[candidate] = name
    return [(name, candidate) for candidate, name in first_producer.items()]
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
"""Generic challenge / success validator (v2).
|
|
2
|
+
|
|
3
|
+
Layers (all generic, never site-specific):
|
|
4
|
+
1. HTTP status semantics (rate-limit / auth / not-found / transient / blocked)
|
|
5
|
+
2. HARD challenge markers (structural WAF containers — always decisive)
|
|
6
|
+
3. Size fingerprints (known bad BYTE sizes hinted by caller)
|
|
7
|
+
4. Content-Type / JSON awareness (small JSON APIs are NOT challenges)
|
|
8
|
+
5. Caller success_selectors (strongest positive proof for HTML)
|
|
9
|
+
6. SOFT markers + cookie sensor + tiny-body heuristics (only when no
|
|
10
|
+
positive proof is available)
|
|
11
|
+
|
|
12
|
+
v2 changes vs v1 (per multi-AI review 2026-06-21):
|
|
13
|
+
* `WEAK_OK` is reserved for genuinely clean responses. Ambiguous states
|
|
14
|
+
(`_abck` unresolved, soft-block words without proof) now return the new
|
|
15
|
+
non-terminal `SUSPECT_OK` so the fetch chain keeps searching instead of
|
|
16
|
+
declaring a blocked page a success.
|
|
17
|
+
* Small valid JSON (e.g. an internal API) is no longer mislabelled
|
|
18
|
+
`CHALLENGE` — unblocks the R7 API-first route.
|
|
19
|
+
* SOFT markers (e.g. the word "captcha" buried in a script) no longer
|
|
20
|
+
override a matched success_selector.
|
|
21
|
+
* Size compared in BYTES, not unicode char count.
|
|
22
|
+
* Status codes differentiated (429/401/404/5xx) instead of one BLOCKED.
|
|
23
|
+
"""
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import json
|
|
27
|
+
import re
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from enum import Enum
|
|
30
|
+
from typing import Optional
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
from bs4 import BeautifulSoup
|
|
34
|
+
except ImportError: # bs4 is a soft dep: only used when selectors given
|
|
35
|
+
BeautifulSoup = None # type: ignore
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# HARD markers: structural challenge/block containers emitted by WAF products.
# A single hit is treated as decisive because these strings are not expected
# in legitimate page content.
HARD_CHALLENGE_MARKERS: list[str] = [
    "sec-if-cpt-container",
    "Powered and protected by Akamai",
    "Just a moment...",
    "cf-chl-bypass",
    "Attention Required! | Cloudflare",
    "<title>Bot Challenge</title>",
    "The requested URL was rejected",
    "Request unsuccessful. Incapsula",
    "Please enable JS and disable any ad blocker",
]

# SOFT markers: words that strongly suggest a challenge but can also occur in
# real content (scripts, articles about bots, ...). They only count when the
# caller supplied no positive proof (success_selectors) that overrides them.
SOFT_CHALLENGE_MARKERS: list[str] = [
    "access denied",
    "checking your browser",
    "datadome",
    "captcha",
]

# Backward-compatible export (some callers import CHALLENGE_MARKERS).
CHALLENGE_MARKERS: list[str] = [*HARD_CHALLENGE_MARKERS, *SOFT_CHALLENGE_MARKERS]

# Body size in BYTES below which a stub / challenge page is suspected.
SMALL_BODY_THRESHOLD = 3000
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Verdict(Enum):
    """Every classification a fetched response can receive."""

    # Terminal successes.
    STRONG_OK = "strong_ok"  # positive proof present
    WEAK_OK = "weak_ok"  # clean, no negative signal

    # Ambiguous (abck unresolved / soft) → NON-terminal.
    SUSPECT_OK = "suspect_ok"

    # Failures.
    CHALLENGE = "challenge"  # WAF challenge (negative proof)
    BLOCKED = "blocked"  # generic non-2xx block
    RATE_LIMITED = "rate_limited"  # 429 — back off, do not hammer
    AUTH_REQUIRED = "auth_required"  # 401/407 — retrying TLS won't help
    NOT_FOUND = "not_found"  # 404/410
    UNKNOWN = "unknown"  # exception / dependency missing


# Verdicts that mean "stop the grid — more TLS attempts cannot help".
TERMINAL_NONSUCCESS = frozenset(
    (Verdict.AUTH_REQUIRED, Verdict.NOT_FOUND, Verdict.RATE_LIMITED)
)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class ValidationResult:
    """Outcome of classifying one fetched response."""

    # Final classification.
    verdict: Verdict
    # Short tags explaining how the verdict was reached.
    reasons: list[str] = field(default_factory=list)
    # Caller-supplied selectors that matched the body.
    matched_selectors: list[str] = field(default_factory=list)
    body_size: int = 0  # bytes
    status: int = 0

    @property
    def ok(self) -> bool:
        """True only for terminal success; SUSPECT_OK deliberately is not."""
        return self.verdict == Verdict.STRONG_OK or self.verdict == Verdict.WEAK_OK

    def to_dict(self) -> dict:
        """Return a plain-dict view with the verdict as its string value."""
        return dict(
            verdict=self.verdict.value,
            reasons=self.reasons,
            matched_selectors=self.matched_selectors,
            body_size=self.body_size,
            status=self.status,
        )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _hard_marker_hits(body_lower: str) -> list[str]:
    """Return the HARD markers (original spelling) found in *body_lower*.

    *body_lower* must already be lower-cased; each marker is lower-cased
    before the substring test.
    """
    found: list[str] = []
    for marker in HARD_CHALLENGE_MARKERS:
        if marker.lower() in body_lower:
            found.append(marker)
    return found
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _soft_marker_hits(body_lower: str) -> list[str]:
    """Return the SOFT markers that occur in the lower-cased body.

    Markers are compared as written (they are stored lower-case).
    """
    found: list[str] = []
    for marker in SOFT_CHALLENGE_MARKERS:
        if marker in body_lower:
            found.append(marker)
    return found
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _abck_unresolved(cookies: dict) -> bool:
    """True when an `_abck` cookie is present and contains the `~-1~` token."""
    sensor = cookies.get("_abck", "")
    if not sensor:
        return False
    return "~-1~" in sensor
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _content_type(resp) -> str:
    """Return the lower-cased Content-Type header of *resp*, or "".

    Header names are matched case-insensitively. Any failure while reading
    the headers yields an empty string.
    """
    try:
        raw_headers = getattr(resp, "headers", {}) or {}
        normalized = {}
        for key, value in dict(raw_headers).items():
            normalized[key.lower()] = value
        return str(normalized.get("content-type", "")).lower()
    except Exception:
        return ""
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _looks_like_json(text: str, ctype: str) -> bool:
    """Cheap JSON sniff: a JSON content type, or a body opening with { or [."""
    if "json" in ctype:
        return True
    return text.lstrip().startswith(("{", "["))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _json_ok(text: str) -> Optional[bool]:
    """Tri-state JSON check.

    Returns None when *text* cannot be parsed, False when it parses to an
    empty value (null, {}, [] or ""), and True for any other parsed value.
    """
    try:
        parsed = json.loads(text)
    except Exception:
        return None
    else:
        return parsed not in (None, {}, [], "")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _byte_size(resp, text: str) -> int:
    """Return the body size in bytes.

    Uses the length of ``resp.content`` when it is bytes-like; otherwise the
    length of *text* encoded as UTF-8 (unencodable characters are ignored).
    """
    raw = getattr(resp, "content", None)
    if not isinstance(raw, (bytes, bytearray)):
        raw = text.encode("utf-8", "ignore")
    return len(raw)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _looks_complete_content_page(text: str, lowered: str) -> bool:
    """Decide whether a SMALL body is a genuine short page.

    The body must close its document (`</html>` or `</body>` in *lowered*)
    and, once script/style blocks and all tags are stripped, keep at least 64
    characters of visible text. Script-only, empty or truncated bodies fail.
    """
    closes_document = "</html>" in lowered or "</body>" in lowered
    if not closes_document:
        return False
    # Drop whole <script>/<style> elements, then every remaining tag.
    without_code = re.sub(r"(?is)<(script|style)[^>]*>.*?</\1>", " ", text)
    without_tags = re.sub(r"(?s)<[^>]+>", " ", without_code)
    visible_text = re.sub(r"\s+", " ", without_tags).strip()
    return len(visible_text) >= 64
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _selector_hits(body: str, selectors: list[str]) -> Optional[list[str]]:
    """Return the selectors that match *body*, or None if BS4 is unavailable.

    A body that cannot be parsed yields an empty list; a selector that raises
    while being evaluated is skipped.
    """
    if BeautifulSoup is None:
        return None
    try:
        document = BeautifulSoup(body, "html.parser")
    except Exception:
        return []
    matched: list[str] = []
    for selector in selectors:
        try:
            found = document.select(selector)
        except Exception:
            continue
        if found:
            matched.append(selector)
    return matched
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def validate(
    resp,
    *,
    success_selectors: Optional[list[str]] = None,
    known_bad_sizes: Optional[list[int]] = None,
    size_tolerance: int = 20,
) -> ValidationResult:
    """Validate a `curl_cffi` / `requests` response (v2).

    Layers run in order and the first decisive one wins: status semantics,
    HARD markers, size fingerprints, JSON awareness, caller selectors, then
    soft-marker / tiny-body / sensor-cookie heuristics.

    A 4xx status that is not handled by layer 1 (403, 406, ...) falls through
    to body analysis so a WAF challenge body can be recognised. If the body
    turns out clean, the result is the non-terminal ``BLOCKED`` rather than a
    terminal ``WEAK_OK``: an error status without positive proof is not a
    success. A matched success selector still yields ``STRONG_OK``.

    Args:
        resp: Response-like object; ``status_code``, ``text``, ``content``,
            ``headers`` and ``cookies`` are read when present.
        success_selectors: CSS selectors; any match is positive proof.
        known_bad_sizes: Body sizes (bytes) known to be challenge pages.
        size_tolerance: Allowed byte distance from a known bad size.

    Returns:
        A ``ValidationResult`` carrying verdict, reasons, matched selectors,
        body size in bytes and the HTTP status.
    """
    try:
        status = int(getattr(resp, "status_code", 0) or 0)
        text = getattr(resp, "text", "") or ""
        size = _byte_size(resp, text)
    except Exception as e:
        return ValidationResult(verdict=Verdict.UNKNOWN, reasons=[f"parse_error:{e}"])

    r = ValidationResult(verdict=Verdict.UNKNOWN, body_size=size, status=status)

    def _finish(verdict, *reasons: str) -> ValidationResult:
        """Record the verdict plus reasons on the shared result."""
        r.verdict = verdict
        r.reasons.extend(reasons)
        return r

    def _clean(*reasons: str) -> ValidationResult:
        """Body shows no negative signal: success unless the status is 4xx."""
        if status >= 400:
            return _finish(Verdict.BLOCKED, f"status={status}", *reasons)
        return _finish(Verdict.WEAK_OK, *reasons)

    # --- Layer 1: status semantics ----------------------------------------
    if status == 429:
        return _finish(Verdict.RATE_LIMITED, "status=429")
    if status in (401, 407):
        return _finish(Verdict.AUTH_REQUIRED, f"status={status}")
    if status in (404, 410):
        return _finish(Verdict.NOT_FOUND, f"status={status}")
    if 500 <= status <= 599:
        return _finish(Verdict.BLOCKED, f"status={status}")
    if status == 0:
        return _finish(Verdict.UNKNOWN, "status=0")
    # 403/406/etc fall through to marker analysis (often a WAF challenge body).

    lowered = text.lower()

    # --- Layer 2: HARD markers (decisive) ---------------------------------
    hard = _hard_marker_hits(lowered)
    if hard:
        return _finish(Verdict.CHALLENGE, *(f"hard:{m}" for m in hard[:3]))

    # --- Layer 3: size fingerprint (bytes, tolerant) ----------------------
    for bad in known_bad_sizes or ():
        if abs(size - bad) <= size_tolerance:
            return _finish(Verdict.CHALLENGE, f"size_fp:{size}~{bad}")

    # --- Layer 4: JSON awareness (before tiny-body heuristic) -------------
    ctype = _content_type(resp)
    if _looks_like_json(text, ctype):
        j = _json_ok(text)
        if j is True:
            # Non-empty parseable JSON is a successful API hit even if tiny,
            # but only on a non-error status: a 4xx JSON error body must not
            # be reported as success. CSS selectors don't apply to JSON, so
            # WEAK_OK is the ceiling here.
            return _clean("json_ok")
        if j is False:
            return _finish(Verdict.SUSPECT_OK, "json_empty")
        # j is None → not actually JSON; fall through to HTML handling.

    cookies = _extract_cookies(resp)
    abck_bad = _abck_unresolved(cookies)

    # --- Layer 5: caller positive proof (HTML) ----------------------------
    if success_selectors:
        hits = _selector_hits(text, success_selectors)
        if hits is None:
            return _finish(Verdict.UNKNOWN, "bs4_missing")
        if not hits:
            # Selectors requested but none matched → challenge.
            return _finish(Verdict.CHALLENGE, "no_success_selector")
        r.matched_selectors = hits
        # Selector matched → soft markers are ignored (they were likely in a
        # script or unrelated text). But an unresolved sensor cookie still
        # demotes us to NON-terminal SUSPECT_OK.
        if abck_bad:
            return _finish(Verdict.SUSPECT_OK, "abck_unresolved")
        return _finish(Verdict.STRONG_OK)

    # --- Layer 6: no positive proof — heuristics --------------------------
    # NOTE(review): the module docstring says soft-block words without proof
    # return SUSPECT_OK, but this path has always returned CHALLENGE. Both are
    # non-success; behaviour is left unchanged — confirm which is intended.
    soft = _soft_marker_hits(lowered)
    if soft:
        return _finish(Verdict.CHALLENGE, *(f"soft:{m}" for m in soft[:3]))

    if size < SMALL_BODY_THRESHOLD:
        # A small body is only weak evidence of a challenge stub. A COMPLETE,
        # content-bearing HTML document that just happens to be short is a
        # real page. Only an incomplete / script-only / empty small body
        # stays suspicious.
        if _looks_complete_content_page(text, lowered):
            return _clean(f"small_but_complete:{size}")
        return _finish(Verdict.CHALLENGE, f"tiny_body:{size}")

    if abck_bad:
        # Unresolved Akamai sensor with no positive proof: do NOT declare
        # success. Non-terminal so the chain keeps trying.
        return _finish(Verdict.SUSPECT_OK, "abck_unresolved")

    # Clean, sizeable, no negative signal, no sensor problem.
    return _clean()
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _extract_cookies(resp) -> dict:
    """Return the response cookies as a plain ``{name: value}`` dict.

    First tries the ``resp.cookies.jar`` iterable of cookie objects, then
    falls back to ``dict(resp.cookies)``. Returns {} when neither works.
    """
    try:
        jar = resp.cookies.jar
        return {cookie.name: cookie.value for cookie in jar}
    except Exception:
        pass
    try:
        if hasattr(resp, "cookies"):
            return dict(resp.cookies)
    except Exception:
        pass
    return {}
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""WAF-product detection from a live response.
|
|
2
|
+
|
|
3
|
+
Returns a *ranking* of (profile_id, confidence) pairs — never a single verdict.
|
|
4
|
+
Single-answer detectors cause cascading wrong plans when misfiring (Codex's
|
|
5
|
+
critique). Planner consumes the ranking and tries top candidates in order.
|
|
6
|
+
|
|
7
|
+
All detectors operate on WAF-vendor artifacts (cookies / headers / body
|
|
8
|
+
strings) — never site hostnames. See engine/waf_profiles.yaml for the
|
|
9
|
+
profile definitions.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import fnmatch
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
try:
|
|
20
|
+
import yaml # PyYAML
|
|
21
|
+
except ImportError:
|
|
22
|
+
yaml = None # type: ignore
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# The profile definitions live next to this module.
PROFILES_PATH = os.path.join(os.path.dirname(__file__), "waf_profiles.yaml")


# In-code safety net — used when waf_profiles.yaml is missing / invalid or
# PyYAML isn't installed, so fetch() keeps working in a degraded-but-sane
# mode. Must stay site-agnostic (No-Site-Name Rule).
_DEFAULT_PROFILES: dict = {
    "unknown_challenge": dict(
        detectors={},
        confidence_rules=dict(strong=0, weak=0),
        capabilities_needed=["needs_js_exec"],
        tls_impersonate_candidates=[
            ["safari", "chrome", "firefox"],
            ["safari_ios", "chrome_android"],
        ],
        referer_strategies=["self_root", "google_search", "none"],
        url_transform_order=["original", "mobile_subdomain"],
        fallback_when_challenge=["playwright_mcp", "playwright_real_chrome"],
        notes="in-code default — waf_profiles.yaml unavailable",
    ),
}


# Module-level sticky error. Readers call `last_load_error()` after each
# `_load_profiles()` call to surface YAML problems in FetchResult.trace.
_LAST_LOAD_ERROR: Optional[str] = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class DetectionHit:
    """One candidate WAF profile together with the evidence behind it."""

    # Key of the matching profile.
    profile_id: str
    # Score assigned to the match.
    confidence: float
    # Tags naming the artifacts that matched.
    signals: list[str]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def last_load_error() -> Optional[str]:
    """Report the error recorded by the latest profile load.

    Returns None when that load was clean.
    """
    return _LAST_LOAD_ERROR
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _load_profiles(path: str = PROFILES_PATH) -> dict:
    """Load profiles with graceful fallback.

    Never raises. On any failure (PyYAML missing, file missing, parse error,
    unexpected shape) it returns an independent deep copy of
    `_DEFAULT_PROFILES` and stores the reason in `_LAST_LOAD_ERROR` for the
    caller to surface. A clean load resets `_LAST_LOAD_ERROR` to None.

    Args:
        path: Location of the YAML profile file.

    Returns:
        The parsed profile mapping, or the in-code default on failure.
    """
    import copy  # local: only needed on the fallback path

    global _LAST_LOAD_ERROR
    _LAST_LOAD_ERROR = None

    error: Optional[str] = None
    loaded = None
    if yaml is None:
        error = "PyYAML not installed — using in-code default profile"
    else:
        try:
            with open(path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f) or {}
        except FileNotFoundError:
            error = f"waf_profiles.yaml not found at {path}"
        except yaml.YAMLError as e:
            error = f"YAML parse error: {type(e).__name__}: {str(e)[:200]}"
        except Exception as e:
            error = f"profile loader: {type(e).__name__}: {str(e)[:200]}"
        else:
            # A usable profile has a non-empty string key not starting with
            # "_". YAML keys may be ints/None, so check the type before
            # calling str methods — otherwise this check itself would raise.
            usable = isinstance(loaded, dict) and any(
                isinstance(key, str) and key and not key.startswith("_")
                for key in loaded
            )
            if not usable:
                error = "waf_profiles.yaml has no usable profiles"

    if error is not None:
        _LAST_LOAD_ERROR = error
        # Deep copy so callers mutating the result cannot corrupt the
        # module-level default shared by later calls.
        return copy.deepcopy(_DEFAULT_PROFILES)
    return loaded
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _cookies_dict(resp) -> dict:
    """Return the response cookies as a plain ``{name: value}`` dict.

    First tries the ``resp.cookies.jar`` iterable of cookie objects, then
    falls back to ``dict(resp.cookies)``. Returns {} when neither works.
    """
    try:
        jar = resp.cookies.jar
        return {cookie.name: cookie.value for cookie in jar}
    except Exception:
        pass
    try:
        if hasattr(resp, "cookies"):
            return dict(resp.cookies)
    except Exception:
        pass
    return {}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _headers_dict(resp) -> dict:
    """Return the response headers keyed by lower-cased name.

    Values are kept as-is. Any failure while reading them yields {}.
    """
    try:
        normalized = {}
        for name, value in dict(resp.headers).items():
            normalized[name.lower()] = value
        return normalized
    except Exception:
        return {}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _match_patterns(haystack_keys: list[str], patterns: list[str]) -> list[str]:
    """Return the patterns that match at least one key, case-insensitively.

    A pattern containing `*`, `?` or `[` is treated as an fnmatch glob (for
    wildcards like `X-Akamai-*`); any other pattern must equal a key exactly.
    Matched patterns are returned in their original spelling, once each.
    """
    keys = [key.lower() for key in haystack_keys]
    matched: list[str] = []
    for pattern in patterns or []:
        needle = pattern.lower()
        is_glob = any(ch in pattern for ch in "*?[")
        if is_glob:
            found = any(fnmatch.fnmatchcase(key, needle) for key in keys)
        else:
            found = needle in keys
        if found:
            matched.append(pattern)
    return matched
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _score_profile(profile_id: str, profile: dict, resp) -> Optional[DetectionHit]:
    """Apply one profile's detectors to *resp*.

    Collects signals in the order cookie names, header names, `Server`
    header substrings, body substrings. All comparisons are
    case-insensitive.

    Returns:
        A ``DetectionHit`` when at least one detector fires, else None.
        Confidence is 0.9 when the signal count reaches the profile's
        ``strong`` rule (default 2), 0.6 when it reaches ``weak``
        (default 1), otherwise 0.3. Ids starting with "_" and malformed
        (non-dict) profiles never score.
    """
    # Malformed YAML entries (non-string ids, null / scalar profiles) are
    # ignored rather than crashing the whole detection pass.
    if not isinstance(profile_id, str) or profile_id.startswith("_"):
        return None
    if not isinstance(profile, dict):
        return None
    detectors = profile.get("detectors") or {}
    if not detectors and profile_id != "unknown_challenge":
        return None

    cookies = _cookies_dict(resp)
    headers = _headers_dict(resp)
    body = (getattr(resp, "text", "") or "").lower()
    # Lower-case the header VALUE too: needles are lower-cased below, so a
    # mixed-case Server header could otherwise never match.
    server = str(headers.get("server", "") or "").lower()

    signals: list[str] = []

    # Cookie detectors
    for hit in _match_patterns(list(cookies.keys()), detectors.get("cookie") or []):
        signals.append(f"cookie:{hit}")

    # Header detectors
    for hit in _match_patterns(list(headers.keys()), detectors.get("header") or []):
        signals.append(f"header:{hit}")

    # Server substring
    for needle in detectors.get("server_contains") or []:
        if needle.lower() in server:
            signals.append(f"server:{needle}")

    # Body markers
    for needle in detectors.get("body") or []:
        if needle.lower() in body:
            signals.append(f"body:{needle}")

    if not signals:
        return None

    rules = profile.get("confidence_rules") or {"strong": 2, "weak": 1}
    n = len(signals)
    if n >= rules.get("strong", 2):
        conf = 0.9
    elif n >= rules.get("weak", 1):
        conf = 0.6
    else:
        conf = 0.3

    return DetectionHit(profile_id=profile_id, confidence=conf, signals=signals)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def detect(resp, *, profiles: Optional[dict] = None, min_confidence: float = 0.0) -> list[DetectionHit]:
    """Rank the WAF profiles that match *resp*, best first.

    Profiles are loaded from disk when *profiles* is None. Ids starting with
    "_" are skipped, as are hits below *min_confidence*. If nothing remains,
    the list holds a single `unknown_challenge` hit with confidence 0.1 so
    the caller can use its conservative settings.
    """
    catalog = _load_profiles() if profiles is None else profiles

    candidates: list[DetectionHit] = []
    for profile_id, profile in catalog.items():
        if profile_id.startswith("_"):
            continue
        scored = _score_profile(profile_id, profile, resp)
        if scored and scored.confidence >= min_confidence:
            candidates.append(scored)

    # Stable sort: equal confidences keep their catalog order.
    ranked = sorted(candidates, key=lambda x: x.confidence, reverse=True)
    if ranked:
        return ranked
    return [
        DetectionHit(
            profile_id="unknown_challenge",
            confidence=0.1,
            signals=["fallback"],
        )
    ]
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def load_profile(profile_id: str, *, profiles: Optional[dict] = None) -> dict:
    """Look up one profile by id.

    Falls back to the `unknown_challenge` profile when the id is missing or
    its profile is empty, and to {} when that is unavailable too. Profiles
    are loaded from disk when *profiles* is None.
    """
    catalog = profiles if profiles is not None else _load_profiles()
    chosen = catalog.get(profile_id)
    if chosen:
        return chosen
    return catalog.get("unknown_challenge") or {}
|