@gajae-code/coding-agent 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/dist/types/cli/notify-cli.d.ts +2 -0
  3. package/dist/types/config/settings-schema.d.ts +39 -2
  4. package/dist/types/extensibility/shared-events.d.ts +1 -0
  5. package/dist/types/gjc-runtime/launch-tmux.d.ts +1 -0
  6. package/dist/types/gjc-runtime/ralplan-runtime.d.ts +1 -1
  7. package/dist/types/gjc-runtime/tmux-common.d.ts +3 -0
  8. package/dist/types/gjc-runtime/tmux-sessions.d.ts +2 -0
  9. package/dist/types/lsp/types.d.ts +2 -0
  10. package/dist/types/notifications/attachment-registry.d.ts +17 -0
  11. package/dist/types/notifications/chat-adapters.d.ts +9 -0
  12. package/dist/types/notifications/config.d.ts +9 -1
  13. package/dist/types/notifications/engine.d.ts +59 -0
  14. package/dist/types/notifications/managed-daemon.d.ts +48 -0
  15. package/dist/types/notifications/telegram-daemon.d.ts +19 -0
  16. package/dist/types/notifications/threaded-inbound.d.ts +19 -0
  17. package/dist/types/notifications/threaded-render.d.ts +6 -1
  18. package/dist/types/session/agent-session.d.ts +2 -0
  19. package/dist/types/tools/fetch.d.ts +23 -0
  20. package/dist/types/tools/index.d.ts +1 -0
  21. package/dist/types/tools/telegram-send.d.ts +32 -0
  22. package/dist/types/web/insane/bridge.d.ts +103 -0
  23. package/dist/types/web/insane/url-guard.d.ts +22 -0
  24. package/dist/types/web/search/provider.d.ts +18 -1
  25. package/dist/types/web/search/providers/insane.d.ts +53 -0
  26. package/dist/types/web/search/providers/text-citations.d.ts +23 -0
  27. package/dist/types/web/search/types.d.ts +12 -4
  28. package/package.json +10 -8
  29. package/scripts/verify-insane-vendor.ts +132 -0
  30. package/src/cli/args.ts +1 -1
  31. package/src/cli/fast-help.ts +1 -1
  32. package/src/cli/notify-cli.ts +152 -5
  33. package/src/cli.ts +1 -3
  34. package/src/commands/team.ts +1 -1
  35. package/src/config/settings-schema.ts +30 -1
  36. package/src/defaults/gjc/skills/ralplan/SKILL.md +11 -4
  37. package/src/edit/modes/replace.ts +1 -1
  38. package/src/extensibility/shared-events.ts +1 -0
  39. package/src/gjc-runtime/launch-tmux.ts +27 -5
  40. package/src/gjc-runtime/ledger-event-renderer.ts +1 -0
  41. package/src/gjc-runtime/ralplan-runtime.ts +2 -2
  42. package/src/gjc-runtime/tmux-common.ts +8 -0
  43. package/src/gjc-runtime/tmux-sessions.ts +8 -1
  44. package/src/gjc-runtime/workflow-manifest.generated.json +29 -0
  45. package/src/gjc-runtime/workflow-manifest.ts +7 -2
  46. package/src/hashline/hash.ts +1 -1
  47. package/src/internal-urls/docs-index.generated.ts +9 -8
  48. package/src/lsp/config.ts +16 -3
  49. package/src/lsp/defaults.json +7 -0
  50. package/src/lsp/types.ts +2 -0
  51. package/src/modes/controllers/event-controller.ts +15 -0
  52. package/src/modes/interactive-mode.ts +46 -2
  53. package/src/modes/utils/context-usage.ts +2 -2
  54. package/src/notifications/attachment-registry.ts +23 -0
  55. package/src/notifications/chat-adapters.ts +147 -0
  56. package/src/notifications/config.ts +23 -2
  57. package/src/notifications/engine.ts +100 -0
  58. package/src/notifications/index.ts +224 -45
  59. package/src/notifications/managed-daemon.ts +163 -0
  60. package/src/notifications/telegram-daemon.ts +235 -14
  61. package/src/notifications/threaded-inbound.ts +60 -4
  62. package/src/notifications/threaded-render.ts +20 -2
  63. package/src/session/agent-session.ts +82 -51
  64. package/src/tools/ask.ts +3 -2
  65. package/src/tools/fetch.ts +78 -1
  66. package/src/tools/index.ts +3 -0
  67. package/src/tools/telegram-send.ts +137 -0
  68. package/src/web/insane/bridge.ts +350 -0
  69. package/src/web/insane/url-guard.ts +155 -0
  70. package/src/web/search/provider.ts +77 -18
  71. package/src/web/search/providers/anthropic.ts +70 -3
  72. package/src/web/search/providers/codex.ts +1 -119
  73. package/src/web/search/providers/gemini.ts +99 -0
  74. package/src/web/search/providers/insane.ts +551 -0
  75. package/src/web/search/providers/openai-compatible.ts +66 -32
  76. package/src/web/search/providers/text-citations.ts +111 -0
  77. package/src/web/search/types.ts +13 -2
  78. package/vendor/insane-search/LICENSE +21 -0
  79. package/vendor/insane-search/MANIFEST.json +24 -0
  80. package/vendor/insane-search/engine/__init__.py +23 -0
  81. package/vendor/insane-search/engine/__main__.py +128 -0
  82. package/vendor/insane-search/engine/bias_check.py +183 -0
  83. package/vendor/insane-search/engine/executor.py +254 -0
  84. package/vendor/insane-search/engine/fetch_chain.py +725 -0
  85. package/vendor/insane-search/engine/learning.py +175 -0
  86. package/vendor/insane-search/engine/phase0.py +214 -0
  87. package/vendor/insane-search/engine/safety.py +91 -0
  88. package/vendor/insane-search/engine/templates/package.json +11 -0
  89. package/vendor/insane-search/engine/templates/playwright_mobile_chrome.js +188 -0
  90. package/vendor/insane-search/engine/templates/playwright_real_chrome.js +243 -0
  91. package/vendor/insane-search/engine/tests/test_hardening.py +57 -0
  92. package/vendor/insane-search/engine/tests/test_smoke.py +152 -0
  93. package/vendor/insane-search/engine/tests/test_u1.py +200 -0
  94. package/vendor/insane-search/engine/tests/test_u4.py +131 -0
  95. package/vendor/insane-search/engine/tests/test_u5.py +163 -0
  96. package/vendor/insane-search/engine/tests/test_u7.py +124 -0
  97. package/vendor/insane-search/engine/transport.py +211 -0
  98. package/vendor/insane-search/engine/url_transforms.py +98 -0
  99. package/vendor/insane-search/engine/validators.py +331 -0
  100. package/vendor/insane-search/engine/waf_detector.py +214 -0
  101. package/vendor/insane-search/engine/waf_profiles.yaml +162 -0
@@ -0,0 +1,98 @@
1
+ """Generic URL transforms for the fetch grid.
2
+
3
+ Transforms are domain-agnostic *rules*. They never reference a specific
4
+ site by name. A transform either applies (returns a new URL) or is skipped
5
+ (returns None). Callers iterate transforms in order.
6
+
7
+ Empirically useful transforms (see observations/):
8
+ * mobile_subdomain — `www.example.com` → `m.example.com`
9
+ Strong win on SSR sites with mobile-first serving. Loss on SPA shells
10
+ (some mobile sites return tiny bootstrap HTML).
11
+ * am_prefix — `example.com` (no www) → `m.example.com`
12
+ * drop_www — occasionally unblocks hosts that gate www but not apex.
13
+
14
+ Adding new transforms: prove they help on ≥2 unrelated sites first
15
+ (cross-site validation — bias check).
16
+ """
17
+ from __future__ import annotations
18
+
19
+ from typing import Callable, Optional
20
+ from urllib.parse import urlsplit, urlunsplit
21
+
22
+
23
+ def _replace_host(url: str, new_host: str) -> str:
24
+ parts = urlsplit(url)
25
+ return urlunsplit(parts._replace(netloc=new_host))
26
+
27
+
28
+ def _original(url: str) -> Optional[str]:
29
+ return url
30
+
31
+
32
+ def _mobile_subdomain(url: str) -> Optional[str]:
33
+ """`https://www.example.com/a` → `https://m.example.com/a` (only if host starts with www.)."""
34
+ parts = urlsplit(url)
35
+ host = parts.hostname or ""
36
+ if not host.startswith("www."):
37
+ return None
38
+ new_host = "m." + host[4:]
39
+ if parts.port:
40
+ new_host = f"{new_host}:{parts.port}"
41
+ return _replace_host(url, new_host)
42
+
43
+
44
+ def _am_prefix(url: str) -> Optional[str]:
45
+ """`https://example.com/a` → `https://m.example.com/a` (only if host has no subdomain)."""
46
+ parts = urlsplit(url)
47
+ host = parts.hostname or ""
48
+ if not host or host.startswith("m."):
49
+ return None
50
+ # Only apply to apex-like hosts (≤2 dot-separated labels).
51
+ if host.count(".") >= 2 and not host.startswith("www."):
52
+ return None
53
+ if host.startswith("www."):
54
+ return None # handled by mobile_subdomain
55
+ return _replace_host(url, "m." + host)
56
+
57
+
58
+ def _drop_www(url: str) -> Optional[str]:
59
+ parts = urlsplit(url)
60
+ host = parts.hostname or ""
61
+ if not host.startswith("www."):
62
+ return None
63
+ return _replace_host(url, host[4:])
64
+
65
+
66
# Registry of transform name → transform function. Each function takes a URL
# and returns either a rewritten URL or None when it does not apply.
# `apply_transform` looks names up here.
TRANSFORMS: dict[str, Callable[[str], Optional[str]]] = {
    "original": _original,
    "mobile_subdomain": _mobile_subdomain,
    "am_prefix": _am_prefix,
    "drop_www": _drop_www,
}
72
+
73
+
74
def apply_transform(name: str, url: str) -> Optional[str]:
    """Run the transform registered under *name* against *url*.

    Returns the transformed URL, or None when the transform does not apply.

    Raises:
        ValueError: if *name* is not a key of `TRANSFORMS`.
    """
    if name not in TRANSFORMS:
        raise ValueError(f"Unknown transform: {name!r}. Known: {list(TRANSFORMS)}")
    transform = TRANSFORMS[name]
    return transform(url)
80
+
81
+
82
def iter_transformed(url: str, order: list[str]) -> list[tuple[str, str]]:
    """Build the list of (transform_name, transformed_url) pairs for *order*.

    Transforms that return None (not applicable) are left out, and each
    distinct URL is reported once, under the first transform that produced
    it (so `original` and `drop_www` of `https://example.com` don't double-run).
    """
    # dict keeps insertion order, so it doubles as "seen set" + ordered output.
    first_producer: dict[str, str] = {}
    for transform_name in order:
        candidate = apply_transform(transform_name, url)
        if candidate is not None and candidate not in first_producer:
            first_producer[candidate] = transform_name
    return [(transform_name, candidate) for candidate, transform_name in first_producer.items()]
@@ -0,0 +1,331 @@
1
+ """Generic challenge / success validator (v2).
2
+
3
+ Layers (all generic, never site-specific):
4
+ 1. HTTP status semantics (rate-limit / auth / not-found / transient / blocked)
5
+ 2. HARD challenge markers (structural WAF containers — always decisive)
6
+ 3. Size fingerprints (known bad BYTE sizes hinted by caller)
7
+ 4. Content-Type / JSON awareness (small JSON APIs are NOT challenges)
8
+ 5. Caller success_selectors (strongest positive proof for HTML)
9
+ 6. SOFT markers + cookie sensor + tiny-body heuristics (only when no
10
+ positive proof is available)
11
+
12
+ v2 changes vs v1 (per multi-AI review 2026-06-21):
13
+ * `WEAK_OK` is reserved for genuinely clean responses. Ambiguous states
14
+ (`_abck` unresolved, soft-block words without proof) now return the new
15
+ non-terminal `SUSPECT_OK` so the fetch chain keeps searching instead of
16
+ declaring a blocked page a success.
17
+ * Small valid JSON (e.g. an internal API) is no longer mislabelled
18
+ `CHALLENGE` — unblocks the R7 API-first route.
19
+ * SOFT markers (e.g. the word "captcha" buried in a script) no longer
20
+ override a matched success_selector.
21
+ * Size compared in BYTES, not unicode char count.
22
+ * Status codes differentiated (429/401/404/5xx) instead of one BLOCKED.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ import re
28
+ from dataclasses import dataclass, field
29
+ from enum import Enum
30
+ from typing import Optional
31
+
32
+ try:
33
+ from bs4 import BeautifulSoup
34
+ except ImportError: # bs4 is a soft dep: only used when selectors given
35
+ BeautifulSoup = None # type: ignore
36
+
37
+
38
# HARD markers: structural challenge/block containers. Decisive on their own —
# these strings do not appear in legitimate page content. (WAF products only.)
# Matched case-insensitively (each marker is lowercased before comparison
# against the lowercased body in `_hard_marker_hits`).
HARD_CHALLENGE_MARKERS: list[str] = [
    "sec-if-cpt-container",
    "Powered and protected by Akamai",
    "Just a moment...",
    "cf-chl-bypass",
    "Attention Required! | Cloudflare",
    "<title>Bot Challenge</title>",
    "The requested URL was rejected",
    "Request unsuccessful. Incapsula",
    "Please enable JS and disable any ad blocker",
]

# SOFT markers: words that strongly suggest a challenge BUT can legitimately
# appear in real content (scripts, articles about bots, etc). Only decisive
# when the caller has no positive proof (success_selectors) that overrides.
# These must stay lowercase: `_soft_marker_hits` compares them as-is against
# the lowercased body.
SOFT_CHALLENGE_MARKERS: list[str] = [
    "access denied",
    "checking your browser",
    "datadome",
    "captcha",
]

# Backward-compatible export (some callers import CHALLENGE_MARKERS).
CHALLENGE_MARKERS: list[str] = HARD_CHALLENGE_MARKERS + SOFT_CHALLENGE_MARKERS

# Minimum BODY BYTE size below which we suspect a stub / challenge page.
SMALL_BODY_THRESHOLD = 3000
67
+
68
+
69
class Verdict(Enum):
    """Classification of a fetched response.

    Only STRONG_OK and WEAK_OK count as success (see `ValidationResult.ok`).
    """

    STRONG_OK = "strong_ok"        # positive proof present → terminal success
    WEAK_OK = "weak_ok"            # clean, no negative signal → terminal success
    # NOTE(review): `validate` emits SUSPECT_OK for an unresolved `_abck`
    # cookie and for empty JSON; soft markers without proof currently yield
    # CHALLENGE, not SUSPECT_OK — confirm which is intended.
    SUSPECT_OK = "suspect_ok"      # ambiguous (abck unresolved / soft) → NON-terminal
    CHALLENGE = "challenge"        # WAF challenge (negative proof)
    BLOCKED = "blocked"            # generic non-2xx block
    RATE_LIMITED = "rate_limited"  # 429 — back off, do not hammer
    AUTH_REQUIRED = "auth_required"  # 401/407 — terminal, retrying TLS won't help
    NOT_FOUND = "not_found"        # 404/410 — terminal
    UNKNOWN = "unknown"            # exception / dependency missing
81
+
82
+
83
# Verdicts that mean "stop the grid — more TLS attempts cannot help".
# BLOCKED, CHALLENGE, SUSPECT_OK and UNKNOWN are deliberately absent from
# this set.
TERMINAL_NONSUCCESS = frozenset({
    Verdict.AUTH_REQUIRED, Verdict.NOT_FOUND, Verdict.RATE_LIMITED,
})
87
+
88
+
89
@dataclass
class ValidationResult:
    """Outcome of validating one response.

    Carries the verdict, the human-readable reasons behind it, any caller
    selectors that matched, the body size in bytes and the HTTP status.
    """

    verdict: Verdict
    reasons: list[str] = field(default_factory=list)
    matched_selectors: list[str] = field(default_factory=list)
    body_size: int = 0  # bytes
    status: int = 0

    @property
    def ok(self) -> bool:
        """True only for terminal success; SUSPECT_OK deliberately does not count."""
        terminal_success = (Verdict.STRONG_OK, Verdict.WEAK_OK)
        return self.verdict in terminal_success

    def to_dict(self) -> dict:
        """Serialize to a plain dict (the verdict becomes its string value)."""
        return dict(
            verdict=self.verdict.value,
            reasons=self.reasons,
            matched_selectors=self.matched_selectors,
            body_size=self.body_size,
            status=self.status,
        )
110
+
111
+
112
def _hard_marker_hits(body_lower: str) -> list[str]:
    """List the HARD markers found in an already-lowercased body.

    Markers are lowercased for the comparison but reported in their
    original spelling, in list order.
    """
    found: list[str] = []
    for marker in HARD_CHALLENGE_MARKERS:
        if marker.lower() in body_lower:
            found.append(marker)
    return found
114
+
115
+
116
def _soft_marker_hits(body_lower: str) -> list[str]:
    """List the SOFT markers found in an already-lowercased body.

    Markers are compared as written (no case folding on the marker side).
    """
    found: list[str] = []
    for marker in SOFT_CHALLENGE_MARKERS:
        if marker in body_lower:
            found.append(marker)
    return found
118
+
119
+
120
+ def _abck_unresolved(cookies: dict) -> bool:
121
+ abck = cookies.get("_abck", "")
122
+ return bool(abck) and "~-1~" in abck
123
+
124
+
125
+ def _content_type(resp) -> str:
126
+ try:
127
+ headers = {k.lower(): v for k, v in dict(getattr(resp, "headers", {}) or {}).items()}
128
+ return str(headers.get("content-type", "")).lower()
129
+ except Exception:
130
+ return ""
131
+
132
+
133
+ def _looks_like_json(text: str, ctype: str) -> bool:
134
+ if "json" in ctype:
135
+ return True
136
+ s = text.lstrip()[:1]
137
+ return s in ("{", "[")
138
+
139
+
140
+ def _json_ok(text: str) -> Optional[bool]:
141
+ """True if text parses as non-empty JSON, False if parses-but-empty,
142
+ None if not parseable."""
143
+ try:
144
+ obj = json.loads(text)
145
+ except Exception:
146
+ return None
147
+ if obj in (None, {}, [], ""):
148
+ return False
149
+ return True
150
+
151
+
152
+ def _byte_size(resp, text: str) -> int:
153
+ content = getattr(resp, "content", None)
154
+ if isinstance(content, (bytes, bytearray)):
155
+ return len(content)
156
+ return len(text.encode("utf-8", "ignore"))
157
+
158
+
159
+ def _looks_complete_content_page(text: str, lowered: str) -> bool:
160
+ """True when a SMALL body is still a real (short) page, not a challenge stub.
161
+
162
+ A genuine page is a COMPLETE HTML document (closes `</html>`/`</body>`) that
163
+ carries meaningful visible text — e.g. example.com at ~600B. A WAF interstitial
164
+ that slipped past the marker checks is typically script-only, empty, or an
165
+ incomplete fragment, so it has little visible text and returns False."""
166
+ if "</html>" not in lowered and "</body>" not in lowered:
167
+ return False
168
+ visible = re.sub(r"(?is)<(script|style)[^>]*>.*?</\1>", " ", text)
169
+ visible = re.sub(r"(?s)<[^>]+>", " ", visible)
170
+ visible = re.sub(r"\s+", " ", visible).strip()
171
+ return len(visible) >= 64
172
+
173
+
174
def _selector_hits(body: str, selectors: list[str]) -> Optional[list[str]]:
    """Return the selectors that match *body*, or None if BS4 is unavailable.

    A body that cannot be parsed yields an empty list; a selector that
    raises while being evaluated is skipped.
    """
    if BeautifulSoup is None:
        return None
    try:
        document = BeautifulSoup(body, "html.parser")
    except Exception:
        return []
    matched: list[str] = []
    for selector in selectors:
        try:
            found = document.select(selector)
        except Exception:
            continue
        if found:
            matched.append(selector)
    return matched
190
+
191
+
192
def validate(
    resp,
    *,
    success_selectors: Optional[list[str]] = None,
    known_bad_sizes: Optional[list[int]] = None,
    size_tolerance: int = 20,
) -> ValidationResult:
    """Validate a `curl_cffi` / `requests` response (v2).

    Layers are evaluated in order; the first decisive one returns:
      1. status semantics (429 / 401,407 / 404,410 / 5xx / 0),
      2. HARD challenge markers,
      3. known-bad body sizes (within *size_tolerance* bytes),
      4. JSON bodies,
      5. caller *success_selectors*,
      6. SOFT markers, tiny-body and `_abck` heuristics.

    Other 4xx statuses (403/406/...) go through the body analysis so a WAF
    challenge can be recognised, but they can never end as `WEAK_OK`: when
    nothing else is decisive the verdict is the non-terminal `BLOCKED`.
    A matched success selector still yields `STRONG_OK` regardless of status.

    Args:
        resp: response object; `status_code`, `text`, `content`, `headers`
            and `cookies` are read when present.
        success_selectors: CSS selectors that prove the page is real.
        known_bad_sizes: byte sizes of known challenge pages.
        size_tolerance: allowed distance in bytes from a known bad size.

    Returns:
        A `ValidationResult`; this function does not raise for unreadable
        responses (it returns `UNKNOWN` with a `parse_error:` reason).
    """
    try:
        status = int(getattr(resp, "status_code", 0) or 0)
        text = getattr(resp, "text", "") or ""
        size = _byte_size(resp, text)
    except Exception as e:
        return ValidationResult(verdict=Verdict.UNKNOWN, reasons=[f"parse_error:{e}"])

    r = ValidationResult(verdict=Verdict.UNKNOWN, body_size=size, status=status)

    def _done(verdict: Verdict, *reasons: str) -> ValidationResult:
        # Single exit helper: set verdict, record reasons, return the result.
        r.verdict = verdict
        r.reasons.extend(reasons)
        return r

    # --- Layer 1: status semantics ----------------------------------------
    if status == 429:
        return _done(Verdict.RATE_LIMITED, "status=429")
    if status in (401, 407):
        return _done(Verdict.AUTH_REQUIRED, f"status={status}")
    if status in (404, 410):
        return _done(Verdict.NOT_FOUND, f"status={status}")
    if 500 <= status <= 599:
        return _done(Verdict.BLOCKED, f"status={status}")
    if status == 0:
        return _done(Verdict.UNKNOWN, "status=0")
    # 403/406/etc fall through to marker analysis (often a WAF challenge body),
    # but an error status must never be reported as a clean success.
    error_status = status >= 400

    lowered = text.lower()

    # --- Layer 2: HARD markers (decisive) ---------------------------------
    hard = _hard_marker_hits(lowered)
    if hard:
        return _done(Verdict.CHALLENGE, *(f"hard:{m}" for m in hard[:3]))

    # --- Layer 3: size fingerprint (bytes, tolerant) ----------------------
    for bad in known_bad_sizes or []:
        if abs(size - bad) <= size_tolerance:
            return _done(Verdict.CHALLENGE, f"size_fp:{size}~{bad}")

    # --- Layer 4: JSON awareness (before tiny-body heuristic) -------------
    ctype = _content_type(resp)
    if _looks_like_json(text, ctype):
        j = _json_ok(text)
        if j is True:
            if error_status:
                # A JSON error payload on a 4xx is a refusal, not an API hit.
                return _done(Verdict.BLOCKED, f"status={status}", "json_error_body")
            # A 2xx with non-empty parseable JSON is a successful API hit even
            # if tiny. CSS selectors don't apply to JSON, so WEAK_OK is the
            # ceiling here (no HTML positive-proof concept).
            return _done(Verdict.WEAK_OK, "json_ok")
        if j is False:
            return _done(Verdict.SUSPECT_OK, "json_empty")
        # j is None → not actually JSON; fall through to HTML handling.

    cookies = _extract_cookies(resp)
    abck_bad = _abck_unresolved(cookies)

    # --- Layer 5: caller positive proof (HTML) ----------------------------
    if success_selectors:
        hits = _selector_hits(text, success_selectors)
        if hits is None:
            return _done(Verdict.UNKNOWN, "bs4_missing")
        if hits:
            r.matched_selectors = hits
            # Selector matched → soft markers are ignored (they were likely in
            # a script or unrelated text). But an unresolved sensor cookie
            # still demotes us to NON-terminal SUSPECT_OK.
            if abck_bad:
                return _done(Verdict.SUSPECT_OK, "abck_unresolved")
            return _done(Verdict.STRONG_OK)
        # Selectors requested but none matched → challenge.
        return _done(Verdict.CHALLENGE, "no_success_selector")

    # --- Layer 6: no positive proof — heuristics --------------------------
    soft = _soft_marker_hits(lowered)
    if soft:
        return _done(Verdict.CHALLENGE, *(f"soft:{m}" for m in soft[:3]))

    if size < SMALL_BODY_THRESHOLD:
        # A small body is only weak evidence of a challenge stub. A COMPLETE,
        # content-bearing HTML document that just happens to be short (e.g.
        # example.com ~600B) is a real page → clean weak success. Only an
        # incomplete / script-only / empty small body stays suspicious.
        if _looks_complete_content_page(text, lowered):
            if error_status:
                # A short, well-formed error page is still an error page.
                return _done(Verdict.BLOCKED, f"status={status}")
            return _done(Verdict.WEAK_OK, f"small_but_complete:{size}")
        return _done(Verdict.CHALLENGE, f"tiny_body:{size}")

    if abck_bad:
        # Unresolved Akamai sensor with no positive proof: do NOT declare
        # success. Non-terminal so the chain keeps trying.
        return _done(Verdict.SUSPECT_OK, "abck_unresolved")

    if error_status:
        # No challenge signal found, but the server still refused the request.
        return _done(Verdict.BLOCKED, f"status={status}")

    # Clean, sizeable, no negative signal, no sensor problem → terminal weak ok.
    return _done(Verdict.WEAK_OK)
322
+
323
+
324
+ def _extract_cookies(resp) -> dict:
325
+ try:
326
+ return {c.name: c.value for c in resp.cookies.jar}
327
+ except Exception:
328
+ try:
329
+ return dict(resp.cookies) if hasattr(resp, "cookies") else {}
330
+ except Exception:
331
+ return {}
@@ -0,0 +1,214 @@
1
+ """WAF-product detection from a live response.
2
+
3
+ Returns a *ranking* of (profile_id, confidence) pairs — never a single verdict.
4
+ Single-answer detectors cause cascading wrong plans when misfiring (Codex's
5
+ critique). Planner consumes the ranking and tries top candidates in order.
6
+
7
+ All detectors operate on WAF-vendor artifacts (cookies / headers / body
8
+ strings) — never site hostnames. See engine/waf_profiles.yaml for the
9
+ profile definitions.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import fnmatch
14
+ import os
15
+ import re
16
+ from dataclasses import dataclass
17
+ from typing import Optional
18
+
19
+ try:
20
+ import yaml # PyYAML
21
+ except ImportError:
22
+ yaml = None # type: ignore
23
+
24
+
25
# Default location of the profile definitions: next to this module.
PROFILES_PATH = os.path.join(os.path.dirname(__file__), "waf_profiles.yaml")


# In-code safety net — used when waf_profiles.yaml is missing / invalid
# or PyYAML isn't installed. Keeps fetch() working in a degraded-but-sane
# mode. Must stay site-agnostic (No-Site-Name Rule).
_DEFAULT_PROFILES: dict = {
    "unknown_challenge": {
        # No detectors: this profile never scores a hit on its own; `detect`
        # adds it as the fallback entry when nothing else fires.
        "detectors": {},
        "confidence_rules": {"strong": 0, "weak": 0},
        "capabilities_needed": ["needs_js_exec"],
        "tls_impersonate_candidates": [
            ["safari", "chrome", "firefox"],
            ["safari_ios", "chrome_android"],
        ],
        "referer_strategies": ["self_root", "google_search", "none"],
        "url_transform_order": ["original", "mobile_subdomain"],
        "fallback_when_challenge": ["playwright_mcp", "playwright_real_chrome"],
        "notes": "in-code default — waf_profiles.yaml unavailable",
    },
}


# Module-level sticky error. Readers call `last_load_error()` after each
# `_load_profiles()` call to surface YAML problems in FetchResult.trace.
_LAST_LOAD_ERROR: Optional[str] = None
51
+
52
+
53
@dataclass
class DetectionHit:
    """One candidate WAF profile together with the evidence behind it."""

    profile_id: str      # key of the matched profile
    confidence: float    # score assigned by the profile's confidence rules
    signals: list[str]   # "kind:value" strings describing each matched detector
58
+
59
+
60
def last_load_error() -> Optional[str]:
    """Expose the error recorded by the latest `_load_profiles()` call.

    Returns None when that load completed without problems.
    """
    return _LAST_LOAD_ERROR
63
+
64
+
65
def _load_profiles(path: str = PROFILES_PATH) -> dict:
    """Load profiles with graceful fallback.

    Never raises. On any failure (PyYAML missing, file missing, parse error,
    unexpected shape) it returns an independent deep copy of
    `_DEFAULT_PROFILES` and stores the reason in `_LAST_LOAD_ERROR` for the
    caller to surface via `last_load_error()`.

    A loaded document is "usable" when it is a mapping with at least one
    non-empty string key that does not start with an underscore. Non-string
    keys are ignored by this check instead of crashing it.

    Args:
        path: location of the YAML profile file.

    Returns:
        The parsed YAML mapping, or a copy of the in-code defaults.
    """
    import copy  # local import: only this function needs it

    global _LAST_LOAD_ERROR
    _LAST_LOAD_ERROR = None

    def _fallback(reason: str) -> dict:
        # Record why we degraded and hand out a copy the caller may mutate
        # without corrupting the module-level defaults.
        global _LAST_LOAD_ERROR
        _LAST_LOAD_ERROR = reason
        return copy.deepcopy(_DEFAULT_PROFILES)

    if yaml is None:
        return _fallback("PyYAML not installed — using in-code default profile")
    try:
        with open(path, "r", encoding="utf-8") as f:
            loaded = yaml.safe_load(f) or {}
    except FileNotFoundError:
        return _fallback(f"waf_profiles.yaml not found at {path}")
    except yaml.YAMLError as e:
        return _fallback(f"YAML parse error: {type(e).__name__}: {str(e)[:200]}")
    except Exception as e:
        return _fallback(f"profile loader: {type(e).__name__}: {str(e)[:200]}")

    usable = isinstance(loaded, dict) and any(
        isinstance(k, str) and k and not k.startswith("_") for k in loaded
    )
    if not usable:
        return _fallback("waf_profiles.yaml has no usable profiles")

    return loaded
96
+
97
+
98
+ def _cookies_dict(resp) -> dict:
99
+ try:
100
+ return {c.name: c.value for c in resp.cookies.jar}
101
+ except Exception:
102
+ try:
103
+ return dict(resp.cookies) if hasattr(resp, "cookies") else {}
104
+ except Exception:
105
+ return {}
106
+
107
+
108
+ def _headers_dict(resp) -> dict:
109
+ try:
110
+ return {k.lower(): v for k, v in dict(resp.headers).items()}
111
+ except Exception:
112
+ return {}
113
+
114
+
115
+ def _match_patterns(haystack_keys: list[str], patterns: list[str]) -> list[str]:
116
+ """Match literal names or fnmatch patterns (for wildcards like `X-Akamai-*`)."""
117
+ hits: list[str] = []
118
+ lowered_keys = [k.lower() for k in haystack_keys]
119
+ for pat in patterns or []:
120
+ pat_l = pat.lower()
121
+ if any(c in pat for c in "*?["):
122
+ for key in lowered_keys:
123
+ if fnmatch.fnmatchcase(key, pat_l):
124
+ hits.append(pat)
125
+ break
126
+ else:
127
+ if pat_l in lowered_keys:
128
+ hits.append(pat)
129
+ return hits
130
+
131
+
132
def _score_profile(profile_id: str, profile: dict, resp) -> Optional[DetectionHit]:
    """Apply one profile's detectors to *resp*.

    Detector kinds read from `profile["detectors"]`:
      * `cookie` / `header` — names or fnmatch globs, matched against cookie
        and header names,
      * `server_contains` — substrings of the `Server` header value,
      * `body` — substrings of the response text.
    All comparisons are case-insensitive.

    Confidence is 0.9 when the number of signals reaches the profile's
    `confidence_rules["strong"]` (default 2), 0.6 when it reaches `"weak"`
    (default 1), otherwise 0.3.

    Returns:
        A `DetectionHit`, or None when the profile is private (id starts
        with `_`), is not a mapping, has no detectors, or nothing matched.
    """
    if profile_id.startswith("_"):
        return None
    if not isinstance(profile, dict):
        # e.g. a YAML key with an empty value; nothing to score.
        return None
    detectors = profile.get("detectors") or {}
    if not detectors and profile_id != "unknown_challenge":
        return None

    cookies = _cookies_dict(resp)
    headers = _headers_dict(resp)
    body = (getattr(resp, "text", "") or "").lower()
    # Header *values* keep their original case in `_headers_dict`; fold here
    # so that mixed-case Server banners match the lowercased needles below.
    server = str(headers.get("server", "") or "").lower()

    signals: list[str] = []

    # Cookie detectors
    cookie_pats = detectors.get("cookie") or []
    signals.extend(f"cookie:{hit}" for hit in _match_patterns(list(cookies.keys()), cookie_pats))

    # Header detectors
    header_pats = detectors.get("header") or []
    signals.extend(f"header:{hit}" for hit in _match_patterns(list(headers.keys()), header_pats))

    # Server substring
    signals.extend(
        f"server:{needle}"
        for needle in detectors.get("server_contains") or []
        if needle.lower() in server
    )

    # Body markers
    signals.extend(
        f"body:{needle}"
        for needle in detectors.get("body") or []
        if needle.lower() in body
    )

    if not signals:
        return None

    rules = profile.get("confidence_rules") or {"strong": 2, "weak": 1}
    n = len(signals)
    if n >= rules.get("strong", 2):
        conf = 0.9
    elif n >= rules.get("weak", 1):
        conf = 0.6
    else:
        conf = 0.3

    return DetectionHit(profile_id=profile_id, confidence=conf, signals=signals)
180
+
181
+
182
def detect(resp, *, profiles: Optional[dict] = None, min_confidence: float = 0.0) -> list[DetectionHit]:
    """Rank the WAF profiles that match *resp*, most confident first.

    Profiles are loaded from disk when *profiles* is not supplied. Ids that
    start with `_` are skipped, as are hits below *min_confidence*. Ties
    keep profile iteration order.

    When nothing fires, the returned list contains a single
    `unknown_challenge` hit with confidence 0.1 — caller can use its
    conservative settings.
    """
    table = _load_profiles() if profiles is None else profiles

    scored = (
        _score_profile(pid, spec, resp)
        for pid, spec in table.items()
        if not pid.startswith("_")
    )
    ranked = sorted(
        (hit for hit in scored if hit and hit.confidence >= min_confidence),
        key=lambda hit: hit.confidence,
        reverse=True,
    )
    if ranked:
        return ranked

    fallback = DetectionHit(
        profile_id="unknown_challenge",
        confidence=0.1,
        signals=["fallback"],
    )
    return [fallback]
208
+
209
+
210
def load_profile(profile_id: str, *, profiles: Optional[dict] = None) -> dict:
    """Look up one profile by id.

    Falls back to the `unknown_challenge` profile when *profile_id* is
    missing or empty, and to {} when that is missing too. Profiles are
    loaded from disk when *profiles* is not supplied.
    """
    table = profiles if profiles is not None else _load_profiles()
    for key in (profile_id, "unknown_challenge"):
        candidate = table.get(key)
        if candidate:
            return candidate
    return {}