@gajae-code/coding-agent 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/CHANGELOG.md +19 -0
  2. package/dist/types/cli/notify-cli.d.ts +2 -0
  3. package/dist/types/config/settings-schema.d.ts +39 -2
  4. package/dist/types/extensibility/shared-events.d.ts +1 -0
  5. package/dist/types/gjc-runtime/ralplan-runtime.d.ts +1 -1
  6. package/dist/types/lsp/types.d.ts +2 -0
  7. package/dist/types/notifications/attachment-registry.d.ts +17 -0
  8. package/dist/types/notifications/chat-adapters.d.ts +9 -0
  9. package/dist/types/notifications/config.d.ts +9 -1
  10. package/dist/types/notifications/engine.d.ts +59 -0
  11. package/dist/types/notifications/managed-daemon.d.ts +48 -0
  12. package/dist/types/notifications/telegram-daemon.d.ts +19 -0
  13. package/dist/types/notifications/threaded-inbound.d.ts +19 -0
  14. package/dist/types/notifications/threaded-render.d.ts +6 -1
  15. package/dist/types/session/agent-session.d.ts +2 -0
  16. package/dist/types/tools/fetch.d.ts +23 -0
  17. package/dist/types/tools/index.d.ts +1 -0
  18. package/dist/types/tools/telegram-send.d.ts +32 -0
  19. package/dist/types/web/insane/bridge.d.ts +103 -0
  20. package/dist/types/web/insane/url-guard.d.ts +22 -0
  21. package/dist/types/web/search/provider.d.ts +18 -1
  22. package/dist/types/web/search/providers/insane.d.ts +53 -0
  23. package/dist/types/web/search/providers/text-citations.d.ts +23 -0
  24. package/dist/types/web/search/types.d.ts +12 -4
  25. package/package.json +10 -8
  26. package/scripts/verify-insane-vendor.ts +132 -0
  27. package/src/cli/args.ts +1 -1
  28. package/src/cli/fast-help.ts +1 -1
  29. package/src/cli/notify-cli.ts +152 -5
  30. package/src/commands/team.ts +1 -1
  31. package/src/config/settings-schema.ts +30 -1
  32. package/src/defaults/gjc/skills/ralplan/SKILL.md +11 -4
  33. package/src/extensibility/shared-events.ts +1 -0
  34. package/src/gjc-runtime/launch-tmux.ts +17 -3
  35. package/src/gjc-runtime/ledger-event-renderer.ts +1 -0
  36. package/src/gjc-runtime/ralplan-runtime.ts +2 -2
  37. package/src/gjc-runtime/workflow-manifest.generated.json +29 -0
  38. package/src/gjc-runtime/workflow-manifest.ts +7 -2
  39. package/src/internal-urls/docs-index.generated.ts +7 -7
  40. package/src/lsp/config.ts +16 -3
  41. package/src/lsp/defaults.json +7 -0
  42. package/src/lsp/types.ts +2 -0
  43. package/src/modes/controllers/event-controller.ts +15 -0
  44. package/src/modes/interactive-mode.ts +46 -2
  45. package/src/modes/utils/context-usage.ts +2 -2
  46. package/src/notifications/attachment-registry.ts +23 -0
  47. package/src/notifications/chat-adapters.ts +147 -0
  48. package/src/notifications/config.ts +23 -2
  49. package/src/notifications/engine.ts +100 -0
  50. package/src/notifications/index.ts +180 -38
  51. package/src/notifications/managed-daemon.ts +163 -0
  52. package/src/notifications/telegram-daemon.ts +235 -14
  53. package/src/notifications/threaded-inbound.ts +60 -4
  54. package/src/notifications/threaded-render.ts +20 -2
  55. package/src/session/agent-session.ts +82 -51
  56. package/src/tools/fetch.ts +78 -1
  57. package/src/tools/index.ts +3 -0
  58. package/src/tools/telegram-send.ts +137 -0
  59. package/src/web/insane/bridge.ts +350 -0
  60. package/src/web/insane/url-guard.ts +155 -0
  61. package/src/web/search/provider.ts +77 -18
  62. package/src/web/search/providers/anthropic.ts +70 -3
  63. package/src/web/search/providers/codex.ts +1 -119
  64. package/src/web/search/providers/gemini.ts +99 -0
  65. package/src/web/search/providers/insane.ts +551 -0
  66. package/src/web/search/providers/openai-compatible.ts +66 -32
  67. package/src/web/search/providers/text-citations.ts +111 -0
  68. package/src/web/search/types.ts +13 -2
  69. package/vendor/insane-search/LICENSE +21 -0
  70. package/vendor/insane-search/MANIFEST.json +24 -0
  71. package/vendor/insane-search/engine/__init__.py +23 -0
  72. package/vendor/insane-search/engine/__main__.py +128 -0
  73. package/vendor/insane-search/engine/bias_check.py +183 -0
  74. package/vendor/insane-search/engine/executor.py +254 -0
  75. package/vendor/insane-search/engine/fetch_chain.py +725 -0
  76. package/vendor/insane-search/engine/learning.py +175 -0
  77. package/vendor/insane-search/engine/phase0.py +214 -0
  78. package/vendor/insane-search/engine/safety.py +91 -0
  79. package/vendor/insane-search/engine/templates/package.json +11 -0
  80. package/vendor/insane-search/engine/templates/playwright_mobile_chrome.js +188 -0
  81. package/vendor/insane-search/engine/templates/playwright_real_chrome.js +243 -0
  82. package/vendor/insane-search/engine/tests/test_hardening.py +57 -0
  83. package/vendor/insane-search/engine/tests/test_smoke.py +152 -0
  84. package/vendor/insane-search/engine/tests/test_u1.py +200 -0
  85. package/vendor/insane-search/engine/tests/test_u4.py +131 -0
  86. package/vendor/insane-search/engine/tests/test_u5.py +163 -0
  87. package/vendor/insane-search/engine/tests/test_u7.py +124 -0
  88. package/vendor/insane-search/engine/transport.py +211 -0
  89. package/vendor/insane-search/engine/url_transforms.py +98 -0
  90. package/vendor/insane-search/engine/validators.py +331 -0
  91. package/vendor/insane-search/engine/waf_detector.py +214 -0
  92. package/vendor/insane-search/engine/waf_profiles.yaml +162 -0
@@ -0,0 +1,725 @@
1
+ """Single entrypoint: insane-search generic fetch chain.
2
+
3
+ from insane_search.engine import fetch
4
+ result = fetch("https://example.com/path", success_selectors=["article"])
5
+
6
+ Public contract:
7
+ * One function: `fetch(url, ...) -> FetchResult`.
8
+ * Internal structure preserved as explicit phases so tests & debug logs
9
+ can target each stage: probe → validate → detect → plan → execute → report.
10
+ * `FetchResult.trace` exposes every attempt (transform × impersonate ×
11
+ referer × executor) — callers can diagnose without re-running.
12
+
13
+ v2 scheduler (multi-AI review 2026-06-21):
14
+ * `_build_plan` materializes the whole grid then orders it for DIVERSITY —
15
+ one representative per TLS family across both URL transforms first, so a
16
+ small attempt budget still touches every family/transform instead of
17
+ burning out on the Safari family.
18
+ * `tls_impersonate_avoid` entries are DEPRIORITIZED (moved last), never
19
+ deleted — they are still attempted in exhaustive mode.
20
+ * `max_attempts=None` (new default) means EXHAUSTIVE — run the full plan,
21
+ honouring R6. A numeric cap is a *budget*, and exhaustion vs budget vs
22
+ early-terminal is reported via `stop_reason` / `grid_exhausted`.
23
+ * Jitter sleeps only on a CONTINUING (failed) attempt, never before a
24
+ successful return.
25
+ * `SUSPECT_OK` (abck unresolved / soft block) is NON-terminal: kept as
26
+ best-effort, but the grid keeps searching for real proof.
27
+
28
+ No site-specific branching. Site knowledge enters only via:
29
+ * `success_selectors` (caller-supplied positive proof)
30
+ * `user_hint` (optional runtime hints; never persisted by this module)
31
+ """
32
+ from __future__ import annotations
33
+
34
+ import os
35
+ import random
36
+ import time
37
+ from dataclasses import dataclass, field, asdict
38
+ from typing import Any, Optional
39
+
40
+ from .validators import Verdict, validate, TERMINAL_NONSUCCESS
41
+ from .waf_detector import detect, load_profile, _load_profiles, last_load_error
42
+ from .url_transforms import iter_transformed
43
+
44
+
45
+ _OK_VALUES = (Verdict.STRONG_OK.value, Verdict.WEAK_OK.value)
46
+ _TERMINAL_NONSUCCESS_VALUES = frozenset(v.value for v in TERMINAL_NONSUCCESS)
47
+
48
+
49
+ # --- Referer strategies (name → function of original URL) --------------------
50
+ def _self_root(url: str) -> str:
51
+ from urllib.parse import urlsplit
52
+ p = urlsplit(url)
53
+ return f"{p.scheme}://{p.netloc}/"
54
+
55
+
56
+ REFERER_STRATEGIES = {
57
+ "self_root": _self_root,
58
+ "google_search": lambda _url: "https://www.google.com/",
59
+ "none": lambda _url: "",
60
+ }
61
+
62
+
63
+ # --- Attempt & result schema -------------------------------------------------
64
+ @dataclass
65
+ class Attempt:
66
+ phase: str # probe | grid | fallback
67
+ executor: str # curl_cffi | playwright_real_chrome | ...
68
+ url: str
69
+ url_transform: str # original | mobile_subdomain | ...
70
+ impersonate: Optional[str] # safari | chrome | ... | None (non-curl)
71
+ referer: str
72
+ status: int = 0
73
+ body_size: int = 0
74
+ verdict: str = ""
75
+ reasons: list[str] = field(default_factory=list)
76
+ elapsed_s: float = 0.0
77
+ error: Optional[str] = None
78
+
79
+ def to_dict(self) -> dict:
80
+ return asdict(self)
81
+
82
+
83
+ @dataclass
84
+ class FetchResult:
85
+ ok: bool
86
+ content: str = ""
87
+ final_url: str = ""
88
+ verdict: str = ""
89
+ profile_used: Optional[str] = None
90
+ trace: list[Attempt] = field(default_factory=list)
91
+ summary: str = ""
92
+ # v2 scheduler diagnostics
93
+ planned_attempts: int = 0
94
+ executed_attempts: int = 0
95
+ grid_exhausted: bool = False
96
+ stop_reason: str = "" # success | exhausted | budget | <terminal verdict> | error
97
+ # Failure gate (R6): when ok=False these tell the caller it is NOT finished —
98
+ # which escalation routes the engine could not perform itself remain to try.
99
+ untried_routes: list[str] = field(default_factory=list)
100
+ must_invoke_playwright_mcp: bool = False
101
+
102
+ def to_dict(self, *, include_content: bool = False, content_limit: int = 4_000_000) -> dict:
103
+ content = self.content or ""
104
+ bounded_content = content[:max(0, content_limit)]
105
+ payload = {
106
+ "ok": self.ok,
107
+ "final_url": self.final_url,
108
+ "verdict": self.verdict,
109
+ "profile_used": self.profile_used,
110
+ "trace": [a.to_dict() for a in self.trace],
111
+ "summary": self.summary,
112
+ "content_length": len(content),
113
+ "content_truncated": len(bounded_content) < len(content),
114
+ "planned_attempts": self.planned_attempts,
115
+ "executed_attempts": self.executed_attempts,
116
+ "grid_exhausted": self.grid_exhausted,
117
+ "stop_reason": self.stop_reason,
118
+ "untried_routes": self.untried_routes,
119
+ "must_invoke_playwright_mcp": self.must_invoke_playwright_mcp,
120
+ }
121
+ if include_content:
122
+ payload["content"] = bounded_content
123
+ return payload
124
+
125
+
126
+ # --- curl_cffi probe executor ------------------------------------------------
127
+ def _curl_probe(
128
+ url: str, *, impersonate: str, referer: str, timeout: int = 20
129
+ ) -> tuple[Any, Optional[str]]:
130
+ """Returns (response, error_str). response may be None on exception.
131
+
132
+ Routes through the per-host SessionPool so cookies (WAF sensors) and the
133
+ warm connection persist across attempts and across pages of the same host.
134
+ The pool degrades to a one-shot GET when a Session can't be created.
135
+ """
136
+ from .transport import POOL
137
+ return POOL.request(url, impersonate=impersonate, referer=referer, timeout=timeout)
138
+
139
+
140
+ def _run_attempt(
141
+ url: str,
142
+ *,
143
+ transform_name: str,
144
+ impersonate: str,
145
+ referer_name: str,
146
+ success_selectors: Optional[list[str]],
147
+ known_bad_sizes: Optional[list[int]],
148
+ timeout: int,
149
+ phase: str,
150
+ ) -> tuple[Attempt, Any]:
151
+ """Execute one curl_cffi attempt and produce an Attempt record."""
152
+ referer_url = REFERER_STRATEGIES.get(referer_name, REFERER_STRATEGIES["none"])(url)
153
+ t0 = time.time()
154
+ resp, err = _curl_probe(url, impersonate=impersonate, referer=referer_url, timeout=timeout)
155
+ elapsed = round(time.time() - t0, 3)
156
+
157
+ att = Attempt(
158
+ phase=phase,
159
+ executor="curl_cffi",
160
+ url=url,
161
+ url_transform=transform_name,
162
+ impersonate=impersonate,
163
+ referer=referer_name,
164
+ elapsed_s=elapsed,
165
+ )
166
+
167
+ if err or resp is None:
168
+ att.error = err or "no response"
169
+ att.verdict = Verdict.UNKNOWN.value
170
+ return att, None
171
+
172
+ vr = validate(resp, success_selectors=success_selectors, known_bad_sizes=known_bad_sizes)
173
+ att.status = vr.status
174
+ att.body_size = vr.body_size
175
+ att.verdict = vr.verdict.value
176
+ att.reasons = vr.reasons
177
+ return att, resp
178
+
179
+
180
+ # --- Diversity planner -------------------------------------------------------
181
+ @dataclass(frozen=True)
182
+ class _Cand:
183
+ profile_id: str
184
+ transform: str
185
+ url: str
186
+ impersonate: str
187
+ referer: str
188
+ known_bad_sizes: Optional[tuple]
189
+
190
+
191
+ _FAMILIES = ("safari_ios", "safari", "chrome_android", "chrome", "edge", "firefox")
192
+
193
+
194
+ def _family(tls: str) -> str:
195
+ for fam in _FAMILIES:
196
+ if tls.startswith(fam):
197
+ return fam
198
+ return tls
199
+
200
+
201
+ def _is_mobile_tls(t: str) -> bool:
202
+ return ("ios" in t) or ("android" in t)
203
+
204
+
205
+ def _plan_for_profile(
206
+ url: str, profile_id: str, profile: dict, device_class: str
207
+ ) -> list[_Cand]:
208
+ groups: list[list[str]] = [list(g) for g in (profile.get("tls_impersonate_candidates") or [["safari", "chrome"]])]
209
+ avoid = set(profile.get("tls_impersonate_avoid") or [])
210
+ referer_order = list(profile.get("referer_strategies") or ["self_root"])
211
+ transform_order = list(profile.get("url_transform_order") or ["original"])
212
+ kb = profile.get("known_bad_sizes") or None
213
+ known_bad = tuple(kb) if kb else None
214
+
215
+ # device_class shaping (fixes desktop/mobile drift)
216
+ if device_class == "mobile":
217
+ groups = [[t for t in g if _is_mobile_tls(t)] for g in groups]
218
+ for extra in ("mobile_subdomain", "am_prefix"):
219
+ if extra not in transform_order:
220
+ transform_order.append(extra)
221
+ elif device_class == "desktop":
222
+ groups = [[t for t in g if not _is_mobile_tls(t)] for g in groups]
223
+ transform_order = [t for t in transform_order if t not in ("mobile_subdomain", "am_prefix")] or ["original"]
224
+
225
+ # deprioritize (not delete) avoid targets within each family group
226
+ def _reorder(g: list[str]) -> list[str]:
227
+ return [t for t in g if t not in avoid] + [t for t in g if t in avoid]
228
+
229
+ groups = [_reorder(g) for g in groups if g]
230
+ if not groups:
231
+ groups = [["safari", "chrome"]]
232
+
233
+ transforms = iter_transformed(url, transform_order) or [("original", url)]
234
+
235
+ # Diversity ordering: vary FAMILY fastest, then TRANSFORM, then version
236
+ # DEPTH, then REFERER. A small budget thus touches every family/transform
237
+ # before exhausting one family's old versions.
238
+ max_depth = max(len(g) for g in groups)
239
+ cands: list[_Cand] = []
240
+ seen: set[tuple] = set()
241
+ for ref in referer_order:
242
+ for depth in range(max_depth):
243
+ for (t_name, t_url) in transforms:
244
+ for g in groups:
245
+ if depth >= len(g):
246
+ continue
247
+ imp = g[depth]
248
+ key = (t_url, imp, ref)
249
+ if key in seen:
250
+ continue
251
+ seen.add(key)
252
+ cands.append(_Cand(profile_id, t_name, t_url, imp, ref, known_bad))
253
+ return cands
254
+
255
+
256
+ def _build_plan(
257
+ url: str,
258
+ hits: list,
259
+ profiles: dict,
260
+ device_class: str,
261
+ probe_impersonate: str,
262
+ probe_referer: str,
263
+ priority: Optional[dict] = None,
264
+ ) -> list[_Cand]:
265
+ """Materialize a diversity-ordered candidate plan across the top profiles.
266
+
267
+ Profiles are round-robin interleaved so a confident #1 profile cannot
268
+ starve #2/#3. The probe combo is removed (already executed).
269
+
270
+ `priority` (U5 self-learning): a previously-successful route
271
+ ``{"transform","impersonate","referer"}`` for this host — the matching
272
+ candidate is moved to the FRONT so a known-good route is retried first."""
273
+ per: list[list[_Cand]] = []
274
+ for hit in hits[:3]:
275
+ pid = getattr(hit, "profile_id", None) or "unknown_challenge"
276
+ prof = load_profile(pid, profiles=profiles)
277
+ per.append(_plan_for_profile(url, pid, prof, device_class))
278
+
279
+ probe_key = (url, probe_impersonate, probe_referer)
280
+ merged: list[_Cand] = []
281
+ seen: set[tuple] = set()
282
+ i = 0
283
+ while any(i < len(p) for p in per):
284
+ for p in per:
285
+ if i < len(p):
286
+ c = p[i]
287
+ key = (c.url, c.impersonate, c.referer)
288
+ if key == probe_key or key in seen:
289
+ continue
290
+ seen.add(key)
291
+ merged.append(c)
292
+ i += 1
293
+
294
+ if priority:
295
+ front = [c for c in merged if c.transform == priority.get("transform")
296
+ and c.impersonate == priority.get("impersonate")
297
+ and c.referer == priority.get("referer")]
298
+ if front:
299
+ rest = [c for c in merged if c not in front]
300
+ merged = front + rest
301
+ return merged
302
+
303
+
304
+ # --- Public entrypoint: self-learning wrapper (U5) ---------------------------
305
+ def _winning_route(result: FetchResult) -> Optional[dict]:
306
+ """Extract the curl route that produced the OK result, from the trace.
307
+
308
+ Only probe/grid curl wins are learnable: Phase 0 always runs first anyway,
309
+ and a browser win carries no reusable curl identity."""
310
+ for att in reversed(result.trace):
311
+ if (att.verdict in _OK_VALUES and att.phase in ("probe", "grid")
312
+ and att.executor == "curl_cffi" and att.impersonate):
313
+ return {
314
+ "transform": att.url_transform,
315
+ "impersonate": att.impersonate,
316
+ "referer": att.referer,
317
+ "phase": att.phase,
318
+ }
319
+ return None
320
+
321
+
322
+ def fetch(
323
+ url: str,
324
+ *,
325
+ success_selectors: Optional[list[str]] = None,
326
+ device_class: str = "auto",
327
+ user_hint: Optional[dict] = None,
328
+ timeout: int = 25,
329
+ max_attempts: Optional[int] = None,
330
+ max_browser_attempts: int = 2,
331
+ enable_playwright: bool = True,
332
+ enable_phase0: bool = True,
333
+ enable_learning: bool = True,
334
+ ) -> FetchResult:
335
+ """Public entrypoint — the generic grid wrapped with per-host self-learning.
336
+
337
+ 1. Before fetching, look up the route that last succeeded for this host and
338
+ promote it: it becomes the probe identity AND the front of the grid.
339
+ 2. After fetching, record the winning route; or, if a learned route was
340
+ promoted and the run hit a REAL block, strike it (evicted after two
341
+ consecutive strikes — see `learning.py`).
342
+
343
+ The store is a bounded, self-pruning JSON file; any error in it is swallowed
344
+ so learning can never break a fetch. Disable per-call with
345
+ ``enable_learning=False`` or globally with ``INSANE_LEARN=0``."""
346
+ priority: Optional[dict] = None
347
+ learned_existed = False
348
+ uh = dict(user_hint or {})
349
+ try:
350
+ from . import learning
351
+ if enable_learning and learning.enabled():
352
+ priority = learning.lookup(url, device_class)
353
+ if priority:
354
+ learned_existed = True
355
+ uh.setdefault("impersonate_first", priority.get("impersonate"))
356
+ uh.setdefault("referer_strategy", priority.get("referer"))
357
+ except Exception:
358
+ priority = None
359
+
360
+ result = _fetch_core(
361
+ url, success_selectors=success_selectors, device_class=device_class,
362
+ user_hint=uh, timeout=timeout, max_attempts=max_attempts,
363
+ max_browser_attempts=max_browser_attempts,
364
+ enable_playwright=enable_playwright, enable_phase0=enable_phase0,
365
+ priority=priority,
366
+ )
367
+
368
+ try:
369
+ from . import learning
370
+ if enable_learning and learning.enabled():
371
+ if result.ok:
372
+ win = _winning_route(result)
373
+ if win:
374
+ learning.record_success(url, device_class, win)
375
+ elif learned_existed:
376
+ learning.record_failure(
377
+ url, device_class,
378
+ penalize=learning.is_real_failure(result.stop_reason))
379
+ except Exception:
380
+ pass
381
+
382
+ return result
383
+
384
+
385
+ # --- Main entrypoint ---------------------------------------------------------
386
+ def _fetch_core(
387
+ url: str,
388
+ *,
389
+ success_selectors: Optional[list[str]] = None,
390
+ device_class: str = "auto", # "auto" | "desktop" | "mobile"
391
+ user_hint: Optional[dict] = None,
392
+ timeout: int = 25,
393
+ max_attempts: Optional[int] = None, # None = exhaustive (R6); int = budget
394
+ max_browser_attempts: int = 2,
395
+ enable_playwright: bool = True,
396
+ enable_phase0: bool = True,
397
+ priority: Optional[dict] = None, # U5: learned route to retry first
398
+ ) -> FetchResult:
399
+ """Fetch `url` using the generic diversity grid.
400
+
401
+ max_attempts
402
+ None (default) → run the whole plan (exhaustive, honours R6).
403
+ int → TOTAL curl-attempt budget (probe included). On budget exit the
404
+ result reports `stop_reason="budget"`, `grid_exhausted=False`, so
405
+ callers never mistake a truncated run for a true exhaustive failure.
406
+ """
407
+ user_hint = user_hint or {}
408
+ profiles = _load_profiles()
409
+ trace: list[Attempt] = []
410
+ last_resp = None
411
+ last_attempt: Optional[Attempt] = None
412
+ best_suspect: Optional[tuple] = None # (resp, attempt)
413
+ profile_used: Optional[str] = None
414
+
415
+ _jmin = int(os.environ.get("INSANE_JITTER_MS_MIN", "150"))
416
+ _jmax = int(os.environ.get("INSANE_JITTER_MS_MAX", "400"))
417
+
418
+ def _jitter():
419
+ time.sleep(random.uniform(_jmin / 1000.0, _jmax / 1000.0))
420
+
421
+ # Surface profile-loader failures as a diagnostic trace entry (not counted
422
+ # as a network attempt).
423
+ load_err = last_load_error()
424
+ if load_err:
425
+ trace.append(Attempt(
426
+ phase="probe", executor="profile_loader", url=url,
427
+ url_transform="original", impersonate=None, referer="",
428
+ verdict=Verdict.UNKNOWN.value, error=f"profiles_fallback: {load_err}",
429
+ ))
430
+
431
+ # -------- Phase 0: official public-API router (R5; site-aware, sanctioned) --
432
+ # For recognised platforms (Reddit/X/YouTube/...) try the official no-auth
433
+ # endpoint BEFORE the generic grid. This is the *enforced* version of the
434
+ # old agent-driven SKILL snippets — the agent can no longer skip it, which
435
+ # is what made Reddit/X look "blocked" (grid 403'd .json; nobody tried .rss).
436
+ if enable_phase0:
437
+ try:
438
+ from .phase0 import route as _phase0_route
439
+ p0 = _phase0_route(url, timeout=timeout)
440
+ except Exception as e: # router must never break the generic chain
441
+ p0 = None
442
+ trace.append(Attempt(
443
+ phase="phase0", executor="phase0", url=url, url_transform="original",
444
+ impersonate=None, referer="", verdict=Verdict.UNKNOWN.value,
445
+ error=f"{type(e).__name__}:{str(e)[:120]}",
446
+ ))
447
+ if p0 is not None:
448
+ for a in p0["attempts"]:
449
+ trace.append(Attempt(
450
+ phase="phase0", executor=a["route"], url=url, url_transform="-",
451
+ impersonate=None, referer="",
452
+ status=a.get("status", 0), body_size=a.get("bytes", 0),
453
+ verdict=(Verdict.STRONG_OK.value if a["ok"] else Verdict.BLOCKED.value),
454
+ reasons=[a["note"]] if a.get("note") else [],
455
+ ))
456
+ if p0["ok"]:
457
+ return FetchResult(
458
+ ok=True, content=p0["content"], final_url=p0["final_url"],
459
+ verdict=Verdict.STRONG_OK.value,
460
+ profile_used=f"phase0:{p0['platform']}", trace=trace,
461
+ summary=f"Phase 0 official route: {p0['platform']}:{p0['route']}",
462
+ stop_reason="success",
463
+ )
464
+ # Recognised platform but every official route failed → fall through
465
+ # to the generic grid (don't give up; R6).
466
+
467
+ # -------- Phase 1: probe -------------------------------------------------
468
+ base_impersonate = user_hint.get("impersonate_first") or (
469
+ "safari_ios" if device_class == "mobile" else "safari")
470
+ base_referer = user_hint.get("referer_strategy") or "self_root"
471
+
472
+ # Root warmup (deep URLs only): let a WAF sensor set a resolved cookie on
473
+ # the probe identity's session before the deep request — the classic
474
+ # first-hit rejection fix. Skipped when the target already IS the root.
475
+ try:
476
+ from .transport import POOL, pool_enabled, _host_of, _root_of
477
+ if pool_enabled():
478
+ _root = _root_of(url)
479
+ if _root != url:
480
+ POOL.warmup(_host_of(url), base_impersonate, _root, timeout=min(timeout, 15))
481
+ except Exception:
482
+ pass
483
+
484
+ curl_attempts = 0
485
+ probe_attempt, probe_resp = _run_attempt(
486
+ url, transform_name="original", impersonate=base_impersonate,
487
+ referer_name=base_referer, success_selectors=success_selectors,
488
+ known_bad_sizes=None, timeout=timeout, phase="probe",
489
+ )
490
+ trace.append(probe_attempt)
491
+ curl_attempts += 1
492
+ if probe_resp is not None:
493
+ last_resp, last_attempt = probe_resp, probe_attempt
494
+ if probe_attempt.verdict in _OK_VALUES:
495
+ return _build_result(probe_resp, probe_attempt, trace, profile_used=None,
496
+ planned=0, executed=curl_attempts,
497
+ grid_exhausted=False, stop_reason="success")
498
+ if probe_attempt.verdict == Verdict.SUSPECT_OK.value:
499
+ best_suspect = (probe_resp, probe_attempt)
500
+ elif probe_attempt.verdict in _TERMINAL_NONSUCCESS_VALUES:
501
+ return _give_up(trace, profile_used, last_resp, last_attempt, best_suspect,
502
+ planned=0, executed=curl_attempts, grid_exhausted=False,
503
+ stop_reason=probe_attempt.verdict)
504
+
505
+ # -------- Phase 2: detect + plan + execute ------------------------------
506
+ if last_resp is not None:
507
+ hits = detect(last_resp, profiles=profiles)
508
+ else:
509
+ hits = [type("H", (), {"profile_id": "unknown_challenge", "confidence": 0.1,
510
+ "signals": ["no_probe_response"]})()]
511
+ profile_used = hits[0].profile_id if hits else None
512
+
513
+ plan = _build_plan(url, hits, profiles, device_class, base_impersonate,
514
+ base_referer, priority=priority)
515
+ planned = len(plan)
516
+ grid_exhausted = False
517
+ stop_reason = ""
518
+
519
+ for cand in plan:
520
+ if max_attempts is not None and curl_attempts >= max_attempts:
521
+ stop_reason = "budget"
522
+ break
523
+ att, resp = _run_attempt(
524
+ cand.url, transform_name=cand.transform, impersonate=cand.impersonate,
525
+ referer_name=cand.referer, success_selectors=success_selectors,
526
+ known_bad_sizes=list(cand.known_bad_sizes) if cand.known_bad_sizes else None,
527
+ timeout=timeout, phase="grid",
528
+ )
529
+ trace.append(att)
530
+ curl_attempts += 1
531
+ if resp is not None:
532
+ last_resp, last_attempt = resp, att
533
+ if att.verdict in _OK_VALUES:
534
+ return _build_result(resp, att, trace, profile_used=cand.profile_id,
535
+ planned=planned, executed=curl_attempts,
536
+ grid_exhausted=False, stop_reason="success")
537
+ if att.verdict == Verdict.SUSPECT_OK.value and best_suspect is None:
538
+ best_suspect = (resp, att)
539
+ if att.verdict in _TERMINAL_NONSUCCESS_VALUES:
540
+ stop_reason = att.verdict
541
+ break
542
+ # continuing → polite jitter (only on non-terminal failure)
543
+ _jitter()
544
+ else:
545
+ grid_exhausted = True
546
+ stop_reason = "exhausted"
547
+
548
+ # If a terminal-nonsuccess (404/auth/429) stopped us, browser won't help.
549
+ skip_browser = stop_reason in _TERMINAL_NONSUCCESS_VALUES
550
+
551
+ # -------- Phase 3: Playwright fallback ----------------------------------
552
+ if enable_playwright and not skip_browser:
553
+ browser_used = 0
554
+ try:
555
+ from .executor import run_playwright_fallback
556
+ fb_profile = load_profile(profile_used or "unknown_challenge", profiles=profiles)
557
+ fb_order = fb_profile.get("fallback_when_challenge") or ["playwright_real_chrome"]
558
+ for fb_name in fb_order:
559
+ if fb_name == "curl_grid_exhaust":
560
+ continue
561
+ if browser_used >= max_browser_attempts:
562
+ break
563
+ pw_attempt, pw_content = run_playwright_fallback(
564
+ url, profile_id=profile_used or "unknown_challenge",
565
+ success_selectors=success_selectors, device_class=device_class,
566
+ force_executor=fb_name, timeout=timeout if timeout and timeout > 30 else 90,
567
+ )
568
+ trace.append(pw_attempt)
569
+ browser_used += 1
570
+ if pw_attempt.verdict in _OK_VALUES:
571
+ return FetchResult(
572
+ ok=True, content=pw_content, final_url=pw_attempt.url,
573
+ verdict=pw_attempt.verdict, profile_used=profile_used,
574
+ trace=trace, summary=f"Playwright fallback succeeded via {fb_name}",
575
+ planned_attempts=planned, executed_attempts=curl_attempts,
576
+ grid_exhausted=grid_exhausted, stop_reason="success",
577
+ )
578
+ if pw_attempt.verdict == Verdict.SUSPECT_OK.value and best_suspect is None:
579
+ best_suspect = (None, pw_attempt)
580
+ except ImportError:
581
+ trace.append(Attempt(
582
+ phase="fallback", executor="playwright", url=url,
583
+ url_transform="original", impersonate=None, referer="",
584
+ verdict=Verdict.UNKNOWN.value, error="executor module not available"))
585
+ except Exception as e:
586
+ trace.append(Attempt(
587
+ phase="fallback", executor="playwright", url=url,
588
+ url_transform="original", impersonate=None, referer="",
589
+ verdict=Verdict.UNKNOWN.value, error=f"{type(e).__name__}:{str(e)[:200]}"))
590
+
591
+ # -------- Give up, return best we have ----------------------------------
592
+ return _give_up(trace, profile_used, last_resp, last_attempt, best_suspect,
593
+ planned=planned, executed=curl_attempts,
594
+ grid_exhausted=grid_exhausted, stop_reason=stop_reason or "exhausted")
595
+
596
+
597
+ def _untried_routes(stop_reason, grid_exhausted) -> tuple[list[str], bool]:
598
+ """Failure gate (R6): name the escalation routes the engine itself could not
599
+ perform, so the caller never mistakes give-up for "everything was tried".
600
+
601
+ Returns (untried_routes, must_invoke_playwright_mcp).
602
+ """
603
+ routes: list[str] = []
604
+ # 429 is TRANSIENT, not a wall — exclude it from terminal so the gate still
605
+ # surfaces backoff/MCP instead of telling the agent to give up (the exact
606
+ # premature-failure this hardening exists to prevent).
607
+ rate_limited = stop_reason == Verdict.RATE_LIMITED.value
608
+ # Terminal non-success (404 / auth / paywall) → a real wall; nothing else helps.
609
+ terminal = stop_reason in _TERMINAL_NONSUCCESS_VALUES and not rate_limited
610
+ if terminal:
611
+ return routes, False
612
+
613
+ if rate_limited:
614
+ routes.append("rate-limited (429) — transient: back off a few seconds then retry; a different TLS family or Playwright MCP often clears it. Do NOT hammer the grid.")
615
+ # Budget cut → the curl grid itself was not finished (skip for 429: don't hammer).
616
+ elif stop_reason == "budget" or not grid_exhausted:
617
+ routes.append("generic-grid: NOT exhausted — re-run fetch() with max_attempts=None")
618
+
619
+ # A gated page that survived the curl grid → the real browser is the next
620
+ # escalation, and Playwright MCP must be driven from the AGENT session
621
+ # (the engine can only spawn local Node Chrome, which Cloudflare-class
622
+ # challenges often detect). So MCP is, by construction, an untried route here.
623
+ must_mcp = True
624
+ routes.append(
625
+ "playwright_mcp (run from the agent session): browser_navigate → "
626
+ "browser_network_requests → catch /api,/graphql,*.json internal endpoint → "
627
+ "re-fetch that API URL with `python3 -m engine`; or browser_snapshot for rendered HTML"
628
+ )
629
+ routes.append("user_hint retry: fetch(url, user_hint={'impersonate_first': 'safari_ios'|'chrome', 'referer_strategy': 'none'}) and/or device_class='mobile'")
630
+ return routes, must_mcp
631
+
632
+
633
+ def _give_up(trace, profile_used, last_resp, last_attempt, best_suspect,
634
+ *, planned, executed, grid_exhausted, stop_reason) -> FetchResult:
635
+ """Return the most honest failure result, preferring suspect content."""
636
+ untried, must_mcp = _untried_routes(stop_reason, grid_exhausted)
637
+ if best_suspect is not None:
638
+ s_resp, s_att = best_suspect
639
+ content = getattr(s_resp, "text", "") if s_resp is not None else ""
640
+ return FetchResult(
641
+ ok=False, content=content or "",
642
+ final_url=str(getattr(s_resp, "url", s_att.url)) if s_resp is not None else s_att.url,
643
+ verdict=s_att.verdict, profile_used=profile_used, trace=trace,
644
+ summary=_format_summary(trace, profile_used, stop_reason),
645
+ planned_attempts=planned, executed_attempts=executed,
646
+ grid_exhausted=grid_exhausted, stop_reason=stop_reason,
647
+ untried_routes=untried, must_invoke_playwright_mcp=must_mcp,
648
+ )
649
+ return FetchResult(
650
+ ok=False,
651
+ content=getattr(last_resp, "text", "") if last_resp is not None else "",
652
+ final_url=str(getattr(last_resp, "url", url_of(last_attempt))) if last_resp is not None else url_of(last_attempt),
653
+ verdict=last_attempt.verdict if last_attempt else Verdict.UNKNOWN.value,
654
+ profile_used=profile_used, trace=trace,
655
+ summary=_format_summary(trace, profile_used, stop_reason),
656
+ planned_attempts=planned, executed_attempts=executed,
657
+ grid_exhausted=grid_exhausted, stop_reason=stop_reason,
658
+ untried_routes=untried, must_invoke_playwright_mcp=must_mcp,
659
+ )
660
+
661
+
662
def url_of(attempt: Optional[Attempt]) -> str:
    """Return the attempt's URL, or an empty string when there is no attempt."""
    if not attempt:
        return ""
    return attempt.url
664
+
665
+
666
def fetch_many(urls: list[str], **kwargs) -> list[FetchResult]:
    """Fetch many URLs, reusing the per-host SessionPool across calls.

    The first URL of a host may pay for warmup / browser bootstrap; later URLs
    of the SAME host reuse the winning session's cookies + connection, which is
    where R7-style bulk collection gets its throughput. Ordering by host keeps
    the warm session hot.

    Args:
        urls: URLs to fetch.
        **kwargs: Forwarded unchanged to every ``fetch()`` call.

    Returns:
        Results in the same order as ``urls`` (fetching itself is grouped by
        host, in first-seen host order). Any slot for which ``fetch()``
        returned None is omitted.
    """
    # Function-scope import, executed once per call rather than once per URL.
    from .transport import _host_of

    # Map each host to the original positions of its URLs so the results can
    # be written back in input order.
    by_host: dict[str, list[int]] = {}
    for i, u in enumerate(urls):
        by_host.setdefault(_host_of(u), []).append(i)

    results: list[Optional[FetchResult]] = [None] * len(urls)
    for idxs in by_host.values():
        for i in idxs:
            results[i] = fetch(urls[i], **kwargs)
    return [r for r in results if r is not None]
682
+
683
+
684
def _build_result(resp, attempt: Attempt, trace: list[Attempt], profile_used: Optional[str],
                  *, planned: int, executed: int, grid_exhausted: bool, stop_reason: str) -> FetchResult:
    """Build the successful ``FetchResult`` for the winning attempt.

    The body comes from ``resp.text`` (empty string when missing or falsy),
    the final URL from ``resp.url`` with the attempt's URL as fallback, and
    the summary is a one-line description of the winning attempt.
    """
    body = getattr(resp, "text", "") or ""
    landed_url = str(getattr(resp, "url", attempt.url))
    winning_line = (
        f"{attempt.executor} {attempt.impersonate} + {attempt.url_transform} + referer:{attempt.referer} → {attempt.verdict}"
    )
    return FetchResult(
        ok=True,
        content=body,
        final_url=landed_url,
        verdict=attempt.verdict,
        profile_used=profile_used,
        trace=trace,
        summary=winning_line,
        planned_attempts=planned,
        executed_attempts=executed,
        grid_exhausted=grid_exhausted,
        stop_reason=stop_reason,
    )
697
+
698
+
699
# WAF profiles known to typically gate HTML but leave internal JSON APIs
# (relatively) open. For these, the R7 hint surfaces an API-first route.
_R7_ELIGIBLE_PROFILES = frozenset((
    "akamai_bot_manager",
    "cloudflare_turnstile",
    "datadome_probable",
    "perimeterx_human",
    "f5_big_ip",
    "aws_waf",
))

# Hint text emitted verbatim (Korean). English gist: "R7 API-first
# recommended: the WAF is blocking the HTML path. Use Playwright MCP ->
# browser_navigate -> browser_network_requests -> filter for /api/, /graphql,
# .json to find the internal endpoint -> re-invoke that URL with
# `python3 -m engine <API_URL>`. Most API layers have shallow WAF defenses,
# so curl_cffi alone can collect them."
R7_HINT = "".join((
    "💡 R7 API-first 권장: WAF가 HTML 경로를 차단 중. ",
    "Playwright MCP 사용 → browser_navigate → browser_network_requests ",
    "→ `/api/`·`/graphql`·`\\.json` 필터로 내부 엔드포인트 탐지 → ",
    "해당 URL을 `python3 -m engine <API_URL>`로 재호출. 대부분 API 레이어는 ",
    "WAF 방어가 얕아 curl_cffi만으로 수집됨.",
))
713
+
714
+
715
def _format_summary(trace: list[Attempt], profile: Optional[str], stop_reason: str = "") -> str:
    """Summarize a failed fetch in one line, plus the R7 hint when it applies.

    The line reports the attempt count, the profile, the stop reason and the
    first five verdicts (followed by "..." when there were more). The R7 hint
    is appended on a new line only when the profile is R7-eligible and at
    least three attempts ended in a challenge verdict.
    """
    total = len(trace)
    verdicts = [attempt.verdict for attempt in trace]
    challenges = sum(1 for v in verdicts if v == Verdict.CHALLENGE.value)

    shown = ",".join(verdicts[:5])
    more = "..." if total > 5 else ""
    base = f"failed after {total} attempts; profile={profile}; stop={stop_reason}; verdicts={shown}{more}"

    if profile not in _R7_ELIGIBLE_PROFILES:
        return base
    if challenges < 3:
        return base
    return base + "\n" + R7_HINT