@gajae-code/coding-agent 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/dist/types/cli/notify-cli.d.ts +2 -0
- package/dist/types/config/settings-schema.d.ts +39 -2
- package/dist/types/extensibility/shared-events.d.ts +1 -0
- package/dist/types/gjc-runtime/launch-tmux.d.ts +1 -0
- package/dist/types/gjc-runtime/ralplan-runtime.d.ts +1 -1
- package/dist/types/gjc-runtime/tmux-common.d.ts +3 -0
- package/dist/types/gjc-runtime/tmux-sessions.d.ts +2 -0
- package/dist/types/lsp/types.d.ts +2 -0
- package/dist/types/notifications/attachment-registry.d.ts +17 -0
- package/dist/types/notifications/chat-adapters.d.ts +9 -0
- package/dist/types/notifications/config.d.ts +9 -1
- package/dist/types/notifications/engine.d.ts +59 -0
- package/dist/types/notifications/managed-daemon.d.ts +48 -0
- package/dist/types/notifications/telegram-daemon.d.ts +19 -0
- package/dist/types/notifications/threaded-inbound.d.ts +19 -0
- package/dist/types/notifications/threaded-render.d.ts +6 -1
- package/dist/types/session/agent-session.d.ts +2 -0
- package/dist/types/tools/fetch.d.ts +23 -0
- package/dist/types/tools/index.d.ts +1 -0
- package/dist/types/tools/telegram-send.d.ts +32 -0
- package/dist/types/web/insane/bridge.d.ts +103 -0
- package/dist/types/web/insane/url-guard.d.ts +22 -0
- package/dist/types/web/search/provider.d.ts +18 -1
- package/dist/types/web/search/providers/insane.d.ts +53 -0
- package/dist/types/web/search/providers/text-citations.d.ts +23 -0
- package/dist/types/web/search/types.d.ts +12 -4
- package/package.json +10 -8
- package/scripts/verify-insane-vendor.ts +132 -0
- package/src/cli/args.ts +1 -1
- package/src/cli/fast-help.ts +1 -1
- package/src/cli/notify-cli.ts +152 -5
- package/src/cli.ts +1 -3
- package/src/commands/team.ts +1 -1
- package/src/config/settings-schema.ts +30 -1
- package/src/defaults/gjc/skills/ralplan/SKILL.md +11 -4
- package/src/edit/modes/replace.ts +1 -1
- package/src/extensibility/shared-events.ts +1 -0
- package/src/gjc-runtime/launch-tmux.ts +27 -5
- package/src/gjc-runtime/ledger-event-renderer.ts +1 -0
- package/src/gjc-runtime/ralplan-runtime.ts +2 -2
- package/src/gjc-runtime/tmux-common.ts +8 -0
- package/src/gjc-runtime/tmux-sessions.ts +8 -1
- package/src/gjc-runtime/workflow-manifest.generated.json +29 -0
- package/src/gjc-runtime/workflow-manifest.ts +7 -2
- package/src/hashline/hash.ts +1 -1
- package/src/internal-urls/docs-index.generated.ts +9 -8
- package/src/lsp/config.ts +16 -3
- package/src/lsp/defaults.json +7 -0
- package/src/lsp/types.ts +2 -0
- package/src/modes/controllers/event-controller.ts +15 -0
- package/src/modes/interactive-mode.ts +46 -2
- package/src/modes/utils/context-usage.ts +2 -2
- package/src/notifications/attachment-registry.ts +23 -0
- package/src/notifications/chat-adapters.ts +147 -0
- package/src/notifications/config.ts +23 -2
- package/src/notifications/engine.ts +100 -0
- package/src/notifications/index.ts +224 -45
- package/src/notifications/managed-daemon.ts +163 -0
- package/src/notifications/telegram-daemon.ts +235 -14
- package/src/notifications/threaded-inbound.ts +60 -4
- package/src/notifications/threaded-render.ts +20 -2
- package/src/session/agent-session.ts +82 -51
- package/src/tools/ask.ts +3 -2
- package/src/tools/fetch.ts +78 -1
- package/src/tools/index.ts +3 -0
- package/src/tools/telegram-send.ts +137 -0
- package/src/web/insane/bridge.ts +350 -0
- package/src/web/insane/url-guard.ts +155 -0
- package/src/web/search/provider.ts +77 -18
- package/src/web/search/providers/anthropic.ts +70 -3
- package/src/web/search/providers/codex.ts +1 -119
- package/src/web/search/providers/gemini.ts +99 -0
- package/src/web/search/providers/insane.ts +551 -0
- package/src/web/search/providers/openai-compatible.ts +66 -32
- package/src/web/search/providers/text-citations.ts +111 -0
- package/src/web/search/types.ts +13 -2
- package/vendor/insane-search/LICENSE +21 -0
- package/vendor/insane-search/MANIFEST.json +24 -0
- package/vendor/insane-search/engine/__init__.py +23 -0
- package/vendor/insane-search/engine/__main__.py +128 -0
- package/vendor/insane-search/engine/bias_check.py +183 -0
- package/vendor/insane-search/engine/executor.py +254 -0
- package/vendor/insane-search/engine/fetch_chain.py +725 -0
- package/vendor/insane-search/engine/learning.py +175 -0
- package/vendor/insane-search/engine/phase0.py +214 -0
- package/vendor/insane-search/engine/safety.py +91 -0
- package/vendor/insane-search/engine/templates/package.json +11 -0
- package/vendor/insane-search/engine/templates/playwright_mobile_chrome.js +188 -0
- package/vendor/insane-search/engine/templates/playwright_real_chrome.js +243 -0
- package/vendor/insane-search/engine/tests/test_hardening.py +57 -0
- package/vendor/insane-search/engine/tests/test_smoke.py +152 -0
- package/vendor/insane-search/engine/tests/test_u1.py +200 -0
- package/vendor/insane-search/engine/tests/test_u4.py +131 -0
- package/vendor/insane-search/engine/tests/test_u5.py +163 -0
- package/vendor/insane-search/engine/tests/test_u7.py +124 -0
- package/vendor/insane-search/engine/transport.py +211 -0
- package/vendor/insane-search/engine/url_transforms.py +98 -0
- package/vendor/insane-search/engine/validators.py +331 -0
- package/vendor/insane-search/engine/waf_detector.py +214 -0
- package/vendor/insane-search/engine/waf_profiles.yaml +162 -0
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
"""Single entrypoint: insane-search generic fetch chain.
|
|
2
|
+
|
|
3
|
+
from insane_search.engine import fetch
|
|
4
|
+
result = fetch("https://example.com/path", success_selectors=["article"])
|
|
5
|
+
|
|
6
|
+
Public contract:
|
|
7
|
+
* One function: `fetch(url, ...) -> FetchResult`.
|
|
8
|
+
* Internal structure preserved as explicit phases so tests & debug logs
|
|
9
|
+
can target each stage: probe → validate → detect → plan → execute → report.
|
|
10
|
+
* `FetchResult.trace` exposes every attempt (transform × impersonate ×
|
|
11
|
+
referer × executor) — callers can diagnose without re-running.
|
|
12
|
+
|
|
13
|
+
v2 scheduler (multi-AI review 2026-06-21):
|
|
14
|
+
* `_build_plan` materializes the whole grid then orders it for DIVERSITY —
|
|
15
|
+
one representative per TLS family across both URL transforms first, so a
|
|
16
|
+
small attempt budget still touches every family/transform instead of
|
|
17
|
+
burning out on the Safari family.
|
|
18
|
+
* `tls_impersonate_avoid` entries are DEPRIORITIZED (moved last), never
|
|
19
|
+
deleted — they are still attempted in exhaustive mode.
|
|
20
|
+
* `max_attempts=None` (new default) means EXHAUSTIVE — run the full plan,
|
|
21
|
+
honouring R6. A numeric cap is a *budget*, and exhaustion vs budget vs
|
|
22
|
+
early-terminal is reported via `stop_reason` / `grid_exhausted`.
|
|
23
|
+
* Jitter sleeps only on a CONTINUING (failed) attempt, never before a
|
|
24
|
+
successful return.
|
|
25
|
+
* `SUSPECT_OK` (abck unresolved / soft block) is NON-terminal: kept as
|
|
26
|
+
best-effort, but the grid keeps searching for real proof.
|
|
27
|
+
|
|
28
|
+
No site-specific branching. Site knowledge enters only via:
|
|
29
|
+
* `success_selectors` (caller-supplied positive proof)
|
|
30
|
+
* `user_hint` (optional runtime hints; never persisted by this module)
|
|
31
|
+
"""
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import os
|
|
35
|
+
import random
|
|
36
|
+
import time
|
|
37
|
+
from dataclasses import dataclass, field, asdict
|
|
38
|
+
from typing import Any, Optional
|
|
39
|
+
|
|
40
|
+
from .validators import Verdict, validate, TERMINAL_NONSUCCESS
|
|
41
|
+
from .waf_detector import detect, load_profile, _load_profiles, last_load_error
|
|
42
|
+
from .url_transforms import iter_transformed
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Verdict values that count as a successful fetch.
_OK_VALUES = tuple(v.value for v in (Verdict.STRONG_OK, Verdict.WEAK_OK))
# Values of the verdicts listed in TERMINAL_NONSUCCESS, kept as plain values so
# they can be compared directly against `Attempt.verdict`.
_TERMINAL_NONSUCCESS_VALUES = frozenset({member.value for member in TERMINAL_NONSUCCESS})
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# --- Referer strategies (name → function of original URL) --------------------
|
|
50
|
+
def _self_root(url: str) -> str:
|
|
51
|
+
from urllib.parse import urlsplit
|
|
52
|
+
p = urlsplit(url)
|
|
53
|
+
return f"{p.scheme}://{p.netloc}/"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
REFERER_STRATEGIES = {
|
|
57
|
+
"self_root": _self_root,
|
|
58
|
+
"google_search": lambda _url: "https://www.google.com/",
|
|
59
|
+
"none": lambda _url: "",
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# --- Attempt & result schema -------------------------------------------------
|
|
64
|
+
@dataclass
class Attempt:
    """Record of one attempt made while fetching a URL (one trace entry)."""

    # probe | grid | fallback
    phase: str
    # curl_cffi | playwright_real_chrome | ...
    executor: str
    url: str
    # original | mobile_subdomain | ...
    url_transform: str
    # safari | chrome | ... | None (non-curl)
    impersonate: Optional[str]
    referer: str
    status: int = 0
    body_size: int = 0
    verdict: str = ""
    reasons: list[str] = field(default_factory=list)
    elapsed_s: float = 0.0
    error: Optional[str] = None

    def to_dict(self) -> dict:
        """Return this attempt as a plain dict (via ``dataclasses.asdict``)."""
        as_mapping = asdict(self)
        return as_mapping
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class FetchResult:
    """Outcome of a fetch: content, verdict, attempt trace and scheduler diagnostics."""

    ok: bool
    content: str = ""
    final_url: str = ""
    verdict: str = ""
    profile_used: Optional[str] = None
    trace: list[Attempt] = field(default_factory=list)
    summary: str = ""
    # v2 scheduler diagnostics
    planned_attempts: int = 0
    executed_attempts: int = 0
    grid_exhausted: bool = False
    # success | exhausted | budget | <terminal verdict> | error
    stop_reason: str = ""
    # Failure gate (R6): when ok=False these tell the caller it is NOT finished —
    # which escalation routes the engine could not perform itself remain to try.
    untried_routes: list[str] = field(default_factory=list)
    must_invoke_playwright_mcp: bool = False

    def to_dict(self, *, include_content: bool = False, content_limit: int = 4_000_000) -> dict:
        """Serialise the result to a plain dict.

        ``content_length`` always reports the full content length and
        ``content_truncated`` says whether ``content_limit`` would cut it.
        The (bounded) content itself is only included when
        ``include_content`` is true; a negative ``content_limit`` is treated
        as zero.
        """
        full_text = self.content or ""
        limit = content_limit if content_limit > 0 else 0
        clipped = full_text[:limit]

        payload = {
            "ok": self.ok,
            "final_url": self.final_url,
            "verdict": self.verdict,
            "profile_used": self.profile_used,
            "trace": [attempt.to_dict() for attempt in self.trace],
            "summary": self.summary,
            "content_length": len(full_text),
            "content_truncated": len(clipped) < len(full_text),
        }
        # Remaining diagnostics are copied through unchanged, in this order.
        for name in (
            "planned_attempts",
            "executed_attempts",
            "grid_exhausted",
            "stop_reason",
            "untried_routes",
            "must_invoke_playwright_mcp",
        ):
            payload[name] = getattr(self, name)

        if include_content:
            payload["content"] = clipped
        return payload
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# --- curl_cffi probe executor ------------------------------------------------
|
|
127
|
+
def _curl_probe(
    url: str, *, impersonate: str, referer: str, timeout: int = 20
) -> tuple[Any, Optional[str]]:
    """Issue one request for *url* through the shared transport pool.

    Returns the ``(response, error_str)`` pair produced by ``POOL.request``;
    the response may be None on exception.

    The request goes through the per-host SessionPool so that cookies (WAF
    sensors) and the warm connection persist across attempts and across pages
    of the same host. The pool degrades to a one-shot GET when a Session
    can't be created.
    """
    from .transport import POOL

    outcome = POOL.request(
        url,
        impersonate=impersonate,
        referer=referer,
        timeout=timeout,
    )
    return outcome
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _run_attempt(
    url: str,
    *,
    transform_name: str,
    impersonate: str,
    referer_name: str,
    success_selectors: Optional[list[str]],
    known_bad_sizes: Optional[list[int]],
    timeout: int,
    phase: str,
) -> tuple[Attempt, Any]:
    """Execute one curl_cffi attempt and produce an Attempt record.

    Args:
        url: URL to request (already transformed, if a transform applies).
        transform_name: Name of the URL transform, recorded on the attempt.
        impersonate: TLS impersonation target passed to the probe.
        referer_name: Key into ``REFERER_STRATEGIES``; unknown names fall
            back to the ``"none"`` strategy. The *name* (not the resolved
            URL) is what gets recorded on the attempt.
        success_selectors: Forwarded to ``validate``.
        known_bad_sizes: Forwarded to ``validate``.
        timeout: Timeout forwarded to the probe.
        phase: Phase label recorded on the attempt.

    Returns:
        ``(attempt, response)``. When the probe reports an error or yields no
        response, the attempt carries the error with an UNKNOWN verdict and
        the response is None; otherwise the attempt is filled in from the
        validation result and the raw response is returned alongside it.
    """
    referer_url = REFERER_STRATEGIES.get(referer_name, REFERER_STRATEGIES["none"])(url)
    # Measure the duration with a monotonic clock: wall-clock time.time() can
    # jump (NTP/DST/manual changes) and produce negative or inflated timings.
    t0 = time.perf_counter()
    resp, err = _curl_probe(url, impersonate=impersonate, referer=referer_url, timeout=timeout)
    elapsed = round(time.perf_counter() - t0, 3)

    att = Attempt(
        phase=phase,
        executor="curl_cffi",
        url=url,
        url_transform=transform_name,
        impersonate=impersonate,
        referer=referer_name,
        elapsed_s=elapsed,
    )

    # Transport-level failure: nothing to validate.
    if err or resp is None:
        att.error = err or "no response"
        att.verdict = Verdict.UNKNOWN.value
        return att, None

    vr = validate(resp, success_selectors=success_selectors, known_bad_sizes=known_bad_sizes)
    att.status = vr.status
    att.body_size = vr.body_size
    att.verdict = vr.verdict.value
    att.reasons = vr.reasons
    return att, resp
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# --- Diversity planner -------------------------------------------------------
|
|
181
|
+
@dataclass(frozen=True)
|
|
182
|
+
class _Cand:
|
|
183
|
+
profile_id: str
|
|
184
|
+
transform: str
|
|
185
|
+
url: str
|
|
186
|
+
impersonate: str
|
|
187
|
+
referer: str
|
|
188
|
+
known_bad_sizes: Optional[tuple]
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
_FAMILIES = ("safari_ios", "safari", "chrome_android", "chrome", "edge", "firefox")
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _family(tls: str) -> str:
|
|
195
|
+
for fam in _FAMILIES:
|
|
196
|
+
if tls.startswith(fam):
|
|
197
|
+
return fam
|
|
198
|
+
return tls
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _is_mobile_tls(t: str) -> bool:
|
|
202
|
+
return ("ios" in t) or ("android" in t)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _plan_for_profile(
    url: str, profile_id: str, profile: dict, device_class: str
) -> list[_Cand]:
    """Build the diversity-ordered candidate list for a single profile.

    Reads ``tls_impersonate_candidates``, ``tls_impersonate_avoid``,
    ``referer_strategies``, ``url_transform_order`` and ``known_bad_sizes``
    from *profile* (each with a default when missing or empty), shapes them
    by *device_class*, and emits one ``_Cand`` per unique
    ``(url, impersonate, referer)`` combination.
    """
    tls_groups: list[list[str]] = [
        list(group)
        for group in (profile.get("tls_impersonate_candidates") or [["safari", "chrome"]])
    ]
    avoided = set(profile.get("tls_impersonate_avoid") or [])
    referers = list(profile.get("referer_strategies") or ["self_root"])
    transform_names = list(profile.get("url_transform_order") or ["original"])
    raw_sizes = profile.get("known_bad_sizes")
    bad_sizes = tuple(raw_sizes) if raw_sizes else None

    # device_class shaping (fixes desktop/mobile drift)
    mobile_transforms = ("mobile_subdomain", "am_prefix")
    if device_class == "mobile":
        tls_groups = [[tls for tls in group if _is_mobile_tls(tls)] for group in tls_groups]
        transform_names += [name for name in mobile_transforms if name not in transform_names]
    elif device_class == "desktop":
        tls_groups = [[tls for tls in group if not _is_mobile_tls(tls)] for group in tls_groups]
        transform_names = [
            name for name in transform_names if name not in mobile_transforms
        ] or ["original"]

    # Deprioritize (not delete) avoid targets within each family group: a
    # stable sort on "is avoided" keeps relative order and moves them last.
    # Groups emptied by the device filter are dropped here.
    tls_groups = [
        sorted(group, key=lambda tls: tls in avoided) for group in tls_groups if group
    ]
    if not tls_groups:
        tls_groups = [["safari", "chrome"]]

    transforms = iter_transformed(url, transform_names) or [("original", url)]

    # Diversity ordering: vary FAMILY fastest, then TRANSFORM, then version
    # DEPTH, then REFERER. A small budget thus touches every family/transform
    # before exhausting one family's old versions.
    deepest = max(len(group) for group in tls_groups)
    planned: list[_Cand] = []
    emitted: set[tuple] = set()
    for referer in referers:
        for depth in range(deepest):
            for transform_name, transform_url in transforms:
                for group in tls_groups:
                    if depth < len(group):
                        tls = group[depth]
                        combo = (transform_url, tls, referer)
                        if combo not in emitted:
                            emitted.add(combo)
                            planned.append(
                                _Cand(profile_id, transform_name, transform_url,
                                      tls, referer, bad_sizes)
                            )
    return planned
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _build_plan(
    url: str,
    hits: list,
    profiles: dict,
    device_class: str,
    probe_impersonate: str,
    probe_referer: str,
    priority: Optional[dict] = None,
) -> list[_Cand]:
    """Merge the per-profile plans of the first three hits into one ordered plan.

    The per-profile plans are interleaved round-robin (candidate 0 of every
    profile, then candidate 1, ...) so that a confident first profile cannot
    starve the second and third. Candidates are de-duplicated on
    ``(url, impersonate, referer)`` and the combination matching the probe is
    dropped, since it has already been executed.

    `priority` (U5 self-learning) is a previously-successful route
    ``{"transform","impersonate","referer"}`` for this host; any matching
    candidate is moved to the FRONT so a known-good route is retried first.
    """
    per_profile: list[list[_Cand]] = []
    for hit in hits[:3]:
        pid = getattr(hit, "profile_id", None) or "unknown_challenge"
        profile = load_profile(pid, profiles=profiles)
        per_profile.append(_plan_for_profile(url, pid, profile, device_class))

    already_probed = (url, probe_impersonate, probe_referer)
    merged: list[_Cand] = []
    taken: set[tuple] = set()
    longest = max((len(plan) for plan in per_profile), default=0)
    for idx in range(longest):
        for plan in per_profile:
            if idx >= len(plan):
                continue
            cand = plan[idx]
            ident = (cand.url, cand.impersonate, cand.referer)
            if ident == already_probed or ident in taken:
                continue
            taken.add(ident)
            merged.append(cand)

    if priority:
        def _is_learned(cand: _Cand) -> bool:
            # True when the candidate is the learned route for this host.
            return (cand.transform, cand.impersonate, cand.referer) == (
                priority.get("transform"),
                priority.get("impersonate"),
                priority.get("referer"),
            )

        front = [cand for cand in merged if _is_learned(cand)]
        if front:
            merged = front + [cand for cand in merged if not _is_learned(cand)]
    return merged
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# --- Public entrypoint: self-learning wrapper (U5) ---------------------------
|
|
305
|
+
def _winning_route(result: FetchResult) -> Optional[dict]:
    """Return the curl route behind the OK result, or None if there is none.

    The trace is scanned from the most recent attempt backwards; the first
    attempt with an OK verdict, a ``probe``/``grid`` phase, the ``curl_cffi``
    executor and a non-empty impersonate target wins.

    Only probe/grid curl wins are learnable: Phase 0 always runs first anyway,
    and a browser win carries no reusable curl identity.
    """
    for attempt in reversed(result.trace):
        if attempt.verdict not in _OK_VALUES:
            continue
        if attempt.phase not in ("probe", "grid"):
            continue
        if attempt.executor != "curl_cffi" or not attempt.impersonate:
            continue
        return dict(
            transform=attempt.url_transform,
            impersonate=attempt.impersonate,
            referer=attempt.referer,
            phase=attempt.phase,
        )
    return None
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def fetch(
    url: str,
    *,
    success_selectors: Optional[list[str]] = None,
    device_class: str = "auto",
    user_hint: Optional[dict] = None,
    timeout: int = 25,
    max_attempts: Optional[int] = None,
    max_browser_attempts: int = 2,
    enable_playwright: bool = True,
    enable_phase0: bool = True,
    enable_learning: bool = True,
) -> FetchResult:
    """Public entrypoint: run ``_fetch_core`` wrapped with per-host self-learning.

    Before the fetch, the route that last succeeded for this host is looked
    up and promoted: its impersonate/referer become hint defaults (explicit
    entries in *user_hint* win) and it is passed on as ``priority``.

    After the fetch, a winning route is recorded; or, if a learned route had
    been promoted and the run failed, a failure is recorded, penalised only
    when the stop reason is a REAL block (evicted after two consecutive
    strikes — see `learning.py`).

    Any error raised by the learning store is swallowed, so learning can
    never break a fetch. Disable per-call with ``enable_learning=False`` or
    globally with ``INSANE_LEARN=0``.

    All other keyword arguments are forwarded unchanged to ``_fetch_core``;
    *user_hint* itself is never mutated (a copy is used).
    """
    priority: Optional[dict] = None
    had_learned_route = False
    hints = dict(user_hint or {})

    # --- pre-fetch: promote the learned route, if any -----------------------
    try:
        from . import learning
        if enable_learning and learning.enabled():
            priority = learning.lookup(url, device_class)
            if priority:
                had_learned_route = True
                for hint_key, route_key in (
                    ("impersonate_first", "impersonate"),
                    ("referer_strategy", "referer"),
                ):
                    hints.setdefault(hint_key, priority.get(route_key))
    except Exception:
        priority = None

    result = _fetch_core(
        url,
        success_selectors=success_selectors,
        device_class=device_class,
        user_hint=hints,
        timeout=timeout,
        max_attempts=max_attempts,
        max_browser_attempts=max_browser_attempts,
        enable_playwright=enable_playwright,
        enable_phase0=enable_phase0,
        priority=priority,
    )

    # --- post-fetch: record the outcome (best-effort) -----------------------
    try:
        from . import learning
        if enable_learning and learning.enabled():
            if not result.ok:
                if had_learned_route:
                    learning.record_failure(
                        url, device_class,
                        penalize=learning.is_real_failure(result.stop_reason))
            else:
                win = _winning_route(result)
                if win:
                    learning.record_success(url, device_class, win)
    except Exception:
        pass

    return result
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
# --- Main entrypoint ---------------------------------------------------------
|
|
386
|
+
def _fetch_core(
|
|
387
|
+
url: str,
|
|
388
|
+
*,
|
|
389
|
+
success_selectors: Optional[list[str]] = None,
|
|
390
|
+
device_class: str = "auto", # "auto" | "desktop" | "mobile"
|
|
391
|
+
user_hint: Optional[dict] = None,
|
|
392
|
+
timeout: int = 25,
|
|
393
|
+
max_attempts: Optional[int] = None, # None = exhaustive (R6); int = budget
|
|
394
|
+
max_browser_attempts: int = 2,
|
|
395
|
+
enable_playwright: bool = True,
|
|
396
|
+
enable_phase0: bool = True,
|
|
397
|
+
priority: Optional[dict] = None, # U5: learned route to retry first
|
|
398
|
+
) -> FetchResult:
|
|
399
|
+
"""Fetch `url` using the generic diversity grid.
|
|
400
|
+
|
|
401
|
+
max_attempts
|
|
402
|
+
None (default) → run the whole plan (exhaustive, honours R6).
|
|
403
|
+
int → TOTAL curl-attempt budget (probe included). On budget exit the
|
|
404
|
+
result reports `stop_reason="budget"`, `grid_exhausted=False`, so
|
|
405
|
+
callers never mistake a truncated run for a true exhaustive failure.
|
|
406
|
+
"""
|
|
407
|
+
user_hint = user_hint or {}
|
|
408
|
+
profiles = _load_profiles()
|
|
409
|
+
trace: list[Attempt] = []
|
|
410
|
+
last_resp = None
|
|
411
|
+
last_attempt: Optional[Attempt] = None
|
|
412
|
+
best_suspect: Optional[tuple] = None # (resp, attempt)
|
|
413
|
+
profile_used: Optional[str] = None
|
|
414
|
+
|
|
415
|
+
_jmin = int(os.environ.get("INSANE_JITTER_MS_MIN", "150"))
|
|
416
|
+
_jmax = int(os.environ.get("INSANE_JITTER_MS_MAX", "400"))
|
|
417
|
+
|
|
418
|
+
def _jitter():
|
|
419
|
+
time.sleep(random.uniform(_jmin / 1000.0, _jmax / 1000.0))
|
|
420
|
+
|
|
421
|
+
# Surface profile-loader failures as a diagnostic trace entry (not counted
|
|
422
|
+
# as a network attempt).
|
|
423
|
+
load_err = last_load_error()
|
|
424
|
+
if load_err:
|
|
425
|
+
trace.append(Attempt(
|
|
426
|
+
phase="probe", executor="profile_loader", url=url,
|
|
427
|
+
url_transform="original", impersonate=None, referer="",
|
|
428
|
+
verdict=Verdict.UNKNOWN.value, error=f"profiles_fallback: {load_err}",
|
|
429
|
+
))
|
|
430
|
+
|
|
431
|
+
# -------- Phase 0: official public-API router (R5; site-aware, sanctioned) --
|
|
432
|
+
# For recognised platforms (Reddit/X/YouTube/...) try the official no-auth
|
|
433
|
+
# endpoint BEFORE the generic grid. This is the *enforced* version of the
|
|
434
|
+
# old agent-driven SKILL snippets — the agent can no longer skip it, which
|
|
435
|
+
# is what made Reddit/X look "blocked" (grid 403'd .json; nobody tried .rss).
|
|
436
|
+
if enable_phase0:
|
|
437
|
+
try:
|
|
438
|
+
from .phase0 import route as _phase0_route
|
|
439
|
+
p0 = _phase0_route(url, timeout=timeout)
|
|
440
|
+
except Exception as e: # router must never break the generic chain
|
|
441
|
+
p0 = None
|
|
442
|
+
trace.append(Attempt(
|
|
443
|
+
phase="phase0", executor="phase0", url=url, url_transform="original",
|
|
444
|
+
impersonate=None, referer="", verdict=Verdict.UNKNOWN.value,
|
|
445
|
+
error=f"{type(e).__name__}:{str(e)[:120]}",
|
|
446
|
+
))
|
|
447
|
+
if p0 is not None:
|
|
448
|
+
for a in p0["attempts"]:
|
|
449
|
+
trace.append(Attempt(
|
|
450
|
+
phase="phase0", executor=a["route"], url=url, url_transform="-",
|
|
451
|
+
impersonate=None, referer="",
|
|
452
|
+
status=a.get("status", 0), body_size=a.get("bytes", 0),
|
|
453
|
+
verdict=(Verdict.STRONG_OK.value if a["ok"] else Verdict.BLOCKED.value),
|
|
454
|
+
reasons=[a["note"]] if a.get("note") else [],
|
|
455
|
+
))
|
|
456
|
+
if p0["ok"]:
|
|
457
|
+
return FetchResult(
|
|
458
|
+
ok=True, content=p0["content"], final_url=p0["final_url"],
|
|
459
|
+
verdict=Verdict.STRONG_OK.value,
|
|
460
|
+
profile_used=f"phase0:{p0['platform']}", trace=trace,
|
|
461
|
+
summary=f"Phase 0 official route: {p0['platform']}:{p0['route']}",
|
|
462
|
+
stop_reason="success",
|
|
463
|
+
)
|
|
464
|
+
# Recognised platform but every official route failed → fall through
|
|
465
|
+
# to the generic grid (don't give up; R6).
|
|
466
|
+
|
|
467
|
+
# -------- Phase 1: probe -------------------------------------------------
|
|
468
|
+
base_impersonate = user_hint.get("impersonate_first") or (
|
|
469
|
+
"safari_ios" if device_class == "mobile" else "safari")
|
|
470
|
+
base_referer = user_hint.get("referer_strategy") or "self_root"
|
|
471
|
+
|
|
472
|
+
# Root warmup (deep URLs only): let a WAF sensor set a resolved cookie on
|
|
473
|
+
# the probe identity's session before the deep request — the classic
|
|
474
|
+
# first-hit rejection fix. Skipped when the target already IS the root.
|
|
475
|
+
try:
|
|
476
|
+
from .transport import POOL, pool_enabled, _host_of, _root_of
|
|
477
|
+
if pool_enabled():
|
|
478
|
+
_root = _root_of(url)
|
|
479
|
+
if _root != url:
|
|
480
|
+
POOL.warmup(_host_of(url), base_impersonate, _root, timeout=min(timeout, 15))
|
|
481
|
+
except Exception:
|
|
482
|
+
pass
|
|
483
|
+
|
|
484
|
+
curl_attempts = 0
|
|
485
|
+
probe_attempt, probe_resp = _run_attempt(
|
|
486
|
+
url, transform_name="original", impersonate=base_impersonate,
|
|
487
|
+
referer_name=base_referer, success_selectors=success_selectors,
|
|
488
|
+
known_bad_sizes=None, timeout=timeout, phase="probe",
|
|
489
|
+
)
|
|
490
|
+
trace.append(probe_attempt)
|
|
491
|
+
curl_attempts += 1
|
|
492
|
+
if probe_resp is not None:
|
|
493
|
+
last_resp, last_attempt = probe_resp, probe_attempt
|
|
494
|
+
if probe_attempt.verdict in _OK_VALUES:
|
|
495
|
+
return _build_result(probe_resp, probe_attempt, trace, profile_used=None,
|
|
496
|
+
planned=0, executed=curl_attempts,
|
|
497
|
+
grid_exhausted=False, stop_reason="success")
|
|
498
|
+
if probe_attempt.verdict == Verdict.SUSPECT_OK.value:
|
|
499
|
+
best_suspect = (probe_resp, probe_attempt)
|
|
500
|
+
elif probe_attempt.verdict in _TERMINAL_NONSUCCESS_VALUES:
|
|
501
|
+
return _give_up(trace, profile_used, last_resp, last_attempt, best_suspect,
|
|
502
|
+
planned=0, executed=curl_attempts, grid_exhausted=False,
|
|
503
|
+
stop_reason=probe_attempt.verdict)
|
|
504
|
+
|
|
505
|
+
# -------- Phase 2: detect + plan + execute ------------------------------
|
|
506
|
+
if last_resp is not None:
|
|
507
|
+
hits = detect(last_resp, profiles=profiles)
|
|
508
|
+
else:
|
|
509
|
+
hits = [type("H", (), {"profile_id": "unknown_challenge", "confidence": 0.1,
|
|
510
|
+
"signals": ["no_probe_response"]})()]
|
|
511
|
+
profile_used = hits[0].profile_id if hits else None
|
|
512
|
+
|
|
513
|
+
plan = _build_plan(url, hits, profiles, device_class, base_impersonate,
|
|
514
|
+
base_referer, priority=priority)
|
|
515
|
+
planned = len(plan)
|
|
516
|
+
grid_exhausted = False
|
|
517
|
+
stop_reason = ""
|
|
518
|
+
|
|
519
|
+
for cand in plan:
|
|
520
|
+
if max_attempts is not None and curl_attempts >= max_attempts:
|
|
521
|
+
stop_reason = "budget"
|
|
522
|
+
break
|
|
523
|
+
att, resp = _run_attempt(
|
|
524
|
+
cand.url, transform_name=cand.transform, impersonate=cand.impersonate,
|
|
525
|
+
referer_name=cand.referer, success_selectors=success_selectors,
|
|
526
|
+
known_bad_sizes=list(cand.known_bad_sizes) if cand.known_bad_sizes else None,
|
|
527
|
+
timeout=timeout, phase="grid",
|
|
528
|
+
)
|
|
529
|
+
trace.append(att)
|
|
530
|
+
curl_attempts += 1
|
|
531
|
+
if resp is not None:
|
|
532
|
+
last_resp, last_attempt = resp, att
|
|
533
|
+
if att.verdict in _OK_VALUES:
|
|
534
|
+
return _build_result(resp, att, trace, profile_used=cand.profile_id,
|
|
535
|
+
planned=planned, executed=curl_attempts,
|
|
536
|
+
grid_exhausted=False, stop_reason="success")
|
|
537
|
+
if att.verdict == Verdict.SUSPECT_OK.value and best_suspect is None:
|
|
538
|
+
best_suspect = (resp, att)
|
|
539
|
+
if att.verdict in _TERMINAL_NONSUCCESS_VALUES:
|
|
540
|
+
stop_reason = att.verdict
|
|
541
|
+
break
|
|
542
|
+
# continuing → polite jitter (only on non-terminal failure)
|
|
543
|
+
_jitter()
|
|
544
|
+
else:
|
|
545
|
+
grid_exhausted = True
|
|
546
|
+
stop_reason = "exhausted"
|
|
547
|
+
|
|
548
|
+
# If a terminal-nonsuccess (404/auth/429) stopped us, browser won't help.
|
|
549
|
+
skip_browser = stop_reason in _TERMINAL_NONSUCCESS_VALUES
|
|
550
|
+
|
|
551
|
+
# -------- Phase 3: Playwright fallback ----------------------------------
|
|
552
|
+
if enable_playwright and not skip_browser:
|
|
553
|
+
browser_used = 0
|
|
554
|
+
try:
|
|
555
|
+
from .executor import run_playwright_fallback
|
|
556
|
+
fb_profile = load_profile(profile_used or "unknown_challenge", profiles=profiles)
|
|
557
|
+
fb_order = fb_profile.get("fallback_when_challenge") or ["playwright_real_chrome"]
|
|
558
|
+
for fb_name in fb_order:
|
|
559
|
+
if fb_name == "curl_grid_exhaust":
|
|
560
|
+
continue
|
|
561
|
+
if browser_used >= max_browser_attempts:
|
|
562
|
+
break
|
|
563
|
+
pw_attempt, pw_content = run_playwright_fallback(
|
|
564
|
+
url, profile_id=profile_used or "unknown_challenge",
|
|
565
|
+
success_selectors=success_selectors, device_class=device_class,
|
|
566
|
+
force_executor=fb_name, timeout=timeout if timeout and timeout > 30 else 90,
|
|
567
|
+
)
|
|
568
|
+
trace.append(pw_attempt)
|
|
569
|
+
browser_used += 1
|
|
570
|
+
if pw_attempt.verdict in _OK_VALUES:
|
|
571
|
+
return FetchResult(
|
|
572
|
+
ok=True, content=pw_content, final_url=pw_attempt.url,
|
|
573
|
+
verdict=pw_attempt.verdict, profile_used=profile_used,
|
|
574
|
+
trace=trace, summary=f"Playwright fallback succeeded via {fb_name}",
|
|
575
|
+
planned_attempts=planned, executed_attempts=curl_attempts,
|
|
576
|
+
grid_exhausted=grid_exhausted, stop_reason="success",
|
|
577
|
+
)
|
|
578
|
+
if pw_attempt.verdict == Verdict.SUSPECT_OK.value and best_suspect is None:
|
|
579
|
+
best_suspect = (None, pw_attempt)
|
|
580
|
+
except ImportError:
|
|
581
|
+
trace.append(Attempt(
|
|
582
|
+
phase="fallback", executor="playwright", url=url,
|
|
583
|
+
url_transform="original", impersonate=None, referer="",
|
|
584
|
+
verdict=Verdict.UNKNOWN.value, error="executor module not available"))
|
|
585
|
+
except Exception as e:
|
|
586
|
+
trace.append(Attempt(
|
|
587
|
+
phase="fallback", executor="playwright", url=url,
|
|
588
|
+
url_transform="original", impersonate=None, referer="",
|
|
589
|
+
verdict=Verdict.UNKNOWN.value, error=f"{type(e).__name__}:{str(e)[:200]}"))
|
|
590
|
+
|
|
591
|
+
# -------- Give up, return best we have ----------------------------------
|
|
592
|
+
return _give_up(trace, profile_used, last_resp, last_attempt, best_suspect,
|
|
593
|
+
planned=planned, executed=curl_attempts,
|
|
594
|
+
grid_exhausted=grid_exhausted, stop_reason=stop_reason or "exhausted")
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
def _untried_routes(stop_reason, grid_exhausted) -> tuple[list[str], bool]:
    """Failure gate (R6): list the escalation routes the engine did not perform.

    The list exists so that a give-up result is never read by the caller as
    "everything was tried".

    Args:
        stop_reason: Why the curl grid stopped; compared against verdict
            values and the literal ``"budget"``.
        grid_exhausted: Whether the planned curl grid ran to completion.

    Returns:
        ``(untried_routes, must_invoke_playwright_mcp)``. For a terminal
        non-success other than a rate limit this is ``([], False)``;
        otherwise the flag is always ``True``.
    """
    # A 429 is transient rather than a wall, so it is carved out of the
    # terminal set: the gate must still surface backoff/MCP instead of telling
    # the agent to give up.
    hit_rate_limit = stop_reason == Verdict.RATE_LIMITED.value

    # Remaining terminal verdicts (404 / auth / paywall) are real walls.
    if stop_reason in _TERMINAL_NONSUCCESS_VALUES and not hit_rate_limit:
        return [], False

    pending: list[str] = []
    if hit_rate_limit:
        pending.append("rate-limited (429) — transient: back off a few seconds then retry; a different TLS family or Playwright MCP often clears it. Do NOT hammer the grid.")
    elif stop_reason == "budget" or not grid_exhausted:
        # Budget cut: the curl grid itself was not finished. Not offered after
        # a 429, where re-running the grid would only hammer the host.
        pending.append("generic-grid: NOT exhausted — re-run fetch() with max_attempts=None")

    # The real browser is the next escalation, and Playwright MCP has to be
    # driven from the agent session, so it is always an untried route here.
    pending += [
        "playwright_mcp (run from the agent session): browser_navigate → "
        "browser_network_requests → catch /api,/graphql,*.json internal endpoint → "
        "re-fetch that API URL with `python3 -m engine`; or browser_snapshot for rendered HTML",
        "user_hint retry: fetch(url, user_hint={'impersonate_first': 'safari_ios'|'chrome', 'referer_strategy': 'none'}) and/or device_class='mobile'",
    ]
    return pending, True
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def _give_up(trace, profile_used, last_resp, last_attempt, best_suspect,
             *, planned, executed, grid_exhausted, stop_reason) -> FetchResult:
    """Assemble the ``ok=False`` result returned once every phase has failed.

    A recorded ``best_suspect`` ``(response, attempt)`` pair takes precedence
    over the last response/attempt seen. Either response may be ``None``; the
    content is then empty and the URL is taken from the attempt record. The
    escalation routes computed by ``_untried_routes`` are attached either way.
    """
    untried, must_mcp = _untried_routes(stop_reason, grid_exhausted)

    if best_suspect is not None:
        source_resp, suspect_att = best_suspect
        attempt_url = suspect_att.url
        verdict = suspect_att.verdict
        body = ""
        if source_resp is not None:
            # Only this path normalises a falsy ``text`` attribute to "".
            body = getattr(source_resp, "text", "") or ""
    else:
        source_resp = last_resp
        attempt_url = url_of(last_attempt)
        verdict = last_attempt.verdict if last_attempt else Verdict.UNKNOWN.value
        body = "" if source_resp is None else getattr(source_resp, "text", "")

    # Prefer the URL reported by the response; fall back to the attempt's URL.
    if source_resp is None:
        landed_url = attempt_url
    else:
        landed_url = str(getattr(source_resp, "url", attempt_url))

    return FetchResult(
        ok=False,
        content=body,
        final_url=landed_url,
        verdict=verdict,
        profile_used=profile_used,
        trace=trace,
        summary=_format_summary(trace, profile_used, stop_reason),
        planned_attempts=planned,
        executed_attempts=executed,
        grid_exhausted=grid_exhausted,
        stop_reason=stop_reason,
        untried_routes=untried,
        must_invoke_playwright_mcp=must_mcp,
    )
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def url_of(attempt: Optional[Attempt]) -> str:
    """Return ``attempt.url``, or an empty string when ``attempt`` is falsy."""
    if not attempt:
        return ""
    return attempt.url
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def fetch_many(urls: list[str], **kwargs) -> list[FetchResult]:
    """Fetch many URLs, reusing the per-host SessionPool across calls.

    The first URL of a host may pay for warmup / browser bootstrap; later URLs
    of the SAME host reuse the winning session's cookies + connection, which is
    where R7-style bulk collection gets its throughput. URLs are therefore
    fetched grouped by host (hosts in first-seen order) to keep the warm
    session hot.

    Args:
        urls: URLs to fetch.
        **kwargs: Forwarded unchanged to every ``fetch`` call.

    Returns:
        The results placed back at the index of their URL in ``urls``, so the
        output follows input order rather than fetch order.
    """
    # Function-scope import, resolved once up front instead of being
    # re-executed on every loop iteration.
    from .transport import _host_of

    # host -> indexes into ``urls``; dict insertion order keeps hosts in
    # first-seen order.
    by_host: dict[str, list[int]] = {}
    for i, u in enumerate(urls):
        by_host.setdefault(_host_of(u), []).append(i)

    results: list[Optional[FetchResult]] = [None] * len(urls)
    for idxs in by_host.values():
        for i in idxs:
            results[i] = fetch(urls[i], **kwargs)

    # Every slot is assigned above; the filter narrows Optional away and would
    # drop an entry only if fetch() itself returned None.
    return [r for r in results if r is not None]
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
def _build_result(resp, attempt: Attempt, trace: list[Attempt], profile_used: Optional[str],
                  *, planned: int, executed: int, grid_exhausted: bool, stop_reason: str) -> FetchResult:
    """Wrap a winning attempt and its response into an ``ok=True`` FetchResult.

    The content is the response's ``text`` (empty string when missing or
    falsy), the final URL is the response's ``url`` with the attempt's URL as
    fallback, and the summary is a one-line recipe of the attempt's settings.
    """
    body = getattr(resp, "text", "") or ""
    landed_url = str(getattr(resp, "url", attempt.url))
    recipe = f"{attempt.executor} {attempt.impersonate} + {attempt.url_transform} + referer:{attempt.referer} → {attempt.verdict}"
    return FetchResult(
        ok=True,
        content=body,
        final_url=landed_url,
        verdict=attempt.verdict,
        profile_used=profile_used,
        trace=trace,
        summary=recipe,
        planned_attempts=planned,
        executed_attempts=executed,
        grid_exhausted=grid_exhausted,
        stop_reason=stop_reason,
    )
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
# WAF profiles known to typically gate HTML but leave internal JSON APIs
# (relatively) open. R7 hint surfaces an API-first route.
# _format_summary tests the profile id against this set before appending
# R7_HINT to a failure summary.
_R7_ELIGIBLE_PROFILES = frozenset({
    "akamai_bot_manager", "cloudflare_turnstile", "datadome_probable",
    "perimeterx_human", "f5_big_ip", "aws_waf",
})

# Hint text (Korean) appended to failure summaries; it is emitted at runtime,
# so the literal must stay as-is. English rendering:
#   "R7 API-first recommended: the WAF is blocking the HTML path. Use
#   Playwright MCP -> browser_navigate -> browser_network_requests -> filter
#   for `/api/`, `/graphql`, `\.json` to find the internal endpoint -> call
#   that URL again with `python3 -m engine <API_URL>`. The API layer usually
#   has shallow WAF protection and can be collected with curl_cffi alone."
R7_HINT = (
    "💡 R7 API-first 권장: WAF가 HTML 경로를 차단 중. "
    "Playwright MCP 사용 → browser_navigate → browser_network_requests "
    "→ `/api/`·`/graphql`·`\\.json` 필터로 내부 엔드포인트 탐지 → "
    "해당 URL을 `python3 -m engine <API_URL>`로 재호출. 대부분 API 레이어는 "
    "WAF 방어가 얕아 curl_cffi만으로 수집됨."
)
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def _format_summary(trace: list[Attempt], profile: Optional[str], stop_reason: str = "") -> str:
    """Render the one-line failure summary for a trace of attempts.

    The line reports the attempt count, the profile, the stop reason and the
    first five verdicts (followed by ``...`` when the trace is longer). When
    the profile is in ``_R7_ELIGIBLE_PROFILES`` and at least three attempts
    ended in a challenge verdict, ``R7_HINT`` is appended on a new line.
    """
    seen = [entry.verdict for entry in trace]
    total = len(trace)
    gated = seen.count(Verdict.CHALLENGE.value)

    shown = ",".join(seen[:5])
    more = "..." if total > 5 else ""
    headline = f"failed after {total} attempts; profile={profile}; stop={stop_reason}; verdicts={shown}{more}"

    if profile in _R7_ELIGIBLE_PROFILES and gated >= 3:
        return headline + "\n" + R7_HINT
    return headline
|