@gajae-code/coding-agent 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/dist/types/cli/notify-cli.d.ts +2 -0
  3. package/dist/types/config/settings-schema.d.ts +39 -2
  4. package/dist/types/extensibility/shared-events.d.ts +1 -0
  5. package/dist/types/gjc-runtime/launch-tmux.d.ts +1 -0
  6. package/dist/types/gjc-runtime/ralplan-runtime.d.ts +1 -1
  7. package/dist/types/gjc-runtime/tmux-common.d.ts +3 -0
  8. package/dist/types/gjc-runtime/tmux-sessions.d.ts +2 -0
  9. package/dist/types/lsp/types.d.ts +2 -0
  10. package/dist/types/notifications/attachment-registry.d.ts +17 -0
  11. package/dist/types/notifications/chat-adapters.d.ts +9 -0
  12. package/dist/types/notifications/config.d.ts +9 -1
  13. package/dist/types/notifications/engine.d.ts +59 -0
  14. package/dist/types/notifications/managed-daemon.d.ts +48 -0
  15. package/dist/types/notifications/telegram-daemon.d.ts +19 -0
  16. package/dist/types/notifications/threaded-inbound.d.ts +19 -0
  17. package/dist/types/notifications/threaded-render.d.ts +6 -1
  18. package/dist/types/session/agent-session.d.ts +2 -0
  19. package/dist/types/tools/fetch.d.ts +23 -0
  20. package/dist/types/tools/index.d.ts +1 -0
  21. package/dist/types/tools/telegram-send.d.ts +32 -0
  22. package/dist/types/web/insane/bridge.d.ts +103 -0
  23. package/dist/types/web/insane/url-guard.d.ts +22 -0
  24. package/dist/types/web/search/provider.d.ts +18 -1
  25. package/dist/types/web/search/providers/insane.d.ts +53 -0
  26. package/dist/types/web/search/providers/text-citations.d.ts +23 -0
  27. package/dist/types/web/search/types.d.ts +12 -4
  28. package/package.json +10 -8
  29. package/scripts/verify-insane-vendor.ts +132 -0
  30. package/src/cli/args.ts +1 -1
  31. package/src/cli/fast-help.ts +1 -1
  32. package/src/cli/notify-cli.ts +152 -5
  33. package/src/cli.ts +1 -3
  34. package/src/commands/team.ts +1 -1
  35. package/src/config/settings-schema.ts +30 -1
  36. package/src/defaults/gjc/skills/ralplan/SKILL.md +11 -4
  37. package/src/edit/modes/replace.ts +1 -1
  38. package/src/extensibility/shared-events.ts +1 -0
  39. package/src/gjc-runtime/launch-tmux.ts +27 -5
  40. package/src/gjc-runtime/ledger-event-renderer.ts +1 -0
  41. package/src/gjc-runtime/ralplan-runtime.ts +2 -2
  42. package/src/gjc-runtime/tmux-common.ts +8 -0
  43. package/src/gjc-runtime/tmux-sessions.ts +8 -1
  44. package/src/gjc-runtime/workflow-manifest.generated.json +29 -0
  45. package/src/gjc-runtime/workflow-manifest.ts +7 -2
  46. package/src/hashline/hash.ts +1 -1
  47. package/src/internal-urls/docs-index.generated.ts +9 -8
  48. package/src/lsp/config.ts +16 -3
  49. package/src/lsp/defaults.json +7 -0
  50. package/src/lsp/types.ts +2 -0
  51. package/src/modes/controllers/event-controller.ts +15 -0
  52. package/src/modes/interactive-mode.ts +46 -2
  53. package/src/modes/utils/context-usage.ts +2 -2
  54. package/src/notifications/attachment-registry.ts +23 -0
  55. package/src/notifications/chat-adapters.ts +147 -0
  56. package/src/notifications/config.ts +23 -2
  57. package/src/notifications/engine.ts +100 -0
  58. package/src/notifications/index.ts +224 -45
  59. package/src/notifications/managed-daemon.ts +163 -0
  60. package/src/notifications/telegram-daemon.ts +235 -14
  61. package/src/notifications/threaded-inbound.ts +60 -4
  62. package/src/notifications/threaded-render.ts +20 -2
  63. package/src/session/agent-session.ts +82 -51
  64. package/src/tools/ask.ts +3 -2
  65. package/src/tools/fetch.ts +78 -1
  66. package/src/tools/index.ts +3 -0
  67. package/src/tools/telegram-send.ts +137 -0
  68. package/src/web/insane/bridge.ts +350 -0
  69. package/src/web/insane/url-guard.ts +155 -0
  70. package/src/web/search/provider.ts +77 -18
  71. package/src/web/search/providers/anthropic.ts +70 -3
  72. package/src/web/search/providers/codex.ts +1 -119
  73. package/src/web/search/providers/gemini.ts +99 -0
  74. package/src/web/search/providers/insane.ts +551 -0
  75. package/src/web/search/providers/openai-compatible.ts +66 -32
  76. package/src/web/search/providers/text-citations.ts +111 -0
  77. package/src/web/search/types.ts +13 -2
  78. package/vendor/insane-search/LICENSE +21 -0
  79. package/vendor/insane-search/MANIFEST.json +24 -0
  80. package/vendor/insane-search/engine/__init__.py +23 -0
  81. package/vendor/insane-search/engine/__main__.py +128 -0
  82. package/vendor/insane-search/engine/bias_check.py +183 -0
  83. package/vendor/insane-search/engine/executor.py +254 -0
  84. package/vendor/insane-search/engine/fetch_chain.py +725 -0
  85. package/vendor/insane-search/engine/learning.py +175 -0
  86. package/vendor/insane-search/engine/phase0.py +214 -0
  87. package/vendor/insane-search/engine/safety.py +91 -0
  88. package/vendor/insane-search/engine/templates/package.json +11 -0
  89. package/vendor/insane-search/engine/templates/playwright_mobile_chrome.js +188 -0
  90. package/vendor/insane-search/engine/templates/playwright_real_chrome.js +243 -0
  91. package/vendor/insane-search/engine/tests/test_hardening.py +57 -0
  92. package/vendor/insane-search/engine/tests/test_smoke.py +152 -0
  93. package/vendor/insane-search/engine/tests/test_u1.py +200 -0
  94. package/vendor/insane-search/engine/tests/test_u4.py +131 -0
  95. package/vendor/insane-search/engine/tests/test_u5.py +163 -0
  96. package/vendor/insane-search/engine/tests/test_u7.py +124 -0
  97. package/vendor/insane-search/engine/transport.py +211 -0
  98. package/vendor/insane-search/engine/url_transforms.py +98 -0
  99. package/vendor/insane-search/engine/validators.py +331 -0
  100. package/vendor/insane-search/engine/waf_detector.py +214 -0
  101. package/vendor/insane-search/engine/waf_profiles.yaml +162 -0
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Generic Playwright fetcher — real Chrome channel (not bundled Chromium).
4
+ *
5
+ * Usage (driven by engine/executor.py):
6
+ * echo '{"url":"...", "profileDir":"/tmp/.p", "waitSelector":"article"}' | node playwright_real_chrome.js
7
+ *
8
+ * Outputs page HTML to stdout on success; errors to stderr with non-zero exit.
9
+ *
10
+ * NO-SITE-NAME RULE: this file must never branch on specific hostnames.
11
+ * All site specifics come from the JSON input (url, waitSelector).
12
+ *
13
+ * Dependencies (install once on target machine):
14
+ * npm i -g playwright playwright-extra puppeteer-extra-plugin-stealth
15
+ * npx playwright install chrome # system Chrome binary
16
+ */
17
+
18
+ const dns = require('dns').promises;
19
+ const net = require('net');
20
+
21
+ // Drain stdout fully before exiting. `process.exit()` can truncate a large
22
+ // HTML payload because it does not wait for pending stdout I/O (Node docs).
23
/**
 * Write `payload` to stdout and settle only once the write callback fires.
 *
 * @param {string|Buffer} payload - data to emit on stdout.
 * @returns {Promise<void>} resolves when the write completes, rejects with the
 *   error passed to the write callback.
 */
function writeStdoutAsync(payload) {
  return new Promise((resolve, reject) => {
    const onWritten = (err) => {
      if (err) {
        reject(err);
        return;
      }
      resolve();
    };
    process.stdout.write(payload, onWritten);
  });
}
28
+
29
+ // Structured envelope so the Python side can (a) validate on real status /
30
+ // final URL and (b) bridge the browser-cleared cookies + UA into curl_cffi.
31
+ async function buildEnvelope(ctx, page, html, resp, automation) {
32
+ let cookies = [];
33
+ try { cookies = (await ctx.cookies()).map((c) => ({ name: c.name, value: c.value, domain: c.domain })); } catch (_e) {}
34
+ let userAgent = '';
35
+ try { userAgent = await page.evaluate(() => navigator.userAgent); } catch (_e) {}
36
+ let finalUrl = '';
37
+ try { finalUrl = page.url(); } catch (_e) {}
38
+ let status = 0;
39
+ try { status = resp ? resp.status() : 0; } catch (_e) {}
40
+ return JSON.stringify({ html, finalUrl, status, cookies, userAgent, automation });
41
+ }
42
+
43
+
44
+ class UnsafeUrlError extends Error {
45
+ constructor(reason) {
46
+ super(`unsafe_url:${reason}`);
47
+ this.name = 'UnsafeUrlError';
48
+ }
49
+ }
50
+
51
+ function isBlockedHostname(hostname) {
52
+ const h = (hostname || '').toLowerCase().replace(/\.$/, '');
53
+ return !h || h === 'localhost' || h.endsWith('.localhost') || h.endsWith('.local') || h.endsWith('.internal') || h.endsWith('.home.arpa');
54
+ }
55
+
56
+ function isPrivateIPv4(address) {
57
+ const parts = address.split('.').map((part) => Number.parseInt(part, 10));
58
+ if (parts.length !== 4 || parts.some((part) => !Number.isInteger(part) || part < 0 || part > 255)) return true;
59
+ const [a, b] = parts;
60
+ return a === 0 || a === 10 || a === 127 || (a === 100 && b >= 64 && b <= 127) ||
61
+ (a === 169 && b === 254) || (a === 172 && b >= 16 && b <= 31) ||
62
+ (a === 192 && (b === 0 || b === 168)) || (a === 198 && (b === 18 || b === 19 || b === 51)) ||
63
+ (a === 203 && b === 0) || a >= 224;
64
+ }
65
+
66
+ function normalizeIPv4MappedIPv6(address) {
67
+ const lower = address.toLowerCase();
68
+ return lower.startsWith('::ffff:') ? lower.slice(7) : lower;
69
+ }
70
+
71
/**
 * Decide whether an IPv6 address is blocked.
 *
 * An address of the form `::ffff:<dotted IPv4>` is judged by the IPv4 rules.
 * Otherwise the lower-cased text is matched against `::`, `::1` and a list of
 * textual prefixes (`fc`, `fd`, `fe8`..`feb`, `ff`, `2001:db8`, `::ffff:`).
 *
 * @param {string} address - textual IPv6 address.
 * @returns {boolean} true when the address must not be fetched.
 */
function isPrivateIPv6(address) {
  const lower = address.toLowerCase();

  // IPv4-mapped form with a dotted-quad tail: delegate to the IPv4 classifier.
  const unmapped = normalizeIPv4MappedIPv6(lower);
  if (unmapped !== lower && net.isIP(unmapped) === 4) {
    return isPrivateIPv4(unmapped);
  }

  if (lower === '::' || lower === '::1') return true;

  const blockedPrefixes = ['fc', 'fd', 'fe8', 'fe9', 'fea', 'feb', 'ff', '2001:db8', '::ffff:'];
  return blockedPrefixes.some((prefix) => lower.startsWith(prefix));
}
79
+
80
/**
 * Classify a textual IP address of either family as blocked or allowed.
 *
 * The address is first stripped of any `::ffff:` prefix and classified by the
 * family of the result. If that result is not a valid IP but the original
 * text is valid IPv6, the original is classified as IPv6. Anything that is not
 * a recognisable IP address is blocked.
 *
 * @param {string} address - textual IP address.
 * @returns {boolean} true when the address must not be fetched.
 */
function isPrivateOrSpecialAddress(address) {
  const stripped = normalizeIPv4MappedIPv6(address);
  switch (net.isIP(stripped)) {
    case 4:
      return isPrivateIPv4(stripped);
    case 6:
      return isPrivateIPv6(stripped);
    default:
      break;
  }
  // The stripped form was not an IP (e.g. a hex-form mapped address); fall back
  // to the untouched input.
  return net.isIP(address) === 6 ? isPrivateIPv6(address) : true;
}
88
+
89
/**
 * Reject any URL that is not a plain http(s) URL pointing at a public address.
 *
 * Checks, in order: parseable URL, http/https scheme, no embedded credentials,
 * hostname not on the name block list, and then either the literal IP or every
 * DNS record for the hostname must be a public address.
 *
 * `URL.hostname` keeps the square brackets around IPv6 literals, which
 * `net.isIP()` does not accept; the brackets are removed first so IPv6
 * literals are classified by the IP rules instead of being sent to DNS.
 *
 * NOTE(review): the DNS answer checked here is not pinned for the browser's
 * own resolution, so this is a pre-flight check only.
 *
 * @param {string} rawUrl - URL to validate.
 * @returns {Promise<void>} resolves when the URL is acceptable.
 * @throws {UnsafeUrlError} with a reason such as `invalid_url`, `scheme:...`,
 *   `credentials`, `internal_host`, `ip_blocked:...`, `resolve_failed`,
 *   `resolve_empty` or `resolves_internal:...`.
 */
async function assertPublicHttpUrl(rawUrl) {
  let parsed;
  try { parsed = new URL(rawUrl); } catch (_e) { throw new UnsafeUrlError('invalid_url'); }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') throw new UnsafeUrlError(`scheme:${parsed.protocol || 'none'}`);
  if (parsed.username || parsed.password) throw new UnsafeUrlError('credentials');

  // Strip the [ ] that WHATWG URL keeps around IPv6 literals.
  const rawHost = parsed.hostname;
  const host = rawHost.startsWith('[') && rawHost.endsWith(']') ? rawHost.slice(1, -1) : rawHost;

  if (isBlockedHostname(host)) throw new UnsafeUrlError('internal_host');
  if (net.isIP(host)) {
    if (isPrivateOrSpecialAddress(host)) throw new UnsafeUrlError(`ip_blocked:${host}`);
    return;
  }

  let records;
  try { records = await dns.lookup(host, { all: true, verbatim: true }); }
  catch (_e) { throw new UnsafeUrlError('resolve_failed'); }
  if (!records.length) throw new UnsafeUrlError('resolve_empty');
  // A single internal record is enough to refuse the whole hostname.
  const blocked = records.find((record) => isPrivateOrSpecialAddress(record.address));
  if (blocked) throw new UnsafeUrlError(`resolves_internal:${host}->${blocked.address}`);
}
106
+
107
/**
 * Re-validate the URL the page is currently on and return it.
 *
 * If `page.url()` throws, the empty string is validated instead, which the
 * URL guard rejects.
 *
 * @param {object} page - page whose current URL is checked.
 * @param {string} label - name of the navigation stage; currently unused.
 * @returns {Promise<string>} the page's current URL.
 * @throws {UnsafeUrlError} when the current URL fails the guard.
 */
async function assertPagePublic(page, label) {
  const readUrl = () => {
    try {
      return page.url();
    } catch (_e) {
      return '';
    }
  };
  const current = readUrl();
  await assertPublicHttpUrl(current);
  return current;
}
113
+
114
/**
 * Read all of stdin and parse it as JSON.
 *
 * The stream encoding is set to UTF-8 so that a multi-byte character split
 * across two chunks is decoded correctly rather than each chunk being
 * stringified on its own.
 *
 * @returns {Promise<any>} the parsed value; empty input parses as `{}`.
 *   Rejects on a stream error or invalid JSON.
 */
async function readStdinJson() {
  return await new Promise((resolve, reject) => {
    let data = '';
    process.stdin.setEncoding('utf8');
    process.stdin.on('data', (chunk) => { data += chunk; });
    process.stdin.on('end', () => {
      try { resolve(JSON.parse(data || '{}')); }
      catch (e) { reject(e); }
    });
    process.stdin.on('error', reject);
  });
}
125
+
126
/**
 * Entry point: read the JSON request from stdin, fetch the page in real
 * Chrome, and write a JSON envelope to stdout.
 *
 * Input fields: `url` (required), `profileDir`, `waitSelector`, `timeout` (ms),
 * `headless`, `viewport`.
 *
 * Exit codes: 0 on success, 2 when `url` is missing, 1 on any other failure
 * (the error is written to stderr as `Name: message`).
 */
async function main() {
  const args = await readStdinJson();
  const url = args.url;
  if (!url) { process.stderr.write('missing url\n'); process.exit(2); }
  // Validated before any browser is launched.
  await assertPublicHttpUrl(url);

  const profileDir = args.profileDir || '/tmp/.insane_pw_profile';
  const waitSelector = args.waitSelector || null;
  const timeoutMs = args.timeout || 60000;
  const headless = args.headless ?? false; // Akamai/etc detect headless
  const viewport = args.viewport || { width: 1366, height: 900 };

  let chromium;
  let automation = 'playwright';
  try {
    // Patchright is a DROP-IN Playwright fork (same API) that closes the CDP
    // Runtime.enable leak Cloudflare/DataDome now detect. Preferred when
    // installed; it does its own patching, so NO stealth plugin is added.
    // Additive only: if patchright is absent we fall back to exactly the
    // previous playwright-extra(+stealth) → playwright behaviour.
    ({ chromium } = require('patchright'));
    automation = 'patchright';
  } catch (_e0) {
    try {
      ({ chromium } = require('playwright-extra'));
      const stealth = require('puppeteer-extra-plugin-stealth')();
      chromium.use(stealth);
      automation = 'playwright-extra+stealth';
    } catch (_e) {
      // Fallback to plain playwright (no stealth). Still uses channel:chrome.
      ({ chromium } = require('playwright'));
      automation = 'playwright';
    }
  }

  let ctx;
  try {
    // Patchright official best practice: channel:'chrome', headless:false,
    // no_viewport (JS: viewport:null), persistent context, and NO custom
    // headers/UA/flags. We only override viewport for patchright; plain
    // playwright keeps the fixed viewport it has always used.
    const ctxOpts = { channel: 'chrome', headless };
    if (automation === 'patchright') {
      ctxOpts.viewport = null; // == no_viewport=True (use real window size)
    } else {
      ctxOpts.viewport = viewport;
    }
    ctx = await chromium.launchPersistentContext(profileDir, ctxOpts);
    const page = await ctx.newPage();
    // Single shared deadline across warmup + main + reload navigations so the
    // first nav can't eat the whole budget and starve the rest.
    const deadline = Date.now() + timeoutMs;
    // Remaining budget, capped by `cap`, never below 1 second.
    const rem = (cap) => Math.max(1000, Math.min(cap || timeoutMs, deadline - Date.now()));

    // Warmup hop: visit the site root first so Akamai-style bot managers
    // can run their JS sensor and set a resolved session cookie. Direct
    // landing on a search/deep URL is the classic first-hit rejection pattern.
    // Use domcontentloaded (not networkidle) — many SPAs keep analytics/xhr
    // open indefinitely and would hit the 90s timeout.
    try {
      const urlObj = new URL(url);
      const rootUrl = `${urlObj.protocol}//${urlObj.host}/`;
      if (rootUrl !== url) {
        await assertPublicHttpUrl(rootUrl);
        await page.goto(rootUrl, { waitUntil: 'domcontentloaded', timeout: rem(90000) });
        await assertPagePublic(page, 'warmup');
        await page.waitForTimeout(3500); // let sensor JS finish
      }
    } catch (_e) {
      // A safety violation is never best-effort: propagate it.
      if (_e && _e.name === 'UnsafeUrlError') throw _e;
      // warmup is best-effort; continue even if it hiccups
    }

    // Main page — DOM loaded then give the sensor a moment.
    let mainResp = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: rem(90000) });
    await assertPagePublic(page, 'main');
    await page.waitForTimeout(2500);

    if (waitSelector) {
      try {
        await page.waitForSelector(waitSelector, { timeout: rem(20000) });
      } catch (_e) {
        // Selector still missing — try one hard reload in case the first hit
        // landed on a challenge page and the sensor has just cleared.
        try {
          mainResp = await page.reload({ waitUntil: 'domcontentloaded', timeout: rem(90000) });
          await assertPagePublic(page, 'reload');
          await page.waitForTimeout(2000);
          try {
            await page.waitForSelector(waitSelector, { timeout: rem(10000) });
          } catch (_e2) {
            // Still no luck — caller validates HTML anyway.
          }
        } catch (_e3) {
          // reload failed — proceed with whatever we have
        }
      }
    } else {
      // Without a positive-proof selector, give the sensor a couple more seconds.
      await page.waitForTimeout(2000);
    }

    // Final guard: the page may have navigated since the last check.
    await assertPagePublic(page, 'content');
    const html = await page.content();
    const payload = await buildEnvelope(ctx, page, html, mainResp, automation);
    await writeStdoutAsync(payload); // flush fully before any exit
    process.exitCode = 0;
    return; // let finally close ctx, then exit naturally
  } catch (e) {
    process.stderr.write(`${e.name || 'Error'}: ${e.message || e}\n`);
    process.exitCode = 1;
    return;
  } finally {
    try { if (ctx) await ctx.close(); } catch (_e) {}
  }
}

// Failures before the browser is launched (bad stdin JSON, unsafe initial URL,
// no automation package installed) reject main() itself. Report them in the
// same `Name: message` format and exit non-zero instead of relying on the
// runtime's unhandled-rejection behaviour.
main().catch((err) => {
  const name = (err && err.name) || 'Error';
  const message = (err && err.message) || err;
  process.stderr.write(`${name}: ${message}\n`);
  process.exitCode = 1;
});
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ import unittest
8
+
9
+ HERE = os.path.dirname(os.path.abspath(__file__))
10
+ ROOT = os.path.abspath(os.path.join(HERE, "..", ".."))
11
+ sys.path.insert(0, ROOT)
12
+
13
+ from engine.fetch_chain import FetchResult # noqa: E402
14
+ from engine.transport import SessionPool # noqa: E402
15
+
16
+
17
class FetchResultJsonContractTest(unittest.TestCase):
    """Checks on the dictionary produced by ``FetchResult.to_dict``."""

    def test_to_dict_omits_content_by_default_but_reports_length(self):
        """Default serialisation leaves out the body but still reports its length."""
        body = "recovered"
        result = FetchResult(ok=True, content=body)
        payload = result.to_dict()
        self.assertNotIn("content", payload)
        self.assertEqual(payload["content_length"], len(body))
        self.assertFalse(payload["content_truncated"])

    def test_to_dict_includes_bounded_content_for_cli_json(self):
        """With ``include_content`` the body is cut to ``content_limit`` and flagged."""
        result = FetchResult(ok=True, content="abcdef")
        payload = result.to_dict(include_content=True, content_limit=3)
        self.assertEqual(payload["content"], "abc")
        self.assertEqual(payload["content_length"], 6)
        self.assertTrue(payload["content_truncated"])
29
+
30
+
31
class RedirectSafetyTest(unittest.TestCase):
    """SSRF guards: redirects and initial URLs pointing at private targets."""

    def test_transport_redirect_to_private_target_is_blocked(self):
        """A 302 whose Location is a loopback address must be refused."""

        class Resp:
            status_code = 302
            headers = {"Location": "http://127.0.0.1/private"}
            text = ""
            url = "https://public.example/redirect"

        resp, err = SessionPool._fetch_following(
            lambda _url: Resp(), "https://public.example/redirect", False, 10, None
        )
        self.assertIsNone(resp)
        self.assertTrue(err.startswith("ssrf_redirect_blocked:"), err)

    def test_playwright_templates_reject_private_initial_url_before_browser_launch(self):
        """The Node template must exit non-zero with ``unsafe_url`` on a loopback URL."""
        import shutil  # local import: only this test needs it

        # Without Node the subprocess call raises FileNotFoundError; that is an
        # environment gap, not a regression, so skip instead of erroring.
        if shutil.which("node") is None:
            self.skipTest("node executable not found on PATH")

        template = os.path.join(ROOT, "engine", "templates", "playwright_real_chrome.js")
        proc = subprocess.run(
            ["node", template],
            input='{"url":"http://127.0.0.1/private"}',
            capture_output=True,
            text=True,
            timeout=10,
        )
        self.assertNotEqual(proc.returncode, 0)
        self.assertIn("unsafe_url", proc.stderr)
54
+
55
+
56
+ if __name__ == "__main__":
57
+ unittest.main()
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python3
2
+ """Smoke / regression test for the generic fetch chain.
3
+
4
+ These tests hit real endpoints — mark as online / integration. They verify
5
+ behaviour patterns, not content. No assertions on specific site brands.
6
+
7
+ Run manually:
8
+ python3 engine/tests/test_smoke.py
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import os
14
+ import sys
15
+ import time
16
+
17
+ # Allow running from anywhere.
18
+ HERE = os.path.dirname(os.path.abspath(__file__))
19
+ ROOT = os.path.abspath(os.path.join(HERE, "..", ".."))
20
+ sys.path.insert(0, ROOT)
21
+
22
+ from engine import fetch # noqa: E402
23
+ from engine.validators import validate, Verdict # noqa: E402
24
+ from engine.waf_detector import detect, _load_profiles # noqa: E402
25
+ from engine.url_transforms import iter_transformed # noqa: E402
26
+
27
+
28
+ # --- unit-level -------------------------------------------------------------
29
def t_validator_tiny_body_is_challenge():
    """A 200 response with a very short HTML body is reported as a challenge."""
    empty_jar = type("C", (), {"jar": iter(())})()
    fake_response = type(
        "R",
        (),
        {
            "status_code": 200,
            "text": "<html>short</html>",
            "headers": {},
            "cookies": empty_jar,
        },
    )()
    result = validate(fake_response)
    assert result.verdict == Verdict.CHALLENGE, result.verdict
    assert any("tiny_body" in reason for reason in result.reasons)
    print(" ✓ tiny body → challenge")
39
+
40
+
41
def t_validator_marker_is_challenge():
    """A large body containing the ``sec-if-cpt-container`` marker is a challenge."""
    filler = "x" * 5000
    fake_response = type(
        "R",
        (),
        {
            "status_code": 200,
            "text": "<html>" + filler + " sec-if-cpt-container found </html>",
            "headers": {},
            "cookies": type("C", (), {"jar": iter(())})(),
        },
    )()
    result = validate(fake_response)
    assert result.verdict == Verdict.CHALLENGE, result.verdict
    print(" ✓ challenge marker → challenge")
50
+
51
+
52
def t_validator_weak_ok_without_selectors():
    """A clean, large body validated without selectors yields ``WEAK_OK``."""
    filler = "x" * 5000
    fake_response = type(
        "R",
        (),
        {
            "status_code": 200,
            "text": "<html>" + filler + "</html>",
            "headers": {},
            "cookies": type("C", (), {"jar": iter(())})(),
        },
    )()
    result = validate(fake_response)
    assert result.verdict == Verdict.WEAK_OK, result.verdict
    print(" ✓ clean body w/o selectors → weak_ok")
61
+
62
+
63
def t_validator_strong_ok_with_selectors():
    """A body matching the caller's success selector yields ``STRONG_OK``."""
    filler = "x" * 5000
    fake_response = type(
        "R",
        (),
        {
            "status_code": 200,
            "text": "<html><body>" + filler + "<article>hello</article></body></html>",
            "headers": {},
            "cookies": type("C", (), {"jar": iter(())})(),
        },
    )()
    result = validate(fake_response, success_selectors=["article"])
    assert result.verdict == Verdict.STRONG_OK, result.verdict
    assert "article" in result.matched_selectors
    print(" ✓ selectors matched → strong_ok")
73
+
74
+
75
def t_profiles_load():
    """The loaded WAF profiles include the three profile ids checked here."""
    loaded = _load_profiles()
    must_have = ("akamai_bot_manager", "cloudflare_turnstile", "unknown_challenge")
    for required in must_have:
        assert required in loaded, f"missing profile: {required}"
    print(f" ✓ profiles loaded ({len(loaded)} keys)")
80
+
81
+
82
def t_url_transforms():
    """URL transforms emit the original URL and the expected ``m.`` variants."""
    # www → m
    www_urls = [
        u for _, u in iter_transformed("https://www.example.com/a", ["original", "mobile_subdomain"])
    ]
    assert "https://www.example.com/a" in www_urls
    assert "https://m.example.com/a" in www_urls, www_urls

    # apex with am_prefix
    apex_urls = [
        u for _, u in iter_transformed("https://example.com/", ["original", "am_prefix"])
    ]
    assert "https://m.example.com/" in apex_urls, apex_urls

    # Plain literal: the message has no placeholders, so no f-string is needed.
    print(" ✓ url_transforms produce expected forms")
93
+
94
+
95
+ # --- online (network) -------------------------------------------------------
96
def t_online_benign_site():
    """A simple, usually-open site should pass probe directly when selectors provided."""
    # The target page is small enough to fall under the tiny-body threshold, but
    # supplying success_selectors lets the caller define what "content exists" means.
    fetch_options = {
        "success_selectors": ["h1", "p"],
        "timeout": 15,
        "max_attempts": 3,
        "enable_playwright": False,
    }
    result = fetch("https://example.com/", **fetch_options)
    assert result.ok, f"{result.summary} | trace: {[a.verdict for a in result.trace]}"
    assert result.verdict in ("strong_ok", "weak_ok"), result.verdict
    print(f" ✓ benign site → verdict={result.verdict} size={len(result.content)}")
110
+
111
+
112
def t_online_trace_shape():
    """Even on failure, trace should be populated and well-formed."""
    result = fetch(
        "https://httpbin.org/status/403",
        timeout=10,
        max_attempts=3,
        enable_playwright=False,
    )
    assert isinstance(result.trace, list) and len(result.trace) >= 1
    for attempt in result.trace:
        record = attempt.to_dict()
        assert "phase" in record and "executor" in record and "verdict" in record
    print(f" ✓ httpbin 403 → trace_len={len(result.trace)} final={result.verdict}")
120
+
121
+
122
# Registry of (label, callable) pairs in execution order; each label is the
# function name without its leading "t_".
ALL_TESTS = [
    (test_fn.__name__[len("t_"):], test_fn)
    for test_fn in (
        t_validator_tiny_body_is_challenge,
        t_validator_marker_is_challenge,
        t_validator_weak_ok_without_selectors,
        t_validator_strong_ok_with_selectors,
        t_profiles_load,
        t_url_transforms,
        t_online_benign_site,
        t_online_trace_shape,
    )
]
132
+
133
+
134
def main() -> int:
    """Run every entry in ``ALL_TESTS`` and print a summary.

    Returns:
        0 when no test failed, otherwise 1.
    """
    passed = 0
    failed = 0
    for label, test_fn in ALL_TESTS:
        try:
            print(f"[{label}]")
            test_fn()
        except AssertionError as exc:
            failed += 1
            print(f" ✗ FAIL: {exc}")
        except Exception as exc:
            failed += 1
            print(f" ✗ ERROR: {type(exc).__name__}: {exc}")
        else:
            passed += 1
    print(f"\n{passed} passed, {failed} failed")
    return 1 if failed else 0
149
+
150
+
151
+ if __name__ == "__main__":
152
+ sys.exit(main())
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env python3
2
+ """U1 regression tests — validator v2 + diversity scheduler.
3
+
4
+ Deterministic, network-free. Locks in the multi-AI-review fixes:
5
+ * grid diversity under a small cap (all TLS families + both transforms)
6
+ * avoid targets deprioritized, NOT deleted
7
+ * validator: small JSON ok, _abck-unresolved non-terminal, soft-marker
8
+ overridden by selector, status semantics.
9
+
10
+ Run: python3 engine/tests/test_u1.py
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ import sys
16
+
17
+ HERE = os.path.dirname(os.path.abspath(__file__))
18
+ ROOT = os.path.abspath(os.path.join(HERE, "..", ".."))
19
+ sys.path.insert(0, ROOT)
20
+
21
+ from engine.validators import validate, Verdict # noqa: E402
22
+ from engine.waf_detector import _load_profiles # noqa: E402
23
+ from engine.fetch_chain import _build_plan, _family # noqa: E402
24
+
25
+
26
+ class _Ck:
27
+ def __init__(self, name, value):
28
+ self.name, self.value = name, value
29
+
30
+
31
class _Jar:
    """Cookie-jar stand-in whose ``jar`` attribute is a list of ``_Ck`` objects."""

    def __init__(self, d):
        cookies = []
        for cookie_name, cookie_value in d.items():
            cookies.append(_Ck(cookie_name, cookie_value))
        self.jar = cookies
34
+
35
+
36
class _Resp:
    """Response stand-in with ``status_code``, ``text``, ``headers`` and ``cookies``."""

    def __init__(self, status=200, text="", headers=None, cookies=None):
        self.status_code = status
        self.text = text
        # Falsy headers/cookies (None or empty) are replaced by fresh empty dicts.
        self.headers = headers if headers else {}
        self.cookies = _Jar(cookies if cookies else {})
42
+
43
+
44
+ class _Hit:
45
+ def __init__(self, pid):
46
+ self.profile_id = pid
47
+ self.confidence = 0.9
48
+ self.signals = []
49
+
50
+
51
+ # ---------- scheduler ----------
52
def t_scheduler_diversity_under_cap():
    """The first 11 plan entries cover all five TLS families and both transforms."""
    plan = _build_plan(
        "https://www.example.com/p",
        [_Hit("akamai_bot_manager")],
        _load_profiles(),
        "auto",
        "safari",
        "self_root",
    )
    budget = 11  # max_attempts 12 - probe
    head = plan[:budget]
    fams = {_family(candidate.impersonate) for candidate in head}
    transforms = {candidate.transform for candidate in head}
    expected_families = {"safari", "safari_ios", "chrome", "chrome_android", "edge"}
    assert fams == expected_families, fams
    assert transforms == {"original", "mobile_subdomain"}, transforms
    print(f" ✓ first {budget} cover all families {sorted(fams)} + transforms {sorted(transforms)}")
63
+
64
+
65
def t_scheduler_avoid_deprioritized_not_deleted():
    """Avoided targets stay in the plan but do not appear in its first half."""
    plan = _build_plan(
        "https://www.example.com/p",
        [_Hit("akamai_bot_manager")],
        _load_profiles(),
        "auto",
        "safari",
        "self_root",
    )
    imps = [candidate.impersonate for candidate in plan]
    # chrome145/146 are in avoid; must still be present (exhaustive) but late.
    assert "chrome145" in imps and "chrome146" in imps, "avoid targets were deleted!"
    pos145 = imps.index("chrome145")  # first occurrence; presence asserted above
    first_half = imps[: len(imps) // 2]
    assert "chrome145" not in first_half, "avoid target not deprioritized"
    print(f" ✓ avoid targets retained but late (chrome145 idx={pos145}/{len(imps)})")
76
+
77
+
78
def t_scheduler_desktop_drops_mobile_transform():
    """A ``desktop`` plan has no mobile transform and no mobile TLS families."""
    plan = _build_plan(
        "https://www.example.com/p",
        [_Hit("akamai_bot_manager")],
        _load_profiles(),
        "desktop",
        "safari",
        "self_root",
    )
    transforms = {candidate.transform for candidate in plan}
    fams = {_family(candidate.impersonate) for candidate in plan}
    assert "mobile_subdomain" not in transforms, transforms
    assert not ("safari_ios" in fams or "chrome_android" in fams), fams
    print(f" ✓ desktop drops mobile transform & mobile TLS (transforms={sorted(transforms)})")
87
+
88
+
89
+ # ---------- validator v2 ----------
90
def t_validator_small_json_ok():
    """A small JSON body served as ``application/json`` validates as ``WEAK_OK``."""
    json_response = _Resp(
        200,
        '{"items":[{"id":1}],"total":1}',
        headers={"Content-Type": "application/json"},
    )
    result = validate(json_response)
    assert result.verdict == Verdict.WEAK_OK, result.verdict
    assert not (result.verdict == Verdict.CHALLENGE)
    print(f" ✓ small JSON → {result.verdict.value} (was challenge)")
96
+
97
+
98
def t_validator_abck_unresolved_is_non_terminal():
    """An ``_abck`` cookie value of ``AA~-1~bb`` yields ``SUSPECT_OK`` with ``ok`` False."""
    page_html = "<html>" + "x" * 5000 + "</html>"
    response = _Resp(200, page_html, cookies={"_abck": "AA~-1~bb"})
    result = validate(response)
    assert result.verdict == Verdict.SUSPECT_OK, result.verdict
    assert result.ok is False, "SUSPECT_OK must not count as terminal success"
    print(f" ✓ _abck unresolved → {result.verdict.value}, ok={result.ok} (was weak_ok/ok=True)")
104
+
105
+
106
def t_validator_soft_marker_overridden_by_selector():
    """The word 'captcha' in a script does not beat a matching success selector."""
    html = (
        "<html><script>var s='captcha';</script><body>"
        + "x" * 5000
        + "<main id='c'>real</main></body></html>"
    )
    result = validate(_Resp(200, html), success_selectors=["#c"])
    assert result.verdict == Verdict.STRONG_OK, result.verdict
    print(f" ✓ 'captcha' word + matching selector → {result.verdict.value} (was challenge)")
111
+
112
+
113
def t_validator_hard_marker_still_challenge():
    """A body containing ``sec-if-cpt-container`` is still classified as a challenge."""
    page_html = "<html>" + "x" * 5000 + " sec-if-cpt-container </html>"
    result = validate(_Resp(200, page_html))
    assert result.verdict == Verdict.CHALLENGE, result.verdict
    print(f" ✓ hard marker still → {result.verdict.value}")
117
+
118
+
119
def t_validator_status_semantics():
    """Status codes 429, 401, 404 and 503 each map to their own verdict."""
    expectations = (
        (429, "slow down", Verdict.RATE_LIMITED),
        (401, "nope", Verdict.AUTH_REQUIRED),
        (404, "gone", Verdict.NOT_FOUND),
        (503, "later", Verdict.BLOCKED),
    )
    for status, body, expected in expectations:
        assert validate(_Resp(status, body)).verdict == expected
    print(" ✓ status semantics 429/401/404/503 differentiated")
125
+
126
+
127
def t_validator_byte_size_not_char_count():
    """Body size is measured in bytes, so 1500 Korean characters are not 'tiny'."""
    # 1500 Korean chars = 1500 chars but 4500 bytes (>threshold) → not tiny.
    korean_body = "가" * 1500
    response = _Resp(200, korean_body, headers={"Content-Type": "text/html"})
    result = validate(response)
    # 4500 bytes ≥ 3000 → not tiny_body; no markers/selectors → weak_ok
    assert result.body_size >= 3000, result.body_size
    assert result.verdict == Verdict.WEAK_OK, (result.verdict, result.body_size)
    print(f" ✓ byte size counts UTF-8 bytes ({result.body_size}B from 1500 chars) → {result.verdict.value}")
135
+
136
+
137
def t_validator_small_complete_page_is_weak_ok():
    """A small but complete HTML document with real text validates as ``WEAK_OK``."""
    # Regression guard: a genuine small page must NOT be mislabelled a
    # challenge stub just because it is under the size threshold.
    fragments = (
        '<!doctype html><html lang="en"><head><title>Example Domain</title>',
        '</head><body><div><h1>Example Domain</h1><p>This domain is for use in ',
        'documentation examples without needing permission.</p>',
        '<p><a href="https://iana.org/domains/example">Learn more</a></p>',
        '</div></body></html>',
    )
    body = "".join(fragments)
    result = validate(_Resp(200, body, headers={"Content-Type": "text/html"}))
    assert result.body_size < 3000, result.body_size
    assert result.verdict == Verdict.WEAK_OK, (result.verdict, result.reasons)
    print(f" ✓ small complete page → {result.verdict.value} ({result.reasons})")
149
+
150
+
151
def t_validator_small_script_stub_still_challenge():
    """A tiny body holding only a script tag is still classified as a challenge."""
    # Script-only tiny body (no visible text) is still a suspicious stub.
    stub_html = '<html><head></head><body><script src="/cdn-cgi/challenge.js"></script></body></html>'
    response = _Resp(200, stub_html, headers={"Content-Type": "text/html"})
    result = validate(response)
    assert result.verdict == Verdict.CHALLENGE, (result.verdict, result.reasons)
    print(f" ✓ script-only tiny body → {result.verdict.value}")
157
+
158
+
159
def t_validator_small_fragment_still_challenge():
    """An unterminated HTML fragment is still classified as a challenge."""
    # Incomplete fragment (no closing </html>/</body>) stays suspicious.
    response = _Resp(200, "<div>loading", headers={"Content-Type": "text/html"})
    result = validate(response)
    assert result.verdict == Verdict.CHALLENGE, (result.verdict, result.reasons)
    print(f" ✓ incomplete fragment → {result.verdict.value}")
164
+
165
+
166
# Registry of (label, callable) pairs in execution order; each label is the
# function name without its leading "t_".
ALL = [
    (test_fn.__name__[len("t_"):], test_fn)
    for test_fn in (
        t_scheduler_diversity_under_cap,
        t_scheduler_avoid_deprioritized_not_deleted,
        t_scheduler_desktop_drops_mobile_transform,
        t_validator_small_json_ok,
        t_validator_abck_unresolved_is_non_terminal,
        t_validator_soft_marker_overridden_by_selector,
        t_validator_hard_marker_still_challenge,
        t_validator_status_semantics,
        t_validator_byte_size_not_char_count,
        t_validator_small_complete_page_is_weak_ok,
        t_validator_small_script_stub_still_challenge,
        t_validator_small_fragment_still_challenge,
    )
]
180
+
181
+
182
def main() -> int:
    """Run every entry in ``ALL`` and print a summary.

    Returns:
        0 when no test failed, otherwise 1.
    """
    passed = 0
    failed = 0
    for label, test_fn in ALL:
        try:
            print(f"[{label}]")
            test_fn()
        except AssertionError as exc:
            failed += 1
            print(f" ✗ FAIL: {exc}")
        except Exception as exc:
            failed += 1
            print(f" ✗ ERROR: {type(exc).__name__}: {exc}")
        else:
            passed += 1
    print(f"\n{passed} passed, {failed} failed")
    return 1 if failed else 0
197
+
198
+
199
+ if __name__ == "__main__":
200
+ sys.exit(main())