sanook-cli 0.5.2 → 0.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/CHANGELOG.md +112 -2
  2. package/README.md +15 -3
  3. package/README.th.md +8 -1
  4. package/dist/approval.js +7 -0
  5. package/dist/bin.js +637 -56
  6. package/dist/brain-consolidate.js +335 -0
  7. package/dist/brain-context.js +42 -3
  8. package/dist/brain-final.js +15 -9
  9. package/dist/brain-link.js +73 -0
  10. package/dist/brain-metrics.js +277 -0
  11. package/dist/brain-new.js +402 -0
  12. package/dist/brain-pack.js +210 -0
  13. package/dist/brain-repair.js +280 -0
  14. package/dist/brain.js +3 -0
  15. package/dist/brand.js +4 -0
  16. package/dist/cli-args.js +47 -9
  17. package/dist/cli-option-values.js +1 -1
  18. package/dist/clipboard.js +65 -0
  19. package/dist/commands.js +98 -15
  20. package/dist/config.js +66 -34
  21. package/dist/context-pack.js +145 -0
  22. package/dist/cost.js +20 -0
  23. package/dist/dashboard/api-helpers.js +87 -0
  24. package/dist/dashboard/server.js +179 -0
  25. package/dist/dashboard/static/app.js +277 -0
  26. package/dist/dashboard/static/index.html +39 -0
  27. package/dist/dashboard/static/styles.css +85 -0
  28. package/dist/diff.js +10 -2
  29. package/dist/gateway/auth.js +14 -3
  30. package/dist/gateway/deliver.js +45 -3
  31. package/dist/gateway/doctor.js +456 -0
  32. package/dist/gateway/email.js +30 -1
  33. package/dist/gateway/ledger.js +20 -1
  34. package/dist/gateway/session.js +34 -11
  35. package/dist/hotkeys.js +21 -0
  36. package/dist/i18n/en.js +98 -0
  37. package/dist/i18n/index.js +19 -0
  38. package/dist/i18n/th.js +98 -0
  39. package/dist/i18n/types.js +1 -0
  40. package/dist/insights-args.js +24 -4
  41. package/dist/knowledge.js +55 -29
  42. package/dist/loop.js +65 -9
  43. package/dist/mcp-hub.js +33 -0
  44. package/dist/mcp-registry.js +153 -9
  45. package/dist/mcp-risk.js +71 -0
  46. package/dist/mcp.js +77 -5
  47. package/dist/memory-log.js +90 -0
  48. package/dist/memory-store.js +37 -1
  49. package/dist/memory.js +51 -7
  50. package/dist/model-picker.js +58 -0
  51. package/dist/orchestrate.js +7 -5
  52. package/dist/plan-handoff.js +17 -0
  53. package/dist/polyglot.js +162 -0
  54. package/dist/process-runner.js +96 -0
  55. package/dist/project-init.js +91 -0
  56. package/dist/project-registry.js +143 -0
  57. package/dist/project-scaffold.js +124 -0
  58. package/dist/prompt-size.js +155 -0
  59. package/dist/providers/codex-login.js +138 -0
  60. package/dist/providers/codex.js +20 -8
  61. package/dist/providers/keys.js +21 -0
  62. package/dist/providers/models.js +1 -1
  63. package/dist/providers/registry.js +11 -1
  64. package/dist/search/cli.js +9 -1
  65. package/dist/search/embedding-config.js +22 -0
  66. package/dist/search/engine.js +2 -13
  67. package/dist/search/indexer.js +10 -10
  68. package/dist/session-brain.js +103 -0
  69. package/dist/session-distill.js +84 -0
  70. package/dist/session.js +1 -11
  71. package/dist/skill-install.js +24 -1
  72. package/dist/skills.js +33 -0
  73. package/dist/slash-completion.js +155 -0
  74. package/dist/support-dump.js +31 -0
  75. package/dist/tool-catalog.js +59 -0
  76. package/dist/tools/index.js +5 -0
  77. package/dist/tools/permission.js +82 -16
  78. package/dist/tools/polyglot.js +126 -0
  79. package/dist/tools/sandbox.js +38 -13
  80. package/dist/tools/search.js +9 -2
  81. package/dist/tools/task.js +22 -2
  82. package/dist/tools/timeout.js +7 -5
  83. package/dist/tools/web-fetch-tool.js +33 -0
  84. package/dist/turn-retrieval.js +83 -0
  85. package/dist/ui/app.js +874 -35
  86. package/dist/ui/banner.js +78 -4
  87. package/dist/ui/markdown.js +122 -0
  88. package/dist/ui/overlay.js +496 -0
  89. package/dist/ui/queue.js +23 -0
  90. package/dist/ui/render.js +30 -2
  91. package/dist/ui/session-panel.js +115 -0
  92. package/dist/ui/setup-providers.js +40 -0
  93. package/dist/ui/setup.js +163 -50
  94. package/dist/ui/status.js +142 -0
  95. package/dist/ui/thinking-panel.js +36 -0
  96. package/dist/ui/tool-trail.js +97 -0
  97. package/dist/ui/transcript.js +26 -0
  98. package/dist/ui/useBusyElapsed.js +19 -0
  99. package/dist/ui/useEditor.js +144 -5
  100. package/dist/ui/useGitBranch.js +57 -0
  101. package/dist/update.js +32 -6
  102. package/dist/usage-cli.js +160 -0
  103. package/dist/usage-ledger.js +169 -0
  104. package/dist/web-fetch.js +637 -0
  105. package/dist/web-surface.js +190 -0
  106. package/package.json +4 -3
  107. package/scripts/postinstall.mjs +4 -4
  108. package/second-brain/Projects/_Index.md +17 -4
  109. package/second-brain/Projects/sanook-cli/_Index.md +7 -3
  110. package/second-brain/Projects/sanook-cli/context.md +35 -0
  111. package/second-brain/Projects/sanook-cli/current-state.md +32 -0
  112. package/second-brain/Projects/sanook-cli/overview.md +41 -0
  113. package/second-brain/Projects/sanook-cli/repo.md +34 -0
  114. package/second-brain/Projects/sanook-cli/second-brain-feature-roadmap.md +52 -11
  115. package/second-brain/Research/2026-06-18-hermes-tui-parity-map.md +129 -0
  116. package/second-brain/Research/2026-06-19-hermes-python-architecture-for-sanook.md +49 -0
  117. package/second-brain/Research/2026-06-19-terminal-ui-brand-research.md +52 -0
  118. package/second-brain/Research/_Index.md +2 -0
  119. package/second-brain/Shared/Operating-State/current-state.md +14 -23
  120. package/second-brain/Shared/Tech-Standards/_Index.md +2 -0
  121. package/second-brain/Shared/Tech-Standards/polyglot-runtime-strategy.md +46 -0
  122. package/second-brain/Shared/Tech-Standards/web-search-grounding-policy.md +70 -0
  123. package/second-brain/Templates/project-workspace/_Index.md +31 -0
  124. package/second-brain/Templates/project-workspace/context.md +28 -0
  125. package/second-brain/Templates/project-workspace/current-state.md +29 -0
  126. package/second-brain/Templates/project-workspace/overview.md +39 -0
  127. package/second-brain/Templates/project-workspace/repo.md +33 -0
@@ -0,0 +1,637 @@
1
+ import { BRAND } from './brand.js';
2
// User-Agent header value sent with requests made through this module. It names the
// CLI (via BRAND.cliName) and states its purpose instead of imitating a human browser.
export const SANOOK_USER_AGENT = [`${BRAND.cliName}-cli`, '(+web-fetch; respects robots.txt)'].join(' ');
4
// Evasion techniques this module never attempts. The list is attached to every
// fetch result (`refused`) and printed by the renderer for transparency.
// (See second-brain/Shared/Tech-Standards/web-search-grounding-policy.md)
export const REFUSED_TECHNIQUES = [
    'CAPTCHA solving',
    'login / authentication bypass',
    'paywall / metered-content circumvention',
    'WAF / bot-challenge (Cloudflare etc.) defeat',
    'browser-fingerprint / TLS spoofing',
    'residential-proxy rotation to evade blocks',
];
// Documentation of the fallback ladder, cheapest and most polite tier first.
// Each entry is `{ tier, name, solves }`; the tier number is the row's position.
export const WEB_FETCH_LADDER = [
    ['preflight', 'SSRF guard + robots.txt + honest UA + timeout/size caps (cross-cutting, always on)'],
    ['direct', 'normal public HTML — extract title/meta/headings/links/JSON-LD to understand the site'],
    ['reader', 'messy or JS-shell HTML — clean markdown via a reader service (Jina r.jina.ai)'],
    ['tavily', 'origin blocks a plain fetch — provider-side extract/search (needs TAVILY_API_KEY)'],
    ['archive', 'origin is down or removed — public archived snapshot (Wayback Machine)'],
].map(([name, solves], tier) => ({ tier, name, solves }));
22
// Baseline option values; resolveOptions() layers caller overrides on top of these.
const DEFAULTS = {
    // Wrapper looks up the global `fetch` at call time rather than capturing it here.
    fetchImpl: ((url, init) => fetch(url, init)),
    userAgent: SANOOK_USER_AGENT,
    // Per-request budget in milliseconds and response size cap.
    timeoutMs: 15_000,
    maxBytes: 600_000,
    // Politeness / safety switches.
    respectRobots: true,
    allowPrivateHosts: false,
    // Optional fallback tiers and their endpoints.
    allowReader: true,
    readerBase: 'https://r.jina.ai/',
    allowArchive: true,
    tavilyBase: 'https://api.tavily.com',
    // Caps applied while extracting page structure.
    maxLinks: 25,
    maxHeadings: 20,
};
36
/**
 * Merge caller options over DEFAULTS.
 *
 * Keys whose value is `undefined` are ignored so that forwarding an unset config
 * value (e.g. `{ timeoutMs: cfg.timeout }`) keeps the default instead of erasing it;
 * an `undefined` timeout would otherwise make the abort timer fire immediately.
 * A null/undefined `options` argument is treated as "no overrides".
 *
 * @param {object} [options] - partial option overrides
 * @returns {object} a new object holding every DEFAULTS key plus any caller extras
 */
function resolveOptions(options) {
    const provided = Object.fromEntries(Object.entries(options ?? {}).filter(([, value]) => value !== undefined));
    // fetchImpl additionally falls back on null, matching the previous `??` behaviour.
    return { ...DEFAULTS, ...provided, fetchImpl: provided.fetchImpl ?? DEFAULTS.fetchImpl };
}
39
// ── SSRF guard ──────────────────────────────────────────────────────────────
/**
 * Report whether a URL hostname names a loopback, private, link-local or
 * metadata-style host, so an agent fetching an arbitrary URL can't reach
 * internal services.
 *
 * Only the literal hostname text is inspected; no DNS resolution happens here.
 *
 * @param {string} hostname - host part of a URL (IPv6 brackets allowed)
 * @returns {boolean} true when the host should be blocked
 */
export function isPrivateHost(hostname) {
    // Strip brackets, lowercase, drop a single trailing dot (FQDN form evades endsWith otherwise).
    const host = hostname
        .toLowerCase()
        .replace(/^\[|\]$/g, '')
        .replace(/\.$/, '');
    // IPv4-mapped IPv6 (::ffff:127.0.0.1 or ::ffff:7f00:1) — unwrap and re-check the embedded v4.
    const mapped = host.match(/^::ffff:(.+)$/i);
    if (mapped) {
        const inner = mapped[1];
        if (/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/.test(inner)) {
            return isPrivateHost(inner);
        }
        const hex = inner.match(/^([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i);
        if (hex) {
            const hi = parseInt(hex[1], 16);
            const lo = parseInt(hex[2], 16);
            return isPrivateHost(`${(hi >> 8) & 255}.${hi & 255}.${(lo >> 8) & 255}.${lo & 255}`);
        }
        return true; // unrecognised mapped form → fail closed
    }
    if (host === 'localhost' || host.endsWith('.localhost') || host.endsWith('.local') || host.endsWith('.internal')) {
        return true;
    }
    if (host === '0.0.0.0' || host === '::' || host === '::1') {
        return true;
    }
    // IPv4 literals
    const v4 = host.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
    if (v4) {
        const [a, b] = [Number(v4[1]), Number(v4[2])];
        if (a === 127 || a === 10 || a === 0) {
            return true;
        }
        if (a === 192 && b === 168) {
            return true;
        }
        if (a === 169 && b === 254) {
            return true; // link-local + cloud metadata 169.254.169.254
        }
        if (a === 172 && b >= 16 && b <= 31) {
            return true;
        }
        if (a === 100 && b >= 64 && b <= 127) {
            return true; // CGNAT
        }
    }
    // IPv6 unique-local (fc00::/7) and link-local. Link-local is fe80::/10, i.e. first
    // hextet fe80–febf, so matching only the literal "fe80:" prefix would miss most of it.
    if (/^f[cd][0-9a-f]{2}:/i.test(host) || /^fe[89ab][0-9a-f]:/i.test(host)) {
        return true;
    }
    return false;
}
86
// ── robots.txt ──────────────────────────────────────────────────────────────
/**
 * Decide whether `path` may be fetched according to a robots.txt document.
 *
 * Groups naming our product token are preferred; otherwise the `*` groups apply.
 * When several groups match, their rules are combined (RFC 9309 §2.2.1) rather than
 * only the first group being honoured. Within the combined rules the longest
 * matching pattern wins, and on an equal-length tie Allow beats Disallow.
 *
 * @param {string} robotsTxt - raw robots.txt body
 * @param {string} path - request path (plus query string) to test
 * @param {string} [uaToken] - user-agent string; only its first token is compared
 * @returns {boolean} true when fetching `path` is permitted
 */
export function isAllowedByRobots(robotsTxt, path, uaToken = BRAND.cliName) {
    const groups = [];
    let current = null;
    let lastWasAgent = false;
    for (const rawLine of robotsTxt.split(/\r?\n/)) {
        const line = rawLine.replace(/#.*$/, '').trim();
        if (!line) {
            continue;
        }
        const idx = line.indexOf(':');
        if (idx < 0) {
            continue;
        }
        const field = line.slice(0, idx).trim().toLowerCase();
        const value = line.slice(idx + 1).trim();
        if (field === 'user-agent') {
            // Consecutive User-agent lines share one group; any other record ends the run.
            if (!current || !lastWasAgent) {
                current = { agents: [], rules: [] };
                groups.push(current);
            }
            current.agents.push(value.toLowerCase());
            lastWasAgent = true;
        }
        else if ((field === 'allow' || field === 'disallow') && current) {
            current.rules.push({ allow: field === 'allow', path: value });
            lastWasAgent = false;
        }
        else {
            lastWasAgent = false;
        }
    }
    // Match the single product token (e.g. "sanook") against agent lines, not the full
    // descriptive UA — otherwise a group named "web"/"cli"/"fetch" would falsely apply.
    const ua = uaToken.toLowerCase().split(/[\s/]/)[0];
    const namesUs = (group) => group.agents.some((a) => a !== '*' && a !== '' && (ua === a || ua.startsWith(a)));
    let matched = groups.filter(namesUs);
    if (matched.length === 0) {
        matched = groups.filter((group) => group.agents.includes('*'));
    }
    if (matched.length === 0) {
        return true;
    }
    let decision = true;
    let best = -1;
    for (const rule of matched.flatMap((group) => group.rules)) {
        if (!rule.path) {
            continue; // empty path (e.g. "Disallow:") = no-op / allow all
        }
        const length = rule.path.length;
        const moreSpecific = length > best;
        // Equivalent Allow/Disallow rules: prefer the Allow.
        const allowBreaksTie = length === best && rule.allow && !decision;
        if ((moreSpecific || allowBreaksTie) && robotsRuleToRegex(rule.path).test(path)) {
            best = length;
            decision = rule.allow;
        }
    }
    return decision;
}
/**
 * Convert a robots.txt path rule to a RegExp anchored at the start of the path.
 * `*` matches any run of characters, a trailing `$` anchors the end, and every
 * other character is matched literally.
 *
 * @param {string} rule - non-empty Allow/Disallow value
 * @returns {RegExp}
 */
function robotsRuleToRegex(rule) {
    const anchored = rule.endsWith('$');
    const body = (anchored ? rule.slice(0, -1) : rule)
        .split('*')
        .map((segment) => segment.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
        .join('.*');
    return new RegExp(`^${body}${anchored ? '$' : ''}`);
}
148
// ── HTML helpers ─────────────────────────────────────────────────────────────
// The handful of named entities this module understands.
const HTML_NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ' };
/**
 * Decode the supported named entities plus decimal/hex numeric character references.
 *
 * Decoding happens in a single pass so that already-escaped text is not decoded
 * twice: `&amp;lt;` yields the literal text `&lt;`, not `<`. Unknown entities and
 * references without a terminating `;` are left untouched.
 *
 * @param {string} input - text that may contain HTML entities
 * @returns {string} decoded text
 */
function decodeEntities(input) {
    return input.replace(/&(?:(amp|lt|gt|quot|apos|nbsp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g, (_, named, dec, hex) => {
        if (named !== undefined) {
            return HTML_NAMED_ENTITIES[named];
        }
        return safeCodePoint(dec !== undefined ? Number(dec) : parseInt(hex, 16));
    });
}
/**
 * Turn a numeric character reference into its character.
 *
 * @param {number} n - code point parsed from the reference
 * @returns {string} the character, or '' when `n` is zero, non-finite, or above U+10FFFF
 */
function safeCodePoint(n) {
    try {
        return Number.isFinite(n) && n > 0 && n <= 0x10ffff ? String.fromCodePoint(n) : '';
    }
    catch {
        // String.fromCodePoint throws on non-integer input; treat that as undecodable.
        return '';
    }
}
168
/**
 * Reduce an HTML fragment to plain text: tags become spaces, entities are decoded,
 * and whitespace runs collapse to single spaces with the ends trimmed.
 *
 * @param {string} html
 * @returns {string}
 */
function stripTags(html) {
    const withoutMarkup = html.replace(/<[^>]+>/g, ' ');
    const decoded = decodeEntities(withoutMarkup);
    const collapsed = decoded.replace(/\s+/g, ' ');
    return collapsed.trim();
}
171
/**
 * Blank out markup that carries no readable content — script blocks, style blocks,
 * HTML comments and noscript blocks — replacing each with a single space.
 *
 * @param {string} html
 * @returns {string}
 */
function withoutNoise(html) {
    // Applied in this order, each pass seeing the output of the previous one.
    const noisePatterns = [
        /<script\b[^>]*>[\s\S]*?<\/script>/gi,
        /<style\b[^>]*>[\s\S]*?<\/style>/gi,
        /<!--[\s\S]*?-->/g,
        /<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi,
    ];
    let cleaned = html;
    for (const pattern of noisePatterns) {
        cleaned = cleaned.replace(pattern, ' ');
    }
    return cleaned;
}
178
/**
 * Read one attribute value out of a single tag's source text.
 *
 * The name must not be preceded by a word character, `-` or `:`, so asking for
 * `name` no longer picks up `data-name="…"` (a plain `\b` boundary matched there).
 * Double-quoted, single-quoted and unquoted values are supported; the value is
 * entity-decoded. This is a regex scan, not an HTML parser: text inside another
 * attribute's value that looks like `name=` can still match.
 *
 * @param {string} tag - source of one tag, e.g. `<meta name="x" content="y">`
 * @param {string} name - attribute name; interpolated into a RegExp, so pass plain identifiers
 * @returns {string|undefined} decoded value, or undefined when the attribute is absent
 */
function getAttr(tag, name) {
    const pattern = new RegExp(`(?<![\\w:-])${name}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, 'i');
    const m = tag.match(pattern);
    return m ? decodeEntities(m[2] ?? m[3] ?? m[4] ?? '') : undefined;
}
182
/**
 * Index `<meta>` tags by lower-cased key. The key comes from `name`, else `property`,
 * else `itemprop`. Tags lacking a key or a non-empty `content` are skipped, and the
 * first occurrence of a key wins.
 *
 * @param {string} html
 * @returns {Map<string, string>} key → content
 */
function collectMeta(html) {
    const byKey = new Map();
    const metaTags = html.match(/<meta\b[^>]*>/gi) ?? [];
    for (const tag of metaTags) {
        const rawKey = getAttr(tag, 'name') ?? getAttr(tag, 'property') ?? getAttr(tag, 'itemprop');
        const key = rawKey?.toLowerCase();
        if (!key || byKey.has(key)) {
            continue;
        }
        const content = getAttr(tag, 'content');
        if (content) {
            byKey.set(key, content);
        }
    }
    return byKey;
}
193
/**
 * Collect up to `max` distinct http(s) links from anchor tags, in document order.
 *
 * Fragment-only, `javascript:`, `mailto:` and `tel:` hrefs are skipped, as are hrefs
 * that cannot be resolved against `baseUrl`. Link text is the tag-stripped anchor
 * body (first 120 characters), falling back to the resolved URL when empty.
 *
 * The cap is checked before each anchor is processed, so `max = 0` yields no links
 * (checking only after a push always let one through).
 *
 * @param {string} html
 * @param {string} baseUrl - base for resolving relative hrefs
 * @param {number} max - maximum number of links to return
 * @returns {{text: string, href: string}[]}
 */
function extractLinks(html, baseUrl, max) {
    const seen = new Set();
    const out = [];
    for (const m of html.matchAll(/<a\b[^>]*\bhref\s*=\s*("([^"]*)"|'([^']*)'|([^\s>]+))[^>]*>([\s\S]*?)<\/a>/gi)) {
        if (out.length >= max) {
            break;
        }
        const rawHref = decodeEntities(m[2] ?? m[3] ?? m[4] ?? '').trim();
        if (!rawHref || rawHref.startsWith('#') || /^(javascript|mailto|tel):/i.test(rawHref)) {
            continue;
        }
        let href;
        try {
            href = new URL(rawHref, baseUrl).href;
        }
        catch {
            continue; // unresolvable href — not a usable link
        }
        if (!/^https?:/i.test(href) || seen.has(href)) {
            continue;
        }
        seen.add(href);
        out.push({ text: stripTags(m[5]).slice(0, 120) || href, href });
    }
    return out;
}
216
/**
 * List the distinct `@type` values declared in the page's JSON-LD script blocks,
 * in first-seen order, capped at 12.
 *
 * Top-level arrays and `@graph` arrays are traversed; other nested properties are not.
 * Blocks whose JSON fails to parse are ignored.
 *
 * @param {string} html
 * @returns {string[]}
 */
function extractJsonLdTypes(html) {
    const found = new Set();
    function walk(node) {
        if (Array.isArray(node)) {
            for (const child of node) {
                walk(child);
            }
            return;
        }
        if (node === null || typeof node !== 'object') {
            return;
        }
        const declared = node['@type'];
        if (typeof declared === 'string') {
            found.add(declared);
        }
        else if (Array.isArray(declared)) {
            for (const entry of declared) {
                if (typeof entry === 'string') {
                    found.add(entry);
                }
            }
        }
        const graph = node['@graph'];
        if (Array.isArray(graph)) {
            for (const child of graph) {
                walk(child);
            }
        }
    }
    const blocks = html.matchAll(/<script\b[^>]*type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi);
    for (const block of blocks) {
        let parsed;
        try {
            parsed = JSON.parse(block[1].trim());
        }
        catch {
            /* malformed JSON-LD — skip */
            continue;
        }
        walk(parsed);
    }
    return Array.from(found).slice(0, 12);
}
244
/**
 * Build a rough structural description of an HTML document: title, description,
 * site name, language, h1–h3 headings, links, JSON-LD types, a word count and a
 * one-line summary.
 *
 * @param {string} html - page source
 * @param {string} baseUrl - URL the page came from; resolves links and supplies the summary's host fallback
 * @param {object} [options] - overrides; `maxHeadings` and `maxLinks` are read here
 * @returns {object} `{ title, description, siteName, lang, headings, links, jsonLdTypes, wordCount, summary }`
 */
export function extractStructure(html, baseUrl, options = {}) {
    const { maxHeadings, maxLinks } = resolveOptions(options);
    const meta = collectMeta(html);
    // First non-empty meta value among the given keys, in priority order.
    const firstMeta = (...keys) => {
        const hit = keys.find((key) => meta.get(key));
        return hit === undefined ? undefined : meta.get(hit);
    };
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const tagTitle = titleMatch ? stripTags(titleMatch[1]) : undefined;
    const title = tagTitle || firstMeta('og:title', 'twitter:title');
    const description = firstMeta('description', 'og:description', 'twitter:description');
    const siteName = firstMeta('og:site_name', 'application-name');
    const lang = html.match(/<html\b[^>]*\blang\s*=\s*["']([^"']+)["']/i)?.[1];
    // Headings and the word count both look at the page with script/style/comment noise removed.
    const visible = withoutNoise(html);
    const headings = [];
    for (const match of visible.matchAll(/<h[1-3]\b[^>]*>([\s\S]*?)<\/h[1-3]>/gi)) {
        const text = stripTags(match[1]);
        if (text && !headings.includes(text)) {
            headings.push(text);
        }
        if (headings.length >= maxHeadings) {
            break;
        }
    }
    const wordCount = visible
        .replace(/<[^>]+>/g, ' ')
        .split(/\s+/)
        .filter((word) => word.length > 1).length;
    let host;
    try {
        host = new URL(baseUrl).host;
    }
    catch {
        /* keep raw */
        host = baseUrl;
    }
    const links = extractLinks(html, baseUrl, maxLinks);
    const jsonLdTypes = extractJsonLdTypes(html);
    const summary = buildSummary({ title, description, siteName, headings, host });
    return { title, description, siteName, lang, headings, links, jsonLdTypes, wordCount, summary };
}
291
/**
 * Compose a one-line summary "lead — tail", truncated to 280 characters with a
 * trailing ellipsis. Lead is the title, else the site name, else the host; tail is
 * the description, else the first heading, else omitted.
 *
 * @param {{title?: string, description?: string, siteName?: string, headings: string[], host: string}} parts
 * @returns {string}
 */
function buildSummary(parts) {
    const { title, siteName, host, description, headings } = parts;
    const lead = title || siteName || host;
    const tail = description || headings[0] || '';
    let text = lead;
    if (tail) {
        text = `${lead} — ${tail}`;
    }
    if (text.length <= 280) {
        return text;
    }
    return `${text.slice(0, 277)}…`;
}
297
+ // Markers of a bot-challenge / JS-required interstitial. If we see one we treat the
298
+ // tier as a FAILURE and fall through — this is RECOGNISING a block, never bypassing it.
299
+ // Unambiguous interstitial markers — if present, it's a challenge page regardless of length.
300
+ const STRONG_CHALLENGE_MARKERS = [
301
+ 'cf-browser-verification',
302
+ 'challenge-platform',
303
+ '/cdn-cgi/challenge',
304
+ 'attention required! | cloudflare',
305
+ 'checking your browser before accessing',
306
+ 'enable javascript and cookies to continue',
307
+ 'ddos protection by',
308
+ ];
309
+ // Phrases that also appear in legit prose — only count them when the page has little real content.
310
+ const WEAK_CHALLENGE_MARKERS = ['just a moment', 'verifying you are human', 'please verify you are a human', 'requiring captcha'];
311
+ export function looksBlocked(text) {
312
+ const lc = text.toLowerCase(); // scan the whole (already size-capped) body, not just the head
313
+ if (STRONG_CHALLENGE_MARKERS.some((marker) => lc.includes(marker)))
314
+ return true;
315
+ if (WEAK_CHALLENGE_MARKERS.some((marker) => lc.includes(marker))) {
316
+ const words = text.replace(/<[^>]+>/g, ' ').split(/\s+/).filter((w) => w.length > 1).length;
317
+ return words < 120; // a real article that merely mentions the phrase has far more content
318
+ }
319
+ return false;
320
+ }
321
/**
 * Decide whether a response should be treated as HTML.
 *
 * A declared HTML/XHTML content type says yes; a declared JSON/PDF/image/octet-stream
 * type says no; anything else is decided by sniffing the first 4000 characters of
 * the body for html/doctype/head/body markers.
 *
 * @param {string|undefined} contentType - Content-Type header value, if any
 * @param {string} body
 * @returns {boolean}
 */
function looksHtml(contentType, body) {
    if (contentType) {
        if (/(text\/html|application\/xhtml)/i.test(contentType)) {
            return true;
        }
        if (/(json|pdf|image\/|application\/octet)/i.test(contentType)) {
            return false;
        }
    }
    const head = body.slice(0, 4000);
    return /<html[\s>]|<!doctype html|<head[\s>]|<body[\s>]/i.test(head);
}
328
// Upper bound on redirect hops followed for a single request.
const MAX_REDIRECTS = 5;
/**
 * Validate a parsed URL against the protocol allow-list and the SSRF guard.
 *
 * @param {URL} parsed
 * @param {object} opts - `allowPrivateHosts` disables the private-host check
 * @returns {string|null} an error message, or null when the URL is acceptable
 */
function urlGuardError(parsed, opts) {
    const { protocol, hostname } = parsed;
    const isWebProtocol = protocol === 'http:' || protocol === 'https:';
    if (!isWebProtocol) {
        return `unsupported protocol: ${protocol}`;
    }
    const hostBlocked = !opts.allowPrivateHosts && isPrivateHost(hostname);
    return hostBlocked ? `blocked internal/loopback host: ${hostname} (SSRF guard)` : null;
}
336
// ── single GET with timeout, size cap, and SSRF-safe MANUAL redirect handling ─
/**
 * Read a response body as UTF-8 text, stopping once `maxBytes` bytes have been
 * received and cancelling the rest of the stream. Buffering the whole body first
 * would leave the cap unenforced whenever the server sends no content-length.
 * Falls back to `res.text()` when the response exposes no readable stream
 * (e.g. a test double).
 */
async function readBodyCapped(res, maxBytes) {
    const reader = res.body?.getReader?.();
    if (!reader) {
        return (await res.text()).slice(0, maxBytes);
    }
    const decoder = new TextDecoder();
    let text = '';
    let received = 0;
    for (;;) {
        const { done, value } = await reader.read();
        if (done) {
            break;
        }
        const room = maxBytes - received;
        const chunk = value.byteLength > room ? value.subarray(0, room) : value;
        received += chunk.byteLength;
        text += decoder.decode(chunk, { stream: true });
        if (received >= maxBytes) {
            await reader.cancel();
            break;
        }
    }
    text += decoder.decode();
    // Keep the character-count cap the previous implementation applied as well.
    return text.slice(0, maxBytes);
}
/** Release an unread response body; best-effort, failures are irrelevant to the caller. */
async function discardBody(res) {
    try {
        await res.body?.cancel?.();
    }
    catch {
        /* body already consumed or not cancellable — nothing left to release */
    }
}
/**
 * Perform one GET with a timeout, a size cap and manual redirect handling.
 *
 * Every hop — the initial URL and each redirect target — is re-validated with
 * urlGuardError, so a public URL that 30x-redirects to a private/loopback/metadata
 * host is rejected, not followed. At most MAX_REDIRECTS redirects are followed.
 * Never throws: failures are reported through the returned object.
 *
 * @param {string} url - absolute URL to fetch
 * @param {object} opts - resolved options (fetchImpl, userAgent, timeoutMs, maxBytes, allowPrivateHosts)
 * @param {object} [extraHeaders] - headers merged over the defaults
 * @returns {Promise<{res: object, body: string, finalUrl: string} | {error: string, status?: number, retryAfter?: string}>}
 */
async function doGet(url, opts, extraHeaders = {}) {
    const controller = new AbortController();
    // One timer covers the whole exchange: all hops plus reading the body.
    const timer = setTimeout(() => controller.abort(), opts.timeoutMs);
    try {
        let currentUrl = url;
        for (let hop = 0;; hop++) {
            let parsed;
            try {
                parsed = new URL(currentUrl);
            }
            catch {
                return { error: 'invalid redirect URL' };
            }
            const guard = urlGuardError(parsed, opts);
            if (guard) {
                return { error: guard };
            }
            const res = await opts.fetchImpl(currentUrl, {
                redirect: 'manual',
                signal: controller.signal,
                headers: { 'user-agent': opts.userAgent, accept: 'text/html,application/xhtml+xml,text/plain,*/*', ...extraHeaders },
            });
            if (res.status >= 300 && res.status < 400) {
                await discardBody(res);
                const location = res.headers.get('location');
                if (!location) {
                    return { error: `${res.status} redirect without Location`, status: res.status };
                }
                if (hop >= MAX_REDIRECTS) {
                    return { error: `too many redirects (>${MAX_REDIRECTS})`, status: res.status };
                }
                currentUrl = new URL(location, currentUrl).href; // re-validated at top of next hop
                continue;
            }
            if (!res.ok) {
                await discardBody(res);
                return { error: `${res.status}`, status: res.status, retryAfter: res.headers.get('retry-after') ?? undefined };
            }
            const declared = Number(res.headers.get('content-length') ?? '');
            if (Number.isFinite(declared) && declared > opts.maxBytes) {
                await discardBody(res);
                return { error: `response too large (${declared} bytes > ${opts.maxBytes} cap)`, status: res.status };
            }
            const body = await readBodyCapped(res, opts.maxBytes);
            return { res, body, finalUrl: currentUrl };
        }
    }
    catch (e) {
        return { error: e.name === 'AbortError' ? `timeout >${opts.timeoutMs}ms` : e.message };
    }
    finally {
        clearTimeout(timer);
    }
}
387
/**
 * Fetch the origin's robots.txt and check whether the target path may be fetched.
 * Records one 'robots' entry in `attempts`. When robots.txt cannot be read, for any
 * reason, the fetch is treated as allowed.
 *
 * @param {URL} parsed - target URL
 * @param {object} opts - resolved options, passed through to doGet
 * @param {object[]} attempts - attempt log, appended to in place
 * @returns {Promise<boolean>} true when fetching is allowed
 */
async function checkRobots(parsed, opts, attempts) {
    const robotsUrl = `${parsed.protocol}//${parsed.host}/robots.txt`;
    const fetched = await doGet(robotsUrl, { ...opts, respectRobots: false });
    if ('error' in fetched) {
        // No robots.txt (or unreachable) → not disallowed.
        attempts.push({ tier: 'robots', ok: true, detail: `no robots.txt readable (${fetched.error}) — treated as allowed` });
        return true;
    }
    const target = parsed.pathname + parsed.search;
    const allowed = isAllowedByRobots(fetched.body, target, BRAND.cliName);
    const detail = allowed ? 'allowed by robots.txt' : `disallowed by robots.txt for ${parsed.pathname}`;
    attempts.push({ tier: 'robots', ok: allowed, detail });
    return allowed;
}
399
/**
 * Tier 1: fetch the URL directly. HTML responses are summarised via extractStructure;
 * anything else is returned as tag-stripped text. A failed request or a detected
 * bot-challenge page is logged and yields null so the next tier can run.
 *
 * @param {URL} parsed - target URL
 * @param {object} opts - resolved options
 * @param {object[]} attempts - attempt log, appended to in place
 * @returns {Promise<object|null>} outcome fields for the result, or null on failure
 */
async function tierDirect(parsed, opts, attempts) {
    const log = (ok, detail, status) => attempts.push({ tier: 'direct', ok, detail, status });
    const got = await doGet(parsed.href, opts);
    if ('error' in got) {
        const extra = got.retryAfter ? ` (retry-after: ${got.retryAfter})` : '';
        log(false, `direct fetch failed: ${got.error}${extra}`, got.status);
        return null;
    }
    const { res, body, finalUrl } = got;
    const { status } = res;
    const contentType = res.headers.get('content-type') ?? undefined;
    if (looksBlocked(body)) {
        log(false, 'got a bot-challenge/JS interstitial — not bypassing, falling through', status);
        return null;
    }
    const outcome = { winningTier: 'direct', finalUrl, status, contentType };
    if (looksHtml(contentType, body)) {
        const structure = extractStructure(body, finalUrl, opts);
        log(true, `direct HTML — ${structure.wordCount} words, ${structure.headings.length} headings`, status);
        return { ...outcome, structure };
    }
    log(true, `direct non-HTML (${contentType ?? 'unknown'}) — returned as text`, status);
    return { ...outcome, content: stripTags(body).slice(0, opts.maxBytes) };
}
419
/**
 * Tier 2: ask the reader service (opts.readerBase) for a markdown rendering of the
 * page. A failed request, an empty body or a challenge page is logged and yields null.
 *
 * @param {URL} parsed - target URL
 * @param {object} opts - resolved options
 * @param {object[]} attempts - attempt log, appended to in place
 * @returns {Promise<object|null>} outcome fields for the result, or null on failure
 */
async function tierReader(parsed, opts, attempts) {
    // The reader takes the full target URL appended to its base. Do NOT pre-encode the
    // whole URL: encodeURI/encodeURIComponent would break its parsing of query strings.
    const base = opts.readerBase.replace(/\/$/, '');
    const readerUrl = `${base}/${parsed.href}`;
    const readerHeaders = { accept: 'text/plain', 'x-return-format': 'markdown' };
    const got = await doGet(readerUrl, opts, readerHeaders);
    if ('error' in got) {
        attempts.push({ tier: 'reader', ok: false, detail: `reader (${opts.readerBase}) failed: ${got.error}`, status: got.status });
        return null;
    }
    const { status } = got.res;
    const content = got.body.trim();
    if (!content) {
        attempts.push({ tier: 'reader', ok: false, detail: 'reader returned empty body' });
        return null;
    }
    if (looksBlocked(content)) {
        attempts.push({ tier: 'reader', ok: false, detail: 'reader returned a bot-challenge interstitial — not bypassing, falling through', status });
        return null;
    }
    attempts.push({ tier: 'reader', ok: true, detail: `reader markdown — ${content.length} chars`, status });
    return {
        winningTier: 'reader',
        finalUrl: parsed.href,
        status,
        contentType: 'text/markdown',
        content,
        structure: structureFromMarkdown(content, parsed.host),
    };
}
440
/**
 * Derive the page-structure shape from markdown text.
 *
 * Title: a `Title:` line if present, else the first `# ` heading. Headings: the first
 * 20 lines with one to three `#`. Description: the first paragraph that is neither a
 * heading nor a `Title:` / `URL Source:` line, cut to 280 characters. Links and
 * JSON-LD types are always empty here.
 *
 * @param {string} markdown
 * @param {string} host - fallback lead for the summary
 * @returns {object} `{ title, description, headings, links, jsonLdTypes, wordCount, summary }`
 */
function structureFromMarkdown(markdown, host) {
    const labelledTitle = markdown.match(/^Title:\s*(.+)$/im)?.[1];
    const headingTitle = markdown.match(/^#\s+(.+)$/m)?.[1];
    const title = (labelledTitle ?? headingTitle)?.trim();
    const headings = [];
    for (const match of markdown.matchAll(/^#{1,3}\s+(.+)$/gm)) {
        headings.push(match[1].trim());
    }
    headings.length = Math.min(headings.length, 20);
    const isProse = (para) => Boolean(para) && !para.startsWith('#') && !/^(title|url source):/i.test(para);
    const firstPara = markdown
        .split(/\n\s*\n/)
        .map((para) => para.trim())
        .find(isProse);
    const words = markdown.split(/\s+/).filter((word) => word.length > 1);
    return {
        title,
        description: firstPara?.slice(0, 280),
        headings,
        links: [],
        jsonLdTypes: [],
        wordCount: words.length,
        summary: buildSummary({ title, description: firstPara, headings, host }),
    };
}
455
/**
 * Tier 3: ask Tavily's `/extract` endpoint for the page content. Skipped silently
 * (null, nothing logged) when no API key is configured; every other failure is
 * logged and yields null.
 *
 * @param {URL} parsed - target URL
 * @param {object} opts - resolved options (tavilyApiKey, tavilyBase, fetchImpl, timeoutMs, maxBytes)
 * @param {object[]} attempts - attempt log, appended to in place
 * @returns {Promise<object|null>} outcome fields for the result, or null on failure
 */
async function tierTavily(parsed, opts, attempts) {
    const { tavilyApiKey } = opts;
    if (!tavilyApiKey) {
        return null;
    }
    const fail = (detail, status) => {
        attempts.push(status === undefined ? { tier: 'tavily', ok: false, detail } : { tier: 'tavily', ok: false, detail, status });
        return null;
    };
    try {
        const endpoint = `${opts.tavilyBase.replace(/\/$/, '')}/extract`;
        const res = await opts.fetchImpl(endpoint, {
            method: 'POST',
            signal: AbortSignal.timeout(opts.timeoutMs),
            headers: { 'content-type': 'application/json', authorization: `Bearer ${tavilyApiKey}` },
            body: JSON.stringify({ urls: [parsed.href] }),
        });
        const { status } = res;
        if (!res.ok) {
            return fail(`tavily extract failed: ${status}`, status);
        }
        const payload = JSON.parse(await res.text());
        const first = payload.results?.[0];
        const content = first?.raw_content || first?.content;
        if (!content) {
            return fail('tavily extract returned no content');
        }
        if (looksBlocked(content)) {
            return fail('tavily returned a bot-challenge interstitial — not bypassing, falling through', status);
        }
        attempts.push({ tier: 'tavily', ok: true, detail: `tavily extract — ${content.length} chars`, status });
        return {
            winningTier: 'tavily',
            finalUrl: parsed.href,
            status,
            contentType: 'text/plain',
            content: content.slice(0, opts.maxBytes),
            structure: structureFromMarkdown(content, parsed.host),
        };
    }
    catch (e) {
        return fail(`tavily extract error: ${e.message}`);
    }
}
487
/**
 * Tier 4: look the URL up in the Wayback Machine availability API and, when a
 * snapshot exists, fetch it and summarise it with extractStructure. Each failure
 * mode is logged and yields null.
 *
 * @param {URL} parsed - target URL
 * @param {object} opts - resolved options
 * @param {object[]} attempts - attempt log, appended to in place
 * @returns {Promise<object|null>} outcome fields for the result, or null on failure
 */
async function tierArchive(parsed, opts, attempts) {
    const archiveOpts = { ...opts, respectRobots: false };
    const lookupUrl = `https://archive.org/wayback/available?url=${encodeURIComponent(parsed.href)}`;
    const lookup = await doGet(lookupUrl, archiveOpts, { accept: 'application/json' });
    if ('error' in lookup) {
        attempts.push({ tier: 'archive', ok: false, detail: `wayback lookup failed: ${lookup.error}` });
        return null;
    }
    // Pull the closest snapshot out of the lookup response; unparseable JSON counts as "none".
    let closest;
    try {
        closest = JSON.parse(lookup.body).archived_snapshots?.closest;
    }
    catch {
        /* fall through */
        closest = undefined;
    }
    const hasSnapshot = Boolean(closest?.available && closest.url);
    if (!hasSnapshot) {
        attempts.push({ tier: 'archive', ok: false, detail: 'no Wayback snapshot available' });
        return null;
    }
    const snapshotUrl = closest.url;
    const timestamp = closest.timestamp;
    const snap = await doGet(snapshotUrl, archiveOpts);
    if ('error' in snap) {
        attempts.push({ tier: 'archive', ok: false, detail: `snapshot fetch failed: ${snap.error}`, status: snap.status });
        return null;
    }
    const { status } = snap.res;
    if (looksBlocked(snap.body)) {
        attempts.push({ tier: 'archive', ok: false, detail: 'archived snapshot is itself a challenge page', status });
        return null;
    }
    const structure = extractStructure(snap.body, snapshotUrl, opts);
    attempts.push({ tier: 'archive', ok: true, detail: `Wayback snapshot ${timestamp ?? ''} — ${structure.wordCount} words`, status });
    return {
        winningTier: 'archive',
        finalUrl: snapshotUrl,
        status,
        contentType: snap.res.headers.get('content-type') ?? undefined,
        structure,
    };
}
524
/**
 * Build the user-facing (Thai) note shown when every enabled tier failed. It lists
 * the tiers that were enabled and the techniques the tool refuses to use.
 *
 * @param {object} opts - resolved options (allowReader, tavilyApiKey, allowArchive)
 * @returns {string}
 */
function blockedNote(opts) {
    const tried = ['direct fetch'];
    if (opts.allowReader) {
        tried.push('reader service');
    }
    if (opts.tavilyApiKey) {
        tried.push('Tavily extract');
    }
    if (opts.allowArchive) {
        tried.push('Wayback archive');
    }
    const triedList = tried.join(', ');
    const refusedList = REFUSED_TECHNIQUES.join(', ');
    return `ดึงไม่ได้ด้วยวิธีที่ถูกกติกา (${triedList}). หน้านี้อาจต้อง JS render, login, หรือมี anti-bot. Sanook จะไม่ทำ evasion (${refusedList}) — ลองหา official API / RSS / sitemap หรือขออนุญาตเจ้าของเว็บแทน`;
}
530
/**
 * Fetch a public web page through an ordered, ethical fallback ladder and return a
 * rough structural understanding of it. Never attempts any REFUSED_TECHNIQUES.
 *
 * Preflight rejects invalid URLs, non-http(s) protocols and (unless allowPrivateHosts)
 * private hosts; robots.txt is then honoured when respectRobots is on. After that the
 * enabled tiers run in order — direct, reader, Tavily, archive — and the first one to
 * produce an outcome wins. Problems are reported through `ok`, `attempts` and `note`.
 *
 * @param {string} rawUrl - URL to fetch
 * @param {object} [options] - option overrides (see DEFAULTS)
 * @returns {Promise<object>} `{ url, ok, attempts, refused, note? }` plus the winning tier's fields on success
 */
export async function fetchWeb(rawUrl, options = {}) {
    const opts = resolveOptions(options);
    const attempts = [];
    const result = { url: rawUrl, ok: false, attempts, refused: REFUSED_TECHNIQUES };
    // Record a failed preflight check and hand back the (still not-ok) result.
    const rejectPreflight = (detail, note) => {
        attempts.push({ tier: 'preflight', ok: false, detail });
        result.note = note;
        return result;
    };
    let parsed;
    try {
        parsed = new URL(rawUrl);
    }
    catch {
        return rejectPreflight('invalid URL', 'URL ไม่ถูกต้อง');
    }
    const isWebProtocol = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    if (!isWebProtocol) {
        return rejectPreflight(`unsupported protocol: ${parsed.protocol}`, 'รองรับเฉพาะ http/https');
    }
    if (!opts.allowPrivateHosts && isPrivateHost(parsed.hostname)) {
        return rejectPreflight(
            `blocked internal/loopback host: ${parsed.hostname} (SSRF guard)`,
            'host ภายใน/loopback ถูกบล็อกกัน SSRF — ใช้ allowPrivateHosts ถ้าตั้งใจจริง',
        );
    }
    if (opts.respectRobots && !(await checkRobots(parsed, opts, attempts))) {
        result.note = `robots.txt ของ ${parsed.host} ห้าม fetch path นี้ — เคารพ robots, ไม่ดึงต่อ. ใช้ official API/ขออนุญาตแทน`;
        return result;
    }
    const ladder = [
        tierDirect,
        ...(opts.allowReader ? [tierReader] : []),
        ...(opts.tavilyApiKey ? [tierTavily] : []),
        ...(opts.allowArchive ? [tierArchive] : []),
    ];
    for (const runTier of ladder) {
        const outcome = await runTier(parsed, opts, attempts);
        if (outcome) {
            return { ...result, ok: true, ...outcome };
        }
    }
    result.note = blockedNote(opts);
    return result;
}
579
/**
 * Run a Tavily web search and return its result entries.
 *
 * @param {string} query - search text
 * @param {object} options - `apiKey` plus optional `fetchImpl`, `tavilyBase`, `timeoutMs`, `maxResults` (default 5)
 * @returns {Promise<object[]>} at most `maxResults` entries from the response's `results` array
 * @throws {Error} `tavily search <status>` on a non-OK response; fetch and JSON errors propagate
 */
export async function tavilySearch(query, options) {
    const send = options.fetchImpl ?? DEFAULTS.fetchImpl;
    const limit = options.maxResults ?? 5;
    const timeoutMs = options.timeoutMs ?? DEFAULTS.timeoutMs;
    const endpoint = `${(options.tavilyBase ?? DEFAULTS.tavilyBase).replace(/\/$/, '')}/search`;
    const res = await send(endpoint, {
        method: 'POST',
        signal: AbortSignal.timeout(timeoutMs),
        headers: { 'content-type': 'application/json', authorization: `Bearer ${options.apiKey}` },
        body: JSON.stringify({ query, max_results: limit }),
    });
    if (!res.ok) {
        throw new Error(`tavily search ${res.status}`);
    }
    const payload = JSON.parse(await res.text());
    const results = payload.results ?? [];
    return results.slice(0, limit);
}
593
// ── rendering ────────────────────────────────────────────────────────────────
/**
 * Format a fetchWeb() result as a multi-line plain-text report: header, status,
 * structure (or a 600-character content preview when there is no structure), the
 * attempt log, the optional note and the never-attempted policy list.
 *
 * @param {object} result - value returned by fetchWeb()
 * @returns {string} newline-joined report
 */
export function renderWebFetchResult(result) {
    const outcome = result.ok ? `OK via tier "${result.winningTier}"` : 'BLOCKED (no ethical tier succeeded)';
    const header = [`web fetch: ${result.url}`];
    if (result.finalUrl && result.finalUrl !== result.url) {
        header.push(`final url: ${result.finalUrl}`);
    }
    header.push(`result: ${outcome}`);
    if (result.status) {
        const typeSuffix = result.contentType ? ` · ${result.contentType}` : '';
        header.push(`status: ${result.status}${typeSuffix}`);
    }
    const body = [];
    const s = result.structure;
    if (s) {
        body.push('', 'structure:');
        // Optional scalar fields, printed only when present.
        const labelled = [
            ['title', s.title],
            ['site', s.siteName],
            ['description', s.description],
            ['lang', s.lang],
        ];
        for (const [label, value] of labelled) {
            if (value) {
                body.push(` ${label}: ${value}`);
            }
        }
        body.push(` words: ${s.wordCount}`);
        if (s.jsonLdTypes.length) {
            body.push(` schema.org types: ${s.jsonLdTypes.join(', ')}`);
        }
        if (s.headings.length) {
            body.push(` headings (${s.headings.length}):`);
            body.push(...s.headings.slice(0, 10).map((heading) => ` - ${heading}`));
        }
        if (s.links.length) {
            body.push(` links (${s.links.length}):`);
            body.push(...s.links.slice(0, 10).map((link) => ` - ${link.text} → ${link.href}`));
        }
        body.push('', `summary: ${s.summary}`);
    }
    else if (result.content) {
        body.push('', `content (${result.content.length} chars), first 600:`, result.content.slice(0, 600));
    }
    const footer = ['', 'attempts:'];
    for (const attempt of result.attempts) {
        const mark = attempt.ok ? '✓' : '✗';
        footer.push(` ${mark} ${attempt.tier}: ${attempt.detail}`);
    }
    if (result.note) {
        footer.push('', `note: ${result.note}`);
    }
    footer.push('', `never attempted (policy): ${result.refused.join(', ')}`);
    return [...header, ...body, ...footer].join('\n');
}