@pugi/cli 0.1.0-alpha.6 → 0.1.0-alpha.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,535 @@
1
+ /**
2
+ * web_fetch tool — Sprint α6.15 Phase 1 quick-win.
3
+ *
4
+ * One-shot HTTP GET against an operator-supplied URL. The response is
5
+ * parsed with Readability over a linkedom DOM, converted to Markdown
6
+ * with Turndown, and returned wrapped in an `<untrusted-content-NONCE>`
7
+ * sentinel that downstream prompts must treat as data, never as
8
+ * instructions.
9
+ *
10
+ * Sentinel pattern (P0 from `docs/specs/pugi-browser-integration-2026-05-24.md`
11
+ * §10 risk 3): fetched bytes can carry prompt injection. The Mira system
12
+ * prompt is expected to honor the `<untrusted-content-*>` wrapping the
13
+ * way Anthropic Computer Use and Codex `skills/chrome/SKILL.md` do —
14
+ * treat the block as fact, refuse to follow instructions inside.
15
+ * The tag carries a per-call random nonce so a page literal of
16
+ * `</untrusted-content>` cannot break out of the boundary, and the
17
+ * source URL lives inside the body (escaped) instead of as an opening
18
+ * attribute so quote-injection cannot break the tag.
19
+ *
20
+ * Gate: the tool refuses unless either the caller flips
21
+ * `web.fetch.enabled = true` in `.pugi/settings.json` or the CLI
22
+ * runtime sets `allowFetch: true` (mapped from `--allow-fetch`). The
23
+ * default-off posture mirrors the «no auto-fetch from chat» rule in
24
+ * §8 anti-pattern 1 of the spec.
25
+ *
26
+ * SSRF guard: every URL we are about to fetch (initial + every redirect
27
+ * hop) is resolved via `dns.lookup` and rejected if any answer maps to
28
+ * loopback, link-local, RFC 1918, CGNAT, IPv6 ULA/link-local, or the
29
+ * 0.0.0.0/8 wildcard. There is a TOCTOU window between this lookup and
30
+ * the second resolution performed when the request connects (DNS
31
+ * rebinding); we accept it for v1 because exploiting it requires DNS
32
+ * control over the target host and answers that change between the two
+ * lookups. Tracked as P3 follow-up.
33
+ *
34
+ * Brand voice: brief / dispatch / ship / sentinel only. The
35
+ * brandbook §08 forbidden-word list applies — see CLAUDE.md.
36
+ */
37
+ import { request } from 'undici';
38
+ import { Readability } from '@mozilla/readability';
39
+ import { parseHTML } from 'linkedom';
40
+ import TurndownService from 'turndown';
41
+ import { randomBytes } from 'node:crypto';
42
+ import { lookup as dnsLookup } from 'node:dns/promises';
43
+ import { isIPv4, isIPv6 } from 'node:net';
44
/**
 * Production resolver: asks `dns.lookup` for every address of
 * `hostname` (`all: true`) in the order the resolver returned them
 * (`verbatim: true`).
 */
const resolveAllAddresses = async (hostname) => await dnsLookup(hostname, { all: true, verbatim: true });
// Resolver currently used by the SSRF guard. Module-private so tests can swap it.
let activeLookup = resolveAllAddresses;
/**
 * Test seam: install `fn` as the active resolver, or restore the
 * `dns.lookup`-backed resolver when `fn` is null/undefined.
 */
export function _setLookupForTests(fn) {
    activeLookup = fn ?? resolveAllAddresses;
}
48
// Limits and fixed headers for a single web_fetch dispatch.
// Passed to undici as both `headersTimeout` and `bodyTimeout`.
const FETCH_TIMEOUT_MS = 10 * 1000;
// Streaming cap on the response body: 5 MiB.
const MAX_RESPONSE_BYTES = 5 * 1024 ** 2;
// Maximum number of redirect hops followed before giving up.
const MAX_REDIRECTS = 5;
const USER_AGENT = 'pugi-cli/0.1 (+https://pugi.dev)';
// MIME types (parameters stripped, lower-cased) the tool will convert.
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/xhtml+xml', 'text/plain'];
53
/**
 * Gate check for the web_fetch tool.
 *
 * @param ctx Tool context; reads `ctx.allowFetch` and
 *   `ctx.settings.web.fetch.enabled`.
 * @returns `true` only when `allowFetch` is exactly `true` or the
 *   settings flag is exactly `true`. `ctx.settings` is not touched when
 *   `allowFetch` already grants access.
 */
export function isWebFetchEnabled(ctx) {
    return ctx.allowFetch === true || ctx.settings.web?.fetch?.enabled === true;
}
58
+ /* ----------------------- SSRF guard helpers ---------------------- */
59
/**
 * Convert a dotted-quad IPv4 string to its unsigned 32-bit value.
 *
 * Each of the four parts must be 1-3 decimal digits with a value of at
 * most 255 (leading zeros are read as decimal). Hand-rolled so the
 * module does not need an `ip-address` dependency.
 *
 * @param ip Candidate dotted-quad string.
 * @returns The address as a non-negative integer, or `null` when the
 *   string is not a syntactically valid IPv4 address.
 */
function ipv4ToInt(ip) {
    const octets = ip.split('.');
    if (octets.length !== 4) {
        return null;
    }
    let value = 0;
    for (const octet of octets) {
        if (!/^\d{1,3}$/.test(octet)) {
            return null;
        }
        const byte = Number(octet);
        if (byte > 255) {
            return null;
        }
        // Plain arithmetic keeps the accumulator non-negative, so no
        // unsigned coercion is needed at the end.
        value = value * 256 + byte;
    }
    return value;
}
80
/**
 * Test whether an IPv4 address lies inside a CIDR range.
 *
 * @param ip Dotted-quad address to test.
 * @param cidr Dotted-quad base address of the range.
 * @param prefix Number of leading bits that must match.
 * @returns `true` when the leading `prefix` bits of `ip` and `cidr`
 *   agree; `false` when either string is not a valid IPv4 address.
 */
function ipv4InCidr(ip, cidr, prefix) {
    const address = ipv4ToInt(ip);
    const network = ipv4ToInt(cidr);
    if (address === null || network === null) {
        return false;
    }
    // A /0 range matches everything; handled separately because a shift
    // by 32 would wrap around to a shift by 0.
    if (prefix === 0) {
        return true;
    }
    // XOR leaves a 1 wherever the two addresses differ; dropping the
    // host bits leaves only the network-part differences.
    return ((address ^ network) >>> (32 - prefix)) === 0;
}
93
/**
 * IPv4 ranges the fetcher must never connect to, as `[base, prefix]`
 * pairs consumed by the CIDR check. Drawn from the IANA special-purpose
 * registry and the usual SSRF cheat-sheet entries.
 */
const IPV4_BLOCKED_RANGES = [
    // "This network" wildcard.
    ['0.0.0.0', 8],
    // RFC 1918 private space (first of three).
    ['10.0.0.0', 8],
    // RFC 6598 carrier-grade NAT.
    ['100.64.0.0', 10],
    // Loopback.
    ['127.0.0.0', 8],
    // Link-local, including the AWS/GCP metadata address.
    ['169.254.0.0', 16],
    // RFC 1918 private space (second of three).
    ['172.16.0.0', 12],
    // IETF protocol assignments.
    ['192.0.0.0', 24],
    // RFC 1918 private space (third of three).
    ['192.168.0.0', 16],
    // Benchmarking.
    ['198.18.0.0', 15],
    // Multicast.
    ['224.0.0.0', 4],
    // Reserved; also covers the 255.255.255.255 broadcast address.
    ['240.0.0.0', 4],
];
111
/**
 * Expand an IPv6 address into its eight 16-bit groups.
 *
 * Understands a `%zone` suffix (dropped), one `::` run of elided zero
 * groups, and a trailing dotted-quad such as `::ffff:1.2.3.4`.
 *
 * @param ip Candidate IPv6 text.
 * @returns Eight lower-case, zero-padded 4-digit hex strings, or `null`
 *   when the input cannot be parsed as IPv6.
 */
function expandIPv6(ip) {
    // The zone id (`%eth0` etc) is not part of the address itself.
    const [address = ip] = ip.split('%');
    let text = address;
    // A dotted quad after the final colon is rewritten as two hex groups
    // so the rest of the parser only ever sees colon-separated hex.
    const tailStart = text.lastIndexOf(':') + 1;
    const tail = text.slice(tailStart);
    if (tailStart > 0 && tail.includes('.')) {
        const packed = ipv4ToInt(tail);
        if (packed === null) {
            return null;
        }
        const upper = ((packed >>> 16) & 0xffff).toString(16);
        const lower = (packed & 0xffff).toString(16);
        text = `${text.slice(0, tailStart - 1)}:${upper}:${lower}`;
    }
    if (!text.includes(':')) {
        return null;
    }
    const halves = text.split('::');
    if (halves.length > 2) {
        return null;
    }
    const toGroups = (segment) => (segment ? segment.split(':') : []);
    const compressed = halves.length === 2;
    const head = toGroups(halves[0]);
    const rest = compressed ? toGroups(halves[1]) : [];
    const given = head.length + rest.length;
    // Without `::` all eight groups must be spelled out; with it, the
    // written groups may not exceed eight.
    if (given > 8 || (!compressed && given !== 8)) {
        return null;
    }
    const groups = [...head, ...new Array(8 - given).fill('0'), ...rest];
    if (!groups.every((group) => /^[0-9a-fA-F]{1,4}$/.test(group))) {
        return null;
    }
    return groups.map((group) => group.toLowerCase().padStart(4, '0'));
}
154
/**
 * Build a dotted IPv4 string from the last two 16-bit words of an
 * expanded IPv6 address. Shared by every embedded-IPv4 path in the
 * IPv6 guard (IPv4-mapped, IPv4-translated SIIT, NAT64 well-known).
 *
 * @param words Eight hex-string groups; only indexes 6 and 7 are read.
 * @returns The embedded address in dotted-quad form.
 */
function embeddedIPv4FromTrailingWords(words) {
    const high = Number.parseInt(words[6] ?? '0', 16);
    const low = Number.parseInt(words[7] ?? '0', 16);
    return `${high >>> 8}.${high & 0xff}.${low >>> 8}.${low & 0xff}`;
}
/**
 * Decide whether the SSRF guard must refuse an IPv6 address.
 *
 * Covers loopback (::1), unspecified (::), link-local (fe80::/10),
 * unique local (fc00::/7), multicast (ff00::/8), discard (100::/64),
 * and the three embedded-IPv4 forms (IPv4-mapped ::ffff:0:0/96,
 * IPv4-translated ::ffff:0:0:0/96, NAT64 64:ff9b::/96), whose trailing
 * IPv4 address is checked against the IPv4 blocklist.
 *
 * @param ip IPv6 address text, optionally with a `%zone` suffix.
 * @returns `true` when the address must not be fetched. Text that
 *   cannot be parsed as IPv6 is also reported as blocked.
 */
function ipv6IsBlocked(ip) {
    const words = expandIPv6(ip);
    // Fail closed: an address this parser cannot expand cannot be
    // vetted, and a guard that waves through what it does not
    // understand is exactly the gap a crafted literal would aim for.
    if (!words) {
        return true;
    }
    const joined = words.join('');
    // ::1 loopback.
    if (joined === '00000000000000000000000000000001') {
        return true;
    }
    // :: unspecified / wildcard.
    if (joined === '00000000000000000000000000000000') {
        return true;
    }
    // ::ffff:0:0/96 IPv4-mapped (RFC 4291 §2.5.5.2):
    //   words[0..4] = 0000, words[5] = ffff.
    //   Example: ::ffff:127.0.0.1 → [0,0,0,0,0,ffff,7f00,0001].
    if (words.slice(0, 5).every((w) => w === '0000') && words[5] === 'ffff') {
        if (ipv4IsBlocked(embeddedIPv4FromTrailingWords(words))) {
            return true;
        }
    }
    // ::ffff:0:0:0/96 IPv4-translated (RFC 6145 §2.2 / RFC 6052 SIIT):
    //   words[0..3] = 0000, words[4] = ffff, words[5] = 0000.
    //   Example: ::ffff:0:a9fe:a9fe → [0,0,0,0,ffff,0,a9fe,a9fe] → 169.254.169.254.
    // Kept separate from the IPv4-mapped branch because translating
    // stacks hand `::ffff:0:a.b.c.d` straight to the embedded IPv4, which
    // would otherwise let a literal reach the metadata service.
    if (words.slice(0, 4).every((w) => w === '0000') &&
        words[4] === 'ffff' &&
        words[5] === '0000') {
        if (ipv4IsBlocked(embeddedIPv4FromTrailingWords(words))) {
            return true;
        }
    }
    // 100::/64 discard prefix.
    if (words[0] === '0100' && words.slice(1, 4).every((w) => w === '0000')) {
        return true;
    }
    // 64:ff9b::/96 — well-known NAT64 (still resolves to embedded IPv4).
    if (words[0] === '0064' && words[1] === 'ff9b' && words.slice(2, 6).every((w) => w === '0000')) {
        if (ipv4IsBlocked(embeddedIPv4FromTrailingWords(words))) {
            return true;
        }
    }
    // fc00::/7 — unique local (high 7 bits = 1111110).
    const firstByte = Number.parseInt(words[0]?.slice(0, 2) ?? '00', 16);
    if ((firstByte & 0xfe) === 0xfc) {
        return true;
    }
    // fe80::/10 — link-local (first 10 bits = 1111111010).
    const firstTen = Number.parseInt(words[0] ?? '0000', 16) & 0xffc0;
    if (firstTen === 0xfe80) {
        return true;
    }
    // ff00::/8 — multicast.
    return firstByte === 0xff;
}
227
/**
 * Report whether an IPv4 address falls inside any blocklisted range.
 *
 * @param ip Dotted-quad address.
 * @returns `true` when at least one `[base, prefix]` entry of the
 *   blocklist contains `ip`.
 */
function ipv4IsBlocked(ip) {
    return IPV4_BLOCKED_RANGES.some(([base, prefix]) => ipv4InCidr(ip, base, prefix));
}
234
/**
 * SSRF guard for a single hostname.
 *
 * Literal IPs are checked directly; any other name is resolved through
 * the active resolver and refused if ANY answer is in a blocked range.
 *
 * @param hostname Host to vet. Callers strip the `[...]` brackets that
 *   `URL.hostname` keeps around IPv6 literals before calling.
 * @returns `null` when the host is safe to fetch, otherwise a
 *   human-readable reason string (lookup failure or guard rejection).
 */
async function validateHostnameForFetch(hostname) {
    if (!hostname) {
        return 'empty hostname';
    }
    // Refuse `localhost` by name, independent of what DNS or the hosts
    // file says. The fully-qualified spelling (`localhost.`) and
    // `*.localhost` subdomains are treated the same way.
    const name = hostname.toLowerCase().replace(/\.$/, '');
    if (name === 'localhost' || name.endsWith('.localhost')) {
        return 'localhost is blocked (SSRF guard)';
    }
    // Fast-path: literal IP. Skip DNS.
    if (isIPv4(hostname)) {
        return ipv4IsBlocked(hostname)
            ? `IP ${hostname} is in a blocked range (SSRF guard)`
            : null;
    }
    if (isIPv6(hostname)) {
        return ipv6IsBlocked(hostname)
            ? `IPv6 ${hostname} is in a blocked range (SSRF guard)`
            : null;
    }
    // DNS lookup — refuse if any answer is private. The active resolver
    // is module-private so tests can stub it.
    let answers;
    try {
        answers = await activeLookup(hostname);
    }
    catch (error) {
        const msg = error instanceof Error ? error.message : String(error);
        return `DNS lookup failed for ${hostname}: ${msg}`;
    }
    if (answers.length === 0) {
        return `DNS returned no answers for ${hostname}`;
    }
    for (const { address } of answers) {
        // Classify by the address text rather than trusting `family`: a
        // resolver (or test stub) that reports the family in another form,
        // or omits it, would otherwise skip both checks and fail open.
        if (isIPv4(address)) {
            if (ipv4IsBlocked(address)) {
                return `${hostname} resolves to ${address} which is in a blocked range (SSRF guard)`;
            }
        }
        else if (isIPv6(address)) {
            if (ipv6IsBlocked(address)) {
                return `${hostname} resolves to ${address} which is in a blocked range (SSRF guard)`;
            }
        }
        else {
            // Neither IPv4 nor IPv6: cannot be vetted, so refuse.
            return `${hostname} resolves to unrecognised address ${String(address)} (SSRF guard)`;
        }
    }
    return null;
}
288
+ /* ----------------------- sentinel helpers ---------------------- */
289
/**
 * HTML-escape `&`, `<`, `>`, `"` and `'` so the text cannot close the
 * sentinel element or terminate an attribute value. The source URL is
 * written into the sentinel body, so the realistic breakout is a
 * literal closing tag; escaping the angle brackets covers it.
 *
 * Exported for spec coverage; production callers are expected to go
 * through the wrapper in `webFetchTool`.
 *
 * @param input Text to escape.
 * @returns `input` with each of the five characters replaced by its
 *   entity.
 */
export function escapeForSentinelBody(input) {
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    };
    // One pass over the input: every special character maps straight to
    // its entity, so an ampersand produced by one replacement is never
    // escaped again.
    return input.replace(/[&<>"']/g, (ch) => entities[ch]);
}
307
/**
 * Remove sentinel tags from fetched body content.
 *
 * Strips any literal `<untrusted-content-NONCE>` / closing form and any
 * bare or attribute-carrying `<untrusted-content ...>` tag. The nonce
 * already makes a breakout improbable; this is defense-in-depth.
 *
 * @param input Fetched content (Markdown) to scrub.
 * @param nonce Per-call nonce used in the wrapping tag.
 * @returns `input` with no substring left that matches a sentinel tag.
 */
function scrubSentinelEscapes(input, nonce) {
    // Escape regex metacharacters so an unexpected nonce value can never
    // change (or break) the pattern.
    const safeNonce = String(nonce).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const nonceTag = new RegExp(`</?untrusted-content-${safeNonce}>`, 'gi');
    const bareTag = /<\/?untrusted-content[^>]*>/gi;
    const stripped = input.replace(nonceTag, '').replace(bareTag, '');
    // Deleting a tag can splice its neighbours into a NEW tag, e.g.
    // `</untrusted-</untrusted-content>content>`. Re-stripping until
    // stable would cost one pass per nesting level on hostile input, so
    // instead neutralise whatever re-formed by escaping its `<`. That
    // removes nothing, so it cannot create further matches.
    return stripped.replace(bareTag, (residual) => residual.replace(/</g, '&lt;'));
}
318
+ /* ----------------------- response read ---------------------- */
319
/**
 * Drain a response body while enforcing the MAX_RESPONSE_BYTES cap.
 *
 * The caller requests `accept-encoding: identity`, so the cap is meant
 * to apply to bytes as received rather than to decompressed output.
 *
 * @param body Async-iterable of chunks (Buffers, or values accepted by
 *   `Buffer.from`).
 * @param controller AbortController tied to the request; aborted on
 *   overflow.
 * @returns `{ ok: true, buffer }` with the concatenated bytes, or
 *   `{ ok: false, error }` on overflow or when reading throws.
 */
async function readBodyWithCap(body, controller) {
    const pieces = [];
    let received = 0;
    try {
        for await (const part of body) {
            const bytes = Buffer.isBuffer(part) ? part : Buffer.from(part);
            received += bytes.length;
            if (received <= MAX_RESPONSE_BYTES) {
                pieces.push(bytes);
                continue;
            }
            // Over the cap: cancel the request, then tear the stream down
            // eagerly (when it exposes destroy()) so the socket is not
            // left waiting on garbage collection.
            controller.abort();
            try {
                if (typeof body.destroy === 'function') {
                    body.destroy();
                }
            }
            catch {
                /* ignore — abort already fired */
            }
            return { ok: false, error: `Response exceeded ${MAX_RESPONSE_BYTES} byte cap.` };
        }
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : String(error);
        return { ok: false, error: `Body read failed: ${detail}` };
    }
    return { ok: true, buffer: Buffer.concat(pieces) };
}
357
/**
 * Release a response body that is not going to be read.
 *
 * An unread body keeps its socket occupied until the response object is
 * collected, which under load can exhaust the connection pool. `dump()`
 * drains it; `destroy()` is the fallback if draining throws. Never
 * throws.
 */
async function discardResponseBody(response) {
    try {
        await response.body.dump();
    }
    catch {
        try {
            response.body.destroy();
        }
        catch {
            /* socket already closed — nothing to do */
        }
    }
}
/**
 * Dispatch a single GET, follow up to MAX_REDIRECTS hops, enforce the
 * byte cap and the undici header/body timeouts, and refuse non-2xx
 * responses and unsupported content-types.
 *
 * No retries: GET is idempotent but the contract is one-shot per spec;
 * the error is surfaced to the operator, who re-dispatches explicitly.
 *
 * @param input Object carrying the `url` to fetch.
 * @param ctx Tool context used for the enable gate.
 * @returns `{ ok: true, url, title, content_md, fetched_at }` where
 *   `content_md` is the Markdown wrapped in the nonce-tagged sentinel,
 *   or `{ ok: false, error }` on any refusal or failure.
 */
export async function webFetchTool(input, ctx) {
    if (!isWebFetchEnabled(ctx)) {
        return {
            ok: false,
            error: 'web_fetch disabled. Enable with --allow-fetch or set web.fetch.enabled=true in .pugi/settings.json.',
        };
    }
    let parsedUrl;
    try {
        parsedUrl = new URL(input.url);
    }
    catch {
        return { ok: false, error: `Invalid URL: ${input.url}` };
    }
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
        return { ok: false, error: `Unsupported scheme ${parsedUrl.protocol} — only http/https.` };
    }
    // Strip IPv6 brackets — URL.hostname keeps them, dns/net do not.
    const initialHost = parsedUrl.hostname.replace(/^\[|\]$/g, '');
    const initialGuard = await validateHostnameForFetch(initialHost);
    if (initialGuard) {
        return { ok: false, error: `SSRF refused: ${initialGuard}` };
    }
    // Manual redirect loop, capped at MAX_REDIRECTS hops. Every hop
    // re-runs the SSRF guard because a public origin can return
    // `302 Location: http://169.254.169.254/...`.
    let response = null;
    let currentUrl = parsedUrl;
    let hops = 0;
    const controller = new AbortController();
    try {
        while (true) {
            response = await request(currentUrl.toString(), {
                method: 'GET',
                headers: {
                    'user-agent': USER_AGENT,
                    accept: 'text/html,application/xhtml+xml',
                    // Ask for an unencoded body so the streaming cap counts
                    // the bytes that are actually received.
                    'accept-encoding': 'identity',
                },
                bodyTimeout: FETCH_TIMEOUT_MS,
                headersTimeout: FETCH_TIMEOUT_MS,
                signal: controller.signal,
            });
            if (response.statusCode >= 300 && response.statusCode < 400) {
                const loc = response.headers['location'];
                const locStr = Array.isArray(loc) ? loc[0] : loc;
                // A 3xx without a usable Location is not followed; it falls
                // through to the non-2xx refusal below.
                if (typeof locStr !== 'string' || locStr.length === 0) {
                    break;
                }
                hops += 1;
                if (hops > MAX_REDIRECTS) {
                    await discardResponseBody(response);
                    return { ok: false, error: `Exceeded ${MAX_REDIRECTS} redirect hops.` };
                }
                // Drain prior body so the socket can be reused.
                await response.body.dump();
                let nextUrl;
                try {
                    nextUrl = new URL(locStr, currentUrl);
                }
                catch {
                    return { ok: false, error: `Invalid redirect target: ${locStr}` };
                }
                if (nextUrl.protocol !== 'http:' && nextUrl.protocol !== 'https:') {
                    return {
                        ok: false,
                        error: `Refusing redirect to unsupported scheme ${nextUrl.protocol}.`,
                    };
                }
                const nextHost = nextUrl.hostname.replace(/^\[|\]$/g, '');
                const guard = await validateHostnameForFetch(nextHost);
                if (guard) {
                    return { ok: false, error: `SSRF refused on redirect: ${guard}` };
                }
                currentUrl = nextUrl;
                continue;
            }
            break;
        }
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return { ok: false, error: `Fetch failed: ${message}` };
    }
    if (!response) {
        return { ok: false, error: 'No response received.' };
    }
    if (response.statusCode < 200 || response.statusCode >= 300) {
        // The body is never read on this path; release it so the socket
        // does not linger until garbage collection.
        await discardResponseBody(response);
        return { ok: false, error: `HTTP ${response.statusCode} from ${currentUrl.toString()}` };
    }
    // content-length is advisory — never trust it for the size cap, but
    // obviously huge declared payloads can be refused BEFORE reading
    // starts. The streaming cap is still the source of truth.
    const declaredLengthRaw = response.headers['content-length'];
    const declaredLength = Array.isArray(declaredLengthRaw) ? declaredLengthRaw[0] : declaredLengthRaw;
    if (typeof declaredLength === 'string' && /^\d+$/.test(declaredLength)) {
        const n = Number(declaredLength);
        if (n > MAX_RESPONSE_BYTES) {
            controller.abort();
            try {
                response.body.destroy();
            }
            catch {
                /* ignore */
            }
            return {
                ok: false,
                error: `Declared content-length ${n} exceeds ${MAX_RESPONSE_BYTES} byte cap.`,
            };
        }
    }
    const contentTypeRaw = response.headers['content-type'];
    const contentType = Array.isArray(contentTypeRaw) ? contentTypeRaw[0] : contentTypeRaw;
    const mime = typeof contentType === 'string' ? contentType.split(';')[0]?.trim().toLowerCase() ?? '' : '';
    if (!ALLOWED_CONTENT_TYPES.includes(mime)) {
        // Same reasoning as the non-2xx path: release the unread body.
        await discardResponseBody(response);
        return { ok: false, error: `Disallowed content-type ${mime || '(none)'}; only HTML/XHTML/text.` };
    }
    const bodyResult = await readBodyWithCap(response.body, controller);
    if (!bodyResult.ok) {
        return bodyResult;
    }
    const html = bodyResult.buffer.toString('utf8');
    let title;
    let markdown;
    try {
        // linkedom supplies the DOM that Readability walks.
        const { document } = parseHTML(html);
        const article = new Readability(document).parse();
        // NOTE(review): `title` comes from the fetched page and is returned
        // outside the sentinel wrapper — confirm downstream consumers treat
        // it as untrusted too.
        title = article?.title?.trim() || currentUrl.hostname;
        const articleHtml = article?.content ?? html;
        const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
        markdown = turndown.turndown(articleHtml).trim();
    }
    catch (error) {
        // Parsing hostile or malformed markup must surface as an error
        // result, like every other failure of this tool, not as a throw.
        const message = error instanceof Error ? error.message : String(error);
        return { ok: false, error: `Content extraction failed: ${message}` };
    }
    // Per-call nonce defeats sentinel escape via a literal
    // `</untrusted-content>` inside fetched bodies. The source URL lives
    // INSIDE the sentinel body (escaped) so a quote-injection in the URL
    // cannot break the tag.
    const nonce = randomBytes(8).toString('hex');
    const scrubbedMarkdown = scrubSentinelEscapes(markdown, nonce);
    const safeSource = escapeForSentinelBody(currentUrl.toString());
    const wrapped = `<untrusted-content-${nonce}>\n` +
        `Source: ${safeSource}\n\n` +
        `${scrubbedMarkdown}\n` +
        `</untrusted-content-${nonce}>`;
    return {
        ok: true,
        url: currentUrl.toString(),
        title,
        content_md: wrapped,
        fetched_at: new Date().toISOString(),
    };
}
535
+ //# sourceMappingURL=web-fetch.js.map
@@ -0,0 +1,142 @@
1
+ import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
2
+ import { useEffect, useState } from 'react';
3
+ import { Box, Text, useInput } from 'ink';
4
// Braille spinner glyphs, cycled one per tick while polling.
const SPINNER_FRAMES = [...'⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏'];
// Ink colour names used for accents.
const ACCENT_CYAN = 'cyan';
const ACCENT_RED = 'red';
7
/**
 * Both `userCode` and `verificationUrl` originate from a server
 * response and are rendered into Ink `<Text>`. A hostile or compromised
 * server could embed terminal escape sequences (clear screen, cursor
 * moves, hyperlink faking), so neither value is shown verbatim.
 *
 * The user code is allow-listed to 1-16 characters from uppercase
 * letters, digits and dashes; anything else is shown as a fixed
 * `<invalid>` placeholder so the layout stays intact while the
 * offending bytes never reach the terminal.
 */
const USER_CODE_PATTERN = /^[A-Z0-9-]{1,16}$/;
/**
 * @param code User code as received.
 * @returns `code` unchanged when it matches the allow-list, otherwise
 *   the literal `<invalid>`.
 */
function sanitizeUserCode(code) {
    if (!USER_CODE_PATTERN.test(code)) {
        return '<invalid>';
    }
    return code;
}
23
/**
 * Detect C0 (0x00-0x1F), DEL (0x7F) and C1 (0x80-0x9F) control codes.
 *
 * ESC (0x1B) introduces the CSI / OSC / hyperlink escape sequences, and
 * 0x9B is the single-code CSI form that some terminals still honour.
 *
 * @param input String to scan, one UTF-16 code unit at a time.
 * @returns `true` as soon as a control code is found, else `false`.
 */
function hasControlChar(input) {
    // DEL and the C1 block are adjacent, so they share one range test.
    const isControl = (unit) => unit <= 0x1f || (unit >= 0x7f && unit <= 0x9f);
    for (let index = 0; index < input.length; index += 1) {
        if (isControl(input.charCodeAt(index))) {
            return true;
        }
    }
    return false;
}
39
/**
 * Vet a server-supplied URL before it is rendered.
 *
 * @param url URL text as received.
 * @returns `url` unchanged when it contains no control characters,
 *   parses as a URL and uses http or https; otherwise the literal
 *   `<invalid URL>`.
 */
function sanitizeDisplayUrl(url) {
    const placeholder = '<invalid URL>';
    // Control characters are rejected up front rather than relying on the
    // URL parser, so an embedded escape can never reach the render.
    if (hasControlChar(url)) {
        return placeholder;
    }
    let protocol;
    try {
        ({ protocol } = new URL(url));
    }
    catch {
        return placeholder;
    }
    return protocol === 'https:' || protocol === 'http:' ? url : placeholder;
}
57
/**
 * Device-flow Ink view. It performs no network work itself: the host
 * passes in `status` and the matching frame is rendered.
 *
 * Keys: Esc calls `onCancel`; `c`/`C` calls `onCopy` and flashes
 * "Copied!"; Enter calls `onContinue`, but only in the success state.
 *
 * @param props `userCode`, `verificationUrl`, `browserOpened`,
 *   `status`, the `onCancel` / `onCopy` / `onContinue` callbacks, and
 *   optional `copiedFlashMs` (default 800) and `spinnerIntervalMs`
 *   (default 120).
 */
export function DeviceFlow(props) {
    const flashDurationMs = props.copiedFlashMs ?? 800;
    const tickMs = props.spinnerIntervalMs ?? 120;
    const [copied, setCopied] = useState(false);
    const [frameIndex, setFrameIndex] = useState(0);
    const statusKind = props.status.kind;
    useInput((input, key) => {
        if (key.escape) {
            props.onCancel();
        }
        else if (input === 'c' || input === 'C') {
            // The clipboard write belongs to the host; this component only
            // signals the intent and shows the flash.
            props.onCopy();
            setCopied(true);
        }
        else if (key.return && props.status.kind === 'success') {
            props.onContinue();
        }
    });
    // Advance the spinner only while polling, so no timer outlives the
    // final success/failure frame.
    useEffect(() => {
        if (statusKind !== 'polling') {
            return undefined;
        }
        const timer = setInterval(() => {
            setFrameIndex((index) => (index + 1) % SPINNER_FRAMES.length);
        }, tickMs);
        return () => clearInterval(timer);
    }, [statusKind, tickMs]);
    // Clear the "Copied!" flash after a short delay so it does not stick.
    useEffect(() => {
        if (!copied) {
            return undefined;
        }
        const timer = setTimeout(() => setCopied(false), flashDurationMs);
        return () => clearTimeout(timer);
    }, [copied, flashDurationMs]);
    const heading = _jsx(Box, { children: _jsx(Text, { bold: true, color: ACCENT_CYAN, children: "Authorize Pugi CLI" }) });
    const chip = _jsx(CodeChip, { code: sanitizeUserCode(props.userCode) });
    const hint = _jsx(BrowserHintRow, { opened: props.browserOpened, url: sanitizeDisplayUrl(props.verificationUrl), copied: copied });
    const statusLine = _jsx(Box, { marginTop: 1, children: _jsx(StatusRow, { status: props.status, spinnerFrame: frameIndex }) });
    const footer = _jsx(Box, { marginTop: 1, children: _jsx(FooterHint, { status: props.status }) });
    return (_jsxs(Box, { flexDirection: "column", paddingX: 2, paddingY: 1, children: [heading, chip, hint, statusLine, footer] }));
}
103
/**
 * Centered short-code "chip": the code padded with one space on each
 * side, in bold cyan so it is easy to compare against the browser page.
 * The code is expected to be the "XXXX-XXXX" form issued by the
 * device-flow start endpoint; that shape is not enforced here.
 */
function CodeChip({ code }) {
    const label = _jsx(Text, { bold: true, color: ACCENT_CYAN, children: ` ${code} ` });
    return _jsx(Box, { marginTop: 1, justifyContent: "center", children: label });
}
113
/**
 * Row under the code chip. Shows the "Copied!" flash while `copied` is
 * set; otherwise a caption plus the URL, with wording and dimming that
 * depend on whether the browser was opened.
 */
function BrowserHintRow({ opened, url, copied, }) {
    if (copied) {
        return (_jsx(Box, { marginTop: 1, children: _jsx(Text, { color: ACCENT_CYAN, children: "Copied!" }) }));
    }
    const lines = opened
        ? [_jsx(Text, { dimColor: true, children: "Browser opened to" }), _jsx(Text, { dimColor: true, children: url })]
        : [_jsx(Text, { dimColor: true, children: "Browser didn't open? Use the URL below to sign in (c to copy)" }), _jsx(Text, { children: url })];
    return _jsxs(Box, { marginTop: 1, flexDirection: "column", children: lines });
}
122
/**
 * Status line: spinner while polling, a confirmation on success, and
 * the failure reason (plus optional hint) otherwise.
 *
 * `principalLabel`, `reason` and `hint` may carry text that did not
 * originate in this process, so control characters are removed before
 * rendering. Newline and tab are kept so multi-line hints still render.
 *
 * @param status `{ kind: 'polling' }`, `{ kind: 'success',
 *   principalLabel }`, or a failure carrying `reason` and optional `hint`.
 * @param spinnerFrame Index into SPINNER_FRAMES; out-of-range values
 *   fall back to the first frame.
 */
function StatusRow({ status, spinnerFrame, }) {
    // Strip C0 (except \t and \n), DEL and C1 codes. Removing ESC / CSI
    // leaves any escape sequence as inert literal text.
    const printable = (value) => typeof value === 'string'
        ? value.replace(/[\u0000-\u0008\u000b-\u001f\u007f-\u009f]/g, '')
        : value;
    if (status.kind === 'polling') {
        const glyph = SPINNER_FRAMES[spinnerFrame] ?? SPINNER_FRAMES[0];
        return (_jsxs(Text, { children: [_jsx(Text, { color: ACCENT_CYAN, children: glyph }), _jsx(Text, { children: ' Waiting for browser approval…' })] }));
    }
    if (status.kind === 'success') {
        return (_jsxs(Text, { children: [_jsx(Text, { color: ACCENT_CYAN, children: '✓ ' }), _jsx(Text, { color: ACCENT_CYAN, children: `Logged in as ${printable(status.principalLabel)}` })] }));
    }
    // failure
    return (_jsxs(Box, { flexDirection: "column", children: [_jsxs(Text, { children: [_jsx(Text, { color: ACCENT_RED, children: '✗ ' }), _jsx(Text, { color: ACCENT_RED, children: printable(status.reason) })] }), status.hint ? _jsx(Text, { dimColor: true, children: printable(status.hint) }) : null] }));
}
133
/**
 * Dimmed key-binding hint for the current status: continue on success,
 * dismiss/retry on failure, cancel/copy in any other state.
 */
function FooterHint({ status }) {
    let message = "Esc to cancel \u00B7 c to copy URL";
    if (status.kind === 'success') {
        message = "Press Enter to continue\u2026";
    }
    else if (status.kind === 'failure') {
        message = "Esc to dismiss \u00B7 Run `pugi login` again to retry";
    }
    return _jsx(Text, { dimColor: true, children: message });
}
142
+ //# sourceMappingURL=device-flow.js.map