glance-cli 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,592 @@
1
+ /**
2
+ * Production-Grade Page Fetcher
3
+ *
4
+ * Features:
5
+ * - Dual-mode: Fast fetch for static sites, Puppeteer for SPAs
6
+ * - Comprehensive error handling with categorization
7
+ * - Retry logic with exponential backoff
8
+ * - Timeout protection (prevents hanging)
9
+ * - Resource cleanup (no memory leaks)
10
+ * - Redirect handling
11
+ * - Content-type validation
12
+ * - Size limits (prevents memory exhaustion)
13
+ * - Identifiable glance-cli User-Agent by default, overridable per request (no rotation is performed)
14
+ * - Response validation
15
+ * - Better error messages with hints
16
+ * - Precise byte-size validation
17
+ * - Optimized encoding detection
18
+ */
19
+
20
+ // Puppeteer types for when we dynamically import
21
+ type Browser = import("puppeteer").Browser;
22
+ type Page = import("puppeteer").Page;
23
+
24
+ // === Text Encoding Utilities ===
25
+
26
/**
 * Read a response body and decode it to text with the best available charset hint.
 *
 * Charset resolution order:
 *   1. the `charset` parameter of the `Content-Type` header (surrounding quotes removed),
 *   2. a `<meta charset>` or `<meta http-equiv="content-type">` tag within the first
 *      512 bytes of the body,
 *   3. UTF-8.
 *
 * If the resolved label is not accepted by `TextDecoder`, the body is decoded as UTF-8.
 * The decoded text is passed through `cleanEncodingArtifacts` before being returned.
 *
 * @param response - Response whose body has not been read yet.
 * @returns The decoded, cleaned body text.
 * @throws Whatever `response.arrayBuffer()` rejected with when the body cannot be read
 *   and has already been consumed (no second read is attempted in that case).
 */
async function getTextWithProperEncoding(response: Response): Promise<string> {
  let bytes: Uint8Array;
  try {
    bytes = new Uint8Array(await response.arrayBuffer());
  } catch (err: unknown) {
    // A body can only be read once. Retrying with text() after the stream was
    // consumed would just replace the real failure with a "body already used" error.
    if (response.bodyUsed) throw err;
    return cleanEncodingArtifacts(await response.text());
  }

  // Trim, drop one pair of surrounding quotes (charset="utf-8"), lower-case.
  const toLabel = (raw: string | undefined): string | null => {
    const label = raw
      ?.trim()
      .replace(/^["']|["']$/g, "")
      .trim()
      .toLowerCase();
    return label ? label : null;
  };

  const contentType = response.headers.get("content-type") ?? "";
  let charset = toLabel(/charset=([^;]+)/i.exec(contentType)?.[1]);

  if (!charset && bytes.length > 0) {
    // Sniff only the first 512 bytes; subarray avoids copying the buffer.
    const preview = new TextDecoder("utf-8", { fatal: false }).decode(
      bytes.subarray(0, 512),
    );
    const metaMatch =
      /<meta[^>]*charset[=\s]*["']?([^"'\s>]+)/i.exec(preview) ??
      /<meta[^>]*http-equiv[=\s]*["']?content-type["']?[^>]*content[=\s]*["']?[^"']*charset[=\s]*([^"'\s;]+)/i.exec(
        preview,
      );
    charset = toLabel(metaMatch?.[1]);
  }

  const label = normalizeCharset(charset ?? "utf-8");

  let decoder: TextDecoder;
  try {
    // The cast keeps this compiling whether the ambient typings declare the
    // label as `string` or as a union of known encoding names.
    decoder = new TextDecoder(
      label as ConstructorParameters<typeof TextDecoder>[0],
      { fatal: false },
    );
  } catch {
    // Unknown label: TextDecoder throws RangeError, so fall back to UTF-8.
    decoder = new TextDecoder("utf-8", { fatal: false });
  }

  return cleanEncodingArtifacts(decoder.decode(bytes));
}
84
+
85
/**
 * Alias table for charset labels. Keys are labels with case, hyphens,
 * underscores and whitespace removed; values are the labels handed to TextDecoder.
 * A Map is used so that lookups never hit inherited Object.prototype members.
 */
const CHARSET_ALIASES: ReadonlyMap<string, string> = new Map<string, string>([
  ["iso88591", "iso-8859-1"],
  ["latin1", "iso-8859-1"],
  ["utf8", "utf-8"],
  ["windows1252", "windows-1252"],
  ["cp1252", "windows-1252"],
  ["ascii", "us-ascii"],
  ["usascii", "us-ascii"],
  ["gb2312", "gb18030"],
  ["gbk", "gb18030"],
]);

/**
 * Normalize a charset name to a standard label.
 *
 * The lookup ignores case, hyphens, underscores and whitespace, so "UTF_8",
 * "utf 8" and "utf-8" all resolve to "utf-8".
 *
 * @param charset - Charset label as found in a header or meta tag.
 * @returns The mapped label, or `charset` unchanged when no alias is known.
 */
function normalizeCharset(charset: string): string {
  const key = charset.toLowerCase().replace(/[-_\s]/g, "");
  return CHARSET_ALIASES.get(key) ?? charset;
}
103
+
104
/**
 * Ordered cleanup rules applied by `cleanEncodingArtifacts`.
 *
 * Every non-ASCII character is written as an escape so the patterns survive
 * re-encoding of this source file.
 *
 * NOTE(review): several accented-letter rules previously replaced a character
 * with itself (a no-op). They are expressed here as the two-character sequences
 * produced when UTF-8 bytes are decoded as Latin-1/Windows-1252, which is
 * presumably what was intended - confirm against real-world samples.
 */
const ENCODING_CLEANUP_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  // Control and invisible characters (tab, LF and CR are kept).
  [/\x00/g, ""],
  [/\x7F/g, ""],
  [/[\x01-\x08\x0B\x0C\x0E-\x1F]/g, ""],
  [/\uFFFD/g, ""], // replacement character left by lossy decoding
  [/\uFEFF/g, ""], // byte-order mark
  // NOTE(review): U+200C/U+200D are meaningful in some scripts and emoji
  // sequences; removal is kept as-is for backward compatibility.
  [/[\u200B-\u200D\u2060]/g, ""],

  // Mis-decoded punctuation (UTF-8 read as Windows-1252 / Latin-1).
  [/\u00E2\u20AC\u2122/g, "'"], // right single quote
  [/\u00E2\u20AC\u0153/g, '"'], // left double quote
  [/\u2019/g, "'"], // existing behavior: curly apostrophe -> straight
  [/\u201C/g, '"'], // existing behavior: left curly quote -> straight
  [/\u00E2\u20AC\x9D/g, '"'], // right double quote
  [/\u00E2\u20AC["\u201D]/g, "\u2014"], // em dash
  [/\u00E2\u20AC\x93/g, "\u2013"], // en dash
  [/\u00C2[ \u00A0]/g, " "], // stray A-circumflex before a (non-breaking) space
  [/\u00E2\u20AC?\u00A2/g, "\u2022"], // bullet

  // Mis-decoded accented letters.
  [/\u00C3\u00A9/g, "\u00E9"], // e acute
  [/\u00C3\u00A1/g, "\u00E1"], // a acute
  [/\u00C3\u00AD/g, "\u00ED"], // i acute
  [/\u00C3\u00B3/g, "\u00F3"], // o acute
  [/\u00C3\u00BA/g, "\u00FA"], // u acute
  [/\u00C3\u00B1/g, "\u00F1"], // n tilde
  [/\u00C3\x87/g, "\u00C7"], // C cedilla

  // Finally drop what is left of U+0080..U+009F (C1 controls).
  [/[^\x00-\x7F\u00A0-\uFFFF]/g, ""],
];

/**
 * Clean up common encoding artifacts and problematic characters.
 *
 * Rules run in the order listed in `ENCODING_CLEANUP_RULES`; order matters
 * because the mojibake patterns contain characters that later rules remove.
 *
 * @param text - Decoded text that may contain control characters or mojibake.
 * @returns The cleaned text.
 */
function cleanEncodingArtifacts(text: string): string {
  return ENCODING_CLEANUP_RULES.reduce(
    (cleaned, [pattern, replacement]) => cleaned.replace(pattern, replacement),
    text,
  );
}
131
+
132
+ // === Configuration ===
133
/**
 * Defaults and limits shared by the fetch helpers in this module.
 * All durations are in milliseconds and all sizes in bytes.
 */
const FETCH_CONFIG = {
  /** Default `timeout` for plain HTTP fetches. */
  SIMPLE_FETCH_TIMEOUT: 30000,
  /** Default `timeout` when `fullRender` is requested. */
  FULL_RENDER_TIMEOUT: 60000,
  /** Navigation timeout used for the headless-browser page load. */
  PAGE_LOAD_TIMEOUT: 45000,
  /** Default upper bound on the size of a fetched page. */
  MAX_CONTENT_SIZE: 50 * 1024 * 1024, // 50MB
  // NOTE(review): not read by the fetch helpers in this file - confirm whether it is used elsewhere.
  MAX_REDIRECTS: 10,
  /** Default total number of attempts made by the retry logic. */
  MAX_RETRIES: 3,
  /** Base delay for exponential backoff between attempts. */
  RETRY_DELAY: 1000,
  /**
   * Known User-Agent strings; index 0 is the default.
   * The default uses the conventional `product/version (comment)` form; the
   * previous value carried a stray `[](...)` markdown fragment, which is not
   * valid User-Agent syntax.
   */
  USER_AGENTS: [
    "glance-cli/1.0 (+https://github.com/jkenley/glance-cli)",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  ],
  /** Media types accepted by content-type validation. */
  VALID_CONTENT_TYPES: [
    "text/html",
    "application/xhtml+xml",
    "application/xml",
    "text/plain",
  ],
} as const;
153
+
154
+ // === Custom Error Types ===
155
/**
 * Error raised by the fetch helpers, carrying a machine-readable code and
 * user-facing text alongside the technical message.
 */
class FetchError extends Error {
  /** Machine-readable category, e.g. "TIMEOUT" or "NOT_FOUND". */
  public code: string;
  /** Short text suitable for showing to the user. */
  public userMessage: string;
  /** Whether the retry logic may try the request again. */
  public recoverable: boolean;
  /** Optional suggestion on how to resolve the problem. */
  public hint?: string;
  /** HTTP status code, when the error came from an HTTP response. */
  public statusCode?: number;

  constructor(
    message: string,
    code: string,
    userMessage: string,
    recoverable: boolean = false,
    hint?: string,
    statusCode?: number,
  ) {
    super(message);
    this.code = code;
    this.userMessage = userMessage;
    this.recoverable = recoverable;
    this.hint = hint;
    this.statusCode = statusCode;
    this.name = "FetchError";
  }
}
168
+
169
+ // === Fetch Options ===
170
/**
 * Caller-tunable settings for `fetchPage`. Every field is optional;
 * `fetchPage` fills in a default for anything left unset.
 */
export interface FetchOptions {
  /** Render the page in a headless browser instead of a plain HTTP fetch. Defaults to `false`. */
  fullRender?: boolean;

  /**
   * Timeout in milliseconds. Defaults to 30 000 for plain fetches and
   * 60 000 when `fullRender` is set.
   */
  timeout?: number;

  /** User-Agent to send. Defaults to the glance-cli User-Agent. */
  userAgent?: string;

  /** Follow HTTP redirects on plain fetches. Defaults to `true`. */
  followRedirects?: boolean;

  /** Maximum accepted page size in bytes. Defaults to 50 MB. */
  maxSize?: number;

  /** Navigation event to wait for during a full render. Defaults to `"networkidle2"`. */
  waitUntil?:
    | "networkidle2"
    | "networkidle0"
    | "load"
    | "domcontentloaded";

  /** Total number of attempts before giving up. Defaults to 3. */
  maxRetries?: number;
}
179
+
180
+ // === Validation Functions ===
181
+
182
/**
 * Ensure `url` is a non-empty, parseable string with an http: or https: scheme.
 *
 * @param url - Candidate URL.
 * @throws FetchError with code INVALID_URL, MALFORMED_URL or INVALID_PROTOCOL
 *   (never recoverable).
 */
function validateURL(url: string): void {
  const isNonEmptyString = typeof url === "string" && url !== "";
  if (!isNonEmptyString) {
    throw new FetchError(
      "Invalid URL: empty or not a string",
      "INVALID_URL",
      "Invalid URL provided",
      false,
      "URL must be a non-empty string starting with http:// or https://",
    );
  }

  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch (err: unknown) {
    // The URL constructor throws on input it cannot parse.
    throw new FetchError(
      `Malformed URL: ${(err as Error).message}`,
      "MALFORMED_URL",
      "Invalid URL format",
      false,
      "URL must be properly formatted (e.g., https://example.com)",
    );
  }

  if (protocol !== "http:" && protocol !== "https:") {
    throw new FetchError(
      `Invalid protocol: ${protocol}`,
      "INVALID_PROTOCOL",
      "Invalid URL protocol",
      false,
      "URL must start with http:// or https://",
    );
  }
}
214
+
215
/**
 * Reject responses whose Content-Type is not one of the accepted media types.
 * A missing header is accepted. Parameters after ";" are ignored and matching
 * is a case-insensitive substring test against each accepted type.
 *
 * @param contentType - Raw Content-Type header value, or null when absent.
 * @throws FetchError with code INVALID_CONTENT_TYPE.
 */
function validateContentType(contentType: string | null): void {
  if (!contentType) return;

  const [mediaType] = contentType.split(";");
  const type = mediaType?.trim().toLowerCase() || "";

  for (const accepted of FETCH_CONFIG.VALID_CONTENT_TYPES) {
    if (type.includes(accepted)) return;
  }

  throw new FetchError(
    `Invalid content type: ${contentType}`,
    "INVALID_CONTENT_TYPE",
    "Page is not HTML",
    false,
    `Expected HTML but got ${type}. This URL may point to a file download or API endpoint.`,
  );
}
228
+
229
/**
 * Throw when a payload is larger than the allowed maximum.
 *
 * @param sizeBytes - Observed size in bytes. A NaN value is not treated as too large.
 * @param maxSize - Largest accepted size in bytes.
 * @throws FetchError with code CONTENT_TOO_LARGE.
 */
function validateContentSize(sizeBytes: number, maxSize: number): void {
  // Written as a negated ">" so NaN falls through as "within limits".
  if (!(sizeBytes > maxSize)) return;

  const toMB = (bytes: number): string => (bytes / 1024 / 1024).toFixed(1);
  const sizeMB = toMB(sizeBytes);
  const maxMB = toMB(maxSize);

  throw new FetchError(
    `Content too large: ${sizeMB}MB`,
    "CONTENT_TOO_LARGE",
    "Page is too large",
    false,
    `Page size (${sizeMB}MB) exceeds maximum (${maxMB}MB).`,
  );
}
242
+
243
/**
 * Pick the User-Agent to send.
 *
 * @param custom - Caller-supplied User-Agent; an empty string counts as not supplied.
 * @returns `custom` when given, otherwise the first entry of `FETCH_CONFIG.USER_AGENTS`
 *   (the glance-cli identifier).
 */
function getUserAgent(custom?: string): string {
  return custom ? custom : FETCH_CONFIG.USER_AGENTS[0];
}
248
+
249
+ // === Simple Fetch Implementation ===
250
+
251
/** Build the FetchError that corresponds to a non-2xx HTTP status. */
function httpStatusToError(statusCode: number, statusText: string): FetchError {
  switch (statusCode) {
    case 404:
      return new FetchError(
        `HTTP 404: Not Found`,
        "NOT_FOUND",
        "Page not found",
        false,
        "The URL doesn't exist. Check for typos.",
        404,
      );
    case 403:
      return new FetchError(
        `HTTP 403: Forbidden`,
        "FORBIDDEN",
        "Access denied",
        false,
        "The server is blocking access. Try with --full-render.",
        403,
      );
    case 401:
      return new FetchError(
        `HTTP 401: Unauthorized`,
        "UNAUTHORIZED",
        "Authentication required",
        false,
        "This page requires login.",
        401,
      );
    case 429:
      return new FetchError(
        `HTTP 429: Too Many Requests`,
        "RATE_LIMITED",
        "Rate limited",
        true,
        "The server is rate limiting requests. Wait a moment.",
        429,
      );
    default:
      break;
  }

  if (statusCode >= 500) {
    return new FetchError(
      `HTTP ${statusCode}: ${statusText}`,
      "SERVER_ERROR",
      "Server error",
      true,
      "The server is experiencing issues. Try again later.",
      statusCode,
    );
  }

  // Remaining statuses are below 500 here, so they are never retried.
  return new FetchError(
    `HTTP ${statusCode}: ${statusText}`,
    "HTTP_ERROR",
    `HTTP ${statusCode} error`,
    false,
    undefined,
    statusCode,
  );
}

/** Translate a failure thrown by fetch() or the body read into a categorized FetchError. */
function classifyNetworkError(
  err: unknown,
  timedOut: boolean,
  timeoutMs: number,
): FetchError {
  const details = (err ?? {}) as {
    name?: unknown;
    message?: unknown;
    cause?: { code?: unknown } | null;
  };
  const message =
    typeof details.message === "string" ? details.message : String(err);
  // Only string codes are inspected; anything else is treated as "no code".
  const code =
    typeof details.cause?.code === "string" ? details.cause.code : "";

  if (timedOut || details.name === "AbortError") {
    return new FetchError(
      "Request timeout",
      "TIMEOUT",
      "Request timed out",
      true,
      `Page took too long (>${timeoutMs / 1000}s).`,
    );
  }

  // Specific cause codes are checked before the generic "fetch failed" text.
  // Testing the message first would shadow every branch below whenever the
  // runtime reports all network failures with that same message.
  if (code === "ENOTFOUND") {
    return new FetchError(
      message,
      "DNS_ERROR",
      "Cannot resolve domain",
      false,
      "Check domain and internet connection.",
    );
  }
  if (code === "ECONNREFUSED") {
    return new FetchError(
      message,
      "CONNECTION_REFUSED",
      "Connection refused",
      false,
      "The server refused the connection.",
    );
  }
  if (code === "ECONNRESET" || message.includes("socket hang up")) {
    return new FetchError(
      message,
      "CONNECTION_RESET",
      "Connection reset",
      true,
      "The connection was reset.",
    );
  }
  if (code === "ETIMEDOUT") {
    return new FetchError(
      message,
      "NETWORK_TIMEOUT",
      "Network timeout",
      true,
      "Network connection timed out.",
    );
  }
  if (message.includes("certificate") || code.includes("CERT")) {
    return new FetchError(
      message,
      "SSL_ERROR",
      "SSL/TLS error",
      false,
      "Invalid SSL certificate.",
    );
  }
  if (message.includes("fetch failed")) {
    // Kept from the previous behavior: an otherwise unclassified
    // "fetch failed" is reported as a DNS problem.
    return new FetchError(
      message,
      "DNS_ERROR",
      "Cannot resolve domain",
      false,
      "Check domain and internet connection.",
    );
  }

  return new FetchError(
    message,
    "NETWORK_ERROR",
    "Network error",
    true,
    "A network error occurred.",
  );
}

/**
 * Fetch a page with a plain HTTP request and return its decoded text.
 *
 * The timeout covers the whole operation - connection, headers and body
 * download - and its timer is always cleared on exit.
 *
 * @param url - Absolute http(s) URL; expected to have been validated by the caller.
 * @param options - Fully populated options; `timeout`, `userAgent`,
 *   `followRedirects` and `maxSize` are used here.
 * @returns The decoded page text.
 * @throws FetchError for HTTP error statuses, unsupported content types,
 *   oversize or empty bodies, timeouts and network failures. `recoverable`
 *   is set on the errors that are worth retrying.
 */
async function simpleFetch(
  url: string,
  options: Required<FetchOptions>,
): Promise<string> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), options.timeout);

  try {
    const response = await fetch(url, {
      headers: {
        "User-Agent": getUserAgent(options.userAgent),
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "no-cache",
        Pragma: "no-cache",
      },
      redirect: options.followRedirects ? "follow" : "manual",
      signal: controller.signal,
    });

    if (!response.ok) {
      throw httpStatusToError(response.status, response.statusText);
    }

    validateContentType(response.headers.get("content-type"));

    // Cheap early rejection when the server announces the size up front.
    const contentLength = response.headers.get("content-length");
    if (contentLength) {
      validateContentSize(parseInt(contentLength, 10), options.maxSize);
    }

    // The abort timer is still armed here, so a stalled body cannot hang forever.
    const html = await getTextWithProperEncoding(response);

    // Precise byte size validation (more accurate than string.length)
    const byteSize = new TextEncoder().encode(html).length;
    validateContentSize(byteSize, options.maxSize);

    if (!html.trim()) {
      throw new FetchError(
        "Empty response",
        "EMPTY_RESPONSE",
        "Page is empty",
        false,
        "The server returned an empty page. The URL may be incorrect.",
      );
    }

    return html;
  } catch (err: unknown) {
    if (err instanceof FetchError) throw err;
    // The signal is only ever aborted by the timer above, so `aborted` means
    // "timed out" even when the interrupted read surfaces a different error type.
    throw classifyNetworkError(err, controller.signal.aborted, options.timeout);
  } finally {
    clearTimeout(timeoutId);
  }
}
427
+
428
+ // === Full Render Implementation ===
429
+
430
/**
 * Load a page in headless Chromium via Puppeteer and return the rendered HTML.
 *
 * The page and browser are closed in a `finally` block, so cleanup happens
 * once on every path and a failure while closing cannot turn a successful
 * render into an error.
 *
 * @param url - Absolute http(s) URL; expected to have been validated by the caller.
 * @param options - Fully populated options; `timeout`, `userAgent`, `waitUntil`
 *   and `maxSize` are used here.
 * @param puppeteer - The dynamically imported puppeteer module.
 * @returns The serialized HTML of the rendered page.
 * @throws FetchError with code CONTENT_TOO_LARGE, EMPTY_PAGE, BROWSER_NOT_FOUND,
 *   BROWSER_LAUNCH_FAILED or RENDER_ERROR (only RENDER_ERROR is recoverable).
 */
async function fullRenderFetch(
  url: string,
  options: Required<FetchOptions>,
  puppeteer: typeof import("puppeteer"),
): Promise<string> {
  let browser: Browser | undefined;
  let page: Page | undefined;

  try {
    browser = await puppeteer.default.launch({
      headless: true,
      // NOTE(review): "--no-sandbox" and "--disable-web-security" weaken browser
      // isolation while loading arbitrary remote pages. They are kept for
      // compatibility; confirm they are really required.
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        "--disable-web-security",
        "--disable-features=IsolateOrigins,site-per-process",
        // Added for better performance in headless mode
        "--disable-background-timer-throttling",
        "--disable-renderer-backgrounding",
        "--disable-backgrounding-occluded-windows",
      ],
      timeout: 10000,
    });

    page = await browser.newPage();
    await page.setUserAgent(getUserAgent(options.userAgent));
    await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1 });

    // Navigation never waits longer than PAGE_LOAD_TIMEOUT, and a shorter
    // caller timeout is honored too. Puppeteer treats 0 as "no timeout", so a
    // non-positive value keeps the configured cap instead of removing it.
    const navigationTimeout =
      options.timeout > 0
        ? Math.min(options.timeout, FETCH_CONFIG.PAGE_LOAD_TIMEOUT)
        : FETCH_CONFIG.PAGE_LOAD_TIMEOUT;
    page.setDefaultTimeout(options.timeout);
    page.setDefaultNavigationTimeout(navigationTimeout);

    await page.goto(url, {
      waitUntil: options.waitUntil,
      timeout: navigationTimeout,
    });

    const html = await page.content();
    const byteSize = new TextEncoder().encode(html).length;
    validateContentSize(byteSize, options.maxSize);

    if (!html.trim()) {
      throw new FetchError(
        "Empty page",
        "EMPTY_PAGE",
        "Page is empty",
        false,
        "The page rendered but is empty.",
      );
    }

    return html;
  } catch (err: unknown) {
    if (err instanceof FetchError) throw err;

    const message = err instanceof Error ? err.message : String(err);

    if (message.includes("Could not find") || message.includes("Chromium")) {
      throw new FetchError(
        message,
        "BROWSER_NOT_FOUND",
        "Chromium not found",
        false,
        "Reinstall: bun install puppeteer",
      );
    }
    if (message.includes("Failed to launch")) {
      throw new FetchError(
        message,
        "BROWSER_LAUNCH_FAILED",
        "Failed to launch browser",
        false,
        "Check system resources.",
      );
    }
    throw new FetchError(
      message,
      "RENDER_ERROR",
      "Page rendering failed",
      true,
      "Full render failed.",
    );
  } finally {
    // Best-effort cleanup: close errors are ignored so they never mask the outcome.
    if (page) await page.close().catch(() => {});
    if (browser) await browser.close().catch(() => {});
  }
}
522
+
523
+ // === Retry Logic ===
524
+
525
/**
 * Run the configured fetch strategy, retrying recoverable failures with
 * exponential backoff (RETRY_DELAY, then doubled after each failed attempt).
 *
 * @param url - URL to fetch.
 * @param options - Fully populated options; `fullRender` selects the strategy
 *   and `maxRetries` caps the total number of attempts.
 * @param attempt - 1-based number of the first attempt made by this call.
 * @returns The fetched page text.
 * @throws The last error once attempts are exhausted, or immediately when a
 *   FetchError is marked as not recoverable.
 */
async function fetchWithRetry(
  url: string,
  options: Required<FetchOptions>,
  attempt: number = 1,
): Promise<string> {
  for (let current = attempt; ; current += 1) {
    try {
      if (!options.fullRender) {
        return await simpleFetch(url, options);
      }
      // Puppeteer is loaded lazily so plain fetches never pay for it.
      const puppeteer = await import("puppeteer");
      return await fullRenderFetch(url, options, puppeteer);
    } catch (err: unknown) {
      const isFatal = err instanceof FetchError && !err.recoverable;
      if (isFatal || current >= options.maxRetries) throw err;

      const backoffMs = FETCH_CONFIG.RETRY_DELAY * 2 ** (current - 1);
      await new Promise((resolve) => setTimeout(resolve, backoffMs));
    }
  }
}
545
+
546
+ // === Main Export Function ===
547
+
548
/**
 * Fetch a page and return its HTML, applying defaults for any option left unset.
 *
 * @param url - Absolute http(s) URL.
 * @param options - Optional overrides; see `FetchOptions`.
 * @returns The page HTML.
 * @throws FetchError when the URL is invalid or the fetch ultimately fails.
 */
export async function fetchPage(
  url: string,
  options: FetchOptions = {},
): Promise<string> {
  validateURL(url);

  // The default timeout depends on which strategy was requested.
  const fallbackTimeout = options.fullRender
    ? FETCH_CONFIG.FULL_RENDER_TIMEOUT
    : FETCH_CONFIG.SIMPLE_FETCH_TIMEOUT;

  const resolved: Required<FetchOptions> = {
    fullRender: options.fullRender ?? false,
    timeout: options.timeout ?? fallbackTimeout,
    userAgent: options.userAgent ?? getUserAgent(),
    followRedirects: options.followRedirects ?? true,
    maxSize: options.maxSize ?? FETCH_CONFIG.MAX_CONTENT_SIZE,
    waitUntil: options.waitUntil ?? "networkidle2",
    maxRetries: options.maxRetries ?? FETCH_CONFIG.MAX_RETRIES,
  };

  return fetchWithRetry(url, resolved);
}
570
+
571
/**
 * Probe a URL with a HEAD request (10 second timeout) without downloading the body.
 *
 * This function never throws: validation errors, network failures and
 * timeouts are all reported through the returned object.
 *
 * @param url - URL to probe.
 * @returns `accessible` and `status` when a response was received; otherwise
 *   `accessible: false` and an `error` message.
 */
export async function checkURL(url: string): Promise<{
  accessible: boolean;
  status?: number;
  error?: string;
}> {
  let timeoutId: ReturnType<typeof setTimeout> | undefined;

  try {
    validateURL(url);

    const controller = new AbortController();
    timeoutId = setTimeout(() => controller.abort(), 10000);

    const response = await fetch(url, {
      method: "HEAD",
      headers: { "User-Agent": getUserAgent() },
      signal: controller.signal,
    });

    return { accessible: response.ok, status: response.status };
  } catch (err: unknown) {
    return {
      accessible: false,
      error: err instanceof Error ? err.message : String(err),
    };
  } finally {
    // Clear the timer on failure too; a pending timer would otherwise keep
    // the process alive for up to 10 seconds after the request had failed.
    if (timeoutId !== undefined) clearTimeout(timeoutId);
  }
}