@pi-unipi/web-api 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,642 @@
1
+ /**
2
+ * @unipi/web-api — Core Extraction Pipeline
3
+ *
4
+ * The heart of the smart-fetch engine:
5
+ * URL validation → wreq-js fetch → content-type routing → defuddle extraction → fallbacks.
6
+ */
7
+
8
+ import type {
9
+ FetchResult,
10
+ FetchError,
11
+ FetchOptions,
12
+ FetchProgress,
13
+ FetchExecutionHooks,
14
+ BatchFetchResult,
15
+ BatchFetchItemResult,
16
+ FetchProgressStatus,
17
+ } from "./types.js";
18
+ import {
19
+ DEFAULT_BROWSER,
20
+ DEFAULT_OS,
21
+ DEFAULT_FORMAT,
22
+ DEFAULT_MAX_CHARS,
23
+ DEFAULT_TIMEOUT_MS,
24
+ DEFAULT_REMOVE_IMAGES,
25
+ DEFAULT_INCLUDE_REPLIES,
26
+ DEFAULT_HEADERS,
27
+ DEFAULT_BATCH_CONCURRENCY,
28
+ } from "./constants.js";
29
+ import { resolveBrowserProfile, resolveOSProfile } from "./profiles.js";
30
+ import { getWreq, getDefuddle, getMimeTypes } from "./dependencies.js";
31
+ import { parseHTML, extractTextContent, elementToMarkdown } from "./dom.js";
32
+ import { truncateContent, formatContent } from "./format.js";
33
+
/** Upper bound on the number of meta-refresh hops followed for one fetch. */
const MAX_REDIRECTS = 5;

/** Upper bound on the number of alternate-link fallbacks that are collected. */
const MAX_ALTERNATE_LINKS = 3;
39
+
/**
 * Parse a URL string and confirm that it can be fetched.
 *
 * Only the `http:` and `https:` schemes are accepted.
 *
 * @param url - Candidate URL string
 * @returns The parsed URL object
 * @throws FetchError with code `invalid_url` when the string cannot be parsed,
 *   or `unsupported_protocol` when the scheme is anything other than http(s)
 */
function validateUrl(url: string): URL {
  const allowedProtocols = ["http:", "https:"];

  let candidate: URL;
  try {
    candidate = new URL(url);
  } catch {
    throw createError("invalid_url", "validation", `Invalid URL format: ${url}`, false);
  }

  // Guard clause: accepted schemes return straight away.
  if (allowedProtocols.includes(candidate.protocol)) {
    return candidate;
  }

  throw createError(
    "unsupported_protocol",
    "validation",
    `Unsupported protocol: ${candidate.protocol}. Only http and https are supported.`,
    false
  );
}
72
+
/**
 * Build a FetchError object.
 *
 * The four core fields are written first; anything supplied in `extra` is
 * spread afterwards, so `extra` may add fields or override the core ones.
 *
 * @param code - Error code
 * @param phase - Pipeline phase in which the failure occurred
 * @param message - Human-readable description, stored under `error`
 * @param retryable - Whether the caller may reasonably retry
 * @param extra - Additional FetchError fields to merge in
 * @returns The assembled FetchError
 */
function createError(
  code: FetchError["code"],
  phase: FetchError["phase"],
  message: string,
  retryable: boolean,
  extra: Partial<FetchError> = {}
): FetchError {
  const core = { error: message, code, phase, retryable };
  return { ...core, ...extra };
}
91
+
/**
 * Build a FetchResult object.
 *
 * Missing or empty metadata fields fall back to an empty string, except
 * `format` (falls back to "markdown") and `mimeType` (falls back to
 * "text/html"). `wordCount` is the number of whitespace-separated tokens
 * in `content`.
 *
 * @param url - URL originally requested
 * @param finalUrl - URL the content was ultimately read from
 * @param content - Extracted content
 * @param metadata - Optional metadata fields to copy into the result
 * @returns The assembled FetchResult
 */
function createResult(
  url: string,
  finalUrl: string,
  content: string,
  metadata: Partial<FetchResult> = {}
): FetchResult {
  const { title, author, published, site, language, format, mimeType } = metadata;
  const words = content.split(/\s+/).filter((token) => token.length > 0);

  return {
    url,
    finalUrl,
    title: title || "",
    author: author || "",
    published: published || "",
    site: site || "",
    language: language || "",
    wordCount: words.length,
    content,
    format: format || "markdown",
    mimeType: mimeType || "text/html",
  };
}
115
+
/**
 * Collect page metadata, preferring defuddle's output and filling gaps from
 * the DOM.
 *
 * Fallback order per field:
 * - title: defuddle → `<title>` text (trimmed) → `og:title` meta
 * - author: defuddle → `meta[name="author"]`
 * - site: defuddle (`site` or `siteName`) → `og:site_name` meta
 * - language: defuddle → `lang` attribute of `<html>`
 * - published: defuddle only (`published` or `date`); left unset when no
 *   defuddle result is supplied
 *
 * @param defuddleResult - Result object from defuddle, or a falsy value
 * @param document - DOM document used for the fallbacks
 * @returns Metadata fields discovered for the page
 */
function extractMetadata(
  defuddleResult: any,
  document: Document
): Partial<FetchResult> {
  // Reads the `content` attribute of the first element matching a selector.
  const metaContent = (selector: string): string =>
    document.querySelector(selector)?.getAttribute("content") || "";

  const metadata: Partial<FetchResult> = {};

  if (defuddleResult) {
    metadata.title = defuddleResult.title || "";
    metadata.author = defuddleResult.author || "";
    metadata.published = defuddleResult.published || defuddleResult.date || "";
    metadata.site = defuddleResult.site || defuddleResult.siteName || "";
    metadata.language = defuddleResult.language || "";
  }

  if (!metadata.title) {
    // <title> wins over og:title; the og lookup only runs when <title> is empty.
    metadata.title =
      document.querySelector("title")?.textContent?.trim() ||
      metaContent('meta[property="og:title"]');
  }

  if (!metadata.author) {
    metadata.author = metaContent('meta[name="author"]');
  }

  if (!metadata.site) {
    metadata.site = metaContent('meta[property="og:site_name"]');
  }

  if (!metadata.language) {
    metadata.language = document.querySelector("html")?.getAttribute("lang") || "";
  }

  return metadata;
}
166
+
/**
 * Look for a client-side `<meta http-equiv="refresh">` redirect.
 *
 * Accepts content values such as `0;url=https://example.com` or
 * `0; URL='https://example.com'`.
 *
 * @param document - DOM document to inspect
 * @returns The redirect target exactly as written in the attribute, or
 *   `null` when there is no refresh tag, no content, or no `url=` part
 */
function findMetaRefresh(document: Document): string | null {
  const refreshContent = document
    .querySelector('meta[http-equiv="refresh"]')
    ?.getAttribute("content");

  if (!refreshContent) {
    return null;
  }

  const urlMatch = /url\s*=\s*['"]?([^'"\s]+)['"]?/i.exec(refreshContent);
  return urlMatch ? urlMatch[1] : null;
}
195
+
/**
 * Collect `<link rel="alternate">` targets that advertise JSON content.
 *
 * Matches links typed `application/json` or `application/ld+json`, skips
 * links without a usable `href`, and keeps at most MAX_ALTERNATE_LINKS
 * entries in document order.
 *
 * @param document - DOM document to inspect
 * @returns The `href` values exactly as written in the markup
 */
function findAlternateLinks(document: Document): string[] {
  const selector = [
    'link[rel="alternate"][type="application/json"]',
    'link[rel="alternate"][type="application/ld+json"]',
  ].join(", ");

  return Array.from(document.querySelectorAll(selector))
    .map((link) => link.getAttribute("href"))
    .filter((href): href is string => Boolean(href))
    .slice(0, MAX_ALTERNATE_LINKS);
}
220
+
/**
 * Derive the MIME type of a response and decide whether it is binary.
 *
 * The MIME type is the `content-type` header with any parameters (such as
 * `charset`) removed, trimmed and lower-cased; it is an empty string when
 * the header is absent. A type counts as binary when it starts with one of
 * the known binary prefixes.
 *
 * @param response - Response whose headers are inspected
 * @param buffer - Response body; accepted for interface compatibility and
 *   not examined
 * @returns The normalised MIME type and the binary flag
 */
function detectContentType(
  response: Response,
  buffer: ArrayBuffer
): { mimeType: string; isBinary: boolean } {
  const binaryPrefixes = [
    "application/octet-stream",
    "application/pdf",
    "application/zip",
    "application/x-",
    "image/",
    "video/",
    "audio/",
    "font/",
  ];

  const header = response.headers.get("content-type") || "";
  const [essence] = header.split(";");
  const mimeType = essence.trim().toLowerCase();

  const matchedPrefix = binaryPrefixes.find((prefix) => mimeType.startsWith(prefix));

  return { mimeType, isBinary: matchedPrefix !== undefined };
}
247
+
/**
 * Decide whether a thrown value is a FetchError produced by this pipeline.
 *
 * A bare truthiness test on `code` is not enough: native Node.js errors
 * (for example a refused connection) also expose a `code` property and
 * would otherwise be rethrown unclassified. A value is accepted when it has
 * a truthy `code` and either carries a `phase` or is not an `Error` instance.
 */
function isFetchError(error: unknown): error is FetchError {
  if (typeof error !== "object" || error === null) {
    return false;
  }
  const candidate = error as { code?: unknown; phase?: unknown };
  if (!candidate.code) {
    return false;
  }
  return candidate.phase !== undefined || !(error instanceof Error);
}

/** Produce a message string from any thrown value without assuming it is an Error. */
function describeThrown(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === "object" && error !== null) {
    const message = (error as { message?: unknown }).message;
    if (typeof message === "string") {
      return message;
    }
  }
  return String(error);
}

/**
 * The main fetch + extraction pipeline.
 *
 * Steps: validate the URL, fetch it through wreq-js, route on the detected
 * content type (binary placeholder, JSON, plain text, HTML), follow
 * meta-refresh redirects for HTML, extract content with defuddle, and fall
 * back to DOM extraction when defuddle fails or yields nothing.
 *
 * Every successful path reports a final "done" progress event.
 *
 * @param url - URL to fetch
 * @param options - Fetch options; unset fields use the module defaults
 * @param hooks - Execution hooks; `onProgress` receives progress updates
 * @returns Fetch result
 * @throws FetchError for validation failures, non-OK HTTP statuses,
 *   timeouts, network failures, unexpected responses, and when more than
 *   MAX_REDIRECTS meta-refresh hops are encountered
 */
export async function defuddleFetch(
  url: string,
  options: FetchOptions = {},
  hooks?: FetchExecutionHooks
): Promise<FetchResult> {
  const {
    browser = DEFAULT_BROWSER,
    os = DEFAULT_OS,
    format = DEFAULT_FORMAT,
    maxChars = DEFAULT_MAX_CHARS,
    timeoutMs = DEFAULT_TIMEOUT_MS,
    removeImages = DEFAULT_REMOVE_IMAGES,
    includeReplies = DEFAULT_INCLUDE_REPLIES,
    proxy,
    headers: customHeaders,
  } = options;

  // Forward a progress snapshot to the optional hook.
  const updateProgress = (
    status: FetchProgressStatus,
    percent: number = 0,
    phase: string = "",
    bytesLoaded: number = 0,
    bytesTotal: number = 0
  ): void => {
    hooks?.onProgress?.({
      url,
      status,
      percent,
      bytesLoaded,
      bytesTotal,
      phase,
    });
  };

  let finalUrl = url;
  let redirectCount = 0;

  // Validate URL
  updateProgress("connecting", 0, "validation");
  try {
    validateUrl(url);
  } catch (error) {
    if (isFetchError(error)) {
      throw error;
    }
    throw createError("invalid_url", "validation", describeThrown(error), false, {
      url,
    });
  }

  // Get wreq-js
  const wreq = await getWreq();

  // Build request options
  const resolvedBrowser = resolveBrowserProfile(browser);
  const resolvedOS = resolveOSProfile(os);

  // Caller-supplied headers override the defaults.
  const requestHeaders = {
    ...DEFAULT_HEADERS,
    ...customHeaders,
  };

  // Main fetch loop (handles meta refresh redirects)
  while (redirectCount < MAX_REDIRECTS) {
    updateProgress("connecting", 10, "connecting");

    try {
      const response = await wreq.fetch(finalUrl, {
        browser: resolvedBrowser,
        os: resolvedOS,
        timeout: timeoutMs,
        proxy,
        headers: requestHeaders,
      });

      updateProgress("waiting", 30, "waiting");

      // Server errors and 429 are marked retryable; other statuses are not.
      if (!response.ok) {
        throw createError(
          "http_error",
          "waiting",
          `HTTP error: ${response.status} ${response.statusText}`,
          response.status >= 500 || response.status === 429,
          {
            url,
            finalUrl,
            statusCode: response.status,
            statusText: response.statusText,
          }
        );
      }

      updateProgress("loading", 40, "loading");

      // Get response body
      const buffer = await response.arrayBuffer();

      // content-length may be missing, malformed, or smaller than the body
      // that was read; never report NaN or a total below the bytes loaded.
      const declaredLength = Number.parseInt(
        response.headers.get("content-length") ?? "",
        10
      );
      const bytesTotal = Number.isFinite(declaredLength)
        ? Math.max(declaredLength, buffer.byteLength)
        : buffer.byteLength;

      updateProgress("loading", 60, "loading", buffer.byteLength, bytesTotal);

      // Detect content type
      const { mimeType, isBinary } = detectContentType(response, buffer);

      // Handle binary content
      if (isBinary) {
        updateProgress("processing", 80, "processing");
        updateProgress("done", 100, "done", bytesTotal, bytesTotal);

        // For binary files, return a placeholder with metadata
        return createResult(url, finalUrl, `[Binary file: ${mimeType}]`, {
          mimeType,
          format,
        });
      }

      // Handle JSON, including structured-suffix types such as
      // application/ld+json that would otherwise be parsed as HTML.
      if (mimeType === "application/json" || mimeType.endsWith("+json")) {
        updateProgress("processing", 80, "processing");
        const text = new TextDecoder().decode(buffer);
        const json: unknown = JSON.parse(text);
        const content = JSON.stringify(json, null, 2);
        const truncated = truncateContent(content, maxChars);

        updateProgress("done", 100, "done", bytesTotal, bytesTotal);

        return createResult(url, finalUrl, truncated, {
          mimeType,
          format: "json",
        });
      }

      // Handle plain text
      if (mimeType.startsWith("text/plain")) {
        updateProgress("processing", 80, "processing");
        const text = new TextDecoder().decode(buffer);
        const truncated = truncateContent(text, maxChars);

        updateProgress("done", 100, "done", bytesTotal, bytesTotal);

        return createResult(url, finalUrl, truncated, {
          mimeType,
          format: "text",
        });
      }

      // Handle HTML
      updateProgress("processing", 70, "processing");

      const html = new TextDecoder().decode(buffer);
      const { document, window } = parseHTML(html);

      // Check for meta refresh redirect
      const redirectUrl = findMetaRefresh(document);
      if (redirectUrl) {
        // Resolve relative URLs against the page that declared the refresh.
        const target = new URL(redirectUrl, finalUrl).href;

        // A refresh that points back at the current page is a periodic
        // reload, not a redirect; following it would only burn the redirect
        // budget and end in too_many_redirects. Extract this page instead.
        if (target !== new URL(finalUrl).href) {
          // The target comes from untrusted markup: enforce http(s) only.
          validateUrl(target);
          redirectCount++;
          finalUrl = target;
          continue; // Loop to fetch the redirect target
        }
      }

      // Try defuddle extraction
      let content: string;
      let metadata: Partial<FetchResult> = {};

      try {
        const defuddle = await getDefuddle();

        // Normalise includeReplies to true, "extractors", or false.
        const defuddleOptions = {
          removeImages,
          includeReplies:
            includeReplies === true
              ? true
              : includeReplies === "extractors"
                ? "extractors"
                : false,
        };

        const defuddleResult = await defuddle(window, defuddleOptions);

        if (defuddleResult?.content) {
          content = defuddleResult.content;
          metadata = extractMetadata(defuddleResult, document);
        } else {
          // Fallback to DOM extraction
          content = fallbackExtraction(document);
          metadata = extractMetadata(null, document);
        }
      } catch (_defuddleError) {
        // Deliberate best-effort: a defuddle failure downgrades to plain DOM
        // extraction rather than failing the whole fetch.
        content = fallbackExtraction(document);
        metadata = extractMetadata(null, document);
      }

      // Truncate content
      content = truncateContent(content, maxChars);

      // Format content based on requested format
      const formattedContent = formatContent(
        createResult(url, finalUrl, content, metadata),
        format,
        maxChars
      );

      updateProgress("done", 100, "done", bytesTotal, bytesTotal);

      return createResult(url, finalUrl, formattedContent, {
        ...metadata,
        mimeType,
        format,
      });
    } catch (error) {
      // Errors already shaped by this pipeline pass through untouched.
      if (isFetchError(error)) {
        throw error;
      }

      const message = describeThrown(error);
      const nativeCode =
        error instanceof Error
          ? String((error as { code?: unknown }).code ?? "")
          : "";

      // Classify error
      if (message.includes("timeout")) {
        throw createError("timeout", "waiting", message, true, {
          url,
          finalUrl,
          timeoutMs,
        });
      }

      if (
        message.includes("network") ||
        message.includes("ECONNREFUSED") ||
        nativeCode === "ECONNREFUSED"
      ) {
        throw createError("network_error", "connecting", message, true, {
          url,
          finalUrl,
        });
      }

      throw createError("unexpected_response", "loading", message, false, {
        url,
        finalUrl,
      });
    }
  }

  // Too many redirects
  throw createError(
    "too_many_redirects",
    "processing",
    `Too many meta refresh redirects (${redirectCount})`,
    false,
    { url, finalUrl }
  );
}
501
+
/**
 * Fallback content extraction from DOM.
 *
 * Tries, in order:
 * 1. the first element in the document matching a main-content selector
 *    (`article`, `main`, `[role='main']`, `.content`, `#content`), converted
 *    to markdown;
 * 2. the `<body>` element, converted to markdown;
 * 3. the text content of the document element.
 *
 * @param document - DOM document to extract from
 * @returns Extracted content
 */
function fallbackExtraction(document: Document): string {
  // Try article content first
  const article = document.querySelector("article, main, [role='main'], .content, #content");
  if (article) {
    return elementToMarkdown(article);
  }

  // Fall back to body. No second search for main/article inside the body is
  // needed: the document-wide query above already covers those selectors,
  // so such a search could never find anything at this point.
  const body = document.querySelector("body");
  if (body) {
    return elementToMarkdown(body);
  }

  // Last resort: full document text
  return extractTextContent(document.documentElement);
}
527
+
/**
 * Fetch multiple URLs with bounded concurrency.
 *
 * Each URL is fetched with `defuddleFetch`. A failure for one URL is
 * recorded in its item and does not abort the batch. The returned promise
 * settles only after every URL has been processed, and `items[i]` always
 * corresponds to `urls[i]`.
 *
 * @param urls - URLs to fetch
 * @param options - Fetch options, plus `batchConcurrency` (maximum number of
 *   fetches in flight; values below 1 or non-numeric values are treated as 1)
 * @param hooks - Execution hooks; `onUpdate` receives a snapshot of the
 *   progress of every URL whenever any of them changes
 * @returns Batch fetch result with per-URL items and success/failure counts
 */
export async function defuddleFetchMultiple(
  urls: string[],
  options: FetchOptions & { batchConcurrency?: number } = {},
  hooks?: FetchExecutionHooks
): Promise<BatchFetchResult> {
  const {
    batchConcurrency = DEFAULT_BATCH_CONCURRENCY,
    ...fetchOptions
  } = options;

  const items: BatchFetchItemResult[] = new Array(urls.length);
  const progress: FetchProgress[] = urls.map((url) => ({
    url,
    status: "queued" as FetchProgressStatus,
    percent: 0,
    bytesLoaded: 0,
    bytesTotal: 0,
    phase: "queued",
  }));

  // Hand listeners a copy so they cannot mutate the live progress table.
  const publish = (): void => {
    hooks?.onUpdate?.([...progress]);
  };

  // Fetch one URL and record its outcome; fetch failures are captured in
  // the item rather than propagated.
  const fetchWorker = async (index: number): Promise<void> => {
    const url = urls[index];

    progress[index] = {
      url,
      status: "connecting",
      percent: 0,
      bytesLoaded: 0,
      bytesTotal: 0,
      phase: "connecting",
    };
    publish();

    try {
      const result = await defuddleFetch(url, fetchOptions, {
        onProgress: (p) => {
          progress[index] = p;
          publish();
        },
      });

      items[index] = { status: "done", result };
      progress[index] = {
        url,
        status: "done",
        percent: 100,
        bytesLoaded: progress[index].bytesTotal,
        bytesTotal: progress[index].bytesTotal,
        phase: "done",
      };
    } catch (error) {
      // Native errors can also carry a `code`; only treat the value as a
      // FetchError when it has a `phase` or is not an Error instance.
      const hasCode =
        typeof error === "object" &&
        error !== null &&
        Boolean((error as { code?: unknown }).code);
      const looksLikeFetchError =
        hasCode &&
        ((error as { phase?: unknown }).phase !== undefined ||
          !(error instanceof Error));

      const fetchError = looksLikeFetchError
        ? (error as FetchError)
        : createError(
            "processing_error",
            "unknown",
            error instanceof Error ? error.message : String(error),
            false,
            { url }
          );

      items[index] = { status: "error", error: fetchError };
      progress[index] = {
        url,
        status: "error",
        percent: 0,
        bytesLoaded: 0,
        bytesTotal: 0,
        phase: "error",
        error: fetchError,
      };
    }

    publish();
  };

  // Bounded concurrency: a fixed set of lanes, each pulling the next unclaimed
  // index until none remain. Awaiting the lanes therefore awaits every URL.
  // (Chaining follow-up fetches onto an array already handed to Promise.all
  // does not work, because Promise.all only tracks the promises present when
  // it is called.)
  const requestedLanes = Math.floor(Number(batchConcurrency));
  const laneCount = Math.min(urls.length, requestedLanes >= 1 ? requestedLanes : 1);

  let nextIndex = 0;
  const runLane = async (): Promise<void> => {
    while (nextIndex < urls.length) {
      const index = nextIndex++;
      await fetchWorker(index);
    }
  };

  await Promise.all(Array.from({ length: laneCount }, () => runLane()));

  // Calculate statistics
  const succeeded = items.filter((item) => item.status === "done").length;
  const failed = items.filter((item) => item.status === "error").length;

  return {
    total: urls.length,
    succeeded,
    failed,
    items,
  };
}