@pi-unipi/web-api 0.1.14 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -114
- package/package.json +9 -2
- package/skills/web/SKILL.md +54 -11
- package/src/engine/constants.ts +36 -0
- package/src/engine/dependencies.ts +145 -0
- package/src/engine/dom.ts +266 -0
- package/src/engine/extract.ts +642 -0
- package/src/engine/format.ts +306 -0
- package/src/engine/profiles.ts +102 -0
- package/src/engine/types.ts +169 -0
- package/src/index.ts +9 -2
- package/src/providers/base.ts +9 -1
- package/src/settings.ts +70 -4
- package/src/tools.ts +281 -24
- package/src/tui/progress.ts +168 -0
- package/src/tui/result.ts +173 -0
- package/src/tui/settings-dialog.ts +168 -0
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @unipi/web-api — Core Extraction Pipeline
|
|
3
|
+
*
|
|
4
|
+
* The heart of the smart-fetch engine:
|
|
5
|
+
* URL validation → wreq-js fetch → content-type routing → defuddle extraction → fallbacks.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
FetchResult,
|
|
10
|
+
FetchError,
|
|
11
|
+
FetchOptions,
|
|
12
|
+
FetchProgress,
|
|
13
|
+
FetchExecutionHooks,
|
|
14
|
+
BatchFetchResult,
|
|
15
|
+
BatchFetchItemResult,
|
|
16
|
+
FetchProgressStatus,
|
|
17
|
+
} from "./types.js";
|
|
18
|
+
import {
|
|
19
|
+
DEFAULT_BROWSER,
|
|
20
|
+
DEFAULT_OS,
|
|
21
|
+
DEFAULT_FORMAT,
|
|
22
|
+
DEFAULT_MAX_CHARS,
|
|
23
|
+
DEFAULT_TIMEOUT_MS,
|
|
24
|
+
DEFAULT_REMOVE_IMAGES,
|
|
25
|
+
DEFAULT_INCLUDE_REPLIES,
|
|
26
|
+
DEFAULT_HEADERS,
|
|
27
|
+
DEFAULT_BATCH_CONCURRENCY,
|
|
28
|
+
} from "./constants.js";
|
|
29
|
+
import { resolveBrowserProfile, resolveOSProfile } from "./profiles.js";
|
|
30
|
+
import { getWreq, getDefuddle, getMimeTypes } from "./dependencies.js";
|
|
31
|
+
import { parseHTML, extractTextContent, elementToMarkdown } from "./dom.js";
|
|
32
|
+
import { truncateContent, formatContent } from "./format.js";
|
|
33
|
+
|
|
34
|
+
/** Upper bound on how many meta-refresh hops a single fetch will follow. */
const MAX_REDIRECTS = 5;

/** Upper bound on how many alternate-link fallbacks are collected from a page. */
const MAX_ALTERNATE_LINKS = 3;
|
|
39
|
+
|
|
40
|
+
/**
 * Parse a URL string and make sure it can be fetched.
 *
 * Anything that `new URL` rejects is reported as `invalid_url`; any scheme
 * other than `http:` / `https:` is reported as `unsupported_protocol`.
 * Both failures are thrown as non-retryable FetchError objects in the
 * "validation" phase.
 *
 * @param url - Raw URL string supplied by the caller
 * @returns The parsed URL when it is acceptable
 */
function validateUrl(url: string): URL {
  let candidate: URL | null;

  try {
    candidate = new URL(url);
  } catch {
    candidate = null;
  }

  if (candidate === null) {
    throw createError(
      "invalid_url",
      "validation",
      `Invalid URL format: ${url}`,
      false
    );
  }

  switch (candidate.protocol) {
    case "http:":
    case "https:":
      return candidate;
    default:
      throw createError(
        "unsupported_protocol",
        "validation",
        `Unsupported protocol: ${candidate.protocol}. Only http and https are supported.`,
        false
      );
  }
}
|
|
72
|
+
|
|
73
|
+
/**
 * Build a FetchError record.
 *
 * The four core fields are written first; every key present in `extra` is
 * then copied on top, so `extra` may both add context (url, status code, ...)
 * and override a core field.
 *
 * @param code - Machine-readable error code
 * @param phase - Pipeline phase in which the failure happened
 * @param message - Human-readable description, stored under `error`
 * @param retryable - Whether retrying the request may succeed
 * @param extra - Optional additional FetchError fields
 * @returns The assembled error object
 */
function createError(
  code: FetchError["code"],
  phase: FetchError["phase"],
  message: string,
  retryable: boolean,
  extra: Partial<FetchError> = {}
): FetchError {
  const fetchError: FetchError = { error: message, code, phase, retryable };
  return Object.assign(fetchError, extra);
}
|
|
91
|
+
|
|
92
|
+
/**
 * Build a FetchResult record.
 *
 * Missing or empty metadata fields fall back to an empty string, except
 * `format` (falls back to "markdown") and `mimeType` (falls back to
 * "text/html"). `wordCount` is the number of whitespace-separated,
 * non-empty tokens in `content`.
 *
 * @param url - URL originally requested
 * @param finalUrl - URL the content was actually read from
 * @param content - Extracted content
 * @param metadata - Optional metadata overrides
 * @returns The assembled result object
 */
function createResult(
  url: string,
  finalUrl: string,
  content: string,
  metadata: Partial<FetchResult> = {}
): FetchResult {
  const { title, author, published, site, language, format, mimeType } = metadata;
  const words = content.split(/\s+/).filter((token) => token.length > 0);

  return {
    url,
    finalUrl,
    title: title || "",
    author: author || "",
    published: published || "",
    site: site || "",
    language: language || "",
    wordCount: words.length,
    content,
    format: format || "markdown",
    mimeType: mimeType || "text/html",
  };
}
|
|
115
|
+
|
|
116
|
+
/**
 * Collect page metadata, preferring the defuddle result over the DOM.
 *
 * When a defuddle result is supplied its title, author, published
 * (or `date`), site (or `siteName`) and language are taken first. Any of
 * title, author, site and language that is still empty is then looked up in
 * the document: `<title>` followed by `og:title`, `meta[name="author"]`,
 * `og:site_name`, and the `lang` attribute of `<html>`. `published` has no
 * DOM fallback and is only present when a defuddle result was given.
 *
 * @param defuddleResult - Result object from defuddle, or a falsy value
 * @param document - Parsed DOM document used for the fallbacks
 * @returns The metadata fields that could be determined
 */
function extractMetadata(
  defuddleResult: any,
  document: Document
): Partial<FetchResult> {
  // Reads the `content` attribute of the first element matching `selector`.
  const contentOf = (selector: string): string =>
    document.querySelector(selector)?.getAttribute("content") || "";

  const fields: Partial<FetchResult> = defuddleResult
    ? {
        title: defuddleResult.title || "",
        author: defuddleResult.author || "",
        published: defuddleResult.published || defuddleResult.date || "",
        site: defuddleResult.site || defuddleResult.siteName || "",
        language: defuddleResult.language || "",
      }
    : {};

  // Each `||` chain stops at the first non-empty source, so later DOM
  // lookups only run when the earlier ones produced nothing.
  fields.title =
    fields.title ||
    document.querySelector("title")?.textContent?.trim() ||
    contentOf('meta[property="og:title"]');
  fields.author = fields.author || contentOf('meta[name="author"]');
  fields.site = fields.site || contentOf('meta[property="og:site_name"]');
  fields.language =
    fields.language ||
    document.querySelector("html")?.getAttribute("lang") ||
    "";

  return fields;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Look for a client-side `<meta http-equiv="refresh">` redirect.
 *
 * Only the first matching meta element is inspected. Its `content` value is
 * expected to look like `0;url=https://example.com` or
 * `0; URL='https://example.com'`; a value without a `url=` part yields null.
 *
 * @param document - DOM document
 * @returns The redirect target exactly as written in the page, or null
 */
function findMetaRefresh(document: Document): string | null {
  const directive = document
    .querySelector('meta[http-equiv="refresh"]')
    ?.getAttribute("content");

  if (!directive) {
    return null;
  }

  const target = /url\s*=\s*['"]?([^'"\s]+)['"]?/i.exec(directive);
  return target ? target[1] : null;
}
|
|
195
|
+
|
|
196
|
+
/**
 * Collect alternate JSON representations advertised by the page.
 *
 * Matches `<link rel="alternate">` elements whose type is
 * `application/json` or `application/ld+json`, keeps those with a non-empty
 * `href`, and returns at most MAX_ALTERNATE_LINKS of them in document order.
 *
 * @param document - DOM document
 * @returns The href values as written in the page (not resolved)
 */
function findAlternateLinks(document: Document): string[] {
  const selector =
    'link[rel="alternate"][type="application/json"], link[rel="alternate"][type="application/ld+json"]';

  return Array.from(document.querySelectorAll(selector))
    .map((link) => link.getAttribute("href"))
    .filter((href): href is string => Boolean(href))
    .slice(0, MAX_ALTERNATE_LINKS);
}
|
|
220
|
+
|
|
221
|
+
/**
 * Derive the MIME type of a response and decide whether it is binary.
 *
 * The MIME type is the part of the `Content-Type` header before the first
 * `;`, trimmed and lower-cased (empty string when the header is missing).
 * A response counts as binary when that MIME type starts with one of the
 * prefixes listed below.
 *
 * NOTE(review): the `application/x-` prefix also matches text-based types
 * such as `application/x-javascript` — confirm that is intended.
 *
 * @param response - Response whose headers are inspected
 * @param buffer - Response body; currently not inspected
 * @returns The normalised MIME type and the binary flag
 */
function detectContentType(
  response: Response,
  buffer: ArrayBuffer
): { mimeType: string; isBinary: boolean } {
  const header = response.headers.get("content-type") || "";
  const mimeType = header.split(";", 1)[0].trim().toLowerCase();

  const binaryPrefixes = [
    "application/octet-stream",
    "application/pdf",
    "application/zip",
    "application/x-",
    "image/",
    "video/",
    "audio/",
    "font/",
  ];

  let isBinary = false;
  for (const prefix of binaryPrefixes) {
    if (mimeType.startsWith(prefix)) {
      isBinary = true;
      break;
    }
  }

  return { mimeType, isBinary };
}
|
|
247
|
+
|
|
248
|
+
/**
 * Tell a FetchError built by this module apart from any other thrown value.
 *
 * Looking at `code` alone is not sufficient: Node.js errors (for example the
 * TypeError raised by `new URL`) carry a string `code` as well, so `phase`
 * and `retryable` are required too.
 */
function isFetchError(value: unknown): value is FetchError {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.code === "string" &&
    typeof candidate.phase === "string" &&
    typeof candidate.retryable === "boolean"
  );
}

/** Best-effort message for a thrown value that may not be an Error. */
function describeThrown(value: unknown): string {
  return value instanceof Error ? value.message : String(value);
}

/**
 * Decode a response body using the charset named in its Content-Type header,
 * falling back to UTF-8 when no charset is given or the label is unknown.
 */
function decodeResponseBody(buffer: ArrayBuffer, contentType: string | null): string {
  const charset = /charset\s*=\s*["']?([^\s;"']+)/i.exec(contentType ?? "")?.[1];

  if (charset) {
    try {
      return new TextDecoder(charset).decode(buffer);
    } catch {
      // TextDecoder rejects labels it does not know; use UTF-8 instead.
    }
  }

  return new TextDecoder().decode(buffer);
}

/**
 * The main fetch + extraction pipeline.
 *
 * Steps: validate the URL, fetch it through wreq-js, route on the response
 * content type (binary placeholder, JSON, plain text, HTML), and for HTML
 * follow meta-refresh redirects (at most MAX_REDIRECTS) before extracting
 * content with defuddle, falling back to plain DOM extraction when defuddle
 * fails or returns nothing.
 *
 * Every successful path reports a final "done" progress event.
 *
 * @param url - URL to fetch
 * @param options - Fetch options
 * @param hooks - Execution hooks for progress
 * @returns Fetch result
 * @throws FetchError (a plain object, see createError) for validation, HTTP,
 *   timeout, network, redirect and unexpected-response failures
 */
export async function defuddleFetch(
  url: string,
  options: FetchOptions = {},
  hooks?: FetchExecutionHooks
): Promise<FetchResult> {
  const {
    browser = DEFAULT_BROWSER,
    os = DEFAULT_OS,
    format = DEFAULT_FORMAT,
    maxChars = DEFAULT_MAX_CHARS,
    timeoutMs = DEFAULT_TIMEOUT_MS,
    removeImages = DEFAULT_REMOVE_IMAGES,
    includeReplies = DEFAULT_INCLUDE_REPLIES,
    proxy,
    headers: customHeaders,
  } = options;

  // Track progress
  const updateProgress = (
    status: FetchProgressStatus,
    percent: number = 0,
    phase: string = "",
    bytesLoaded: number = 0,
    bytesTotal: number = 0
  ) => {
    hooks?.onProgress?.({
      url,
      status,
      percent,
      bytesLoaded,
      bytesTotal,
      phase,
    });
  };

  let finalUrl = url;
  let redirectCount = 0;

  // Validate URL
  updateProgress("connecting", 0, "validation");
  try {
    validateUrl(url);
  } catch (error) {
    if (isFetchError(error)) {
      throw error;
    }
    throw createError("invalid_url", "validation", describeThrown(error), false, {
      url,
    });
  }

  // Get wreq-js
  const wreq = await getWreq();

  // Build request options
  const resolvedBrowser = resolveBrowserProfile(browser);
  const resolvedOS = resolveOSProfile(os);

  const requestHeaders = {
    ...DEFAULT_HEADERS,
    ...customHeaders,
  };

  // Main fetch loop (handles meta refresh redirects)
  while (redirectCount < MAX_REDIRECTS) {
    updateProgress("connecting", 10, "connecting");

    try {
      // wreq-js request
      const response = await wreq.fetch(finalUrl, {
        browser: resolvedBrowser,
        os: resolvedOS,
        timeout: timeoutMs,
        proxy,
        headers: requestHeaders,
      });

      updateProgress("waiting", 30, "waiting");

      // Check HTTP status
      if (!response.ok) {
        throw createError(
          "http_error",
          "waiting",
          `HTTP error: ${response.status} ${response.statusText}`,
          response.status >= 500 || response.status === 429,
          {
            url,
            finalUrl,
            statusCode: response.status,
            statusText: response.statusText,
          }
        );
      }

      updateProgress("loading", 40, "loading");

      // Get response body
      const buffer = await response.arrayBuffer();
      const contentTypeHeader: string | null = response.headers.get("content-type");

      // A missing or non-numeric Content-Length falls back to the real size.
      const declaredLength = Number.parseInt(
        response.headers.get("content-length") ?? "",
        10
      );
      const bytesTotal = Number.isNaN(declaredLength) ? buffer.byteLength : declaredLength;

      updateProgress("loading", 60, "loading", buffer.byteLength, bytesTotal);

      // Detect content type
      const { mimeType, isBinary } = detectContentType(response, buffer);

      // Handle binary content
      if (isBinary) {
        updateProgress("processing", 80, "processing");
        updateProgress("done", 100, "done", bytesTotal, bytesTotal);

        // For binary files, return a placeholder with metadata
        return createResult(url, finalUrl, `[Binary file: ${mimeType}]`, {
          mimeType,
          format,
        });
      }

      // Handle JSON
      if (mimeType === "application/json") {
        updateProgress("processing", 80, "processing");
        const text = decodeResponseBody(buffer, contentTypeHeader);
        const json = JSON.parse(text);
        const content = JSON.stringify(json, null, 2);
        const truncated = truncateContent(content, maxChars);

        updateProgress("done", 100, "done", bytesTotal, bytesTotal);

        return createResult(url, finalUrl, truncated, {
          mimeType,
          format: "json",
        });
      }

      // Handle plain text
      if (mimeType.startsWith("text/plain")) {
        updateProgress("processing", 80, "processing");
        const text = decodeResponseBody(buffer, contentTypeHeader);
        const truncated = truncateContent(text, maxChars);

        updateProgress("done", 100, "done", bytesTotal, bytesTotal);

        return createResult(url, finalUrl, truncated, {
          mimeType,
          format: "text",
        });
      }

      // Handle HTML
      updateProgress("processing", 70, "processing");

      const html = decodeResponseBody(buffer, contentTypeHeader);
      const { document, window } = parseHTML(html);

      // Check for meta refresh redirect
      const redirectUrl = findMetaRefresh(document);
      if (redirectUrl) {
        // Resolve relative targets against the current page and apply the
        // same http/https restriction as for the initial URL, so a page
        // cannot steer the fetcher to another scheme.
        let nextUrl: string;
        try {
          nextUrl = validateUrl(new URL(redirectUrl, finalUrl).href).href;
        } catch (redirectError) {
          const base = isFetchError(redirectError)
            ? redirectError
            : createError(
                "invalid_url",
                "processing",
                `Invalid meta refresh target: ${redirectUrl}`,
                false
              );
          throw { ...base, url, finalUrl };
        }

        redirectCount++;
        finalUrl = nextUrl;
        continue; // Loop to fetch the redirect target
      }

      // Try defuddle extraction
      let content: string;
      let metadata: Partial<FetchResult> = {};

      try {
        const defuddle = await getDefuddle();

        // defuddle expects a window object with document
        const defuddleOptions = {
          removeImages,
          includeReplies: includeReplies === true ? true : includeReplies === "extractors" ? "extractors" : false,
        };

        const defuddleResult = await defuddle(window, defuddleOptions);

        if (defuddleResult?.content) {
          content = defuddleResult.content;
          metadata = extractMetadata(defuddleResult, document);
        } else {
          // Fallback to DOM extraction
          content = fallbackExtraction(document);
          metadata = extractMetadata(null, document);
        }
      } catch (_defuddleError) {
        // Defuddle extraction failed — use fallback DOM extraction.
        content = fallbackExtraction(document);
        metadata = extractMetadata(null, document);
      }

      // Truncate content
      content = truncateContent(content, maxChars);

      // Format content based on requested format
      const formattedContent = formatContent(
        createResult(url, finalUrl, content, metadata),
        format,
        maxChars
      );

      updateProgress("done", 100, "done", bytesTotal, bytesTotal);

      return createResult(url, finalUrl, formattedContent, {
        ...metadata,
        mimeType,
        format,
      });
    } catch (error) {
      // Errors already shaped by this module pass through untouched.
      if (isFetchError(error)) {
        throw error;
      }

      const message = describeThrown(error);

      // Classify error
      if (message.includes("timeout")) {
        throw createError("timeout", "waiting", message, true, {
          url,
          finalUrl,
          timeoutMs,
        });
      }

      if (message.includes("network") || message.includes("ECONNREFUSED")) {
        throw createError("network_error", "connecting", message, true, {
          url,
          finalUrl,
        });
      }

      throw createError("unexpected_response", "loading", message, false, {
        url,
        finalUrl,
      });
    }
  }

  // Too many redirects
  throw createError(
    "too_many_redirects",
    "processing",
    `Too many meta refresh redirects (${redirectCount})`,
    false,
    { url, finalUrl }
  );
}
|
|
501
|
+
|
|
502
|
+
/**
 * Fallback content extraction from the DOM, used when defuddle is
 * unavailable or returns no content.
 *
 * Order of preference:
 *   1. the first element in the document matching
 *      `article, main, [role='main'], .content, #content`,
 *   2. the `<body>` element,
 *   3. the text content of the document element.
 *
 * A second lookup of `main, article, [role='main']` inside `<body>` was
 * removed: any such element is also matched by the document-wide query in
 * step 1, so that branch could never be taken.
 *
 * @param document - Parsed DOM document
 * @returns Markdown for the chosen element, or plain text as a last resort
 */
function fallbackExtraction(document: Document): string {
  // Likely main-content containers first
  const primary = document.querySelector("article, main, [role='main'], .content, #content");
  if (primary) {
    return elementToMarkdown(primary);
  }

  // Fall back to body
  const body = document.querySelector("body");
  if (body) {
    return elementToMarkdown(body);
  }

  // Last resort: full document text
  return extractTextContent(document.documentElement);
}
|
|
527
|
+
|
|
528
|
+
/**
 * Fetch multiple URLs with bounded concurrency.
 *
 * At most `batchConcurrency` fetches run at the same time (values below 1
 * or NaN are treated as 1). Each URL is fetched with defuddleFetch; a
 * failure is recorded on that URL's item and never aborts the batch. The
 * returned promise settles only after every URL has finished, so `items`
 * has one entry per input URL, in input order.
 *
 * `hooks.onUpdate` receives a fresh copy of the per-URL progress array
 * whenever any URL's progress changes.
 *
 * @param urls - URLs to fetch
 * @param options - Fetch options, plus optional `batchConcurrency`
 * @param hooks - Execution hooks
 * @returns Batch fetch result with per-URL items and success/failure counts
 */
export async function defuddleFetchMultiple(
  urls: string[],
  options: FetchOptions & { batchConcurrency?: number } = {},
  hooks?: FetchExecutionHooks
): Promise<BatchFetchResult> {
  const {
    batchConcurrency = DEFAULT_BATCH_CONCURRENCY,
    ...fetchOptions
  } = options;

  const items: BatchFetchItemResult[] = new Array(urls.length);
  const progress: FetchProgress[] = urls.map((url) => ({
    url,
    status: "queued" as FetchProgressStatus,
    percent: 0,
    bytesLoaded: 0,
    bytesTotal: 0,
    phase: "queued",
  }));

  // Hand listeners a snapshot so they never observe later mutations.
  const publish = (): void => {
    hooks?.onUpdate?.([...progress]);
  };

  // Fetch one URL and record its outcome at the same index.
  const fetchWorker = async (index: number): Promise<void> => {
    const url = urls[index];

    progress[index] = {
      url,
      status: "connecting",
      percent: 0,
      bytesLoaded: 0,
      bytesTotal: 0,
      phase: "connecting",
    };
    publish();

    try {
      const result = await defuddleFetch(url, fetchOptions, {
        onProgress: (p) => {
          progress[index] = p;
          publish();
        },
      });

      const loaded = progress[index].bytesTotal;
      items[index] = { status: "done", result };
      progress[index] = {
        url,
        status: "done",
        percent: 100,
        bytesLoaded: loaded,
        bytesTotal: loaded,
        phase: "done",
      };
    } catch (error) {
      // Guard the property access: a thrown null or primitive must not
      // crash the error handler itself.
      const hasCode =
        typeof error === "object" &&
        error !== null &&
        Boolean((error as FetchError).code);
      const fetchError: FetchError = hasCode
        ? (error as FetchError)
        : createError(
            "processing_error",
            "unknown",
            error instanceof Error ? error.message : String(error),
            false,
            { url }
          );

      items[index] = { status: "error", error: fetchError };
      progress[index] = {
        url,
        status: "error",
        percent: 0,
        bytesLoaded: 0,
        bytesTotal: 0,
        phase: "error",
        error: fetchError,
      };
    }

    publish();
  };

  // Bounded concurrency: a fixed number of lanes, each pulling the next
  // unclaimed index until none are left. Awaiting the lanes themselves
  // (rather than a list of promises that grows while fetches complete)
  // guarantees that every URL has finished before statistics are computed.
  const laneCount = Math.min(
    urls.length,
    batchConcurrency >= 1 ? Math.floor(batchConcurrency) : 1
  );

  let nextIndex = 0;
  const runLane = async (): Promise<void> => {
    while (nextIndex < urls.length) {
      const index = nextIndex++;
      await fetchWorker(index);
    }
  };

  // Wait for all lanes to drain the queue
  await Promise.all(Array.from({ length: laneCount }, () => runLane()));

  // Calculate statistics
  const succeeded = items.filter((item) => item.status === "done").length;
  const failed = items.filter((item) => item.status === "error").length;

  return {
    total: urls.length,
    succeeded,
    failed,
    items,
  };
}
|