magpie-html 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +424 -0
- package/dist/index.cjs +5197 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3072 -0
- package/dist/index.d.ts +3072 -0
- package/dist/index.js +5149 -0
- package/dist/index.js.map +1 -0
- package/package.json +80 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,3072 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content extraction types.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Types for article content extraction using Mozilla Readability.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
/**
|
|
10
|
+
* Options for content extraction.
|
|
11
|
+
*/
|
|
12
|
+
interface ContentExtractionOptions {
|
|
13
|
+
/**
|
|
14
|
+
* Base URL for resolving relative links and images.
|
|
15
|
+
* Highly recommended for proper link resolution.
|
|
16
|
+
*/
|
|
17
|
+
baseUrl?: string;
|
|
18
|
+
/**
|
|
19
|
+
* Minimum character count for article content.
|
|
20
|
+
* Articles shorter than this are considered too short.
|
|
21
|
+
* @defaultValue `500`
|
|
22
|
+
*/
|
|
23
|
+
charThreshold?: number;
|
|
24
|
+
/**
|
|
25
|
+
* Maximum number of elements to parse.
|
|
26
|
+
* Set to 0 for no limit.
|
|
27
|
+
* @defaultValue `0`
|
|
28
|
+
*/
|
|
29
|
+
maxElemsToParse?: number;
|
|
30
|
+
/**
|
|
31
|
+
* Whether to preserve CSS classes in extracted HTML.
|
|
32
|
+
* @defaultValue `false`
|
|
33
|
+
*/
|
|
34
|
+
keepClasses?: boolean;
|
|
35
|
+
/**
|
|
36
|
+
* CSS classes to preserve when keepClasses is false.
|
|
37
|
+
*/
|
|
38
|
+
classesToPreserve?: string[];
|
|
39
|
+
/**
|
|
40
|
+
* Whether to skip JSON-LD parsing for metadata.
|
|
41
|
+
* @defaultValue `false`
|
|
42
|
+
*/
|
|
43
|
+
disableJSONLD?: boolean;
|
|
44
|
+
/**
|
|
45
|
+
* Check if content is probably readerable before extraction.
|
|
46
|
+
* If true and content is not readerable, returns early with failure.
|
|
47
|
+
* @defaultValue `false`
|
|
48
|
+
*/
|
|
49
|
+
checkReadability?: boolean;
|
|
50
|
+
/**
|
|
51
|
+
* Enable debug logging.
|
|
52
|
+
* @defaultValue `false`
|
|
53
|
+
*/
|
|
54
|
+
debug?: boolean;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Successfully extracted content.
|
|
58
|
+
*/
|
|
59
|
+
interface ExtractedContent {
|
|
60
|
+
/** Extraction succeeded */
|
|
61
|
+
success: true;
|
|
62
|
+
/** Article title */
|
|
63
|
+
title: string;
|
|
64
|
+
/** Cleaned HTML content */
|
|
65
|
+
content: string;
|
|
66
|
+
/** Plain text content (HTML stripped) */
|
|
67
|
+
textContent: string;
|
|
68
|
+
/** Article excerpt/summary */
|
|
69
|
+
excerpt: string;
|
|
70
|
+
/** Author byline */
|
|
71
|
+
byline?: string;
|
|
72
|
+
/** Site name */
|
|
73
|
+
siteName?: string;
|
|
74
|
+
/** Content language code (e.g., 'en', 'de') */
|
|
75
|
+
lang?: string;
|
|
76
|
+
/** Text direction */
|
|
77
|
+
dir?: 'ltr' | 'rtl';
|
|
78
|
+
/** Published time (ISO 8601 string if available) */
|
|
79
|
+
publishedTime?: string;
|
|
80
|
+
/** Character count of text content */
|
|
81
|
+
length: number;
|
|
82
|
+
/** Word count */
|
|
83
|
+
wordCount: number;
|
|
84
|
+
/** Estimated reading time in minutes */
|
|
85
|
+
readingTime: number;
|
|
86
|
+
/** Whether content passed readability check */
|
|
87
|
+
readerable: boolean;
|
|
88
|
+
/** Extraction time in milliseconds */
|
|
89
|
+
extractionTime: number;
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* Error types for extraction failures.
|
|
93
|
+
*/
|
|
94
|
+
type ExtractionErrorType = 'NOT_READERABLE' | 'PARSE_ERROR' | 'EXTRACTION_FAILED' | 'INVALID_HTML' | 'UNKNOWN';
|
|
95
|
+
/**
|
|
96
|
+
* Failed content extraction.
|
|
97
|
+
*/
|
|
98
|
+
interface ExtractionFailure {
|
|
99
|
+
/** Extraction failed */
|
|
100
|
+
success: false;
|
|
101
|
+
/** Error message */
|
|
102
|
+
error: string;
|
|
103
|
+
/** Categorized error type */
|
|
104
|
+
errorType: ExtractionErrorType;
|
|
105
|
+
/** Whether content passed readability check (if checked) */
|
|
106
|
+
readerable: boolean;
|
|
107
|
+
/** Extraction time in milliseconds */
|
|
108
|
+
extractionTime: number;
|
|
109
|
+
/** Original error details (if available) */
|
|
110
|
+
details?: unknown;
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Result of content extraction.
|
|
114
|
+
*
|
|
115
|
+
* @remarks
|
|
116
|
+
* Always returns a result, never throws exceptions.
|
|
117
|
+
*/
|
|
118
|
+
type ContentResult = ExtractedContent | ExtractionFailure;
|
|
119
|
+
/**
|
|
120
|
+
* Quality assessment metrics.
|
|
121
|
+
*/
|
|
122
|
+
interface ContentQuality {
|
|
123
|
+
/** Word count */
|
|
124
|
+
wordCount: number;
|
|
125
|
+
/** Character count */
|
|
126
|
+
charCount: number;
|
|
127
|
+
/** Estimated reading time in minutes */
|
|
128
|
+
readingTime: number;
|
|
129
|
+
/** Average words per sentence */
|
|
130
|
+
avgWordsPerSentence: number;
|
|
131
|
+
/** Paragraph count */
|
|
132
|
+
paragraphCount: number;
|
|
133
|
+
/** Image count in content */
|
|
134
|
+
imageCount: number;
|
|
135
|
+
/** Link count in content */
|
|
136
|
+
linkCount: number;
|
|
137
|
+
/** Link density (ratio of link text to total text) */
|
|
138
|
+
linkDensity: number;
|
|
139
|
+
/** Overall quality score (0-100) */
|
|
140
|
+
qualityScore: number;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* Main content extraction module.
|
|
145
|
+
*
|
|
146
|
+
* @remarks
|
|
147
|
+
* Extracts article content from HTML using Mozilla Readability.
|
|
148
|
+
* Never throws exceptions - always returns a ContentResult.
|
|
149
|
+
*
|
|
150
|
+
* @packageDocumentation
|
|
151
|
+
*/
|
|
152
|
+
|
|
153
|
+
/**
|
|
154
|
+
* Extract article content from HTML.
|
|
155
|
+
*
|
|
156
|
+
* @remarks
|
|
157
|
+
* Uses Mozilla Readability to extract clean article content from a pre-parsed Document.
|
|
158
|
+
* This function never throws exceptions - always returns a ContentResult.
|
|
159
|
+
*
|
|
160
|
+
* Error handling:
|
|
161
|
+
* - Returns success: false for any extraction failure
|
|
162
|
+
* - Categorizes errors by type for better handling
|
|
163
|
+
* - Includes extraction time even for failures
|
|
164
|
+
*
|
|
165
|
+
* @param doc - Pre-parsed Document to extract content from
|
|
166
|
+
* @param options - Extraction options
|
|
167
|
+
* @returns Extraction result (success or failure)
|
|
168
|
+
*
|
|
169
|
+
* @example
|
|
170
|
+
* ```typescript
|
|
171
|
+
* import { parseHTML } from '../utils/html-parser.js';
|
|
172
|
+
* import { extractSEO } from '../metadata/index.js';
|
|
173
|
+
*
|
|
174
|
+
* const doc = parseHTML(html);
|
|
175
|
+
* const metadata = extractSEO(doc);
|
|
176
|
+
* const content = extractContent(doc, {
|
|
177
|
+
* baseUrl: 'https://example.com/article',
|
|
178
|
+
* charThreshold: 300,
|
|
179
|
+
* checkReadability: true,
|
|
180
|
+
* });
|
|
181
|
+
*
|
|
182
|
+
* if (content.success) {
|
|
183
|
+
* console.log(content.title);
|
|
184
|
+
* console.log(content.wordCount);
|
|
185
|
+
* console.log(`${content.readingTime} min read`);
|
|
186
|
+
* } else {
|
|
187
|
+
* console.error(content.error);
|
|
188
|
+
* }
|
|
189
|
+
* ```
|
|
190
|
+
*/
|
|
191
|
+
declare function extractContent(doc: Document, options?: ContentExtractionOptions): ContentResult;
|
|
192
|
+
|
|
193
|
+
/**
|
|
194
|
+
* HTML to text conversion types.
|
|
195
|
+
*
|
|
196
|
+
* @remarks
|
|
197
|
+
* Types for converting HTML to plain text with the `htmlToText` function.
|
|
198
|
+
*
|
|
199
|
+
* @packageDocumentation
|
|
200
|
+
*/
|
|
201
|
+
/**
|
|
202
|
+
* Options for HTML to plain text conversion.
|
|
203
|
+
*/
|
|
204
|
+
interface HtmlToTextOptions {
|
|
205
|
+
/**
|
|
206
|
+
* How to treat the input HTML.
|
|
207
|
+
*
|
|
208
|
+
* @remarks
|
|
209
|
+
* - `"fragment"`: Treat as HTML fragment (default)
|
|
210
|
+
* - `"document"`: Treat as full document (ignores `<head>` content)
|
|
211
|
+
*
|
|
212
|
+
* @defaultValue `"fragment"`
|
|
213
|
+
*/
|
|
214
|
+
mode?: 'fragment' | 'document';
|
|
215
|
+
/**
|
|
216
|
+
* How to render anchor (`<a>`) tags.
|
|
217
|
+
*
|
|
218
|
+
* @remarks
|
|
219
|
+
* - `"text"`: Show only the link text (default)
|
|
220
|
+
* - `"inline"`: Show text followed by URL in parentheses, e.g., "Click here (https://example.com)"
|
|
221
|
+
* - `"remove"`: Remove links entirely
|
|
222
|
+
*
|
|
223
|
+
* @defaultValue `"text"`
|
|
224
|
+
*/
|
|
225
|
+
links?: 'text' | 'inline' | 'remove';
|
|
226
|
+
/**
|
|
227
|
+
* How to render image (`<img>`) tags.
|
|
228
|
+
*
|
|
229
|
+
* @remarks
|
|
230
|
+
* - `"alt"`: Show the alt text (default)
|
|
231
|
+
* - `"remove"`: Remove images entirely
|
|
232
|
+
*
|
|
233
|
+
* @defaultValue `"alt"`
|
|
234
|
+
*/
|
|
235
|
+
images?: 'alt' | 'remove';
|
|
236
|
+
/**
|
|
237
|
+
* Collapse consecutive whitespace outside preserved tags.
|
|
238
|
+
*
|
|
239
|
+
* @remarks
|
|
240
|
+
* When `true`, multiple spaces, tabs, and line breaks are collapsed into single spaces.
|
|
241
|
+
* Whitespace inside preserved tags (e.g., `<pre>`, `<code>`) is always kept intact.
|
|
242
|
+
*
|
|
243
|
+
* @defaultValue `true`
|
|
244
|
+
*/
|
|
245
|
+
collapseWhitespace?: boolean;
|
|
246
|
+
/**
|
|
247
|
+
* Maximum consecutive newlines allowed after compaction.
|
|
248
|
+
*
|
|
249
|
+
* @remarks
|
|
250
|
+
* Limits runs of newlines to this value. Set to `1` for single spacing,
|
|
251
|
+
* `2` for double spacing (default), or higher values as needed.
|
|
252
|
+
*
|
|
253
|
+
* @defaultValue `2`
|
|
254
|
+
*/
|
|
255
|
+
maxNewlines?: number;
|
|
256
|
+
/**
|
|
257
|
+
* Optional hard-wrap column width.
|
|
258
|
+
*
|
|
259
|
+
* @remarks
|
|
260
|
+
* When set to a positive number, lines will be wrapped at this column width.
|
|
261
|
+
* Does not wrap inside preserved tags like `<pre>` or `<code>`.
|
|
262
|
+
* Set to `null` to disable wrapping (default).
|
|
263
|
+
*
|
|
264
|
+
* @defaultValue `null`
|
|
265
|
+
*/
|
|
266
|
+
wrap?: number | null;
|
|
267
|
+
/**
|
|
268
|
+
* Separator between table cells.
|
|
269
|
+
*
|
|
270
|
+
* @remarks
|
|
271
|
+
* - `"tab"`: Use tab character (default)
|
|
272
|
+
* - `"space"`: Use space character
|
|
273
|
+
*
|
|
274
|
+
* @defaultValue `"tab"`
|
|
275
|
+
*/
|
|
276
|
+
tableCellSeparator?: 'tab' | 'space';
|
|
277
|
+
/**
|
|
278
|
+
* HTML tags to exclude entirely along with their contents.
|
|
279
|
+
*
|
|
280
|
+
* @remarks
|
|
281
|
+
* By default excludes: `script`, `style`, `noscript`, `template`, `svg`, `canvas`
|
|
282
|
+
*
|
|
283
|
+
* @defaultValue `["script", "style", "noscript", "template", "svg", "canvas"]`
|
|
284
|
+
*/
|
|
285
|
+
excludeTags?: string[];
|
|
286
|
+
/**
|
|
287
|
+
* Decode HTML entities.
|
|
288
|
+
*
|
|
289
|
+
* @remarks
|
|
290
|
+
* When `true`, decodes entities like `&amp;`, `&lt;`, `&mdash;`, etc.
|
|
291
|
+
*
|
|
292
|
+
* @defaultValue `true`
|
|
293
|
+
*/
|
|
294
|
+
decodeEntities?: boolean;
|
|
295
|
+
/**
|
|
296
|
+
* Tags whose internal whitespace is preserved.
|
|
297
|
+
*
|
|
298
|
+
* @remarks
|
|
299
|
+
* These tags will not have their whitespace collapsed, allowing proper
|
|
300
|
+
* formatting of code blocks and preformatted text.
|
|
301
|
+
*
|
|
302
|
+
* @defaultValue `["pre", "code", "textarea"]`
|
|
303
|
+
*/
|
|
304
|
+
preserveTags?: string[];
|
|
305
|
+
/**
|
|
306
|
+
* Trim leading and trailing whitespace from the result.
|
|
307
|
+
*
|
|
308
|
+
* @defaultValue `true`
|
|
309
|
+
*/
|
|
310
|
+
trim?: boolean;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
/**
|
|
314
|
+
* HTML to text conversion.
|
|
315
|
+
*
|
|
316
|
+
* @remarks
|
|
317
|
+
* Convert HTML to plain text using a zero-dependency streaming tokenizer.
|
|
318
|
+
* Pure, deterministic transformation suitable for logs, previews, classification,
|
|
319
|
+
* and search indexing. Preserves essential structure by inserting newlines at
|
|
320
|
+
* block boundaries, handles entities, and provides configurable options.
|
|
321
|
+
*
|
|
322
|
+
* @packageDocumentation
|
|
323
|
+
*/
|
|
324
|
+
|
|
325
|
+
/**
|
|
326
|
+
* Convert an HTML string to plain text.
|
|
327
|
+
*
|
|
328
|
+
* @remarks
|
|
329
|
+
* This function uses a streaming tokenizer to parse HTML and extract text content.
|
|
330
|
+
* It handles block elements, whitespace preservation, HTML entities, tables, and more.
|
|
331
|
+
*
|
|
332
|
+
* Features:
|
|
333
|
+
* - Preserves document structure with appropriate line breaks
|
|
334
|
+
* - Handles HTML entities (numeric and common named entities)
|
|
335
|
+
* - Configurable link and image handling
|
|
336
|
+
* - Table rendering with configurable cell separators
|
|
337
|
+
* - Whitespace preservation for code/pre blocks
|
|
338
|
+
* - Optional hard-wrapping at column width
|
|
339
|
+
*
|
|
340
|
+
* @param html - HTML string (fragment or full document)
|
|
341
|
+
* @param options - Conversion options
|
|
342
|
+
* @returns Plain text string
|
|
343
|
+
*
|
|
344
|
+
* @throws {TypeError} If html is not a string
|
|
345
|
+
*
|
|
346
|
+
* @example
|
|
347
|
+
* ```typescript
|
|
348
|
+
* const html = '<div><h1>Hello</h1><p>World!</p></div>';
|
|
349
|
+
* const text = htmlToText(html);
|
|
350
|
+
* console.log(text); // "Hello\n\nWorld!"
|
|
351
|
+
* ```
|
|
352
|
+
*
|
|
353
|
+
* @example
|
|
354
|
+
* ```typescript
|
|
355
|
+
* const html = '<a href="https://example.com">Visit</a>';
|
|
356
|
+
* const text = htmlToText(html, { links: 'inline' });
|
|
357
|
+
* console.log(text); // "Visit (https://example.com)"
|
|
358
|
+
* ```
|
|
359
|
+
*/
|
|
360
|
+
declare function htmlToText(html: string, options?: HtmlToTextOptions): string;
|
|
361
|
+
|
|
362
|
+
/**
|
|
363
|
+
* Content quality assessment.
|
|
364
|
+
*
|
|
365
|
+
* @remarks
|
|
366
|
+
* Analyzes extracted content to provide quality metrics.
|
|
367
|
+
*
|
|
368
|
+
* @packageDocumentation
|
|
369
|
+
*/
|
|
370
|
+
|
|
371
|
+
/**
|
|
372
|
+
* Calculate word count from text.
|
|
373
|
+
*
|
|
374
|
+
* @param text - Text to count words in
|
|
375
|
+
* @returns Number of words
|
|
376
|
+
*/
|
|
377
|
+
declare function countWords(text: string): number;
|
|
378
|
+
/**
|
|
379
|
+
* Calculate reading time in minutes.
|
|
380
|
+
*
|
|
381
|
+
* @remarks
|
|
382
|
+
* Uses average reading speed of 200 words per minute.
|
|
383
|
+
*
|
|
384
|
+
* @param wordCount - Number of words
|
|
385
|
+
* @returns Estimated reading time in minutes
|
|
386
|
+
*/
|
|
387
|
+
declare function calculateReadingTime(wordCount: number): number;
|
|
388
|
+
/**
|
|
389
|
+
* Assess content quality.
|
|
390
|
+
*
|
|
391
|
+
* @remarks
|
|
392
|
+
* Analyzes extracted content and returns comprehensive quality metrics.
|
|
393
|
+
*
|
|
394
|
+
* @param content - Extracted content
|
|
395
|
+
* @returns Quality assessment
|
|
396
|
+
*
|
|
397
|
+
* @example
|
|
398
|
+
* ```typescript
|
|
399
|
+
* const content = extractContent(parseHTML(html));
|
|
400
|
+
* if (content.success) {
|
|
401
|
+
* const quality = assessContentQuality(content);
|
|
402
|
+
* console.log(`Quality score: ${quality.qualityScore}/100`);
|
|
403
|
+
* console.log(`Reading time: ${quality.readingTime} minutes`);
|
|
404
|
+
* }
|
|
405
|
+
* ```
|
|
406
|
+
*/
|
|
407
|
+
declare function assessContentQuality(content: ExtractedContent): ContentQuality;
|
|
408
|
+
|
|
409
|
+
/**
|
|
410
|
+
* Mozilla Readability wrapper with linkedom.
|
|
411
|
+
*
|
|
412
|
+
* @remarks
|
|
413
|
+
* Provides a clean interface to Mozilla Readability using linkedom as the DOM implementation.
|
|
414
|
+
*
|
|
415
|
+
* @packageDocumentation
|
|
416
|
+
*/
|
|
417
|
+
|
|
418
|
+
/**
|
|
419
|
+
* Check if HTML content is probably readerable.
|
|
420
|
+
*
|
|
421
|
+
* @remarks
|
|
422
|
+
* Quick check to determine if content extraction is likely to succeed.
|
|
423
|
+
* This is a heuristic check and may produce false positives/negatives.
|
|
424
|
+
*
|
|
425
|
+
* @param doc - Pre-parsed Document to check
|
|
426
|
+
* @param options - Readability check options
|
|
427
|
+
* @returns True if content appears to be an article
|
|
428
|
+
*
|
|
429
|
+
* @example
|
|
430
|
+
* ```typescript
|
|
431
|
+
* import { parseHTML } from '../utils/html-parser.js';
|
|
432
|
+
*
|
|
433
|
+
* const doc = parseHTML(html);
|
|
434
|
+
* if (isProbablyReaderable(doc)) {
|
|
435
|
+
* const result = extractContent(doc);
|
|
436
|
+
* }
|
|
437
|
+
* ```
|
|
438
|
+
*/
|
|
439
|
+
declare function isProbablyReaderable(doc: Document, options?: {
|
|
440
|
+
minContentLength?: number;
|
|
441
|
+
minScore?: number;
|
|
442
|
+
}): boolean;
|
|
443
|
+
|
|
444
|
+
/**
|
|
445
|
+
* Feed format detection utilities.
|
|
446
|
+
*
|
|
447
|
+
* @packageDocumentation
|
|
448
|
+
*/
|
|
449
|
+
/**
|
|
450
|
+
* Feed format type.
|
|
451
|
+
*
|
|
452
|
+
* @remarks
|
|
453
|
+
* Represents the detected or expected format of a feed.
|
|
454
|
+
* - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
|
|
455
|
+
* - `'atom'` - Atom 1.0
|
|
456
|
+
* - `'json-feed'` - JSON Feed 1.0 or 1.1
|
|
457
|
+
* - `'unknown'` - Format could not be determined
|
|
458
|
+
*/
|
|
459
|
+
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
|
|
460
|
+
/**
|
|
461
|
+
* Detect feed format from content string.
|
|
462
|
+
*
|
|
463
|
+
* @remarks
|
|
464
|
+
* Analyzes the content to determine if it's RSS, Atom, or JSON Feed.
|
|
465
|
+
* Detection is based on root elements, namespaces, and structure.
|
|
466
|
+
*
|
|
467
|
+
* Detection priority:
|
|
468
|
+
* 1. JSON Feed (checks for JSON with jsonfeed.org version)
|
|
469
|
+
* 2. RSS (checks for `<rss>` or `<rdf:RDF>` root elements)
|
|
470
|
+
* 3. Atom (checks for `<feed>` root element with Atom namespace)
|
|
471
|
+
*
|
|
472
|
+
* @param content - Feed content as string
|
|
473
|
+
* @returns Detected format or 'unknown' if format cannot be determined
|
|
474
|
+
*
|
|
475
|
+
* @example
|
|
476
|
+
* ```typescript
|
|
477
|
+
* const format = detectFormat(feedContent);
|
|
478
|
+
* if (format === 'rss') {
|
|
479
|
+
* console.log('This is an RSS feed');
|
|
480
|
+
* }
|
|
481
|
+
* ```
|
|
482
|
+
*/
|
|
483
|
+
declare function detectFormat(content: string): FeedFormat;
|
|
484
|
+
/**
|
|
485
|
+
* Check if content is a valid feed (any format).
|
|
486
|
+
*
|
|
487
|
+
* @param content - Feed content as string
|
|
488
|
+
* @returns `true` if content is RSS, Atom, or JSON Feed
|
|
489
|
+
*
|
|
490
|
+
* @example
|
|
491
|
+
* ```typescript
|
|
492
|
+
* if (isFeed(content)) {
|
|
493
|
+
* const result = parseFeed(content);
|
|
494
|
+
* }
|
|
495
|
+
* ```
|
|
496
|
+
*/
|
|
497
|
+
declare function isFeed(content: string): boolean;
|
|
498
|
+
/**
|
|
499
|
+
* Check if content is RSS format.
|
|
500
|
+
*
|
|
501
|
+
* @param content - Feed content as string
|
|
502
|
+
* @returns `true` if content is RSS (any version)
|
|
503
|
+
*/
|
|
504
|
+
declare function isRSS(content: string): boolean;
|
|
505
|
+
/**
|
|
506
|
+
* Check if content is Atom format.
|
|
507
|
+
*
|
|
508
|
+
* @param content - Feed content as string
|
|
509
|
+
* @returns `true` if content is Atom 1.0
|
|
510
|
+
*/
|
|
511
|
+
declare function isAtom(content: string): boolean;
|
|
512
|
+
/**
|
|
513
|
+
* Check if content is JSON Feed format.
|
|
514
|
+
*
|
|
515
|
+
* @param content - Feed content as string
|
|
516
|
+
* @returns `true` if content is JSON Feed (1.0 or 1.1)
|
|
517
|
+
*/
|
|
518
|
+
declare function isJSONFeed(content: string): boolean;
|
|
519
|
+
|
|
520
|
+
/**
|
|
521
|
+
* Unified feed types - normalized interface across all feed formats.
|
|
522
|
+
*
|
|
523
|
+
* @remarks
|
|
524
|
+
* These types provide a consistent interface for working with feeds regardless
|
|
525
|
+
* of the original format (RSS, Atom, or JSON Feed). All format-specific data
|
|
526
|
+
* is normalized to this structure by the parser.
|
|
527
|
+
*
|
|
528
|
+
* @packageDocumentation
|
|
529
|
+
*/
|
|
530
|
+
/**
|
|
531
|
+
* Feed author information.
|
|
532
|
+
*
|
|
533
|
+
* @remarks
|
|
534
|
+
* Represents author/contributor information normalized across all feed formats.
|
|
535
|
+
* Not all formats provide all fields.
|
|
536
|
+
*/
|
|
537
|
+
interface FeedAuthor {
|
|
538
|
+
/** Author's name */
|
|
539
|
+
name?: string;
|
|
540
|
+
/** Author's email address */
|
|
541
|
+
email?: string;
|
|
542
|
+
/** Author's website URL */
|
|
543
|
+
url?: string;
|
|
544
|
+
}
|
|
545
|
+
/**
|
|
546
|
+
* Feed enclosure (attached file).
|
|
547
|
+
*
|
|
548
|
+
* @remarks
|
|
549
|
+
* Represents attached files like audio, video, or documents. Commonly used
|
|
550
|
+
* for podcasts and media feeds.
|
|
551
|
+
*/
|
|
552
|
+
interface FeedEnclosure {
|
|
553
|
+
/** URL of the attached file */
|
|
554
|
+
url: string;
|
|
555
|
+
/** MIME type of the file (e.g., 'audio/mpeg', 'video/mp4') */
|
|
556
|
+
type?: string;
|
|
557
|
+
/** File size in bytes */
|
|
558
|
+
length?: number;
|
|
559
|
+
}
|
|
560
|
+
/**
|
|
561
|
+
* Feed item (entry/article/post).
|
|
562
|
+
*
|
|
563
|
+
* @remarks
|
|
564
|
+
* Represents a single item in a feed. Items are normalized across all formats
|
|
565
|
+
* to provide a consistent interface. Not all fields are available in all formats.
|
|
566
|
+
*/
|
|
567
|
+
interface FeedItem {
|
|
568
|
+
/** Unique identifier for the item (GUID, ID, or URL) */
|
|
569
|
+
id: string;
|
|
570
|
+
/** Item title */
|
|
571
|
+
title?: string;
|
|
572
|
+
/** Canonical URL for the item */
|
|
573
|
+
url?: string;
|
|
574
|
+
/** External URL for linked posts (when different from canonical URL) */
|
|
575
|
+
externalUrl?: string;
|
|
576
|
+
/** Full HTML content of the item */
|
|
577
|
+
contentHtml?: string;
|
|
578
|
+
/** Plain text content of the item */
|
|
579
|
+
contentText?: string;
|
|
580
|
+
/** Short summary or description */
|
|
581
|
+
summary?: string;
|
|
582
|
+
/** Publication date in ISO 8601 format */
|
|
583
|
+
published?: string;
|
|
584
|
+
/** Last modified date in ISO 8601 format */
|
|
585
|
+
modified?: string;
|
|
586
|
+
/** Item authors (may be empty if using feed-level authors) */
|
|
587
|
+
authors?: FeedAuthor[];
|
|
588
|
+
/** Tags, categories, or keywords */
|
|
589
|
+
tags?: string[];
|
|
590
|
+
/** Featured image URL */
|
|
591
|
+
image?: string;
|
|
592
|
+
/** Attached files (audio, video, documents) */
|
|
593
|
+
enclosures?: FeedEnclosure[];
|
|
594
|
+
}
|
|
595
|
+
/**
|
|
596
|
+
* Normalized feed data.
|
|
597
|
+
*
|
|
598
|
+
* @remarks
|
|
599
|
+
* The main feed object containing metadata and items. This is the recommended
|
|
600
|
+
* interface for working with feeds as it provides a consistent structure
|
|
601
|
+
* regardless of the original format.
|
|
602
|
+
*/
|
|
603
|
+
interface Feed {
|
|
604
|
+
/** Original feed format */
|
|
605
|
+
format: 'rss' | 'atom' | 'json-feed';
|
|
606
|
+
/** Feed title (required) */
|
|
607
|
+
title: string;
|
|
608
|
+
/** Feed description or subtitle */
|
|
609
|
+
description?: string;
|
|
610
|
+
/** Feed's home page URL */
|
|
611
|
+
url?: string;
|
|
612
|
+
/** Feed's own URL (self-reference) */
|
|
613
|
+
feedUrl?: string;
|
|
614
|
+
/** Feed language code (e.g., 'en', 'de') */
|
|
615
|
+
language?: string;
|
|
616
|
+
/** Feed icon or logo URL */
|
|
617
|
+
image?: string;
|
|
618
|
+
/** Feed-level authors */
|
|
619
|
+
authors?: FeedAuthor[];
|
|
620
|
+
/** Last update date in ISO 8601 format */
|
|
621
|
+
updated?: string;
|
|
622
|
+
/** Feed items (entries/articles/posts) */
|
|
623
|
+
items: FeedItem[];
|
|
624
|
+
}
|
|
625
|
+
/**
|
|
626
|
+
* Parse result containing both normalized and original data.
|
|
627
|
+
*
|
|
628
|
+
* @remarks
|
|
629
|
+
* Returned by {@link parseFeed}. Contains both the normalized feed data
|
|
630
|
+
* (recommended for most use cases) and the original format-specific data
|
|
631
|
+
* (for advanced use cases requiring format-specific fields).
|
|
632
|
+
*/
|
|
633
|
+
interface ParseResult {
|
|
634
|
+
/** Normalized feed data (recommended) */
|
|
635
|
+
feed: Feed;
|
|
636
|
+
/** Original format-specific data (advanced use) */
|
|
637
|
+
original: unknown;
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
/**
|
|
641
|
+
* Unified feed parser with automatic format detection.
|
|
642
|
+
*
|
|
643
|
+
* @packageDocumentation
|
|
644
|
+
*/
|
|
645
|
+
|
|
646
|
+
/**
|
|
647
|
+
* Parse any feed format with automatic format detection.
|
|
648
|
+
*
|
|
649
|
+
* @remarks
|
|
650
|
+
* This is the main entry point for feed parsing. It automatically detects whether
|
|
651
|
+
* the content is RSS, Atom, or JSON Feed, parses it, and returns a normalized
|
|
652
|
+
* output structure along with the original format-specific data.
|
|
653
|
+
*
|
|
654
|
+
* All relative URLs in the feed are converted to absolute URLs if a base URL is provided.
|
|
655
|
+
* This is essential for feed readers that need to fetch images, enclosures, or follow links.
|
|
656
|
+
*
|
|
657
|
+
* @param content - Feed content as string (XML or JSON)
|
|
658
|
+
* @param baseUrl - Optional base URL for resolving relative URLs (string or URL object)
|
|
659
|
+
* @returns Object containing normalized feed data and original format-specific data
|
|
660
|
+
* @throws {Error} If format cannot be detected or parsing fails
|
|
661
|
+
*
|
|
662
|
+
* @example
|
|
663
|
+
* ```typescript
|
|
664
|
+
* const feedContent = await fetch('https://example.com/feed.xml').then(r => r.text());
|
|
665
|
+
* const result = parseFeed(feedContent, 'https://example.com/feed.xml');
|
|
666
|
+
*
|
|
667
|
+
* console.log(result.feed.title);
|
|
668
|
+
* console.log(result.feed.items[0].title);
|
|
669
|
+
* console.log(result.feed.items[0].url); // Absolute URL
|
|
670
|
+
* ```
|
|
671
|
+
*/
|
|
672
|
+
declare function parseFeed(content: string, baseUrl?: string | URL): ParseResult;
|
|
673
|
+
|
|
674
|
+
/**
|
|
675
|
+
* Types for high-level gathering functionality.
|
|
676
|
+
*
|
|
677
|
+
* @packageDocumentation
|
|
678
|
+
*/
|
|
679
|
+
/**
|
|
680
|
+
* Gathered website data.
|
|
681
|
+
*
|
|
682
|
+
* @remarks
|
|
683
|
+
* This interface represents the complete gathered data from a website,
|
|
684
|
+
* including the authoritative URL and all extracted metadata.
|
|
685
|
+
* It will be extended incrementally with more properties.
|
|
686
|
+
*/
|
|
687
|
+
interface Website {
|
|
688
|
+
/**
|
|
689
|
+
* Authoritative URL for the page.
|
|
690
|
+
*
|
|
691
|
+
* @remarks
|
|
692
|
+
* Uses canonical URL if present, otherwise the final URL after redirects.
|
|
693
|
+
*/
|
|
694
|
+
url: URL;
|
|
695
|
+
/** Discovered feed URLs (RSS, Atom, JSON Feed) as URL objects */
|
|
696
|
+
feeds: URL[];
|
|
697
|
+
/**
|
|
698
|
+
* Page title (cleaned, from best available source).
|
|
699
|
+
*
|
|
700
|
+
* @remarks
|
|
701
|
+
* Collects titles from multiple sources, cleans them, and picks the longest.
|
|
702
|
+
* Sources: OpenGraph, Twitter Card, HTML title tag, First H1
|
|
703
|
+
*/
|
|
704
|
+
title?: string;
|
|
705
|
+
/**
|
|
706
|
+
* Page description (from best available source).
|
|
707
|
+
*
|
|
708
|
+
* @remarks
|
|
709
|
+
* Collects descriptions from metadata and picks the longest.
|
|
710
|
+
* Sources: OpenGraph, Twitter Card, HTML meta description
|
|
711
|
+
*/
|
|
712
|
+
description?: string;
|
|
713
|
+
/**
|
|
714
|
+
* Page key visual/image URL (from best available source).
|
|
715
|
+
*
|
|
716
|
+
* @remarks
|
|
717
|
+
* Priority: OpenGraph > Twitter Card > Largest Apple Touch Icon > Favicon
|
|
718
|
+
* Returns the URL object of the best visual representation of the site.
|
|
719
|
+
*/
|
|
720
|
+
image?: URL;
|
|
721
|
+
/**
|
|
722
|
+
* Best available icon/favicon for the site.
|
|
723
|
+
*
|
|
724
|
+
* @remarks
|
|
725
|
+
* Priority: Largest Apple Touch Icon > Safari mask icon > Favicon > Shortcut icon > MS tile > Fluid icon
|
|
726
|
+
* Returns the highest quality icon available, preferring modern, high-resolution formats.
|
|
727
|
+
*/
|
|
728
|
+
icon?: URL;
|
|
729
|
+
/**
|
|
730
|
+
* Primary language code (ISO 639-1).
|
|
731
|
+
*
|
|
732
|
+
* @remarks
|
|
733
|
+
* Extracted from HTML lang attribute, content-language meta tag, or OpenGraph locale.
|
|
734
|
+
* Normalized to lowercase ISO 639-1 format (e.g., 'en', 'de', 'fr', 'ja').
|
|
735
|
+
*/
|
|
736
|
+
language?: string;
|
|
737
|
+
/**
|
|
738
|
+
* Region code (ISO 3166-1 alpha-2).
|
|
739
|
+
*
|
|
740
|
+
* @remarks
|
|
741
|
+
* Only present if the language includes a region specifier.
|
|
742
|
+
* Normalized to uppercase ISO 3166-1 alpha-2 format (e.g., 'US', 'GB', 'DE').
|
|
743
|
+
*/
|
|
744
|
+
region?: string;
|
|
745
|
+
/**
|
|
746
|
+
* Raw HTML content of the page (UTF-8).
|
|
747
|
+
*
|
|
748
|
+
* @remarks
|
|
749
|
+
* The complete HTML source after fetching and decoding to UTF-8.
|
|
750
|
+
* Useful for custom processing or caching.
|
|
751
|
+
*/
|
|
752
|
+
html: string;
|
|
753
|
+
/**
|
|
754
|
+
* Plain text content extracted from the HTML.
|
|
755
|
+
*
|
|
756
|
+
* @remarks
|
|
757
|
+
* Automatically converted from HTML using the `htmlToText` function.
|
|
758
|
+
* Removes all tags, decodes entities, and preserves document structure
|
|
759
|
+
* with appropriate line breaks.
|
|
760
|
+
*/
|
|
761
|
+
text: string;
|
|
762
|
+
/**
|
|
763
|
+
* Internal links found on the page (same domain, excluding current URL).
|
|
764
|
+
*
|
|
765
|
+
* @remarks
|
|
766
|
+
* All links are URL objects. The current page URL is excluded to avoid
|
|
767
|
+
* self-references. Useful for site crawling and navigation analysis.
|
|
768
|
+
*/
|
|
769
|
+
internalLinks: URL[];
|
|
770
|
+
/**
|
|
771
|
+
* External links found on the page (different domains).
|
|
772
|
+
*
|
|
773
|
+
* @remarks
|
|
774
|
+
* All links are URL objects. Useful for analyzing outbound links,
|
|
775
|
+
* citations, and external resources.
|
|
776
|
+
*/
|
|
777
|
+
externalLinks: URL[];
|
|
778
|
+
}
|
|
779
|
+
/**
 * Data gathered from a single article page.
 *
 * @remarks
 * Bundles the authoritative URL, the raw HTML, and the content extracted
 * from it. Further properties will be added incrementally.
 */
interface Article {
    /**
     * Authoritative URL of the article.
     *
     * @remarks
     * The canonical URL when one is present; otherwise the final URL after
     * redirects.
     */
    url: URL;

    /**
     * Raw HTML of the article page (UTF-8).
     *
     * @remarks
     * Complete HTML source after fetching and decoding to UTF-8. Handy for
     * custom processing or caching.
     */
    html: string;

    /**
     * Plain text derived from the full page HTML.
     *
     * @remarks
     * Produced automatically by the `htmlToText` function: tags are removed,
     * entities are decoded, and document structure is kept through
     * appropriate line breaks.
     */
    text: string;

    /**
     * Cleaned article body as plain text.
     *
     * @remarks
     * Mozilla Readability extracts cleaned HTML, which is then converted to
     * plain text with `htmlToText`. This is the main article body without
     * navigation, ads, or other clutter. `undefined` when Readability
     * extraction fails.
     */
    content?: string;

    /**
     * Title of the article.
     *
     * @remarks
     * Taken from Mozilla Readability when available. If Readability fails or
     * yields an empty title, metadata is used instead (Schema.org, OpenGraph,
     * Twitter Card, HTML title).
     */
    title?: string;

    /**
     * Description or excerpt of the article.
     *
     * @remarks
     * Taken from Mozilla Readability's excerpt when available. If the excerpt
     * is empty or extraction fails, metadata is used instead (OpenGraph,
     * Twitter Card, HTML meta description).
     */
    description?: string;

    /**
     * Key visual / image URL of the article, from the best available source.
     *
     * @remarks
     * Priority: Schema.org NewsArticle/Article (largest) > OpenGraph >
     * Twitter Card > largest Apple Touch Icon > favicon.
     * The value is the URL object of the best visual representation of the
     * article.
     */
    image?: URL;

    /**
     * Primary language code (ISO 639-1).
     *
     * @remarks
     * Derived from the HTML lang attribute, the Content-Language meta, or the
     * OpenGraph locale. Lowercase 2-letter ISO 639-1 code
     * (e.g., 'en', 'de', 'fr').
     */
    language?: string;

    /**
     * Region/country code (ISO 3166-1 alpha-2).
     *
     * @remarks
     * Derived from language tags such as 'en-US' or 'de-DE'. Uppercase
     * 2-letter ISO 3166-1 alpha-2 code (e.g., 'US', 'GB', 'DE').
     */
    region?: string;

    /**
     * Internal links found in the article (same domain/subdomain).
     *
     * @remarks
     * Links that point to pages within the same domain. The current article
     * URL is excluded automatically. All URLs are absolute and normalized.
     */
    internalLinks: URL[];

    /**
     * External links found in the article (different domains).
     *
     * @remarks
     * Links that point to external domains (useful for citations and
     * references). All URLs are absolute and normalized.
     */
    externalLinks: URL[];

    /**
     * Number of words in the article.
     *
     * @remarks
     * Computed from `content` (Readability-cleaned) when available, otherwise
     * from `text` (full page text). Words are delimited by whitespace.
     */
    wordCount: number;

    /**
     * Estimated reading time, in minutes.
     *
     * @remarks
     * Derived from the word count at an average reading speed of 200 words
     * per minute. Never less than 1 minute.
     */
    readingTime: number;
}
|
|
899
|
+
|
|
900
|
+
/**
|
|
901
|
+
* High-level article gathering functionality.
|
|
902
|
+
*
|
|
903
|
+
* @packageDocumentation
|
|
904
|
+
*/
|
|
905
|
+
|
|
906
|
+
/**
 * Fetch an article page and gather its data in a single call.
 *
 * @remarks
 * High-level convenience method: it fetches the article page and extracts
 * the relevant data, handling encoding detection and redirects, and exposes
 * everything through one unified interface.
 *
 * The method will be extended incrementally to include metadata extraction,
 * content extraction, and more.
 *
 * @param url - Article URL, given as a string or a URL object
 * @returns Gathered article data including URL, content, metadata, language, and links
 * @throws Error if URL is invalid or fetch fails
 *
 * @example
 * ```typescript
 * // Fetch an article and inspect the gathered data
 * const article = await gatherArticle('https://example.com/article');
 * console.log(article.url);           // Authoritative URL (canonical if present, else final URL after redirects)
 * console.log(article.html);          // Raw HTML content (UTF-8)
 * console.log(article.text);          // Plain text (full page HTML converted)
 * console.log(article.content);       // Cleaned article content (Readability + htmlToText)
 * console.log(article.title);         // Article title (from Readability or metadata)
 * console.log(article.description);   // Article excerpt or description
 * console.log(article.image);         // Article keyvisual/image (from best source)
 * console.log(article.language);      // Language code (ISO 639-1, e.g., 'en')
 * console.log(article.region);        // Region code (ISO 3166-1 alpha-2, e.g., 'US')
 * console.log(article.internalLinks); // Array of internal link URLs
 * console.log(article.externalLinks); // Array of external link URLs
 * console.log(article.wordCount);     // Word count (from content or text)
 * console.log(article.readingTime);   // Estimated reading time in minutes
 * ```
 */
declare function gatherArticle(url: string | URL): Promise<Article>;
|
|
941
|
+
|
|
942
|
+
/**
|
|
943
|
+
* High-level feed gathering functionality.
|
|
944
|
+
*
|
|
945
|
+
* @packageDocumentation
|
|
946
|
+
*/
|
|
947
|
+
|
|
948
|
+
/**
 * Fetch a feed from a URL and parse it in a single call.
 *
 * @remarks
 * High-level convenience method that combines fetching and parsing.
 * Encoding detection, redirects, and feed format detection are handled
 * automatically.
 *
 * @param url - Feed URL, given as a string or a URL object
 * @returns Normalized feed data
 * @throws Error if URL is invalid, fetch fails, or feed cannot be parsed
 *
 * @example
 * ```typescript
 * // Fetch a feed and read the parsed result
 * const feed = await gatherFeed('https://example.com/feed.xml');
 *
 * console.log(feed.title);
 * console.log(feed.items[0].title);
 * console.log(feed.items[0].url);
 * ```
 */
declare function gatherFeed(url: string | URL): Promise<Feed>;
|
|
970
|
+
|
|
971
|
+
/**
|
|
972
|
+
* High-level website gathering functionality.
|
|
973
|
+
*
|
|
974
|
+
* @packageDocumentation
|
|
975
|
+
*/
|
|
976
|
+
|
|
977
|
+
/**
 * Fetch a website and gather its data in a single call.
 *
 * @remarks
 * High-level convenience method: it fetches the website and extracts all
 * relevant data, handling encoding detection and redirects, and exposes
 * everything through one unified interface.
 *
 * The method will be extended incrementally to include metadata extraction,
 * content extraction, and more.
 *
 * @param url - Website URL, given as a string or a URL object
 * @returns Gathered website data including final URL, title, description, image, icon, language, html, text, feeds, and links
 * @throws Error if URL is invalid or fetch fails
 *
 * @example
 * ```typescript
 * // Fetch a website and inspect the gathered data
 * const site = await gatherWebsite('https://example.com');
 * console.log(site.url);           // Final URL after redirects
 * console.log(site.title);         // Page title (cleaned, from best source)
 * console.log(site.description);   // Page description (from best source)
 * console.log(site.image);         // Page image/keyvisual (from best source)
 * console.log(site.icon);          // Best available icon/favicon
 * console.log(site.language);      // Primary language code (ISO 639-1)
 * console.log(site.region);        // Region code (ISO 3166-1 alpha-2)
 * console.log(site.html);          // Raw HTML content (UTF-8)
 * console.log(site.text);          // Plain text content (extracted from HTML)
 * console.log(site.feeds);         // Array of feed URL objects
 * console.log(site.internalLinks); // Array of internal link URL objects
 * console.log(site.externalLinks); // Array of external link URL objects
 * ```
 */
declare function gatherWebsite(url: string | URL): Promise<Website>;
|
|
1011
|
+
|
|
1012
|
+
/**
|
|
1013
|
+
* HTML parsing utilities using linkedom.
|
|
1014
|
+
*
|
|
1015
|
+
* @remarks
|
|
1016
|
+
* This module provides a simple wrapper around linkedom for consistent
|
|
1017
|
+
* HTML parsing across all metadata extraction modules. Parsing should happen
|
|
1018
|
+
* once at the top level and the parsed document passed to all extractors.
|
|
1019
|
+
*
|
|
1020
|
+
* @packageDocumentation
|
|
1021
|
+
*/
|
|
1022
|
+
/**
 * Turn an HTML string into a DOM document.
 *
 * @remarks
 * Parsing is done with linkedom, which provides a standards-compliant DOM
 * implementation. For performance, call this once per document and hand the
 * result to all metadata extractors.
 *
 * Never throws - a document is returned even for malformed HTML.
 *
 * @param html - HTML string to parse
 * @param baseUrl - Optional base URL for resolving relative URLs
 * @returns Parsed DOM document
 *
 * @example
 * ```typescript
 * const doc = parseHTML('<html><head><title>Test</title></head></html>');
 * const title = doc.querySelector('title')?.textContent;
 * ```
 */
declare function parseHTML(html: string, baseUrl?: string): Document;
|
|
1043
|
+
/** Alias for the DOM `Document` type; the `extract*` declarations in this file take their parsed document as this type. */
type HTMLDocument = Document;
|
|
1044
|
+
|
|
1045
|
+
/**
|
|
1046
|
+
* Analytics and tracking types.
|
|
1047
|
+
*
|
|
1048
|
+
* @remarks
|
|
1049
|
+
* Types for analytics service detection (IDs only, no tracking).
|
|
1050
|
+
*
|
|
1051
|
+
* @packageDocumentation
|
|
1052
|
+
*/
|
|
1053
|
+
/**
 * Analytics services detected on a page.
 *
 * @remarks
 * Holds the IDs of detected analytics services. Privacy-conscious: only the
 * IDs are extracted, no tracking is performed.
 */
interface AnalyticsMetadata {
    /** Tracking IDs for Google Analytics (`UA-`, `G-`, `GT-` prefixes). */
    googleAnalytics?: string[];

    /** Container IDs for Google Tag Manager. */
    googleTagManager?: string[];

    /** IDs of Facebook Pixels. */
    facebookPixel?: string[];

    /** Site IDs for Matomo/Piwik. */
    matomo?: string[];

    /** Domains registered with Plausible Analytics. */
    plausible?: string[];

    /** IDs for Adobe Analytics (Omniture). */
    adobe?: string[];

    /** Tokens for Cloudflare Web Analytics. */
    cloudflare?: string[];

    /** Site IDs for Fathom Analytics. */
    fathom?: string[];
}
|
|
1078
|
+
|
|
1079
|
+
/**
|
|
1080
|
+
* Analytics and tracking extraction.
|
|
1081
|
+
*
|
|
1082
|
+
* @remarks
|
|
1083
|
+
* Detects analytics service IDs from HTML documents.
|
|
1084
|
+
* Privacy-conscious - only extracts IDs, doesn't perform any tracking.
|
|
1085
|
+
*
|
|
1086
|
+
* @packageDocumentation
|
|
1087
|
+
*/
|
|
1088
|
+
|
|
1089
|
+
/**
 * Read analytics metadata out of a parsed HTML document.
 *
 * @remarks
 * Analytics service IDs are detected by examining script tags and their
 * content. Only identifiers are extracted; no user data is tracked or
 * collected.
 *
 * @param doc - Parsed HTML document
 * @returns Analytics metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const analytics = extractAnalytics(doc);
 * console.log(analytics.googleAnalytics);
 * console.log(analytics.googleTagManager);
 * ```
 */
declare function extractAnalytics(doc: HTMLDocument): AnalyticsMetadata;
|
|
1108
|
+
|
|
1109
|
+
/**
|
|
1110
|
+
* Assets extraction types.
|
|
1111
|
+
*
|
|
1112
|
+
* @remarks
|
|
1113
|
+
* Types for categorized asset URLs extracted from HTML documents.
|
|
1114
|
+
*
|
|
1115
|
+
* @author Anonyfox <max@anonyfox.com>
|
|
1116
|
+
* @license MIT
|
|
1117
|
+
* @see {@link https://github.com/Anonyfox/ravenjs}
|
|
1118
|
+
* @see {@link https://ravenjs.dev}
|
|
1119
|
+
* @see {@link https://anonyfox.com}
|
|
1120
|
+
*
|
|
1121
|
+
* @packageDocumentation
|
|
1122
|
+
*/
|
|
1123
|
+
/**
 * Assets extracted from HTML, grouped by category.
 *
 * @remarks
 * Covers all external assets referenced in the document, organized by type.
 * URLs are normalized to absolute format when a base URL is available.
 */
interface AssetsMetadata {
    /** URLs of images, from img, picture, srcset, and meta tags. */
    images?: string[];

    /** URLs of stylesheets, from link tags. */
    stylesheets?: string[];

    /** URLs of scripts, from script tags. */
    scripts?: string[];

    /** URLs of fonts, extracted from CSS. */
    fonts?: string[];

    /** URLs of media, from video, audio, source, and track elements. */
    media?: string[];

    /** URLs of web app manifests. */
    manifests?: string[];

    /** Resource hints of the preload/prefetch kind. */
    preloads?: PreloadResource[];

    /** Hints of the DNS prefetch and preconnect kind. */
    connectionHints?: ConnectionHint[];
}
|
|
1148
|
+
/**
 * A single preload or prefetch resource hint.
 */
interface PreloadResource {
    /** URL of the resource. */
    url: string;

    /** Type of the resource (script, style, font, image, etc.). */
    as?: string;

    /** MIME type of the resource. */
    type?: string;

    /** Value of the crossorigin attribute. */
    crossorigin?: string;

    /** `true` for a prefetch hint, `false` for a preload hint. */
    prefetch?: boolean;
}
|
|
1163
|
+
/**
 * A single DNS prefetch or preconnect hint.
 */
interface ConnectionHint {
    /** URL of the domain. */
    url: string;

    /** `true` for a preconnect hint, `false` for a dns-prefetch hint. */
    preconnect?: boolean;

    /** Value of the crossorigin attribute. */
    crossorigin?: string;
}
|
|
1174
|
+
|
|
1175
|
+
/**
|
|
1176
|
+
* Assets extraction.
|
|
1177
|
+
*
|
|
1178
|
+
* @remarks
|
|
1179
|
+
* Extracts categorized asset URLs from HTML documents.
|
|
1180
|
+
*
|
|
1181
|
+
* @author Anonyfox <max@anonyfox.com>
|
|
1182
|
+
* @license MIT
|
|
1183
|
+
* @see {@link https://github.com/Anonyfox/ravenjs}
|
|
1184
|
+
* @see {@link https://ravenjs.dev}
|
|
1185
|
+
* @see {@link https://anonyfox.com}
|
|
1186
|
+
*
|
|
1187
|
+
* @packageDocumentation
|
|
1188
|
+
*/
|
|
1189
|
+
|
|
1190
|
+
/**
 * Read assets metadata out of a parsed HTML document.
 *
 * @remarks
 * Collects all external assets referenced in the document and organizes
 * them by type. All URLs are normalized to absolute format based on the
 * document's base URL.
 *
 * Sources per category:
 * - Images: `<img>`, `<picture>`, `srcset`, OpenGraph meta tags
 * - Stylesheets: `<link rel="stylesheet">`
 * - Scripts: `<script src>`
 * - Fonts: CSS `@font-face` and `url()` with font extensions
 * - Media: `<video>`, `<audio>`, `<source>`, `<track>`
 * - Manifests: `<link rel="manifest">`
 * - Preloads: `<link rel="preload">` and `<link rel="prefetch">`
 * - Connection hints: `<link rel="dns-prefetch">` and `<link rel="preconnect">`
 *
 * @param doc - Parsed HTML document
 * @param baseUrl - Optional base URL for resolving relative URLs
 * @returns Assets metadata object with categorized URLs
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const assets = extractAssets(doc, 'https://example.com');
 * console.log(assets.images);
 * console.log(assets.stylesheets);
 * console.log(assets.scripts);
 * ```
 */
declare function extractAssets(doc: HTMLDocument, baseUrl?: string | URL | null): AssetsMetadata;
|
|
1221
|
+
|
|
1222
|
+
/**
|
|
1223
|
+
* Canonical and alternate URL metadata types.
|
|
1224
|
+
*
|
|
1225
|
+
* @remarks
|
|
1226
|
+
* URL relationships, internationalization, and special versions.
|
|
1227
|
+
*
|
|
1228
|
+
* @packageDocumentation
|
|
1229
|
+
*/
|
|
1230
|
+
/**
 * Relationship to an alternate URL.
 */
interface AlternateLink {
    /** URL of the alternate version. */
    href: string;

    /** Language/locale code (hreflang). */
    hreflang?: string;

    /** MIME type of the alternate version. */
    type?: string;

    /** Title or description of the alternate version. */
    title?: string;
}
|
|
1243
|
+
/**
 * Deep-linking metadata for apps.
 */
interface AppLinks {
    /** App URL for iOS. */
    ios?: string;

    /** App URL for Android. */
    android?: string;

    /** Fallback URL for the web. */
    web?: string;
}
|
|
1254
|
+
/**
 * Metadata about canonical and alternate URLs.
 *
 * @remarks
 * Holds canonical URLs, language alternates, special versions (AMP), and
 * app linking metadata.
 */
interface CanonicalMetadata {
    /** The canonical URL of this page. */
    canonical?: string;

    /** Alternates by language/region. */
    alternates?: AlternateLink[];

    /** URL of the AMP (Accelerated Mobile Pages) version. */
    amphtml?: string;

    /** URL of the web app manifest. */
    manifest?: string;

    /** Deep-linking URLs for apps. */
    appLinks?: AppLinks;
}
|
|
1273
|
+
|
|
1274
|
+
/**
|
|
1275
|
+
* Canonical and alternate URL extraction.
|
|
1276
|
+
*
|
|
1277
|
+
* @remarks
|
|
1278
|
+
* Extracts canonical URLs, alternates, and special versions from HTML documents.
|
|
1279
|
+
*
|
|
1280
|
+
* @packageDocumentation
|
|
1281
|
+
*/
|
|
1282
|
+
|
|
1283
|
+
/**
 * Read canonical and alternate URL metadata out of a parsed HTML document.
 *
 * @remarks
 * Covers canonical URLs, language alternates, AMP versions, manifests, and
 * app linking metadata.
 *
 * @param doc - Parsed HTML document
 * @returns Canonical metadata object
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const canonical = extractCanonical(doc);
 * console.log(canonical.canonical);
 * console.log(canonical.alternates);
 * ```
 */
declare function extractCanonical(doc: HTMLDocument): CanonicalMetadata;
|
|
1302
|
+
|
|
1303
|
+
/**
|
|
1304
|
+
* Copyright and licensing types.
|
|
1305
|
+
*
|
|
1306
|
+
* @remarks
|
|
1307
|
+
* Types for copyright and content licensing information.
|
|
1308
|
+
*
|
|
1309
|
+
* @packageDocumentation
|
|
1310
|
+
*/
|
|
1311
|
+
/**
 * Metadata about copyright and licensing.
 *
 * @remarks
 * Holds copyright and license information gathered from various sources.
 */
interface CopyrightMetadata {
    /** The copyright notice. */
    copyright?: string;

    /** URL or identifier of the license. */
    license?: string;

    /** Holder/owner of the copyright. */
    holder?: string;

    /** Year of the copyright. */
    year?: string;
}
|
|
1327
|
+
|
|
1328
|
+
/**
|
|
1329
|
+
* Copyright and licensing extraction.
|
|
1330
|
+
*
|
|
1331
|
+
* @remarks
|
|
1332
|
+
* Extracts copyright and license metadata from HTML documents.
|
|
1333
|
+
*
|
|
1334
|
+
* @packageDocumentation
|
|
1335
|
+
*/
|
|
1336
|
+
|
|
1337
|
+
/**
 * Read copyright metadata out of a parsed HTML document.
 *
 * @remarks
 * Copyright and licensing information is taken from meta tags, link tags,
 * and Schema.org structured data.
 *
 * @param doc - Parsed HTML document
 * @returns Copyright metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const copyright = extractCopyright(doc);
 * console.log(copyright.copyright);
 * console.log(copyright.license);
 * ```
 */
declare function extractCopyright(doc: HTMLDocument): CopyrightMetadata;
|
|
1356
|
+
|
|
1357
|
+
/**
|
|
1358
|
+
* Dublin Core metadata types.
|
|
1359
|
+
*
|
|
1360
|
+
* @remarks
|
|
1361
|
+
* Library and academic metadata standard.
|
|
1362
|
+
*
|
|
1363
|
+
* @packageDocumentation
|
|
1364
|
+
*/
|
|
1365
|
+
/**
 * Dublin Core metadata read from meta tags.
 *
 * @remarks
 * Metadata following the Dublin Core standard, which is common in academic
 * and library contexts. Both the `DC.` and `dcterms.` prefixes are supported.
 */
interface DublinCoreMetadata {
    /** Title of the resource. */
    title?: string;

    /** Entities responsible for making the resource (authors, creators). */
    creator?: string[];

    /** Topics or subjects of the resource. */
    subject?: string[];

    /** Description of the resource. */
    description?: string;

    /** Entity responsible for making the resource available. */
    publisher?: string;

    /** Entities responsible for contributions to the resource. */
    contributor?: string[];

    /** Date the resource was created/published. */
    date?: string;

    /** Nature or genre of the resource. */
    type?: string;

    /** File format, physical medium, or dimensions. */
    format?: string;

    /** Unambiguous reference to the resource. */
    identifier?: string;

    /** Related resource from which the described resource is derived. */
    source?: string;

    /** Language of the resource. */
    language?: string;

    /** A related resource. */
    relation?: string;

    /** Spatial or temporal topic, location, or period. */
    coverage?: string;

    /** Information about rights held in and over the resource. */
    rights?: string;
}
|
|
1404
|
+
|
|
1405
|
+
/**
|
|
1406
|
+
* Dublin Core metadata extraction.
|
|
1407
|
+
*
|
|
1408
|
+
* @remarks
|
|
1409
|
+
* Extracts Dublin Core metadata from HTML documents.
|
|
1410
|
+
*
|
|
1411
|
+
* @packageDocumentation
|
|
1412
|
+
*/
|
|
1413
|
+
|
|
1414
|
+
/**
 * Read Dublin Core metadata out of a parsed HTML document.
 *
 * @remarks
 * Both the `DC.` and `dcterms.` prefixes are recognized. Fields that may
 * carry multiple values (creator, subject, contributor) are returned as
 * arrays.
 *
 * @param doc - Parsed HTML document
 * @returns Dublin Core metadata object
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const dc = extractDublinCore(doc);
 * console.log(dc.title);
 * console.log(dc.creator);
 * ```
 */
declare function extractDublinCore(doc: HTMLDocument): DublinCoreMetadata;
|
|
1434
|
+
|
|
1435
|
+
/**
|
|
1436
|
+
* Feed discovery types.
|
|
1437
|
+
*
|
|
1438
|
+
* @remarks
|
|
1439
|
+
* Types for discovering RSS, Atom, and JSON feeds.
|
|
1440
|
+
*
|
|
1441
|
+
* @packageDocumentation
|
|
1442
|
+
*/
|
|
1443
|
+
/**
 * Information about one discovered feed.
 */
interface DiscoveredFeed {
    /** URL of the feed. */
    url: string;

    /** Type of the feed. */
    type: 'rss' | 'atom' | 'json' | 'unknown';

    /** Title of the feed (if provided in the link tag). */
    title?: string;
}
|
|
1454
|
+
/**
 * Result of feed discovery.
 *
 * @remarks
 * Holds every discovered feed plus suggested feed URLs derived from common
 * patterns.
 */
interface FeedDiscoveryMetadata {
    /** Feeds explicitly declared in `<link>` tags. */
    feeds: DiscoveredFeed[];

    /** Suggested feed URLs derived from common patterns (not verified). */
    suggestions?: string[];
}
|
|
1466
|
+
|
|
1467
|
+
/**
|
|
1468
|
+
* Feed discovery extraction.
|
|
1469
|
+
*
|
|
1470
|
+
* @remarks
|
|
1471
|
+
* Discovers RSS, Atom, and JSON feeds in HTML documents.
|
|
1472
|
+
*
|
|
1473
|
+
* @packageDocumentation
|
|
1474
|
+
*/
|
|
1475
|
+
|
|
1476
|
+
/**
 * Read feed discovery metadata out of a parsed HTML document.
 *
 * @remarks
 * Collects every feed declared in `<link rel="alternate">` tags and
 * generates suggestions for common feed URL patterns.
 *
 * @param doc - Parsed HTML document
 * @param documentUrl - Optional document URL for generating absolute feed suggestions
 * @returns Feed discovery metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const feeds = extractFeedDiscovery(doc, 'https://example.com');
 * console.log(feeds.feeds);       // Discovered feeds
 * console.log(feeds.suggestions); // Suggested feed URLs
 * ```
 */
declare function extractFeedDiscovery(doc: HTMLDocument, documentUrl?: string | URL): FeedDiscoveryMetadata;
|
|
1496
|
+
|
|
1497
|
+
/**
|
|
1498
|
+
* Geographic location types.
|
|
1499
|
+
*
|
|
1500
|
+
* @remarks
|
|
1501
|
+
* Types for geographic location metadata.
|
|
1502
|
+
*
|
|
1503
|
+
* @packageDocumentation
|
|
1504
|
+
*/
|
|
1505
|
+
/**
 * A pair of geographic coordinates.
 *
 * @remarks
 * Latitude and longitude.
 */
interface GeoPosition {
    /** Latitude, in decimal degrees. */
    latitude: number;

    /** Longitude, in decimal degrees. */
    longitude: number;
}
|
|
1517
|
+
/**
 * Metadata about geographic location.
 *
 * @remarks
 * Holds geographic location information taken from meta tags.
 */
interface GeoMetadata {
    /** Position as latitude/longitude. */
    position?: GeoPosition;

    /** Name of the place. */
    placename?: string;

    /** Code of the region (e.g., US-CA for California, USA). */
    region?: string;

    /** Name or code of the country. */
    country?: string;
}
|
|
1533
|
+
|
|
1534
|
+
/**
|
|
1535
|
+
* Geographic location extraction.
|
|
1536
|
+
*
|
|
1537
|
+
* @remarks
|
|
1538
|
+
* Extracts geographic location metadata from HTML documents.
|
|
1539
|
+
*
|
|
1540
|
+
* @packageDocumentation
|
|
1541
|
+
*/
|
|
1542
|
+
|
|
1543
|
+
/**
 * Read geographic metadata out of a parsed HTML document.
 *
 * @remarks
 * Geographic location information - coordinates, place names, and region
 * codes - is taken from meta tags.
 *
 * @param doc - Parsed HTML document
 * @returns Geographic metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const geo = extractGeo(doc);
 * console.log(geo.position?.latitude);
 * console.log(geo.placename);
 * ```
 */
declare function extractGeo(doc: HTMLDocument): GeoMetadata;
|
|
1562
|
+
|
|
1563
|
+
/**
|
|
1564
|
+
* Icons and visual assets types.
|
|
1565
|
+
*
|
|
1566
|
+
* @remarks
|
|
1567
|
+
* Types for favicons, app icons, and visual branding.
|
|
1568
|
+
*
|
|
1569
|
+
* @packageDocumentation
|
|
1570
|
+
*/
|
|
1571
|
+
/**
 * Metadata for one Apple touch icon.
 */
interface AppleTouchIcon {
    /** URL of the icon. */
    url: string;

    /** Size of the icon (e.g., "180x180"). */
    sizes?: string;

    /** Whether the icon is precomposed (no effects applied). */
    precomposed?: boolean;
}
|
|
1582
|
+
/**
 * Metadata for a Safari mask icon.
 */
interface MaskIcon {
    /** URL of the SVG icon. */
    url: string;

    /** Color of the icon. */
    color?: string;
}
|
|
1591
|
+
/**
 * Metadata for a Microsoft tile.
 */
interface MSTile {
    /** URL of the tile image. */
    image?: string;

    /** Background color of the tile. */
    color?: string;

    /** URL of the Microsoft browserconfig XML. */
    config?: string;
}
|
|
1602
|
+
/**
 * Metadata about icons and visual assets.
 *
 * @remarks
 * Holds all icon-related metadata: favicons, app icons, and
 * platform-specific icons.
 */
interface IconsMetadata {
    /** The standard favicon. */
    favicon?: string;

    /** The shortcut icon (legacy). */
    shortcutIcon?: string;

    /** Apple touch icons, for iOS. */
    appleTouchIcons?: AppleTouchIcon[];

    /** The Safari pinned tab icon. */
    maskIcon?: MaskIcon;

    /** The Microsoft tile configuration. */
    msTile?: MSTile;

    /** The fluid icon (legacy). */
    fluidIcon?: string;
}
|
|
1623
|
+
|
|
1624
|
+
/**
|
|
1625
|
+
* Icons and visual assets extraction.
|
|
1626
|
+
*
|
|
1627
|
+
* @remarks
|
|
1628
|
+
* Extracts icon metadata from HTML documents.
|
|
1629
|
+
*
|
|
1630
|
+
* @packageDocumentation
|
|
1631
|
+
*/
|
|
1632
|
+
|
|
1633
|
+
/**
 * Reads icon metadata out of a parsed HTML document.
 *
 * @remarks
 * Covers all icon-related metadata: favicons, Apple touch icons,
 * Safari mask icons, and the Microsoft tile configuration.
 *
 * @param doc - The parsed HTML document
 * @returns The icons metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const icons = extractIcons(doc);
 * console.log(icons.favicon);
 * console.log(icons.appleTouchIcons);
 * ```
 */
declare function extractIcons(doc: HTMLDocument): IconsMetadata;
|
|
1652
|
+
|
|
1653
|
+
/**
|
|
1654
|
+
* Language and localization types.
|
|
1655
|
+
*
|
|
1656
|
+
* @remarks
|
|
1657
|
+
* Types for language and locale metadata.
|
|
1658
|
+
*
|
|
1659
|
+
* @packageDocumentation
|
|
1660
|
+
*/
|
|
1661
|
+
/**
 * Metadata about language and localization.
 *
 * @remarks
 * Language and locale information gathered from several sources: the
 * HTML lang attribute, meta tags, and the OpenGraph locale.
 */
interface LanguageMetadata {
  /** Value of the lang attribute on the `<html>` tag */
  htmlLang?: string;
  /** Value of the Content-Language meta tag */
  contentLanguage?: string;
  /** The OpenGraph locale */
  ogLocale?: string;
  /** The OpenGraph alternate locales */
  alternateLocales?: string[];
  /** Best-guess primary language, normalized to ISO 639-1 */
  primary?: string;
  /** ISO 3166-1 alpha-2 region code, when available */
  region?: string;
}
|
|
1682
|
+
|
|
1683
|
+
/**
|
|
1684
|
+
* Language and localization extraction.
|
|
1685
|
+
*
|
|
1686
|
+
* @remarks
|
|
1687
|
+
* Extracts language and locale metadata from HTML documents.
|
|
1688
|
+
*
|
|
1689
|
+
* @packageDocumentation
|
|
1690
|
+
*/
|
|
1691
|
+
|
|
1692
|
+
/**
 * Reads language and localization metadata out of a parsed HTML document.
 *
 * @remarks
 * Language information is taken from the HTML lang attribute, meta tags,
 * and the OpenGraph locale, then normalized into a primary language and
 * a region.
 *
 * @param doc - The parsed HTML document
 * @returns The language metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const lang = extractLanguage(doc);
 * console.log(lang.primary); // 'en'
 * console.log(lang.region); // 'US'
 * ```
 */
declare function extractLanguage(doc: HTMLDocument): LanguageMetadata;
|
|
1711
|
+
|
|
1712
|
+
/**
|
|
1713
|
+
* Links extraction types.
|
|
1714
|
+
*
|
|
1715
|
+
* @remarks
|
|
1716
|
+
* Types for navigational link extraction and analysis.
|
|
1717
|
+
*
|
|
1718
|
+
* @author Anonyfox <max@anonyfox.com>
|
|
1719
|
+
* @license MIT
|
|
1720
|
+
* @see {@link https://github.com/Anonyfox/ravenjs}
|
|
1721
|
+
* @see {@link https://ravenjs.dev}
|
|
1722
|
+
* @see {@link https://anonyfox.com}
|
|
1723
|
+
*
|
|
1724
|
+
* @packageDocumentation
|
|
1725
|
+
*/
|
|
1726
|
+
/**
 * One extracted link together with its metadata.
 *
 * @remarks
 * Models a single hyperlink and all of its relevant attributes.
 * When a base URL is available, URLs are normalized to absolute form.
 */
interface ExtractedLink {
  /** The link's absolute URL */
  url: string;
  /** Visible text content of the anchor */
  text?: string;
  /** Value of the title attribute */
  title?: string;
  /** Value of the rel attribute */
  rel?: string;
  /** Value of the target attribute (_blank, _self, etc.) */
  target?: string;
  /** True for an internal link (same origin) */
  internal?: boolean;
  /** True for an external link (different origin) */
  external?: boolean;
  /** True when the link carries the nofollow rel */
  nofollow?: boolean;
  /** True when the link carries the ugc (User Generated Content) rel */
  ugc?: boolean;
  /** True when the link carries the sponsored rel */
  sponsored?: boolean;
  /** True when the link carries the noopener rel */
  noopener?: boolean;
  /** True when the link carries the noreferrer rel */
  noreferrer?: boolean;
}
|
|
1759
|
+
/**
 * Options that control links extraction.
 */
interface LinksExtractionOptions {
  /**
   * Restricts links by scope.
   *
   * @remarks
   * - `'all'` - Every link is returned (default)
   * - `'internal'` - Only links pointing to the same origin
   * - `'external'` - Only links pointing to different origins
   */
  scope?: 'all' | 'internal' | 'external';
  /**
   * Drops links that carry specific rel attributes.
   *
   * @remarks
   * Lets crawlers skip nofollow, sponsored, or UGC links.
   *
   * @example
   * ```typescript
   * // Skip nofollow and sponsored links
   * { excludeRel: ['nofollow', 'sponsored'] }
   * ```
   */
  excludeRel?: ('nofollow' | 'noopener' | 'noreferrer' | 'ugc' | 'sponsored')[];
  /**
   * Keeps only links that carry specific rel attributes.
   *
   * @remarks
   * When specified, a link is included only if it matches these rel values.
   */
  includeRel?: ('nofollow' | 'noopener' | 'noreferrer' | 'ugc' | 'sponsored')[];
  /**
   * Controls whether hash-only links (#anchor) are included.
   *
   * @default false
   */
  includeHashLinks?: boolean;
  /**
   * Controls whether URLs are deduplicated.
   *
   * @remarks
   * When true, only unique URLs are returned; the first occurrence is kept.
   *
   * @default true
   */
  deduplicate?: boolean;
  /**
   * Upper bound on the number of links to extract.
   *
   * @remarks
   * Helps limit extraction on large pages.
   */
  limit?: number;
}
|
|
1815
|
+
/**
 * Links metadata pulled from HTML.
 *
 * @remarks
 * Holds the document's links after categorization and analysis.
 */
interface LinksMetadata {
  /** Every extracted link */
  all?: ExtractedLink[];
  /** Links to the same origin (internal) */
  internal?: ExtractedLink[];
  /** Links to a different origin (external) */
  external?: ExtractedLink[];
  /** Links carrying the nofollow rel */
  nofollow?: ExtractedLink[];
  /** Total number of links found */
  totalCount?: number;
  /** Number of internal links */
  internalCount?: number;
  /** Number of external links */
  externalCount?: number;
  /** Number of nofollow links */
  nofollowCount?: number;
}
|
|
1839
|
+
|
|
1840
|
+
/**
|
|
1841
|
+
* Links extraction.
|
|
1842
|
+
*
|
|
1843
|
+
* @remarks
|
|
1844
|
+
* Extract navigational links from HTML documents with advanced filtering
|
|
1845
|
+
* and categorization for crawler and SEO use cases.
|
|
1846
|
+
*
|
|
1847
|
+
* @author Anonyfox <max@anonyfox.com>
|
|
1848
|
+
* @license MIT
|
|
1849
|
+
* @see {@link https://github.com/Anonyfox/ravenjs}
|
|
1850
|
+
* @see {@link https://ravenjs.dev}
|
|
1851
|
+
* @see {@link https://anonyfox.com}
|
|
1852
|
+
*
|
|
1853
|
+
* @packageDocumentation
|
|
1854
|
+
*/
|
|
1855
|
+
|
|
1856
|
+
/**
 * Reads links out of a parsed HTML document.
 *
 * @remarks
 * Collects all `<a href>` links along with comprehensive metadata, and
 * supports filtering options. Suited to crawlers, SEO analysis, and link
 * discovery.
 *
 * Features:
 * - Categorization into internal/external links
 * - Filtering by rel attribute (nofollow, ugc, sponsored, etc.)
 * - Automatic normalization of URLs
 * - Filtering of hash links
 * - Filtering by scheme (only http/https)
 * - Deduplication
 * - Extraction of link text
 *
 * @param doc - The parsed HTML document
 * @param baseUrl - Base URL used to resolve relative links and to decide internal vs. external
 * @param options - Extraction options for filtering and categorization
 * @returns Links metadata containing the categorized links
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const links = extractLinks(doc, 'https://example.com');
 *
 * // Get all internal links (same origin)
 * console.log(links.internal);
 *
 * // Get external links excluding nofollow
 * const linksNoFollow = extractLinks(doc, 'https://example.com', {
 *   scope: 'external',
 *   excludeRel: ['nofollow']
 * });
 * ```
 *
 * @example
 * ```typescript
 * // Crawler use case - get follow-able links
 * const links = extractLinks(doc, baseUrl, {
 *   excludeRel: ['nofollow', 'ugc', 'sponsored'],
 *   includeHashLinks: false
 * });
 * ```
 */
declare function extractLinks(doc: HTMLDocument, baseUrl?: string | URL | null, options?: LinksExtractionOptions): LinksMetadata;
|
|
1902
|
+
|
|
1903
|
+
/**
|
|
1904
|
+
* Monetization and payment types.
|
|
1905
|
+
*
|
|
1906
|
+
* @remarks
|
|
1907
|
+
* Types for web monetization and payment metadata.
|
|
1908
|
+
*
|
|
1909
|
+
* @packageDocumentation
|
|
1910
|
+
*/
|
|
1911
|
+
/**
 * Metadata about monetization.
 *
 * @remarks
 * Holds web monetization and payment verification metadata.
 */
interface MonetizationMetadata {
  /** Payment pointer for the Web Monetization API */
  webMonetization?: string;
  /** Site verification token for PayPal */
  paypalVerification?: string;
  /** Verification token for Brave Creator */
  braveCreator?: string;
  /** Legacy Coil payment pointer */
  coil?: string;
  /** A Bitcoin address */
  bitcoin?: string;
  /** An Ethereum address */
  ethereum?: string;
}
|
|
1931
|
+
|
|
1932
|
+
/**
|
|
1933
|
+
* Monetization and payment extraction.
|
|
1934
|
+
*
|
|
1935
|
+
* @remarks
|
|
1936
|
+
* Extracts web monetization and payment metadata from HTML documents.
|
|
1937
|
+
*
|
|
1938
|
+
* @packageDocumentation
|
|
1939
|
+
*/
|
|
1940
|
+
|
|
1941
|
+
/**
 * Reads monetization metadata out of a parsed HTML document.
 *
 * @remarks
 * Web monetization values, payment verification values, and
 * cryptocurrency addresses are taken from meta tags and link tags.
 *
 * @param doc - The parsed HTML document
 * @returns The monetization metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const monetization = extractMonetization(doc);
 * console.log(monetization.webMonetization);
 * console.log(monetization.bitcoin);
 * ```
 */
declare function extractMonetization(doc: HTMLDocument): MonetizationMetadata;
|
|
1960
|
+
|
|
1961
|
+
/**
|
|
1962
|
+
* News and press types.
|
|
1963
|
+
*
|
|
1964
|
+
* @remarks
|
|
1965
|
+
* Types for news-specific metadata.
|
|
1966
|
+
*
|
|
1967
|
+
* @packageDocumentation
|
|
1968
|
+
*/
|
|
1969
|
+
/**
 * Metadata about news content.
 *
 * @remarks
 * Holds news-specific metadata for articles and press releases.
 */
interface NewsMetadata {
  /** Keywords for news, distinct from the regular keywords */
  keywords?: string[];
  /** The Google News standout tag, which indicates exceptional journalism */
  standout?: string;
  /** Source of syndication (the original publisher) */
  syndicationSource?: string;
  /** URL of the original source */
  originalSource?: string;
}
|
|
1985
|
+
|
|
1986
|
+
/**
|
|
1987
|
+
* News and press extraction.
|
|
1988
|
+
*
|
|
1989
|
+
* @remarks
|
|
1990
|
+
* Extracts news-specific metadata from HTML documents.
|
|
1991
|
+
*
|
|
1992
|
+
* @packageDocumentation
|
|
1993
|
+
*/
|
|
1994
|
+
|
|
1995
|
+
/**
 * Reads news metadata out of a parsed HTML document.
 *
 * @remarks
 * Covers news-specific metadata: keywords, standout tags, and
 * syndication information.
 *
 * @param doc - The parsed HTML document
 * @returns The news metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const news = extractNews(doc);
 * console.log(news.keywords);
 * console.log(news.standout);
 * ```
 */
declare function extractNews(doc: HTMLDocument): NewsMetadata;
|
|
2014
|
+
|
|
2015
|
+
/**
|
|
2016
|
+
* OpenGraph metadata types.
|
|
2017
|
+
*
|
|
2018
|
+
* @remarks
|
|
2019
|
+
* Facebook's Open Graph protocol for rich social sharing.
|
|
2020
|
+
*
|
|
2021
|
+
* @packageDocumentation
|
|
2022
|
+
*/
|
|
2023
|
+
/**
 * Article metadata from OpenGraph.
 */
interface OpenGraphArticle {
  /** Date of publication */
  publishedTime?: string;
  /** Date of the last modification */
  modifiedTime?: string;
  /** Date of expiration */
  expirationTime?: string;
  /** Authors of the article */
  authors?: string[];
  /** Section/category of the article */
  section?: string;
  /** Tags of the article */
  tags?: string[];
}
|
|
2040
|
+
/**
 * Video metadata from OpenGraph.
 */
interface OpenGraphVideo {
  /** URL of the video */
  url?: string;
  /** HTTPS URL of the video */
  secureUrl?: string;
  /** The MIME type */
  type?: string;
  /** Width of the video, in pixels */
  width?: number;
  /** Height of the video, in pixels */
  height?: number;
  /** Duration of the video, in seconds */
  duration?: number;
  /** Date of release */
  releaseDate?: string;
  /** Tags of the video */
  tags?: string[];
}
|
|
2061
|
+
/**
 * Audio metadata from OpenGraph.
 */
interface OpenGraphAudio {
  /** URL of the audio */
  url?: string;
  /** HTTPS URL of the audio */
  secureUrl?: string;
  /** The MIME type */
  type?: string;
}
|
|
2072
|
+
/**
 * Image metadata from OpenGraph.
 */
interface OpenGraphImage {
  /** URL of the image */
  url: string;
  /** HTTPS URL of the image */
  secureUrl?: string;
  /** The MIME type */
  type?: string;
  /** Width of the image, in pixels */
  width?: number;
  /** Height of the image, in pixels */
  height?: number;
  /** The alt text */
  alt?: string;
}
|
|
2089
|
+
/**
 * Book metadata from OpenGraph.
 */
interface OpenGraphBook {
  /** Authors of the book */
  authors?: string[];
  /** The ISBN number */
  isbn?: string;
  /** Date of release */
  releaseDate?: string;
  /** Tags of the book */
  tags?: string[];
}
|
|
2102
|
+
/**
 * Profile metadata from OpenGraph.
 */
interface OpenGraphProfile {
  /** The first name */
  firstName?: string;
  /** The last name */
  lastName?: string;
  /** The username */
  username?: string;
  /** The gender */
  gender?: string;
}
|
|
2115
|
+
/**
 * OpenGraph metadata pulled from meta tags.
 *
 * @remarks
 * Holds metadata from the Open Graph protocol, which is used for rich
 * social sharing. Every field is optional and is present only when it was
 * found in the document.
 */
interface OpenGraphMetadata {
  /** Title of the content */
  title?: string;
  /** Type of the content (article, website, video, etc.) */
  type?: string;
  /** URL of the preview image (the primary image) */
  image?: string;
  /** The canonical URL */
  url?: string;
  /** Description of the content */
  description?: string;
  /** Name of the site */
  siteName?: string;
  /** Locale of the content (e.g., en_US) */
  locale?: string;
  /** The alternate locales */
  localeAlternate?: string[];
  /** Metadata specific to articles (if type is article) */
  article?: OpenGraphArticle;
  /** Metadata for video (if type is video or video present) */
  video?: OpenGraphVideo;
  /** Metadata for audio (if audio present) */
  audio?: OpenGraphAudio;
  /** Every image with its full metadata (if multiple images) */
  images?: OpenGraphImage[];
  /** Metadata for a book (if type is book) */
  book?: OpenGraphBook;
  /** Metadata for a profile (if type is profile) */
  profile?: OpenGraphProfile;
}
|
|
2152
|
+
|
|
2153
|
+
/**
|
|
2154
|
+
* OpenGraph metadata extraction.
|
|
2155
|
+
*
|
|
2156
|
+
* @remarks
|
|
2157
|
+
* Extracts Open Graph protocol metadata from HTML documents.
|
|
2158
|
+
*
|
|
2159
|
+
* @packageDocumentation
|
|
2160
|
+
*/
|
|
2161
|
+
|
|
2162
|
+
/**
 * Reads OpenGraph metadata out of a parsed HTML document.
 *
 * @remarks
 * Covers Open Graph protocol metadata: the basic metadata, article data,
 * video/audio, images, books, and profiles.
 *
 * @param doc - The parsed HTML document
 * @returns The OpenGraph metadata object
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const og = extractOpenGraph(doc);
 * console.log(og.title);
 * console.log(og.image);
 * console.log(og.article?.publishedTime);
 * ```
 */
declare function extractOpenGraph(doc: HTMLDocument): OpenGraphMetadata;
|
|
2182
|
+
|
|
2183
|
+
/**
|
|
2184
|
+
* Pagination metadata types.
|
|
2185
|
+
*
|
|
2186
|
+
* @remarks
|
|
2187
|
+
* Types for multi-page content navigation.
|
|
2188
|
+
*
|
|
2189
|
+
* @packageDocumentation
|
|
2190
|
+
*/
|
|
2191
|
+
/**
 * Metadata about pagination.
 *
 * @remarks
 * Holds the navigation links of a multi-page content series.
 */
interface PaginationMetadata {
  /** URL of the previous page */
  prev?: string;
  /** URL of the next page */
  next?: string;
  /** URL of the first page */
  first?: string;
  /** URL of the last page */
  last?: string;
  /** URL of the parent/up level */
  up?: string;
  /** URL of the index/table of contents */
  index?: string;
}
|
|
2211
|
+
|
|
2212
|
+
/**
|
|
2213
|
+
* Pagination metadata extraction.
|
|
2214
|
+
*
|
|
2215
|
+
* @remarks
|
|
2216
|
+
* Extracts pagination navigation links from HTML documents.
|
|
2217
|
+
*
|
|
2218
|
+
* @packageDocumentation
|
|
2219
|
+
*/
|
|
2220
|
+
|
|
2221
|
+
/**
 * Reads pagination metadata out of a parsed HTML document.
 *
 * @remarks
 * Covers the pagination navigation links: prev, next, first, last,
 * up (parent), and index.
 *
 * @param doc - The parsed HTML document
 * @returns The pagination metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const pagination = extractPagination(doc);
 * console.log(pagination.prev); // Previous page URL
 * console.log(pagination.next); // Next page URL
 * ```
 */
declare function extractPagination(doc: HTMLDocument): PaginationMetadata;
|
|
2240
|
+
|
|
2241
|
+
/**
|
|
2242
|
+
* Robots and crawling directives types.
|
|
2243
|
+
*
|
|
2244
|
+
* @remarks
|
|
2245
|
+
* Types for robot crawling and indexing directives.
|
|
2246
|
+
*
|
|
2247
|
+
* @packageDocumentation
|
|
2248
|
+
*/
|
|
2249
|
+
/**
 * Robot directives after parsing.
 */
interface RobotDirectives {
  /** Indexing is allowed */
  index?: boolean;
  /** Following links is allowed */
  follow?: boolean;
  /** Archiving/caching is prevented */
  noarchive?: boolean;
  /** Showing snippets is prevented */
  nosnippet?: boolean;
  /** Indexing images is prevented */
  noimageindex?: boolean;
  /** Maximum length of a snippet, in characters */
  maxSnippet?: number;
  /** Maximum size of an image preview */
  maxImagePreview?: string;
  /** Maximum length of a video preview, in seconds */
  maxVideoPreview?: number;
  /** Translation is prevented */
  notranslate?: boolean;
  /** Date after which the content is unavailable */
  unavailableAfter?: string;
}
|
|
2274
|
+
/**
 * Metadata about robots and crawling.
 *
 * @remarks
 * Holds the robot directives aimed at search engines and crawlers.
 */
interface RobotsMetadata {
  /** Directives for robots in general */
  robots?: RobotDirectives;
  /** Directives specific to Google */
  googlebot?: RobotDirectives;
  /** Directives specific to Bing */
  bingbot?: RobotDirectives;
  /** Directives specific to Google News */
  googlebotNews?: RobotDirectives;
}
|
|
2290
|
+
|
|
2291
|
+
/**
|
|
2292
|
+
* Robots and crawling directives extraction.
|
|
2293
|
+
*
|
|
2294
|
+
* @remarks
|
|
2295
|
+
* Extracts robot crawling and indexing directives from HTML documents.
|
|
2296
|
+
*
|
|
2297
|
+
* @packageDocumentation
|
|
2298
|
+
*/
|
|
2299
|
+
|
|
2300
|
+
/**
 * Reads robots metadata out of a parsed HTML document.
 *
 * @remarks
 * Robot directives are taken from the meta tags for general robots,
 * Googlebot, Bingbot, and the Google News bot.
 *
 * @param doc - The parsed HTML document
 * @returns The robots metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const robots = extractRobots(doc);
 * console.log(robots.robots?.index); // true/false
 * console.log(robots.robots?.follow); // true/false
 * ```
 */
declare function extractRobots(doc: HTMLDocument): RobotsMetadata;
|
|
2319
|
+
|
|
2320
|
+
/**
|
|
2321
|
+
* Schema.org / JSON-LD metadata types.
|
|
2322
|
+
*
|
|
2323
|
+
* @remarks
|
|
2324
|
+
* Structured data for search engines and rich snippets using JSON-LD format.
|
|
2325
|
+
*
|
|
2326
|
+
* @packageDocumentation
|
|
2327
|
+
*/
|
|
2328
|
+
/**
 * A single JSON-LD block found in the document.
 */
interface JsonLdBlock {
  /** Original JSON string */
  raw: string;
  /** Parsed JSON object */
  parsed: unknown;
  /**
   * The `@type` field(s) from the JSON-LD.
   *
   * The tag name is written as inline code so documentation tooling does
   * not read it as a block tag and drop the description.
   */
  type?: string | string[];
  /**
   * JSON-LD context field.
   *
   * Typed as `unknown`, which is exactly what a union of `string` with
   * `unknown` resolves to; narrow the value before using it.
   */
  context?: unknown;
}
|
|
2341
|
+
/**
 * Schema.org metadata pulled from JSON-LD scripts.
 *
 * @remarks
 * Holds every JSON-LD structured data block found in the document, plus
 * convenience accessors for common types.
 */
interface SchemaOrgMetadata {
  /** Every JSON-LD block found in the document */
  jsonLd: JsonLdBlock[];
  /** Convenience accessor: Article/NewsArticle/BlogPosting types */
  articles?: unknown[];
  /** Convenience accessor: WebPage/WebSite types */
  webPages?: unknown[];
  /** Convenience accessor: BreadcrumbList type */
  breadcrumbs?: unknown[];
  /** Convenience accessor: Organization type */
  organization?: unknown;
  /** Convenience accessor: Person type */
  person?: unknown;
  /** Convenience accessor: Product types */
  products?: unknown[];
  /** Convenience accessor: Event types */
  events?: unknown[];
  /** Convenience accessor: Recipe types */
  recipes?: unknown[];
  /** Convenience accessor: VideoObject types */
  videos?: unknown[];
  /** Convenience accessor: ImageObject types */
  images?: unknown[];
}
|
|
2372
|
+
|
|
2373
|
+
/**
|
|
2374
|
+
* Schema.org / JSON-LD extraction.
|
|
2375
|
+
*
|
|
2376
|
+
* @remarks
|
|
2377
|
+
* Extracts structured data from JSON-LD script tags.
|
|
2378
|
+
*
|
|
2379
|
+
* @packageDocumentation
|
|
2380
|
+
*/
|
|
2381
|
+
|
|
2382
|
+
/**
 * Reads Schema.org metadata out of a parsed HTML document.
 *
 * @remarks
 * Locates every `<script type="application/ld+json">` tag, parses its
 * JSON-LD, and organizes the results by type for easy access.
 *
 * @param doc - The parsed HTML document
 * @returns The Schema.org metadata object
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const schema = extractSchemaOrg(doc);
 * console.log(schema.jsonLd.length);
 * console.log(schema.articles);
 * ```
 */
declare function extractSchemaOrg(doc: HTMLDocument): SchemaOrgMetadata;
|
|
2401
|
+
|
|
2402
|
+
/**
|
|
2403
|
+
* Security and privacy types.
|
|
2404
|
+
*
|
|
2405
|
+
* @remarks
|
|
2406
|
+
* Types for security and privacy-related metadata.
|
|
2407
|
+
*
|
|
2408
|
+
* @packageDocumentation
|
|
2409
|
+
*/
|
|
2410
|
+
/**
 * Metadata about security.
 *
 * @remarks
 * Holds the headers and meta tags related to security and privacy.
 */
interface SecurityMetadata {
  /** The referrer policy, which controls the Referer header */
  referrerPolicy?: string;
  /** Directives of the Content Security Policy */
  contentSecurityPolicy?: string;
  /** The X-UA-Compatible directive (IE compatibility mode) */
  xUaCompatible?: string;
  /** Format detection setting (phone numbers, dates, etc.) */
  formatDetection?: string;
}
|
|
2426
|
+
|
|
2427
|
+
/**
|
|
2428
|
+
* Security and privacy extraction.
|
|
2429
|
+
*
|
|
2430
|
+
* @remarks
|
|
2431
|
+
* Extracts security and privacy-related metadata from HTML documents.
|
|
2432
|
+
*
|
|
2433
|
+
* @packageDocumentation
|
|
2434
|
+
*/
|
|
2435
|
+
|
|
2436
|
+
/**
 * Reads security metadata out of a parsed HTML document.
 *
 * @remarks
 * Covers the meta tags related to security and privacy: referrer policy,
 * content security policy, and browser compatibility directives.
 *
 * @param doc - The parsed HTML document
 * @returns The security metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const security = extractSecurity(doc);
 * console.log(security.referrerPolicy);
 * console.log(security.contentSecurityPolicy);
 * ```
 */
declare function extractSecurity(doc: HTMLDocument): SecurityMetadata;
|
|
2455
|
+
|
|
2456
|
+
/**
|
|
2457
|
+
* SEO metadata types.
|
|
2458
|
+
*
|
|
2459
|
+
* @remarks
|
|
2460
|
+
* Standard HTML meta tags used by search engines and browsers.
|
|
2461
|
+
*
|
|
2462
|
+
* @packageDocumentation
|
|
2463
|
+
*/
|
|
2464
|
+
/**
 * Basic SEO metadata pulled from the standard HTML meta tags.
 *
 * @remarks
 * Holds metadata from the common SEO-related meta tags, including title,
 * description, keywords, and browser-specific tags.
 */
interface SEOMetadata {
  /** Title of the page, from the `<title>` tag */
  title?: string;
  /** The meta description shown in search results */
  description?: string;
  /** The keywords (legacy but still used) */
  keywords?: string[];
  /** Author of the page */
  author?: string;
  /** Generator of the site (e.g., WordPress, Hugo) */
  generator?: string;
  /** The viewport settings */
  viewport?: string;
  /** Theme color for the browser */
  themeColor?: string;
  /** Preferred color scheme (light, dark, auto) */
  colorScheme?: string;
  /** Name of the web application */
  applicationName?: string;
  /** Title of the iOS web app */
  appleMobileWebAppTitle?: string;
  /** Whether the iOS web app is capable (standalone mode) */
  appleMobileWebAppCapable?: boolean;
  /** Style of the iOS status bar */
  appleMobileWebAppStatusBarStyle?: string;
}
|
|
2497
|
+
|
|
2498
|
+
/**
|
|
2499
|
+
* SEO metadata extraction.
|
|
2500
|
+
*
|
|
2501
|
+
* @remarks
|
|
2502
|
+
* Extracts standard SEO meta tags from HTML documents.
|
|
2503
|
+
*
|
|
2504
|
+
* @packageDocumentation
|
|
2505
|
+
*/
|
|
2506
|
+
|
|
2507
|
+
/**
 * Read SEO metadata out of a parsed HTML document.
 *
 * @remarks
 * Collects the standard SEO meta tags (title, description, keywords) and
 * the browser-specific configuration tags. Every field of the result is
 * optional.
 *
 * @param doc - Parsed HTML document to inspect
 * @returns Object holding the SEO metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const { title, description } = extractSEO(doc);
 * console.log(title);       // Page title
 * console.log(description); // Meta description
 * ```
 */
declare function extractSEO(
  doc: HTMLDocument,
): SEOMetadata;
|
|
2526
|
+
|
|
2527
|
+
/**
|
|
2528
|
+
* Sitemap discovery types.
|
|
2529
|
+
*
|
|
2530
|
+
* @remarks
|
|
2531
|
+
* Types for discovering XML sitemaps and sitemap indexes.
|
|
2532
|
+
*
|
|
2533
|
+
* @packageDocumentation
|
|
2534
|
+
*/
|
|
2535
|
+
/**
 * Result of sitemap discovery.
 *
 * @remarks
 * Holds the sitemaps found in <link> tags plus suggested sitemap URLs
 * derived from common patterns.
 */
interface SitemapDiscoveryMetadata {
  /** Sitemaps that the page declares explicitly via <link rel="sitemap"> tags. */
  sitemaps: Array<string>;

  /** Sitemap URLs suggested from common patterns; these are not verified. */
  suggestions?: Array<string>;
}
|
|
2547
|
+
|
|
2548
|
+
/**
|
|
2549
|
+
* Sitemap discovery extraction.
|
|
2550
|
+
*
|
|
2551
|
+
* @remarks
|
|
2552
|
+
* Discovers XML sitemaps in HTML documents.
|
|
2553
|
+
*
|
|
2554
|
+
* @packageDocumentation
|
|
2555
|
+
*/
|
|
2556
|
+
|
|
2557
|
+
/**
 * Discover sitemaps referenced by a parsed HTML document.
 *
 * @remarks
 * Collects every sitemap declared in a <link rel="sitemap"> tag and also
 * produces suggestions that follow common sitemap URL patterns.
 *
 * @param doc - Parsed HTML document to inspect
 * @param documentUrl - Optional URL of the document, used to generate absolute sitemap suggestions
 * @returns Sitemap discovery metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const result = extractSitemapDiscovery(doc, 'https://example.com');
 * console.log(result.sitemaps);    // Discovered sitemaps
 * console.log(result.suggestions); // Suggested sitemap URLs
 * ```
 */
declare function extractSitemapDiscovery(
  doc: HTMLDocument,
  documentUrl?: string | URL,
): SitemapDiscoveryMetadata;
|
|
2577
|
+
|
|
2578
|
+
/**
|
|
2579
|
+
* Social profiles types.
|
|
2580
|
+
*
|
|
2581
|
+
* @remarks
|
|
2582
|
+
* Types for social media profile links.
|
|
2583
|
+
*
|
|
2584
|
+
* @packageDocumentation
|
|
2585
|
+
*/
|
|
2586
|
+
/**
 * Social media profiles associated with a page.
 *
 * @remarks
 * Stores profile URLs and handles for a range of platforms. Every property
 * is optional.
 */
interface SocialProfilesMetadata {
  /** Twitter/X username, without the leading @. */
  twitter?: string;

  /** URL of the Facebook profile or page. */
  facebook?: string;

  /** Instagram username or URL. */
  instagram?: string;

  /** URL of the LinkedIn profile or company page. */
  linkedin?: string;

  /** URL of the YouTube channel. */
  youtube?: string;

  /** GitHub username or organization URL. */
  github?: string;

  /** TikTok username or URL. */
  tiktok?: string;

  /** Pinterest username or URL. */
  pinterest?: string;

  /** URL of the Mastodon profile. */
  mastodon?: string;

  /** Reddit username or URL. */
  reddit?: string;

  /** Any further profiles, keyed by platform with a URL or username as value. */
  other?: { [platform: string]: string };
}
|
|
2616
|
+
|
|
2617
|
+
/**
|
|
2618
|
+
* Social profiles extraction.
|
|
2619
|
+
*
|
|
2620
|
+
* @remarks
|
|
2621
|
+
* Extracts social media profile links from HTML documents.
|
|
2622
|
+
*
|
|
2623
|
+
* @packageDocumentation
|
|
2624
|
+
*/
|
|
2625
|
+
|
|
2626
|
+
/**
 * Read social profile metadata out of a parsed HTML document.
 *
 * @remarks
 * Gathers social media profile URLs and handles from meta tags and
 * structured data.
 *
 * @param doc - Parsed HTML document to inspect
 * @returns Social profiles metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const { twitter, facebook } = extractSocialProfiles(doc);
 * console.log(twitter);
 * console.log(facebook);
 * ```
 */
declare function extractSocialProfiles(
  doc: HTMLDocument,
): SocialProfilesMetadata;
|
|
2644
|
+
|
|
2645
|
+
/**
|
|
2646
|
+
* Twitter Card metadata types.
|
|
2647
|
+
*
|
|
2648
|
+
* @remarks
|
|
2649
|
+
* Twitter-specific metadata for rich cards.
|
|
2650
|
+
*
|
|
2651
|
+
* @packageDocumentation
|
|
2652
|
+
*/
|
|
2653
|
+
/**
 * Per-platform details of a Twitter app card.
 */
interface TwitterAppPlatform {
  /** Name of the app. */
  name?: string;

  /** Identifier of the app. */
  id?: string;

  /** URL or deep link of the app. */
  url?: string;
}
|
|
2664
|
+
/**
 * Twitter app card metadata, split by target platform.
 */
interface TwitterApp {
  /** Details of the iPhone app. */
  iphone?: TwitterAppPlatform;

  /** Details of the iPad app. */
  ipad?: TwitterAppPlatform;

  /** Details of the Google Play app. */
  googleplay?: TwitterAppPlatform;
}
|
|
2675
|
+
/**
 * Metadata describing a Twitter player card.
 */
interface TwitterPlayer {
  /** URL of the player. */
  url?: string;

  /** Width of the player, in pixels. */
  width?: number;

  /** Height of the player, in pixels. */
  height?: number;

  /** URL of the stream. */
  stream?: string;
}
|
|
2688
|
+
/**
 * Twitter Card metadata extracted from meta tags.
 *
 * @remarks
 * Contains metadata for Twitter Cards used for rich social sharing on Twitter.
 * All fields are optional - only present if found in the document.
 */
interface TwitterCardMetadata {
  /**
   * Card type (summary, summary_large_image, app, player).
   *
   * @remarks
   * Any other string is accepted as well. The open-ended member is written
   * as `(string & {})` rather than plain `string`: a bare `string` would
   * absorb the literal members of the union, so editors would stop
   * suggesting the known card types.
   */
  card?: 'summary' | 'summary_large_image' | 'app' | 'player' | (string & {});
  /** Twitter username of website (with or without @ symbol) */
  site?: string;
  /** Twitter username of content creator (with or without @ symbol) */
  creator?: string;
  /** Content title (max 70 chars) */
  title?: string;
  /** Content description (max 200 chars) */
  description?: string;
  /** Image URL */
  image?: string;
  /** Image alt text */
  imageAlt?: string;
  /** App card details (if card type is 'app') */
  app?: TwitterApp;
  /** Player card details (if card type is 'player') */
  player?: TwitterPlayer;
}
|
|
2715
|
+
|
|
2716
|
+
/**
|
|
2717
|
+
* Twitter Card metadata extraction.
|
|
2718
|
+
*
|
|
2719
|
+
* @remarks
|
|
2720
|
+
* Extracts Twitter Card metadata from HTML documents.
|
|
2721
|
+
*
|
|
2722
|
+
* @packageDocumentation
|
|
2723
|
+
*/
|
|
2724
|
+
|
|
2725
|
+
/**
 * Read Twitter Card metadata out of a parsed HTML document.
 *
 * @remarks
 * Covers the card type, site/creator information, title/description,
 * images, app cards, and player cards.
 *
 * @param doc - Parsed HTML document to inspect
 * @returns Object holding the Twitter Card metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const { card, title } = extractTwitterCard(doc);
 * console.log(card);
 * console.log(title);
 * ```
 */
declare function extractTwitterCard(
  doc: HTMLDocument,
): TwitterCardMetadata;
|
|
2744
|
+
|
|
2745
|
+
/**
|
|
2746
|
+
* Verification tags types.
|
|
2747
|
+
*
|
|
2748
|
+
* @remarks
|
|
2749
|
+
* Types for domain and ownership verification tags.
|
|
2750
|
+
*
|
|
2751
|
+
* @packageDocumentation
|
|
2752
|
+
*/
|
|
2753
|
+
/**
 * Domain and ownership verification tokens found on a page.
 *
 * @remarks
 * Holds the verification tags that various platforms use to confirm
 * domain and ownership. Every property is optional.
 */
interface VerificationMetadata {
  /** Token for Google Site Verification. */
  googleSiteVerification?: string;

  /** Token for Bing/Microsoft verification. */
  msvalidate?: string;

  /** Token for Yandex verification. */
  yandexVerification?: string;

  /** Token for Facebook domain verification. */
  facebookDomainVerification?: string;

  /** Token for Pinterest domain verification. */
  pinterestVerification?: string;

  /** Token for Alexa verification. */
  alexaVerification?: string;

  /** Token for Norton Safe Web verification. */
  nortonSafeWeb?: string;

  /** Any further verification tags, keyed by platform with the token as value. */
  other?: { [platform: string]: string };
}
|
|
2777
|
+
|
|
2778
|
+
/**
|
|
2779
|
+
* Verification tags extraction.
|
|
2780
|
+
*
|
|
2781
|
+
* @remarks
|
|
2782
|
+
* Extracts domain and ownership verification tags from HTML documents.
|
|
2783
|
+
*
|
|
2784
|
+
* @packageDocumentation
|
|
2785
|
+
*/
|
|
2786
|
+
|
|
2787
|
+
/**
 * Read verification metadata out of a parsed HTML document.
 *
 * @remarks
 * Collects the verification tags that various platforms use for domain
 * and ownership verification.
 *
 * @param doc - Parsed HTML document to inspect
 * @returns Verification metadata
 *
 * @example
 * ```typescript
 * const doc = parseHTML(htmlString);
 * const { googleSiteVerification, facebookDomainVerification } = extractVerification(doc);
 * console.log(googleSiteVerification);
 * console.log(facebookDomainVerification);
 * ```
 */
declare function extractVerification(
  doc: HTMLDocument,
): VerificationMetadata;
|
|
2805
|
+
|
|
2806
|
+
/**
|
|
2807
|
+
* Enhanced fetch types for web scraping.
|
|
2808
|
+
*
|
|
2809
|
+
* @remarks
|
|
2810
|
+
* Types for pluck() - fetch-compatible enhanced HTTP client.
|
|
2811
|
+
*
|
|
2812
|
+
* @author Anonyfox <max@anonyfox.com>
|
|
2813
|
+
* @license MIT
|
|
2814
|
+
* @see {@link https://github.com/Anonyfox/ravenjs}
|
|
2815
|
+
* @see {@link https://ravenjs.dev}
|
|
2816
|
+
* @see {@link https://anonyfox.com}
|
|
2817
|
+
*
|
|
2818
|
+
* @packageDocumentation
|
|
2819
|
+
*/
|
|
2820
|
+
/**
 * Request options for pluck(): the standard RequestInit plus pluck-specific
 * settings.
 *
 * @remarks
 * All standard fetch options are supported; the properties declared here
 * are the additions intended for robust web scraping.
 */
interface PluckInit extends RequestInit {
  /**
   * How long the request may take, in milliseconds.
   *
   * @default 30000 (30 seconds)
   */
  timeout?: number;

  /**
   * Upper bound on the number of redirects that are followed.
   *
   * @default 10
   */
  maxRedirects?: number;

  /**
   * Upper bound on the response size, in bytes.
   *
   * @default 10485760 (10MB)
   */
  maxSize?: number;

  /**
   * Shortcut for the User-Agent header.
   *
   * @remarks
   * Sets the User-Agent header and takes precedence over any User-Agent
   * given in the headers object.
   */
  userAgent?: string;

  /**
   * Whether an HTTP error status (4xx, 5xx) results in a thrown error.
   *
   * @default true
   */
  throwOnHttpError?: boolean;

  /**
   * Whether the Content-Type header is validated.
   *
   * @remarks
   * When true, an error is thrown if the Content-Type is not listed in
   * allowedContentTypes.
   *
   * @default false
   */
  strictContentType?: boolean;

  /**
   * Content-Type values accepted when strictContentType is enabled.
   *
   * @default ['text/html', 'text/xml', 'application/xml', 'application/xhtml+xml', 'application/rss+xml', 'application/atom+xml', 'application/json']
   */
  allowedContentTypes?: Array<string>;

  /**
   * Whether redirects are followed automatically.
   *
   * @remarks
   * When false, the 3xx response is returned directly instead of being
   * followed.
   *
   * @default true
   */
  followRedirects?: boolean;

  /**
   * Whether the detected encoding is validated.
   *
   * @remarks
   * When true, an error is thrown if the detected encoding is invalid or
   * unsupported.
   *
   * @default true
   */
  validateEncoding?: boolean;
}
|
|
2894
|
+
/**
 * Response returned by pluck(): a standard Response plus pluck-specific
 * properties.
 *
 * @remarks
 * Adds metadata about the request on top of Response, so all standard
 * Response properties and methods remain available.
 */
interface PluckResponse extends Response {
  /** URL reached once redirects have been followed. */
  finalUrl: string;

  /** URL of the original request. */
  originalUrl: string;

  /** Redirect URLs in between; the original and final URLs are excluded. */
  redirectChain: Array<string>;

  /**
   * Character encoding that was detected.
   *
   * @example 'utf-8', 'windows-1252', 'iso-8859-1'
   */
  detectedEncoding: string;

  /** Timing information for the request. */
  timing: {
    /** Timestamp of the request start (milliseconds since epoch). */
    start: number;
    /** Timestamp of the request end (milliseconds since epoch). */
    end: number;
    /** Overall duration, in milliseconds. */
    duration: number;
    /** Time spent in redirects, in milliseconds. */
    redirectDuration?: number;
  };

  /**
   * Read the response body as UTF-8 text.
   *
   * @remarks
   * In contrast to the standard text(), the output is guaranteed to be
   * UTF-8 whatever the source encoding was; the detected encoding is used
   * to decode the body properly.
   *
   * @returns The body decoded to UTF-8 text
   */
  textUtf8(): Promise<string>;
}
|
|
2944
|
+
/**
 * Common base class of all pluck errors.
 */
declare class PluckError extends Error {
  constructor(
    message: string,
  );
}
|
|
2950
|
+
/**
 * Error for network-level failures (connection failed, DNS, etc.).
 */
declare class PluckNetworkError extends PluckError {
  constructor(
    message: string,
    cause?: Error,
  );

  readonly cause?: Error | undefined;
}
|
|
2957
|
+
/**
 * Error for a request that timed out.
 */
declare class PluckTimeoutError extends PluckError {
  constructor(
    message: string,
    timeoutMs: number,
  );

  readonly timeoutMs: number;
}
|
|
2964
|
+
/**
 * Error for HTTP error responses (4xx, 5xx status codes).
 */
declare class PluckHttpError extends PluckError {
  constructor(
    message: string,
    statusCode: number,
    statusText: string,
    response: Response,
  );

  readonly statusCode: number;
  readonly statusText: string;
  readonly response: Response;
}
|
|
2973
|
+
/**
 * Error for a response whose size exceeded the maximum.
 */
declare class PluckSizeError extends PluckError {
  constructor(
    message: string,
    maxSize: number,
    actualSize?: number,
  );

  readonly maxSize: number;
  readonly actualSize?: number | undefined;
}
|
|
2981
|
+
/**
 * Error raised when encoding detection or conversion fails.
 */
declare class PluckEncodingError extends PluckError {
  constructor(
    message: string,
    encoding?: string,
    cause?: Error,
  );

  readonly encoding?: string | undefined;
  readonly cause?: Error | undefined;
}
|
|
2989
|
+
/**
 * Error for too many redirects or a detected redirect loop.
 */
declare class PluckRedirectError extends PluckError {
  constructor(
    message: string,
    redirectChain: string[],
    maxRedirects?: number,
  );

  readonly redirectChain: string[];
  readonly maxRedirects?: number | undefined;
}
|
|
2997
|
+
/**
 * Error for a Content-Type that is invalid or not allowed.
 */
declare class PluckContentTypeError extends PluckError {
  constructor(
    message: string,
    contentType: string,
    allowedTypes?: string[],
  );

  readonly contentType: string;
  readonly allowedTypes?: string[] | undefined;
}
|
|
3005
|
+
|
|
3006
|
+
/**
|
|
3007
|
+
* Enhanced fetch for web scraping.
|
|
3008
|
+
*
|
|
3009
|
+
* @remarks
|
|
3010
|
+
* fetch-compatible HTTP client with robust handling of real-world web content.
|
|
3011
|
+
*
|
|
3012
|
+
* @author Anonyfox <max@anonyfox.com>
|
|
3013
|
+
* @license MIT
|
|
3014
|
+
* @see {@link https://github.com/Anonyfox/ravenjs}
|
|
3015
|
+
* @see {@link https://ravenjs.dev}
|
|
3016
|
+
* @see {@link https://anonyfox.com}
|
|
3017
|
+
*
|
|
3018
|
+
* @packageDocumentation
|
|
3019
|
+
*/
|
|
3020
|
+
|
|
3021
|
+
/**
 * fetch() variant tailored to web scraping.
 *
 * @remarks
 * A drop-in replacement for fetch() that adds enhanced error handling,
 * encoding detection, redirect tracking, and size limits. Suited to
 * scraping HTML, feeds, and APIs.
 *
 * What it adds:
 * - Manual redirect tracking that records the full chain
 * - Automatic encoding detection and conversion to UTF-8
 * - Configurable timeouts and size limits
 * - Smart default headers for web scraping
 * - Validation of the Content-Type
 * - A comprehensive set of error types
 *
 * @param input - URL string or Request object
 * @param init - Request options (an extension of the standard RequestInit)
 * @returns Enhanced Response carrying additional metadata
 * @throws {PluckTimeoutError} The request timed out
 * @throws {PluckNetworkError} A network or DNS error occurred
 * @throws {PluckHttpError} The HTTP status was an error (4xx, 5xx)
 * @throws {PluckRedirectError} Too many redirects, or a redirect loop
 * @throws {PluckSizeError} The response was too large
 * @throws {PluckEncodingError} The encoding was invalid
 * @throws {PluckContentTypeError} The content type was invalid
 *
 * @example
 * ```typescript
 * // Basic usage (works like fetch)
 * const response = await pluck('https://example.com');
 * const html = await response.text();
 *
 * // With enhancements
 * console.log(response.redirectChain);
 * console.log(response.detectedEncoding);
 * console.log(response.timing);
 * ```
 *
 * @example
 * ```typescript
 * // With options
 * const response = await pluck('https://example.com', {
 *   timeout: 60000,
 *   maxRedirects: 5,
 *   userAgent: 'MyBot/1.0',
 *   throwOnHttpError: true
 * });
 * ```
 */
declare function pluck(
  input: string | URL | Request,
  init?: PluckInit,
): Promise<PluckResponse>;
|
|
3071
|
+
|
|
3072
|
+
// Public API of the package: the same export specifiers as before, grouped
// into type-only exports, error classes, and the remaining value exports.
export {
  // Type-only exports
  type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon,
  type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint,
  type ContentExtractionOptions, type ContentQuality, type ContentResult,
  type CopyrightMetadata, type DiscoveredFeed, type DublinCoreMetadata,
  type ExtractedContent, type ExtractedLink, type ExtractionErrorType,
  type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata,
  type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata,
  type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata,
  type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions,
  type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata,
  type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook,
  type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile,
  type OpenGraphVideo, type PaginationMetadata, type ParseResult, type PluckInit,
  type PluckResponse, type PreloadResource, type RobotDirectives,
  type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata,
  type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata,
  type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata,
  type TwitterPlayer, type VerificationMetadata, type Website,

  // Error classes
  PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError,
  PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError,

  // Remaining value exports
  assessContentQuality, calculateReadingTime, countWords, detectFormat,
  extractAnalytics, extractAssets, extractCanonical, extractContent,
  extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo,
  extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews,
  extractOpenGraph, extractPagination, extractRobots, extractSEO,
  extractSchemaOrg, extractSecurity, extractSitemapDiscovery,
  extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle,
  gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed,
  isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck,
};
|