magpie-html 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3072 @@
1
+ /**
2
+ * Content extraction types.
3
+ *
4
+ * @remarks
5
+ * Types for article content extraction using Mozilla Readability.
6
+ *
7
+ * @packageDocumentation
8
+ */
9
+ /**
10
+ * Options for content extraction.
11
+ */
12
+ interface ContentExtractionOptions {
13
+ /**
14
+ * Base URL for resolving relative links and images.
15
+ * Highly recommended for proper link resolution.
16
+ */
17
+ baseUrl?: string;
18
+ /**
19
+ * Minimum character count for article content.
20
+ * Articles shorter than this are considered too short.
21
+ * @defaultValue `500`
22
+ */
23
+ charThreshold?: number;
24
+ /**
25
+ * Maximum number of elements to parse.
26
+ * Set to 0 for no limit.
27
+ * @defaultValue `0`
28
+ */
29
+ maxElemsToParse?: number;
30
+ /**
31
+ * Whether to preserve CSS classes in extracted HTML.
32
+ * @defaultValue `false`
33
+ */
34
+ keepClasses?: boolean;
35
+ /**
36
+ * CSS classes to preserve when keepClasses is false.
37
+ */
38
+ classesToPreserve?: string[];
39
+ /**
40
+ * Whether to skip JSON-LD parsing for metadata.
41
+ * @defaultValue `false`
42
+ */
43
+ disableJSONLD?: boolean;
44
+ /**
45
+ * Check if content is probably readerable before extraction.
46
+ * If true and content is not readerable, returns early with failure.
47
+ * @defaultValue `false`
48
+ */
49
+ checkReadability?: boolean;
50
+ /**
51
+ * Enable debug logging.
52
+ * @defaultValue `false`
53
+ */
54
+ debug?: boolean;
55
+ }
56
+ /**
57
+ * Successfully extracted content.
58
+ */
59
+ interface ExtractedContent {
60
+ /** Extraction succeeded */
61
+ success: true;
62
+ /** Article title */
63
+ title: string;
64
+ /** Cleaned HTML content */
65
+ content: string;
66
+ /** Plain text content (HTML stripped) */
67
+ textContent: string;
68
+ /** Article excerpt/summary */
69
+ excerpt: string;
70
+ /** Author byline */
71
+ byline?: string;
72
+ /** Site name */
73
+ siteName?: string;
74
+ /** Content language code (e.g., 'en', 'de') */
75
+ lang?: string;
76
+ /** Text direction */
77
+ dir?: 'ltr' | 'rtl';
78
+ /** Published time (ISO 8601 string if available) */
79
+ publishedTime?: string;
80
+ /** Character count of text content */
81
+ length: number;
82
+ /** Word count */
83
+ wordCount: number;
84
+ /** Estimated reading time in minutes */
85
+ readingTime: number;
86
+ /** Whether content passed readability check */
87
+ readerable: boolean;
88
+ /** Extraction time in milliseconds */
89
+ extractionTime: number;
90
+ }
91
+ /**
92
+ * Error types for extraction failures.
93
+ */
94
+ type ExtractionErrorType = 'NOT_READERABLE' | 'PARSE_ERROR' | 'EXTRACTION_FAILED' | 'INVALID_HTML' | 'UNKNOWN';
95
+ /**
96
+ * Failed content extraction.
97
+ */
98
+ interface ExtractionFailure {
99
+ /** Extraction failed */
100
+ success: false;
101
+ /** Error message */
102
+ error: string;
103
+ /** Categorized error type */
104
+ errorType: ExtractionErrorType;
105
+ /** Whether content passed readability check (if checked) */
106
+ readerable: boolean;
107
+ /** Extraction time in milliseconds */
108
+ extractionTime: number;
109
+ /** Original error details (if available) */
110
+ details?: unknown;
111
+ }
112
+ /**
113
+ * Result of content extraction.
114
+ *
115
+ * @remarks
116
+ * Always returns a result, never throws exceptions.
117
+ */
118
+ type ContentResult = ExtractedContent | ExtractionFailure;
119
+ /**
120
+ * Quality assessment metrics.
121
+ */
122
+ interface ContentQuality {
123
+ /** Word count */
124
+ wordCount: number;
125
+ /** Character count */
126
+ charCount: number;
127
+ /** Estimated reading time in minutes */
128
+ readingTime: number;
129
+ /** Average words per sentence */
130
+ avgWordsPerSentence: number;
131
+ /** Paragraph count */
132
+ paragraphCount: number;
133
+ /** Image count in content */
134
+ imageCount: number;
135
+ /** Link count in content */
136
+ linkCount: number;
137
+ /** Link density (ratio of link text to total text) */
138
+ linkDensity: number;
139
+ /** Overall quality score (0-100) */
140
+ qualityScore: number;
141
+ }
142
+
143
+ /**
144
+ * Main content extraction module.
145
+ *
146
+ * @remarks
147
+ * Extracts article content from HTML using Mozilla Readability.
148
+ * Never throws exceptions - always returns a ContentResult.
149
+ *
150
+ * @packageDocumentation
151
+ */
152
+
153
+ /**
154
+ * Extract article content from HTML.
155
+ *
156
+ * @remarks
157
+ * Uses Mozilla Readability to extract clean article content from a pre-parsed Document.
158
+ * This function never throws exceptions - always returns a ContentResult.
159
+ *
160
+ * Error handling:
161
+ * - Returns success: false for any extraction failure
162
+ * - Categorizes errors by type for better handling
163
+ * - Includes extraction time even for failures
164
+ *
165
+ * @param doc - Pre-parsed Document to extract content from
166
+ * @param options - Extraction options
167
+ * @returns Extraction result (success or failure)
168
+ *
169
+ * @example
170
+ * ```typescript
171
+ * import { parseHTML } from '../utils/html-parser.js';
172
+ * import { extractSEO } from '../metadata/index.js';
173
+ *
174
+ * const doc = parseHTML(html);
175
+ * const metadata = extractSEO(doc);
176
+ * const content = extractContent(doc, {
177
+ * baseUrl: 'https://example.com/article',
178
+ * charThreshold: 300,
179
+ * checkReadability: true,
180
+ * });
181
+ *
182
+ * if (content.success) {
183
+ * console.log(content.title);
184
+ * console.log(content.wordCount);
185
+ * console.log(`${content.readingTime} min read`);
186
+ * } else {
187
+ * console.error(content.error);
188
+ * }
189
+ * ```
190
+ */
191
+ declare function extractContent(doc: Document, options?: ContentExtractionOptions): ContentResult;
192
+
193
+ /**
194
+ * HTML to text conversion types.
195
+ *
196
+ * @remarks
197
+ * Types for converting HTML to plain text with the `htmlToText` function.
198
+ *
199
+ * @packageDocumentation
200
+ */
201
+ /**
202
+ * Options for HTML to plain text conversion.
203
+ */
204
+ interface HtmlToTextOptions {
205
+ /**
206
+ * How to treat the input HTML.
207
+ *
208
+ * @remarks
209
+ * - `"fragment"`: Treat as HTML fragment (default)
210
+ * - `"document"`: Treat as full document (ignores `<head>` content)
211
+ *
212
+ * @defaultValue `"fragment"`
213
+ */
214
+ mode?: 'fragment' | 'document';
215
+ /**
216
+ * How to render anchor (`<a>`) tags.
217
+ *
218
+ * @remarks
219
+ * - `"text"`: Show only the link text (default)
220
+ * - `"inline"`: Show text followed by URL in parentheses, e.g., "Click here (https://example.com)"
221
+ * - `"remove"`: Remove links entirely
222
+ *
223
+ * @defaultValue `"text"`
224
+ */
225
+ links?: 'text' | 'inline' | 'remove';
226
+ /**
227
+ * How to render image (`<img>`) tags.
228
+ *
229
+ * @remarks
230
+ * - `"alt"`: Show the alt text (default)
231
+ * - `"remove"`: Remove images entirely
232
+ *
233
+ * @defaultValue `"alt"`
234
+ */
235
+ images?: 'alt' | 'remove';
236
+ /**
237
+ * Collapse consecutive whitespace outside preserved tags.
238
+ *
239
+ * @remarks
240
+ * When `true`, multiple spaces, tabs, and line breaks are collapsed into single spaces.
241
+ * Whitespace inside preserved tags (e.g., `<pre>`, `<code>`) is always kept intact.
242
+ *
243
+ * @defaultValue `true`
244
+ */
245
+ collapseWhitespace?: boolean;
246
+ /**
247
+ * Maximum consecutive newlines allowed after compaction.
248
+ *
249
+ * @remarks
250
+ * Limits runs of newlines to this value. Set to `1` for single spacing,
251
+ * `2` for double spacing (default), or higher values as needed.
252
+ *
253
+ * @defaultValue `2`
254
+ */
255
+ maxNewlines?: number;
256
+ /**
257
+ * Optional hard-wrap column width.
258
+ *
259
+ * @remarks
260
+ * When set to a positive number, lines will be wrapped at this column width.
261
+ * Does not wrap inside preserved tags like `<pre>` or `<code>`.
262
+ * Set to `null` to disable wrapping (default).
263
+ *
264
+ * @defaultValue `null`
265
+ */
266
+ wrap?: number | null;
267
+ /**
268
+ * Separator between table cells.
269
+ *
270
+ * @remarks
271
+ * - `"tab"`: Use tab character (default)
272
+ * - `"space"`: Use space character
273
+ *
274
+ * @defaultValue `"tab"`
275
+ */
276
+ tableCellSeparator?: 'tab' | 'space';
277
+ /**
278
+ * HTML tags to exclude entirely along with their contents.
279
+ *
280
+ * @remarks
281
+ * By default excludes: `script`, `style`, `noscript`, `template`, `svg`, `canvas`
282
+ *
283
+ * @defaultValue `["script", "style", "noscript", "template", "svg", "canvas"]`
284
+ */
285
+ excludeTags?: string[];
286
+ /**
287
+ * Decode HTML entities.
288
+ *
289
+ * @remarks
290
+ * When `true`, decodes entities like `&amp;`, `&lt;`, `&#8212;`, etc.
291
+ *
292
+ * @defaultValue `true`
293
+ */
294
+ decodeEntities?: boolean;
295
+ /**
296
+ * Tags whose internal whitespace is preserved.
297
+ *
298
+ * @remarks
299
+ * These tags will not have their whitespace collapsed, allowing proper
300
+ * formatting of code blocks and preformatted text.
301
+ *
302
+ * @defaultValue `["pre", "code", "textarea"]`
303
+ */
304
+ preserveTags?: string[];
305
+ /**
306
+ * Trim leading and trailing whitespace from the result.
307
+ *
308
+ * @defaultValue `true`
309
+ */
310
+ trim?: boolean;
311
+ }
312
+
313
+ /**
314
+ * HTML to text conversion.
315
+ *
316
+ * @remarks
317
+ * Convert HTML to plain text using a zero-dependency streaming tokenizer.
318
+ * Pure, deterministic transformation suitable for logs, previews, classification,
319
+ * and search indexing. Preserves essential structure by inserting newlines at
320
+ * block boundaries, handles entities, and provides configurable options.
321
+ *
322
+ * @packageDocumentation
323
+ */
324
+
325
+ /**
326
+ * Convert an HTML string to plain text.
327
+ *
328
+ * @remarks
329
+ * This function uses a streaming tokenizer to parse HTML and extract text content.
330
+ * It handles block elements, whitespace preservation, HTML entities, tables, and more.
331
+ *
332
+ * Features:
333
+ * - Preserves document structure with appropriate line breaks
334
+ * - Handles HTML entities (numeric and common named entities)
335
+ * - Configurable link and image handling
336
+ * - Table rendering with configurable cell separators
337
+ * - Whitespace preservation for code/pre blocks
338
+ * - Optional hard-wrapping at column width
339
+ *
340
+ * @param html - HTML string (fragment or full document)
341
+ * @param options - Conversion options
342
+ * @returns Plain text string
343
+ *
344
+ * @throws {TypeError} If html is not a string
345
+ *
346
+ * @example
347
+ * ```typescript
348
+ * const html = '<div><h1>Hello</h1><p>World!</p></div>';
349
+ * const text = htmlToText(html);
350
+ * console.log(text); // "Hello\n\nWorld!"
351
+ * ```
352
+ *
353
+ * @example
354
+ * ```typescript
355
+ * const html = '<a href="https://example.com">Visit</a>';
356
+ * const text = htmlToText(html, { links: 'inline' });
357
+ * console.log(text); // "Visit (https://example.com)"
358
+ * ```
359
+ */
360
+ declare function htmlToText(html: string, options?: HtmlToTextOptions): string;
361
+
362
+ /**
363
+ * Content quality assessment.
364
+ *
365
+ * @remarks
366
+ * Analyzes extracted content to provide quality metrics.
367
+ *
368
+ * @packageDocumentation
369
+ */
370
+
371
+ /**
372
+ * Calculate word count from text.
373
+ *
374
+ * @param text - Text to count words in
375
+ * @returns Number of words
376
+ */
377
+ declare function countWords(text: string): number;
378
+ /**
379
+ * Calculate reading time in minutes.
380
+ *
381
+ * @remarks
382
+ * Uses average reading speed of 200 words per minute.
383
+ *
384
+ * @param wordCount - Number of words
385
+ * @returns Estimated reading time in minutes
386
+ */
387
+ declare function calculateReadingTime(wordCount: number): number;
388
+ /**
389
+ * Assess content quality.
390
+ *
391
+ * @remarks
392
+ * Analyzes extracted content and returns comprehensive quality metrics.
393
+ *
394
+ * @param content - Extracted content
395
+ * @returns Quality assessment
396
+ *
397
+ * @example
398
+ * ```typescript
399
+ * const content = extractContent(doc); // `doc` is a pre-parsed Document, not an HTML string
400
+ * if (content.success) {
401
+ * const quality = assessContentQuality(content);
402
+ * console.log(`Quality score: ${quality.qualityScore}/100`);
403
+ * console.log(`Reading time: ${quality.readingTime} minutes`);
404
+ * }
405
+ * ```
406
+ */
407
+ declare function assessContentQuality(content: ExtractedContent): ContentQuality;
408
+
409
+ /**
410
+ * Mozilla Readability wrapper with linkedom.
411
+ *
412
+ * @remarks
413
+ * Provides a clean interface to Mozilla Readability using linkedom as the DOM implementation.
414
+ *
415
+ * @packageDocumentation
416
+ */
417
+
418
+ /**
419
+ * Check if HTML content is probably readerable.
420
+ *
421
+ * @remarks
422
+ * Quick check to determine if content extraction is likely to succeed.
423
+ * This is a heuristic check and may produce false positives/negatives.
424
+ *
425
+ * @param doc - Pre-parsed Document to check
426
+ * @param options - Readability check options
427
+ * @returns True if content appears to be an article
428
+ *
429
+ * @example
430
+ * ```typescript
431
+ * import { parseHTML } from '../utils/html-parser.js';
432
+ *
433
+ * const doc = parseHTML(html);
434
+ * if (isProbablyReaderable(doc)) {
435
+ * const result = extractContent(doc);
436
+ * }
437
+ * ```
438
+ */
439
+ declare function isProbablyReaderable(doc: Document, options?: {
440
+ minContentLength?: number;
441
+ minScore?: number;
442
+ }): boolean;
443
+
444
+ /**
445
+ * Feed format detection utilities.
446
+ *
447
+ * @packageDocumentation
448
+ */
449
+ /**
450
+ * Feed format type.
451
+ *
452
+ * @remarks
453
+ * Represents the detected or expected format of a feed.
454
+ * - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
455
+ * - `'atom'` - Atom 1.0
456
+ * - `'json-feed'` - JSON Feed 1.0 or 1.1
457
+ * - `'unknown'` - Format could not be determined
458
+ */
459
+ type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
460
+ /**
461
+ * Detect feed format from content string.
462
+ *
463
+ * @remarks
464
+ * Analyzes the content to determine if it's RSS, Atom, or JSON Feed.
465
+ * Detection is based on root elements, namespaces, and structure.
466
+ *
467
+ * Detection priority:
468
+ * 1. JSON Feed (checks for JSON with jsonfeed.org version)
469
+ * 2. RSS (checks for `<rss>` or `<rdf:RDF>` root elements)
470
+ * 3. Atom (checks for `<feed>` root element with Atom namespace)
471
+ *
472
+ * @param content - Feed content as string
473
+ * @returns Detected format or 'unknown' if format cannot be determined
474
+ *
475
+ * @example
476
+ * ```typescript
477
+ * const format = detectFormat(feedContent);
478
+ * if (format === 'rss') {
479
+ * console.log('This is an RSS feed');
480
+ * }
481
+ * ```
482
+ */
483
+ declare function detectFormat(content: string): FeedFormat;
484
+ /**
485
+ * Check if content is a valid feed (any format).
486
+ *
487
+ * @param content - Feed content as string
488
+ * @returns `true` if content is RSS, Atom, or JSON Feed
489
+ *
490
+ * @example
491
+ * ```typescript
492
+ * if (isFeed(content)) {
493
+ * const result = parseFeed(content);
494
+ * }
495
+ * ```
496
+ */
497
+ declare function isFeed(content: string): boolean;
498
+ /**
499
+ * Check if content is RSS format.
500
+ *
501
+ * @param content - Feed content as string
502
+ * @returns `true` if content is RSS (any version)
503
+ */
504
+ declare function isRSS(content: string): boolean;
505
+ /**
506
+ * Check if content is Atom format.
507
+ *
508
+ * @param content - Feed content as string
509
+ * @returns `true` if content is Atom 1.0
510
+ */
511
+ declare function isAtom(content: string): boolean;
512
+ /**
513
+ * Check if content is JSON Feed format.
514
+ *
515
+ * @param content - Feed content as string
516
+ * @returns `true` if content is JSON Feed (1.0 or 1.1)
517
+ */
518
+ declare function isJSONFeed(content: string): boolean;
519
+
520
+ /**
521
+ * Unified feed types - normalized interface across all feed formats.
522
+ *
523
+ * @remarks
524
+ * These types provide a consistent interface for working with feeds regardless
525
+ * of the original format (RSS, Atom, or JSON Feed). All format-specific data
526
+ * is normalized to this structure by the parser.
527
+ *
528
+ * @packageDocumentation
529
+ */
530
+ /**
531
+ * Feed author information.
532
+ *
533
+ * @remarks
534
+ * Represents author/contributor information normalized across all feed formats.
535
+ * Not all formats provide all fields.
536
+ */
537
+ interface FeedAuthor {
538
+ /** Author's name */
539
+ name?: string;
540
+ /** Author's email address */
541
+ email?: string;
542
+ /** Author's website URL */
543
+ url?: string;
544
+ }
545
+ /**
546
+ * Feed enclosure (attached file).
547
+ *
548
+ * @remarks
549
+ * Represents attached files like audio, video, or documents. Commonly used
550
+ * for podcasts and media feeds.
551
+ */
552
+ interface FeedEnclosure {
553
+ /** URL of the attached file */
554
+ url: string;
555
+ /** MIME type of the file (e.g., 'audio/mpeg', 'video/mp4') */
556
+ type?: string;
557
+ /** File size in bytes */
558
+ length?: number;
559
+ }
560
+ /**
561
+ * Feed item (entry/article/post).
562
+ *
563
+ * @remarks
564
+ * Represents a single item in a feed. Items are normalized across all formats
565
+ * to provide a consistent interface. Not all fields are available in all formats.
566
+ */
567
+ interface FeedItem {
568
+ /** Unique identifier for the item (GUID, ID, or URL) */
569
+ id: string;
570
+ /** Item title */
571
+ title?: string;
572
+ /** Canonical URL for the item */
573
+ url?: string;
574
+ /** External URL for linked posts (when different from canonical URL) */
575
+ externalUrl?: string;
576
+ /** Full HTML content of the item */
577
+ contentHtml?: string;
578
+ /** Plain text content of the item */
579
+ contentText?: string;
580
+ /** Short summary or description */
581
+ summary?: string;
582
+ /** Publication date in ISO 8601 format */
583
+ published?: string;
584
+ /** Last modified date in ISO 8601 format */
585
+ modified?: string;
586
+ /** Item authors (may be empty if using feed-level authors) */
587
+ authors?: FeedAuthor[];
588
+ /** Tags, categories, or keywords */
589
+ tags?: string[];
590
+ /** Featured image URL */
591
+ image?: string;
592
+ /** Attached files (audio, video, documents) */
593
+ enclosures?: FeedEnclosure[];
594
+ }
595
+ /**
596
+ * Normalized feed data.
597
+ *
598
+ * @remarks
599
+ * The main feed object containing metadata and items. This is the recommended
600
+ * interface for working with feeds as it provides a consistent structure
601
+ * regardless of the original format.
602
+ */
603
+ interface Feed {
604
+ /** Original feed format */
605
+ format: 'rss' | 'atom' | 'json-feed';
606
+ /** Feed title (required) */
607
+ title: string;
608
+ /** Feed description or subtitle */
609
+ description?: string;
610
+ /** Feed's home page URL */
611
+ url?: string;
612
+ /** Feed's own URL (self-reference) */
613
+ feedUrl?: string;
614
+ /** Feed language code (e.g., 'en', 'de') */
615
+ language?: string;
616
+ /** Feed icon or logo URL */
617
+ image?: string;
618
+ /** Feed-level authors */
619
+ authors?: FeedAuthor[];
620
+ /** Last update date in ISO 8601 format */
621
+ updated?: string;
622
+ /** Feed items (entries/articles/posts) */
623
+ items: FeedItem[];
624
+ }
625
+ /**
626
+ * Parse result containing both normalized and original data.
627
+ *
628
+ * @remarks
629
+ * Returned by {@link parseFeed}. Contains both the normalized feed data
630
+ * (recommended for most use cases) and the original format-specific data
631
+ * (for advanced use cases requiring format-specific fields).
632
+ */
633
+ interface ParseResult {
634
+ /** Normalized feed data (recommended) */
635
+ feed: Feed;
636
+ /** Original format-specific data (advanced use) */
637
+ original: unknown;
638
+ }
639
+
640
+ /**
641
+ * Unified feed parser with automatic format detection.
642
+ *
643
+ * @packageDocumentation
644
+ */
645
+
646
+ /**
647
+ * Parse any feed format with automatic format detection.
648
+ *
649
+ * @remarks
650
+ * This is the main entry point for feed parsing. It automatically detects whether
651
+ * the content is RSS, Atom, or JSON Feed, parses it, and returns a normalized
652
+ * output structure along with the original format-specific data.
653
+ *
654
+ * All relative URLs in the feed are converted to absolute URLs if a base URL is provided.
655
+ * This is essential for feed readers that need to fetch images, enclosures, or follow links.
656
+ *
657
+ * @param content - Feed content as string (XML or JSON)
658
+ * @param baseUrl - Optional base URL for resolving relative URLs (string or URL object)
659
+ * @returns Object containing normalized feed data and original format-specific data
660
+ * @throws Error if format cannot be detected or parsing fails
661
+ *
662
+ * @example
663
+ * ```typescript
664
+ * const feedContent = await fetch('https://example.com/feed.xml').then(r => r.text());
665
+ * const result = parseFeed(feedContent, 'https://example.com/feed.xml');
666
+ *
667
+ * console.log(result.feed.title);
668
+ * console.log(result.feed.items[0].title);
669
+ * console.log(result.feed.items[0].url); // Absolute URL
670
+ * ```
671
+ */
672
+ declare function parseFeed(content: string, baseUrl?: string | URL): ParseResult;
673
+
674
+ /**
675
+ * Types for high-level gathering functionality.
676
+ *
677
+ * @packageDocumentation
678
+ */
679
+ /**
680
+ * Gathered website data.
681
+ *
682
+ * @remarks
683
+ * This interface represents the complete gathered data from a website,
684
+ * including the authoritative URL and all extracted metadata.
685
+ * It will be extended incrementally with more properties.
686
+ */
687
+ interface Website {
688
+ /**
689
+ * Authoritative URL for the page.
690
+ *
691
+ * @remarks
692
+ * Uses canonical URL if present, otherwise the final URL after redirects.
693
+ */
694
+ url: URL;
695
+ /** Discovered feed URLs (RSS, Atom, JSON Feed) as URL objects */
696
+ feeds: URL[];
697
+ /**
698
+ * Page title (cleaned, from best available source).
699
+ *
700
+ * @remarks
701
+ * Collects titles from multiple sources, cleans them, and picks the longest.
702
+ * Sources: OpenGraph, Twitter Card, HTML title tag, First H1
703
+ */
704
+ title?: string;
705
+ /**
706
+ * Page description (from best available source).
707
+ *
708
+ * @remarks
709
+ * Collects descriptions from metadata and picks the longest.
710
+ * Sources: OpenGraph, Twitter Card, HTML meta description
711
+ */
712
+ description?: string;
713
+ /**
714
+ * Page keyvisual/image URL (from best available source).
715
+ *
716
+ * @remarks
717
+ * Priority: OpenGraph > Twitter Card > Largest Apple Touch Icon > Favicon
718
+ * Returns the URL object of the best visual representation of the site.
719
+ */
720
+ image?: URL;
721
+ /**
722
+ * Best available icon/favicon for the site.
723
+ *
724
+ * @remarks
725
+ * Priority: Largest Apple Touch Icon > Safari mask icon > Favicon > Shortcut icon > MS tile > Fluid icon
726
+ * Returns the highest quality icon available, preferring modern, high-resolution formats.
727
+ */
728
+ icon?: URL;
729
+ /**
730
+ * Primary language code (ISO 639-1).
731
+ *
732
+ * @remarks
733
+ * Extracted from HTML lang attribute, content-language meta tag, or OpenGraph locale.
734
+ * Normalized to lowercase ISO 639-1 format (e.g., 'en', 'de', 'fr', 'ja').
735
+ */
736
+ language?: string;
737
+ /**
738
+ * Region code (ISO 3166-1 alpha-2).
739
+ *
740
+ * @remarks
741
+ * Only present if the language includes a region specifier.
742
+ * Normalized to uppercase ISO 3166-1 alpha-2 format (e.g., 'US', 'GB', 'DE').
743
+ */
744
+ region?: string;
745
+ /**
746
+ * Raw HTML content of the page (UTF-8).
747
+ *
748
+ * @remarks
749
+ * The complete HTML source after fetching and decoding to UTF-8.
750
+ * Useful for custom processing or caching.
751
+ */
752
+ html: string;
753
+ /**
754
+ * Plain text content extracted from the HTML.
755
+ *
756
+ * @remarks
757
+ * Automatically converted from HTML using the `htmlToText` function.
758
+ * Removes all tags, decodes entities, and preserves document structure
759
+ * with appropriate line breaks.
760
+ */
761
+ text: string;
762
+ /**
763
+ * Internal links found on the page (same domain, excluding current URL).
764
+ *
765
+ * @remarks
766
+ * All links are URL objects. The current page URL is excluded to avoid
767
+ * self-references. Useful for site crawling and navigation analysis.
768
+ */
769
+ internalLinks: URL[];
770
+ /**
771
+ * External links found on the page (different domains).
772
+ *
773
+ * @remarks
774
+ * All links are URL objects. Useful for analyzing outbound links,
775
+ * citations, and external resources.
776
+ */
777
+ externalLinks: URL[];
778
+ }
779
+ /**
780
+ * Gathered article data.
781
+ *
782
+ * @remarks
783
+ * This interface represents the complete gathered data from an article page,
784
+ * including the authoritative URL, raw HTML, and extracted content.
785
+ * It will be extended incrementally with more properties.
786
+ */
787
+ interface Article {
788
+ /**
789
+ * Authoritative URL for the article.
790
+ *
791
+ * @remarks
792
+ * Uses canonical URL if present, otherwise the final URL after redirects.
793
+ */
794
+ url: URL;
795
+ /**
796
+ * Raw HTML content of the article page (UTF-8).
797
+ *
798
+ * @remarks
799
+ * The complete HTML source after fetching and decoding to UTF-8.
800
+ * Useful for custom processing or caching.
801
+ */
802
+ html: string;
803
+ /**
804
+ * Plain text content extracted from the HTML.
805
+ *
806
+ * @remarks
807
+ * Automatically converted from HTML using the `htmlToText` function.
808
+ * Removes all tags, decodes entities, and preserves document structure
809
+ * with appropriate line breaks.
810
+ */
811
+ text: string;
812
+ /**
813
+ * Cleaned article content (plain text).
814
+ *
815
+ * @remarks
816
+ * Extracted using Mozilla Readability (cleaned HTML), then converted to
817
+ * plain text using `htmlToText` for proper formatting.
818
+ * This is the main article body without navigation, ads, or other clutter.
819
+ * Falls back to undefined if Readability extraction fails.
820
+ */
821
+ content?: string;
822
+ /**
823
+ * Article title.
824
+ *
825
+ * @remarks
826
+ * Extracted from Mozilla Readability if available.
827
+ * Falls back to metadata (Schema.org, OpenGraph, Twitter Card, HTML title)
828
+ * if Readability extraction fails or title is empty.
829
+ */
830
+ title?: string;
831
+ /**
832
+ * Article description/excerpt.
833
+ *
834
+ * @remarks
835
+ * Extracted from Mozilla Readability's excerpt if available.
836
+ * Falls back to metadata (OpenGraph, Twitter Card, HTML meta description)
837
+ * if Readability excerpt is empty or extraction fails.
838
+ */
839
+ description?: string;
840
+ /**
841
+ * Article keyvisual/image URL (from best available source).
842
+ *
843
+ * @remarks
844
+ * Priority: Schema.org NewsArticle/Article (largest) > OpenGraph > Twitter Card > Largest Apple Touch Icon > Favicon
845
+ * Returns the URL object of the best visual representation of the article.
846
+ */
847
+ image?: URL;
848
+ /**
849
+ * Primary language code (ISO 639-1).
850
+ *
851
+ * @remarks
852
+ * Extracted from HTML lang attribute, Content-Language meta, or OpenGraph locale.
853
+ * Returns lowercase 2-letter ISO 639-1 code (e.g., 'en', 'de', 'fr').
854
+ */
855
+ language?: string;
856
+ /**
857
+ * Region/country code (ISO 3166-1 alpha-2).
858
+ *
859
+ * @remarks
860
+ * Extracted from language tags like 'en-US' or 'de-DE'.
861
+ * Returns uppercase 2-letter ISO 3166-1 alpha-2 code (e.g., 'US', 'GB', 'DE').
862
+ */
863
+ region?: string;
864
+ /**
865
+ * Internal links found in the article (same domain/subdomain).
866
+ *
867
+ * @remarks
868
+ * Links pointing to pages within the same domain.
869
+ * Automatically excludes the current article URL.
870
+ * All URLs are absolute and normalized.
871
+ */
872
+ internalLinks: URL[];
873
+ /**
874
+ * External links found in the article (different domains).
875
+ *
876
+ * @remarks
877
+ * Links pointing to external domains (useful for citations, references).
878
+ * All URLs are absolute and normalized.
879
+ */
880
+ externalLinks: URL[];
881
+ /**
882
+ * Word count of the article.
883
+ *
884
+ * @remarks
885
+ * Calculated from `content` if available (Readability-cleaned content),
886
+ * otherwise calculated from `text` (full page text).
887
+ * Based on whitespace-separated word boundaries.
888
+ */
889
+ wordCount: number;
890
+ /**
891
+ * Estimated reading time in minutes.
892
+ *
893
+ * @remarks
894
+ * Calculated from word count using average reading speed of 200 words per minute.
895
+ * Minimum value is 1 minute.
896
+ */
897
+ readingTime: number;
898
+ }
899
+
900
+ /**
901
+ * High-level article gathering functionality.
902
+ *
903
+ * @packageDocumentation
904
+ */
905
+
906
+ /**
907
+ * Gather article data from a URL in one convenient call.
908
+ *
909
+ * @remarks
910
+ * This is a high-level convenience method that fetches an article page and extracts
911
+ * relevant data. It handles encoding detection, redirects, and provides
912
+ * a unified interface for all article data.
913
+ *
914
+ * This method will be extended incrementally to include metadata extraction,
915
+ * content extraction, and more.
916
+ *
917
+ * @param url - Article URL as string or URL object
918
+ * @returns Gathered article data including URL, content, metadata, language, and links
919
+ * @throws Error if URL is invalid or fetch fails
920
+ *
921
+ * @example
922
+ * ```typescript
923
+ * // Fetch an article and get its data
924
+ * const article = await gatherArticle('https://example.com/article');
925
+ * console.log(article.url); // Canonical URL if present, otherwise final URL after redirects
926
+ * console.log(article.html); // Raw HTML content (UTF-8)
927
+ * console.log(article.text); // Plain text (full page HTML converted)
928
+ * console.log(article.content); // Cleaned article content (Readability + htmlToText)
929
+ * console.log(article.title); // Article title (from Readability or metadata)
930
+ * console.log(article.description); // Article excerpt or description
931
+ * console.log(article.image); // Article keyvisual/image (from best source)
932
+ * console.log(article.language); // Language code (ISO 639-1, e.g., 'en')
933
+ * console.log(article.region); // Region code (ISO 3166-1 alpha-2, e.g., 'US')
934
+ * console.log(article.internalLinks); // Array of internal link URLs
935
+ * console.log(article.externalLinks); // Array of external link URLs
936
+ * console.log(article.wordCount); // Word count (from content or text)
937
+ * console.log(article.readingTime); // Estimated reading time in minutes
938
+ * ```
939
+ */
940
+ declare function gatherArticle(url: string | URL): Promise<Article>;
941
+
942
+ /**
943
+ * High-level feed gathering functionality.
944
+ *
945
+ * @packageDocumentation
946
+ */
947
+
948
+ /**
949
+ * Gather and parse a feed from a URL in one convenient call.
950
+ *
951
+ * @remarks
952
+ * This is a high-level convenience method that combines fetching and parsing.
953
+ * It handles encoding detection, redirects, and feed format detection automatically.
954
+ *
955
+ * @param url - Feed URL as string or URL object
956
+ * @returns Normalized feed data
957
+ * @throws Error if URL is invalid, fetch fails, or feed cannot be parsed
958
+ *
959
+ * @example
960
+ * ```typescript
961
+ * // Fetch and parse a feed
962
+ * const feed = await gatherFeed('https://example.com/feed.xml');
963
+ *
964
+ * console.log(feed.title);
965
+ * console.log(feed.items[0].title);
966
+ * console.log(feed.items[0].url);
967
+ * ```
968
+ */
969
+ declare function gatherFeed(url: string | URL): Promise<Feed>;
970
+
971
+ /**
972
+ * High-level website gathering functionality.
973
+ *
974
+ * @packageDocumentation
975
+ */
976
+
977
+ /**
978
+ * Gather website data from a URL in one convenient call.
979
+ *
980
+ * @remarks
981
+ * This is a high-level convenience method that fetches a website and extracts
982
+ * all relevant data. It handles encoding detection, redirects, and provides
983
+ * a unified interface for all website data.
984
+ *
985
+ * This method will be extended incrementally to include metadata extraction,
986
+ * content extraction, and more.
987
+ *
988
+ * @param url - Website URL as string or URL object
989
+ * @returns Gathered website data including final URL, title, description, image, icon, language, html, text, feeds, and links
990
+ * @throws Error if URL is invalid or fetch fails
991
+ *
992
+ * @example
993
+ * ```typescript
994
+ * // Fetch a website and get its data
995
+ * const site = await gatherWebsite('https://example.com');
996
+ * console.log(site.url); // Final URL after redirects
997
+ * console.log(site.title); // Page title (cleaned, from best source)
998
+ * console.log(site.description); // Page description (from best source)
999
+ * console.log(site.image); // Page image/keyvisual (from best source)
1000
+ * console.log(site.icon); // Best available icon/favicon
1001
+ * console.log(site.language); // Primary language code (ISO 639-1)
1002
+ * console.log(site.region); // Region code (ISO 3166-1 alpha-2)
1003
+ * console.log(site.html); // Raw HTML content (UTF-8)
1004
+ * console.log(site.text); // Plain text content (extracted from HTML)
1005
+ * console.log(site.feeds); // Array of feed URL objects
1006
+ * console.log(site.internalLinks); // Array of internal link URL objects
1007
+ * console.log(site.externalLinks); // Array of external link URL objects
1008
+ * ```
1009
+ */
1010
+ declare function gatherWebsite(url: string | URL): Promise<Website>;
1011
+
1012
+ /**
1013
+ * HTML parsing utilities using linkedom.
1014
+ *
1015
+ * @remarks
1016
+ * This module provides a simple wrapper around linkedom for consistent
1017
+ * HTML parsing across all metadata extraction modules. Parsing should happen
1018
+ * once at the top level and the parsed document passed to all extractors.
1019
+ *
1020
+ * @packageDocumentation
1021
+ */
1022
+ /**
1023
+ * Parse HTML string into a DOM document.
1024
+ *
1025
+ * @remarks
1026
+ * Parses HTML using linkedom, providing a standards-compliant DOM implementation.
1027
+ * This should be called once per document, with the result passed to all metadata
1028
+ * extractors for performance.
1029
+ *
1030
+ * Never throws - returns a document even for malformed HTML.
1031
+ *
1032
+ * @param html - HTML string to parse
1033
+ * @param baseUrl - Optional base URL for resolving relative URLs
1034
+ * @returns Parsed DOM document
1035
+ *
1036
+ * @example
1037
+ * ```typescript
1038
+ * const doc = parseHTML('<html><head><title>Test</title></head></html>');
1039
+ * const title = doc.querySelector('title')?.textContent;
1040
+ * ```
1041
+ */
1042
+ declare function parseHTML(html: string, baseUrl?: string): Document;
1043
+ type HTMLDocument = Document;
1044
+
1045
+ /**
1046
+ * Analytics and tracking types.
1047
+ *
1048
+ * @remarks
1049
+ * Types for analytics service detection (IDs only, no tracking).
1050
+ *
1051
+ * @packageDocumentation
1052
+ */
1053
+ /**
1054
+ * Analytics metadata.
1055
+ *
1056
+ * @remarks
1057
+ * Contains detected analytics service IDs. Privacy-conscious - only extracts IDs,
1058
+ * doesn't perform any tracking.
1059
+ */
1060
+ interface AnalyticsMetadata {
1061
+ /** Google Analytics tracking IDs (UA-, G-, GT- prefixes) */
1062
+ googleAnalytics?: string[];
1063
+ /** Google Tag Manager container IDs */
1064
+ googleTagManager?: string[];
1065
+ /** Facebook Pixel IDs */
1066
+ facebookPixel?: string[];
1067
+ /** Matomo/Piwik site IDs */
1068
+ matomo?: string[];
1069
+ /** Plausible Analytics domains */
1070
+ plausible?: string[];
1071
+ /** Adobe Analytics (Omniture) IDs */
1072
+ adobe?: string[];
1073
+ /** Cloudflare Web Analytics tokens */
1074
+ cloudflare?: string[];
1075
+ /** Fathom Analytics site IDs */
1076
+ fathom?: string[];
1077
+ }
1078
+
1079
+ /**
1080
+ * Analytics and tracking extraction.
1081
+ *
1082
+ * @remarks
1083
+ * Detects analytics service IDs from HTML documents.
1084
+ * Privacy-conscious - only extracts IDs, doesn't perform any tracking.
1085
+ *
1086
+ * @packageDocumentation
1087
+ */
1088
+
1089
+ /**
1090
+ * Extract analytics metadata from parsed HTML document.
1091
+ *
1092
+ * @remarks
1093
+ * Detects analytics service IDs by examining script tags and their content.
1094
+ * Only extracts identifiers, does not track or collect user data.
1095
+ *
1096
+ * @param doc - Parsed HTML document
1097
+ * @returns Analytics metadata
1098
+ *
1099
+ * @example
1100
+ * ```typescript
1101
+ * const doc = parseHTML(htmlString);
1102
+ * const analytics = extractAnalytics(doc);
1103
+ * console.log(analytics.googleAnalytics);
1104
+ * console.log(analytics.googleTagManager);
1105
+ * ```
1106
+ */
1107
+ declare function extractAnalytics(doc: HTMLDocument): AnalyticsMetadata;
1108
+
1109
+ /**
1110
+ * Assets extraction types.
1111
+ *
1112
+ * @remarks
1113
+ * Types for categorized asset URLs extracted from HTML documents.
1114
+ *
1115
+ * @author Anonyfox <max@anonyfox.com>
1116
+ * @license MIT
1117
+ * @see {@link https://github.com/Anonyfox/ravenjs}
1118
+ * @see {@link https://ravenjs.dev}
1119
+ * @see {@link https://anonyfox.com}
1120
+ *
1121
+ * @packageDocumentation
1122
+ */
1123
+ /**
1124
+ * Categorized assets extracted from HTML.
1125
+ *
1126
+ * @remarks
1127
+ * Contains all external assets referenced in the document, organized by type.
1128
+ * All URLs are normalized to absolute format if a base URL is available.
1129
+ */
1130
+ interface AssetsMetadata {
1131
+ /** Image URLs from `<img>`, `<picture>`, `srcset`, and OpenGraph meta tags */
1132
+ images?: string[];
1133
+ /** Stylesheet URLs from `<link rel="stylesheet">` tags */
1134
+ stylesheets?: string[];
1135
+ /** Script URLs from `<script src>` tags */
1136
+ scripts?: string[];
1137
+ /** Font URLs extracted from CSS (`@font-face` and `url()` with font extensions) */
1138
+ fonts?: string[];
1139
+ /** Media URLs from `<video>`, `<audio>`, `<source>`, and `<track>` elements */
1140
+ media?: string[];
1141
+ /** Web app manifest URLs from `<link rel="manifest">` */
1142
+ manifests?: string[];
1143
+ /** Resource hints from `<link rel="preload">` and `<link rel="prefetch">` */
1144
+ preloads?: PreloadResource[];
1145
+ /** Connection hints from `<link rel="dns-prefetch">` and `<link rel="preconnect">` */
1146
+ connectionHints?: ConnectionHint[];
1147
+ }
1148
+ /**
1149
+ * Preload or prefetch resource hint.
1150
+ */
1151
+ interface PreloadResource {
1152
+ /** Resource URL */
1153
+ url: string;
1154
+ /** Resource type (script, style, font, image, etc.) */
1155
+ as?: string;
1156
+ /** MIME type */
1157
+ type?: string;
1158
+ /** Crossorigin attribute */
1159
+ crossorigin?: string;
1160
+ /** Whether this is a prefetch (true) or preload (false) */
1161
+ prefetch?: boolean;
1162
+ }
1163
+ /**
1164
+ * DNS prefetch or preconnect hint.
1165
+ */
1166
+ interface ConnectionHint {
1167
+ /** Domain URL */
1168
+ url: string;
1169
+ /** Whether this is a preconnect (true) or dns-prefetch (false) */
1170
+ preconnect?: boolean;
1171
+ /** Crossorigin attribute */
1172
+ crossorigin?: string;
1173
+ }
1174
+
1175
+ /**
1176
+ * Assets extraction.
1177
+ *
1178
+ * @remarks
1179
+ * Extracts categorized asset URLs from HTML documents.
1180
+ *
1181
+ * @author Anonyfox <max@anonyfox.com>
1182
+ * @license MIT
1183
+ * @see {@link https://github.com/Anonyfox/ravenjs}
1184
+ * @see {@link https://ravenjs.dev}
1185
+ * @see {@link https://anonyfox.com}
1186
+ *
1187
+ * @packageDocumentation
1188
+ */
1189
+
1190
+ /**
1191
+ * Extract assets metadata from parsed HTML document.
1192
+ *
1193
+ * @remarks
1194
+ * Extracts all external assets referenced in the document, organized by type.
1195
+ * All URLs are normalized to absolute format based on the document's base URL.
1196
+ *
1197
+ * The extractor finds assets from:
1198
+ * - Images: `<img>`, `<picture>`, `srcset`, OpenGraph meta tags
1199
+ * - Stylesheets: `<link rel="stylesheet">`
1200
+ * - Scripts: `<script src>`
1201
+ * - Fonts: CSS `@font-face` and `url()` with font extensions
1202
+ * - Media: `<video>`, `<audio>`, `<source>`, `<track>`
1203
+ * - Manifests: `<link rel="manifest">`
1204
+ * - Preloads: `<link rel="preload">` and `<link rel="prefetch">`
1205
+ * - Connection hints: `<link rel="dns-prefetch">` and `<link rel="preconnect">`
1206
+ *
1207
+ * @param doc - Parsed HTML document
1208
+ * @param baseUrl - Optional base URL for resolving relative URLs
1209
+ * @returns Assets metadata object with categorized URLs
1210
+ *
1211
+ * @example
1212
+ * ```typescript
1213
+ * const doc = parseHTML(htmlString);
1214
+ * const assets = extractAssets(doc, 'https://example.com');
1215
+ * console.log(assets.images);
1216
+ * console.log(assets.stylesheets);
1217
+ * console.log(assets.scripts);
1218
+ * ```
1219
+ */
1220
+ declare function extractAssets(doc: HTMLDocument, baseUrl?: string | URL | null): AssetsMetadata;
1221
+
1222
+ /**
1223
+ * Canonical and alternate URL metadata types.
1224
+ *
1225
+ * @remarks
1226
+ * URL relationships, internationalization, and special versions.
1227
+ *
1228
+ * @packageDocumentation
1229
+ */
1230
+ /**
1231
+ * Alternate URL relationship.
1232
+ */
1233
+ interface AlternateLink {
1234
+ /** URL of the alternate version */
1235
+ href: string;
1236
+ /** Language/locale code (hreflang) */
1237
+ hreflang?: string;
1238
+ /** MIME type */
1239
+ type?: string;
1240
+ /** Title/description */
1241
+ title?: string;
1242
+ }
1243
+ /**
1244
+ * App link metadata for deep linking.
1245
+ */
1246
+ interface AppLinks {
1247
+ /** iOS app URL */
1248
+ ios?: string;
1249
+ /** Android app URL */
1250
+ android?: string;
1251
+ /** Web fallback URL */
1252
+ web?: string;
1253
+ }
1254
+ /**
1255
+ * Canonical and alternate URL metadata.
1256
+ *
1257
+ * @remarks
1258
+ * Contains canonical URLs, language alternates, special versions (AMP),
1259
+ * and app linking metadata.
1260
+ */
1261
+ interface CanonicalMetadata {
1262
+ /** Canonical URL for this page */
1263
+ canonical?: string;
1264
+ /** Language/region alternates */
1265
+ alternates?: AlternateLink[];
1266
+ /** AMP (Accelerated Mobile Pages) version URL */
1267
+ amphtml?: string;
1268
+ /** Web app manifest URL */
1269
+ manifest?: string;
1270
+ /** App deep linking URLs */
1271
+ appLinks?: AppLinks;
1272
+ }
1273
+
1274
+ /**
1275
+ * Canonical and alternate URL extraction.
1276
+ *
1277
+ * @remarks
1278
+ * Extracts canonical URLs, alternates, and special versions from HTML documents.
1279
+ *
1280
+ * @packageDocumentation
1281
+ */
1282
+
1283
+ /**
1284
+ * Extract canonical and alternate URL metadata from parsed HTML document.
1285
+ *
1286
+ * @remarks
1287
+ * Extracts canonical URLs, language alternates, AMP versions, manifests,
1288
+ * and app linking metadata.
1289
+ *
1290
+ * @param doc - Parsed HTML document
1291
+ * @returns Canonical metadata object
1292
+ *
1293
+ * @example
1294
+ * ```typescript
1295
+ * const doc = parseHTML(htmlString);
1296
+ * const canonical = extractCanonical(doc);
1297
+ * console.log(canonical.canonical);
1298
+ * console.log(canonical.alternates);
1299
+ * ```
1300
+ */
1301
+ declare function extractCanonical(doc: HTMLDocument): CanonicalMetadata;
1302
+
1303
+ /**
1304
+ * Copyright and licensing types.
1305
+ *
1306
+ * @remarks
1307
+ * Types for copyright and content licensing information.
1308
+ *
1309
+ * @packageDocumentation
1310
+ */
1311
+ /**
1312
+ * Copyright and licensing metadata.
1313
+ *
1314
+ * @remarks
1315
+ * Contains copyright and license information from various sources.
1316
+ */
1317
+ interface CopyrightMetadata {
1318
+ /** Copyright notice */
1319
+ copyright?: string;
1320
+ /** License URL or identifier */
1321
+ license?: string;
1322
+ /** Copyright holder/owner */
1323
+ holder?: string;
1324
+ /** Copyright year */
1325
+ year?: string;
1326
+ }
1327
+
1328
+ /**
1329
+ * Copyright and licensing extraction.
1330
+ *
1331
+ * @remarks
1332
+ * Extracts copyright and license metadata from HTML documents.
1333
+ *
1334
+ * @packageDocumentation
1335
+ */
1336
+
1337
+ /**
1338
+ * Extract copyright metadata from parsed HTML document.
1339
+ *
1340
+ * @remarks
1341
+ * Extracts copyright and licensing information from meta tags, link tags,
1342
+ * and Schema.org structured data.
1343
+ *
1344
+ * @param doc - Parsed HTML document
1345
+ * @returns Copyright metadata
1346
+ *
1347
+ * @example
1348
+ * ```typescript
1349
+ * const doc = parseHTML(htmlString);
1350
+ * const copyright = extractCopyright(doc);
1351
+ * console.log(copyright.copyright);
1352
+ * console.log(copyright.license);
1353
+ * ```
1354
+ */
1355
+ declare function extractCopyright(doc: HTMLDocument): CopyrightMetadata;
1356
+
1357
+ /**
1358
+ * Dublin Core metadata types.
1359
+ *
1360
+ * @remarks
1361
+ * Library and academic metadata standard.
1362
+ *
1363
+ * @packageDocumentation
1364
+ */
1365
+ /**
1366
+ * Dublin Core metadata extracted from meta tags.
1367
+ *
1368
+ * @remarks
1369
+ * Contains metadata using the Dublin Core standard, commonly used in
1370
+ * academic and library contexts. Supports both the `DC.` and `dcterms.` prefixes.
1371
+ */
1372
+ interface DublinCoreMetadata {
1373
+ /** Resource title */
1374
+ title?: string;
1375
+ /** Entities responsible for making the resource (authors, creators) */
1376
+ creator?: string[];
1377
+ /** Topics or subjects of the resource */
1378
+ subject?: string[];
1379
+ /** Description of the resource */
1380
+ description?: string;
1381
+ /** Entity responsible for making the resource available */
1382
+ publisher?: string;
1383
+ /** Entities responsible for contributions to the resource */
1384
+ contributor?: string[];
1385
+ /** Date of resource creation/publication */
1386
+ date?: string;
1387
+ /** Nature or genre of the resource */
1388
+ type?: string;
1389
+ /** File format, physical medium, or dimensions */
1390
+ format?: string;
1391
+ /** Unambiguous reference to the resource */
1392
+ identifier?: string;
1393
+ /** Related resource from which the described resource is derived */
1394
+ source?: string;
1395
+ /** Language of the resource */
1396
+ language?: string;
1397
+ /** Related resource */
1398
+ relation?: string;
1399
+ /** Spatial or temporal topic, location, or period */
1400
+ coverage?: string;
1401
+ /** Information about rights held in and over the resource */
1402
+ rights?: string;
1403
+ }
1404
+
1405
+ /**
1406
+ * Dublin Core metadata extraction.
1407
+ *
1408
+ * @remarks
1409
+ * Extracts Dublin Core metadata from HTML documents.
1410
+ *
1411
+ * @packageDocumentation
1412
+ */
1413
+
1414
+ /**
1415
+ * Extract Dublin Core metadata from parsed HTML document.
1416
+ *
1417
+ * @remarks
1418
+ * Extracts Dublin Core metadata using both the `DC.` and `dcterms.` prefixes.
1419
+ * Fields that can have multiple values (`creator`, `subject`, `contributor`)
1420
+ * are extracted as arrays.
1421
+ *
1422
+ * @param doc - Parsed HTML document
1423
+ * @returns Dublin Core metadata object
1424
+ *
1425
+ * @example
1426
+ * ```typescript
1427
+ * const doc = parseHTML(htmlString);
1428
+ * const dc = extractDublinCore(doc);
1429
+ * console.log(dc.title);
1430
+ * console.log(dc.creator);
1431
+ * ```
1432
+ */
1433
+ declare function extractDublinCore(doc: HTMLDocument): DublinCoreMetadata;
1434
+
1435
+ /**
1436
+ * Feed discovery types.
1437
+ *
1438
+ * @remarks
1439
+ * Types for discovering RSS, Atom, and JSON feeds.
1440
+ *
1441
+ * @packageDocumentation
1442
+ */
1443
+ /**
1444
+ * Discovered feed information.
1445
+ */
1446
+ interface DiscoveredFeed {
1447
+ /** Feed URL */
1448
+ url: string;
1449
+ /** Feed type */
1450
+ type: 'rss' | 'atom' | 'json' | 'unknown';
1451
+ /** Feed title (if provided in link tag) */
1452
+ title?: string;
1453
+ }
1454
+ /**
1455
+ * Feed discovery metadata.
1456
+ *
1457
+ * @remarks
1458
+ * Contains all discovered feeds and suggested feed URLs based on common patterns.
1459
+ */
1460
+ interface FeedDiscoveryMetadata {
1461
+ /** Feeds explicitly declared in `<link>` tags */
1462
+ feeds: DiscoveredFeed[];
1463
+ /** Suggested feed URLs based on common patterns (not verified) */
1464
+ suggestions?: string[];
1465
+ }
1466
+
1467
+ /**
1468
+ * Feed discovery extraction.
1469
+ *
1470
+ * @remarks
1471
+ * Discovers RSS, Atom, and JSON feeds in HTML documents.
1472
+ *
1473
+ * @packageDocumentation
1474
+ */
1475
+
1476
+ /**
1477
+ * Extract feed discovery metadata from parsed HTML document.
1478
+ *
1479
+ * @remarks
1480
+ * Finds all feeds declared in `<link rel="alternate">` tags and generates
1481
+ * suggestions for common feed URL patterns.
1482
+ *
1483
+ * @param doc - Parsed HTML document
1484
+ * @param documentUrl - Optional document URL for generating absolute feed suggestions
1485
+ * @returns Feed discovery metadata
1486
+ *
1487
+ * @example
1488
+ * ```typescript
1489
+ * const doc = parseHTML(htmlString);
1490
+ * const feeds = extractFeedDiscovery(doc, 'https://example.com');
1491
+ * console.log(feeds.feeds); // Discovered feeds
1492
+ * console.log(feeds.suggestions); // Suggested feed URLs
1493
+ * ```
1494
+ */
1495
+ declare function extractFeedDiscovery(doc: HTMLDocument, documentUrl?: string | URL): FeedDiscoveryMetadata;
1496
+
1497
+ /**
1498
+ * Geographic location types.
1499
+ *
1500
+ * @remarks
1501
+ * Types for geographic location metadata.
1502
+ *
1503
+ * @packageDocumentation
1504
+ */
1505
+ /**
1506
+ * Geographic coordinates.
1507
+ *
1508
+ * @remarks
1509
+ * Latitude and longitude coordinates.
1510
+ */
1511
+ interface GeoPosition {
1512
+ /** Latitude in decimal degrees */
1513
+ latitude: number;
1514
+ /** Longitude in decimal degrees */
1515
+ longitude: number;
1516
+ }
1517
+ /**
1518
+ * Geographic metadata.
1519
+ *
1520
+ * @remarks
1521
+ * Contains geographic location information from meta tags.
1522
+ */
1523
+ interface GeoMetadata {
1524
+ /** Geographic position (latitude/longitude) */
1525
+ position?: GeoPosition;
1526
+ /** Place name */
1527
+ placename?: string;
1528
+ /** Region code (e.g., US-CA for California, USA) */
1529
+ region?: string;
1530
+ /** Country name or code */
1531
+ country?: string;
1532
+ }
1533
+
1534
+ /**
1535
+ * Geographic location extraction.
1536
+ *
1537
+ * @remarks
1538
+ * Extracts geographic location metadata from HTML documents.
1539
+ *
1540
+ * @packageDocumentation
1541
+ */
1542
+
1543
+ /**
1544
+ * Extract geographic metadata from parsed HTML document.
1545
+ *
1546
+ * @remarks
1547
+ * Extracts geographic location information including coordinates,
1548
+ * place names, and region codes from meta tags.
1549
+ *
1550
+ * @param doc - Parsed HTML document
1551
+ * @returns Geographic metadata
1552
+ *
1553
+ * @example
1554
+ * ```typescript
1555
+ * const doc = parseHTML(htmlString);
1556
+ * const geo = extractGeo(doc);
1557
+ * console.log(geo.position?.latitude);
1558
+ * console.log(geo.placename);
1559
+ * ```
1560
+ */
1561
+ declare function extractGeo(doc: HTMLDocument): GeoMetadata;
1562
+
1563
+ /**
1564
+ * Icons and visual assets types.
1565
+ *
1566
+ * @remarks
1567
+ * Types for favicons, app icons, and visual branding.
1568
+ *
1569
+ * @packageDocumentation
1570
+ */
1571
+ /**
1572
+ * Apple touch icon metadata.
1573
+ */
1574
+ interface AppleTouchIcon {
1575
+ /** Icon URL */
1576
+ url: string;
1577
+ /** Icon size (e.g., "180x180") */
1578
+ sizes?: string;
1579
+ /** Whether it's precomposed (no effects applied) */
1580
+ precomposed?: boolean;
1581
+ }
1582
+ /**
1583
+ * Safari mask icon metadata.
1584
+ */
1585
+ interface MaskIcon {
1586
+ /** SVG icon URL */
1587
+ url: string;
1588
+ /** Icon color */
1589
+ color?: string;
1590
+ }
1591
+ /**
1592
+ * Microsoft tile metadata.
1593
+ */
1594
+ interface MSTile {
1595
+ /** Tile image URL */
1596
+ image?: string;
1597
+ /** Tile background color */
1598
+ color?: string;
1599
+ /** Microsoft browserconfig XML URL */
1600
+ config?: string;
1601
+ }
1602
+ /**
1603
+ * Icons and visual assets metadata.
1604
+ *
1605
+ * @remarks
1606
+ * Contains all icon-related metadata including favicons, app icons,
1607
+ * and platform-specific icons.
1608
+ */
1609
+ interface IconsMetadata {
1610
+ /** Standard favicon */
1611
+ favicon?: string;
1612
+ /** Shortcut icon (legacy) */
1613
+ shortcutIcon?: string;
1614
+ /** Apple touch icons for iOS */
1615
+ appleTouchIcons?: AppleTouchIcon[];
1616
+ /** Safari pinned tab icon */
1617
+ maskIcon?: MaskIcon;
1618
+ /** Microsoft tile configuration */
1619
+ msTile?: MSTile;
1620
+ /** Fluid icon (legacy) */
1621
+ fluidIcon?: string;
1622
+ }
1623
+
1624
+ /**
1625
+ * Icons and visual assets extraction.
1626
+ *
1627
+ * @remarks
1628
+ * Extracts icon metadata from HTML documents.
1629
+ *
1630
+ * @packageDocumentation
1631
+ */
1632
+
1633
+ /**
1634
+ * Extract icons metadata from parsed HTML document.
1635
+ *
1636
+ * @remarks
1637
+ * Extracts all icon-related metadata including favicons, Apple touch icons,
1638
+ * Safari mask icons, and Microsoft tile configuration.
1639
+ *
1640
+ * @param doc - Parsed HTML document
1641
+ * @returns Icons metadata
1642
+ *
1643
+ * @example
1644
+ * ```typescript
1645
+ * const doc = parseHTML(htmlString);
1646
+ * const icons = extractIcons(doc);
1647
+ * console.log(icons.favicon);
1648
+ * console.log(icons.appleTouchIcons);
1649
+ * ```
1650
+ */
1651
+ declare function extractIcons(doc: HTMLDocument): IconsMetadata;
1652
+
1653
+ /**
1654
+ * Language and localization types.
1655
+ *
1656
+ * @remarks
1657
+ * Types for language and locale metadata.
1658
+ *
1659
+ * @packageDocumentation
1660
+ */
1661
+ /**
1662
+ * Language and localization metadata.
1663
+ *
1664
+ * @remarks
1665
+ * Contains language and locale information from various sources including
1666
+ * the HTML `lang` attribute, meta tags, and OpenGraph locale.
1667
+ */
1668
+ interface LanguageMetadata {
1669
+ /** `lang` attribute from the `<html>` tag */
1670
+ htmlLang?: string;
1671
+ /** `Content-Language` meta tag */
1672
+ contentLanguage?: string;
1673
+ /** OpenGraph locale */
1674
+ ogLocale?: string;
1675
+ /** OpenGraph alternate locales */
1676
+ alternateLocales?: string[];
1677
+ /** Primary language (best guess, normalized to ISO 639-1) */
1678
+ primary?: string;
1679
+ /** Region code (if available, ISO 3166-1 alpha-2) */
1680
+ region?: string;
1681
+ }
1682
+
1683
+ /**
1684
+ * Language and localization extraction.
1685
+ *
1686
+ * @remarks
1687
+ * Extracts language and locale metadata from HTML documents.
1688
+ *
1689
+ * @packageDocumentation
1690
+ */
1691
+
1692
+ /**
1693
+ * Extract language and localization metadata from parsed HTML document.
1694
+ *
1695
+ * @remarks
1696
+ * Extracts language information from HTML lang attribute, meta tags,
1697
+ * and OpenGraph locale. Normalizes to provide a primary language and region.
1698
+ *
1699
+ * @param doc - Parsed HTML document
1700
+ * @returns Language metadata
1701
+ *
1702
+ * @example
1703
+ * ```typescript
1704
+ * const doc = parseHTML(htmlString);
1705
+ * const lang = extractLanguage(doc);
1706
+ * console.log(lang.primary); // 'en'
1707
+ * console.log(lang.region); // 'US'
1708
+ * ```
1709
+ */
1710
+ declare function extractLanguage(doc: HTMLDocument): LanguageMetadata;
1711
+
1712
+ /**
1713
+ * Links extraction types.
1714
+ *
1715
+ * @remarks
1716
+ * Types for navigational link extraction and analysis.
1717
+ *
1718
+ * @author Anonyfox <max@anonyfox.com>
1719
+ * @license MIT
1720
+ * @see {@link https://github.com/Anonyfox/ravenjs}
1721
+ * @see {@link https://ravenjs.dev}
1722
+ * @see {@link https://anonyfox.com}
1723
+ *
1724
+ * @packageDocumentation
1725
+ */
1726
+ /**
1727
+ * Extracted link with metadata.
1728
+ *
1729
+ * @remarks
1730
+ * Represents a single hyperlink with all relevant attributes.
1731
+ * URLs are normalized to absolute format if a base URL is available.
1732
+ */
1733
+ interface ExtractedLink {
1734
+ /** URL of the link (absolute if a base URL is available) */
1735
+ url: string;
1736
+ /** Anchor text (visible text content) */
1737
+ text?: string;
1738
+ /** `title` attribute */
1739
+ title?: string;
1740
+ /** `rel` attribute value */
1741
+ rel?: string;
1742
+ /** `target` attribute (`_blank`, `_self`, etc.) */
1743
+ target?: string;
1744
+ /** Whether this is an internal link (same origin) */
1745
+ internal?: boolean;
1746
+ /** Whether this is an external link (different origin) */
1747
+ external?: boolean;
1748
+ /** Whether link has `nofollow` rel */
1749
+ nofollow?: boolean;
1750
+ /** Whether link has `ugc` (User Generated Content) rel */
1751
+ ugc?: boolean;
1752
+ /** Whether link has `sponsored` rel */
1753
+ sponsored?: boolean;
1754
+ /** Whether link has `noopener` rel */
1755
+ noopener?: boolean;
1756
+ /** Whether link has `noreferrer` rel */
1757
+ noreferrer?: boolean;
1758
+ }
1759
+ /**
1760
+ * Links extraction options.
1761
+ */
1762
+ interface LinksExtractionOptions {
1763
+ /**
1764
+ * Filter links by scope.
1765
+ *
1766
+ * @remarks
1767
+ * - `'all'` - Return all links (default)
1768
+ * - `'internal'` - Only links to same origin
1769
+ * - `'external'` - Only links to different origins
1770
+ */
1771
+ scope?: 'all' | 'internal' | 'external';
1772
+ /**
1773
+ * Filter out links with specific rel attributes.
1774
+ *
1775
+ * @remarks
1776
+ * Useful for crawlers to skip nofollow, sponsored, or UGC links.
1777
+ *
1778
+ * @example
1779
+ * ```typescript
1780
+ * // Skip nofollow and sponsored links
1781
+ * { excludeRel: ['nofollow', 'sponsored'] }
1782
+ * ```
1783
+ */
1784
+ excludeRel?: Array<'nofollow' | 'noopener' | 'noreferrer' | 'ugc' | 'sponsored'>;
1785
+ /**
1786
+ * Include only links with specific rel attributes.
1787
+ *
1788
+ * @remarks
1789
+ * If specified, only links matching these rel values are included.
1790
+ */
1791
+ includeRel?: Array<'nofollow' | 'noopener' | 'noreferrer' | 'ugc' | 'sponsored'>;
1792
+ /**
1793
+ * Whether to include hash-only links (#anchor).
1794
+ *
1795
+ * @default false
1796
+ */
1797
+ includeHashLinks?: boolean;
1798
+ /**
1799
+ * Whether to deduplicate URLs.
1800
+ *
1801
+ * @remarks
1802
+ * If true, only unique URLs are returned (keeps first occurrence).
1803
+ *
1804
+ * @default true
1805
+ */
1806
+ deduplicate?: boolean;
1807
+ /**
1808
+ * Maximum number of links to extract.
1809
+ *
1810
+ * @remarks
1811
+ * Useful for limiting extraction on large pages.
1812
+ */
1813
+ limit?: number;
1814
+ }
1815
+ /**
1816
+ * Links metadata extracted from HTML.
1817
+ *
1818
+ * @remarks
1819
+ * Contains categorized and analyzed links from the document.
1820
+ */
1821
+ interface LinksMetadata {
1822
+ /** All extracted links */
1823
+ all?: ExtractedLink[];
1824
+ /** Internal links (same origin) */
1825
+ internal?: ExtractedLink[];
1826
+ /** External links (different origin) */
1827
+ external?: ExtractedLink[];
1828
+ /** Links with nofollow rel */
1829
+ nofollow?: ExtractedLink[];
1830
+ /** Total count of links found */
1831
+ totalCount?: number;
1832
+ /** Count of internal links */
1833
+ internalCount?: number;
1834
+ /** Count of external links */
1835
+ externalCount?: number;
1836
+ /** Count of nofollow links */
1837
+ nofollowCount?: number;
1838
+ }
1839
+
1840
+ /**
1841
+ * Links extraction.
1842
+ *
1843
+ * @remarks
1844
+ * Extract navigational links from HTML documents with advanced filtering
1845
+ * and categorization for crawler and SEO use cases.
1846
+ *
1847
+ * @author Anonyfox <max@anonyfox.com>
1848
+ * @license MIT
1849
+ * @see {@link https://github.com/Anonyfox/ravenjs}
1850
+ * @see {@link https://ravenjs.dev}
1851
+ * @see {@link https://anonyfox.com}
1852
+ *
1853
+ * @packageDocumentation
1854
+ */
1855
+
1856
+ /**
1857
+ * Extract links from parsed HTML document.
1858
+ *
1859
+ * @remarks
1860
+ * Extracts all `<a href>` links with comprehensive metadata and filtering options.
1861
+ * Perfect for crawlers, SEO analysis, and link discovery.
1862
+ *
1863
+ * Features:
1864
+ * - Internal/external link categorization
1865
+ * - Rel attribute filtering (nofollow, ugc, sponsored, etc.)
1866
+ * - Automatic URL normalization
1867
+ * - Hash link filtering
1868
+ * - Scheme filtering (only http/https)
1869
+ * - Deduplication
1870
+ * - Link text extraction
1871
+ *
1872
+ * @param doc - Parsed HTML document
1873
+ * @param baseUrl - Optional base URL for resolving relative links and determining internal/external
1874
+ * @param options - Optional extraction options for filtering and categorization
1875
+ * @returns Links metadata with categorized links
1876
+ *
1877
+ * @example
1878
+ * ```typescript
1879
+ * const doc = parseHTML(htmlString);
1880
+ * const links = extractLinks(doc, 'https://example.com');
1881
+ *
1882
+ * // Get all internal links (same origin)
1883
+ * console.log(links.internal);
1884
+ *
1885
+ * // Get external links excluding nofollow
1886
+ * const linksNoFollow = extractLinks(doc, 'https://example.com', {
1887
+ * scope: 'external',
1888
+ * excludeRel: ['nofollow']
1889
+ * });
1890
+ * ```
1891
+ *
1892
+ * @example
1893
+ * ```typescript
1894
+ * // Crawler use case - get follow-able links
1895
+ * const links = extractLinks(doc, baseUrl, {
1896
+ * excludeRel: ['nofollow', 'ugc', 'sponsored'],
1897
+ * includeHashLinks: false
1898
+ * });
1899
+ * ```
1900
+ */
1901
+ declare function extractLinks(doc: HTMLDocument, baseUrl?: string | URL | null, options?: LinksExtractionOptions): LinksMetadata;
1902
+
1903
+ /**
1904
+ * Monetization and payment types.
1905
+ *
1906
+ * @remarks
1907
+ * Types for web monetization and payment metadata.
1908
+ *
1909
+ * @packageDocumentation
1910
+ */
1911
+ /**
1912
+ * Monetization metadata.
1913
+ *
1914
+ * @remarks
1915
+ * Contains web monetization and payment verification metadata.
1916
+ */
1917
+ interface MonetizationMetadata {
1918
+ /** Web Monetization API payment pointer */
1919
+ webMonetization?: string;
1920
+ /** PayPal site verification token */
1921
+ paypalVerification?: string;
1922
+ /** Brave Creator verification token */
1923
+ braveCreator?: string;
1924
+ /** Coil payment pointer (legacy) */
1925
+ coil?: string;
1926
+ /** Bitcoin address */
1927
+ bitcoin?: string;
1928
+ /** Ethereum address */
1929
+ ethereum?: string;
1930
+ }
1931
+
1932
+ /**
1933
+ * Monetization and payment extraction.
1934
+ *
1935
+ * @remarks
1936
+ * Extracts web monetization and payment metadata from HTML documents.
1937
+ *
1938
+ * @packageDocumentation
1939
+ */
1940
+
1941
+ /**
1942
+ * Extract monetization metadata from parsed HTML document.
1943
+ *
1944
+ * @remarks
1945
+ * Extracts web monetization, payment verification, and cryptocurrency
1946
+ * addresses from meta tags and link tags.
1947
+ *
1948
+ * @param doc - Parsed HTML document
1949
+ * @returns Monetization metadata
1950
+ *
1951
+ * @example
1952
+ * ```typescript
1953
+ * const doc = parseHTML(htmlString);
1954
+ * const monetization = extractMonetization(doc);
1955
+ * console.log(monetization.webMonetization);
1956
+ * console.log(monetization.bitcoin);
1957
+ * ```
1958
+ */
1959
+ declare function extractMonetization(doc: HTMLDocument): MonetizationMetadata;
1960
+
1961
+ /**
1962
+ * News and press types.
1963
+ *
1964
+ * @remarks
1965
+ * Types for news-specific metadata.
1966
+ *
1967
+ * @packageDocumentation
1968
+ */
1969
+ /**
1970
+ * News metadata.
1971
+ *
1972
+ * @remarks
1973
+ * Contains news-specific metadata for articles and press releases.
1974
+ */
1975
+ interface NewsMetadata {
1976
+ /** News keywords (distinct from regular keywords) */
1977
+ keywords?: string[];
1978
+ /** Google News standout tag (indicates exceptional journalism) */
1979
+ standout?: string;
1980
+ /** Syndication source (original publisher) */
1981
+ syndicationSource?: string;
1982
+ /** Original source URL */
1983
+ originalSource?: string;
1984
+ }
1985
+
1986
+ /**
1987
+ * News and press extraction.
1988
+ *
1989
+ * @remarks
1990
+ * Extracts news-specific metadata from HTML documents.
1991
+ *
1992
+ * @packageDocumentation
1993
+ */
1994
+
1995
+ /**
1996
+ * Extract news metadata from parsed HTML document.
1997
+ *
1998
+ * @remarks
1999
+ * Extracts news-specific metadata including keywords, standout tags,
2000
+ * and syndication information.
2001
+ *
2002
+ * @param doc - Parsed HTML document
2003
+ * @returns News metadata
2004
+ *
2005
+ * @example
2006
+ * ```typescript
2007
+ * const doc = parseHTML(htmlString);
2008
+ * const news = extractNews(doc);
2009
+ * console.log(news.keywords);
2010
+ * console.log(news.standout);
2011
+ * ```
2012
+ */
2013
+ declare function extractNews(doc: HTMLDocument): NewsMetadata;
2014
+
2015
+ /**
2016
+ * OpenGraph metadata types.
2017
+ *
2018
+ * @remarks
2019
+ * Facebook's Open Graph protocol for rich social sharing.
2020
+ *
2021
+ * @packageDocumentation
2022
+ */
2023
+ /**
2024
+ * OpenGraph article metadata.
2025
+ */
2026
+ interface OpenGraphArticle {
2027
+ /** Publication date */
2028
+ publishedTime?: string;
2029
+ /** Last modification date */
2030
+ modifiedTime?: string;
2031
+ /** Expiration date */
2032
+ expirationTime?: string;
2033
+ /** Article authors */
2034
+ authors?: string[];
2035
+ /** Article section/category */
2036
+ section?: string;
2037
+ /** Article tags */
2038
+ tags?: string[];
2039
+ }
2040
+ /**
2041
+ * OpenGraph video metadata.
2042
+ */
2043
+ interface OpenGraphVideo {
2044
+ /** Video URL */
2045
+ url?: string;
2046
+ /** HTTPS video URL */
2047
+ secureUrl?: string;
2048
+ /** MIME type */
2049
+ type?: string;
2050
+ /** Video width in pixels */
2051
+ width?: number;
2052
+ /** Video height in pixels */
2053
+ height?: number;
2054
+ /** Video duration in seconds */
2055
+ duration?: number;
2056
+ /** Release date */
2057
+ releaseDate?: string;
2058
+ /** Video tags */
2059
+ tags?: string[];
2060
+ }
2061
+ /**
2062
+ * OpenGraph audio metadata.
2063
+ */
2064
+ interface OpenGraphAudio {
2065
+ /** Audio URL */
2066
+ url?: string;
2067
+ /** HTTPS audio URL */
2068
+ secureUrl?: string;
2069
+ /** MIME type */
2070
+ type?: string;
2071
+ }
2072
+ /**
2073
+ * OpenGraph image metadata.
2074
+ */
2075
+ interface OpenGraphImage {
2076
+ /** Image URL */
2077
+ url: string;
2078
+ /** HTTPS image URL */
2079
+ secureUrl?: string;
2080
+ /** MIME type */
2081
+ type?: string;
2082
+ /** Image width in pixels */
2083
+ width?: number;
2084
+ /** Image height in pixels */
2085
+ height?: number;
2086
+ /** Alt text */
2087
+ alt?: string;
2088
+ }
2089
+ /**
2090
+ * OpenGraph book metadata.
2091
+ */
2092
+ interface OpenGraphBook {
2093
+ /** Book authors */
2094
+ authors?: string[];
2095
+ /** ISBN number */
2096
+ isbn?: string;
2097
+ /** Release date */
2098
+ releaseDate?: string;
2099
+ /** Book tags */
2100
+ tags?: string[];
2101
+ }
2102
+ /**
2103
+ * OpenGraph profile metadata.
2104
+ */
2105
+ interface OpenGraphProfile {
2106
+ /** First name */
2107
+ firstName?: string;
2108
+ /** Last name */
2109
+ lastName?: string;
2110
+ /** Username */
2111
+ username?: string;
2112
+ /** Gender */
2113
+ gender?: string;
2114
+ }
2115
+ /**
2116
+ * OpenGraph metadata extracted from meta tags.
2117
+ *
2118
+ * @remarks
2119
+ * Contains metadata from the Open Graph protocol used for rich social sharing.
2120
+ * All fields are optional - only present if found in the document.
2121
+ */
2122
+ interface OpenGraphMetadata {
2123
+ /** Content title */
2124
+ title?: string;
2125
+ /** Content type (article, website, video, etc.) */
2126
+ type?: string;
2127
+ /** Preview image URL (primary image) */
2128
+ image?: string;
2129
+ /** Canonical URL */
2130
+ url?: string;
2131
+ /** Content description */
2132
+ description?: string;
2133
+ /** Site name */
2134
+ siteName?: string;
2135
+ /** Content locale (e.g., en_US) */
2136
+ locale?: string;
2137
+ /** Alternate locales */
2138
+ localeAlternate?: string[];
2139
+ /** Article-specific metadata (if type is article) */
2140
+ article?: OpenGraphArticle;
2141
+ /** Video metadata (if type is video or video present) */
2142
+ video?: OpenGraphVideo;
2143
+ /** Audio metadata (if audio present) */
2144
+ audio?: OpenGraphAudio;
2145
+ /** All images with full metadata (if multiple images) */
2146
+ images?: OpenGraphImage[];
2147
+ /** Book metadata (if type is book) */
2148
+ book?: OpenGraphBook;
2149
+ /** Profile metadata (if type is profile) */
2150
+ profile?: OpenGraphProfile;
2151
+ }
2152
+
2153
+ /**
2154
+ * OpenGraph metadata extraction.
2155
+ *
2156
+ * @remarks
2157
+ * Extracts Open Graph protocol metadata from HTML documents.
2158
+ *
2159
+ * @packageDocumentation
2160
+ */
2161
+
2162
+ /**
2163
+ * Extract OpenGraph metadata from parsed HTML document.
2164
+ *
2165
+ * @remarks
2166
+ * Extracts Open Graph protocol metadata including basic metadata,
2167
+ * article data, video/audio, images, books, and profiles.
2168
+ *
2169
+ * @param doc - Parsed HTML document
2170
+ * @returns OpenGraph metadata object
2171
+ *
2172
+ * @example
2173
+ * ```typescript
2174
+ * const doc = parseHTML(htmlString);
2175
+ * const og = extractOpenGraph(doc);
2176
+ * console.log(og.title);
2177
+ * console.log(og.image);
2178
+ * console.log(og.article?.publishedTime);
2179
+ * ```
2180
+ */
2181
+ declare function extractOpenGraph(doc: HTMLDocument): OpenGraphMetadata;
2182
+
2183
+ /**
2184
+ * Pagination metadata types.
2185
+ *
2186
+ * @remarks
2187
+ * Types for multi-page content navigation.
2188
+ *
2189
+ * @packageDocumentation
2190
+ */
2191
+ /**
2192
+ * Pagination metadata.
2193
+ *
2194
+ * @remarks
2195
+ * Contains navigation links for multi-page content series.
2196
+ */
2197
+ interface PaginationMetadata {
2198
+ /** Previous page URL */
2199
+ prev?: string;
2200
+ /** Next page URL */
2201
+ next?: string;
2202
+ /** First page URL */
2203
+ first?: string;
2204
+ /** Last page URL */
2205
+ last?: string;
2206
+ /** Parent/up level URL */
2207
+ up?: string;
2208
+ /** Index/table of contents URL */
2209
+ index?: string;
2210
+ }
2211
+
2212
+ /**
2213
+ * Pagination metadata extraction.
2214
+ *
2215
+ * @remarks
2216
+ * Extracts pagination navigation links from HTML documents.
2217
+ *
2218
+ * @packageDocumentation
2219
+ */
2220
+
2221
+ /**
2222
+ * Extract pagination metadata from parsed HTML document.
2223
+ *
2224
+ * @remarks
2225
+ * Extracts pagination navigation links including prev, next, first, last,
2226
+ * up (parent), and index links.
2227
+ *
2228
+ * @param doc - Parsed HTML document
2229
+ * @returns Pagination metadata
2230
+ *
2231
+ * @example
2232
+ * ```typescript
2233
+ * const doc = parseHTML(htmlString);
2234
+ * const pagination = extractPagination(doc);
2235
+ * console.log(pagination.prev); // Previous page URL
2236
+ * console.log(pagination.next); // Next page URL
2237
+ * ```
2238
+ */
2239
+ declare function extractPagination(doc: HTMLDocument): PaginationMetadata;
2240
+
2241
+ /**
2242
+ * Robots and crawling directives types.
2243
+ *
2244
+ * @remarks
2245
+ * Types for robot crawling and indexing directives.
2246
+ *
2247
+ * @packageDocumentation
2248
+ */
2249
+ /**
2250
+ * Parsed robot directives.
2251
+ */
2252
+ interface RobotDirectives {
2253
+ /** Allow indexing */
2254
+ index?: boolean;
2255
+ /** Allow following links */
2256
+ follow?: boolean;
2257
+ /** Prevent archiving/caching */
2258
+ noarchive?: boolean;
2259
+ /** Prevent showing snippets */
2260
+ nosnippet?: boolean;
2261
+ /** Prevent indexing images */
2262
+ noimageindex?: boolean;
2263
+ /** Maximum snippet length (characters) */
2264
+ maxSnippet?: number;
2265
+ /** Maximum image preview size */
2266
+ maxImagePreview?: string;
2267
+ /** Maximum video preview length (seconds) */
2268
+ maxVideoPreview?: number;
2269
+ /** Prevent translation */
2270
+ notranslate?: boolean;
2271
+ /** Date after which content is unavailable */
2272
+ unavailableAfter?: string;
2273
+ }
2274
+ /**
2275
+ * Robots and crawling metadata.
2276
+ *
2277
+ * @remarks
2278
+ * Contains robot directives for search engines and crawlers.
2279
+ */
2280
+ interface RobotsMetadata {
2281
+ /** General robots directives */
2282
+ robots?: RobotDirectives;
2283
+ /** Google-specific directives */
2284
+ googlebot?: RobotDirectives;
2285
+ /** Bing-specific directives */
2286
+ bingbot?: RobotDirectives;
2287
+ /** Google News-specific directives */
2288
+ googlebotNews?: RobotDirectives;
2289
+ }
2290
+
2291
+ /**
2292
+ * Robots and crawling directives extraction.
2293
+ *
2294
+ * @remarks
2295
+ * Extracts robot crawling and indexing directives from HTML documents.
2296
+ *
2297
+ * @packageDocumentation
2298
+ */
2299
+
2300
+ /**
2301
+ * Extract robots metadata from parsed HTML document.
2302
+ *
2303
+ * @remarks
2304
+ * Extracts robot directives from meta tags for general robots,
2305
+ * Googlebot, Bingbot, and Google News bot.
2306
+ *
2307
+ * @param doc - Parsed HTML document
2308
+ * @returns Robots metadata
2309
+ *
2310
+ * @example
2311
+ * ```typescript
2312
+ * const doc = parseHTML(htmlString);
2313
+ * const robots = extractRobots(doc);
2314
+ * console.log(robots.robots?.index); // true/false
2315
+ * console.log(robots.robots?.follow); // true/false
2316
+ * ```
2317
+ */
2318
+ declare function extractRobots(doc: HTMLDocument): RobotsMetadata;
2319
+
2320
+ /**
2321
+ * Schema.org / JSON-LD metadata types.
2322
+ *
2323
+ * @remarks
2324
+ * Structured data for search engines and rich snippets using JSON-LD format.
2325
+ *
2326
+ * @packageDocumentation
2327
+ */
2328
+ /**
2329
+ * A single JSON-LD block found in the document, kept in both raw and parsed form.
2330
+ */
2331
+ interface JsonLdBlock {
2332
+ /** Original JSON string */
2333
+ raw: string;
2334
+ /** Parsed JSON value (untyped - narrow before use) */
2335
+ parsed: unknown;
2336
+ /** The `@type` field(s) from the JSON-LD */
2337
+ type?: string | string[];
2338
+ /** JSON-LD context field (commonly a string; typed `unknown` because any JSON value is accepted) */
2339
+ context?: unknown;
2340
+ }
2341
+ /**
2342
+ * Schema.org metadata extracted from JSON-LD scripts.
2343
+ *
2344
+ * @remarks
2345
+ * Contains all JSON-LD structured data blocks found in the document.
2346
+ * Provides convenience accessors for common types.
2347
+ */
2348
+ interface SchemaOrgMetadata {
2349
+ /** All JSON-LD blocks found in the document */
2350
+ jsonLd: JsonLdBlock[];
2351
+ /** Convenience: Article/NewsArticle/BlogPosting types */
2352
+ articles?: unknown[];
2353
+ /** Convenience: WebPage/WebSite types */
2354
+ webPages?: unknown[];
2355
+ /** Convenience: BreadcrumbList type */
2356
+ breadcrumbs?: unknown[];
2357
+ /** Convenience: Organization type */
2358
+ organization?: unknown;
2359
+ /** Convenience: Person type */
2360
+ person?: unknown;
2361
+ /** Convenience: Product types */
2362
+ products?: unknown[];
2363
+ /** Convenience: Event types */
2364
+ events?: unknown[];
2365
+ /** Convenience: Recipe types */
2366
+ recipes?: unknown[];
2367
+ /** Convenience: VideoObject types */
2368
+ videos?: unknown[];
2369
+ /** Convenience: ImageObject types */
2370
+ images?: unknown[];
2371
+ }
2372
+
2373
+ /**
2374
+ * Schema.org / JSON-LD extraction.
2375
+ *
2376
+ * @remarks
2377
+ * Extracts structured data from JSON-LD script tags.
2378
+ *
2379
+ * @packageDocumentation
2380
+ */
2381
+
2382
+ /**
2383
+ * Extract Schema.org metadata from parsed HTML document.
2384
+ *
2385
+ * @remarks
2386
+ * Finds all `<script type="application/ld+json">` tags, parses the JSON-LD,
2387
+ * and organizes by type for easy access.
2388
+ *
2389
+ * @param doc - Parsed HTML document
2390
+ * @returns Schema.org metadata: all blocks in `jsonLd`, plus optional per-type convenience fields
2391
+ *
2392
+ * @example
2393
+ * ```typescript
2394
+ * const doc = parseHTML(htmlString);
2395
+ * const schema = extractSchemaOrg(doc);
2396
+ * console.log(schema.jsonLd.length);
2397
+ * console.log(schema.articles);
2398
+ * ```
2399
+ */
2400
+ declare function extractSchemaOrg(doc: HTMLDocument): SchemaOrgMetadata;
2401
+
2402
+ /**
2403
+ * Security and privacy types.
2404
+ *
2405
+ * @remarks
2406
+ * Types for security and privacy-related metadata.
2407
+ *
2408
+ * @packageDocumentation
2409
+ */
2410
+ /**
2411
+ * Security metadata.
2412
+ *
2413
+ * @remarks
2414
+ * Contains security and privacy-related headers and meta tags.
2415
+ */
2416
+ interface SecurityMetadata {
2417
+ /** Referrer policy (controls Referer header) */
2418
+ referrerPolicy?: string;
2419
+ /** Content Security Policy directives */
2420
+ contentSecurityPolicy?: string;
2421
+ /** X-UA-Compatible directive (IE compatibility mode) */
2422
+ xUaCompatible?: string;
2423
+ /** Format detection (phone numbers, dates, etc.) */
2424
+ formatDetection?: string;
2425
+ }
2426
+
2427
+ /**
2428
+ * Security and privacy extraction.
2429
+ *
2430
+ * @remarks
2431
+ * Extracts security and privacy-related metadata from HTML documents.
2432
+ *
2433
+ * @packageDocumentation
2434
+ */
2435
+
2436
+ /**
2437
+ * Extract security metadata from parsed HTML document.
2438
+ *
2439
+ * @remarks
2440
+ * Extracts security and privacy-related meta tags including referrer policy,
2441
+ * content security policy, and browser compatibility directives.
2442
+ *
2443
+ * @param doc - Parsed HTML document
2444
+ * @returns Security metadata
2445
+ *
2446
+ * @example
2447
+ * ```typescript
2448
+ * const doc = parseHTML(htmlString);
2449
+ * const security = extractSecurity(doc);
2450
+ * console.log(security.referrerPolicy);
2451
+ * console.log(security.contentSecurityPolicy);
2452
+ * ```
2453
+ */
2454
+ declare function extractSecurity(doc: HTMLDocument): SecurityMetadata;
2455
+
2456
+ /**
2457
+ * SEO metadata types.
2458
+ *
2459
+ * @remarks
2460
+ * Standard HTML meta tags used by search engines and browsers.
2461
+ *
2462
+ * @packageDocumentation
2463
+ */
2464
+ /**
2465
+ * Basic SEO metadata extracted from standard HTML meta tags.
2466
+ *
2467
+ * @remarks
2468
+ * Contains metadata from common SEO-related meta tags including
2469
+ * title, description, keywords, and browser-specific tags.
2470
+ */
2471
+ interface SEOMetadata {
2472
+ /** Page title from the `<title>` tag */
2473
+ title?: string;
2474
+ /** Meta description for search results */
2475
+ description?: string;
2476
+ /** Keywords as a list (legacy but still used) */
2477
+ keywords?: string[];
2478
+ /** Page author */
2479
+ author?: string;
2480
+ /** Site generator (e.g., WordPress, Hugo) */
2481
+ generator?: string;
2482
+ /** Viewport settings */
2483
+ viewport?: string;
2484
+ /** Browser theme color */
2485
+ themeColor?: string;
2486
+ /** Color scheme preference (light, dark, auto) */
2487
+ colorScheme?: string;
2488
+ /** Web application name */
2489
+ applicationName?: string;
2490
+ /** iOS web app title */
2491
+ appleMobileWebAppTitle?: string;
2492
+ /** iOS web app capable (standalone mode) */
2493
+ appleMobileWebAppCapable?: boolean;
2494
+ /** iOS status bar style */
2495
+ appleMobileWebAppStatusBarStyle?: string;
2496
+ }
2497
+
2498
+ /**
2499
+ * SEO metadata extraction.
2500
+ *
2501
+ * @remarks
2502
+ * Extracts standard SEO meta tags from HTML documents.
2503
+ *
2504
+ * @packageDocumentation
2505
+ */
2506
+
2507
+ /**
2508
+ * Extract SEO metadata from parsed HTML document.
2509
+ *
2510
+ * @remarks
2511
+ * Extracts standard SEO meta tags including title, description, keywords,
2512
+ * and browser-specific configuration. All fields are optional.
2513
+ *
2514
+ * @param doc - Parsed HTML document
2515
+ * @returns SEO metadata object
2516
+ *
2517
+ * @example
2518
+ * ```typescript
2519
+ * const doc = parseHTML(htmlString);
2520
+ * const seo = extractSEO(doc);
2521
+ * console.log(seo.title); // Page title
2522
+ * console.log(seo.description); // Meta description
2523
+ * ```
2524
+ */
2525
+ declare function extractSEO(doc: HTMLDocument): SEOMetadata;
2526
+
2527
+ /**
2528
+ * Sitemap discovery types.
2529
+ *
2530
+ * @remarks
2531
+ * Types for discovering XML sitemaps and sitemap indexes.
2532
+ *
2533
+ * @packageDocumentation
2534
+ */
2535
+ /**
2536
+ * Sitemap discovery metadata.
2537
+ *
2538
+ * @remarks
2539
+ * Contains discovered sitemaps from `<link>` tags and suggested common sitemap URLs.
2540
+ */
2541
+ interface SitemapDiscoveryMetadata {
2542
+ /** Sitemaps explicitly declared in `<link rel="sitemap">` tags */
2543
+ sitemaps: string[];
2544
+ /** Suggested sitemap URLs based on common patterns (not verified) */
2545
+ suggestions?: string[];
2546
+ }
2547
+
2548
+ /**
2549
+ * Sitemap discovery extraction.
2550
+ *
2551
+ * @remarks
2552
+ * Discovers XML sitemaps in HTML documents.
2553
+ *
2554
+ * @packageDocumentation
2555
+ */
2556
+
2557
+ /**
2558
+ * Extract sitemap discovery metadata from parsed HTML document.
2559
+ *
2560
+ * @remarks
2561
+ * Finds all sitemaps declared in `<link rel="sitemap">` tags and generates
2562
+ * suggestions for common sitemap URL patterns.
2563
+ *
2564
+ * @param doc - Parsed HTML document
2565
+ * @param documentUrl - Optional document URL (string or `URL` object) for generating absolute sitemap suggestions
2566
+ * @returns Sitemap discovery metadata
2567
+ *
2568
+ * @example
2569
+ * ```typescript
2570
+ * const doc = parseHTML(htmlString);
2571
+ * const sitemaps = extractSitemapDiscovery(doc, 'https://example.com');
2572
+ * console.log(sitemaps.sitemaps); // Discovered sitemaps
2573
+ * console.log(sitemaps.suggestions); // Suggested sitemap URLs
2574
+ * ```
2575
+ */
2576
+ declare function extractSitemapDiscovery(doc: HTMLDocument, documentUrl?: string | URL): SitemapDiscoveryMetadata;
2577
+
2578
+ /**
2579
+ * Social profiles types.
2580
+ *
2581
+ * @remarks
2582
+ * Types for social media profile links.
2583
+ *
2584
+ * @packageDocumentation
2585
+ */
2586
+ /**
2587
+ * Social profile metadata.
2588
+ *
2589
+ * @remarks
2590
+ * Contains social media profile URLs and handles from various platforms.
2591
+ */
2592
+ interface SocialProfilesMetadata {
2593
+ /** Twitter/X username (without @) */
2594
+ twitter?: string;
2595
+ /** Facebook profile/page URL */
2596
+ facebook?: string;
2597
+ /** Instagram username or URL */
2598
+ instagram?: string;
2599
+ /** LinkedIn profile/company URL */
2600
+ linkedin?: string;
2601
+ /** YouTube channel URL */
2602
+ youtube?: string;
2603
+ /** GitHub username or organization URL */
2604
+ github?: string;
2605
+ /** TikTok username or URL */
2606
+ tiktok?: string;
2607
+ /** Pinterest username or URL */
2608
+ pinterest?: string;
2609
+ /** Mastodon profile URL */
2610
+ mastodon?: string;
2611
+ /** Reddit username or URL */
2612
+ reddit?: string;
2613
+ /** Other social profiles (platform: url/username) */
2614
+ other?: Record<string, string>;
2615
+ }
2616
+
2617
+ /**
2618
+ * Social profiles extraction.
2619
+ *
2620
+ * @remarks
2621
+ * Extracts social media profile links from HTML documents.
2622
+ *
2623
+ * @packageDocumentation
2624
+ */
2625
+
2626
+ /**
2627
+ * Extract social profiles metadata from parsed HTML document.
2628
+ *
2629
+ * @remarks
2630
+ * Extracts social media profile URLs and handles from meta tags and structured data.
2631
+ *
2632
+ * @param doc - Parsed HTML document
2633
+ * @returns Social profiles metadata
2634
+ *
2635
+ * @example
2636
+ * ```typescript
2637
+ * const doc = parseHTML(htmlString);
2638
+ * const profiles = extractSocialProfiles(doc);
2639
+ * console.log(profiles.twitter);
2640
+ * console.log(profiles.facebook);
2641
+ * ```
2642
+ */
2643
+ declare function extractSocialProfiles(doc: HTMLDocument): SocialProfilesMetadata;
2644
+
2645
+ /**
2646
+ * Twitter Card metadata types.
2647
+ *
2648
+ * @remarks
2649
+ * Twitter-specific metadata for rich cards.
2650
+ *
2651
+ * @packageDocumentation
2652
+ */
2653
+ /**
2654
+ * Twitter app card metadata for a specific platform.
2655
+ */
2656
+ interface TwitterAppPlatform {
2657
+ /** App name */
2658
+ name?: string;
2659
+ /** App ID */
2660
+ id?: string;
2661
+ /** App URL/deep link */
2662
+ url?: string;
2663
+ }
2664
+ /**
2665
+ * Twitter app card metadata.
2666
+ */
2667
+ interface TwitterApp {
2668
+ /** iPhone app details */
2669
+ iphone?: TwitterAppPlatform;
2670
+ /** iPad app details */
2671
+ ipad?: TwitterAppPlatform;
2672
+ /** Google Play app details */
2673
+ googleplay?: TwitterAppPlatform;
2674
+ }
2675
+ /**
2676
+ * Twitter player card metadata.
2677
+ */
2678
+ interface TwitterPlayer {
2679
+ /** Player URL */
2680
+ url?: string;
2681
+ /** Player width in pixels */
2682
+ width?: number;
2683
+ /** Player height in pixels */
2684
+ height?: number;
2685
+ /** Stream URL */
2686
+ stream?: string;
2687
+ }
2688
+ /**
2689
+ * Twitter Card metadata extracted from meta tags.
2690
+ *
2691
+ * @remarks
2692
+ * Contains metadata for Twitter Cards used for rich social sharing on Twitter.
2693
+ * All fields are optional - only present if found in the document.
2694
+ */
2695
+ interface TwitterCardMetadata {
2696
+ /** Card type: a well-known value (summary, summary_large_image, app, player) or any other string */
2697
+ card?: 'summary' | 'summary_large_image' | 'app' | 'player' | (string & {});
2698
+ /** Twitter username of website (with or without @ symbol) */
2699
+ site?: string;
2700
+ /** Twitter username of content creator (with or without @ symbol) */
2701
+ creator?: string;
2702
+ /** Content title (max 70 chars) */
2703
+ title?: string;
2704
+ /** Content description (max 200 chars) */
2705
+ description?: string;
2706
+ /** Image URL */
2707
+ image?: string;
2708
+ /** Image alt text */
2709
+ imageAlt?: string;
2710
+ /** App card details (if card type is 'app') */
2711
+ app?: TwitterApp;
2712
+ /** Player card details (if card type is 'player') */
2713
+ player?: TwitterPlayer;
2714
+ }
2715
+
2716
+ /**
2717
+ * Twitter Card metadata extraction.
2718
+ *
2719
+ * @remarks
2720
+ * Extracts Twitter Card metadata from HTML documents.
2721
+ *
2722
+ * @packageDocumentation
2723
+ */
2724
+
2725
+ /**
2726
+ * Extract Twitter Card metadata from parsed HTML document.
2727
+ *
2728
+ * @remarks
2729
+ * Extracts Twitter Card metadata including card type, site/creator info,
2730
+ * title/description, images, app cards, and player cards.
2731
+ *
2732
+ * @param doc - Parsed HTML document
2733
+ * @returns Twitter Card metadata object
2734
+ *
2735
+ * @example
2736
+ * ```typescript
2737
+ * const doc = parseHTML(htmlString);
2738
+ * const twitter = extractTwitterCard(doc);
2739
+ * console.log(twitter.card);
2740
+ * console.log(twitter.title);
2741
+ * ```
2742
+ */
2743
+ declare function extractTwitterCard(doc: HTMLDocument): TwitterCardMetadata;
2744
+
2745
+ /**
2746
+ * Verification tags types.
2747
+ *
2748
+ * @remarks
2749
+ * Types for domain and ownership verification tags.
2750
+ *
2751
+ * @packageDocumentation
2752
+ */
2753
+ /**
2754
+ * Verification metadata.
2755
+ *
2756
+ * @remarks
2757
+ * Contains verification tags from various platforms for domain and ownership verification.
2758
+ */
2759
+ interface VerificationMetadata {
2760
+ /** Google Site Verification token */
2761
+ googleSiteVerification?: string;
2762
+ /** Bing/Microsoft verification token */
2763
+ msvalidate?: string;
2764
+ /** Yandex verification token */
2765
+ yandexVerification?: string;
2766
+ /** Facebook domain verification token */
2767
+ facebookDomainVerification?: string;
2768
+ /** Pinterest domain verification token */
2769
+ pinterestVerification?: string;
2770
+ /** Alexa verification token */
2771
+ alexaVerification?: string;
2772
+ /** Norton Safe Web verification token */
2773
+ nortonSafeWeb?: string;
2774
+ /** Other verification tags (platform: token) */
2775
+ other?: Record<string, string>;
2776
+ }
2777
+
2778
+ /**
2779
+ * Verification tags extraction.
2780
+ *
2781
+ * @remarks
2782
+ * Extracts domain and ownership verification tags from HTML documents.
2783
+ *
2784
+ * @packageDocumentation
2785
+ */
2786
+
2787
+ /**
2788
+ * Extract verification metadata from parsed HTML document.
2789
+ *
2790
+ * @remarks
2791
+ * Extracts verification tags used by various platforms for domain and ownership verification.
2792
+ *
2793
+ * @param doc - Parsed HTML document
2794
+ * @returns Verification metadata
2795
+ *
2796
+ * @example
2797
+ * ```typescript
2798
+ * const doc = parseHTML(htmlString);
2799
+ * const verification = extractVerification(doc);
2800
+ * console.log(verification.googleSiteVerification);
2801
+ * console.log(verification.facebookDomainVerification);
2802
+ * ```
2803
+ */
2804
+ declare function extractVerification(doc: HTMLDocument): VerificationMetadata;
2805
+
2806
+ /**
2807
+ * Enhanced fetch types for web scraping.
2808
+ *
2809
+ * @remarks
2810
+ * Types for pluck() - fetch-compatible enhanced HTTP client.
2811
+ *
2812
+ * @author Anonyfox <max@anonyfox.com>
2813
+ * @license MIT
2814
+ * @see {@link https://github.com/Anonyfox/ravenjs}
2815
+ * @see {@link https://ravenjs.dev}
2816
+ * @see {@link https://anonyfox.com}
2817
+ *
2818
+ * @packageDocumentation
2819
+ */
2820
+ /**
2821
+ * Extended RequestInit with pluck-specific options.
2822
+ *
2823
+ * @remarks
2824
+ * Extends standard fetch RequestInit with additional options for
2825
+ * robust web scraping. All standard fetch options are supported.
2826
+ */
2827
+ interface PluckInit extends RequestInit {
2828
+ /**
2829
+ * Request timeout in milliseconds.
2830
+ *
2831
+ * @default 30000 (30 seconds)
2832
+ */
2833
+ timeout?: number;
2834
+ /**
2835
+ * Maximum number of redirects to follow.
2836
+ *
2837
+ * @default 10
2838
+ */
2839
+ maxRedirects?: number;
2840
+ /**
2841
+ * Maximum response size in bytes.
2842
+ *
2843
+ * @default 10485760 (10MB)
2844
+ */
2845
+ maxSize?: number;
2846
+ /**
2847
+ * User-Agent header shortcut.
2848
+ *
2849
+ * @remarks
2850
+ * Convenience property that sets the User-Agent header.
2851
+ * Overrides any User-Agent in the headers object.
2852
+ */
2853
+ userAgent?: string;
2854
+ /**
2855
+ * Throw error on HTTP error status (4xx, 5xx).
2856
+ *
2857
+ * @default true
2858
+ */
2859
+ throwOnHttpError?: boolean;
2860
+ /**
2861
+ * Validate Content-Type header.
2862
+ *
2863
+ * @remarks
2864
+ * If true, throws error if Content-Type is not in allowedContentTypes.
2865
+ *
2866
+ * @default false
2867
+ */
2868
+ strictContentType?: boolean;
2869
+ /**
2870
+ * Allowed Content-Type values for strictContentType.
2871
+ *
2872
+ * @default ['text/html', 'text/xml', 'application/xml', 'application/xhtml+xml', 'application/rss+xml', 'application/atom+xml', 'application/json']
2873
+ */
2874
+ allowedContentTypes?: string[];
2875
+ /**
2876
+ * Follow redirects automatically.
2877
+ *
2878
+ * @remarks
2879
+ * If false, returns the 3xx response directly without following.
2880
+ *
2881
+ * @default true
2882
+ */
2883
+ followRedirects?: boolean;
2884
+ /**
2885
+ * Validate detected encoding.
2886
+ *
2887
+ * @remarks
2888
+ * If true, throws error if detected encoding is invalid or unsupported.
2889
+ *
2890
+ * @default true
2891
+ */
2892
+ validateEncoding?: boolean;
2893
+ }
2894
+ /**
2895
+ * Enhanced Response with pluck-specific properties.
2896
+ *
2897
+ * @remarks
2898
+ * Extends standard Response with additional metadata about the request.
2899
+ * All standard Response properties and methods are available.
2900
+ */
2901
+ interface PluckResponse extends Response {
2902
+ /**
2903
+ * Final URL after following redirects.
2904
+ */
2905
+ finalUrl: string;
2906
+ /**
2907
+ * Original request URL.
2908
+ */
2909
+ originalUrl: string;
2910
+ /**
2911
+ * Array of redirect URLs (excluding original and final).
2912
+ */
2913
+ redirectChain: string[];
2914
+ /**
2915
+ * Detected character encoding.
2916
+ *
2917
+ * @example 'utf-8', 'windows-1252', 'iso-8859-1'
2918
+ */
2919
+ detectedEncoding: string;
2920
+ /**
2921
+ * Request timing information.
2922
+ */
2923
+ timing: {
2924
+ /** Request start timestamp (milliseconds since epoch) */
2925
+ start: number;
2926
+ /** Request end timestamp (milliseconds since epoch) */
2927
+ end: number;
2928
+ /** Total duration in milliseconds */
2929
+ duration: number;
2930
+ /** Time spent in redirects (milliseconds) */
2931
+ redirectDuration?: number;
2932
+ };
2933
+ /**
2934
+ * Get response body as UTF-8 text.
2935
+ *
2936
+ * @remarks
2937
+ * Unlike standard text(), this guarantees UTF-8 output regardless
2938
+ * of the source encoding. Uses detected encoding to decode properly.
2939
+ *
2940
+ * @returns UTF-8 decoded text
2941
+ */
2942
+ textUtf8(): Promise<string>;
2943
+ }
2944
+ /**
2945
+ * Base error class for pluck errors.
2946
+ */
2947
+ declare class PluckError extends Error {
2948
+ constructor(message: string);
2949
+ }
2950
+ /**
2951
+ * Network error (connection failed, DNS, etc.).
2952
+ */
2953
+ declare class PluckNetworkError extends PluckError {
2954
+ readonly cause?: Error | undefined;
2955
+ constructor(message: string, cause?: Error | undefined);
2956
+ }
2957
+ /**
2958
+ * Request timeout error.
2959
+ */
2960
+ declare class PluckTimeoutError extends PluckError {
2961
+ readonly timeoutMs: number;
2962
+ constructor(message: string, timeoutMs: number);
2963
+ }
2964
+ /**
2965
+ * HTTP error (4xx, 5xx status codes).
2966
+ */
2967
+ declare class PluckHttpError extends PluckError {
2968
+ readonly statusCode: number;
2969
+ readonly statusText: string;
2970
+ readonly response: Response;
2971
+ constructor(message: string, statusCode: number, statusText: string, response: Response);
2972
+ }
2973
+ /**
2974
+ * Response size exceeded maximum.
2975
+ */
2976
+ declare class PluckSizeError extends PluckError {
2977
+ readonly maxSize: number;
2978
+ readonly actualSize?: number | undefined;
2979
+ constructor(message: string, maxSize: number, actualSize?: number | undefined);
2980
+ }
2981
+ /**
2982
+ * Encoding detection or conversion error.
2983
+ */
2984
+ declare class PluckEncodingError extends PluckError {
2985
+ readonly encoding?: string | undefined;
2986
+ readonly cause?: Error | undefined;
2987
+ constructor(message: string, encoding?: string | undefined, cause?: Error | undefined);
2988
+ }
2989
+ /**
2990
+ * Too many redirects or redirect loop detected.
2991
+ */
2992
+ declare class PluckRedirectError extends PluckError {
2993
+ readonly redirectChain: string[];
2994
+ readonly maxRedirects?: number | undefined;
2995
+ constructor(message: string, redirectChain: string[], maxRedirects?: number | undefined);
2996
+ }
2997
+ /**
2998
+ * Invalid or disallowed Content-Type.
2999
+ */
3000
+ declare class PluckContentTypeError extends PluckError {
3001
+ readonly contentType: string;
3002
+ readonly allowedTypes?: string[] | undefined;
3003
+ constructor(message: string, contentType: string, allowedTypes?: string[] | undefined);
3004
+ }
3005
+
3006
+ /**
3007
+ * Enhanced fetch for web scraping.
3008
+ *
3009
+ * @remarks
3010
+ * fetch-compatible HTTP client with robust handling of real-world web content.
3011
+ *
3012
+ * @author Anonyfox <max@anonyfox.com>
3013
+ * @license MIT
3014
+ * @see {@link https://github.com/Anonyfox/ravenjs}
3015
+ * @see {@link https://ravenjs.dev}
3016
+ * @see {@link https://anonyfox.com}
3017
+ *
3018
+ * @packageDocumentation
3019
+ */
3020
+
3021
+ /**
3022
+ * Enhanced fetch for web scraping.
3023
+ *
3024
+ * @remarks
3025
+ * Drop-in replacement for fetch() with enhanced error handling, encoding detection,
3026
+ * redirect tracking, and size limits. Perfect for scraping HTML, feeds, and APIs.
3027
+ *
3028
+ * Features:
3029
+ * - Manual redirect tracking with full chain
3030
+ * - Automatic encoding detection and UTF-8 conversion
3031
+ * - Configurable timeouts and size limits
3032
+ * - Smart default headers for web scraping
3033
+ * - Content-Type validation
3034
+ * - Comprehensive error types
3035
+ *
3036
+ * @param input - URL string, `URL` object, or `Request` object
3037
+ * @param init - Request options (extends standard RequestInit)
3038
+ * @returns Promise resolving to an enhanced Response with additional metadata
3039
+ * @throws {PluckTimeoutError} Request exceeded the configured `timeout`
3040
+ * @throws {PluckNetworkError} Network or DNS error
3041
+ * @throws {PluckHttpError} HTTP error status (4xx, 5xx) when `throwOnHttpError` is enabled (the default)
3042
+ * @throws {PluckRedirectError} More than `maxRedirects` redirects, or a redirect loop
3043
+ * @throws {PluckSizeError} Response larger than `maxSize`
3044
+ * @throws {PluckEncodingError} Encoding detection or conversion failure
3045
+ * @throws {PluckContentTypeError} Disallowed Content-Type when `strictContentType` is enabled
3046
+ *
3047
+ * @example
3048
+ * ```typescript
3049
+ * // Basic usage (works like fetch)
3050
+ * const response = await pluck('https://example.com');
3051
+ * const html = await response.text();
3052
+ *
3053
+ * // With enhancements
3054
+ * console.log(response.redirectChain);
3055
+ * console.log(response.detectedEncoding);
3056
+ * console.log(response.timing);
3057
+ * ```
3058
+ *
3059
+ * @example
3060
+ * ```typescript
3061
+ * // With options
3062
+ * const response = await pluck('https://example.com', {
3063
+ * timeout: 60000,
3064
+ * maxRedirects: 5,
3065
+ * userAgent: 'MyBot/1.0',
3066
+ * throwOnHttpError: true
3067
+ * });
3068
+ * ```
3069
+ */
3070
+ declare function pluck(input: string | URL | Request, init?: PluckInit): Promise<PluckResponse>;
3071
+
3072
+ export { type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon, type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint, type ContentExtractionOptions, type ContentQuality, type ContentResult, type CopyrightMetadata, type DiscoveredFeed, type DublinCoreMetadata, type ExtractedContent, type ExtractedLink, type ExtractionErrorType, type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata, type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata, type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata, type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions, type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata, type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook, type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile, type OpenGraphVideo, type PaginationMetadata, type ParseResult, PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, type PluckInit, PluckNetworkError, PluckRedirectError, type PluckResponse, PluckSizeError, PluckTimeoutError, type PreloadResource, type RobotDirectives, type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata, type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata, type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata, type TwitterPlayer, type VerificationMetadata, type Website, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, 
htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck };