magpie-html 0.2.1 → 0.2.3

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -603,7 +603,7 @@ interface FeedItem {
603
603
  */
604
604
  interface Feed {
605
605
  /** Original feed format */
606
- format: 'rss' | 'atom' | 'json-feed';
606
+ format: 'rss' | 'atom' | 'json-feed' | 'sitemap';
607
607
  /** Feed title (required) */
608
608
  title: string;
609
609
  /** Feed description or subtitle */
@@ -952,6 +952,7 @@ declare function gatherArticle(url: string | URL): Promise<Article>;
952
952
  * @remarks
953
953
  * This is a high-level convenience method that combines fetching and parsing.
954
954
  * It handles encoding detection, redirects, and feed format detection automatically.
955
+ * Falls back to sitemap parsing when standard feed formats aren't detected.
955
956
  *
956
957
  * @param url - Feed URL as string or URL object
957
958
  * @returns Normalized feed data
@@ -1042,6 +1043,11 @@ declare function gatherWebsite(url: string | URL): Promise<Website>;
1042
1043
  */
1043
1044
  declare function parseHTML(html: string, baseUrl?: string): Document;
1044
1045
  type HTMLDocument = Document;
1046
+ /**
1047
+ * Input type that accepts either a parsed Document or raw HTML string.
1048
+ * This allows extractor functions to be more forgiving.
1049
+ */
1050
+ type DocumentInput = Document | string;
1045
1051
 
1046
1052
  /**
1047
1053
  * Analytics and tracking types.
@@ -1088,24 +1094,26 @@ interface AnalyticsMetadata {
1088
1094
  */
1089
1095
 
1090
1096
  /**
1091
- * Extract analytics metadata from parsed HTML document.
1097
+ * Extract analytics metadata from HTML.
1092
1098
  *
1093
1099
  * @remarks
1094
1100
  * Detects analytics service IDs by examining script tags and their content.
1095
1101
  * Only extracts identifiers, does not track or collect user data.
1096
1102
  *
1097
- * @param doc - Parsed HTML document
1103
+ * @param input - Parsed HTML document or raw HTML string
1098
1104
  * @returns Analytics metadata
1099
1105
  *
1100
1106
  * @example
1101
1107
  * ```typescript
1108
+ * // With parsed document (recommended for multiple extractions)
1102
1109
  * const doc = parseHTML(htmlString);
1103
1110
  * const analytics = extractAnalytics(doc);
1104
- * console.log(analytics.googleAnalytics);
1105
- * console.log(analytics.googleTagManager);
1111
+ *
1112
+ * // Or directly with HTML string
1113
+ * const analytics = extractAnalytics(htmlString);
1106
1114
  * ```
1107
1115
  */
1108
- declare function extractAnalytics(doc: HTMLDocument): AnalyticsMetadata;
1116
+ declare function extractAnalytics(input: DocumentInput): AnalyticsMetadata;
1109
1117
 
1110
1118
  /**
1111
1119
  * Assets extraction types.
@@ -1189,7 +1197,7 @@ interface ConnectionHint {
1189
1197
  */
1190
1198
 
1191
1199
  /**
1192
- * Extract assets metadata from parsed HTML document.
1200
+ * Extract assets metadata from HTML.
1193
1201
  *
1194
1202
  * @remarks
1195
1203
  * Extracts all external assets referenced in the document, organized by type.
@@ -1205,20 +1213,21 @@ interface ConnectionHint {
1205
1213
  * - Preloads: `<link rel="preload">` and `<link rel="prefetch">`
1206
1214
  * - Connection hints: `<link rel="dns-prefetch">` and `<link rel="preconnect">`
1207
1215
  *
1208
- * @param doc - Parsed HTML document
1216
+ * @param input - Parsed HTML document or raw HTML string
1209
1217
  * @param baseUrl - Optional base URL for resolving relative URLs
1210
1218
  * @returns Assets metadata object with categorized URLs
1211
1219
  *
1212
1220
  * @example
1213
1221
  * ```typescript
1222
+ * // With parsed document (recommended for multiple extractions)
1214
1223
  * const doc = parseHTML(htmlString);
1215
1224
  * const assets = extractAssets(doc, 'https://example.com');
1216
- * console.log(assets.images);
1217
- * console.log(assets.stylesheets);
1218
- * console.log(assets.scripts);
1225
+ *
1226
+ * // Or directly with HTML string
1227
+ * const assets = extractAssets(htmlString, 'https://example.com');
1219
1228
  * ```
1220
1229
  */
1221
- declare function extractAssets(doc: HTMLDocument, baseUrl?: string | URL | null): AssetsMetadata;
1230
+ declare function extractAssets(input: DocumentInput, baseUrl?: string | URL | null): AssetsMetadata;
1222
1231
 
1223
1232
  /**
1224
1233
  * Canonical and alternate URL metadata types.
@@ -1282,24 +1291,26 @@ interface CanonicalMetadata {
1282
1291
  */
1283
1292
 
1284
1293
  /**
1285
- * Extract canonical and alternate URL metadata from parsed HTML document.
1294
+ * Extract canonical and alternate URL metadata from HTML.
1286
1295
  *
1287
1296
  * @remarks
1288
1297
  * Extracts canonical URLs, language alternates, AMP versions, manifests,
1289
1298
  * and app linking metadata.
1290
1299
  *
1291
- * @param doc - Parsed HTML document
1300
+ * @param input - Parsed HTML document or raw HTML string
1292
1301
  * @returns Canonical metadata object
1293
1302
  *
1294
1303
  * @example
1295
1304
  * ```typescript
1305
+ * // With parsed document (recommended for multiple extractions)
1296
1306
  * const doc = parseHTML(htmlString);
1297
1307
  * const canonical = extractCanonical(doc);
1298
- * console.log(canonical.canonical);
1299
- * console.log(canonical.alternates);
1308
+ *
1309
+ * // Or directly with HTML string
1310
+ * const canonical = extractCanonical(htmlString);
1300
1311
  * ```
1301
1312
  */
1302
- declare function extractCanonical(doc: HTMLDocument): CanonicalMetadata;
1313
+ declare function extractCanonical(input: DocumentInput): CanonicalMetadata;
1303
1314
 
1304
1315
  /**
1305
1316
  * Copyright and licensing types.
@@ -1336,24 +1347,26 @@ interface CopyrightMetadata {
1336
1347
  */
1337
1348
 
1338
1349
  /**
1339
- * Extract copyright metadata from parsed HTML document.
1350
+ * Extract copyright metadata from HTML.
1340
1351
  *
1341
1352
  * @remarks
1342
1353
  * Extracts copyright and licensing information from meta tags, link tags,
1343
1354
  * and Schema.org structured data.
1344
1355
  *
1345
- * @param doc - Parsed HTML document
1356
+ * @param input - Parsed HTML document or raw HTML string
1346
1357
  * @returns Copyright metadata
1347
1358
  *
1348
1359
  * @example
1349
1360
  * ```typescript
1361
+ * // With parsed document (recommended for multiple extractions)
1350
1362
  * const doc = parseHTML(htmlString);
1351
1363
  * const copyright = extractCopyright(doc);
1352
- * console.log(copyright.copyright);
1353
- * console.log(copyright.license);
1364
+ *
1365
+ * // Or directly with HTML string
1366
+ * const copyright = extractCopyright(htmlString);
1354
1367
  * ```
1355
1368
  */
1356
- declare function extractCopyright(doc: HTMLDocument): CopyrightMetadata;
1369
+ declare function extractCopyright(input: DocumentInput): CopyrightMetadata;
1357
1370
 
1358
1371
  /**
1359
1372
  * Dublin Core metadata types.
@@ -1413,25 +1426,27 @@ interface DublinCoreMetadata {
1413
1426
  */
1414
1427
 
1415
1428
  /**
1416
- * Extract Dublin Core metadata from parsed HTML document.
1429
+ * Extract Dublin Core metadata from HTML.
1417
1430
  *
1418
1431
  * @remarks
1419
1432
  * Extracts Dublin Core metadata using both DC. and dcterms. prefixes.
1420
1433
  * Fields that can have multiple values (creator, subject, contributor)
1421
1434
  * are extracted as arrays.
1422
1435
  *
1423
- * @param doc - Parsed HTML document
1436
+ * @param input - Parsed HTML document or raw HTML string
1424
1437
  * @returns Dublin Core metadata object
1425
1438
  *
1426
1439
  * @example
1427
1440
  * ```typescript
1441
+ * // With parsed document (recommended for multiple extractions)
1428
1442
  * const doc = parseHTML(htmlString);
1429
1443
  * const dc = extractDublinCore(doc);
1430
- * console.log(dc.title);
1431
- * console.log(dc.creator);
1444
+ *
1445
+ * // Or directly with HTML string
1446
+ * const dc = extractDublinCore(htmlString);
1432
1447
  * ```
1433
1448
  */
1434
- declare function extractDublinCore(doc: HTMLDocument): DublinCoreMetadata;
1449
+ declare function extractDublinCore(input: DocumentInput): DublinCoreMetadata;
1435
1450
 
1436
1451
  /**
1437
1452
  * Feed discovery types.
@@ -1475,25 +1490,27 @@ interface FeedDiscoveryMetadata {
1475
1490
  */
1476
1491
 
1477
1492
  /**
1478
- * Extract feed discovery metadata from parsed HTML document.
1493
+ * Extract feed discovery metadata from HTML.
1479
1494
  *
1480
1495
  * @remarks
1481
1496
  * Finds all feeds declared in <link rel="alternate"> tags and generates
1482
1497
  * suggestions for common feed URL patterns.
1483
1498
  *
1484
- * @param doc - Parsed HTML document
1499
+ * @param input - Parsed HTML document or raw HTML string
1485
1500
  * @param documentUrl - Optional document URL for generating absolute feed suggestions
1486
1501
  * @returns Feed discovery metadata
1487
1502
  *
1488
1503
  * @example
1489
1504
  * ```typescript
1505
+ * // With parsed document (recommended for multiple extractions)
1490
1506
  * const doc = parseHTML(htmlString);
1491
1507
  * const feeds = extractFeedDiscovery(doc, 'https://example.com');
1492
- * console.log(feeds.feeds); // Discovered feeds
1493
- * console.log(feeds.suggestions); // Suggested feed URLs
1508
+ *
1509
+ * // Or directly with HTML string
1510
+ * const feeds = extractFeedDiscovery(htmlString, 'https://example.com');
1494
1511
  * ```
1495
1512
  */
1496
- declare function extractFeedDiscovery(doc: HTMLDocument, documentUrl?: string | URL): FeedDiscoveryMetadata;
1513
+ declare function extractFeedDiscovery(input: DocumentInput, documentUrl?: string | URL): FeedDiscoveryMetadata;
1497
1514
 
1498
1515
  /**
1499
1516
  * Geographic location types.
@@ -1542,24 +1559,26 @@ interface GeoMetadata {
1542
1559
  */
1543
1560
 
1544
1561
  /**
1545
- * Extract geographic metadata from parsed HTML document.
1562
+ * Extract geographic metadata from HTML.
1546
1563
  *
1547
1564
  * @remarks
1548
1565
  * Extracts geographic location information including coordinates,
1549
1566
  * place names, and region codes from meta tags.
1550
1567
  *
1551
- * @param doc - Parsed HTML document
1568
+ * @param input - Parsed HTML document or raw HTML string
1552
1569
  * @returns Geographic metadata
1553
1570
  *
1554
1571
  * @example
1555
1572
  * ```typescript
1573
+ * // With parsed document (recommended for multiple extractions)
1556
1574
  * const doc = parseHTML(htmlString);
1557
1575
  * const geo = extractGeo(doc);
1558
- * console.log(geo.position?.latitude);
1559
- * console.log(geo.placename);
1576
+ *
1577
+ * // Or directly with HTML string
1578
+ * const geo = extractGeo(htmlString);
1560
1579
  * ```
1561
1580
  */
1562
- declare function extractGeo(doc: HTMLDocument): GeoMetadata;
1581
+ declare function extractGeo(input: DocumentInput): GeoMetadata;
1563
1582
 
1564
1583
  /**
1565
1584
  * Icons and visual assets types.
@@ -1632,24 +1651,26 @@ interface IconsMetadata {
1632
1651
  */
1633
1652
 
1634
1653
  /**
1635
- * Extract icons metadata from parsed HTML document.
1654
+ * Extract icons metadata from HTML.
1636
1655
  *
1637
1656
  * @remarks
1638
1657
  * Extracts all icon-related metadata including favicons, Apple touch icons,
1639
1658
  * Safari mask icons, and Microsoft tile configuration.
1640
1659
  *
1641
- * @param doc - Parsed HTML document
1660
+ * @param input - Parsed HTML document or raw HTML string
1642
1661
  * @returns Icons metadata
1643
1662
  *
1644
1663
  * @example
1645
1664
  * ```typescript
1665
+ * // With parsed document (recommended for multiple extractions)
1646
1666
  * const doc = parseHTML(htmlString);
1647
1667
  * const icons = extractIcons(doc);
1648
- * console.log(icons.favicon);
1649
- * console.log(icons.appleTouchIcons);
1668
+ *
1669
+ * // Or directly with HTML string
1670
+ * const icons = extractIcons(htmlString);
1650
1671
  * ```
1651
1672
  */
1652
- declare function extractIcons(doc: HTMLDocument): IconsMetadata;
1673
+ declare function extractIcons(input: DocumentInput): IconsMetadata;
1653
1674
 
1654
1675
  /**
1655
1676
  * Language and localization types.
@@ -1691,24 +1712,26 @@ interface LanguageMetadata {
1691
1712
  */
1692
1713
 
1693
1714
  /**
1694
- * Extract language and localization metadata from parsed HTML document.
1715
+ * Extract language and localization metadata from HTML.
1695
1716
  *
1696
1717
  * @remarks
1697
1718
  * Extracts language information from HTML lang attribute, meta tags,
1698
1719
  * and OpenGraph locale. Normalizes to provide a primary language and region.
1699
1720
  *
1700
- * @param doc - Parsed HTML document
1721
+ * @param input - Parsed HTML document or raw HTML string
1701
1722
  * @returns Language metadata
1702
1723
  *
1703
1724
  * @example
1704
1725
  * ```typescript
1726
+ * // With parsed document (recommended for multiple extractions)
1705
1727
  * const doc = parseHTML(htmlString);
1706
1728
  * const lang = extractLanguage(doc);
1707
- * console.log(lang.primary); // 'en'
1708
- * console.log(lang.region); // 'US'
1729
+ *
1730
+ * // Or directly with HTML string
1731
+ * const lang = extractLanguage(htmlString);
1709
1732
  * ```
1710
1733
  */
1711
- declare function extractLanguage(doc: HTMLDocument): LanguageMetadata;
1734
+ declare function extractLanguage(input: DocumentInput): LanguageMetadata;
1712
1735
 
1713
1736
  /**
1714
1737
  * Links extraction types.
@@ -1855,7 +1878,7 @@ interface LinksMetadata {
1855
1878
  */
1856
1879
 
1857
1880
  /**
1858
- * Extract links from parsed HTML document.
1881
+ * Extract links from HTML.
1859
1882
  *
1860
1883
  * @remarks
1861
1884
  * Extracts all `<a href>` links with comprehensive metadata and filtering options.
@@ -1870,21 +1893,25 @@ interface LinksMetadata {
1870
1893
  * - Deduplication
1871
1894
  * - Link text extraction
1872
1895
  *
1873
- * @param doc - Parsed HTML document
1896
+ * @param input - Parsed HTML document or raw HTML string
1874
1897
  * @param baseUrl - Base URL for resolving relative links and determining internal/external
1875
1898
  * @param options - Extraction options for filtering and categorization
1876
1899
  * @returns Links metadata with categorized links
1877
1900
  *
1878
1901
  * @example
1879
1902
  * ```typescript
1903
+ * // With parsed document (recommended for multiple extractions)
1880
1904
  * const doc = parseHTML(htmlString);
1881
1905
  * const links = extractLinks(doc, 'https://example.com');
1882
1906
  *
1907
+ * // Or directly with HTML string
1908
+ * const links = extractLinks(htmlString, 'https://example.com');
1909
+ *
1883
1910
  * // Get all internal links (same origin)
1884
1911
  * console.log(links.internal);
1885
1912
  *
1886
1913
  * // Get external links excluding nofollow
1887
- * const linksNoFollow = extractLinks(doc, 'https://example.com', {
1914
+ * const linksNoFollow = extractLinks(htmlString, 'https://example.com', {
1888
1915
  * scope: 'external',
1889
1916
  * excludeRel: ['nofollow']
1890
1917
  * });
@@ -1893,13 +1920,13 @@ interface LinksMetadata {
1893
1920
  * @example
1894
1921
  * ```typescript
1895
1922
  * // Crawler use case - get follow-able links
1896
- * const links = extractLinks(doc, baseUrl, {
1923
+ * const links = extractLinks(html, baseUrl, {
1897
1924
  * excludeRel: ['nofollow', 'ugc', 'sponsored'],
1898
1925
  * includeHashLinks: false
1899
1926
  * });
1900
1927
  * ```
1901
1928
  */
1902
- declare function extractLinks(doc: HTMLDocument, baseUrl?: string | URL | null, options?: LinksExtractionOptions): LinksMetadata;
1929
+ declare function extractLinks(input: DocumentInput, baseUrl?: string | URL | null, options?: LinksExtractionOptions): LinksMetadata;
1903
1930
 
1904
1931
  /**
1905
1932
  * Monetization and payment types.
@@ -1940,24 +1967,26 @@ interface MonetizationMetadata {
1940
1967
  */
1941
1968
 
1942
1969
  /**
1943
- * Extract monetization metadata from parsed HTML document.
1970
+ * Extract monetization metadata from HTML.
1944
1971
  *
1945
1972
  * @remarks
1946
1973
  * Extracts web monetization, payment verification, and cryptocurrency
1947
1974
  * addresses from meta tags and link tags.
1948
1975
  *
1949
- * @param doc - Parsed HTML document
1976
+ * @param input - Parsed HTML document or raw HTML string
1950
1977
  * @returns Monetization metadata
1951
1978
  *
1952
1979
  * @example
1953
1980
  * ```typescript
1981
+ * // With parsed document (recommended for multiple extractions)
1954
1982
  * const doc = parseHTML(htmlString);
1955
1983
  * const monetization = extractMonetization(doc);
1956
- * console.log(monetization.webMonetization);
1957
- * console.log(monetization.bitcoin);
1984
+ *
1985
+ * // Or directly with HTML string
1986
+ * const monetization = extractMonetization(htmlString);
1958
1987
  * ```
1959
1988
  */
1960
- declare function extractMonetization(doc: HTMLDocument): MonetizationMetadata;
1989
+ declare function extractMonetization(input: DocumentInput): MonetizationMetadata;
1961
1990
 
1962
1991
  /**
1963
1992
  * News and press types.
@@ -1994,24 +2023,26 @@ interface NewsMetadata {
1994
2023
  */
1995
2024
 
1996
2025
  /**
1997
- * Extract news metadata from parsed HTML document.
2026
+ * Extract news metadata from HTML.
1998
2027
  *
1999
2028
  * @remarks
2000
2029
  * Extracts news-specific metadata including keywords, standout tags,
2001
2030
  * and syndication information.
2002
2031
  *
2003
- * @param doc - Parsed HTML document
2032
+ * @param input - Parsed HTML document or raw HTML string
2004
2033
  * @returns News metadata
2005
2034
  *
2006
2035
  * @example
2007
2036
  * ```typescript
2037
+ * // With parsed document (recommended for multiple extractions)
2008
2038
  * const doc = parseHTML(htmlString);
2009
2039
  * const news = extractNews(doc);
2010
- * console.log(news.keywords);
2011
- * console.log(news.standout);
2040
+ *
2041
+ * // Or directly with HTML string
2042
+ * const news = extractNews(htmlString);
2012
2043
  * ```
2013
2044
  */
2014
- declare function extractNews(doc: HTMLDocument): NewsMetadata;
2045
+ declare function extractNews(input: DocumentInput): NewsMetadata;
2015
2046
 
2016
2047
  /**
2017
2048
  * OpenGraph metadata types.
@@ -2179,7 +2210,7 @@ interface OpenGraphMetadata {
2179
2210
  * console.log(og.article?.publishedTime);
2180
2211
  * ```
2181
2212
  */
2182
- declare function extractOpenGraph(doc: HTMLDocument): OpenGraphMetadata;
2213
+ declare function extractOpenGraph(input: DocumentInput): OpenGraphMetadata;
2183
2214
 
2184
2215
  /**
2185
2216
  * Pagination metadata types.
@@ -2220,24 +2251,26 @@ interface PaginationMetadata {
2220
2251
  */
2221
2252
 
2222
2253
  /**
2223
- * Extract pagination metadata from parsed HTML document.
2254
+ * Extract pagination metadata from HTML.
2224
2255
  *
2225
2256
  * @remarks
2226
2257
  * Extracts pagination navigation links including prev, next, first, last,
2227
2258
  * up (parent), and index links.
2228
2259
  *
2229
- * @param doc - Parsed HTML document
2260
+ * @param input - Parsed HTML document or raw HTML string
2230
2261
  * @returns Pagination metadata
2231
2262
  *
2232
2263
  * @example
2233
2264
  * ```typescript
2265
+ * // With parsed document (recommended for multiple extractions)
2234
2266
  * const doc = parseHTML(htmlString);
2235
2267
  * const pagination = extractPagination(doc);
2236
- * console.log(pagination.prev); // Previous page URL
2237
- * console.log(pagination.next); // Next page URL
2268
+ *
2269
+ * // Or directly with HTML string
2270
+ * const pagination = extractPagination(htmlString);
2238
2271
  * ```
2239
2272
  */
2240
- declare function extractPagination(doc: HTMLDocument): PaginationMetadata;
2273
+ declare function extractPagination(input: DocumentInput): PaginationMetadata;
2241
2274
 
2242
2275
  /**
2243
2276
  * Robots and crawling directives types.
@@ -2299,24 +2332,26 @@ interface RobotsMetadata {
2299
2332
  */
2300
2333
 
2301
2334
  /**
2302
- * Extract robots metadata from parsed HTML document.
2335
+ * Extract robots metadata from HTML.
2303
2336
  *
2304
2337
  * @remarks
2305
2338
  * Extracts robot directives from meta tags for general robots,
2306
2339
  * Googlebot, Bingbot, and Google News bot.
2307
2340
  *
2308
- * @param doc - Parsed HTML document
2341
+ * @param input - Parsed HTML document or raw HTML string
2309
2342
  * @returns Robots metadata
2310
2343
  *
2311
2344
  * @example
2312
2345
  * ```typescript
2346
+ * // With parsed document (recommended for multiple extractions)
2313
2347
  * const doc = parseHTML(htmlString);
2314
2348
  * const robots = extractRobots(doc);
2315
- * console.log(robots.robots?.index); // true/false
2316
- * console.log(robots.robots?.follow); // true/false
2349
+ *
2350
+ * // Or directly with HTML string
2351
+ * const robots = extractRobots(htmlString);
2317
2352
  * ```
2318
2353
  */
2319
- declare function extractRobots(doc: HTMLDocument): RobotsMetadata;
2354
+ declare function extractRobots(input: DocumentInput): RobotsMetadata;
2320
2355
 
2321
2356
  /**
2322
2357
  * Schema.org / JSON-LD metadata types.
@@ -2381,24 +2416,26 @@ interface SchemaOrgMetadata {
2381
2416
  */
2382
2417
 
2383
2418
  /**
2384
- * Extract Schema.org metadata from parsed HTML document.
2419
+ * Extract Schema.org metadata from HTML.
2385
2420
  *
2386
2421
  * @remarks
2387
2422
  * Finds all <script type="application/ld+json"> tags, parses the JSON-LD,
2388
2423
  * and organizes by type for easy access.
2389
2424
  *
2390
- * @param doc - Parsed HTML document
2425
+ * @param input - Parsed HTML document or raw HTML string
2391
2426
  * @returns Schema.org metadata object
2392
2427
  *
2393
2428
  * @example
2394
2429
  * ```typescript
2430
+ * // With parsed document (recommended for multiple extractions)
2395
2431
  * const doc = parseHTML(htmlString);
2396
2432
  * const schema = extractSchemaOrg(doc);
2397
- * console.log(schema.jsonLd.length);
2398
- * console.log(schema.articles);
2433
+ *
2434
+ * // Or directly with HTML string
2435
+ * const schema = extractSchemaOrg(htmlString);
2399
2436
  * ```
2400
2437
  */
2401
- declare function extractSchemaOrg(doc: HTMLDocument): SchemaOrgMetadata;
2438
+ declare function extractSchemaOrg(input: DocumentInput): SchemaOrgMetadata;
2402
2439
 
2403
2440
  /**
2404
2441
  * Security and privacy types.
@@ -2435,24 +2472,26 @@ interface SecurityMetadata {
2435
2472
  */
2436
2473
 
2437
2474
  /**
2438
- * Extract security metadata from parsed HTML document.
2475
+ * Extract security metadata from HTML.
2439
2476
  *
2440
2477
  * @remarks
2441
2478
  * Extracts security and privacy-related meta tags including referrer policy,
2442
2479
  * content security policy, and browser compatibility directives.
2443
2480
  *
2444
- * @param doc - Parsed HTML document
2481
+ * @param input - Parsed HTML document or raw HTML string
2445
2482
  * @returns Security metadata
2446
2483
  *
2447
2484
  * @example
2448
2485
  * ```typescript
2486
+ * // With parsed document (recommended for multiple extractions)
2449
2487
  * const doc = parseHTML(htmlString);
2450
2488
  * const security = extractSecurity(doc);
2451
- * console.log(security.referrerPolicy);
2452
- * console.log(security.contentSecurityPolicy);
2489
+ *
2490
+ * // Or directly with HTML string
2491
+ * const security = extractSecurity(htmlString);
2453
2492
  * ```
2454
2493
  */
2455
- declare function extractSecurity(doc: HTMLDocument): SecurityMetadata;
2494
+ declare function extractSecurity(input: DocumentInput): SecurityMetadata;
2456
2495
 
2457
2496
  /**
2458
2497
  * SEO metadata types.
@@ -2506,24 +2545,26 @@ interface SEOMetadata {
2506
2545
  */
2507
2546
 
2508
2547
  /**
2509
- * Extract SEO metadata from parsed HTML document.
2548
+ * Extract SEO metadata from HTML.
2510
2549
  *
2511
2550
  * @remarks
2512
2551
  * Extracts standard SEO meta tags including title, description, keywords,
2513
2552
  * and browser-specific configuration. All fields are optional.
2514
2553
  *
2515
- * @param doc - Parsed HTML document
2554
+ * @param input - Parsed HTML document or raw HTML string
2516
2555
  * @returns SEO metadata object
2517
2556
  *
2518
2557
  * @example
2519
2558
  * ```typescript
2559
+ * // With parsed document (recommended for multiple extractions)
2520
2560
  * const doc = parseHTML(htmlString);
2521
2561
  * const seo = extractSEO(doc);
2522
- * console.log(seo.title); // Page title
2523
- * console.log(seo.description); // Meta description
2562
+ *
2563
+ * // Or directly with HTML string
2564
+ * const seo = extractSEO(htmlString);
2524
2565
  * ```
2525
2566
  */
2526
- declare function extractSEO(doc: HTMLDocument): SEOMetadata;
2567
+ declare function extractSEO(input: DocumentInput): SEOMetadata;
2527
2568
 
2528
2569
  /**
2529
2570
  * Sitemap discovery types.
@@ -2556,25 +2597,27 @@ interface SitemapDiscoveryMetadata {
2556
2597
  */
2557
2598
 
2558
2599
  /**
2559
- * Extract sitemap discovery metadata from parsed HTML document.
2600
+ * Extract sitemap discovery metadata from HTML.
2560
2601
  *
2561
2602
  * @remarks
2562
2603
  * Finds all sitemaps declared in <link rel="sitemap"> tags and generates
2563
2604
  * suggestions for common sitemap URL patterns.
2564
2605
  *
2565
- * @param doc - Parsed HTML document
2606
+ * @param input - Parsed HTML document or raw HTML string
2566
2607
  * @param documentUrl - Optional document URL for generating absolute sitemap suggestions
2567
2608
  * @returns Sitemap discovery metadata
2568
2609
  *
2569
2610
  * @example
2570
2611
  * ```typescript
2612
+ * // With parsed document (recommended for multiple extractions)
2571
2613
  * const doc = parseHTML(htmlString);
2572
2614
  * const sitemaps = extractSitemapDiscovery(doc, 'https://example.com');
2573
- * console.log(sitemaps.sitemaps); // Discovered sitemaps
2574
- * console.log(sitemaps.suggestions); // Suggested sitemap URLs
2615
+ *
2616
+ * // Or directly with HTML string
2617
+ * const sitemaps = extractSitemapDiscovery(htmlString, 'https://example.com');
2575
2618
  * ```
2576
2619
  */
2577
- declare function extractSitemapDiscovery(doc: HTMLDocument, documentUrl?: string | URL): SitemapDiscoveryMetadata;
2620
+ declare function extractSitemapDiscovery(input: DocumentInput, documentUrl?: string | URL): SitemapDiscoveryMetadata;
2578
2621
 
2579
2622
  /**
2580
2623
  * Social profiles types.
@@ -2625,23 +2668,25 @@ interface SocialProfilesMetadata {
2625
2668
  */
2626
2669
 
2627
2670
  /**
2628
- * Extract social profiles metadata from parsed HTML document.
2671
+ * Extract social profiles metadata from HTML.
2629
2672
  *
2630
2673
  * @remarks
2631
2674
  * Extracts social media profile URLs and handles from meta tags and structured data.
2632
2675
  *
2633
- * @param doc - Parsed HTML document
2676
+ * @param input - Parsed HTML document or raw HTML string
2634
2677
  * @returns Social profiles metadata
2635
2678
  *
2636
2679
  * @example
2637
2680
  * ```typescript
2681
+ * // With parsed document (recommended for multiple extractions)
2638
2682
  * const doc = parseHTML(htmlString);
2639
2683
  * const profiles = extractSocialProfiles(doc);
2640
- * console.log(profiles.twitter);
2641
- * console.log(profiles.facebook);
2684
+ *
2685
+ * // Or directly with HTML string
2686
+ * const profiles = extractSocialProfiles(htmlString);
2642
2687
  * ```
2643
2688
  */
2644
- declare function extractSocialProfiles(doc: HTMLDocument): SocialProfilesMetadata;
2689
+ declare function extractSocialProfiles(input: DocumentInput): SocialProfilesMetadata;
2645
2690
 
2646
2691
  /**
2647
2692
  * Twitter Card metadata types.
@@ -2724,24 +2769,26 @@ interface TwitterCardMetadata {
2724
2769
  */
2725
2770
 
2726
2771
  /**
2727
- * Extract Twitter Card metadata from parsed HTML document.
2772
+ * Extract Twitter Card metadata from HTML.
2728
2773
  *
2729
2774
  * @remarks
2730
2775
  * Extracts Twitter Card metadata including card type, site/creator info,
2731
2776
  * title/description, images, app cards, and player cards.
2732
2777
  *
2733
- * @param doc - Parsed HTML document
2778
+ * @param input - Parsed HTML document or raw HTML string
2734
2779
  * @returns Twitter Card metadata object
2735
2780
  *
2736
2781
  * @example
2737
2782
  * ```typescript
2783
+ * // With parsed document (recommended for multiple extractions)
2738
2784
  * const doc = parseHTML(htmlString);
2739
2785
  * const twitter = extractTwitterCard(doc);
2740
- * console.log(twitter.card);
2741
- * console.log(twitter.title);
2786
+ *
2787
+ * // Or directly with HTML string
2788
+ * const twitter = extractTwitterCard(htmlString);
2742
2789
  * ```
2743
2790
  */
2744
- declare function extractTwitterCard(doc: HTMLDocument): TwitterCardMetadata;
2791
+ declare function extractTwitterCard(input: DocumentInput): TwitterCardMetadata;
2745
2792
 
2746
2793
  /**
2747
2794
  * Verification tags types.
@@ -2786,23 +2833,25 @@ interface VerificationMetadata {
2786
2833
  */
2787
2834
 
2788
2835
  /**
2789
- * Extract verification metadata from parsed HTML document.
2836
+ * Extract verification metadata from HTML.
2790
2837
  *
2791
2838
  * @remarks
2792
2839
  * Extracts verification tags used by various platforms for domain and ownership verification.
2793
2840
  *
2794
- * @param doc - Parsed HTML document
2841
+ * @param input - Parsed HTML document or raw HTML string
2795
2842
  * @returns Verification metadata
2796
2843
  *
2797
2844
  * @example
2798
2845
  * ```typescript
2846
+ * // With parsed document (recommended for multiple extractions)
2799
2847
  * const doc = parseHTML(htmlString);
2800
2848
  * const verification = extractVerification(doc);
2801
- * console.log(verification.googleSiteVerification);
2802
- * console.log(verification.facebookDomainVerification);
2849
+ *
2850
+ * // Or directly with HTML string
2851
+ * const verification = extractVerification(htmlString);
2803
2852
  * ```
2804
2853
  */
2805
- declare function extractVerification(doc: HTMLDocument): VerificationMetadata;
2854
+ declare function extractVerification(input: DocumentInput): VerificationMetadata;
2806
2855
 
2807
2856
  /**
2808
2857
  * Enhanced fetch types for web scraping.
@@ -3253,4 +3302,4 @@ interface SwoopResult {
3253
3302
  */
3254
3303
  declare function swoop(url: string | URL, init?: SwoopInit): Promise<SwoopResult>;
3255
3304
 
3256
- export { type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon, type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint, type ContentExtractionOptions, type ContentQuality, type ContentResult, type CopyrightMetadata, type DiscoveredFeed, type DublinCoreMetadata, type ExtractedContent, type ExtractedLink, type ExtractionErrorType, type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata, type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata, type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata, type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions, type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata, type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook, type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile, type OpenGraphVideo, type PaginationMetadata, type ParseResult, PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, type PluckInit, PluckNetworkError, PluckRedirectError, type PluckResponse, PluckSizeError, PluckTimeoutError, type PreloadResource, type RobotDirectives, type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata, type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata, SwoopEnvironmentError, SwoopError, SwoopExecutionError, type SwoopInit, type SwoopResult, SwoopSecurityError, SwoopTimeoutError, type SwoopWaitStrategy, type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata, type TwitterPlayer, type VerificationMetadata, type Website, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, 
extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
3305
+ export { type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon, type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint, type ContentExtractionOptions, type ContentQuality, type ContentResult, type CopyrightMetadata, type DiscoveredFeed, type DocumentInput, type DublinCoreMetadata, type ExtractedContent, type ExtractedLink, type ExtractionErrorType, type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata, type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata, type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata, type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions, type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata, type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook, type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile, type OpenGraphVideo, type PaginationMetadata, type ParseResult, PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, type PluckInit, PluckNetworkError, PluckRedirectError, type PluckResponse, PluckSizeError, PluckTimeoutError, type PreloadResource, type RobotDirectives, type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata, type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata, SwoopEnvironmentError, SwoopError, SwoopExecutionError, type SwoopInit, type SwoopResult, SwoopSecurityError, SwoopTimeoutError, type SwoopWaitStrategy, type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata, type TwitterPlayer, type VerificationMetadata, type Website, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, 
extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };