magpie-html 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +50 -22
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +156 -108
- package/dist/index.d.ts +156 -108
- package/dist/index.js +50 -22
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -1043,6 +1043,11 @@ declare function gatherWebsite(url: string | URL): Promise<Website>;
|
|
|
1043
1043
|
*/
|
|
1044
1044
|
declare function parseHTML(html: string, baseUrl?: string): Document;
|
|
1045
1045
|
type HTMLDocument = Document;
|
|
1046
|
+
/**
|
|
1047
|
+
* Input type that accepts either a parsed Document or raw HTML string.
|
|
1048
|
+
* This allows extractor functions to be more forgiving.
|
|
1049
|
+
*/
|
|
1050
|
+
type DocumentInput = Document | string;
|
|
1046
1051
|
|
|
1047
1052
|
/**
|
|
1048
1053
|
* Analytics and tracking types.
|
|
@@ -1089,24 +1094,26 @@ interface AnalyticsMetadata {
|
|
|
1089
1094
|
*/
|
|
1090
1095
|
|
|
1091
1096
|
/**
|
|
1092
|
-
* Extract analytics metadata from
|
|
1097
|
+
* Extract analytics metadata from HTML.
|
|
1093
1098
|
*
|
|
1094
1099
|
* @remarks
|
|
1095
1100
|
* Detects analytics service IDs by examining script tags and their content.
|
|
1096
1101
|
* Only extracts identifiers, does not track or collect user data.
|
|
1097
1102
|
*
|
|
1098
|
-
* @param
|
|
1103
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1099
1104
|
* @returns Analytics metadata
|
|
1100
1105
|
*
|
|
1101
1106
|
* @example
|
|
1102
1107
|
* ```typescript
|
|
1108
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1103
1109
|
* const doc = parseHTML(htmlString);
|
|
1104
1110
|
* const analytics = extractAnalytics(doc);
|
|
1105
|
-
*
|
|
1106
|
-
*
|
|
1111
|
+
*
|
|
1112
|
+
* // Or directly with HTML string
|
|
1113
|
+
* const analytics = extractAnalytics(htmlString);
|
|
1107
1114
|
* ```
|
|
1108
1115
|
*/
|
|
1109
|
-
declare function extractAnalytics(
|
|
1116
|
+
declare function extractAnalytics(input: DocumentInput): AnalyticsMetadata;
|
|
1110
1117
|
|
|
1111
1118
|
/**
|
|
1112
1119
|
* Assets extraction types.
|
|
@@ -1190,7 +1197,7 @@ interface ConnectionHint {
|
|
|
1190
1197
|
*/
|
|
1191
1198
|
|
|
1192
1199
|
/**
|
|
1193
|
-
* Extract assets metadata from
|
|
1200
|
+
* Extract assets metadata from HTML.
|
|
1194
1201
|
*
|
|
1195
1202
|
* @remarks
|
|
1196
1203
|
* Extracts all external assets referenced in the document, organized by type.
|
|
@@ -1206,20 +1213,21 @@ interface ConnectionHint {
|
|
|
1206
1213
|
* - Preloads: `<link rel="preload">` and `<link rel="prefetch">`
|
|
1207
1214
|
* - Connection hints: `<link rel="dns-prefetch">` and `<link rel="preconnect">`
|
|
1208
1215
|
*
|
|
1209
|
-
* @param
|
|
1216
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1210
1217
|
* @param baseUrl - Optional base URL for resolving relative URLs
|
|
1211
1218
|
* @returns Assets metadata object with categorized URLs
|
|
1212
1219
|
*
|
|
1213
1220
|
* @example
|
|
1214
1221
|
* ```typescript
|
|
1222
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1215
1223
|
* const doc = parseHTML(htmlString);
|
|
1216
1224
|
* const assets = extractAssets(doc, 'https://example.com');
|
|
1217
|
-
*
|
|
1218
|
-
*
|
|
1219
|
-
*
|
|
1225
|
+
*
|
|
1226
|
+
* // Or directly with HTML string
|
|
1227
|
+
* const assets = extractAssets(htmlString, 'https://example.com');
|
|
1220
1228
|
* ```
|
|
1221
1229
|
*/
|
|
1222
|
-
declare function extractAssets(
|
|
1230
|
+
declare function extractAssets(input: DocumentInput, baseUrl?: string | URL | null): AssetsMetadata;
|
|
1223
1231
|
|
|
1224
1232
|
/**
|
|
1225
1233
|
* Canonical and alternate URL metadata types.
|
|
@@ -1283,24 +1291,26 @@ interface CanonicalMetadata {
|
|
|
1283
1291
|
*/
|
|
1284
1292
|
|
|
1285
1293
|
/**
|
|
1286
|
-
* Extract canonical and alternate URL metadata from
|
|
1294
|
+
* Extract canonical and alternate URL metadata from HTML.
|
|
1287
1295
|
*
|
|
1288
1296
|
* @remarks
|
|
1289
1297
|
* Extracts canonical URLs, language alternates, AMP versions, manifests,
|
|
1290
1298
|
* and app linking metadata.
|
|
1291
1299
|
*
|
|
1292
|
-
* @param
|
|
1300
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1293
1301
|
* @returns Canonical metadata object
|
|
1294
1302
|
*
|
|
1295
1303
|
* @example
|
|
1296
1304
|
* ```typescript
|
|
1305
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1297
1306
|
* const doc = parseHTML(htmlString);
|
|
1298
1307
|
* const canonical = extractCanonical(doc);
|
|
1299
|
-
*
|
|
1300
|
-
*
|
|
1308
|
+
*
|
|
1309
|
+
* // Or directly with HTML string
|
|
1310
|
+
* const canonical = extractCanonical(htmlString);
|
|
1301
1311
|
* ```
|
|
1302
1312
|
*/
|
|
1303
|
-
declare function extractCanonical(
|
|
1313
|
+
declare function extractCanonical(input: DocumentInput): CanonicalMetadata;
|
|
1304
1314
|
|
|
1305
1315
|
/**
|
|
1306
1316
|
* Copyright and licensing types.
|
|
@@ -1337,24 +1347,26 @@ interface CopyrightMetadata {
|
|
|
1337
1347
|
*/
|
|
1338
1348
|
|
|
1339
1349
|
/**
|
|
1340
|
-
* Extract copyright metadata from
|
|
1350
|
+
* Extract copyright metadata from HTML.
|
|
1341
1351
|
*
|
|
1342
1352
|
* @remarks
|
|
1343
1353
|
* Extracts copyright and licensing information from meta tags, link tags,
|
|
1344
1354
|
* and Schema.org structured data.
|
|
1345
1355
|
*
|
|
1346
|
-
* @param
|
|
1356
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1347
1357
|
* @returns Copyright metadata
|
|
1348
1358
|
*
|
|
1349
1359
|
* @example
|
|
1350
1360
|
* ```typescript
|
|
1361
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1351
1362
|
* const doc = parseHTML(htmlString);
|
|
1352
1363
|
* const copyright = extractCopyright(doc);
|
|
1353
|
-
*
|
|
1354
|
-
*
|
|
1364
|
+
*
|
|
1365
|
+
* // Or directly with HTML string
|
|
1366
|
+
* const copyright = extractCopyright(htmlString);
|
|
1355
1367
|
* ```
|
|
1356
1368
|
*/
|
|
1357
|
-
declare function extractCopyright(
|
|
1369
|
+
declare function extractCopyright(input: DocumentInput): CopyrightMetadata;
|
|
1358
1370
|
|
|
1359
1371
|
/**
|
|
1360
1372
|
* Dublin Core metadata types.
|
|
@@ -1414,25 +1426,27 @@ interface DublinCoreMetadata {
|
|
|
1414
1426
|
*/
|
|
1415
1427
|
|
|
1416
1428
|
/**
|
|
1417
|
-
* Extract Dublin Core metadata from
|
|
1429
|
+
* Extract Dublin Core metadata from HTML.
|
|
1418
1430
|
*
|
|
1419
1431
|
* @remarks
|
|
1420
1432
|
* Extracts Dublin Core metadata using both DC. and dcterms. prefixes.
|
|
1421
1433
|
* Fields that can have multiple values (creator, subject, contributor)
|
|
1422
1434
|
* are extracted as arrays.
|
|
1423
1435
|
*
|
|
1424
|
-
* @param
|
|
1436
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1425
1437
|
* @returns Dublin Core metadata object
|
|
1426
1438
|
*
|
|
1427
1439
|
* @example
|
|
1428
1440
|
* ```typescript
|
|
1441
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1429
1442
|
* const doc = parseHTML(htmlString);
|
|
1430
1443
|
* const dc = extractDublinCore(doc);
|
|
1431
|
-
*
|
|
1432
|
-
*
|
|
1444
|
+
*
|
|
1445
|
+
* // Or directly with HTML string
|
|
1446
|
+
* const dc = extractDublinCore(htmlString);
|
|
1433
1447
|
* ```
|
|
1434
1448
|
*/
|
|
1435
|
-
declare function extractDublinCore(
|
|
1449
|
+
declare function extractDublinCore(input: DocumentInput): DublinCoreMetadata;
|
|
1436
1450
|
|
|
1437
1451
|
/**
|
|
1438
1452
|
* Feed discovery types.
|
|
@@ -1476,25 +1490,27 @@ interface FeedDiscoveryMetadata {
|
|
|
1476
1490
|
*/
|
|
1477
1491
|
|
|
1478
1492
|
/**
|
|
1479
|
-
* Extract feed discovery metadata from
|
|
1493
|
+
* Extract feed discovery metadata from HTML.
|
|
1480
1494
|
*
|
|
1481
1495
|
* @remarks
|
|
1482
1496
|
* Finds all feeds declared in <link rel="alternate"> tags and generates
|
|
1483
1497
|
* suggestions for common feed URL patterns.
|
|
1484
1498
|
*
|
|
1485
|
-
* @param
|
|
1499
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1486
1500
|
* @param documentUrl - Optional document URL for generating absolute feed suggestions
|
|
1487
1501
|
* @returns Feed discovery metadata
|
|
1488
1502
|
*
|
|
1489
1503
|
* @example
|
|
1490
1504
|
* ```typescript
|
|
1505
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1491
1506
|
* const doc = parseHTML(htmlString);
|
|
1492
1507
|
* const feeds = extractFeedDiscovery(doc, 'https://example.com');
|
|
1493
|
-
*
|
|
1494
|
-
*
|
|
1508
|
+
*
|
|
1509
|
+
* // Or directly with HTML string
|
|
1510
|
+
* const feeds = extractFeedDiscovery(htmlString, 'https://example.com');
|
|
1495
1511
|
* ```
|
|
1496
1512
|
*/
|
|
1497
|
-
declare function extractFeedDiscovery(
|
|
1513
|
+
declare function extractFeedDiscovery(input: DocumentInput, documentUrl?: string | URL): FeedDiscoveryMetadata;
|
|
1498
1514
|
|
|
1499
1515
|
/**
|
|
1500
1516
|
* Geographic location types.
|
|
@@ -1543,24 +1559,26 @@ interface GeoMetadata {
|
|
|
1543
1559
|
*/
|
|
1544
1560
|
|
|
1545
1561
|
/**
|
|
1546
|
-
* Extract geographic metadata from
|
|
1562
|
+
* Extract geographic metadata from HTML.
|
|
1547
1563
|
*
|
|
1548
1564
|
* @remarks
|
|
1549
1565
|
* Extracts geographic location information including coordinates,
|
|
1550
1566
|
* place names, and region codes from meta tags.
|
|
1551
1567
|
*
|
|
1552
|
-
* @param
|
|
1568
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1553
1569
|
* @returns Geographic metadata
|
|
1554
1570
|
*
|
|
1555
1571
|
* @example
|
|
1556
1572
|
* ```typescript
|
|
1573
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1557
1574
|
* const doc = parseHTML(htmlString);
|
|
1558
1575
|
* const geo = extractGeo(doc);
|
|
1559
|
-
*
|
|
1560
|
-
*
|
|
1576
|
+
*
|
|
1577
|
+
* // Or directly with HTML string
|
|
1578
|
+
* const geo = extractGeo(htmlString);
|
|
1561
1579
|
* ```
|
|
1562
1580
|
*/
|
|
1563
|
-
declare function extractGeo(
|
|
1581
|
+
declare function extractGeo(input: DocumentInput): GeoMetadata;
|
|
1564
1582
|
|
|
1565
1583
|
/**
|
|
1566
1584
|
* Icons and visual assets types.
|
|
@@ -1633,24 +1651,26 @@ interface IconsMetadata {
|
|
|
1633
1651
|
*/
|
|
1634
1652
|
|
|
1635
1653
|
/**
|
|
1636
|
-
* Extract icons metadata from
|
|
1654
|
+
* Extract icons metadata from HTML.
|
|
1637
1655
|
*
|
|
1638
1656
|
* @remarks
|
|
1639
1657
|
* Extracts all icon-related metadata including favicons, Apple touch icons,
|
|
1640
1658
|
* Safari mask icons, and Microsoft tile configuration.
|
|
1641
1659
|
*
|
|
1642
|
-
* @param
|
|
1660
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1643
1661
|
* @returns Icons metadata
|
|
1644
1662
|
*
|
|
1645
1663
|
* @example
|
|
1646
1664
|
* ```typescript
|
|
1665
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1647
1666
|
* const doc = parseHTML(htmlString);
|
|
1648
1667
|
* const icons = extractIcons(doc);
|
|
1649
|
-
*
|
|
1650
|
-
*
|
|
1668
|
+
*
|
|
1669
|
+
* // Or directly with HTML string
|
|
1670
|
+
* const icons = extractIcons(htmlString);
|
|
1651
1671
|
* ```
|
|
1652
1672
|
*/
|
|
1653
|
-
declare function extractIcons(
|
|
1673
|
+
declare function extractIcons(input: DocumentInput): IconsMetadata;
|
|
1654
1674
|
|
|
1655
1675
|
/**
|
|
1656
1676
|
* Language and localization types.
|
|
@@ -1692,24 +1712,26 @@ interface LanguageMetadata {
|
|
|
1692
1712
|
*/
|
|
1693
1713
|
|
|
1694
1714
|
/**
|
|
1695
|
-
* Extract language and localization metadata from
|
|
1715
|
+
* Extract language and localization metadata from HTML.
|
|
1696
1716
|
*
|
|
1697
1717
|
* @remarks
|
|
1698
1718
|
* Extracts language information from HTML lang attribute, meta tags,
|
|
1699
1719
|
* and OpenGraph locale. Normalizes to provide a primary language and region.
|
|
1700
1720
|
*
|
|
1701
|
-
* @param
|
|
1721
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1702
1722
|
* @returns Language metadata
|
|
1703
1723
|
*
|
|
1704
1724
|
* @example
|
|
1705
1725
|
* ```typescript
|
|
1726
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1706
1727
|
* const doc = parseHTML(htmlString);
|
|
1707
1728
|
* const lang = extractLanguage(doc);
|
|
1708
|
-
*
|
|
1709
|
-
*
|
|
1729
|
+
*
|
|
1730
|
+
* // Or directly with HTML string
|
|
1731
|
+
* const lang = extractLanguage(htmlString);
|
|
1710
1732
|
* ```
|
|
1711
1733
|
*/
|
|
1712
|
-
declare function extractLanguage(
|
|
1734
|
+
declare function extractLanguage(input: DocumentInput): LanguageMetadata;
|
|
1713
1735
|
|
|
1714
1736
|
/**
|
|
1715
1737
|
* Links extraction types.
|
|
@@ -1856,7 +1878,7 @@ interface LinksMetadata {
|
|
|
1856
1878
|
*/
|
|
1857
1879
|
|
|
1858
1880
|
/**
|
|
1859
|
-
* Extract links from
|
|
1881
|
+
* Extract links from HTML.
|
|
1860
1882
|
*
|
|
1861
1883
|
* @remarks
|
|
1862
1884
|
* Extracts all `<a href>` links with comprehensive metadata and filtering options.
|
|
@@ -1871,21 +1893,25 @@ interface LinksMetadata {
|
|
|
1871
1893
|
* - Deduplication
|
|
1872
1894
|
* - Link text extraction
|
|
1873
1895
|
*
|
|
1874
|
-
* @param
|
|
1896
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1875
1897
|
* @param baseUrl - Base URL for resolving relative links and determining internal/external
|
|
1876
1898
|
* @param options - Extraction options for filtering and categorization
|
|
1877
1899
|
* @returns Links metadata with categorized links
|
|
1878
1900
|
*
|
|
1879
1901
|
* @example
|
|
1880
1902
|
* ```typescript
|
|
1903
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1881
1904
|
* const doc = parseHTML(htmlString);
|
|
1882
1905
|
* const links = extractLinks(doc, 'https://example.com');
|
|
1883
1906
|
*
|
|
1907
|
+
* // Or directly with HTML string
|
|
1908
|
+
* const links = extractLinks(htmlString, 'https://example.com');
|
|
1909
|
+
*
|
|
1884
1910
|
* // Get all internal links (same origin)
|
|
1885
1911
|
* console.log(links.internal);
|
|
1886
1912
|
*
|
|
1887
1913
|
* // Get external links excluding nofollow
|
|
1888
|
-
* const linksNoFollow = extractLinks(
|
|
1914
|
+
* const linksNoFollow = extractLinks(htmlString, 'https://example.com', {
|
|
1889
1915
|
* scope: 'external',
|
|
1890
1916
|
* excludeRel: ['nofollow']
|
|
1891
1917
|
* });
|
|
@@ -1894,13 +1920,13 @@ interface LinksMetadata {
|
|
|
1894
1920
|
* @example
|
|
1895
1921
|
* ```typescript
|
|
1896
1922
|
* // Crawler use case - get follow-able links
|
|
1897
|
-
* const links = extractLinks(
|
|
1923
|
+
* const links = extractLinks(html, baseUrl, {
|
|
1898
1924
|
* excludeRel: ['nofollow', 'ugc', 'sponsored'],
|
|
1899
1925
|
* includeHashLinks: false
|
|
1900
1926
|
* });
|
|
1901
1927
|
* ```
|
|
1902
1928
|
*/
|
|
1903
|
-
declare function extractLinks(
|
|
1929
|
+
declare function extractLinks(input: DocumentInput, baseUrl?: string | URL | null, options?: LinksExtractionOptions): LinksMetadata;
|
|
1904
1930
|
|
|
1905
1931
|
/**
|
|
1906
1932
|
* Monetization and payment types.
|
|
@@ -1941,24 +1967,26 @@ interface MonetizationMetadata {
|
|
|
1941
1967
|
*/
|
|
1942
1968
|
|
|
1943
1969
|
/**
|
|
1944
|
-
* Extract monetization metadata from
|
|
1970
|
+
* Extract monetization metadata from HTML.
|
|
1945
1971
|
*
|
|
1946
1972
|
* @remarks
|
|
1947
1973
|
* Extracts web monetization, payment verification, and cryptocurrency
|
|
1948
1974
|
* addresses from meta tags and link tags.
|
|
1949
1975
|
*
|
|
1950
|
-
* @param
|
|
1976
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
1951
1977
|
* @returns Monetization metadata
|
|
1952
1978
|
*
|
|
1953
1979
|
* @example
|
|
1954
1980
|
* ```typescript
|
|
1981
|
+
* // With parsed document (recommended for multiple extractions)
|
|
1955
1982
|
* const doc = parseHTML(htmlString);
|
|
1956
1983
|
* const monetization = extractMonetization(doc);
|
|
1957
|
-
*
|
|
1958
|
-
*
|
|
1984
|
+
*
|
|
1985
|
+
* // Or directly with HTML string
|
|
1986
|
+
* const monetization = extractMonetization(htmlString);
|
|
1959
1987
|
* ```
|
|
1960
1988
|
*/
|
|
1961
|
-
declare function extractMonetization(
|
|
1989
|
+
declare function extractMonetization(input: DocumentInput): MonetizationMetadata;
|
|
1962
1990
|
|
|
1963
1991
|
/**
|
|
1964
1992
|
* News and press types.
|
|
@@ -1995,24 +2023,26 @@ interface NewsMetadata {
|
|
|
1995
2023
|
*/
|
|
1996
2024
|
|
|
1997
2025
|
/**
|
|
1998
|
-
* Extract news metadata from
|
|
2026
|
+
* Extract news metadata from HTML.
|
|
1999
2027
|
*
|
|
2000
2028
|
* @remarks
|
|
2001
2029
|
* Extracts news-specific metadata including keywords, standout tags,
|
|
2002
2030
|
* and syndication information.
|
|
2003
2031
|
*
|
|
2004
|
-
* @param
|
|
2032
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2005
2033
|
* @returns News metadata
|
|
2006
2034
|
*
|
|
2007
2035
|
* @example
|
|
2008
2036
|
* ```typescript
|
|
2037
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2009
2038
|
* const doc = parseHTML(htmlString);
|
|
2010
2039
|
* const news = extractNews(doc);
|
|
2011
|
-
*
|
|
2012
|
-
*
|
|
2040
|
+
*
|
|
2041
|
+
* // Or directly with HTML string
|
|
2042
|
+
* const news = extractNews(htmlString);
|
|
2013
2043
|
* ```
|
|
2014
2044
|
*/
|
|
2015
|
-
declare function extractNews(
|
|
2045
|
+
declare function extractNews(input: DocumentInput): NewsMetadata;
|
|
2016
2046
|
|
|
2017
2047
|
/**
|
|
2018
2048
|
* OpenGraph metadata types.
|
|
@@ -2180,7 +2210,7 @@ interface OpenGraphMetadata {
|
|
|
2180
2210
|
* console.log(og.article?.publishedTime);
|
|
2181
2211
|
* ```
|
|
2182
2212
|
*/
|
|
2183
|
-
declare function extractOpenGraph(
|
|
2213
|
+
declare function extractOpenGraph(input: DocumentInput): OpenGraphMetadata;
|
|
2184
2214
|
|
|
2185
2215
|
/**
|
|
2186
2216
|
* Pagination metadata types.
|
|
@@ -2221,24 +2251,26 @@ interface PaginationMetadata {
|
|
|
2221
2251
|
*/
|
|
2222
2252
|
|
|
2223
2253
|
/**
|
|
2224
|
-
* Extract pagination metadata from
|
|
2254
|
+
* Extract pagination metadata from HTML.
|
|
2225
2255
|
*
|
|
2226
2256
|
* @remarks
|
|
2227
2257
|
* Extracts pagination navigation links including prev, next, first, last,
|
|
2228
2258
|
* up (parent), and index links.
|
|
2229
2259
|
*
|
|
2230
|
-
* @param
|
|
2260
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2231
2261
|
* @returns Pagination metadata
|
|
2232
2262
|
*
|
|
2233
2263
|
* @example
|
|
2234
2264
|
* ```typescript
|
|
2265
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2235
2266
|
* const doc = parseHTML(htmlString);
|
|
2236
2267
|
* const pagination = extractPagination(doc);
|
|
2237
|
-
*
|
|
2238
|
-
*
|
|
2268
|
+
*
|
|
2269
|
+
* // Or directly with HTML string
|
|
2270
|
+
* const pagination = extractPagination(htmlString);
|
|
2239
2271
|
* ```
|
|
2240
2272
|
*/
|
|
2241
|
-
declare function extractPagination(
|
|
2273
|
+
declare function extractPagination(input: DocumentInput): PaginationMetadata;
|
|
2242
2274
|
|
|
2243
2275
|
/**
|
|
2244
2276
|
* Robots and crawling directives types.
|
|
@@ -2300,24 +2332,26 @@ interface RobotsMetadata {
|
|
|
2300
2332
|
*/
|
|
2301
2333
|
|
|
2302
2334
|
/**
|
|
2303
|
-
* Extract robots metadata from
|
|
2335
|
+
* Extract robots metadata from HTML.
|
|
2304
2336
|
*
|
|
2305
2337
|
* @remarks
|
|
2306
2338
|
* Extracts robot directives from meta tags for general robots,
|
|
2307
2339
|
* Googlebot, Bingbot, and Google News bot.
|
|
2308
2340
|
*
|
|
2309
|
-
* @param
|
|
2341
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2310
2342
|
* @returns Robots metadata
|
|
2311
2343
|
*
|
|
2312
2344
|
* @example
|
|
2313
2345
|
* ```typescript
|
|
2346
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2314
2347
|
* const doc = parseHTML(htmlString);
|
|
2315
2348
|
* const robots = extractRobots(doc);
|
|
2316
|
-
*
|
|
2317
|
-
*
|
|
2349
|
+
*
|
|
2350
|
+
* // Or directly with HTML string
|
|
2351
|
+
* const robots = extractRobots(htmlString);
|
|
2318
2352
|
* ```
|
|
2319
2353
|
*/
|
|
2320
|
-
declare function extractRobots(
|
|
2354
|
+
declare function extractRobots(input: DocumentInput): RobotsMetadata;
|
|
2321
2355
|
|
|
2322
2356
|
/**
|
|
2323
2357
|
* Schema.org / JSON-LD metadata types.
|
|
@@ -2382,24 +2416,26 @@ interface SchemaOrgMetadata {
|
|
|
2382
2416
|
*/
|
|
2383
2417
|
|
|
2384
2418
|
/**
|
|
2385
|
-
* Extract Schema.org metadata from
|
|
2419
|
+
* Extract Schema.org metadata from HTML.
|
|
2386
2420
|
*
|
|
2387
2421
|
* @remarks
|
|
2388
2422
|
* Finds all <script type="application/ld+json"> tags, parses the JSON-LD,
|
|
2389
2423
|
* and organizes by type for easy access.
|
|
2390
2424
|
*
|
|
2391
|
-
* @param
|
|
2425
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2392
2426
|
* @returns Schema.org metadata object
|
|
2393
2427
|
*
|
|
2394
2428
|
* @example
|
|
2395
2429
|
* ```typescript
|
|
2430
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2396
2431
|
* const doc = parseHTML(htmlString);
|
|
2397
2432
|
* const schema = extractSchemaOrg(doc);
|
|
2398
|
-
*
|
|
2399
|
-
*
|
|
2433
|
+
*
|
|
2434
|
+
* // Or directly with HTML string
|
|
2435
|
+
* const schema = extractSchemaOrg(htmlString);
|
|
2400
2436
|
* ```
|
|
2401
2437
|
*/
|
|
2402
|
-
declare function extractSchemaOrg(
|
|
2438
|
+
declare function extractSchemaOrg(input: DocumentInput): SchemaOrgMetadata;
|
|
2403
2439
|
|
|
2404
2440
|
/**
|
|
2405
2441
|
* Security and privacy types.
|
|
@@ -2436,24 +2472,26 @@ interface SecurityMetadata {
|
|
|
2436
2472
|
*/
|
|
2437
2473
|
|
|
2438
2474
|
/**
|
|
2439
|
-
* Extract security metadata from
|
|
2475
|
+
* Extract security metadata from HTML.
|
|
2440
2476
|
*
|
|
2441
2477
|
* @remarks
|
|
2442
2478
|
* Extracts security and privacy-related meta tags including referrer policy,
|
|
2443
2479
|
* content security policy, and browser compatibility directives.
|
|
2444
2480
|
*
|
|
2445
|
-
* @param
|
|
2481
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2446
2482
|
* @returns Security metadata
|
|
2447
2483
|
*
|
|
2448
2484
|
* @example
|
|
2449
2485
|
* ```typescript
|
|
2486
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2450
2487
|
* const doc = parseHTML(htmlString);
|
|
2451
2488
|
* const security = extractSecurity(doc);
|
|
2452
|
-
*
|
|
2453
|
-
*
|
|
2489
|
+
*
|
|
2490
|
+
* // Or directly with HTML string
|
|
2491
|
+
* const security = extractSecurity(htmlString);
|
|
2454
2492
|
* ```
|
|
2455
2493
|
*/
|
|
2456
|
-
declare function extractSecurity(
|
|
2494
|
+
declare function extractSecurity(input: DocumentInput): SecurityMetadata;
|
|
2457
2495
|
|
|
2458
2496
|
/**
|
|
2459
2497
|
* SEO metadata types.
|
|
@@ -2507,24 +2545,26 @@ interface SEOMetadata {
|
|
|
2507
2545
|
*/
|
|
2508
2546
|
|
|
2509
2547
|
/**
|
|
2510
|
-
* Extract SEO metadata from
|
|
2548
|
+
* Extract SEO metadata from HTML.
|
|
2511
2549
|
*
|
|
2512
2550
|
* @remarks
|
|
2513
2551
|
* Extracts standard SEO meta tags including title, description, keywords,
|
|
2514
2552
|
* and browser-specific configuration. All fields are optional.
|
|
2515
2553
|
*
|
|
2516
|
-
* @param
|
|
2554
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2517
2555
|
* @returns SEO metadata object
|
|
2518
2556
|
*
|
|
2519
2557
|
* @example
|
|
2520
2558
|
* ```typescript
|
|
2559
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2521
2560
|
* const doc = parseHTML(htmlString);
|
|
2522
2561
|
* const seo = extractSEO(doc);
|
|
2523
|
-
*
|
|
2524
|
-
*
|
|
2562
|
+
*
|
|
2563
|
+
* // Or directly with HTML string
|
|
2564
|
+
* const seo = extractSEO(htmlString);
|
|
2525
2565
|
* ```
|
|
2526
2566
|
*/
|
|
2527
|
-
declare function extractSEO(
|
|
2567
|
+
declare function extractSEO(input: DocumentInput): SEOMetadata;
|
|
2528
2568
|
|
|
2529
2569
|
/**
|
|
2530
2570
|
* Sitemap discovery types.
|
|
@@ -2557,25 +2597,27 @@ interface SitemapDiscoveryMetadata {
|
|
|
2557
2597
|
*/
|
|
2558
2598
|
|
|
2559
2599
|
/**
|
|
2560
|
-
* Extract sitemap discovery metadata from
|
|
2600
|
+
* Extract sitemap discovery metadata from HTML.
|
|
2561
2601
|
*
|
|
2562
2602
|
* @remarks
|
|
2563
2603
|
* Finds all sitemaps declared in <link rel="sitemap"> tags and generates
|
|
2564
2604
|
* suggestions for common sitemap URL patterns.
|
|
2565
2605
|
*
|
|
2566
|
-
* @param
|
|
2606
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2567
2607
|
* @param documentUrl - Optional document URL for generating absolute sitemap suggestions
|
|
2568
2608
|
* @returns Sitemap discovery metadata
|
|
2569
2609
|
*
|
|
2570
2610
|
* @example
|
|
2571
2611
|
* ```typescript
|
|
2612
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2572
2613
|
* const doc = parseHTML(htmlString);
|
|
2573
2614
|
* const sitemaps = extractSitemapDiscovery(doc, 'https://example.com');
|
|
2574
|
-
*
|
|
2575
|
-
*
|
|
2615
|
+
*
|
|
2616
|
+
* // Or directly with HTML string
|
|
2617
|
+
* const sitemaps = extractSitemapDiscovery(htmlString, 'https://example.com');
|
|
2576
2618
|
* ```
|
|
2577
2619
|
*/
|
|
2578
|
-
declare function extractSitemapDiscovery(
|
|
2620
|
+
declare function extractSitemapDiscovery(input: DocumentInput, documentUrl?: string | URL): SitemapDiscoveryMetadata;
|
|
2579
2621
|
|
|
2580
2622
|
/**
|
|
2581
2623
|
* Social profiles types.
|
|
@@ -2626,23 +2668,25 @@ interface SocialProfilesMetadata {
|
|
|
2626
2668
|
*/
|
|
2627
2669
|
|
|
2628
2670
|
/**
|
|
2629
|
-
* Extract social profiles metadata from
|
|
2671
|
+
* Extract social profiles metadata from HTML.
|
|
2630
2672
|
*
|
|
2631
2673
|
* @remarks
|
|
2632
2674
|
* Extracts social media profile URLs and handles from meta tags and structured data.
|
|
2633
2675
|
*
|
|
2634
|
-
* @param
|
|
2676
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2635
2677
|
* @returns Social profiles metadata
|
|
2636
2678
|
*
|
|
2637
2679
|
* @example
|
|
2638
2680
|
* ```typescript
|
|
2681
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2639
2682
|
* const doc = parseHTML(htmlString);
|
|
2640
2683
|
* const profiles = extractSocialProfiles(doc);
|
|
2641
|
-
*
|
|
2642
|
-
*
|
|
2684
|
+
*
|
|
2685
|
+
* // Or directly with HTML string
|
|
2686
|
+
* const profiles = extractSocialProfiles(htmlString);
|
|
2643
2687
|
* ```
|
|
2644
2688
|
*/
|
|
2645
|
-
declare function extractSocialProfiles(
|
|
2689
|
+
declare function extractSocialProfiles(input: DocumentInput): SocialProfilesMetadata;
|
|
2646
2690
|
|
|
2647
2691
|
/**
|
|
2648
2692
|
* Twitter Card metadata types.
|
|
@@ -2725,24 +2769,26 @@ interface TwitterCardMetadata {
|
|
|
2725
2769
|
*/
|
|
2726
2770
|
|
|
2727
2771
|
/**
|
|
2728
|
-
* Extract Twitter Card metadata from
|
|
2772
|
+
* Extract Twitter Card metadata from HTML.
|
|
2729
2773
|
*
|
|
2730
2774
|
* @remarks
|
|
2731
2775
|
* Extracts Twitter Card metadata including card type, site/creator info,
|
|
2732
2776
|
* title/description, images, app cards, and player cards.
|
|
2733
2777
|
*
|
|
2734
|
-
* @param
|
|
2778
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2735
2779
|
* @returns Twitter Card metadata object
|
|
2736
2780
|
*
|
|
2737
2781
|
* @example
|
|
2738
2782
|
* ```typescript
|
|
2783
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2739
2784
|
* const doc = parseHTML(htmlString);
|
|
2740
2785
|
* const twitter = extractTwitterCard(doc);
|
|
2741
|
-
*
|
|
2742
|
-
*
|
|
2786
|
+
*
|
|
2787
|
+
* // Or directly with HTML string
|
|
2788
|
+
* const twitter = extractTwitterCard(htmlString);
|
|
2743
2789
|
* ```
|
|
2744
2790
|
*/
|
|
2745
|
-
declare function extractTwitterCard(
|
|
2791
|
+
declare function extractTwitterCard(input: DocumentInput): TwitterCardMetadata;
|
|
2746
2792
|
|
|
2747
2793
|
/**
|
|
2748
2794
|
* Verification tags types.
|
|
@@ -2787,23 +2833,25 @@ interface VerificationMetadata {
|
|
|
2787
2833
|
*/
|
|
2788
2834
|
|
|
2789
2835
|
/**
|
|
2790
|
-
* Extract verification metadata from
|
|
2836
|
+
* Extract verification metadata from HTML.
|
|
2791
2837
|
*
|
|
2792
2838
|
* @remarks
|
|
2793
2839
|
* Extracts verification tags used by various platforms for domain and ownership verification.
|
|
2794
2840
|
*
|
|
2795
|
-
* @param
|
|
2841
|
+
* @param input - Parsed HTML document or raw HTML string
|
|
2796
2842
|
* @returns Verification metadata
|
|
2797
2843
|
*
|
|
2798
2844
|
* @example
|
|
2799
2845
|
* ```typescript
|
|
2846
|
+
* // With parsed document (recommended for multiple extractions)
|
|
2800
2847
|
* const doc = parseHTML(htmlString);
|
|
2801
2848
|
* const verification = extractVerification(doc);
|
|
2802
|
-
*
|
|
2803
|
-
*
|
|
2849
|
+
*
|
|
2850
|
+
* // Or directly with HTML string
|
|
2851
|
+
* const verification = extractVerification(htmlString);
|
|
2804
2852
|
* ```
|
|
2805
2853
|
*/
|
|
2806
|
-
declare function extractVerification(
|
|
2854
|
+
declare function extractVerification(input: DocumentInput): VerificationMetadata;
|
|
2807
2855
|
|
|
2808
2856
|
/**
|
|
2809
2857
|
* Enhanced fetch types for web scraping.
|
|
@@ -3254,4 +3302,4 @@ interface SwoopResult {
|
|
|
3254
3302
|
*/
|
|
3255
3303
|
declare function swoop(url: string | URL, init?: SwoopInit): Promise<SwoopResult>;
|
|
3256
3304
|
|
|
3257
|
-
export { type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon, type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint, type ContentExtractionOptions, type ContentQuality, type ContentResult, type CopyrightMetadata, type DiscoveredFeed, type DublinCoreMetadata, type ExtractedContent, type ExtractedLink, type ExtractionErrorType, type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata, type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata, type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata, type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions, type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata, type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook, type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile, type OpenGraphVideo, type PaginationMetadata, type ParseResult, PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, type PluckInit, PluckNetworkError, PluckRedirectError, type PluckResponse, PluckSizeError, PluckTimeoutError, type PreloadResource, type RobotDirectives, type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata, type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata, SwoopEnvironmentError, SwoopError, SwoopExecutionError, type SwoopInit, type SwoopResult, SwoopSecurityError, SwoopTimeoutError, type SwoopWaitStrategy, type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata, type TwitterPlayer, type VerificationMetadata, type Website, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|
|
3305
|
+
export { type AlternateLink, type AnalyticsMetadata, type AppLinks, type AppleTouchIcon, type Article, type AssetsMetadata, type CanonicalMetadata, type ConnectionHint, type ContentExtractionOptions, type ContentQuality, type ContentResult, type CopyrightMetadata, type DiscoveredFeed, type DocumentInput, type DublinCoreMetadata, type ExtractedContent, type ExtractedLink, type ExtractionErrorType, type ExtractionFailure, type Feed, type FeedAuthor, type FeedDiscoveryMetadata, type FeedEnclosure, type FeedFormat, type FeedItem, type GeoMetadata, type GeoPosition, type HTMLDocument, type HtmlToTextOptions, type IconsMetadata, type JsonLdBlock, type LanguageMetadata, type LinksExtractionOptions, type LinksMetadata, type MSTile, type MaskIcon, type MonetizationMetadata, type NewsMetadata, type OpenGraphArticle, type OpenGraphAudio, type OpenGraphBook, type OpenGraphImage, type OpenGraphMetadata, type OpenGraphProfile, type OpenGraphVideo, type PaginationMetadata, type ParseResult, PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, type PluckInit, PluckNetworkError, PluckRedirectError, type PluckResponse, PluckSizeError, PluckTimeoutError, type PreloadResource, type RobotDirectives, type RobotsMetadata, type SEOMetadata, type SchemaOrgMetadata, type SecurityMetadata, type SitemapDiscoveryMetadata, type SocialProfilesMetadata, SwoopEnvironmentError, SwoopError, SwoopExecutionError, type SwoopInit, type SwoopResult, SwoopSecurityError, SwoopTimeoutError, type SwoopWaitStrategy, type TwitterApp, type TwitterAppPlatform, type TwitterCardMetadata, type TwitterPlayer, type VerificationMetadata, type Website, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|