magpie-html 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2347,6 +2347,378 @@ function parseFeedAs(content, format, baseUrl) {
2347
2347
  }
2348
2348
  }
2349
2349
 
2350
+ // src/feed/sitemap/xml-parser.ts
2351
/**
 * Parse a sitemap XML string into the lightweight element tree produced by
 * parseElement3.
 *
 * The input is pre-processed in a fixed order before parsing: the XML
 * declaration is stripped, then any DOCTYPE, then comments.
 *
 * @param {string} xml - Raw sitemap XML.
 * @returns {object} Root element ({ tagName, attributes, text, children, parent }).
 * @throws {Error} Propagates parseElement3 errors (e.g. no opening tag found).
 */
function parseSitemapXML(xml) {
  const prepared = removeComments3(removeDoctype3(cleanXMLDeclaration2(xml)));
  const { element } = parseElement3(prepared, 0);
  return element;
}
2358
/**
 * Remove the XML declaration and every other processing instruction
 * (`<?target ... ?>`) from the document, then trim surrounding whitespace.
 *
 * Processing instructions carry no sitemap data, and any that survive would be
 * picked up by the element parser as if they were the root element. A linear
 * scan is used instead of a regular expression so that instructions containing
 * `?` (for example a stylesheet href with a query string) are handled and so
 * that hostile input cannot trigger quadratic matching.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} XML without processing instructions, trimmed.
 */
function cleanXMLDeclaration2(xml) {
  let kept = "";
  let pos = 0;
  while (pos < xml.length) {
    const start = xml.indexOf("<?", pos);
    if (start === -1) {
      break;
    }
    const end = xml.indexOf("?>", start + 2);
    if (end === -1) {
      // Unterminated instruction: leave the remainder untouched.
      break;
    }
    kept += xml.slice(pos, start);
    pos = end + 2;
  }
  return (kept + xml.slice(pos)).trim();
}
2361
/**
 * Strip `<!DOCTYPE ...>` declarations (matched case-insensitively) from the XML.
 *
 * The match ends at the first `>` after `<!DOCTYPE`.
 *
 * @param {string} xml - XML text.
 * @returns {string} XML with DOCTYPE declarations removed.
 */
function removeDoctype3(xml) {
  const doctypePattern = /<!DOCTYPE[^>]*>/gi;
  const stripped = xml.replace(doctypePattern, "");
  return stripped;
}
2364
/**
 * Strip XML comments (`<!-- ... -->`, including multi-line ones) from the XML.
 *
 * @param {string} xml - XML text.
 * @returns {string} XML with comments removed.
 */
function removeComments3(xml) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  const stripped = xml.replace(commentPattern, "");
  return stripped;
}
2367
/**
 * Replace every CDATA section with a numbered placeholder so that markup-like
 * characters inside CDATA cannot confuse the tag scanner.
 *
 * Placeholders have the form `__CDATA_<n>__`, numbered from 0 in document order.
 *
 * @param {string} text - XML text that may contain CDATA sections.
 * @returns {{ text: string, cdataMap: Map<string, string> }} The rewritten text
 *   and a map from placeholder to the original CDATA content.
 */
function extractCDATA3(text) {
  const cdataMap = new Map();
  const processed = text.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, (_whole, inner) => {
    // The map grows by one per section, so its size is the next index.
    const token = `__CDATA_${cdataMap.size}__`;
    cdataMap.set(token, inner);
    return token;
  });
  return { text: processed, cdataMap };
}
2378
/**
 * Substitute CDATA placeholders in `text` with their original content.
 *
 * @param {string} text - Text containing placeholders created by extractCDATA3.
 * @param {Map<string, string>} cdataMap - Placeholder -> original CDATA content.
 * @returns {string} Text with the first occurrence of each placeholder restored.
 */
function restoreCDATA3(text, cdataMap) {
  let result = text;
  for (const [placeholder, content] of cdataMap) {
    // A replacer function inserts `content` literally. With a plain string,
    // sequences such as `$&`, `$1` or `$$` inside the CDATA would be
    // interpreted as replacement patterns and corrupt the restored text.
    result = result.replace(placeholder, () => content);
  }
  return result;
}
2385
/**
 * Parse `name="value"` / `name='value'` pairs from the inside of an opening tag.
 *
 * The value must be closed by the same quote character that opened it, so a
 * double-quoted value may contain apostrophes and vice versa. Values are
 * returned verbatim (entities are not decoded here).
 *
 * @param {string} tagContent - Text following the tag name inside `<...>`.
 * @returns {Object<string, string>} Attribute name -> raw value.
 */
function parseAttributes3(tagContent) {
  const attributes = {};
  // Group 2 captures a double-quoted value, group 3 a single-quoted one.
  const attrRegex = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  let match = attrRegex.exec(tagContent);
  while (match !== null) {
    attributes[match[1]] = match[2] !== undefined ? match[2] : match[3];
    match = attrRegex.exec(tagContent);
  }
  return attributes;
}
2395
/**
 * Locate the next genuine (non-self-closing) opening tag for `openTag` that
 * starts before `limit`.
 *
 * A candidate only counts when the character after the tag name ends the name
 * (`>`, `/` or whitespace); otherwise `<a` would also match `<ab>`.
 *
 * @param {string} xml - Text being scanned.
 * @param {string} openTag - `<` followed by the tag name.
 * @param {number} fromPos - Index to start searching from.
 * @param {number} limit - Candidates at or beyond this index are ignored.
 * @returns {number} Index of the opening tag, or -1 when there is none.
 */
function findNestedOpenTag3(xml, openTag, fromPos, limit) {
  let candidate = xml.indexOf(openTag, fromPos);
  while (candidate !== -1 && candidate < limit) {
    const after = xml[candidate + openTag.length];
    const endsName = after !== undefined && (after === ">" || after === "/" || /\s/.test(after));
    if (endsName) {
      const tagEnd = xml.indexOf(">", candidate);
      // A self-closing tag has no matching close tag, so it must not add depth.
      const selfClosing = tagEnd !== -1 && xml[tagEnd - 1] === "/";
      if (!selfClosing) {
        return candidate;
      }
    }
    candidate = xml.indexOf(openTag, candidate + openTag.length);
  }
  return -1;
}

/**
 * Find the index of the closing tag that matches an already-opened element,
 * accounting for nested elements with the same name.
 *
 * @param {string} xml - Text being scanned.
 * @param {string} tagName - Name of the element whose close tag is wanted.
 * @param {number} startPos - Index just past the element's opening tag.
 * @returns {number} Index of the matching `</tagName>`, or -1 if not found.
 */
function findClosingTag3(xml, tagName, startPos) {
  const openTag = `<${tagName}`;
  const closeTag = `</${tagName}>`;
  let depth = 1;
  let pos = startPos;
  while (pos < xml.length && depth > 0) {
    const nextClose = xml.indexOf(closeTag, pos);
    if (nextClose === -1) {
      return -1;
    }
    const nextOpen = findNestedOpenTag3(xml, openTag, pos, nextClose);
    if (nextOpen !== -1) {
      depth++;
      pos = nextOpen + openTag.length;
    } else {
      depth--;
      if (depth === 0) {
        return nextClose;
      }
      pos = nextClose + closeTag.length;
    }
  }
  return -1;
}
2419
/**
 * Parse the element whose opening tag is the first `<` at or after `startPos`,
 * recursively parsing its child elements.
 *
 * On the outermost call (no `cdataMap`), CDATA sections are first swapped for
 * placeholders; recursive calls reuse that map and restore the content when
 * element text is assigned.
 *
 * @param {string} xml - XML text (already placeholder-substituted when `cdataMap` is given).
 * @param {number} startPos - Index to start looking for the opening tag.
 * @param {object|null} [parent=null] - Parent element stored on the result.
 * @param {Map<string, string>} [cdataMap] - Placeholder map from an outer call.
 * @returns {{ element: object, endPos: number, cdataMap: Map<string, string> }}
 *   The parsed element, the index just past it, and the CDATA map in use.
 * @throws {Error} When no opening tag is found, the opening tag is never
 *   closed with `>`, or no matching closing tag exists.
 */
function parseElement3(xml, startPos, parent = null, cdataMap) {
  const extracted = cdataMap ? { text: xml, cdataMap } : extractCDATA3(xml);
  const cleanedXML = extracted.text;
  const currentCdataMap = extracted.cdataMap;
  const openTagStart = cleanedXML.indexOf("<", startPos);
  if (openTagStart === -1) {
    throw new Error("No opening tag found");
  }
  const openTagEnd = cleanedXML.indexOf(">", openTagStart);
  if (openTagEnd === -1) {
    throw new Error("Unclosed opening tag");
  }
  const openTagContent = cleanedXML.substring(openTagStart + 1, openTagEnd);
  const isSelfClosing = openTagContent.endsWith("/");
  const tagContent = isSelfClosing ? openTagContent.slice(0, -1).trim() : openTagContent;
  // Split on ANY whitespace: attributes are frequently placed on their own
  // lines (`<urlset\n  xmlns="...">`), and splitting on a space alone would
  // leave the newline inside the tag name so no closing tag could ever match.
  const nameEnd = tagContent.search(/\s/);
  const tagName = nameEnd === -1 ? tagContent : tagContent.substring(0, nameEnd);
  const attributes = nameEnd === -1 ? {} : parseAttributes3(tagContent.substring(nameEnd));
  const element = {
    tagName,
    attributes,
    text: "",
    children: [],
    parent
  };
  if (isSelfClosing) {
    return { element, endPos: openTagEnd + 1, cdataMap: currentCdataMap };
  }
  const closingTagPos = findClosingTag3(cleanedXML, tagName, openTagEnd + 1);
  if (closingTagPos === -1) {
    throw new Error(`No closing tag found for <${tagName}>`);
  }
  const content = cleanedXML.substring(openTagEnd + 1, closingTagPos);
  if (content.includes("<")) {
    let pos = 0;
    const trimmedContent = content.trim();
    while (pos < trimmedContent.length) {
      const nextTag = trimmedContent.indexOf("<", pos);
      if (nextTag === -1) break;
      // Skip stray closing tags and `<!...` constructs; they are not children.
      if (trimmedContent[nextTag + 1] === "/" || trimmedContent[nextTag + 1] === "!") {
        pos = nextTag + 1;
        continue;
      }
      try {
        const { element: child, endPos } = parseElement3(
          trimmedContent,
          nextTag,
          element,
          currentCdataMap
        );
        element.children.push(child);
        pos = endPos;
      } catch {
        // Best effort: a malformed child is skipped and scanning resumes
        // one character further on.
        pos = nextTag + 1;
      }
    }
    // Text of a container is its content with all tags removed.
    let textContent = content.replace(/<[^>]+>/g, "").trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  } else {
    let textContent = content.trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  }
  const closingTagEnd = closingTagPos + `</${tagName}>`.length;
  return { element, endPos: closingTagEnd, cdataMap: currentCdataMap };
}
2486
/**
 * Depth-first search for the first element whose tag name equals `selector`.
 * The starting element itself is tested before its descendants.
 *
 * @param {object} element - Element to search from ({ tagName, children }).
 * @param {string} selector - Tag name to match (not a CSS selector).
 * @param {boolean} [caseSensitive=false] - Compare tag names exactly when true.
 * @returns {object|null} The first match in document order, or null.
 */
function querySelector3(element, selector, caseSensitive = false) {
  const normalize = (name) => (caseSensitive ? name : name.toLowerCase());
  if (normalize(element.tagName) === normalize(selector)) {
    return element;
  }
  for (const child of element.children) {
    const hit = querySelector3(child, selector, caseSensitive);
    if (hit) {
      return hit;
    }
  }
  return null;
}
2498
/**
 * Collect every element (the starting element included) whose tag name equals
 * `selector`, in document order.
 *
 * Matches are appended to a single accumulator. Spreading each subtree's
 * results into `push(...)` passes one argument per match, which can exceed the
 * engine's argument limit on very large documents and re-copies results at
 * every level of the tree.
 *
 * @param {object} element - Element to search from ({ tagName, children }).
 * @param {string} selector - Tag name to match (not a CSS selector).
 * @param {boolean} [caseSensitive=false] - Compare tag names exactly when true.
 * @returns {object[]} All matching elements.
 */
function querySelectorAll3(element, selector, caseSensitive = false) {
  const wanted = caseSensitive ? selector : selector.toLowerCase();
  const results = [];
  const visit = (node) => {
    const tag = caseSensitive ? node.tagName : node.tagName.toLowerCase();
    if (tag === wanted) {
      results.push(node);
    }
    for (const child of node.children) {
      visit(child);
    }
  };
  visit(element);
  return results;
}
2510
/**
 * Return an element's text, or an empty string when the element is
 * null/undefined or its text is falsy.
 *
 * @param {object|null|undefined} element - Element with a `text` property.
 * @returns {string} The element text or "".
 */
function getText2(element) {
  if (element == null) {
    return "";
  }
  return element.text || "";
}
2513
/**
 * Return the first direct child whose tag name matches `tagName`
 * case-insensitively.
 *
 * @param {object} element - Parent element ({ children }).
 * @param {string} tagName - Tag name to look for.
 * @returns {object|null} The first matching child, or null.
 */
function getChild(element, tagName) {
  const wanted = tagName.toLowerCase();
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      return child;
    }
  }
  return null;
}
2517
/**
 * Return all direct children whose tag name matches `tagName`
 * case-insensitively, in their original order.
 *
 * @param {object} element - Parent element ({ children }).
 * @param {string} tagName - Tag name to look for.
 * @returns {object[]} Matching children (empty when none match).
 */
function getChildren(element, tagName) {
  const wanted = tagName.toLowerCase();
  const matches = [];
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      matches.push(child);
    }
  }
  return matches;
}
2521
+
2522
+ // src/feed/sitemap/parse.ts
2523
/**
 * Parse sitemap XML into a normalized result.
 *
 * Resolution order: a `sitemapindex` element anywhere in the tree, then a
 * `urlset` element, then any loose `url` elements. When none are present an
 * empty urlset result is returned.
 *
 * @param {string} xml - Raw sitemap XML.
 * @param {string} [baseUrl] - Base URL forwarded to the URL extractors.
 * @returns {{ sitemap: { type: string, urls: object[], sitemaps: object[] }, isIndex: boolean }}
 */
function parseSitemap(xml, baseUrl) {
  const doc = parseSitemapXML(xml);

  const indexRoot = querySelector3(doc, "sitemapindex");
  if (indexRoot) {
    return parseSitemapIndex(indexRoot, baseUrl);
  }

  const urlsetRoot = querySelector3(doc, "urlset");
  if (urlsetRoot) {
    return parseUrlset(urlsetRoot, baseUrl);
  }

  // No recognised container: treat every <url> found as an entry. With no
  // matches this maps over an empty list and yields the empty urlset result.
  const looseUrls = querySelectorAll3(doc, "url");
  return {
    sitemap: {
      type: "urlset",
      urls: looseUrls.map((url) => extractUrl(url, baseUrl)),
      sitemaps: []
    },
    isIndex: false
  };
}
2553
/**
 * Convert a `sitemapindex` element into the normalized index result.
 *
 * Each direct `sitemap` child contributes its `loc` (entity-decoded, matching
 * how `extractUrl` treats `loc`, so `&amp;` in query strings becomes `&`) and
 * optional `lastmod`.
 *
 * @param {object} element - The `sitemapindex` element.
 * @param {string} [baseUrl] - When given, each loc is passed through
 *   normalizeUrlHttps(baseUrl, loc) (defined elsewhere in the bundle).
 * @returns {{ sitemap: { type: "sitemapindex", urls: [], sitemaps: object[] }, isIndex: true }}
 */
function parseSitemapIndex(element, baseUrl) {
  const sitemaps = getChildren(element, "sitemap").map((el) => {
    const loc = decodeXmlEntities(getText2(getChild(el, "loc")));
    const lastmod = getText2(getChild(el, "lastmod")) || void 0;
    return {
      loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
      lastmod
    };
  });
  return {
    sitemap: {
      type: "sitemapindex",
      urls: [],
      sitemaps
    },
    isIndex: true
  };
}
2572
/**
 * Convert a `urlset` element into the normalized (non-index) result by running
 * extractUrl over each direct `url` child.
 *
 * @param {object} element - The `urlset` element.
 * @param {string} [baseUrl] - Base URL forwarded to extractUrl.
 * @returns {{ sitemap: { type: "urlset", urls: object[], sitemaps: [] }, isIndex: false }}
 */
function parseUrlset(element, baseUrl) {
  const urls = [];
  for (const urlElement of getChildren(element, "url")) {
    urls.push(extractUrl(urlElement, baseUrl));
  }
  const sitemap = { type: "urlset", urls, sitemaps: [] };
  return { sitemap, isIndex: false };
}
2584
/**
 * Build a URL entry from a sitemap `url` element.
 *
 * Reads `loc` (entity-decoded), `lastmod`, `changefreq` and `priority`, and
 * attaches `news`, `images` and `videos` when the respective extractors return
 * data.
 *
 * @param {object} element - The `url` element.
 * @param {string} [baseUrl] - When given, loc is passed through
 *   normalizeUrlHttps(baseUrl, loc) (defined elsewhere in the bundle).
 * @returns {object} Entry with `loc`, `lastmod`, `changefreq`, `priority` and
 *   optional `news`, `images`, `videos`.
 */
function extractUrl(element, baseUrl) {
  const rawLoc = getText2(getChild(element, "loc"));
  const loc = decodeXmlEntities(rawLoc);
  const lastmod = getText2(getChild(element, "lastmod")) || void 0;
  const changefreq = getText2(getChild(element, "changefreq")) || void 0;
  const priorityText = getText2(getChild(element, "priority"));
  const priority = priorityText ? Number.parseFloat(priorityText) : Number.NaN;
  const result = {
    loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
    lastmod,
    changefreq,
    // Only NaN means "absent/unparseable". A truthiness check here would
    // also discard a legitimate priority of 0.0.
    priority: Number.isNaN(priority) ? void 0 : priority
  };
  const news = extractNews(element);
  if (news) {
    result.news = news;
  }
  const images = extractImages(element, baseUrl);
  if (images.length > 0) {
    result.images = images;
  }
  const videos = extractVideos(element, baseUrl);
  if (videos.length > 0) {
    result.videos = videos;
  }
  return result;
}
2611
/**
 * Extract news-sitemap metadata from a `url` element.
 *
 * Every field is looked up three ways, in order: `news:<name>`, bare
 * `<name>`, then any child whose tag ends with `:<name>` (other prefixes).
 * For text fields an empty value falls through to the next lookup.
 *
 * Publication name and title are entity-decoded. Keyword and stock-ticker
 * lists are split on commas, trimmed, and stripped of empty entries.
 *
 * @param {object} urlElement - The `url` element.
 * @returns {object|undefined} News metadata, or undefined when there is no
 *   news element or it yields no fields.
 */
function extractNews(urlElement) {
  const findAnyPrefix = (parent, local) =>
    parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`));
  const findChild = (parent, local) =>
    getChild(parent, `news:${local}`) || getChild(parent, local) || findAnyPrefix(parent, local);
  const textOf = (parent, local) =>
    getText2(getChild(parent, `news:${local}`)) ||
    getText2(getChild(parent, local)) ||
    getText2(findAnyPrefix(parent, local));
  // Drops blanks produced by stray or trailing commas.
  const splitList = (value) =>
    value
      .split(",")
      .map((part) => part.trim())
      .filter(Boolean);

  const newsEl = findChild(urlElement, "news");
  if (!newsEl) {
    return void 0;
  }
  const news = {};

  const pubEl = findChild(newsEl, "publication");
  if (pubEl) {
    const name = textOf(pubEl, "name");
    const language = textOf(pubEl, "language");
    if (name || language) {
      news.publication = {
        name: name ? decodeXmlEntities(name) : void 0,
        language: language || void 0
      };
    }
  }

  const pubDate = textOf(newsEl, "publication_date");
  if (pubDate) {
    news.publicationDate = pubDate;
  }

  const title = textOf(newsEl, "title");
  if (title) {
    news.title = decodeXmlEntities(title);
  }

  const keywords = splitList(textOf(newsEl, "keywords"));
  if (keywords.length > 0) {
    news.keywords = keywords;
  }

  const stockTickers = splitList(textOf(newsEl, "stock_tickers"));
  if (stockTickers.length > 0) {
    news.stockTickers = stockTickers;
  }

  return Object.keys(news).length > 0 ? news : void 0;
}
2646
/**
 * Extract image-sitemap entries from a `url` element.
 *
 * Image elements are direct children named `image` or ending in `:image`.
 * Each field is looked up as `image:<name>`, then bare `<name>`, then any
 * child ending in `:<name>`; an empty value falls through to the next lookup.
 * Entries without a `loc` are skipped.
 *
 * `loc` and `license` are URLs and are entity-decoded (so `&amp;` in query
 * strings becomes `&`), consistent with how the page `loc` is handled;
 * `caption` and `title` are entity-decoded as text.
 *
 * @param {object} urlElement - The `url` element.
 * @param {string} [baseUrl] - When given, URLs are passed through
 *   normalizeUrlHttps(baseUrl, url) (defined elsewhere in the bundle).
 * @returns {object[]} Image entries ({ loc, caption?, geoLocation?, title?, license? }).
 */
function extractImages(urlElement, baseUrl) {
  const textOf = (parent, local) =>
    getText2(getChild(parent, `image:${local}`)) ||
    getText2(getChild(parent, local)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));
  const resolve = (url) => (baseUrl ? normalizeUrlHttps(baseUrl, url) : url);

  const images = [];
  for (const imgEl of urlElement.children) {
    const tag = imgEl.tagName.toLowerCase();
    if (tag !== "image" && !tag.endsWith(":image")) {
      continue;
    }
    const loc = decodeXmlEntities(textOf(imgEl, "loc"));
    if (!loc) {
      continue;
    }
    const image = { loc: resolve(loc) };
    const caption = textOf(imgEl, "caption");
    if (caption) image.caption = decodeXmlEntities(caption);
    const geoLocation = textOf(imgEl, "geo_location");
    if (geoLocation) image.geoLocation = geoLocation;
    const title = textOf(imgEl, "title");
    if (title) image.title = decodeXmlEntities(title);
    const license = decodeXmlEntities(textOf(imgEl, "license"));
    if (license) image.license = resolve(license);
    images.push(image);
  }
  return images;
}
2667
/**
 * Extract video-sitemap entries from a `url` element.
 *
 * Video elements are direct children named `video` or ending in `:video`.
 * Each field is looked up as `video:<name>`, then bare `<name>`, then any
 * child ending in `:<name>`; an empty value falls through to the next lookup.
 * Entries missing `thumbnail_loc`, `title` or `description` are skipped.
 *
 * URL fields (`thumbnail_loc`, `content_loc`, `player_loc`) are
 * entity-decoded so that `&amp;` in query strings becomes `&`, consistent
 * with how the page `loc` is handled; `title` and `description` are
 * entity-decoded as text. Numeric fields are only set when they parse.
 *
 * @param {object} urlElement - The `url` element.
 * @param {string} [baseUrl] - When given, URLs are passed through
 *   normalizeUrlHttps(baseUrl, url) (defined elsewhere in the bundle).
 * @returns {object[]} Video entries.
 */
function extractVideos(urlElement, baseUrl) {
  const textOf = (parent, local) =>
    getText2(getChild(parent, `video:${local}`)) ||
    getText2(getChild(parent, local)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));
  const resolve = (url) => (baseUrl ? normalizeUrlHttps(baseUrl, url) : url);
  const hasLocalName = (node, local) => {
    const tag = node.tagName.toLowerCase();
    return tag === local || tag.endsWith(`:${local}`);
  };

  const videos = [];
  for (const vidEl of urlElement.children) {
    if (!hasLocalName(vidEl, "video")) {
      continue;
    }
    const thumbnailLoc = decodeXmlEntities(textOf(vidEl, "thumbnail_loc"));
    const title = textOf(vidEl, "title");
    const description = textOf(vidEl, "description");
    if (!thumbnailLoc || !title || !description) {
      continue;
    }
    const video = {
      thumbnailLoc: resolve(thumbnailLoc),
      title: decodeXmlEntities(title),
      description: decodeXmlEntities(description)
    };

    const contentLoc = decodeXmlEntities(textOf(vidEl, "content_loc"));
    if (contentLoc) video.contentLoc = resolve(contentLoc);
    const playerLoc = decodeXmlEntities(textOf(vidEl, "player_loc"));
    if (playerLoc) video.playerLoc = resolve(playerLoc);

    // Missing fields yield "" which parses to NaN, so they are left unset.
    const duration = Number.parseInt(textOf(vidEl, "duration"), 10);
    if (!Number.isNaN(duration)) video.duration = duration;
    const rating = Number.parseFloat(textOf(vidEl, "rating"));
    if (!Number.isNaN(rating)) video.rating = rating;
    const viewCount = Number.parseInt(textOf(vidEl, "view_count"), 10);
    if (!Number.isNaN(viewCount)) video.viewCount = viewCount;

    const publicationDate = textOf(vidEl, "publication_date");
    if (publicationDate) video.publicationDate = publicationDate;
    const familyFriendly = textOf(vidEl, "family_friendly");
    if (familyFriendly) {
      video.familyFriendly = familyFriendly.toLowerCase() === "yes";
    }
    const category = textOf(vidEl, "category");
    if (category) video.category = category;

    const tagElements = vidEl.children.filter((c) => hasLocalName(c, "tag"));
    if (tagElements.length > 0) {
      video.tags = tagElements.map((t) => getText2(t)).filter(Boolean);
    }
    videos.push(video);
  }
  return videos;
}
2718
/**
 * Decode the five predefined XML entities and numeric character references
 * (decimal `&#N;` and hexadecimal `&#xH;`).
 *
 * Decoding happens in a single pass so that output is never decoded twice:
 * `&amp;quot;` becomes the literal text `&quot;`, not `"`. Numeric references
 * are converted with String.fromCodePoint so characters above U+FFFF (e.g.
 * emoji) are preserved; references beyond U+10FFFF are left unchanged.
 *
 * @param {string} text - Text containing XML entities.
 * @returns {string} Decoded text.
 */
function decodeXmlEntities(text) {
  const named = { lt: "<", gt: ">", amp: "&", quot: '"', apos: "'" };
  return text.replace(
    /&(?:(lt|gt|amp|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g,
    (match, name, dec, hex) => {
      if (name !== undefined) {
        return named[name];
      }
      const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
      // String.fromCodePoint throws for values above the Unicode range.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    }
  );
}
2721
+
2350
2722
  // src/pluck/types.ts
2351
2723
  var PluckError = class extends Error {
2352
2724
  constructor(message) {
@@ -2723,6 +3095,12 @@ function parseHTML(html, baseUrl) {
2723
3095
  });
2724
3096
  return document;
2725
3097
  }
3098
/**
 * Accept either an HTML string or an already-parsed document.
 *
 * Strings are parsed with parseHTML(input, baseUrl); any other value is
 * returned unchanged.
 *
 * @param {string|object} input - HTML source or a parsed document.
 * @param {string} [baseUrl] - Forwarded to parseHTML when parsing a string.
 * @returns {object} A document.
 */
function ensureDocument(input, baseUrl) {
  return typeof input === "string" ? parseHTML(input, baseUrl) : input;
}
2726
3104
 
2727
3105
  // src/utils/meta-helpers.ts
2728
3106
  function getMetaContent(doc, name) {
@@ -2750,7 +3128,8 @@ function getMetaHttpEquiv(doc, httpEquiv) {
2750
3128
  }
2751
3129
 
2752
3130
  // src/metadata/opengraph/extract.ts
2753
- function extractOpenGraph(doc) {
3131
+ function extractOpenGraph(input) {
3132
+ const doc = ensureDocument(input);
2754
3133
  const metadata = {};
2755
3134
  metadata.title = getMetaProperty(doc, "og:title");
2756
3135
  metadata.type = getMetaProperty(doc, "og:type");
@@ -2775,7 +3154,7 @@ function extractOpenGraph(doc) {
2775
3154
  if (Object.keys(audio).length > 0) {
2776
3155
  metadata.audio = audio;
2777
3156
  }
2778
- const images = extractImages(doc);
3157
+ const images = extractImages2(doc);
2779
3158
  if (images.length > 0) {
2780
3159
  metadata.images = images;
2781
3160
  }
@@ -2844,7 +3223,7 @@ function extractAudio(doc) {
2844
3223
  Object.entries(audio).filter(([_, value]) => value !== void 0)
2845
3224
  );
2846
3225
  }
2847
- function extractImages(doc) {
3226
+ function extractImages2(doc) {
2848
3227
  const images = [];
2849
3228
  const imageUrls = getAllMetaPropertyValues(doc, "og:image");
2850
3229
  const imageSecureUrls = getAllMetaPropertyValues(doc, "og:image:secure_url");
@@ -2963,7 +3342,8 @@ function matchesAnyType(obj, targetTypes) {
2963
3342
  }
2964
3343
 
2965
3344
  // src/metadata/schema-org/extract.ts
2966
- function extractSchemaOrg(doc) {
3345
+ function extractSchemaOrg(input) {
3346
+ const doc = ensureDocument(input);
2967
3347
  const metadata = {
2968
3348
  jsonLd: []
2969
3349
  };
@@ -3040,7 +3420,8 @@ function organizeByType(metadata) {
3040
3420
  }
3041
3421
 
3042
3422
  // src/metadata/seo/extract.ts
3043
- function extractSEO(doc) {
3423
+ function extractSEO(input) {
3424
+ const doc = ensureDocument(input);
3044
3425
  const metadata = {};
3045
3426
  const titleElement = doc.querySelector("title");
3046
3427
  if (titleElement?.textContent) {
@@ -3072,7 +3453,8 @@ function extractSEO(doc) {
3072
3453
  }
3073
3454
 
3074
3455
  // src/metadata/twitter-card/extract.ts
3075
- function extractTwitterCard(doc) {
3456
+ function extractTwitterCard(input) {
3457
+ const doc = ensureDocument(input);
3076
3458
  const metadata = {};
3077
3459
  metadata.card = getMetaContent(doc, "twitter:card");
3078
3460
  metadata.site = getMetaContent(doc, "twitter:site");
@@ -3229,7 +3611,8 @@ function getAllLinksByPrefix(doc, relPrefix) {
3229
3611
  }
3230
3612
 
3231
3613
  // src/metadata/icons/extract.ts
3232
- function extractIcons(doc) {
3614
+ function extractIcons(input) {
3615
+ const doc = ensureDocument(input);
3233
3616
  const metadata = {};
3234
3617
  const iconLinks = getAllLinksByRels(doc, ["icon", "shortcut icon"]);
3235
3618
  for (const link of iconLinks) {
@@ -3410,7 +3793,8 @@ function parseSizeString(sizeStr) {
3410
3793
  }
3411
3794
 
3412
3795
  // src/metadata/language/extract.ts
3413
- function extractLanguage(doc) {
3796
+ function extractLanguage(input) {
3797
+ const doc = ensureDocument(input);
3414
3798
  const metadata = {};
3415
3799
  const htmlElement = doc.querySelector("html");
3416
3800
  if (htmlElement) {
@@ -3462,7 +3846,8 @@ function extractBestLanguage(doc) {
3462
3846
  }
3463
3847
 
3464
3848
  // src/metadata/links/extract.ts
3465
- function extractLinks3(doc, baseUrl, options = {}) {
3849
+ function extractLinks3(input, baseUrl, options = {}) {
3850
+ const doc = ensureDocument(input);
3466
3851
  const opts = normalizeOptions3(options);
3467
3852
  const effectiveBaseUrl = getEffectiveBaseUrl(doc, baseUrl);
3468
3853
  const baseOrigin = effectiveBaseUrl ? getOrigin(effectiveBaseUrl) : null;
@@ -3793,7 +4178,8 @@ function getStringProperty3(obj, prop) {
3793
4178
  }
3794
4179
 
3795
4180
  // src/metadata/canonical/extract.ts
3796
- function extractCanonical(doc) {
4181
+ function extractCanonical(input) {
4182
+ const doc = ensureDocument(input);
3797
4183
  const metadata = {};
3798
4184
  metadata.canonical = getLinkHref(doc, "canonical");
3799
4185
  const alternateLinks = getAllLinks(doc, "alternate");
@@ -3938,9 +4324,63 @@ async function gatherFeed(url) {
3938
4324
  }
3939
4325
  const response = await pluck(feedUrl);
3940
4326
  const content = await response.textUtf8();
4327
+ const format = detectFormat(content);
4328
+ if (format === "sitemap") {
4329
+ return normalizeSitemapToFeed(content, response.finalUrl);
4330
+ }
3941
4331
  const result = parseFeed(content, response.finalUrl);
3942
4332
  return result.feed;
3943
4333
  }
4334
/**
 * Present a parsed sitemap in the same shape as a feed.
 *
 * A sitemap index becomes a feed titled "Sitemap Index" with one item per
 * child sitemap. A urlset becomes a feed titled "<hostname> Sitemap" (or just
 * "Sitemap" when `baseUrl` is not a parseable URL) with one item per URL;
 * news metadata fills title/published/authors/tags and the first image
 * becomes the item image.
 *
 * @param {string} content - Raw sitemap XML.
 * @param {string} baseUrl - URL the sitemap was fetched from.
 * @returns {{ format: "sitemap", title: string, url: string, items: object[] }}
 */
function normalizeSitemapToFeed(content, baseUrl) {
  const { isIndex, sitemap } = parseSitemap(content, baseUrl);

  if (isIndex) {
    const indexItems = [];
    sitemap.sitemaps.forEach((child, position) => {
      indexItems.push({
        id: child.loc || `sitemap-${position}`,
        url: child.loc,
        title: `Sitemap: ${child.loc}`,
        modified: child.lastmod
      });
    });
    return { format: "sitemap", title: "Sitemap Index", url: baseUrl, items: indexItems };
  }

  const toItem = (entry, position) => {
    const item = {
      id: entry.loc || `url-${position}`,
      url: entry.loc,
      modified: entry.lastmod
    };
    const { news, images } = entry;
    if (news) {
      item.title = news.title;
      item.published = news.publicationDate;
      const publisher = news.publication?.name;
      if (publisher) {
        item.authors = [{ name: publisher }];
      }
      if (news.keywords) {
        item.tags = news.keywords;
      }
    }
    if (images && images.length > 0) {
      item.image = images[0].loc;
    }
    return item;
  };

  // Fall back to the generic title when baseUrl cannot be parsed as a URL.
  const feedTitle = (() => {
    try {
      return `${new URL(baseUrl).hostname} Sitemap`;
    } catch {
      return "Sitemap";
    }
  })();

  return {
    format: "sitemap",
    title: feedTitle,
    url: baseUrl,
    items: sitemap.urls.map(toItem)
  };
}
3944
4384
 
3945
4385
  // src/metadata/feed-discovery/heuristics.ts
3946
4386
  var COMMON_FEED_PATHS = [
@@ -3975,7 +4415,8 @@ function generateFeedSuggestions(documentUrl) {
3975
4415
  }
3976
4416
 
3977
4417
  // src/metadata/feed-discovery/extract.ts
3978
- function extractFeedDiscovery(doc, documentUrl) {
4418
+ function extractFeedDiscovery(input, documentUrl) {
4419
+ const doc = ensureDocument(input);
3979
4420
  const metadata = {
3980
4421
  feeds: []
3981
4422
  };
@@ -4152,7 +4593,8 @@ async function gatherWebsite(url) {
4152
4593
  }
4153
4594
 
4154
4595
  // src/metadata/analytics/extract.ts
4155
- function extractAnalytics(doc) {
4596
+ function extractAnalytics(input) {
4597
+ const doc = ensureDocument(input);
4156
4598
  const metadata = {};
4157
4599
  const scripts = doc.querySelectorAll("script");
4158
4600
  const googleAnalytics = /* @__PURE__ */ new Set();
@@ -4244,10 +4686,11 @@ function extractAnalytics(doc) {
4244
4686
  }
4245
4687
 
4246
4688
  // src/metadata/assets/extract.ts
4247
- function extractAssets(doc, baseUrl) {
4689
+ function extractAssets(input, baseUrl) {
4690
+ const doc = ensureDocument(input);
4248
4691
  const metadata = {};
4249
4692
  const effectiveBaseUrl = getEffectiveBaseUrl2(doc, baseUrl);
4250
- const images = extractImages2(doc, effectiveBaseUrl);
4693
+ const images = extractImages3(doc, effectiveBaseUrl);
4251
4694
  if (images.length > 0) {
4252
4695
  metadata.images = images;
4253
4696
  }
@@ -4301,7 +4744,7 @@ function getEffectiveBaseUrl2(doc, baseUrl) {
4301
4744
  }
4302
4745
  return null;
4303
4746
  }
4304
- function extractImages2(doc, baseUrl) {
4747
+ function extractImages3(doc, baseUrl) {
4305
4748
  const urls = /* @__PURE__ */ new Set();
4306
4749
  const imgElements = doc.querySelectorAll("img[src]");
4307
4750
  for (const img of Array.from(imgElements)) {
@@ -4571,7 +5014,8 @@ function extractConnectionHints(doc, baseUrl) {
4571
5014
  }
4572
5015
 
4573
5016
  // src/metadata/copyright/extract.ts
4574
- function extractCopyright(doc) {
5017
+ function extractCopyright(input) {
5018
+ const doc = ensureDocument(input);
4575
5019
  const metadata = {};
4576
5020
  metadata.copyright = getMetaContent(doc, "copyright");
4577
5021
  metadata.license = getLinkHref(doc, "license");
@@ -4607,7 +5051,8 @@ function parseCopyright(copyrightString) {
4607
5051
  }
4608
5052
 
4609
5053
  // src/metadata/dublin-core/extract.ts
4610
- function extractDublinCore(doc) {
5054
+ function extractDublinCore(input) {
5055
+ const doc = ensureDocument(input);
4611
5056
  const metadata = {};
4612
5057
  metadata.title = getMetaContent(doc, "DC.title") || getMetaContent(doc, "dcterms.title");
4613
5058
  metadata.description = getMetaContent(doc, "DC.description") || getMetaContent(doc, "dcterms.description");
@@ -4648,7 +5093,8 @@ function extractMultiValue(doc, field) {
4648
5093
  }
4649
5094
 
4650
5095
  // src/metadata/geo/extract.ts
4651
- function extractGeo(doc) {
5096
+ function extractGeo(input) {
5097
+ const doc = ensureDocument(input);
4652
5098
  const metadata = {};
4653
5099
  const geoPosition = getMetaContent(doc, "geo.position");
4654
5100
  if (geoPosition) {
@@ -4705,7 +5151,8 @@ function parseICBM(icbm) {
4705
5151
  }
4706
5152
 
4707
5153
  // src/metadata/monetization/extract.ts
4708
- function extractMonetization(doc) {
5154
+ function extractMonetization(input) {
5155
+ const doc = ensureDocument(input);
4709
5156
  const metadata = {};
4710
5157
  metadata.webMonetization = getMetaContent(doc, "monetization");
4711
5158
  metadata.paypalVerification = getMetaContent(doc, "paypal-site-verification");
@@ -4719,7 +5166,8 @@ function extractMonetization(doc) {
4719
5166
  }
4720
5167
 
4721
5168
  // src/metadata/news/extract.ts
4722
- function extractNews(doc) {
5169
+ function extractNews2(input) {
5170
+ const doc = ensureDocument(input);
4723
5171
  const metadata = {};
4724
5172
  const newsKeywords = getMetaContent(doc, "news_keywords");
4725
5173
  if (newsKeywords) {
@@ -4737,7 +5185,8 @@ function extractNews(doc) {
4737
5185
  }
4738
5186
 
4739
5187
  // src/metadata/pagination/extract.ts
4740
- function extractPagination(doc) {
5188
+ function extractPagination(input) {
5189
+ const doc = ensureDocument(input);
4741
5190
  const metadata = {};
4742
5191
  metadata.prev = getLinkHref(doc, "prev") || getLinkHref(doc, "previous");
4743
5192
  metadata.next = getLinkHref(doc, "next");
@@ -4836,7 +5285,8 @@ function parseKeyValueDirective(key, value, result) {
4836
5285
  }
4837
5286
 
4838
5287
  // src/metadata/robots/extract.ts
4839
- function extractRobots(doc) {
5288
+ function extractRobots(input) {
5289
+ const doc = ensureDocument(input);
4840
5290
  const metadata = {};
4841
5291
  const robotsContent = getMetaContent(doc, "robots");
4842
5292
  if (robotsContent) {
@@ -4870,7 +5320,8 @@ function extractRobots(doc) {
4870
5320
  }
4871
5321
 
4872
5322
  // src/metadata/security/extract.ts
4873
- function extractSecurity(doc) {
5323
+ function extractSecurity(input) {
5324
+ const doc = ensureDocument(input);
4874
5325
  const metadata = {};
4875
5326
  metadata.referrerPolicy = getMetaContent(doc, "referrer");
4876
5327
  metadata.contentSecurityPolicy = getMetaHttpEquiv(doc, "Content-Security-Policy");
@@ -4921,7 +5372,8 @@ function generateSitemapSuggestions(documentUrl) {
4921
5372
  }
4922
5373
 
4923
5374
  // src/metadata/sitemap-discovery/extract.ts
4924
- function extractSitemapDiscovery(doc, documentUrl) {
5375
+ function extractSitemapDiscovery(input, documentUrl) {
5376
+ const doc = ensureDocument(input);
4925
5377
  const metadata = {
4926
5378
  sitemaps: []
4927
5379
  };
@@ -4934,7 +5386,8 @@ function extractSitemapDiscovery(doc, documentUrl) {
4934
5386
  }
4935
5387
 
4936
5388
  // src/metadata/social-profiles/extract.ts
4937
- function extractSocialProfiles(doc) {
5389
+ function extractSocialProfiles(input) {
5390
+ const doc = ensureDocument(input);
4938
5391
  const metadata = {};
4939
5392
  metadata.twitter = getMetaContent(doc, "twitter:site") || getMetaContent(doc, "twitter:creator") || extractFromProperty(doc, "twitter:site") || extractFromProperty(doc, "twitter:creator");
4940
5393
  if (metadata.twitter) {
@@ -5087,7 +5540,8 @@ function categorizeSchemaProfile(url, metadata) {
5087
5540
  }
5088
5541
 
5089
5542
  // src/metadata/verification/extract.ts
5090
- function extractVerification(doc) {
5543
+ function extractVerification(input) {
5544
+ const doc = ensureDocument(input);
5091
5545
  const metadata = {};
5092
5546
  metadata.googleSiteVerification = getMetaContent(doc, "google-site-verification");
5093
5547
  metadata.msvalidate = getMetaContent(doc, "msvalidate.01");
@@ -6934,6 +7388,6 @@ async function swoop(url, init) {
6934
7388
  * @packageDocumentation
6935
7389
  */
6936
7390
 
6937
- export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
7391
+ export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews2 as extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
6938
7392
  //# sourceMappingURL=index.js.map
6939
7393
  //# sourceMappingURL=index.js.map