magpie-html 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -603,7 +603,7 @@ interface FeedItem {
603
603
  */
604
604
  interface Feed {
605
605
  /** Original feed format */
606
- format: 'rss' | 'atom' | 'json-feed';
606
+ format: 'rss' | 'atom' | 'json-feed' | 'sitemap';
607
607
  /** Feed title (required) */
608
608
  title: string;
609
609
  /** Feed description or subtitle */
@@ -952,6 +952,7 @@ declare function gatherArticle(url: string | URL): Promise<Article>;
952
952
  * @remarks
953
953
  * This is a high-level convenience method that combines fetching and parsing.
954
954
  * It handles encoding detection, redirects, and feed format detection automatically.
955
+ * Falls back to sitemap parsing when standard feed formats aren't detected.
955
956
  *
956
957
  * @param url - Feed URL as string or URL object
957
958
  * @returns Normalized feed data
package/dist/index.d.ts CHANGED
@@ -603,7 +603,7 @@ interface FeedItem {
603
603
  */
604
604
  interface Feed {
605
605
  /** Original feed format */
606
- format: 'rss' | 'atom' | 'json-feed';
606
+ format: 'rss' | 'atom' | 'json-feed' | 'sitemap';
607
607
  /** Feed title (required) */
608
608
  title: string;
609
609
  /** Feed description or subtitle */
@@ -952,6 +952,7 @@ declare function gatherArticle(url: string | URL): Promise<Article>;
952
952
  * @remarks
953
953
  * This is a high-level convenience method that combines fetching and parsing.
954
954
  * It handles encoding detection, redirects, and feed format detection automatically.
955
+ * Falls back to sitemap parsing when standard feed formats aren't detected.
955
956
  *
956
957
  * @param url - Feed URL as string or URL object
957
958
  * @returns Normalized feed data
package/dist/index.js CHANGED
@@ -2347,6 +2347,378 @@ function parseFeedAs(content, format, baseUrl) {
2347
2347
  }
2348
2348
  }
2349
2349
 
2350
+ // src/feed/sitemap/xml-parser.ts
2351
/**
 * Parse a sitemap XML document into a lightweight element tree.
 *
 * The XML declaration, DOCTYPE and comments are stripped (in that order)
 * before the first element found in the remaining text is parsed as the root.
 *
 * @param xml - Raw sitemap XML text
 * @returns The root element of the parsed tree
 * @throws Error (from parseElement3) when no well-formed root element exists
 */
function parseSitemapXML(xml) {
  const prepared = removeComments3(removeDoctype3(cleanXMLDeclaration2(xml)));
  return parseElement3(prepared, 0).element;
}
2358
/**
 * Remove every `<?xml ... ?>` declaration and trim surrounding whitespace.
 *
 * @param xml - Raw XML text
 * @returns The text without XML declarations, trimmed at both ends
 */
function cleanXMLDeclaration2(xml) {
  const declarationPattern = /<\?xml[^?]*\?>/g;
  const withoutDeclaration = xml.replace(declarationPattern, "");
  return withoutDeclaration.trim();
}
2361
/**
 * Strip `<!DOCTYPE ...>` declarations (matched case-insensitively).
 *
 * The match ends at the first `>` after `<!DOCTYPE`.
 *
 * @param xml - XML text
 * @returns The text with DOCTYPE declarations removed
 */
function removeDoctype3(xml) {
  const doctypePattern = /<!DOCTYPE[^>]*>/gi;
  const stripped = xml.replace(doctypePattern, "");
  return stripped;
}
2364
/**
 * Strip XML comments (`<!-- ... -->`), including ones that span lines.
 *
 * @param xml - XML text
 * @returns The text with all comments removed
 */
function removeComments3(xml) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  const stripped = xml.replace(commentPattern, "");
  return stripped;
}
2367
/**
 * Swap every CDATA section for a `__CDATA_<n>__` placeholder so that markup
 * inside CDATA cannot confuse the tag scanner.
 *
 * @param text - XML text that may contain CDATA sections
 * @returns `text` with placeholders substituted, plus `cdataMap` mapping each
 *          placeholder to the original CDATA payload (in document order)
 */
function extractCDATA3(text) {
  const cdataMap = /* @__PURE__ */ new Map();
  const stash = (_whole, payload) => {
    // Every placeholder is unique, so the map size is the running counter.
    const key = `__CDATA_${cdataMap.size}__`;
    cdataMap.set(key, payload);
    return key;
  };
  const processed = text.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, stash);
  return { text: processed, cdataMap };
}
2378
/**
 * Put CDATA payloads back in place of their placeholders.
 *
 * @param text - Text containing `__CDATA_<n>__` placeholders
 * @param cdataMap - Map from placeholder to original CDATA payload
 * @returns The text with the first occurrence of each placeholder replaced
 */
function restoreCDATA3(text, cdataMap) {
  let result = text;
  for (const [placeholder, content] of cdataMap.entries()) {
    // Use a replacer function: a plain string replacement would interpret
    // "$&", "$1", "$$" etc. inside the CDATA payload as substitution patterns
    // and corrupt content such as prices or template snippets.
    result = result.replace(placeholder, () => content);
  }
  return result;
}
2385
/**
 * Parse the attribute portion of an opening tag into a name → value object.
 *
 * A value must be closed by the same quote character that opened it, so a
 * double-quoted value may contain apostrophes and vice versa. Whitespace
 * around `=` is tolerated, as XML permits.
 *
 * @param tagContent - Text following the tag name, e.g. ` a="1" b='2'`
 * @returns Object mapping attribute names to their raw (undecoded) values
 */
function parseAttributes3(tagContent) {
  const attributes = {};
  const attrRegex = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  for (const match of tagContent.matchAll(attrRegex)) {
    // Exactly one of the two value groups participates in a match.
    attributes[match[1]] = match[2] ?? match[3];
  }
  return attributes;
}
2395
/**
 * Locate the next genuinely nested opening tag named exactly like `openTag`.
 *
 * Rejects longer names sharing the prefix (`<image:loc>` when looking for
 * `<image`) and self-closing tags, neither of which has a matching
 * `</tagName>` to balance against.
 *
 * @returns Index of the nested opening tag, or -1 if none starts before `limit`
 */
function findNestedOpenTag3(xml, openTag, from, limit) {
  let idx = xml.indexOf(openTag, from);
  while (idx !== -1 && idx < limit) {
    const after = xml[idx + openTag.length];
    const isExactName = after === ">" || after === "/" || /\s/.test(after);
    if (isExactName) {
      const tagEnd = xml.indexOf(">", idx);
      const isSelfClosing = tagEnd !== -1 && xml[tagEnd - 1] === "/";
      if (!isSelfClosing) {
        return idx;
      }
    }
    idx = xml.indexOf(openTag, idx + openTag.length);
  }
  return -1;
}

/**
 * Find the position of the closing tag that balances an already-opened
 * element, accounting for nested elements of the same name.
 *
 * @param xml - XML text (CDATA already replaced by placeholders)
 * @param tagName - Name of the element whose closing tag is wanted
 * @param startPos - Index just past the element's opening tag
 * @returns Index of the matching `</tagName>`, or -1 when there is none
 */
function findClosingTag3(xml, tagName, startPos) {
  const openTag = `<${tagName}`;
  const closeTag = `</${tagName}>`;
  let depth = 1;
  let pos = startPos;
  while (pos < xml.length) {
    const nextClose = xml.indexOf(closeTag, pos);
    if (nextClose === -1) {
      return -1;
    }
    const nextOpen = findNestedOpenTag3(xml, openTag, pos, nextClose);
    if (nextOpen !== -1) {
      depth++;
      pos = nextOpen + openTag.length;
    } else {
      depth--;
      if (depth === 0) {
        return nextClose;
      }
      pos = nextClose + closeTag.length;
    }
  }
  return -1;
}
2419
/**
 * Parse one element, and recursively its children, beginning at the first
 * `<` found at or after `startPos`.
 *
 * @param xml - XML text to scan
 * @param startPos - Index at which to start looking for the opening tag
 * @param parent - Parent element to record on the result (null for the root)
 * @param cdataMap - Placeholder map from an enclosing call; when omitted, CDATA
 *                   sections are extracted from `xml` first
 * @returns `{ element, endPos, cdataMap }` where `endPos` is the index just
 *          past the element within the (CDATA-extracted) text that was scanned
 * @throws Error when no opening tag is found, the opening tag is unclosed, or
 *         no matching closing tag exists
 */
function parseElement3(xml, startPos, parent = null, cdataMap) {
  // CDATA is extracted once by the outermost call; nested calls reuse the map.
  const extracted = cdataMap ? { text: xml, cdataMap } : extractCDATA3(xml);
  const cleanedXML = extracted.text;
  const currentCdataMap = extracted.cdataMap;
  const openTagStart = cleanedXML.indexOf("<", startPos);
  if (openTagStart === -1) {
    throw new Error("No opening tag found");
  }
  const openTagEnd = cleanedXML.indexOf(">", openTagStart);
  if (openTagEnd === -1) {
    throw new Error("Unclosed opening tag");
  }
  const openTagContent = cleanedXML.substring(openTagStart + 1, openTagEnd);
  const isSelfClosing = openTagContent.endsWith("/");
  const tagContent = isSelfClosing ? openTagContent.slice(0, -1).trim() : openTagContent;
  // The name ends at the first whitespace of ANY kind: attributes are often
  // placed on the following line (`<urlset\n  xmlns="...">`) or after a tab.
  const nameEnd = tagContent.search(/\s/);
  const tagName = nameEnd === -1 ? tagContent : tagContent.substring(0, nameEnd);
  const attributes = nameEnd === -1 ? {} : parseAttributes3(tagContent.substring(nameEnd));
  const element = {
    tagName,
    attributes,
    text: "",
    children: [],
    parent
  };
  if (isSelfClosing) {
    return { element, endPos: openTagEnd + 1, cdataMap: currentCdataMap };
  }
  const closingTagPos = findClosingTag3(cleanedXML, tagName, openTagEnd + 1);
  if (closingTagPos === -1) {
    throw new Error(`No closing tag found for <${tagName}>`);
  }
  const content = cleanedXML.substring(openTagEnd + 1, closingTagPos);
  if (content.includes("<")) {
    let pos = 0;
    const trimmedContent = content.trim();
    while (pos < trimmedContent.length) {
      const nextTag = trimmedContent.indexOf("<", pos);
      if (nextTag === -1) break;
      // Skip closing tags and `<!...` constructs; only opening tags start a child.
      if (trimmedContent[nextTag + 1] === "/" || trimmedContent[nextTag + 1] === "!") {
        pos = nextTag + 1;
        continue;
      }
      try {
        const { element: child, endPos } = parseElement3(
          trimmedContent,
          nextTag,
          element,
          currentCdataMap
        );
        element.children.push(child);
        pos = endPos;
      } catch {
        // Best-effort parsing: a malformed child is skipped so that the
        // remaining siblings can still be collected.
        pos = nextTag + 1;
      }
    }
  }
  // Element text is the content with all tags stripped; this is a no-op for
  // content without markup. CDATA payloads are restored afterwards so markup
  // inside them survives the stripping.
  const textContent = content.replace(/<[^>]+>/g, "").trim();
  element.text = restoreCDATA3(textContent, currentCdataMap);
  const closingTagEnd = closingTagPos + `</${tagName}>`.length;
  return { element, endPos: closingTagEnd, cdataMap: currentCdataMap };
}
2486
/**
 * Depth-first search for the first element (the start element included) whose
 * tag name equals `selector`.
 *
 * @param element - Element at which to start the search
 * @param selector - Tag name to look for (plain name, not a CSS selector)
 * @param caseSensitive - Compare names exactly instead of case-insensitively
 * @returns The first matching element in document order, or null
 */
function querySelector3(element, selector, caseSensitive = false) {
  const fold = (name) => (caseSensitive ? name : name.toLowerCase());
  const wanted = fold(selector);
  if (fold(element.tagName) === wanted) {
    return element;
  }
  for (const child of element.children) {
    const hit = querySelector3(child, selector, caseSensitive);
    if (hit) {
      return hit;
    }
  }
  return null;
}
2498
/**
 * Collect every element (the start element included) whose tag name equals
 * `selector`, in document order.
 *
 * @param element - Element at which to start the search
 * @param selector - Tag name to look for (plain name, not a CSS selector)
 * @param caseSensitive - Compare names exactly instead of case-insensitively
 * @returns Array of matching elements (possibly empty)
 */
function querySelectorAll3(element, selector, caseSensitive = false) {
  const wanted = caseSensitive ? selector : selector.toLowerCase();
  const own = caseSensitive ? element.tagName : element.tagName.toLowerCase();
  const selfMatch = own === wanted ? [element] : [];
  const nested = element.children.flatMap(
    (child) => querySelectorAll3(child, selector, caseSensitive)
  );
  return [...selfMatch, ...nested];
}
2510
/**
 * Read an element's text, tolerating a missing element.
 *
 * @param element - Element, or null/undefined
 * @returns The element's `text`, or "" when the element or its text is absent
 */
function getText2(element) {
  if (element === null || element === void 0) {
    return "";
  }
  const value = element.text;
  return value ? value : "";
}
2513
/**
 * Find the first direct child with the given tag name (case-insensitive).
 *
 * @param element - Parent element
 * @param tagName - Tag name to match, including any namespace prefix
 * @returns The first matching direct child, or null
 */
function getChild(element, tagName) {
  const wanted = tagName.toLowerCase();
  for (const candidate of element.children) {
    if (candidate.tagName.toLowerCase() === wanted) {
      return candidate;
    }
  }
  return null;
}
2517
/**
 * Collect all direct children with the given tag name (case-insensitive).
 *
 * @param element - Parent element
 * @param tagName - Tag name to match, including any namespace prefix
 * @returns New array of matching direct children, in document order
 */
function getChildren(element, tagName) {
  const wanted = tagName.toLowerCase();
  const matches = [];
  for (const candidate of element.children) {
    if (candidate.tagName.toLowerCase() === wanted) {
      matches.push(candidate);
    }
  }
  return matches;
}
2521
+
2522
+ // src/feed/sitemap/parse.ts
2523
/**
 * Parse sitemap XML into a structured result.
 *
 * Resolution order: a `<sitemapindex>` element, then a `<urlset>` element,
 * then any `<url>` elements found anywhere in the tree. When none of these
 * exist the result is an empty urlset.
 *
 * @param xml - Raw sitemap XML text
 * @param baseUrl - Optional base URL used to normalize extracted URLs
 * @returns `{ sitemap, isIndex }`; `isIndex` is true only for a sitemap index
 */
function parseSitemap(xml, baseUrl) {
  const root = parseSitemapXML(xml);
  const indexEl = querySelector3(root, "sitemapindex");
  if (indexEl) {
    return parseSitemapIndex(indexEl, baseUrl);
  }
  const urlsetEl = querySelector3(root, "urlset");
  if (urlsetEl) {
    return parseUrlset(urlsetEl, baseUrl);
  }
  // Loose <url> elements without a wrapper; mapping an empty list yields the
  // same empty urlset the no-match case calls for.
  const looseUrls = querySelectorAll3(root, "url");
  return {
    sitemap: {
      type: "urlset",
      urls: looseUrls.map((url) => extractUrl(url, baseUrl)),
      sitemaps: []
    },
    isIndex: false
  };
}
2553
/**
 * Convert a `<sitemapindex>` element into a sitemap-index result.
 *
 * @param element - The `<sitemapindex>` element
 * @param baseUrl - Optional base URL used to normalize each child sitemap URL
 * @returns `{ sitemap: { type: "sitemapindex", urls: [], sitemaps }, isIndex: true }`
 */
function parseSitemapIndex(element, baseUrl) {
  const sitemaps = getChildren(element, "sitemap").map((el) => {
    // Decode entities the same way extractUrl does for <url><loc>: sitemap
    // URLs must be entity-escaped in the XML, so "&amp;" is routine here.
    const loc = decodeXmlEntities(getText2(getChild(el, "loc")));
    const lastmod = getText2(getChild(el, "lastmod")) || undefined;
    return {
      loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
      lastmod
    };
  });
  return {
    sitemap: {
      type: "sitemapindex",
      urls: [],
      sitemaps
    },
    isIndex: true
  };
}
2572
/**
 * Convert a `<urlset>` element into a urlset result.
 *
 * Only direct `<url>` children of the element are considered.
 *
 * @param element - The `<urlset>` element
 * @param baseUrl - Optional base URL used to normalize extracted URLs
 * @returns `{ sitemap: { type: "urlset", urls, sitemaps: [] }, isIndex: false }`
 */
function parseUrlset(element, baseUrl) {
  const entries = [];
  for (const urlEl of getChildren(element, "url")) {
    entries.push(extractUrl(urlEl, baseUrl));
  }
  const sitemap = { type: "urlset", urls: entries, sitemaps: [] };
  return { sitemap, isIndex: false };
}
2584
/**
 * Build a URL entry from a `<url>` element, including the optional news,
 * image and video extensions.
 *
 * @param element - The `<url>` element
 * @param baseUrl - Optional base URL used to normalize `loc`
 * @returns Entry with `loc`, `lastmod`, `changefreq`, `priority`, and
 *          `news` / `images` / `videos` when present
 */
function extractUrl(element, baseUrl) {
  const loc = decodeXmlEntities(getText2(getChild(element, "loc")));
  const lastmod = getText2(getChild(element, "lastmod")) || undefined;
  const changefreq = getText2(getChild(element, "changefreq")) || undefined;
  const priorityText = getText2(getChild(element, "priority"));
  const priority = priorityText ? Number.parseFloat(priorityText) : undefined;
  const result = {
    loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
    lastmod,
    changefreq,
    // Compare against undefined rather than truthiness: 0.0 is a valid
    // priority in the sitemap protocol and must not be dropped.
    priority: priority !== undefined && !Number.isNaN(priority) ? priority : undefined
  };
  const news = extractNews(element);
  if (news) {
    result.news = news;
  }
  const images = extractImages(element, baseUrl);
  if (images.length > 0) {
    result.images = images;
  }
  const videos = extractVideos(element, baseUrl);
  if (videos.length > 0) {
    result.videos = videos;
  }
  return result;
}
2611
/**
 * Extract news-sitemap metadata from a `<url>` element.
 *
 * Each field is looked up under three spellings, in order: `news:<name>`,
 * the bare `<name>`, then any direct child whose tag ends in `:<name>`.
 *
 * @param urlElement - The `<url>` element
 * @returns News metadata, or undefined when there is no news element or it
 *          yields no fields
 */
function extractNews(urlElement) {
  // Element lookup: falls through only when no element is found.
  const findEl = (parent, local) =>
    getChild(parent, `news:${local}`) ||
    getChild(parent, local) ||
    parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`));
  // Text lookup: falls through when an element is missing OR has empty text.
  const findText = (parent, local) =>
    getText2(getChild(parent, `news:${local}`)) ||
    getText2(getChild(parent, local)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));
  const splitList = (value) => value.split(",").map((part) => part.trim());

  const newsEl = findEl(urlElement, "news");
  if (!newsEl) {
    return void 0;
  }
  const news = {};
  const pubEl = findEl(newsEl, "publication");
  if (pubEl) {
    const name = findText(pubEl, "name");
    const language = findText(pubEl, "language");
    if (name || language) {
      news.publication = {
        name: name || void 0,
        language: language || void 0
      };
    }
  }
  const pubDate = findText(newsEl, "publication_date");
  if (pubDate) {
    news.publicationDate = pubDate;
  }
  const title = findText(newsEl, "title");
  if (title) {
    news.title = decodeXmlEntities(title);
  }
  const keywords = findText(newsEl, "keywords");
  if (keywords) {
    news.keywords = splitList(keywords);
  }
  const stockTickers = findText(newsEl, "stock_tickers");
  if (stockTickers) {
    news.stockTickers = splitList(stockTickers);
  }
  return Object.keys(news).length > 0 ? news : void 0;
}
2646
/**
 * Extract image-sitemap entries from a `<url>` element.
 *
 * Image elements are direct children named `image` or ending in `:image`.
 * Each field is looked up as `image:<name>`, then `<name>`, then any direct
 * child whose tag ends in `:<name>`. Images without a `loc` are skipped.
 *
 * @param urlElement - The `<url>` element
 * @param baseUrl - Optional base URL used to normalize image and license URLs
 * @returns Array of image entries (possibly empty)
 */
function extractImages(urlElement, baseUrl) {
  const field = (parent, local) =>
    getText2(getChild(parent, `image:${local}`)) ||
    getText2(getChild(parent, local)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));
  // URLs are entity-decoded before normalization, matching how extractUrl
  // treats <loc>; otherwise "&amp;" in query strings leaks into the result.
  const toUrl = (raw) => {
    const decoded = decodeXmlEntities(raw);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };
  const images = [];
  for (const imgEl of urlElement.children) {
    const tag = imgEl.tagName.toLowerCase();
    if (tag !== "image" && !tag.endsWith(":image")) continue;
    const loc = field(imgEl, "loc");
    if (!loc) continue;
    const image = { loc: toUrl(loc) };
    const caption = field(imgEl, "caption");
    if (caption) image.caption = decodeXmlEntities(caption);
    const geoLocation = field(imgEl, "geo_location");
    if (geoLocation) image.geoLocation = geoLocation;
    const title = field(imgEl, "title");
    if (title) image.title = decodeXmlEntities(title);
    const license = field(imgEl, "license");
    if (license) image.license = toUrl(license);
    images.push(image);
  }
  return images;
}
2667
/**
 * Extract video-sitemap entries from a `<url>` element.
 *
 * Video elements are direct children named `video` or ending in `:video`.
 * Each field is looked up as `video:<name>`, then `<name>`, then any direct
 * child whose tag ends in `:<name>`. A video is skipped unless it has a
 * thumbnail location, a title and a description.
 *
 * @param urlElement - The `<url>` element
 * @param baseUrl - Optional base URL used to normalize video-related URLs
 * @returns Array of video entries (possibly empty)
 */
function extractVideos(urlElement, baseUrl) {
  const field = (parent, local) =>
    getText2(getChild(parent, `video:${local}`)) ||
    getText2(getChild(parent, local)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));
  // URLs are entity-decoded before normalization, matching how extractUrl
  // treats <loc>; otherwise "&amp;" in query strings leaks into the result.
  const toUrl = (raw) => {
    const decoded = decodeXmlEntities(raw);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };
  const isNamed = (el, local) => {
    const tag = el.tagName.toLowerCase();
    return tag === local || tag.endsWith(`:${local}`);
  };
  const videos = [];
  for (const vidEl of urlElement.children) {
    if (!isNamed(vidEl, "video")) continue;
    const thumbnailLoc = field(vidEl, "thumbnail_loc");
    const title = field(vidEl, "title");
    const description = field(vidEl, "description");
    if (!thumbnailLoc || !title || !description) continue;
    const video = {
      thumbnailLoc: toUrl(thumbnailLoc),
      title: decodeXmlEntities(title),
      description: decodeXmlEntities(description)
    };
    const contentLoc = field(vidEl, "content_loc");
    if (contentLoc) video.contentLoc = toUrl(contentLoc);
    const playerLoc = field(vidEl, "player_loc");
    if (playerLoc) video.playerLoc = toUrl(playerLoc);
    const duration = field(vidEl, "duration");
    if (duration) {
      const dur = Number.parseInt(duration, 10);
      if (!Number.isNaN(dur)) video.duration = dur;
    }
    const rating = field(vidEl, "rating");
    if (rating) {
      const r = Number.parseFloat(rating);
      if (!Number.isNaN(r)) video.rating = r;
    }
    const viewCount = field(vidEl, "view_count");
    if (viewCount) {
      const vc = Number.parseInt(viewCount, 10);
      if (!Number.isNaN(vc)) video.viewCount = vc;
    }
    const publicationDate = field(vidEl, "publication_date");
    if (publicationDate) video.publicationDate = publicationDate;
    const familyFriendly = field(vidEl, "family_friendly");
    if (familyFriendly) {
      video.familyFriendly = familyFriendly.toLowerCase() === "yes";
    }
    const category = field(vidEl, "category");
    if (category) video.category = category;
    const tagElements = vidEl.children.filter((c) => isNamed(c, "tag"));
    if (tagElements.length > 0) {
      video.tags = tagElements.map((t) => getText2(t)).filter(Boolean);
    }
    videos.push(video);
  }
  return videos;
}
2718
/**
 * Decode the five predefined XML entities plus decimal and hexadecimal
 * character references.
 *
 * Decoding happens in a single pass, so already-decoded output is never
 * decoded again (`&amp;quot;` yields `&quot;`, not `"`). Character references
 * are resolved as full code points, so characters outside the Basic
 * Multilingual Plane survive. References beyond U+10FFFF are left untouched.
 *
 * @param text - Text containing XML entities
 * @returns The decoded text
 */
function decodeXmlEntities(text) {
  const named = { lt: "<", gt: ">", amp: "&", quot: '"', apos: "'" };
  const entityPattern = /&(?:(lt|gt|amp|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g;
  return text.replace(entityPattern, (match, name, dec, hex) => {
    if (name) {
      return named[name];
    }
    const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
    // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}
2721
+
2350
2722
  // src/pluck/types.ts
2351
2723
  var PluckError = class extends Error {
2352
2724
  constructor(message) {
@@ -2775,7 +3147,7 @@ function extractOpenGraph(doc) {
2775
3147
  if (Object.keys(audio).length > 0) {
2776
3148
  metadata.audio = audio;
2777
3149
  }
2778
- const images = extractImages(doc);
3150
+ const images = extractImages2(doc);
2779
3151
  if (images.length > 0) {
2780
3152
  metadata.images = images;
2781
3153
  }
@@ -2844,7 +3216,7 @@ function extractAudio(doc) {
2844
3216
  Object.entries(audio).filter(([_, value]) => value !== void 0)
2845
3217
  );
2846
3218
  }
2847
- function extractImages(doc) {
3219
+ function extractImages2(doc) {
2848
3220
  const images = [];
2849
3221
  const imageUrls = getAllMetaPropertyValues(doc, "og:image");
2850
3222
  const imageSecureUrls = getAllMetaPropertyValues(doc, "og:image:secure_url");
@@ -3938,9 +4310,63 @@ async function gatherFeed(url) {
3938
4310
  }
3939
4311
  const response = await pluck(feedUrl);
3940
4312
  const content = await response.textUtf8();
4313
+ const format = detectFormat(content);
4314
+ if (format === "sitemap") {
4315
+ return normalizeSitemapToFeed(content, response.finalUrl);
4316
+ }
3941
4317
  const result = parseFeed(content, response.finalUrl);
3942
4318
  return result.feed;
3943
4319
  }
4320
/**
 * Parse sitemap content and reshape it into the normalized feed structure.
 *
 * A sitemap index becomes a feed whose items are the child sitemaps. A urlset
 * becomes a feed whose items are the listed URLs, enriched with news metadata
 * and the first image when present.
 *
 * @param content - Raw sitemap XML text
 * @param baseUrl - URL the sitemap was fetched from; used for URL
 *                  normalization, the feed `url`, and the derived title
 * @returns Feed object with `format: "sitemap"`
 */
function normalizeSitemapToFeed(content, baseUrl) {
  const { isIndex, sitemap } = parseSitemap(content, baseUrl);
  if (isIndex) {
    return {
      format: "sitemap",
      title: "Sitemap Index",
      url: baseUrl,
      items: sitemap.sitemaps.map((entry, position) => ({
        id: entry.loc || `sitemap-${position}`,
        url: entry.loc,
        title: `Sitemap: ${entry.loc}`,
        modified: entry.lastmod
      }))
    };
  }
  const toItem = (entry, position) => {
    const item = {
      id: entry.loc || `url-${position}`,
      url: entry.loc,
      modified: entry.lastmod
    };
    const { news, images } = entry;
    if (news) {
      item.title = news.title;
      item.published = news.publicationDate;
      if (news.publication?.name) {
        item.authors = [{ name: news.publication.name }];
      }
      if (news.keywords) {
        item.tags = news.keywords;
      }
    }
    if (images && images.length > 0) {
      item.image = images[0].loc;
    }
    return item;
  };
  const items = sitemap.urls.map(toItem);
  // Title is "<hostname> Sitemap" when baseUrl parses as a URL; otherwise the
  // generic fallback is kept.
  let title = "Sitemap";
  try {
    title = `${new URL(baseUrl).hostname} Sitemap`;
  } catch {
  }
  return {
    format: "sitemap",
    title,
    url: baseUrl,
    items
  };
}
3944
4370
 
3945
4371
  // src/metadata/feed-discovery/heuristics.ts
3946
4372
  var COMMON_FEED_PATHS = [
@@ -4247,7 +4673,7 @@ function extractAnalytics(doc) {
4247
4673
  function extractAssets(doc, baseUrl) {
4248
4674
  const metadata = {};
4249
4675
  const effectiveBaseUrl = getEffectiveBaseUrl2(doc, baseUrl);
4250
- const images = extractImages2(doc, effectiveBaseUrl);
4676
+ const images = extractImages3(doc, effectiveBaseUrl);
4251
4677
  if (images.length > 0) {
4252
4678
  metadata.images = images;
4253
4679
  }
@@ -4301,7 +4727,7 @@ function getEffectiveBaseUrl2(doc, baseUrl) {
4301
4727
  }
4302
4728
  return null;
4303
4729
  }
4304
- function extractImages2(doc, baseUrl) {
4730
+ function extractImages3(doc, baseUrl) {
4305
4731
  const urls = /* @__PURE__ */ new Set();
4306
4732
  const imgElements = doc.querySelectorAll("img[src]");
4307
4733
  for (const img of Array.from(imgElements)) {
@@ -4719,7 +5145,7 @@ function extractMonetization(doc) {
4719
5145
  }
4720
5146
 
4721
5147
  // src/metadata/news/extract.ts
4722
- function extractNews(doc) {
5148
+ function extractNews2(doc) {
4723
5149
  const metadata = {};
4724
5150
  const newsKeywords = getMetaContent(doc, "news_keywords");
4725
5151
  if (newsKeywords) {
@@ -6934,6 +7360,6 @@ async function swoop(url, init) {
6934
7360
  * @packageDocumentation
6935
7361
  */
6936
7362
 
6937
- export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
7363
+ export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews2 as extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
6938
7364
  //# sourceMappingURL=index.js.map
6939
7365
  //# sourceMappingURL=index.js.map