npm - magpie-html - Versions diffs - 0.2.0 → 0.2.2 - Mend

magpie-html 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.cjs CHANGED Viewed

@@ -1081,8 +1081,12 @@ function extractEntry(entryElement) {
 /**
  * Strips every XML comment (`<!-- ... -->`, possibly spanning lines) from the input.
  *
  * @param {string} xml - Raw XML text.
  * @returns {string} The text with all comments removed.
  */
 function removeComments(xml) {
   const commentPattern = /<!--[\s\S]*?-->/g;
   const stripped = xml.replace(commentPattern, "");
   return stripped;
 }
/**
 * Removes DOCTYPE declarations from an XML string.
 *
 * A declaration may carry an internal subset (`<!DOCTYPE x [ ... ]>`) whose
 * markup contains `>` characters, so the bracketed form is tried first. The
 * second alternative is the plain "up to the first `>`" form, kept as a
 * fallback for declarations without a (closed) subset.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The text without DOCTYPE declarations.
 */
function removeDoctype(xml) {
  return xml.replace(/<!DOCTYPE(?:[^>\[]*\[[\s\S]*?\]\s*>|[^>]*>)/gi, "");
}
 function parseAtomXML(xml) {
-  const withoutComments = removeComments(xml);
+  const withoutDoctype = removeDoctype(xml);
+  const withoutComments = removeComments(withoutDoctype);
   const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
   const root = parseElement(cleanedXML, 0, null, cdataMap).element;
   return root;
@@ -1890,13 +1894,17 @@ function parseRSSDate(dateString) {
 // src/feed/rss/xml-parser.ts
 function parseRSSXML(xml) {
   const cleaned = cleanXMLDeclaration(xml);
-  const withoutComments = removeComments2(cleaned);
+  const withoutDoctype = removeDoctype2(cleaned);
+  const withoutComments = removeComments2(withoutDoctype);
   const root = parseElement2(withoutComments, 0).element;
   return root;
 }
 /**
  * Drops `<?xml ... ?>` processing instructions and trims surrounding whitespace.
  *
  * @param {string} xml - Raw XML text.
  * @returns {string} Trimmed text without the matched instructions.
  */
 function cleanXMLDeclaration(xml) {
   const declarationPattern = /<\?xml[^?]*\?>/g;
   const withoutDeclaration = xml.replace(declarationPattern, "");
   return withoutDeclaration.trim();
 }
/**
 * Removes DOCTYPE declarations from an XML string.
 *
 * The first alternative consumes a declaration with an internal subset
 * (`<!DOCTYPE x [ ... ]>`), which may itself contain `>`; the second is the
 * plain "up to the first `>`" form used when no closed subset is present.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The text without DOCTYPE declarations.
 */
function removeDoctype2(xml) {
  return xml.replace(/<!DOCTYPE(?:[^>\[]*\[[\s\S]*?\]\s*>|[^>]*>)/gi, "");
}
 /**
  * Strips every XML comment (`<!-- ... -->`, possibly spanning lines) from the input.
  *
  * @param {string} xml - XML text.
  * @returns {string} The text with all comments removed.
  */
 function removeComments2(xml) {
   const commentPattern = /<!--[\s\S]*?-->/g;
   const stripped = xml.replace(commentPattern, "");
   return stripped;
 }
@@ -2345,6 +2353,378 @@ function parseFeedAs(content, format, baseUrl) {
   }
 }
+// src/feed/sitemap/xml-parser.ts
/**
 * Parses sitemap XML text into an element tree.
 *
 * The text is passed through the XML-declaration, DOCTYPE and comment
 * strippers, in that order, before the first element is parsed.
 *
 * @param {string} xml - Raw sitemap XML.
 * @returns {object} The root element produced by `parseElement3`.
 */
function parseSitemapXML(xml) {
  const stages = [cleanXMLDeclaration2, removeDoctype3, removeComments3];
  let prepared = xml;
  for (const stage of stages) {
    prepared = stage(prepared);
  }
  const { element } = parseElement3(prepared, 0);
  return element;
}
/**
 * Drops `<?xml ... ?>` processing instructions and trims surrounding whitespace.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} Trimmed text without the matched instructions.
 */
function cleanXMLDeclaration2(xml) {
  const declarationPattern = /<\?xml[^?]*\?>/g;
  const withoutDeclaration = xml.replace(declarationPattern, "");
  return withoutDeclaration.trim();
}
/**
 * Removes DOCTYPE declarations from an XML string.
 *
 * The first alternative consumes a declaration with an internal subset
 * (`<!DOCTYPE x [ ... ]>`), which may itself contain `>`; the second is the
 * plain "up to the first `>`" form used when no closed subset is present.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The text without DOCTYPE declarations.
 */
function removeDoctype3(xml) {
  return xml.replace(/<!DOCTYPE(?:[^>\[]*\[[\s\S]*?\]\s*>|[^>]*>)/gi, "");
}
/**
 * Strips every XML comment (`<!-- ... -->`, possibly spanning lines) from the input.
 *
 * @param {string} xml - XML text.
 * @returns {string} The text with all comments removed.
 */
function removeComments3(xml) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  const stripped = xml.replace(commentPattern, "");
  return stripped;
}
/**
 * Replaces each `<![CDATA[...]]>` section with a `__CDATA_<n>__` placeholder.
 *
 * Placeholders are numbered from 0 in order of appearance; the raw section
 * content is stored in the returned map under its placeholder.
 *
 * @param {string} text - XML text that may contain CDATA sections.
 * @returns {{ text: string, cdataMap: Map<string, string> }} The rewritten
 *   text and the placeholder-to-content map.
 */
function extractCDATA3(text) {
  const cdataMap = new Map();
  const swapForPlaceholder = (_match, content) => {
    // Keys are unique, so the map size doubles as the running counter.
    const placeholder = `__CDATA_${cdataMap.size}__`;
    cdataMap.set(placeholder, content);
    return placeholder;
  };
  const processed = text.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, swapForPlaceholder);
  return { text: processed, cdataMap };
}
/**
 * Substitutes CDATA placeholders in `text` with their original content.
 *
 * For each map entry, the first occurrence of the placeholder is replaced.
 *
 * @param {string} text - Text containing placeholders.
 * @param {Map<string, string>} cdataMap - Placeholder-to-content map.
 * @returns {string} The text with placeholders replaced.
 */
function restoreCDATA3(text, cdataMap) {
  let result = text;
  for (const [placeholder, content] of cdataMap) {
    // A function replacer inserts the content verbatim; a string replacement
    // would expand "$&", "$'" or "$$" found inside the CDATA payload.
    result = result.replace(placeholder, () => content);
  }
  return result;
}
/**
 * Extracts `name="value"` / `name='value'` pairs from the inside of a tag.
 *
 * A value ends at the same quote character that opened it, so a double-quoted
 * value may contain apostrophes and a single-quoted value may contain double
 * quotes.
 *
 * @param {string} tagContent - Tag text following the tag name.
 * @returns {Object<string, string>} Attribute names mapped to raw values.
 */
function parseAttributes3(tagContent) {
  const attributes = {};
  const attrRegex = /(\S+)=(?:"([^"]*)"|'([^']*)')/g;
  for (const match of tagContent.matchAll(attrRegex)) {
    // Exactly one of the two value groups participates in a match.
    attributes[match[1]] = match[2] ?? match[3];
  }
  return attributes;
}
/**
 * Finds the closing tag that matches an already-opened `tagName` element.
 *
 * Nested elements with the same name are tracked with a depth counter. An
 * opening-tag candidate only counts when the name is followed by whitespace,
 * `/` or `>` (so `<a` does not match `<ab>`), and a self-closing nested tag
 * (`<a/>`) does not raise the depth because it has no closing tag of its own.
 *
 * @param {string} xml - Text to scan.
 * @param {string} tagName - Name of the element whose end is wanted.
 * @param {number} startPos - Index just past the element's opening tag.
 * @returns {number} Index of the matching `</tagName>`, or -1 if none.
 */
function findClosingTag3(xml, tagName, startPos) {
  const openTag = `<${tagName}`;
  const closeTag = `</${tagName}>`;
  const endsTagName = /[\s\/>]/;
  let depth = 1;
  let pos = startPos;
  while (pos < xml.length) {
    const nextClose = xml.indexOf(closeTag, pos);
    if (nextClose === -1) {
      return -1;
    }
    // Skip candidates that are only a prefix of a longer tag name.
    let nextOpen = xml.indexOf(openTag, pos);
    while (
      nextOpen !== -1 &&
      nextOpen < nextClose &&
      !endsTagName.test(xml.charAt(nextOpen + openTag.length))
    ) {
      nextOpen = xml.indexOf(openTag, nextOpen + 1);
    }
    if (nextOpen !== -1 && nextOpen < nextClose) {
      const tagEnd = xml.indexOf(">", nextOpen);
      const isSelfClosing = tagEnd !== -1 && xml.charAt(tagEnd - 1) === "/";
      if (!isSelfClosing) {
        depth++;
      }
      pos = nextOpen + openTag.length;
    } else {
      depth--;
      if (depth === 0) {
        return nextClose;
      }
      pos = nextClose + closeTag.length;
    }
  }
  return -1;
}
/**
 * Parses the first element found at or after `startPos` into a node of the
 * form `{ tagName, attributes, text, children, parent }`.
 *
 * When no `cdataMap` is supplied, CDATA sections are first swapped for
 * placeholders via `extractCDATA3`; recursive calls pass the map along so the
 * extraction runs once. Placeholders are restored in each node's `text`.
 *
 * @param {string} xml - Text to parse.
 * @param {number} startPos - Index to start searching for `<`.
 * @param {object|null} [parent=null] - Node recorded as the new node's parent.
 * @param {Map<string, string>} [cdataMap] - Existing placeholder map, if any.
 * @returns {{ element: object, endPos: number, cdataMap: Map<string, string> }}
 *   The node, the index just past it, and the placeholder map in use.
 * @throws {Error} If no opening tag, no `>` for it, or no closing tag is found.
 */
function parseElement3(xml, startPos, parent = null, cdataMap) {
  const extracted = cdataMap ? { text: xml, cdataMap } : extractCDATA3(xml);
  const cleanedXML = extracted.text;
  const currentCdataMap = extracted.cdataMap;
  const openTagStart = cleanedXML.indexOf("<", startPos);
  if (openTagStart === -1) {
    throw new Error("No opening tag found");
  }
  const openTagEnd = cleanedXML.indexOf(">", openTagStart);
  if (openTagEnd === -1) {
    throw new Error("Unclosed opening tag");
  }
  const openTagContent = cleanedXML.substring(openTagStart + 1, openTagEnd);
  const isSelfClosing = openTagContent.endsWith("/");
  const tagContent = isSelfClosing ? openTagContent.slice(0, -1).trim() : openTagContent;
  // The name ends at the first whitespace of any kind: attributes are often
  // placed on the following line, separated by a newline rather than a space.
  const nameEnd = tagContent.search(/\s/);
  const tagName = nameEnd === -1 ? tagContent : tagContent.substring(0, nameEnd);
  const attributes = nameEnd === -1 ? {} : parseAttributes3(tagContent.substring(nameEnd));
  const element = {
    tagName,
    attributes,
    text: "",
    children: [],
    parent
  };
  if (isSelfClosing) {
    return { element, endPos: openTagEnd + 1, cdataMap: currentCdataMap };
  }
  const closingTagPos = findClosingTag3(cleanedXML, tagName, openTagEnd + 1);
  if (closingTagPos === -1) {
    throw new Error(`No closing tag found for <${tagName}>`);
  }
  const content = cleanedXML.substring(openTagEnd + 1, closingTagPos);
  if (content.includes("<")) {
    let pos = 0;
    const trimmedContent = content.trim();
    while (pos < trimmedContent.length) {
      const nextTag = trimmedContent.indexOf("<", pos);
      if (nextTag === -1) break;
      // Closing tags and "<!" constructs are not child elements; step past them.
      if (trimmedContent[nextTag + 1] === "/" || trimmedContent[nextTag + 1] === "!") {
        pos = nextTag + 1;
        continue;
      }
      try {
        const { element: child, endPos } = parseElement3(
          trimmedContent,
          nextTag,
          element,
          currentCdataMap
        );
        element.children.push(child);
        pos = endPos;
      } catch {
        // Best effort: a child that fails to parse is skipped and scanning
        // resumes just after its "<".
        pos = nextTag + 1;
      }
    }
    // The node's own text is the content with all markup stripped.
    let textContent = content.replace(/<[^>]+>/g, "").trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  } else {
    let textContent = content.trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  }
  const closingTagEnd = closingTagPos + `</${tagName}>`.length;
  return { element, endPos: closingTagEnd, cdataMap: currentCdataMap };
}
/**
 * Returns the first node, in depth-first pre-order starting with `element`
 * itself, whose tag name equals `selector`.
 *
 * @param {object} element - Node to start from.
 * @param {string} selector - Tag name to look for.
 * @param {boolean} [caseSensitive=false] - Compare names without lowercasing.
 * @returns {object|null} The matching node, or null when there is none.
 */
function querySelector3(element, selector, caseSensitive = false) {
  const normalize = (name) => (caseSensitive ? name : name.toLowerCase());
  if (normalize(element.tagName) === normalize(selector)) {
    return element;
  }
  for (const child of element.children) {
    const match = querySelector3(child, selector, caseSensitive);
    if (match) {
      return match;
    }
  }
  return null;
}
/**
 * Collects every node, in depth-first pre-order starting with `element`
 * itself, whose tag name equals `selector`.
 *
 * @param {object} element - Node to start from.
 * @param {string} selector - Tag name to look for.
 * @param {boolean} [caseSensitive=false] - Compare names without lowercasing.
 * @returns {object[]} Matching nodes in document order (possibly empty).
 */
function querySelectorAll3(element, selector, caseSensitive = false) {
  const wanted = caseSensitive ? selector : selector.toLowerCase();
  const matches = [];
  const visit = (node) => {
    const name = caseSensitive ? node.tagName : node.tagName.toLowerCase();
    if (name === wanted) {
      matches.push(node);
    }
    for (const child of node.children) {
      visit(child);
    }
  };
  visit(element);
  return matches;
}
/**
 * Reads a node's `text`, falling back to an empty string.
 *
 * @param {object|null|undefined} element - Node, or nothing.
 * @returns {string} The node's text, or "" when the node is null/undefined
 *   or its text is falsy.
 */
function getText2(element) {
  if (element === null || element === undefined) {
    return "";
  }
  const { text } = element;
  return text ? text : "";
}
/**
 * Returns the first direct child whose tag name matches, ignoring case.
 *
 * @param {object} element - Parent node.
 * @param {string} tagName - Tag name to look for.
 * @returns {object|null} The first matching child, or null.
 */
function getChild(element, tagName) {
  const wanted = tagName.toLowerCase();
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      return child;
    }
  }
  return null;
}
/**
 * Returns every direct child whose tag name matches, ignoring case.
 *
 * @param {object} element - Parent node.
 * @param {string} tagName - Tag name to look for.
 * @returns {object[]} Matching children in their original order.
 */
function getChildren(element, tagName) {
  const wanted = tagName.toLowerCase();
  const matches = [];
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      matches.push(child);
    }
  }
  return matches;
}
+// src/feed/sitemap/parse.ts
/**
 * Parses sitemap XML into `{ sitemap, isIndex }`.
 *
 * Lookup order: a `sitemapindex` element, then a `urlset` element, then any
 * `url` elements anywhere in the tree. If none are present the result is an
 * empty urlset.
 *
 * @param {string} xml - Raw sitemap XML.
 * @param {string} [baseUrl] - Base URL forwarded to the URL extractors.
 * @returns {{ sitemap: { type: string, urls: object[], sitemaps: object[] }, isIndex: boolean }}
 */
function parseSitemap(xml, baseUrl) {
  const doc = parseSitemapXML(xml);

  const indexRoot = querySelector3(doc, "sitemapindex");
  if (indexRoot) {
    return parseSitemapIndex(indexRoot, baseUrl);
  }

  const urlsetRoot = querySelector3(doc, "urlset");
  if (urlsetRoot) {
    return parseUrlset(urlsetRoot, baseUrl);
  }

  // No recognised root: fall back to whatever <url> nodes exist (maybe none).
  const looseUrls = querySelectorAll3(doc, "url").map((url) => extractUrl(url, baseUrl));
  return {
    sitemap: {
      type: "urlset",
      urls: looseUrls,
      sitemaps: []
    },
    isIndex: false
  };
}
/**
 * Builds a sitemap-index result from a `sitemapindex` element.
 *
 * Each direct `sitemap` child contributes `{ loc, lastmod }`. The `loc` text
 * is entity-decoded (as `extractUrl` does for URL entries) so that `&amp;` in
 * query strings becomes `&`, then resolved against `baseUrl` when one is given.
 *
 * @param {object} element - The `sitemapindex` node.
 * @param {string} [baseUrl] - Base URL passed to `normalizeUrlHttps`.
 * @returns {{ sitemap: { type: string, urls: object[], sitemaps: object[] }, isIndex: boolean }}
 */
function parseSitemapIndex(element, baseUrl) {
  const sitemapElements = getChildren(element, "sitemap");
  const sitemaps = sitemapElements.map((el) => {
    const loc = decodeXmlEntities(getText2(getChild(el, "loc")));
    const lastmod = getText2(getChild(el, "lastmod")) || void 0;
    return {
      loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
      lastmod
    };
  });
  return {
    sitemap: {
      type: "sitemapindex",
      urls: [],
      sitemaps
    },
    isIndex: true
  };
}
/**
 * Builds a urlset result from a `urlset` element by extracting each direct
 * `url` child.
 *
 * @param {object} element - The `urlset` node.
 * @param {string} [baseUrl] - Base URL forwarded to `extractUrl`.
 * @returns {{ sitemap: { type: string, urls: object[], sitemaps: object[] }, isIndex: boolean }}
 */
function parseUrlset(element, baseUrl) {
  const urls = [];
  for (const urlElement of getChildren(element, "url")) {
    urls.push(extractUrl(urlElement, baseUrl));
  }
  const sitemap = { type: "urlset", urls, sitemaps: [] };
  return { sitemap, isIndex: false };
}
/**
 * Converts a `url` element into a plain entry object.
 *
 * `loc` is entity-decoded and, when `baseUrl` is given, passed through
 * `normalizeUrlHttps`. `lastmod` and `changefreq` are undefined when empty.
 * `priority` is the parsed number, or undefined when the text is empty or not
 * numeric; a priority of 0 is kept. `news`, `images` and `videos` are attached
 * only when the corresponding extractor returns something.
 *
 * @param {object} element - The `url` node.
 * @param {string} [baseUrl] - Base URL for resolving locations.
 * @returns {object} The URL entry.
 */
function extractUrl(element, baseUrl) {
  const rawLoc = getText2(getChild(element, "loc"));
  const loc = decodeXmlEntities(rawLoc);
  const lastmod = getText2(getChild(element, "lastmod")) || void 0;
  const changefreq = getText2(getChild(element, "changefreq")) || void 0;
  const priorityText = getText2(getChild(element, "priority"));
  const priority = priorityText ? Number.parseFloat(priorityText) : void 0;
  const result = {
    loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
    lastmod,
    changefreq,
    // Test for "no usable number" explicitly; a truthiness test would also
    // discard 0, which is a legitimate priority.
    priority: priority === void 0 || Number.isNaN(priority) ? void 0 : priority
  };
  const news = extractNews(element);
  if (news) {
    result.news = news;
  }
  const images = extractImages(element, baseUrl);
  if (images.length > 0) {
    result.images = images;
  }
  const videos = extractVideos(element, baseUrl);
  if (videos.length > 0) {
    result.videos = videos;
  }
  return result;
}
/**
 * Extracts news metadata from a `url` element.
 *
 * Every lookup tries the `news:`-prefixed tag, then the bare tag, then any
 * child whose tag ends with `:<name>`. Fields are set only when non-empty;
 * the title is entity-decoded, and keywords / stock tickers are split on
 * commas and trimmed.
 *
 * @param {object} urlElement - The `url` node.
 * @returns {object|undefined} The news object, or undefined when there is no
 *   news element or no field was populated.
 */
function extractNews(urlElement) {
  // First direct child matching news:<name>, <name>, or *:<name>.
  const childOf = (parent, name) =>
    getChild(parent, `news:${name}`) ||
    getChild(parent, name) ||
    parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${name}`));
  // First non-empty text among the same three candidates.
  const textOf = (parent, name) =>
    getText2(getChild(parent, `news:${name}`)) ||
    getText2(getChild(parent, name)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${name}`)));
  const splitList = (value) => value.split(",").map((part) => part.trim());

  const newsEl = childOf(urlElement, "news");
  if (!newsEl) {
    return void 0;
  }

  const news = {};

  const pubEl = childOf(newsEl, "publication");
  if (pubEl) {
    const name = textOf(pubEl, "name");
    const language = textOf(pubEl, "language");
    if (name || language) {
      news.publication = {
        name: name || void 0,
        language: language || void 0
      };
    }
  }

  const pubDate = textOf(newsEl, "publication_date");
  if (pubDate) {
    news.publicationDate = pubDate;
  }

  const title = textOf(newsEl, "title");
  if (title) {
    news.title = decodeXmlEntities(title);
  }

  const keywords = textOf(newsEl, "keywords");
  if (keywords) {
    news.keywords = splitList(keywords);
  }

  const stockTickers = textOf(newsEl, "stock_tickers");
  if (stockTickers) {
    news.stockTickers = splitList(stockTickers);
  }

  return Object.keys(news).length > 0 ? news : void 0;
}
/**
 * Extracts image entries from a `url` element.
 *
 * Image elements are direct children tagged `image` or `<prefix>:image`.
 * Each field is looked up as `image:<name>`, then `<name>`, then any child
 * ending in `:<name>`. Entries without a `loc` are skipped. URL fields
 * (`loc`, `license`) are entity-decoded — matching what `extractUrl` does for
 * its own `loc` — and resolved against `baseUrl` when one is given; `caption`
 * and `title` are entity-decoded.
 *
 * @param {object} urlElement - The `url` node.
 * @param {string} [baseUrl] - Base URL for resolving locations.
 * @returns {object[]} Image entries (possibly empty).
 */
function extractImages(urlElement, baseUrl) {
  const readField = (parent, name) =>
    getText2(getChild(parent, `image:${name}`)) ||
    getText2(getChild(parent, name)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${name}`)));
  const resolveUrl = (rawUrl) => {
    const decoded = decodeXmlEntities(rawUrl);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };

  const images = [];
  for (const imgEl of urlElement.children) {
    const tag = imgEl.tagName.toLowerCase();
    // "image:image" is covered by the ":image" suffix test.
    if (tag !== "image" && !tag.endsWith(":image")) continue;

    const loc = readField(imgEl, "loc");
    if (!loc) continue;
    const image = { loc: resolveUrl(loc) };

    const caption = readField(imgEl, "caption");
    if (caption) image.caption = decodeXmlEntities(caption);
    const geoLocation = readField(imgEl, "geo_location");
    if (geoLocation) image.geoLocation = geoLocation;
    const title = readField(imgEl, "title");
    if (title) image.title = decodeXmlEntities(title);
    const license = readField(imgEl, "license");
    if (license) image.license = resolveUrl(license);

    images.push(image);
  }
  return images;
}
/**
 * Extracts video entries from a `url` element.
 *
 * Video elements are direct children tagged `video` or `<prefix>:video`.
 * Each field is looked up as `video:<name>`, then `<name>`, then any child
 * ending in `:<name>`. An entry is skipped unless it has a thumbnail
 * location, a title and a description. URL fields (`thumbnail_loc`,
 * `content_loc`, `player_loc`) are entity-decoded — matching what
 * `extractUrl` does for its own `loc` — and resolved against `baseUrl` when
 * one is given. Numeric fields are set only when they parse to a number.
 *
 * @param {object} urlElement - The `url` node.
 * @param {string} [baseUrl] - Base URL for resolving locations.
 * @returns {object[]} Video entries (possibly empty).
 */
function extractVideos(urlElement, baseUrl) {
  const readField = (parent, name) =>
    getText2(getChild(parent, `video:${name}`)) ||
    getText2(getChild(parent, name)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${name}`)));
  const resolveUrl = (rawUrl) => {
    const decoded = decodeXmlEntities(rawUrl);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };
  // "video:<name>" is covered by the ":<name>" suffix test.
  const isTagged = (node, name) => {
    const tag = node.tagName.toLowerCase();
    return tag === name || tag.endsWith(`:${name}`);
  };

  const videos = [];
  for (const vidEl of urlElement.children) {
    if (!isTagged(vidEl, "video")) continue;

    const thumbnailLoc = readField(vidEl, "thumbnail_loc");
    const title = readField(vidEl, "title");
    const description = readField(vidEl, "description");
    if (!thumbnailLoc || !title || !description) continue;

    const video = {
      thumbnailLoc: resolveUrl(thumbnailLoc),
      title: decodeXmlEntities(title),
      description: decodeXmlEntities(description)
    };

    const contentLoc = readField(vidEl, "content_loc");
    if (contentLoc) video.contentLoc = resolveUrl(contentLoc);
    const playerLoc = readField(vidEl, "player_loc");
    if (playerLoc) video.playerLoc = resolveUrl(playerLoc);

    const duration = readField(vidEl, "duration");
    if (duration) {
      const dur = Number.parseInt(duration, 10);
      if (!Number.isNaN(dur)) video.duration = dur;
    }
    const rating = readField(vidEl, "rating");
    if (rating) {
      const r = Number.parseFloat(rating);
      if (!Number.isNaN(r)) video.rating = r;
    }
    const viewCount = readField(vidEl, "view_count");
    if (viewCount) {
      const vc = Number.parseInt(viewCount, 10);
      if (!Number.isNaN(vc)) video.viewCount = vc;
    }

    const publicationDate = readField(vidEl, "publication_date");
    if (publicationDate) video.publicationDate = publicationDate;
    const familyFriendly = readField(vidEl, "family_friendly");
    if (familyFriendly) {
      video.familyFriendly = familyFriendly.toLowerCase() === "yes";
    }
    const category = readField(vidEl, "category");
    if (category) video.category = category;

    const tagElements = vidEl.children.filter((c) => isTagged(c, "tag"));
    if (tagElements.length > 0) {
      video.tags = tagElements.map((t) => getText2(t)).filter(Boolean);
    }

    videos.push(video);
  }
  return videos;
}
/**
 * Decodes the five predefined XML entities and numeric character references
 * (`&#NN;`, `&#xHH;`).
 *
 * Decoding happens in a single pass, so the output of one substitution is
 * never decoded again (`&amp;quot;` yields the text `&quot;`, not `"`).
 * Numeric references use `String.fromCodePoint`, so code points above U+FFFF
 * are produced correctly; references beyond U+10FFFF are left untouched, as
 * are unknown named entities.
 *
 * @param {string} text - Text containing XML entities.
 * @returns {string} The decoded text.
 */
function decodeXmlEntities(text) {
  const named = { lt: "<", gt: ">", amp: "&", quot: '"', apos: "'" };
  return text.replace(
    /&(?:(lt|gt|amp|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g,
    (match, name, dec, hex) => {
      if (name) {
        return named[name];
      }
      const codePoint = dec ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    }
  );
}
 // src/pluck/types.ts
 var PluckError = class extends Error {
   constructor(message) {
@@ -2773,7 +3153,7 @@ function extractOpenGraph(doc) {
   if (Object.keys(audio).length > 0) {
     metadata.audio = audio;
   }
-  const images = extractImages(doc);
+  const images = extractImages2(doc);
   if (images.length > 0) {
     metadata.images = images;
   }
@@ -2842,7 +3222,7 @@ function extractAudio(doc) {
     Object.entries(audio).filter(([_, value]) => value !== void 0)
   );
 }
-function extractImages(doc) {
+function extractImages2(doc) {
   const images = [];
   const imageUrls = getAllMetaPropertyValues(doc, "og:image");
   const imageSecureUrls = getAllMetaPropertyValues(doc, "og:image:secure_url");
@@ -3936,9 +4316,63 @@ async function gatherFeed(url) {
   }
   const response = await pluck(feedUrl);
   const content = await response.textUtf8();
+  const format = detectFormat(content);
+  if (format === "sitemap") {
+    return normalizeSitemapToFeed(content, response.finalUrl);
+  }
   const result = parseFeed(content, response.finalUrl);
   return result.feed;
 }
/**
 * Parses sitemap content and reshapes it into a feed-like object with
 * `format: "sitemap"`.
 *
 * A sitemap index yields one item per child sitemap under the fixed title
 * "Sitemap Index". A urlset yields one item per URL; news metadata supplies
 * title, published date, authors and tags, and the first image becomes
 * `image`. The urlset title is "<hostname> Sitemap" when `baseUrl` parses as
 * a URL, otherwise "Sitemap".
 *
 * @param {string} content - Raw sitemap XML.
 * @param {string} baseUrl - URL the sitemap was fetched from.
 * @returns {{ format: string, title: string, url: string, items: object[] }}
 */
function normalizeSitemapToFeed(content, baseUrl) {
  const { isIndex, sitemap } = parseSitemap(content, baseUrl);

  if (isIndex) {
    const indexItems = sitemap.sitemaps.map(({ loc, lastmod }, index) => ({
      id: loc || `sitemap-${index}`,
      url: loc,
      title: `Sitemap: ${loc}`,
      modified: lastmod
    }));
    return { format: "sitemap", title: "Sitemap Index", url: baseUrl, items: indexItems };
  }

  const items = sitemap.urls.map((entry, index) => {
    const item = {
      id: entry.loc || `url-${index}`,
      url: entry.loc,
      modified: entry.lastmod
    };
    const { news, images } = entry;
    if (news) {
      item.title = news.title;
      item.published = news.publicationDate;
      if (news.publication?.name) {
        item.authors = [{ name: news.publication.name }];
      }
      if (news.keywords) {
        item.tags = news.keywords;
      }
    }
    if (images && images.length > 0) {
      item.image = images[0].loc;
    }
    return item;
  });

  let title;
  try {
    title = `${new URL(baseUrl).hostname} Sitemap`;
  } catch {
    // baseUrl is not a parseable URL; use the generic title.
    title = "Sitemap";
  }

  return { format: "sitemap", title, url: baseUrl, items };
}
 // src/metadata/feed-discovery/heuristics.ts
 var COMMON_FEED_PATHS = [
@@ -4245,7 +4679,7 @@ function extractAnalytics(doc) {
 function extractAssets(doc, baseUrl) {
   const metadata = {};
   const effectiveBaseUrl = getEffectiveBaseUrl2(doc, baseUrl);
-  const images = extractImages2(doc, effectiveBaseUrl);
+  const images = extractImages3(doc, effectiveBaseUrl);
   if (images.length > 0) {
     metadata.images = images;
   }
@@ -4299,7 +4733,7 @@ function getEffectiveBaseUrl2(doc, baseUrl) {
   }
   return null;
 }
-function extractImages2(doc, baseUrl) {
+function extractImages3(doc, baseUrl) {
   const urls = /* @__PURE__ */ new Set();
   const imgElements = doc.querySelectorAll("img[src]");
   for (const img of Array.from(imgElements)) {
@@ -4717,7 +5151,7 @@ function extractMonetization(doc) {
 }
 // src/metadata/news/extract.ts
-function extractNews(doc) {
+function extractNews2(doc) {
   const metadata = {};
   const newsKeywords = getMetaContent(doc, "news_keywords");
   if (newsKeywords) {
@@ -6961,7 +7395,7 @@ exports.extractIcons = extractIcons;
 exports.extractLanguage = extractLanguage;
 exports.extractLinks = extractLinks3;
 exports.extractMonetization = extractMonetization;
-exports.extractNews = extractNews;
+exports.extractNews = extractNews2;
 exports.extractOpenGraph = extractOpenGraph;
 exports.extractPagination = extractPagination;
 exports.extractRobots = extractRobots;