npm - magpie-html - Versions diffs - 0.1.5 → 0.2.1 - Mend

magpie-html 0.1.5 → 0.2.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/index.d.cts CHANGED Viewed

@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
  * - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
  * - `'atom'` - Atom 1.0
  * - `'json-feed'` - JSON Feed 1.0 or 1.1
+ * - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
  * - `'unknown'` - Format could not be determined
  */
-type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
+type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
 /**
  * Detect feed format from content string.
  *

package/dist/index.d.ts CHANGED Viewed

@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
  * - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
  * - `'atom'` - Atom 1.0
  * - `'json-feed'` - JSON Feed 1.0 or 1.1
+ * - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
  * - `'unknown'` - Format could not be determined
  */
-type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
+type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
 /**
  * Detect feed format from content string.
  *

package/dist/index.js CHANGED Viewed

@@ -1075,8 +1075,12 @@ function extractEntry(entryElement) {
 function removeComments(xml) {
   return xml.replace(/<!--[\s\S]*?-->/g, "");
 }
+function removeDoctype(xml) {
+  return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
+}
 function parseAtomXML(xml) {
-  const withoutComments = removeComments(xml);
+  const withoutDoctype = removeDoctype(xml);
+  const withoutComments = removeComments(withoutDoctype);
   const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
   const root = parseElement(cleanedXML, 0, null, cdataMap).element;
   return root;
@@ -1544,6 +1548,9 @@ function detectFormat(content) {
   if (cleaned.match(/<channel[\s>]/i)) {
     return "rss";
   }
+  if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
+    return "sitemap";
+  }
   return "unknown";
 }
 function isFeed(content) {
@@ -1881,12 +1888,20 @@ function parseRSSDate(dateString) {
 // src/feed/rss/xml-parser.ts
 function parseRSSXML(xml) {
   const cleaned = cleanXMLDeclaration(xml);
-  const root = parseElement2(cleaned, 0).element;
+  const withoutDoctype = removeDoctype2(cleaned);
+  const withoutComments = removeComments2(withoutDoctype);
+  const root = parseElement2(withoutComments, 0).element;
   return root;
 }
 function cleanXMLDeclaration(xml) {
   return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
 }
+function removeDoctype2(xml) {
+  return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
+}
+function removeComments2(xml) {
+  return xml.replace(/<!--[\s\S]*?-->/g, "");
+}
 function extractCDATA2(text) {
   const cdataMap = /* @__PURE__ */ new Map();
   let counter = 0;
@@ -2324,6 +2339,11 @@ function parseFeedAs(content, format, baseUrl) {
         original: jsonFeed
       };
     }
+    case "sitemap": {
+      throw new Error(
+        "Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
+      );
+    }
   }
 }
@@ -2405,6 +2425,10 @@ function detectEncoding(buffer, contentType) {
   }
   const preview = new Uint8Array(buffer.slice(0, 1024));
   const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
+  const xmlEncoding = parseCharsetFromXml(previewText);
+  if (xmlEncoding) {
+    return xmlEncoding;
+  }
   const metaEncoding = parseCharsetFromHtml(previewText);
   if (metaEncoding) {
     return metaEncoding;
@@ -2445,6 +2469,13 @@ function parseCharsetFromHtml(html) {
   }
   return null;
 }
+function parseCharsetFromXml(xml) {
+  const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
+  if (xmlDeclMatch) {
+    return normalizeEncoding(xmlDeclMatch[1]);
+  }
+  return null;
+}
 function normalizeEncoding(encoding) {
   const normalized = encoding.toLowerCase().trim();
   const aliases = {