npm - magpie-html - Versions diffs - 0.1.5 → 0.2.1 - Mend

magpie-html 0.1.5 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md CHANGED Viewed

@@ -9,7 +9,7 @@
 [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D18-green?style=flat-square&logo=node.js)](https://nodejs.org/)
 [![Live Demo](https://img.shields.io/badge/Live_Demo-CrispRead-eb6864?style=flat-square&logo=rss&logoColor=white)](https://crispread.com)
-**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
+**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON, Sitemaps), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
 <div align="center">
   <img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
@@ -26,7 +26,7 @@
 - 🔒 **Type-safe** - Full TypeScript support
 - 🧪 **Well-tested** - Built with Node.js native test runner
 - 🚀 **Minimal dependencies** - Lightweight and fast
-- 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, and JSON Feed
+- 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, JSON Feed, and XML Sitemaps
 - 🔗 **Smart URL Resolution** - Automatic normalization to absolute URLs
 - 🛡️ **Error Resilient** - Graceful handling of malformed data
 - 🦅 **High-Level Convenience** - One-line functions for common tasks
@@ -199,6 +199,30 @@ console.log(result.feed.items[0].title);
 console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
 ```
+### Sitemap Parsing (Fallback)
+When standard feeds aren't available, XML sitemaps can be a useful fallback for discovering URLs. Supports standard sitemaps, sitemap indexes, and Google News/Image/Video extensions:
+```typescript
+import { pluck, parseSitemap, isSitemap } from "magpie-html";
+const response = await pluck("https://example.com/sitemap.xml");
+const content = await response.textUtf8();
+if (isSitemap(content)) {
+  const result = parseSitemap(content, response.finalUrl);
+  for (const url of result.sitemap.urls) {
+    console.log(url.loc); // URL
+    console.log(url.lastmod); // Last modified date
+    console.log(url.news?.title); // Google News title (if present)
+    console.log(url.news?.publicationDate); // Publication date
+  }
+  // For sitemap indexes, check result.sitemap.sitemaps[]
+}
+```
 ### Content Extraction
 ```typescript

package/dist/index.cjs CHANGED Viewed

@@ -1081,8 +1081,12 @@ function extractEntry(entryElement) {
 function removeComments(xml) {
   return xml.replace(/<!--[\s\S]*?-->/g, "");
 }
+function removeDoctype(xml) {
+  return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
+}
 function parseAtomXML(xml) {
-  const withoutComments = removeComments(xml);
+  const withoutDoctype = removeDoctype(xml);
+  const withoutComments = removeComments(withoutDoctype);
   const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
   const root = parseElement(cleanedXML, 0, null, cdataMap).element;
   return root;
@@ -1550,6 +1554,9 @@ function detectFormat(content) {
   if (cleaned.match(/<channel[\s>]/i)) {
     return "rss";
   }
+  if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
+    return "sitemap";
+  }
   return "unknown";
 }
 function isFeed(content) {
@@ -1887,12 +1894,20 @@ function parseRSSDate(dateString) {
 // src/feed/rss/xml-parser.ts
 function parseRSSXML(xml) {
   const cleaned = cleanXMLDeclaration(xml);
-  const root = parseElement2(cleaned, 0).element;
+  const withoutDoctype = removeDoctype2(cleaned);
+  const withoutComments = removeComments2(withoutDoctype);
+  const root = parseElement2(withoutComments, 0).element;
   return root;
 }
 function cleanXMLDeclaration(xml) {
   return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
 }
+function removeDoctype2(xml) {
+  return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
+}
+function removeComments2(xml) {
+  return xml.replace(/<!--[\s\S]*?-->/g, "");
+}
 function extractCDATA2(text) {
   const cdataMap = /* @__PURE__ */ new Map();
   let counter = 0;
@@ -2330,6 +2345,11 @@ function parseFeedAs(content, format, baseUrl) {
         original: jsonFeed
       };
     }
+    case "sitemap": {
+      throw new Error(
+        "Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
+      );
+    }
   }
 }
@@ -2411,6 +2431,10 @@ function detectEncoding(buffer, contentType) {
   }
   const preview = new Uint8Array(buffer.slice(0, 1024));
   const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
+  const xmlEncoding = parseCharsetFromXml(previewText);
+  if (xmlEncoding) {
+    return xmlEncoding;
+  }
   const metaEncoding = parseCharsetFromHtml(previewText);
   if (metaEncoding) {
     return metaEncoding;
@@ -2451,6 +2475,13 @@ function parseCharsetFromHtml(html) {
   }
   return null;
 }
+function parseCharsetFromXml(xml) {
+  const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
+  if (xmlDeclMatch) {
+    return normalizeEncoding(xmlDeclMatch[1]);
+  }
+  return null;
+}
 function normalizeEncoding(encoding) {
   const normalized = encoding.toLowerCase().trim();
   const aliases = {