magpie-html 0.1.5 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,7 +9,7 @@
9
9
  [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D18-green?style=flat-square&logo=node.js)](https://nodejs.org/)
10
10
  [![Live Demo](https://img.shields.io/badge/Live_Demo-CrispRead-eb6864?style=flat-square&logo=rss&logoColor=white)](https://crispread.com)
11
11
 
12
- **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
12
+ **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON, Sitemaps), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
13
13
 
14
14
  <div align="center">
15
15
  <img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
@@ -26,7 +26,7 @@
26
26
  - 🔒 **Type-safe** - Full TypeScript support
27
27
  - 🧪 **Well-tested** - Built with Node.js native test runner
28
28
  - 🚀 **Minimal dependencies** - Lightweight and fast
29
- - 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, and JSON Feed
29
+ - 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, JSON Feed, and XML Sitemaps
30
30
  - 🔗 **Smart URL Resolution** - Automatic normalization to absolute URLs
31
31
  - ๐Ÿ›ก๏ธ **Error Resilient** - Graceful handling of malformed data
32
32
  - 🦅 **High-Level Convenience** - One-line functions for common tasks
@@ -199,6 +199,30 @@ console.log(result.feed.items[0].title);
199
199
  console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
200
200
  ```
201
201
 
202
+ ### Sitemap Parsing (Fallback)
203
+
204
+ When standard feeds aren't available, XML sitemaps can be a useful fallback for discovering URLs. Supports standard sitemaps, sitemap indexes, and Google News/Image/Video extensions:
205
+
206
+ ```typescript
207
+ import { pluck, parseSitemap, isSitemap } from "magpie-html";
208
+
209
+ const response = await pluck("https://example.com/sitemap.xml");
210
+ const content = await response.textUtf8();
211
+
212
+ if (isSitemap(content)) {
213
+ const result = parseSitemap(content, response.finalUrl);
214
+
215
+ for (const url of result.sitemap.urls) {
216
+ console.log(url.loc); // URL
217
+ console.log(url.lastmod); // Last modified date
218
+ console.log(url.news?.title); // Google News title (if present)
219
+ console.log(url.news?.publicationDate); // Publication date
220
+ }
221
+
222
+ // For sitemap indexes, check result.sitemap.sitemaps[]
223
+ }
224
+ ```
225
+
202
226
  ### Content Extraction
203
227
 
204
228
  ```typescript
package/dist/index.cjs CHANGED
@@ -1081,8 +1081,12 @@ function extractEntry(entryElement) {
1081
1081
  function removeComments(xml) {
1082
1082
  return xml.replace(/<!--[\s\S]*?-->/g, "");
1083
1083
  }
1084
+ function removeDoctype(xml) {
1085
+ return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
1086
+ }
1084
1087
  function parseAtomXML(xml) {
1085
- const withoutComments = removeComments(xml);
1088
+ const withoutDoctype = removeDoctype(xml);
1089
+ const withoutComments = removeComments(withoutDoctype);
1086
1090
  const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
1087
1091
  const root = parseElement(cleanedXML, 0, null, cdataMap).element;
1088
1092
  return root;
@@ -1550,6 +1554,9 @@ function detectFormat(content) {
1550
1554
  if (cleaned.match(/<channel[\s>]/i)) {
1551
1555
  return "rss";
1552
1556
  }
1557
+ if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
1558
+ return "sitemap";
1559
+ }
1553
1560
  return "unknown";
1554
1561
  }
1555
1562
  function isFeed(content) {
@@ -1887,12 +1894,20 @@ function parseRSSDate(dateString) {
1887
1894
  // src/feed/rss/xml-parser.ts
1888
1895
  function parseRSSXML(xml) {
1889
1896
  const cleaned = cleanXMLDeclaration(xml);
1890
- const root = parseElement2(cleaned, 0).element;
1897
+ const withoutDoctype = removeDoctype2(cleaned);
1898
+ const withoutComments = removeComments2(withoutDoctype);
1899
+ const root = parseElement2(withoutComments, 0).element;
1891
1900
  return root;
1892
1901
  }
1893
1902
  function cleanXMLDeclaration(xml) {
1894
1903
  return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
1895
1904
  }
1905
+ function removeDoctype2(xml) {
1906
+ return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
1907
+ }
1908
+ function removeComments2(xml) {
1909
+ return xml.replace(/<!--[\s\S]*?-->/g, "");
1910
+ }
1896
1911
  function extractCDATA2(text) {
1897
1912
  const cdataMap = /* @__PURE__ */ new Map();
1898
1913
  let counter = 0;
@@ -2330,6 +2345,11 @@ function parseFeedAs(content, format, baseUrl) {
2330
2345
  original: jsonFeed
2331
2346
  };
2332
2347
  }
2348
+ case "sitemap": {
2349
+ throw new Error(
2350
+ "Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
2351
+ );
2352
+ }
2333
2353
  }
2334
2354
  }
2335
2355
 
@@ -2411,6 +2431,10 @@ function detectEncoding(buffer, contentType) {
2411
2431
  }
2412
2432
  const preview = new Uint8Array(buffer.slice(0, 1024));
2413
2433
  const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
2434
+ const xmlEncoding = parseCharsetFromXml(previewText);
2435
+ if (xmlEncoding) {
2436
+ return xmlEncoding;
2437
+ }
2414
2438
  const metaEncoding = parseCharsetFromHtml(previewText);
2415
2439
  if (metaEncoding) {
2416
2440
  return metaEncoding;
@@ -2451,6 +2475,13 @@ function parseCharsetFromHtml(html) {
2451
2475
  }
2452
2476
  return null;
2453
2477
  }
2478
+ function parseCharsetFromXml(xml) {
2479
+ const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
2480
+ if (xmlDeclMatch) {
2481
+ return normalizeEncoding(xmlDeclMatch[1]);
2482
+ }
2483
+ return null;
2484
+ }
2454
2485
  function normalizeEncoding(encoding) {
2455
2486
  const normalized = encoding.toLowerCase().trim();
2456
2487
  const aliases = {