magpie-html 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,7 +9,7 @@
9
9
  [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D18-green?style=flat-square&logo=node.js)](https://nodejs.org/)
10
10
  [![Live Demo](https://img.shields.io/badge/Live_Demo-CrispRead-eb6864?style=flat-square&logo=rss&logoColor=white)](https://crispread.com)
11
11
 
12
- **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
12
+ **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON, Sitemaps), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
13
13
 
14
14
  <div align="center">
15
15
  <img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
@@ -26,7 +26,7 @@
26
26
  - 🔒 **Type-safe** - Full TypeScript support
27
27
  - 🧪 **Well-tested** - Built with Node.js native test runner
28
28
  - 🚀 **Minimal dependencies** - Lightweight and fast
29
- - 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, and JSON Feed
29
+ - 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, JSON Feed, and XML Sitemaps
30
30
  - 🔗 **Smart URL Resolution** - Automatic normalization to absolute URLs
31
31
  - 🛡️ **Error Resilient** - Graceful handling of malformed data
32
32
  - 🦅 **High-Level Convenience** - One-line functions for common tasks
@@ -199,6 +199,30 @@ console.log(result.feed.items[0].title);
199
199
  console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
200
200
  ```
201
201
 
202
+ ### Sitemap Parsing (Fallback)
203
+
204
+ When standard feeds aren't available, XML sitemaps can be a useful fallback for discovering URLs. Supports standard sitemaps, sitemap indexes, and Google News/Image/Video extensions:
205
+
206
+ ```typescript
207
+ import { pluck, parseSitemap, isSitemap } from "magpie-html";
208
+
209
+ const response = await pluck("https://example.com/sitemap.xml");
210
+ const content = await response.textUtf8();
211
+
212
+ if (isSitemap(content)) {
213
+ const result = parseSitemap(content, response.finalUrl);
214
+
215
+ for (const url of result.sitemap.urls) {
216
+ console.log(url.loc); // URL
217
+ console.log(url.lastmod); // Last modified date
218
+ console.log(url.news?.title); // Google News title (if present)
219
+ console.log(url.news?.publicationDate); // Publication date
220
+ }
221
+
222
+ // For sitemap indexes, check result.sitemap.sitemaps[]
223
+ }
224
+ ```
225
+
202
226
  ### Content Extraction
203
227
 
204
228
  ```typescript
package/dist/index.cjs CHANGED
@@ -1550,6 +1550,9 @@ function detectFormat(content) {
1550
1550
  if (cleaned.match(/<channel[\s>]/i)) {
1551
1551
  return "rss";
1552
1552
  }
1553
+ if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
1554
+ return "sitemap";
1555
+ }
1553
1556
  return "unknown";
1554
1557
  }
1555
1558
  function isFeed(content) {
@@ -1887,12 +1890,16 @@ function parseRSSDate(dateString) {
1887
1890
  // src/feed/rss/xml-parser.ts
1888
1891
  function parseRSSXML(xml) {
1889
1892
  const cleaned = cleanXMLDeclaration(xml);
1890
- const root = parseElement2(cleaned, 0).element;
1893
+ const withoutComments = removeComments2(cleaned);
1894
+ const root = parseElement2(withoutComments, 0).element;
1891
1895
  return root;
1892
1896
  }
1893
1897
  function cleanXMLDeclaration(xml) {
1894
1898
  return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
1895
1899
  }
1900
+ function removeComments2(xml) {
1901
+ return xml.replace(/<!--[\s\S]*?-->/g, "");
1902
+ }
1896
1903
  function extractCDATA2(text) {
1897
1904
  const cdataMap = /* @__PURE__ */ new Map();
1898
1905
  let counter = 0;
@@ -2330,6 +2337,11 @@ function parseFeedAs(content, format, baseUrl) {
2330
2337
  original: jsonFeed
2331
2338
  };
2332
2339
  }
2340
+ case "sitemap": {
2341
+ throw new Error(
2342
+ "Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
2343
+ );
2344
+ }
2333
2345
  }
2334
2346
  }
2335
2347
 
@@ -2411,6 +2423,10 @@ function detectEncoding(buffer, contentType) {
2411
2423
  }
2412
2424
  const preview = new Uint8Array(buffer.slice(0, 1024));
2413
2425
  const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
2426
+ const xmlEncoding = parseCharsetFromXml(previewText);
2427
+ if (xmlEncoding) {
2428
+ return xmlEncoding;
2429
+ }
2414
2430
  const metaEncoding = parseCharsetFromHtml(previewText);
2415
2431
  if (metaEncoding) {
2416
2432
  return metaEncoding;
@@ -2451,6 +2467,13 @@ function parseCharsetFromHtml(html) {
2451
2467
  }
2452
2468
  return null;
2453
2469
  }
2470
+ function parseCharsetFromXml(xml) {
2471
+ const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
2472
+ if (xmlDeclMatch) {
2473
+ return normalizeEncoding(xmlDeclMatch[1]);
2474
+ }
2475
+ return null;
2476
+ }
2454
2477
  function normalizeEncoding(encoding) {
2455
2478
  const normalized = encoding.toLowerCase().trim();
2456
2479
  const aliases = {