magpie-html 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,11 +7,16 @@
7
7
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg?style=flat-square)](https://opensource.org/licenses/MIT)
8
8
  [![TypeScript](https://img.shields.io/badge/TypeScript-5.7-blue?style=flat-square&logo=typescript)](https://www.typescriptlang.org/)
9
9
  [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D18-green?style=flat-square&logo=node.js)](https://nodejs.org/)
10
+ [![Live Demo](https://img.shields.io/badge/Live_Demo-CrispRead-eb6864?style=flat-square&logo=rss&logoColor=white)](https://crispread.com)
10
11
 
11
- **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
12
+ **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON, Sitemaps), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
12
13
 
13
14
  <div align="center">
14
15
  <img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
16
+
17
+ <br><br>
18
+ <strong>Production-ready</strong> · Powers <a href="https://crispread.com">CrispRead</a>, a trilingual news aggregator processing thousands of articles daily.
19
+
15
20
  </div>
16
21
 
17
22
  ## Features
@@ -21,7 +26,7 @@
21
26
  - 🔒 **Type-safe** - Full TypeScript support
22
27
  - 🧪 **Well-tested** - Built with Node.js native test runner
23
28
  - 🚀 **Minimal dependencies** - Lightweight and fast
24
- - 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, and JSON Feed
29
+ - 🔄 **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, JSON Feed, and XML Sitemaps
25
30
  - 🔗 **Smart URL Resolution** - Automatic normalization to absolute URLs
26
31
  - 🛡️ **Error Resilient** - Graceful handling of malformed data
27
32
  - 🦅 **High-Level Convenience** - One-line functions for common tasks
@@ -194,6 +199,30 @@ console.log(result.feed.items[0].title);
194
199
  console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
195
200
  ```
196
201
 
202
+ ### Sitemap Parsing (Fallback)
203
+
204
+ When standard feeds aren't available, XML sitemaps can be a useful fallback for discovering URLs. Supports standard sitemaps, sitemap indexes, and Google News/Image/Video extensions:
205
+
206
+ ```typescript
207
+ import { pluck, parseSitemap, isSitemap } from "magpie-html";
208
+
209
+ const response = await pluck("https://example.com/sitemap.xml");
210
+ const content = await response.textUtf8();
211
+
212
+ if (isSitemap(content)) {
213
+ const result = parseSitemap(content, response.finalUrl);
214
+
215
+ for (const url of result.sitemap.urls) {
216
+ console.log(url.loc); // URL
217
+ console.log(url.lastmod); // Last modified date
218
+ console.log(url.news?.title); // Google News title (if present)
219
+ console.log(url.news?.publicationDate); // Publication date
220
+ }
221
+
222
+ // For sitemap indexes, check result.sitemap.sitemaps[]
223
+ }
224
+ ```
225
+
197
226
  ### Content Extraction
198
227
 
199
228
  ```typescript
@@ -454,6 +483,6 @@ If this package helps your project, consider sponsoring its maintenance:
454
483
 
455
484
  ---
456
485
 
457
- **[Anonyfox](https://anonyfox.com) • [MIT License](LICENSE)**
486
+ **[Anonyfox](https://anonyfox.com) • [API Docs](https://anonyfox.github.io/magpie-html) • [MIT License](LICENSE)**
458
487
 
459
488
  </div>
package/dist/index.cjs CHANGED
@@ -1550,6 +1550,9 @@ function detectFormat(content) {
1550
1550
  if (cleaned.match(/<channel[\s>]/i)) {
1551
1551
  return "rss";
1552
1552
  }
1553
+ if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
1554
+ return "sitemap";
1555
+ }
1553
1556
  return "unknown";
1554
1557
  }
1555
1558
  function isFeed(content) {
@@ -1887,12 +1890,16 @@ function parseRSSDate(dateString) {
1887
1890
  // src/feed/rss/xml-parser.ts
1888
1891
  function parseRSSXML(xml) {
1889
1892
  const cleaned = cleanXMLDeclaration(xml);
1890
- const root = parseElement2(cleaned, 0).element;
1893
+ const withoutComments = removeComments2(cleaned);
1894
+ const root = parseElement2(withoutComments, 0).element;
1891
1895
  return root;
1892
1896
  }
1893
1897
  function cleanXMLDeclaration(xml) {
1894
1898
  return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
1895
1899
  }
1900
+ function removeComments2(xml) {
1901
+ return xml.replace(/<!--[\s\S]*?-->/g, "");
1902
+ }
1896
1903
  function extractCDATA2(text) {
1897
1904
  const cdataMap = /* @__PURE__ */ new Map();
1898
1905
  let counter = 0;
@@ -2330,6 +2337,11 @@ function parseFeedAs(content, format, baseUrl) {
2330
2337
  original: jsonFeed
2331
2338
  };
2332
2339
  }
2340
+ case "sitemap": {
2341
+ throw new Error(
2342
+ "Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
2343
+ );
2344
+ }
2333
2345
  }
2334
2346
  }
2335
2347
 
@@ -2411,6 +2423,10 @@ function detectEncoding(buffer, contentType) {
2411
2423
  }
2412
2424
  const preview = new Uint8Array(buffer.slice(0, 1024));
2413
2425
  const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
2426
+ const xmlEncoding = parseCharsetFromXml(previewText);
2427
+ if (xmlEncoding) {
2428
+ return xmlEncoding;
2429
+ }
2414
2430
  const metaEncoding = parseCharsetFromHtml(previewText);
2415
2431
  if (metaEncoding) {
2416
2432
  return metaEncoding;
@@ -2451,6 +2467,13 @@ function parseCharsetFromHtml(html) {
2451
2467
  }
2452
2468
  return null;
2453
2469
  }
2470
+ function parseCharsetFromXml(xml) {
2471
+ const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
2472
+ if (xmlDeclMatch) {
2473
+ return normalizeEncoding(xmlDeclMatch[1]);
2474
+ }
2475
+ return null;
2476
+ }
2454
2477
  function normalizeEncoding(encoding) {
2455
2478
  const normalized = encoding.toLowerCase().trim();
2456
2479
  const aliases = {