magpie-html 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -2
- package/dist/index.cjs +24 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +24 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
[](https://nodejs.org/)
|
|
10
10
|
[](https://crispread.com)
|
|
11
11
|
|
|
12
|
-
**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
|
|
12
|
+
**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON, Sitemaps), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
|
|
13
13
|
|
|
14
14
|
<div align="center">
|
|
15
15
|
<img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
- ๐ **Type-safe** - Full TypeScript support
|
|
27
27
|
- 🧪 **Well-tested** - Built with Node.js native test runner
|
|
28
28
|
- ๐ **Minimal dependencies** - Lightweight and fast
|
|
29
|
-
- ๐ **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, and JSON Feed
|
|
29
|
+
- ๐ **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, JSON Feed, and XML Sitemaps
|
|
30
30
|
- ๐ **Smart URL Resolution** - Automatic normalization to absolute URLs
|
|
31
31
|
- 🛡️ **Error Resilient** - Graceful handling of malformed data
|
|
32
32
|
- 🦅 **High-Level Convenience** - One-line functions for common tasks
|
|
@@ -199,6 +199,30 @@ console.log(result.feed.items[0].title);
|
|
|
199
199
|
console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
|
|
200
200
|
```
|
|
201
201
|
|
|
202
|
+
### Sitemap Parsing (Fallback)
|
|
203
|
+
|
|
204
|
+
When standard feeds aren't available, XML sitemaps can be a useful fallback for discovering URLs. Supports standard sitemaps, sitemap indexes, and Google News/Image/Video extensions:
|
|
205
|
+
|
|
206
|
+
```typescript
|
|
207
|
+
import { pluck, parseSitemap, isSitemap } from "magpie-html";
|
|
208
|
+
|
|
209
|
+
const response = await pluck("https://example.com/sitemap.xml");
|
|
210
|
+
const content = await response.textUtf8();
|
|
211
|
+
|
|
212
|
+
if (isSitemap(content)) {
|
|
213
|
+
const result = parseSitemap(content, response.finalUrl);
|
|
214
|
+
|
|
215
|
+
for (const url of result.sitemap.urls) {
|
|
216
|
+
console.log(url.loc); // URL
|
|
217
|
+
console.log(url.lastmod); // Last modified date
|
|
218
|
+
console.log(url.news?.title); // Google News title (if present)
|
|
219
|
+
console.log(url.news?.publicationDate); // Publication date
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// For sitemap indexes, check result.sitemap.sitemaps[]
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
202
226
|
### Content Extraction
|
|
203
227
|
|
|
204
228
|
```typescript
|
package/dist/index.cjs
CHANGED
|
@@ -1550,6 +1550,9 @@ function detectFormat(content) {
|
|
|
1550
1550
|
if (cleaned.match(/<channel[\s>]/i)) {
|
|
1551
1551
|
return "rss";
|
|
1552
1552
|
}
|
|
1553
|
+
if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
|
|
1554
|
+
return "sitemap";
|
|
1555
|
+
}
|
|
1553
1556
|
return "unknown";
|
|
1554
1557
|
}
|
|
1555
1558
|
function isFeed(content) {
|
|
@@ -1887,12 +1890,16 @@ function parseRSSDate(dateString) {
|
|
|
1887
1890
|
// src/feed/rss/xml-parser.ts
|
|
1888
1891
|
function parseRSSXML(xml) {
|
|
1889
1892
|
const cleaned = cleanXMLDeclaration(xml);
|
|
1890
|
-
const root = parseElement2(cleaned, 0).element;
|
|
1893
|
+
const withoutComments = removeComments2(cleaned);
|
|
1894
|
+
const root = parseElement2(withoutComments, 0).element;
|
|
1891
1895
|
return root;
|
|
1892
1896
|
}
|
|
1893
1897
|
function cleanXMLDeclaration(xml) {
|
|
1894
1898
|
return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
|
|
1895
1899
|
}
|
|
1900
|
+
function removeComments2(xml) {
|
|
1901
|
+
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1902
|
+
}
|
|
1896
1903
|
function extractCDATA2(text) {
|
|
1897
1904
|
const cdataMap = /* @__PURE__ */ new Map();
|
|
1898
1905
|
let counter = 0;
|
|
@@ -2330,6 +2337,11 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2330
2337
|
original: jsonFeed
|
|
2331
2338
|
};
|
|
2332
2339
|
}
|
|
2340
|
+
case "sitemap": {
|
|
2341
|
+
throw new Error(
|
|
2342
|
+
"Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
|
|
2343
|
+
);
|
|
2344
|
+
}
|
|
2333
2345
|
}
|
|
2334
2346
|
}
|
|
2335
2347
|
|
|
@@ -2411,6 +2423,10 @@ function detectEncoding(buffer, contentType) {
|
|
|
2411
2423
|
}
|
|
2412
2424
|
const preview = new Uint8Array(buffer.slice(0, 1024));
|
|
2413
2425
|
const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
|
|
2426
|
+
const xmlEncoding = parseCharsetFromXml(previewText);
|
|
2427
|
+
if (xmlEncoding) {
|
|
2428
|
+
return xmlEncoding;
|
|
2429
|
+
}
|
|
2414
2430
|
const metaEncoding = parseCharsetFromHtml(previewText);
|
|
2415
2431
|
if (metaEncoding) {
|
|
2416
2432
|
return metaEncoding;
|
|
@@ -2451,6 +2467,13 @@ function parseCharsetFromHtml(html) {
|
|
|
2451
2467
|
}
|
|
2452
2468
|
return null;
|
|
2453
2469
|
}
|
|
2470
|
+
function parseCharsetFromXml(xml) {
|
|
2471
|
+
const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
|
|
2472
|
+
if (xmlDeclMatch) {
|
|
2473
|
+
return normalizeEncoding(xmlDeclMatch[1]);
|
|
2474
|
+
}
|
|
2475
|
+
return null;
|
|
2476
|
+
}
|
|
2454
2477
|
function normalizeEncoding(encoding) {
|
|
2455
2478
|
const normalized = encoding.toLowerCase().trim();
|
|
2456
2479
|
const aliases = {
|