magpie-html 0.1.5 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -2
- package/dist/index.cjs +33 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +33 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
[](https://nodejs.org/)
|
|
10
10
|
[](https://crispread.com)
|
|
11
11
|
|
|
12
|
-
**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
|
|
12
|
+
**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON, Sitemaps), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
|
|
13
13
|
|
|
14
14
|
<div align="center">
|
|
15
15
|
<img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
- ๐ **Type-safe** - Full TypeScript support
|
|
27
27
|
- 🧪 **Well-tested** - Built with Node.js native test runner
|
|
28
28
|
- ๐ **Minimal dependencies** - Lightweight and fast
|
|
29
|
-
- ๐ **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, and JSON Feed
|
|
29
|
+
- ๐ **Multi-Format Feed Parser** - Parse RSS 2.0, Atom 1.0, JSON Feed, and XML Sitemaps
|
|
30
30
|
- ๐ **Smart URL Resolution** - Automatic normalization to absolute URLs
|
|
31
31
|
- 🛡️ **Error Resilient** - Graceful handling of malformed data
|
|
32
32
|
- 🦅 **High-Level Convenience** - One-line functions for common tasks
|
|
@@ -199,6 +199,30 @@ console.log(result.feed.items[0].title);
|
|
|
199
199
|
console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
|
|
200
200
|
```
|
|
201
201
|
|
|
202
|
+
### Sitemap Parsing (Fallback)
|
|
203
|
+
|
|
204
|
+
When standard feeds aren't available, XML sitemaps can be a useful fallback for discovering URLs. Supports standard sitemaps, sitemap indexes, and Google News/Image/Video extensions:
|
|
205
|
+
|
|
206
|
+
```typescript
|
|
207
|
+
import { pluck, parseSitemap, isSitemap } from "magpie-html";
|
|
208
|
+
|
|
209
|
+
const response = await pluck("https://example.com/sitemap.xml");
|
|
210
|
+
const content = await response.textUtf8();
|
|
211
|
+
|
|
212
|
+
if (isSitemap(content)) {
|
|
213
|
+
const result = parseSitemap(content, response.finalUrl);
|
|
214
|
+
|
|
215
|
+
for (const url of result.sitemap.urls) {
|
|
216
|
+
console.log(url.loc); // URL
|
|
217
|
+
console.log(url.lastmod); // Last modified date
|
|
218
|
+
console.log(url.news?.title); // Google News title (if present)
|
|
219
|
+
console.log(url.news?.publicationDate); // Publication date
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// For sitemap indexes, check result.sitemap.sitemaps[]
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
202
226
|
### Content Extraction
|
|
203
227
|
|
|
204
228
|
```typescript
|
package/dist/index.cjs
CHANGED
|
@@ -1081,8 +1081,12 @@ function extractEntry(entryElement) {
|
|
|
1081
1081
|
function removeComments(xml) {
|
|
1082
1082
|
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1083
1083
|
}
|
|
1084
|
+
function removeDoctype(xml) {
|
|
1085
|
+
return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
|
|
1086
|
+
}
|
|
1084
1087
|
function parseAtomXML(xml) {
|
|
1085
|
-
const
|
|
1088
|
+
const withoutDoctype = removeDoctype(xml);
|
|
1089
|
+
const withoutComments = removeComments(withoutDoctype);
|
|
1086
1090
|
const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
|
|
1087
1091
|
const root = parseElement(cleanedXML, 0, null, cdataMap).element;
|
|
1088
1092
|
return root;
|
|
@@ -1550,6 +1554,9 @@ function detectFormat(content) {
|
|
|
1550
1554
|
if (cleaned.match(/<channel[\s>]/i)) {
|
|
1551
1555
|
return "rss";
|
|
1552
1556
|
}
|
|
1557
|
+
if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
|
|
1558
|
+
return "sitemap";
|
|
1559
|
+
}
|
|
1553
1560
|
return "unknown";
|
|
1554
1561
|
}
|
|
1555
1562
|
function isFeed(content) {
|
|
@@ -1887,12 +1894,20 @@ function parseRSSDate(dateString) {
|
|
|
1887
1894
|
// src/feed/rss/xml-parser.ts
|
|
1888
1895
|
function parseRSSXML(xml) {
|
|
1889
1896
|
const cleaned = cleanXMLDeclaration(xml);
|
|
1890
|
-
const
|
|
1897
|
+
const withoutDoctype = removeDoctype2(cleaned);
|
|
1898
|
+
const withoutComments = removeComments2(withoutDoctype);
|
|
1899
|
+
const root = parseElement2(withoutComments, 0).element;
|
|
1891
1900
|
return root;
|
|
1892
1901
|
}
|
|
1893
1902
|
function cleanXMLDeclaration(xml) {
|
|
1894
1903
|
return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
|
|
1895
1904
|
}
|
|
1905
|
+
function removeDoctype2(xml) {
|
|
1906
|
+
return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
|
|
1907
|
+
}
|
|
1908
|
+
function removeComments2(xml) {
|
|
1909
|
+
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1910
|
+
}
|
|
1896
1911
|
function extractCDATA2(text) {
|
|
1897
1912
|
const cdataMap = /* @__PURE__ */ new Map();
|
|
1898
1913
|
let counter = 0;
|
|
@@ -2330,6 +2345,11 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2330
2345
|
original: jsonFeed
|
|
2331
2346
|
};
|
|
2332
2347
|
}
|
|
2348
|
+
case "sitemap": {
|
|
2349
|
+
throw new Error(
|
|
2350
|
+
"Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
|
|
2351
|
+
);
|
|
2352
|
+
}
|
|
2333
2353
|
}
|
|
2334
2354
|
}
|
|
2335
2355
|
|
|
@@ -2411,6 +2431,10 @@ function detectEncoding(buffer, contentType) {
|
|
|
2411
2431
|
}
|
|
2412
2432
|
const preview = new Uint8Array(buffer.slice(0, 1024));
|
|
2413
2433
|
const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
|
|
2434
|
+
const xmlEncoding = parseCharsetFromXml(previewText);
|
|
2435
|
+
if (xmlEncoding) {
|
|
2436
|
+
return xmlEncoding;
|
|
2437
|
+
}
|
|
2414
2438
|
const metaEncoding = parseCharsetFromHtml(previewText);
|
|
2415
2439
|
if (metaEncoding) {
|
|
2416
2440
|
return metaEncoding;
|
|
@@ -2451,6 +2475,13 @@ function parseCharsetFromHtml(html) {
|
|
|
2451
2475
|
}
|
|
2452
2476
|
return null;
|
|
2453
2477
|
}
|
|
2478
|
+
function parseCharsetFromXml(xml) {
|
|
2479
|
+
const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
|
|
2480
|
+
if (xmlDeclMatch) {
|
|
2481
|
+
return normalizeEncoding(xmlDeclMatch[1]);
|
|
2482
|
+
}
|
|
2483
|
+
return null;
|
|
2484
|
+
}
|
|
2454
2485
|
function normalizeEncoding(encoding) {
|
|
2455
2486
|
const normalized = encoding.toLowerCase().trim();
|
|
2456
2487
|
const aliases = {
|