magpie-html 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +148 -113
- package/dist/index.cjs +1724 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +184 -1
- package/dist/index.d.ts +184 -1
- package/dist/index.js +1715 -1
- package/dist/index.js.map +1 -1
- package/package.json +8 -3
package/README.md
CHANGED
|
@@ -1,6 +1,18 @@
|
|
|
1
1
|
# Magpie HTML 🦅
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[magpie-html on npm](https://www.npmjs.com/package/magpie-html)
|
|
4
|
+
[](https://www.npmjs.com/package/magpie-html)
|
|
5
|
+
[CI](https://github.com/Anonyfox/magpie-html/actions/workflows/ci.yml)
|
|
6
|
+
[Documentation](https://anonyfox.github.io/magpie-html)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
9
|
+
[Node.js](https://nodejs.org/)
|
|
10
|
+
|
|
11
|
+
**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
|
|
12
|
+
|
|
13
|
+
<div align="center">
|
|
14
|
+
<img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
|
|
15
|
+
</div>
|
|
4
16
|
|
|
5
17
|
## Features
|
|
6
18
|
|
|
@@ -23,27 +35,27 @@ npm install magpie-html
|
|
|
23
35
|
## Quick Start
|
|
24
36
|
|
|
25
37
|
```typescript
|
|
26
|
-
import { gatherWebsite, gatherArticle, gatherFeed } from
|
|
38
|
+
import { gatherWebsite, gatherArticle, gatherFeed } from "magpie-html";
|
|
27
39
|
|
|
28
40
|
// Gather complete website metadata
|
|
29
|
-
const site = await gatherWebsite(
|
|
30
|
-
console.log(site.title);
|
|
31
|
-
console.log(site.description);
|
|
32
|
-
console.log(site.image);
|
|
33
|
-
console.log(site.feeds);
|
|
34
|
-
console.log(site.internalLinks);
|
|
41
|
+
const site = await gatherWebsite("https://example.com");
|
|
42
|
+
console.log(site.title); // Page title
|
|
43
|
+
console.log(site.description); // Meta description
|
|
44
|
+
console.log(site.image); // Featured image
|
|
45
|
+
console.log(site.feeds); // Discovered feeds
|
|
46
|
+
console.log(site.internalLinks); // Internal links
|
|
35
47
|
|
|
36
48
|
// Gather article content + metadata
|
|
37
|
-
const article = await gatherArticle(
|
|
38
|
-
console.log(article.title);
|
|
39
|
-
console.log(article.content);
|
|
40
|
-
console.log(article.wordCount);
|
|
49
|
+
const article = await gatherArticle("https://example.com/article");
|
|
50
|
+
console.log(article.title); // Article title
|
|
51
|
+
console.log(article.content); // Clean article text
|
|
52
|
+
console.log(article.wordCount); // Word count
|
|
41
53
|
console.log(article.readingTime); // Reading time in minutes
|
|
42
54
|
|
|
43
55
|
// Gather feed data
|
|
44
|
-
const feed = await gatherFeed(
|
|
45
|
-
console.log(feed.title);
|
|
46
|
-
console.log(feed.items);
|
|
56
|
+
const feed = await gatherFeed("https://example.com/feed.xml");
|
|
57
|
+
console.log(feed.title); // Feed title
|
|
58
|
+
console.log(feed.items); // Feed items
|
|
47
59
|
```
|
|
48
60
|
|
|
49
61
|
## Usage
|
|
@@ -53,32 +65,33 @@ console.log(feed.items); // Feed items
|
|
|
53
65
|
Extract comprehensive metadata from any webpage:
|
|
54
66
|
|
|
55
67
|
```typescript
|
|
56
|
-
import { gatherWebsite } from
|
|
68
|
+
import { gatherWebsite } from "magpie-html";
|
|
57
69
|
|
|
58
|
-
const site = await gatherWebsite(
|
|
70
|
+
const site = await gatherWebsite("https://example.com");
|
|
59
71
|
|
|
60
72
|
// Basic metadata
|
|
61
|
-
console.log(site.url);
|
|
62
|
-
console.log(site.title);
|
|
63
|
-
console.log(site.description);
|
|
64
|
-
console.log(site.image);
|
|
65
|
-
console.log(site.icon);
|
|
73
|
+
console.log(site.url); // Final URL (after redirects)
|
|
74
|
+
console.log(site.title); // Best title (cleaned)
|
|
75
|
+
console.log(site.description); // Meta description
|
|
76
|
+
console.log(site.image); // Featured image URL
|
|
77
|
+
console.log(site.icon); // Site favicon/icon
|
|
66
78
|
|
|
67
79
|
// Language & region
|
|
68
|
-
console.log(site.language);
|
|
69
|
-
console.log(site.region);
|
|
80
|
+
console.log(site.language); // ISO 639-1 code (e.g., 'en')
|
|
81
|
+
console.log(site.region); // ISO 3166-1 alpha-2 (e.g., 'US')
|
|
70
82
|
|
|
71
83
|
// Discovered content
|
|
72
|
-
console.log(site.feeds);
|
|
73
|
-
console.log(site.internalLinks);
|
|
74
|
-
console.log(site.externalLinks);
|
|
84
|
+
console.log(site.feeds); // Array of feed URLs
|
|
85
|
+
console.log(site.internalLinks); // Internal links (same domain)
|
|
86
|
+
console.log(site.externalLinks); // External links (other domains)
|
|
75
87
|
|
|
76
88
|
// Raw content
|
|
77
|
-
console.log(site.html);
|
|
78
|
-
console.log(site.text);
|
|
89
|
+
console.log(site.html); // Raw HTML
|
|
90
|
+
console.log(site.text); // Plain text (full page)
|
|
79
91
|
```
|
|
80
92
|
|
|
81
93
|
**What it does:**
|
|
94
|
+
|
|
82
95
|
- Fetches the page with automatic redirect handling
|
|
83
96
|
- Extracts metadata from multiple sources (OpenGraph, Schema.org, Twitter Card, etc.)
|
|
84
97
|
- Picks the "best" value for each field (longest, highest priority, cleaned)
|
|
@@ -91,33 +104,34 @@ console.log(site.text); // Plain text (full page)
|
|
|
91
104
|
Extract clean article content with metadata:
|
|
92
105
|
|
|
93
106
|
```typescript
|
|
94
|
-
import { gatherArticle } from
|
|
107
|
+
import { gatherArticle } from "magpie-html";
|
|
95
108
|
|
|
96
|
-
const article = await gatherArticle(
|
|
109
|
+
const article = await gatherArticle("https://example.com/article");
|
|
97
110
|
|
|
98
111
|
// Core content
|
|
99
|
-
console.log(article.url);
|
|
100
|
-
console.log(article.title);
|
|
101
|
-
console.log(article.content);
|
|
102
|
-
console.log(article.description);
|
|
112
|
+
console.log(article.url); // Final URL
|
|
113
|
+
console.log(article.title); // Article title (Readability or metadata)
|
|
114
|
+
console.log(article.content); // Clean article text (formatted)
|
|
115
|
+
console.log(article.description); // Excerpt/summary
|
|
103
116
|
|
|
104
117
|
// Metrics
|
|
105
|
-
console.log(article.wordCount);
|
|
106
|
-
console.log(article.readingTime);
|
|
118
|
+
console.log(article.wordCount); // Word count
|
|
119
|
+
console.log(article.readingTime); // Est. reading time (minutes)
|
|
107
120
|
|
|
108
121
|
// Media & language
|
|
109
|
-
console.log(article.image);
|
|
110
|
-
console.log(article.language);
|
|
111
|
-
console.log(article.region);
|
|
122
|
+
console.log(article.image); // Article image
|
|
123
|
+
console.log(article.language); // Language code
|
|
124
|
+
console.log(article.region); // Region code
|
|
112
125
|
|
|
113
126
|
// Links & raw content
|
|
114
127
|
console.log(article.internalLinks); // Internal links
|
|
115
128
|
console.log(article.externalLinks); // External links (citations)
|
|
116
|
-
console.log(article.html);
|
|
117
|
-
console.log(article.text);
|
|
129
|
+
console.log(article.html); // Raw HTML
|
|
130
|
+
console.log(article.text); // Plain text (full page)
|
|
118
131
|
```
|
|
119
132
|
|
|
120
133
|
**What it does:**
|
|
134
|
+
|
|
121
135
|
- Uses Mozilla Readability to extract clean article content
|
|
122
136
|
- Falls back to metadata extraction if Readability fails
|
|
123
137
|
- Converts cleaned HTML to well-formatted plain text
|
|
@@ -129,30 +143,31 @@ console.log(article.text); // Plain text (full page)
|
|
|
129
143
|
Parse any feed format with one function:
|
|
130
144
|
|
|
131
145
|
```typescript
|
|
132
|
-
import { gatherFeed } from
|
|
146
|
+
import { gatherFeed } from "magpie-html";
|
|
133
147
|
|
|
134
|
-
const feed = await gatherFeed(
|
|
148
|
+
const feed = await gatherFeed("https://example.com/feed.xml");
|
|
135
149
|
|
|
136
150
|
// Feed metadata
|
|
137
|
-
console.log(feed.title);
|
|
138
|
-
console.log(feed.description);
|
|
139
|
-
console.log(feed.url);
|
|
140
|
-
console.log(feed.siteUrl);
|
|
151
|
+
console.log(feed.title); // Feed title
|
|
152
|
+
console.log(feed.description); // Feed description
|
|
153
|
+
console.log(feed.url); // Feed URL
|
|
154
|
+
console.log(feed.siteUrl); // Website URL
|
|
141
155
|
|
|
142
156
|
// Feed items
|
|
143
157
|
for (const item of feed.items) {
|
|
144
|
-
console.log(item.title);
|
|
145
|
-
console.log(item.url);
|
|
146
|
-
console.log(item.description);
|
|
147
|
-
console.log(item.publishedAt);
|
|
148
|
-
console.log(item.author);
|
|
158
|
+
console.log(item.title); // Item title
|
|
159
|
+
console.log(item.url); // Item URL (absolute)
|
|
160
|
+
console.log(item.description); // Item description
|
|
161
|
+
console.log(item.publishedAt); // Publication date
|
|
162
|
+
console.log(item.author); // Author
|
|
149
163
|
}
|
|
150
164
|
|
|
151
165
|
// Format detection
|
|
152
|
-
console.log(feed.format);
|
|
166
|
+
console.log(feed.format); // 'rss', 'atom', or 'json-feed'
|
|
153
167
|
```
|
|
154
168
|
|
|
155
169
|
**What it does:**
|
|
170
|
+
|
|
156
171
|
- Auto-detects feed format (RSS 2.0, Atom 1.0, JSON Feed)
|
|
157
172
|
- Normalizes all formats to a unified interface
|
|
158
173
|
- Resolves relative URLs to absolute
|
|
@@ -165,10 +180,10 @@ For more control, use the lower-level modules directly:
|
|
|
165
180
|
### Feed Parsing
|
|
166
181
|
|
|
167
182
|
```typescript
|
|
168
|
-
import { pluck, parseFeed } from
|
|
183
|
+
import { pluck, parseFeed } from "magpie-html";
|
|
169
184
|
|
|
170
185
|
// Fetch feed content
|
|
171
|
-
const response = await pluck(
|
|
186
|
+
const response = await pluck("https://example.com/feed.xml");
|
|
172
187
|
const feedContent = await response.textUtf8();
|
|
173
188
|
|
|
174
189
|
// Parse with base URL for relative links
|
|
@@ -182,25 +197,25 @@ console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
|
|
|
182
197
|
### Content Extraction
|
|
183
198
|
|
|
184
199
|
```typescript
|
|
185
|
-
import { parseHTML, extractContent, htmlToText } from
|
|
200
|
+
import { parseHTML, extractContent, htmlToText } from "magpie-html";
|
|
186
201
|
|
|
187
202
|
// Parse HTML once
|
|
188
203
|
const doc = parseHTML(html);
|
|
189
204
|
|
|
190
205
|
// Extract article with Readability
|
|
191
206
|
const result = extractContent(doc, {
|
|
192
|
-
baseUrl:
|
|
207
|
+
baseUrl: "https://example.com/article",
|
|
193
208
|
cleanConditionally: true,
|
|
194
209
|
keepClasses: false,
|
|
195
210
|
});
|
|
196
211
|
|
|
197
212
|
if (result.success) {
|
|
198
|
-
console.log(result.title);
|
|
199
|
-
console.log(result.excerpt);
|
|
200
|
-
console.log(result.content);
|
|
201
|
-
console.log(result.textContent);
|
|
202
|
-
console.log(result.wordCount);
|
|
203
|
-
console.log(result.readingTime);
|
|
213
|
+
console.log(result.title); // Article title
|
|
214
|
+
console.log(result.excerpt); // Article excerpt
|
|
215
|
+
console.log(result.content); // Clean HTML
|
|
216
|
+
console.log(result.textContent); // Plain text
|
|
217
|
+
console.log(result.wordCount); // Word count
|
|
218
|
+
console.log(result.readingTime); // Reading time
|
|
204
219
|
}
|
|
205
220
|
|
|
206
221
|
// Or convert any HTML to text
|
|
@@ -214,7 +229,12 @@ const plainText = htmlToText(html, {
|
|
|
214
229
|
### Metadata Extraction
|
|
215
230
|
|
|
216
231
|
```typescript
|
|
217
|
-
import {
|
|
232
|
+
import {
|
|
233
|
+
parseHTML,
|
|
234
|
+
extractOpenGraph,
|
|
235
|
+
extractSchemaOrg,
|
|
236
|
+
extractSEO,
|
|
237
|
+
} from "magpie-html";
|
|
218
238
|
|
|
219
239
|
const doc = parseHTML(html);
|
|
220
240
|
|
|
@@ -236,6 +256,7 @@ console.log(seo.keywords);
|
|
|
236
256
|
```
|
|
237
257
|
|
|
238
258
|
**Available extractors:**
|
|
259
|
+
|
|
239
260
|
- `extractSEO` - SEO meta tags
|
|
240
261
|
- `extractOpenGraph` - OpenGraph metadata
|
|
241
262
|
- `extractTwitterCard` - Twitter Card metadata
|
|
@@ -253,92 +274,98 @@ console.log(seo.keywords);
|
|
|
253
274
|
Use `pluck()` for robust fetching with automatic encoding and redirect handling:
|
|
254
275
|
|
|
255
276
|
```typescript
|
|
256
|
-
import { pluck } from
|
|
277
|
+
import { pluck } from "magpie-html";
|
|
257
278
|
|
|
258
|
-
const response = await pluck(
|
|
259
|
-
timeout: 30000,
|
|
260
|
-
maxRedirects: 10,
|
|
261
|
-
maxSize: 10485760,
|
|
262
|
-
userAgent:
|
|
279
|
+
const response = await pluck("https://example.com", {
|
|
280
|
+
timeout: 30000, // 30 second timeout
|
|
281
|
+
maxRedirects: 10, // Follow up to 10 redirects
|
|
282
|
+
maxSize: 10485760, // 10MB limit
|
|
283
|
+
userAgent: "MyBot/1.0",
|
|
263
284
|
throwOnHttpError: true,
|
|
264
285
|
strictContentType: false,
|
|
265
286
|
});
|
|
266
287
|
|
|
267
288
|
// Enhanced response properties
|
|
268
|
-
console.log(response.finalUrl);
|
|
269
|
-
console.log(response.redirectChain);
|
|
289
|
+
console.log(response.finalUrl); // URL after redirects
|
|
290
|
+
console.log(response.redirectChain); // All redirect URLs
|
|
270
291
|
console.log(response.detectedEncoding); // Detected charset
|
|
271
|
-
console.log(response.timing);
|
|
292
|
+
console.log(response.timing); // Request timing
|
|
272
293
|
|
|
273
294
|
// Get UTF-8 decoded content
|
|
274
295
|
const text = await response.textUtf8();
|
|
275
296
|
```
|
|
276
297
|
|
|
277
298
|
**Why `pluck()`?**
|
|
299
|
+
|
|
278
300
|
- Handles broken sites with wrong/missing encoding declarations
|
|
279
301
|
- Follows redirect chains and tracks them
|
|
280
302
|
- Enforces timeouts and size limits
|
|
281
303
|
- Compatible with standard `fetch()` API
|
|
282
304
|
- Named `pluck()` to avoid confusion with the standard `fetch()` (magpies pluck things! 🦅)
|
|
283
305
|
|
|
284
|
-
##
|
|
306
|
+
## Experimental: `swoop()` (client-side DOM rendering without a browser engine)
|
|
285
307
|
|
|
286
|
-
|
|
308
|
+
> **⚠️ SECURITY WARNING — Remote Code Execution (RCE)**
|
|
309
|
+
>
|
|
310
|
+
> `swoop()` **executes remote, third‑party JavaScript inside your current Node.js process** (via `node:vm` + browser shims).
|
|
311
|
+
> This is **fundamentally insecure**. Only use `swoop()` on **fully trusted targets** and treat inputs as **hostile by default**.
|
|
312
|
+
> For any professional/untrusted scraping, run this in a **real sandbox** (container/VM/locked-down OS user + seccomp/apparmor/firejail, etc.).
|
|
287
313
|
|
|
288
|
-
-
|
|
289
|
-
- **`gatherArticle(url)`** - Extract article content + metadata
|
|
290
|
-
- **`gatherFeed(url)`** - Parse any feed format
|
|
314
|
+
> **Note:** `magpie-html` does **not** use `swoop()` internally. It’s provided as an **optional standalone utility** for the few cases where you really need DOM-only client-side rendering.
|
|
291
315
|
|
|
292
|
-
|
|
316
|
+
`swoop()` is an **explicitly experimental** helper that tries to execute client-side scripts against a **DOM-only** environment and then returns a **best-effort HTML snapshot**.
|
|
293
317
|
|
|
294
|
-
|
|
318
|
+
### Why this exists
|
|
295
319
|
|
|
296
|
-
|
|
320
|
+
Sometimes `curl` / `fetch` / `pluck()` isn’t enough because the page is a SPA and only renders content after client-side JavaScript runs.
|
|
321
|
+
`swoop()` exists to **quickly turn “CSR-only” pages into HTML** so the rest of `magpie-html` can work with the result.
|
|
297
322
|
|
|
298
|
-
|
|
299
|
-
- **`detectFormat(content)`** - Detect feed format
|
|
300
|
-
- **`parseHTML(html)`** - Parse HTML to Document
|
|
323
|
+
If it works, it can be **comparatively light and fast** because it avoids a full browser engine by using a custom `node:vm`-based execution environment with browser shims.
|
|
301
324
|
|
|
302
|
-
|
|
325
|
+
For very complicated targets (heavy JS, complex navigation, strong anti-bot, layout-dependent rendering), you should use a **real browser engine** instead.
|
|
303
326
|
|
|
304
|
-
|
|
305
|
-
- **`htmlToText(html, options?)`** - Convert HTML to plain text
|
|
306
|
-
- **`isProbablyReaderable(doc)`** - Check if content is article-like
|
|
327
|
+
`swoop()` is best seen as a **building block**—you still need to provide the **real sandboxing** around it.
|
|
307
328
|
|
|
308
|
-
###
|
|
329
|
+
### What it is
|
|
309
330
|
|
|
310
|
-
-
|
|
311
|
-
-
|
|
312
|
-
- **`extractTwitterCard(doc)`** - Twitter Card metadata
|
|
313
|
-
- **`extractSchemaOrg(doc)`** - Schema.org / JSON-LD
|
|
314
|
-
- **`extractCanonical(doc)`** - Canonical URLs
|
|
315
|
-
- **`extractLanguage(doc)`** - Language detection
|
|
316
|
-
- **`extractIcons(doc)`** - Favicons and icons
|
|
317
|
-
- **`extractAssets(doc, baseUrl)`** - Linked assets
|
|
318
|
-
- **`extractLinks(doc, baseUrl, options?)`** - Navigation links
|
|
319
|
-
- **`extractFeedDiscovery(doc, baseUrl)`** - Discover feeds
|
|
320
|
-
- ...and 10+ more specialized extractors
|
|
331
|
+
- A pragmatic “SPA snapshotter” for cases where a page renders content via client-side JavaScript.
|
|
332
|
+
- **No browser engine**: no layout/paint/CSS correctness.
|
|
321
333
|
|
|
322
|
-
###
|
|
334
|
+
### What it is NOT
|
|
323
335
|
|
|
324
|
-
-
|
|
325
|
-
- **`countWords(text)`** - Count words in text
|
|
326
|
-
- **`calculateReadingTime(wordCount)`** - Estimate reading time
|
|
336
|
+
- Not a headless browser replacement (no navigation lifecycle, no reliable layout APIs).
|
|
327
337
|
|
|
328
|
-
|
|
338
|
+
### Usage
|
|
339
|
+
|
|
340
|
+
```typescript
|
|
341
|
+
import { swoop } from "magpie-html";
|
|
342
|
+
|
|
343
|
+
const result = await swoop("https://example.com/spa", {
|
|
344
|
+
waitStrategy: "networkidle",
|
|
345
|
+
timeout: 3000,
|
|
346
|
+
});
|
|
347
|
+
|
|
348
|
+
console.log(result.html);
|
|
349
|
+
console.log(result.errors);
|
|
350
|
+
```
|
|
329
351
|
|
|
330
352
|
## Performance Tips
|
|
331
353
|
|
|
332
354
|
**Best Practice:** Parse HTML once and reuse the document:
|
|
333
355
|
|
|
334
356
|
```typescript
|
|
335
|
-
import {
|
|
357
|
+
import {
|
|
358
|
+
parseHTML,
|
|
359
|
+
extractSEO,
|
|
360
|
+
extractOpenGraph,
|
|
361
|
+
extractContent,
|
|
362
|
+
} from "magpie-html";
|
|
336
363
|
|
|
337
364
|
const doc = parseHTML(html);
|
|
338
365
|
|
|
339
366
|
// Reuse the same document for multiple extractions
|
|
340
|
-
const seo = extractSEO(doc);
|
|
341
|
-
const og = extractOpenGraph(doc);
|
|
367
|
+
const seo = extractSEO(doc); // Fast: <5ms
|
|
368
|
+
const og = extractOpenGraph(doc); // Fast: <5ms
|
|
342
369
|
const content = extractContent(doc); // ~100-500ms
|
|
343
370
|
|
|
344
371
|
// Total: One parse + all extractions
|
|
@@ -415,10 +442,18 @@ npm publish
|
|
|
415
442
|
|
|
416
443
|
The `prepublishOnly` script automatically builds the package before publishing.
|
|
417
444
|
|
|
418
|
-
|
|
445
|
+
---
|
|
446
|
+
|
|
447
|
+
<div align="center">
|
|
448
|
+
|
|
449
|
+
### Support
|
|
450
|
+
|
|
451
|
+
If this package helps your project, consider sponsoring its maintenance:
|
|
452
|
+
|
|
453
|
+
[Sponsor Anonyfox on GitHub](https://github.com/sponsors/Anonyfox)
|
|
419
454
|
|
|
420
|
-
|
|
455
|
+
---
|
|
421
456
|
|
|
422
|
-
|
|
457
|
+
**[Anonyfox](https://anonyfox.com) • [MIT License](LICENSE)**
|
|
423
458
|
|
|
424
|
-
|
|
459
|
+
</div>
|