magpie-html 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -81
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,6 +1,18 @@
|
|
|
1
1
|
# Magpie HTML 🦅
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[npm version](https://www.npmjs.com/package/magpie-html)
|
|
4
|
+
[npm downloads](https://www.npmjs.com/package/magpie-html)
|
|
5
|
+
[CI](https://github.com/Anonyfox/magpie-html/actions/workflows/ci.yml)
|
|
6
|
+
[Documentation](https://anonyfox.github.io/magpie-html)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
9
|
+
[Node.js](https://nodejs.org/)
|
|
10
|
+
|
|
11
|
+
**Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
|
|
12
|
+
|
|
13
|
+
<div align="center">
|
|
14
|
+
<img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
|
|
15
|
+
</div>
|
|
4
16
|
|
|
5
17
|
## Features
|
|
6
18
|
|
|
@@ -23,27 +35,27 @@ npm install magpie-html
|
|
|
23
35
|
## Quick Start
|
|
24
36
|
|
|
25
37
|
```typescript
|
|
26
|
-
import { gatherWebsite, gatherArticle, gatherFeed } from 'magpie-html';
|
|
38
|
+
import { gatherWebsite, gatherArticle, gatherFeed } from "magpie-html";
|
|
27
39
|
|
|
28
40
|
// Gather complete website metadata
|
|
29
|
-
const site = await gatherWebsite('https://example.com');
|
|
30
|
-
console.log(site.title);
|
|
31
|
-
console.log(site.description);
|
|
32
|
-
console.log(site.image);
|
|
33
|
-
console.log(site.feeds);
|
|
34
|
-
console.log(site.internalLinks);
|
|
41
|
+
const site = await gatherWebsite("https://example.com");
|
|
42
|
+
console.log(site.title); // Page title
|
|
43
|
+
console.log(site.description); // Meta description
|
|
44
|
+
console.log(site.image); // Featured image
|
|
45
|
+
console.log(site.feeds); // Discovered feeds
|
|
46
|
+
console.log(site.internalLinks); // Internal links
|
|
35
47
|
|
|
36
48
|
// Gather article content + metadata
|
|
37
|
-
const article = await gatherArticle('https://example.com/article');
|
|
38
|
-
console.log(article.title);
|
|
39
|
-
console.log(article.content);
|
|
40
|
-
console.log(article.wordCount);
|
|
49
|
+
const article = await gatherArticle("https://example.com/article");
|
|
50
|
+
console.log(article.title); // Article title
|
|
51
|
+
console.log(article.content); // Clean article text
|
|
52
|
+
console.log(article.wordCount); // Word count
|
|
41
53
|
console.log(article.readingTime); // Reading time in minutes
|
|
42
54
|
|
|
43
55
|
// Gather feed data
|
|
44
|
-
const feed = await gatherFeed('https://example.com/feed.xml');
|
|
45
|
-
console.log(feed.title);
|
|
46
|
-
console.log(feed.items);
|
|
56
|
+
const feed = await gatherFeed("https://example.com/feed.xml");
|
|
57
|
+
console.log(feed.title); // Feed title
|
|
58
|
+
console.log(feed.items); // Feed items
|
|
47
59
|
```
|
|
48
60
|
|
|
49
61
|
## Usage
|
|
@@ -53,32 +65,33 @@ console.log(feed.items); // Feed items
|
|
|
53
65
|
Extract comprehensive metadata from any webpage:
|
|
54
66
|
|
|
55
67
|
```typescript
|
|
56
|
-
import { gatherWebsite } from 'magpie-html';
|
|
68
|
+
import { gatherWebsite } from "magpie-html";
|
|
57
69
|
|
|
58
|
-
const site = await gatherWebsite('https://example.com');
|
|
70
|
+
const site = await gatherWebsite("https://example.com");
|
|
59
71
|
|
|
60
72
|
// Basic metadata
|
|
61
|
-
console.log(site.url);
|
|
62
|
-
console.log(site.title);
|
|
63
|
-
console.log(site.description);
|
|
64
|
-
console.log(site.image);
|
|
65
|
-
console.log(site.icon);
|
|
73
|
+
console.log(site.url); // Final URL (after redirects)
|
|
74
|
+
console.log(site.title); // Best title (cleaned)
|
|
75
|
+
console.log(site.description); // Meta description
|
|
76
|
+
console.log(site.image); // Featured image URL
|
|
77
|
+
console.log(site.icon); // Site favicon/icon
|
|
66
78
|
|
|
67
79
|
// Language & region
|
|
68
|
-
console.log(site.language);
|
|
69
|
-
console.log(site.region);
|
|
80
|
+
console.log(site.language); // ISO 639-1 code (e.g., 'en')
|
|
81
|
+
console.log(site.region); // ISO 3166-1 alpha-2 (e.g., 'US')
|
|
70
82
|
|
|
71
83
|
// Discovered content
|
|
72
|
-
console.log(site.feeds);
|
|
73
|
-
console.log(site.internalLinks);
|
|
74
|
-
console.log(site.externalLinks);
|
|
84
|
+
console.log(site.feeds); // Array of feed URLs
|
|
85
|
+
console.log(site.internalLinks); // Internal links (same domain)
|
|
86
|
+
console.log(site.externalLinks); // External links (other domains)
|
|
75
87
|
|
|
76
88
|
// Raw content
|
|
77
|
-
console.log(site.html);
|
|
78
|
-
console.log(site.text);
|
|
89
|
+
console.log(site.html); // Raw HTML
|
|
90
|
+
console.log(site.text); // Plain text (full page)
|
|
79
91
|
```
|
|
80
92
|
|
|
81
93
|
**What it does:**
|
|
94
|
+
|
|
82
95
|
- Fetches the page with automatic redirect handling
|
|
83
96
|
- Extracts metadata from multiple sources (OpenGraph, Schema.org, Twitter Card, etc.)
|
|
84
97
|
- Picks the "best" value for each field (longest, highest priority, cleaned)
|
|
@@ -91,33 +104,34 @@ console.log(site.text); // Plain text (full page)
|
|
|
91
104
|
Extract clean article content with metadata:
|
|
92
105
|
|
|
93
106
|
```typescript
|
|
94
|
-
import { gatherArticle } from 'magpie-html';
|
|
107
|
+
import { gatherArticle } from "magpie-html";
|
|
95
108
|
|
|
96
|
-
const article = await gatherArticle('https://example.com/article');
|
|
109
|
+
const article = await gatherArticle("https://example.com/article");
|
|
97
110
|
|
|
98
111
|
// Core content
|
|
99
|
-
console.log(article.url);
|
|
100
|
-
console.log(article.title);
|
|
101
|
-
console.log(article.content);
|
|
102
|
-
console.log(article.description);
|
|
112
|
+
console.log(article.url); // Final URL
|
|
113
|
+
console.log(article.title); // Article title (Readability or metadata)
|
|
114
|
+
console.log(article.content); // Clean article text (formatted)
|
|
115
|
+
console.log(article.description); // Excerpt/summary
|
|
103
116
|
|
|
104
117
|
// Metrics
|
|
105
|
-
console.log(article.wordCount);
|
|
106
|
-
console.log(article.readingTime);
|
|
118
|
+
console.log(article.wordCount); // Word count
|
|
119
|
+
console.log(article.readingTime); // Est. reading time (minutes)
|
|
107
120
|
|
|
108
121
|
// Media & language
|
|
109
|
-
console.log(article.image);
|
|
110
|
-
console.log(article.language);
|
|
111
|
-
console.log(article.region);
|
|
122
|
+
console.log(article.image); // Article image
|
|
123
|
+
console.log(article.language); // Language code
|
|
124
|
+
console.log(article.region); // Region code
|
|
112
125
|
|
|
113
126
|
// Links & raw content
|
|
114
127
|
console.log(article.internalLinks); // Internal links
|
|
115
128
|
console.log(article.externalLinks); // External links (citations)
|
|
116
|
-
console.log(article.html);
|
|
117
|
-
console.log(article.text);
|
|
129
|
+
console.log(article.html); // Raw HTML
|
|
130
|
+
console.log(article.text); // Plain text (full page)
|
|
118
131
|
```
|
|
119
132
|
|
|
120
133
|
**What it does:**
|
|
134
|
+
|
|
121
135
|
- Uses Mozilla Readability to extract clean article content
|
|
122
136
|
- Falls back to metadata extraction if Readability fails
|
|
123
137
|
- Converts cleaned HTML to well-formatted plain text
|
|
@@ -129,30 +143,31 @@ console.log(article.text); // Plain text (full page)
|
|
|
129
143
|
Parse any feed format with one function:
|
|
130
144
|
|
|
131
145
|
```typescript
|
|
132
|
-
import { gatherFeed } from 'magpie-html';
|
|
146
|
+
import { gatherFeed } from "magpie-html";
|
|
133
147
|
|
|
134
|
-
const feed = await gatherFeed('https://example.com/feed.xml');
|
|
148
|
+
const feed = await gatherFeed("https://example.com/feed.xml");
|
|
135
149
|
|
|
136
150
|
// Feed metadata
|
|
137
|
-
console.log(feed.title);
|
|
138
|
-
console.log(feed.description);
|
|
139
|
-
console.log(feed.url);
|
|
140
|
-
console.log(feed.siteUrl);
|
|
151
|
+
console.log(feed.title); // Feed title
|
|
152
|
+
console.log(feed.description); // Feed description
|
|
153
|
+
console.log(feed.url); // Feed URL
|
|
154
|
+
console.log(feed.siteUrl); // Website URL
|
|
141
155
|
|
|
142
156
|
// Feed items
|
|
143
157
|
for (const item of feed.items) {
|
|
144
|
-
console.log(item.title);
|
|
145
|
-
console.log(item.url);
|
|
146
|
-
console.log(item.description);
|
|
147
|
-
console.log(item.publishedAt);
|
|
148
|
-
console.log(item.author);
|
|
158
|
+
console.log(item.title); // Item title
|
|
159
|
+
console.log(item.url); // Item URL (absolute)
|
|
160
|
+
console.log(item.description); // Item description
|
|
161
|
+
console.log(item.publishedAt); // Publication date
|
|
162
|
+
console.log(item.author); // Author
|
|
149
163
|
}
|
|
150
164
|
|
|
151
165
|
// Format detection
|
|
152
|
-
console.log(feed.format);
|
|
166
|
+
console.log(feed.format); // 'rss', 'atom', or 'json-feed'
|
|
153
167
|
```
|
|
154
168
|
|
|
155
169
|
**What it does:**
|
|
170
|
+
|
|
156
171
|
- Auto-detects feed format (RSS 2.0, Atom 1.0, JSON Feed)
|
|
157
172
|
- Normalizes all formats to a unified interface
|
|
158
173
|
- Resolves relative URLs to absolute
|
|
@@ -165,10 +180,10 @@ For more control, use the lower-level modules directly:
|
|
|
165
180
|
### Feed Parsing
|
|
166
181
|
|
|
167
182
|
```typescript
|
|
168
|
-
import { pluck, parseFeed } from 'magpie-html';
|
|
183
|
+
import { pluck, parseFeed } from "magpie-html";
|
|
169
184
|
|
|
170
185
|
// Fetch feed content
|
|
171
|
-
const response = await pluck('https://example.com/feed.xml');
|
|
186
|
+
const response = await pluck("https://example.com/feed.xml");
|
|
172
187
|
const feedContent = await response.textUtf8();
|
|
173
188
|
|
|
174
189
|
// Parse with base URL for relative links
|
|
@@ -182,25 +197,25 @@ console.log(result.feed.format); // 'rss', 'atom', or 'json-feed'
|
|
|
182
197
|
### Content Extraction
|
|
183
198
|
|
|
184
199
|
```typescript
|
|
185
|
-
import { parseHTML, extractContent, htmlToText } from 'magpie-html';
|
|
200
|
+
import { parseHTML, extractContent, htmlToText } from "magpie-html";
|
|
186
201
|
|
|
187
202
|
// Parse HTML once
|
|
188
203
|
const doc = parseHTML(html);
|
|
189
204
|
|
|
190
205
|
// Extract article with Readability
|
|
191
206
|
const result = extractContent(doc, {
|
|
192
|
-
baseUrl: 'https://example.com/article',
|
|
207
|
+
baseUrl: "https://example.com/article",
|
|
193
208
|
cleanConditionally: true,
|
|
194
209
|
keepClasses: false,
|
|
195
210
|
});
|
|
196
211
|
|
|
197
212
|
if (result.success) {
|
|
198
|
-
console.log(result.title);
|
|
199
|
-
console.log(result.excerpt);
|
|
200
|
-
console.log(result.content);
|
|
201
|
-
console.log(result.textContent);
|
|
202
|
-
console.log(result.wordCount);
|
|
203
|
-
console.log(result.readingTime);
|
|
213
|
+
console.log(result.title); // Article title
|
|
214
|
+
console.log(result.excerpt); // Article excerpt
|
|
215
|
+
console.log(result.content); // Clean HTML
|
|
216
|
+
console.log(result.textContent); // Plain text
|
|
217
|
+
console.log(result.wordCount); // Word count
|
|
218
|
+
console.log(result.readingTime); // Reading time
|
|
204
219
|
}
|
|
205
220
|
|
|
206
221
|
// Or convert any HTML to text
|
|
@@ -214,7 +229,12 @@ const plainText = htmlToText(html, {
|
|
|
214
229
|
### Metadata Extraction
|
|
215
230
|
|
|
216
231
|
```typescript
|
|
217
|
-
import {
|
|
232
|
+
import {
|
|
233
|
+
parseHTML,
|
|
234
|
+
extractOpenGraph,
|
|
235
|
+
extractSchemaOrg,
|
|
236
|
+
extractSEO,
|
|
237
|
+
} from "magpie-html";
|
|
218
238
|
|
|
219
239
|
const doc = parseHTML(html);
|
|
220
240
|
|
|
@@ -236,6 +256,7 @@ console.log(seo.keywords);
|
|
|
236
256
|
```
|
|
237
257
|
|
|
238
258
|
**Available extractors:**
|
|
259
|
+
|
|
239
260
|
- `extractSEO` - SEO meta tags
|
|
240
261
|
- `extractOpenGraph` - OpenGraph metadata
|
|
241
262
|
- `extractTwitterCard` - Twitter Card metadata
|
|
@@ -253,28 +274,29 @@ console.log(seo.keywords);
|
|
|
253
274
|
Use `pluck()` for robust fetching with automatic encoding and redirect handling:
|
|
254
275
|
|
|
255
276
|
```typescript
|
|
256
|
-
import { pluck } from 'magpie-html';
|
|
277
|
+
import { pluck } from "magpie-html";
|
|
257
278
|
|
|
258
|
-
const response = await pluck('https://example.com', {
|
|
259
|
-
timeout: 30000,
|
|
260
|
-
maxRedirects: 10,
|
|
261
|
-
maxSize: 10485760,
|
|
262
|
-
userAgent: 'MyBot/1.0',
|
|
279
|
+
const response = await pluck("https://example.com", {
|
|
280
|
+
timeout: 30000, // 30 second timeout
|
|
281
|
+
maxRedirects: 10, // Follow up to 10 redirects
|
|
282
|
+
maxSize: 10485760, // 10MB limit
|
|
283
|
+
userAgent: "MyBot/1.0",
|
|
263
284
|
throwOnHttpError: true,
|
|
264
285
|
strictContentType: false,
|
|
265
286
|
});
|
|
266
287
|
|
|
267
288
|
// Enhanced response properties
|
|
268
|
-
console.log(response.finalUrl);
|
|
269
|
-
console.log(response.redirectChain);
|
|
289
|
+
console.log(response.finalUrl); // URL after redirects
|
|
290
|
+
console.log(response.redirectChain); // All redirect URLs
|
|
270
291
|
console.log(response.detectedEncoding); // Detected charset
|
|
271
|
-
console.log(response.timing);
|
|
292
|
+
console.log(response.timing); // Request timing
|
|
272
293
|
|
|
273
294
|
// Get UTF-8 decoded content
|
|
274
295
|
const text = await response.textUtf8();
|
|
275
296
|
```
|
|
276
297
|
|
|
277
298
|
**Why `pluck()`?**
|
|
299
|
+
|
|
278
300
|
- Handles broken sites with wrong/missing encoding declarations
|
|
279
301
|
- Follows redirect chains and tracks them
|
|
280
302
|
- Enforces timeouts and size limits
|
|
@@ -332,13 +354,18 @@ See [TypeDoc documentation](https://anonyfox.github.io/magpie-html) for complete
|
|
|
332
354
|
**Best Practice:** Parse HTML once and reuse the document:
|
|
333
355
|
|
|
334
356
|
```typescript
|
|
335
|
-
import {
|
|
357
|
+
import {
|
|
358
|
+
parseHTML,
|
|
359
|
+
extractSEO,
|
|
360
|
+
extractOpenGraph,
|
|
361
|
+
extractContent,
|
|
362
|
+
} from "magpie-html";
|
|
336
363
|
|
|
337
364
|
const doc = parseHTML(html);
|
|
338
365
|
|
|
339
366
|
// Reuse the same document for multiple extractions
|
|
340
|
-
const seo = extractSEO(doc);
|
|
341
|
-
const og = extractOpenGraph(doc);
|
|
367
|
+
const seo = extractSEO(doc); // Fast: <5ms
|
|
368
|
+
const og = extractOpenGraph(doc); // Fast: <5ms
|
|
342
369
|
const content = extractContent(doc); // ~100-500ms
|
|
343
370
|
|
|
344
371
|
// Total: One parse + all extractions
|
|
@@ -415,10 +442,18 @@ npm publish
|
|
|
415
442
|
|
|
416
443
|
The `prepublishOnly` script automatically builds the package before publishing.
|
|
417
444
|
|
|
418
|
-
|
|
445
|
+
---
|
|
446
|
+
|
|
447
|
+
<div align="center">
|
|
448
|
+
|
|
449
|
+
### Support
|
|
450
|
+
|
|
451
|
+
If this package helps your project, consider sponsoring its maintenance:
|
|
452
|
+
|
|
453
|
+
[Sponsor on GitHub](https://github.com/sponsors/Anonyfox)
|
|
419
454
|
|
|
420
|
-
|
|
455
|
+
---
|
|
421
456
|
|
|
422
|
-
|
|
457
|
+
**[Anonyfox](https://anonyfox.com) • [MIT License](LICENSE)**
|
|
423
458
|
|
|
424
|
-
|
|
459
|
+
</div>
|