magpie-html 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/dist/index.cjs +63 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +67 -22
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
@@ -7,11 +7,16 @@
 [](https://opensource.org/licenses/MIT)
 [](https://www.typescriptlang.org/)
 [](https://nodejs.org/)
+[](https://crispread.com)
 
 **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
 
 <div align="center">
   <img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
+
+  <br><br>
+
+  <strong>Production-ready</strong> · Powers <a href="https://crispread.com">CrispRead</a>, a trilingual news aggregator processing thousands of articles daily.
+
 </div>
 
 ## Features
@@ -454,6 +459,6 @@ If this package helps your project, consider sponsoring its maintenance:
 
 ---
 
-**[Anonyfox](https://anonyfox.com) • [MIT License](LICENSE)**
+**[Anonyfox](https://anonyfox.com) • [API Docs](https://anonyfox.github.io/magpie-html) • [MIT License](LICENSE)**
 
 </div>
package/dist/index.cjs
CHANGED
@@ -838,6 +838,32 @@ function parseAtomDate(dateString) {
 }
 
 // src/feed/atom/extract-entry.ts
+function extractAtomDate(element) {
+  let dateText = element.querySelector("updated")?.textContent;
+  if (dateText) {
+    const parsed = parseAtomDate(dateText);
+    if (parsed) return parsed;
+  }
+  dateText = element.querySelector("modified")?.textContent;
+  if (dateText) {
+    const parsed = parseAtomDate(dateText);
+    if (parsed) return parsed;
+  }
+  dateText = element.querySelector("issued")?.textContent;
+  if (dateText) {
+    const parsed = parseAtomDate(dateText);
+    if (parsed) return parsed;
+  }
+  const dcDateElements = element.children.filter((child) => child.tagName === "dc:date");
+  if (dcDateElements.length > 0) {
+    dateText = dcDateElements[0].textContent;
+    if (dateText) {
+      const parsed = parseAtomDate(dateText);
+      if (parsed) return parsed;
+    }
+  }
+  return null;
+}
 function extractPerson(element) {
   const name = element.querySelector("name")?.textContent;
   if (!name) {
@@ -980,13 +1006,11 @@ function extractEntry(entryElement) {
   if (!title) {
     throw new Error("Invalid Atom entry: missing required <title> element");
   }
-  const updatedRaw = entryElement.querySelector("updated")?.textContent;
-  if (!updatedRaw) {
-    throw new Error("Invalid Atom entry: missing required <updated> element");
-  }
-  const updated = parseAtomDate(updatedRaw);
+  const updated = extractAtomDate(entryElement);
   if (!updated) {
-    throw new Error(
+    throw new Error(
+      "Invalid Atom entry: missing or invalid date (tried <updated>, <modified>, <issued>, <dc:date>)"
+    );
   }
   const entry = {
     id: cleanText(id),
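
For orientation: with this change an Atom entry (and, below, the feed itself) no longer needs a literal `<updated>` element; the extractor falls back to `<modified>`, `<issued>`, and finally Dublin Core `<dc:date>` before giving up. The snippet that follows is a standalone sketch of that fallback order, not the package's own code: `pickEntryDate` is a hypothetical name and `new Date()` stands in for the bundled `parseAtomDate()`.

```js
// Standalone illustration (hypothetical names) of the new Atom date fallback.
const ATOM_DATE_FALLBACKS = ["updated", "modified", "issued", "dc:date"];

function pickEntryDate(entryFields) {
  for (const name of ATOM_DATE_FALLBACKS) {
    const text = entryFields[name];
    if (!text) continue;
    const parsed = new Date(text); // stand-in for the bundled parseAtomDate()
    if (!Number.isNaN(parsed.getTime())) return parsed;
  }
  return null; // the extractor still throws if nothing usable is found
}

// An entry carrying only a Dublin Core date now yields a date instead of
// failing with "missing required <updated> element".
console.log(pickEntryDate({ "dc:date": "2024-05-01T08:00:00Z" }));
```
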
@@ -1236,6 +1260,32 @@ function parseXML(xml) {
 }
 
 // src/feed/atom/extract-feed.ts
+function extractAtomDate2(element) {
+  let dateText = element.querySelector("updated")?.textContent;
+  if (dateText) {
+    const parsed = parseAtomDate(dateText);
+    if (parsed) return parsed;
+  }
+  dateText = element.querySelector("modified")?.textContent;
+  if (dateText) {
+    const parsed = parseAtomDate(dateText);
+    if (parsed) return parsed;
+  }
+  dateText = element.querySelector("issued")?.textContent;
+  if (dateText) {
+    const parsed = parseAtomDate(dateText);
+    if (parsed) return parsed;
+  }
+  const dcDateElements = element.children.filter((child) => child.tagName === "dc:date");
+  if (dcDateElements.length > 0) {
+    dateText = dcDateElements[0].textContent;
+    if (dateText) {
+      const parsed = parseAtomDate(dateText);
+      if (parsed) return parsed;
+    }
+  }
+  return null;
+}
 function extractPerson2(element) {
   const name = element.querySelector("name")?.textContent;
   if (!name) {
@@ -1383,13 +1433,11 @@ function extractFeed(xml) {
   if (!title) {
     throw new Error("Invalid Atom feed: missing required <title> element");
   }
-  const updatedRaw = feed.querySelector("updated")?.textContent;
-  if (!updatedRaw) {
-    throw new Error("Invalid Atom feed: missing required <updated> element");
-  }
-  const updated = parseAtomDate(updatedRaw);
+  const updated = extractAtomDate2(feed);
   if (!updated) {
-    throw new Error(
+    throw new Error(
+      "Invalid Atom feed: missing or invalid date (tried <updated>, <modified>, <issued>, <dc:date>)"
+    );
   }
   const result = {
     id: cleanText(id),
@@ -2451,13 +2499,12 @@ async function pluck(input, init) {
   const startTime = Date.now();
   const options = normalizeOptions2(init);
   const originalUrl = typeof input === "string" || input instanceof URL ? String(input) : input.url;
-  const abortController = new AbortController();
-  const timeoutId = setTimeout(() => abortController.abort(), options.timeout);
+  const signal = AbortSignal.timeout(options.timeout);
   try {
     const { response, redirectChain, redirectDuration } = await followRedirects(
       input,
       options,
-      abortController.signal
+      signal
     );
     const finalUrl = response.url;
     if (options.throwOnHttpError && !response.ok) {
@@ -2494,15 +2541,13 @@ async function pluck(input, init) {
     if (error instanceof PluckTimeoutError || error instanceof PluckNetworkError) {
       throw error;
     }
-    if (error.name === "AbortError") {
+    if (error.name === "TimeoutError") {
       throw new PluckTimeoutError(`Request timeout after ${options.timeout}ms`, options.timeout);
     }
     if (error instanceof TypeError) {
       throw new PluckNetworkError(`Network error: ${error.message}`, error);
     }
     throw error;
-  } finally {
-    clearTimeout(timeoutId);
   }
 }
 function normalizeOptions2(init) {
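
For reference, `AbortSignal.timeout()` returns a signal that aborts on its own with a `DOMException` named `"TimeoutError"` (a manual `AbortController.abort()` raises `"AbortError"` instead), which is why the catch branch above now matches `error.name === "TimeoutError"` and the `finally { clearTimeout(...) }` cleanup disappears. Below is a minimal standalone sketch of the same pattern against plain `fetch`, assuming a runtime that propagates the signal's abort reason (recent Node or any modern browser); `fetchWithTimeout` is a hypothetical helper, not part of magpie-html.

```js
// Hypothetical helper mirroring the timeout pattern adopted in 0.1.5:
// AbortSignal.timeout() replaces manual AbortController + setTimeout bookkeeping.
async function fetchWithTimeout(url, timeoutMs) {
  // The signal aborts itself after timeoutMs, so no clearTimeout is needed.
  const signal = AbortSignal.timeout(timeoutMs);
  try {
    return await fetch(url, { signal });
  } catch (error) {
    if (error.name === "TimeoutError") {
      throw new Error(`Request timed out after ${timeoutMs}ms`);
    }
    throw error; // other failures (DNS, TLS, ...) propagate unchanged
  }
}

// Example: fetchWithTimeout("https://example.com", 5000).then((res) => res.status);
```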