magpie-html 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,11 +7,16 @@
7
7
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg?style=flat-square)](https://opensource.org/licenses/MIT)
8
8
  [![TypeScript](https://img.shields.io/badge/TypeScript-5.7-blue?style=flat-square&logo=typescript)](https://www.typescriptlang.org/)
9
9
  [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D18-green?style=flat-square&logo=node.js)](https://nodejs.org/)
10
+ [![Live Demo](https://img.shields.io/badge/Live_Demo-CrispRead-eb6864?style=flat-square&logo=rss&logoColor=white)](https://crispread.com)
10
11
 
11
12
  **Modern web scraping for when you need the good parts, not the markup soup.** Extracts clean article content, parses feeds (RSS, Atom, JSON), and gathers metadata from any page. Handles broken encodings, malformed feeds, and the chaos of real-world HTML. TypeScript-native, works everywhere. Named after the bird known for collecting valuable things... you get the idea.
12
13
 
13
14
  <div align="center">
14
15
  <img src="https://raw.githubusercontent.com/Anonyfox/magpie-html/main/assets/magpie-html-logo.png" alt="Magpie HTML Logo" width="300">
16
+
17
+ <br><br>
18
+ <strong>Production-ready</strong> · Powers <a href="https://crispread.com">CrispRead</a>, a trilingual news aggregator processing thousands of articles daily.
19
+
15
20
  </div>
16
21
 
17
22
  ## Features
@@ -454,6 +459,6 @@ If this package helps your project, consider sponsoring its maintenance:
454
459
 
455
460
  ---
456
461
 
457
- **[Anonyfox](https://anonyfox.com) • [MIT License](LICENSE)**
462
+ **[Anonyfox](https://anonyfox.com) • [API Docs](https://anonyfox.github.io/magpie-html) • [MIT License](LICENSE)**
458
463
 
459
464
  </div>
package/dist/index.cjs CHANGED
@@ -838,6 +838,32 @@ function parseAtomDate(dateString) {
838
838
  }
839
839
 
840
840
  // src/feed/atom/extract-entry.ts
841
+ function extractAtomDate(element) {
842
+ let dateText = element.querySelector("updated")?.textContent;
843
+ if (dateText) {
844
+ const parsed = parseAtomDate(dateText);
845
+ if (parsed) return parsed;
846
+ }
847
+ dateText = element.querySelector("modified")?.textContent;
848
+ if (dateText) {
849
+ const parsed = parseAtomDate(dateText);
850
+ if (parsed) return parsed;
851
+ }
852
+ dateText = element.querySelector("issued")?.textContent;
853
+ if (dateText) {
854
+ const parsed = parseAtomDate(dateText);
855
+ if (parsed) return parsed;
856
+ }
857
+ const dcDateElements = element.children.filter((child) => child.tagName === "dc:date");
858
+ if (dcDateElements.length > 0) {
859
+ dateText = dcDateElements[0].textContent;
860
+ if (dateText) {
861
+ const parsed = parseAtomDate(dateText);
862
+ if (parsed) return parsed;
863
+ }
864
+ }
865
+ return null;
866
+ }
841
867
  function extractPerson(element) {
842
868
  const name = element.querySelector("name")?.textContent;
843
869
  if (!name) {
@@ -980,13 +1006,11 @@ function extractEntry(entryElement) {
980
1006
  if (!title) {
981
1007
  throw new Error("Invalid Atom entry: missing required <title> element");
982
1008
  }
983
- const updatedRaw = entryElement.querySelector("updated")?.textContent;
984
- if (!updatedRaw) {
985
- throw new Error("Invalid Atom entry: missing required <updated> element");
986
- }
987
- const updated = parseAtomDate(updatedRaw);
1009
+ const updated = extractAtomDate(entryElement);
988
1010
  if (!updated) {
989
- throw new Error("Invalid Atom entry: invalid <updated> date");
1011
+ throw new Error(
1012
+ "Invalid Atom entry: missing or invalid date (tried <updated>, <modified>, <issued>, <dc:date>)"
1013
+ );
990
1014
  }
991
1015
  const entry = {
992
1016
  id: cleanText(id),
@@ -1236,6 +1260,32 @@ function parseXML(xml) {
1236
1260
  }
1237
1261
 
1238
1262
  // src/feed/atom/extract-feed.ts
1263
+ function extractAtomDate2(element) {
1264
+ let dateText = element.querySelector("updated")?.textContent;
1265
+ if (dateText) {
1266
+ const parsed = parseAtomDate(dateText);
1267
+ if (parsed) return parsed;
1268
+ }
1269
+ dateText = element.querySelector("modified")?.textContent;
1270
+ if (dateText) {
1271
+ const parsed = parseAtomDate(dateText);
1272
+ if (parsed) return parsed;
1273
+ }
1274
+ dateText = element.querySelector("issued")?.textContent;
1275
+ if (dateText) {
1276
+ const parsed = parseAtomDate(dateText);
1277
+ if (parsed) return parsed;
1278
+ }
1279
+ const dcDateElements = element.children.filter((child) => child.tagName === "dc:date");
1280
+ if (dcDateElements.length > 0) {
1281
+ dateText = dcDateElements[0].textContent;
1282
+ if (dateText) {
1283
+ const parsed = parseAtomDate(dateText);
1284
+ if (parsed) return parsed;
1285
+ }
1286
+ }
1287
+ return null;
1288
+ }
1239
1289
  function extractPerson2(element) {
1240
1290
  const name = element.querySelector("name")?.textContent;
1241
1291
  if (!name) {
@@ -1383,13 +1433,11 @@ function extractFeed(xml) {
1383
1433
  if (!title) {
1384
1434
  throw new Error("Invalid Atom feed: missing required <title> element");
1385
1435
  }
1386
- const updatedRaw = feed.querySelector("updated")?.textContent;
1387
- if (!updatedRaw) {
1388
- throw new Error("Invalid Atom feed: missing required <updated> element");
1389
- }
1390
- const updated = parseAtomDate(updatedRaw);
1436
+ const updated = extractAtomDate2(feed);
1391
1437
  if (!updated) {
1392
- throw new Error("Invalid Atom feed: invalid <updated> date");
1438
+ throw new Error(
1439
+ "Invalid Atom feed: missing or invalid date (tried <updated>, <modified>, <issued>, <dc:date>)"
1440
+ );
1393
1441
  }
1394
1442
  const result = {
1395
1443
  id: cleanText(id),
@@ -2451,13 +2499,12 @@ async function pluck(input, init) {
2451
2499
  const startTime = Date.now();
2452
2500
  const options = normalizeOptions2(init);
2453
2501
  const originalUrl = typeof input === "string" || input instanceof URL ? String(input) : input.url;
2454
- const abortController = new AbortController();
2455
- const timeoutId = setTimeout(() => abortController.abort(), options.timeout);
2502
+ const signal = AbortSignal.timeout(options.timeout);
2456
2503
  try {
2457
2504
  const { response, redirectChain, redirectDuration } = await followRedirects(
2458
2505
  input,
2459
2506
  options,
2460
- abortController.signal
2507
+ signal
2461
2508
  );
2462
2509
  const finalUrl = response.url;
2463
2510
  if (options.throwOnHttpError && !response.ok) {
@@ -2494,15 +2541,13 @@ async function pluck(input, init) {
2494
2541
  if (error instanceof PluckTimeoutError || error instanceof PluckNetworkError) {
2495
2542
  throw error;
2496
2543
  }
2497
- if (error.name === "AbortError") {
2544
+ if (error.name === "TimeoutError") {
2498
2545
  throw new PluckTimeoutError(`Request timeout after ${options.timeout}ms`, options.timeout);
2499
2546
  }
2500
2547
  if (error instanceof TypeError) {
2501
2548
  throw new PluckNetworkError(`Network error: ${error.message}`, error);
2502
2549
  }
2503
2550
  throw error;
2504
- } finally {
2505
- clearTimeout(timeoutId);
2506
2551
  }
2507
2552
  }
2508
2553
  function normalizeOptions2(init) {