magpie-html 0.1.5 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
454
454
  * - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
455
455
  * - `'atom'` - Atom 1.0
456
456
  * - `'json-feed'` - JSON Feed 1.0 or 1.1
457
+ * - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
457
458
  * - `'unknown'` - Format could not be determined
458
459
  */
459
- type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
460
+ type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
460
461
  /**
461
462
  * Detect feed format from content string.
462
463
  *
package/dist/index.d.ts CHANGED
@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
454
454
  * - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
455
455
  * - `'atom'` - Atom 1.0
456
456
  * - `'json-feed'` - JSON Feed 1.0 or 1.1
457
+ * - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
457
458
  * - `'unknown'` - Format could not be determined
458
459
  */
459
- type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
460
+ type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
460
461
  /**
461
462
  * Detect feed format from content string.
462
463
  *
package/dist/index.js CHANGED
@@ -1075,8 +1075,12 @@ function extractEntry(entryElement) {
1075
1075
  function removeComments(xml) {
1076
1076
  return xml.replace(/<!--[\s\S]*?-->/g, "");
1077
1077
  }
1078
+ function removeDoctype(xml) {
1079
+ return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
1080
+ }
1078
1081
  function parseAtomXML(xml) {
1079
- const withoutComments = removeComments(xml);
1082
+ const withoutDoctype = removeDoctype(xml);
1083
+ const withoutComments = removeComments(withoutDoctype);
1080
1084
  const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
1081
1085
  const root = parseElement(cleanedXML, 0, null, cdataMap).element;
1082
1086
  return root;
@@ -1544,6 +1548,9 @@ function detectFormat(content) {
1544
1548
  if (cleaned.match(/<channel[\s>]/i)) {
1545
1549
  return "rss";
1546
1550
  }
1551
+ if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
1552
+ return "sitemap";
1553
+ }
1547
1554
  return "unknown";
1548
1555
  }
1549
1556
  function isFeed(content) {
@@ -1881,12 +1888,20 @@ function parseRSSDate(dateString) {
1881
1888
  // src/feed/rss/xml-parser.ts
1882
1889
  function parseRSSXML(xml) {
1883
1890
  const cleaned = cleanXMLDeclaration(xml);
1884
- const root = parseElement2(cleaned, 0).element;
1891
+ const withoutDoctype = removeDoctype2(cleaned);
1892
+ const withoutComments = removeComments2(withoutDoctype);
1893
+ const root = parseElement2(withoutComments, 0).element;
1885
1894
  return root;
1886
1895
  }
1887
1896
  function cleanXMLDeclaration(xml) {
1888
1897
  return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
1889
1898
  }
1899
+ function removeDoctype2(xml) {
1900
+ return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
1901
+ }
1902
+ function removeComments2(xml) {
1903
+ return xml.replace(/<!--[\s\S]*?-->/g, "");
1904
+ }
1890
1905
  function extractCDATA2(text) {
1891
1906
  const cdataMap = /* @__PURE__ */ new Map();
1892
1907
  let counter = 0;
@@ -2324,6 +2339,11 @@ function parseFeedAs(content, format, baseUrl) {
2324
2339
  original: jsonFeed
2325
2340
  };
2326
2341
  }
2342
+ case "sitemap": {
2343
+ throw new Error(
2344
+ "Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
2345
+ );
2346
+ }
2327
2347
  }
2328
2348
  }
2329
2349
 
@@ -2405,6 +2425,10 @@ function detectEncoding(buffer, contentType) {
2405
2425
  }
2406
2426
  const preview = new Uint8Array(buffer.slice(0, 1024));
2407
2427
  const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
2428
+ const xmlEncoding = parseCharsetFromXml(previewText);
2429
+ if (xmlEncoding) {
2430
+ return xmlEncoding;
2431
+ }
2408
2432
  const metaEncoding = parseCharsetFromHtml(previewText);
2409
2433
  if (metaEncoding) {
2410
2434
  return metaEncoding;
@@ -2445,6 +2469,13 @@ function parseCharsetFromHtml(html) {
2445
2469
  }
2446
2470
  return null;
2447
2471
  }
2472
+ function parseCharsetFromXml(xml) {
2473
+ const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
2474
+ if (xmlDeclMatch) {
2475
+ return normalizeEncoding(xmlDeclMatch[1]);
2476
+ }
2477
+ return null;
2478
+ }
2448
2479
  function normalizeEncoding(encoding) {
2449
2480
  const normalized = encoding.toLowerCase().trim();
2450
2481
  const aliases = {