magpie-html 0.1.5 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -2
- package/dist/index.cjs +33 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +33 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
|
|
|
454
454
|
* - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
|
|
455
455
|
* - `'atom'` - Atom 1.0
|
|
456
456
|
* - `'json-feed'` - JSON Feed 1.0 or 1.1
|
|
457
|
+
* - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
|
|
457
458
|
* - `'unknown'` - Format could not be determined
|
|
458
459
|
*/
|
|
459
|
-
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
|
|
460
|
+
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
|
|
460
461
|
/**
|
|
461
462
|
* Detect feed format from content string.
|
|
462
463
|
*
|
package/dist/index.d.ts
CHANGED
|
@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
|
|
|
454
454
|
* - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
|
|
455
455
|
* - `'atom'` - Atom 1.0
|
|
456
456
|
* - `'json-feed'` - JSON Feed 1.0 or 1.1
|
|
457
|
+
* - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
|
|
457
458
|
* - `'unknown'` - Format could not be determined
|
|
458
459
|
*/
|
|
459
|
-
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
|
|
460
|
+
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
|
|
460
461
|
/**
|
|
461
462
|
* Detect feed format from content string.
|
|
462
463
|
*
|
package/dist/index.js
CHANGED
|
@@ -1075,8 +1075,12 @@ function extractEntry(entryElement) {
|
|
|
1075
1075
|
function removeComments(xml) {
|
|
1076
1076
|
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1077
1077
|
}
|
|
1078
|
+
function removeDoctype(xml) {
|
|
1079
|
+
return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
|
|
1080
|
+
}
|
|
1078
1081
|
function parseAtomXML(xml) {
|
|
1079
|
-
const withoutComments = removeComments(xml);
|
|
1082
|
+
const withoutDoctype = removeDoctype(xml);
|
|
1083
|
+
const withoutComments = removeComments(withoutDoctype);
|
|
1080
1084
|
const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
|
|
1081
1085
|
const root = parseElement(cleanedXML, 0, null, cdataMap).element;
|
|
1082
1086
|
return root;
|
|
@@ -1544,6 +1548,9 @@ function detectFormat(content) {
|
|
|
1544
1548
|
if (cleaned.match(/<channel[\s>]/i)) {
|
|
1545
1549
|
return "rss";
|
|
1546
1550
|
}
|
|
1551
|
+
if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
|
|
1552
|
+
return "sitemap";
|
|
1553
|
+
}
|
|
1547
1554
|
return "unknown";
|
|
1548
1555
|
}
|
|
1549
1556
|
function isFeed(content) {
|
|
@@ -1881,12 +1888,20 @@ function parseRSSDate(dateString) {
|
|
|
1881
1888
|
// src/feed/rss/xml-parser.ts
|
|
1882
1889
|
function parseRSSXML(xml) {
|
|
1883
1890
|
const cleaned = cleanXMLDeclaration(xml);
|
|
1884
|
-
const root = parseElement2(cleaned, 0).element;
|
|
1891
|
+
const withoutDoctype = removeDoctype2(cleaned);
|
|
1892
|
+
const withoutComments = removeComments2(withoutDoctype);
|
|
1893
|
+
const root = parseElement2(withoutComments, 0).element;
|
|
1885
1894
|
return root;
|
|
1886
1895
|
}
|
|
1887
1896
|
function cleanXMLDeclaration(xml) {
|
|
1888
1897
|
return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
|
|
1889
1898
|
}
|
|
1899
|
+
function removeDoctype2(xml) {
|
|
1900
|
+
return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
|
|
1901
|
+
}
|
|
1902
|
+
function removeComments2(xml) {
|
|
1903
|
+
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1904
|
+
}
|
|
1890
1905
|
function extractCDATA2(text) {
|
|
1891
1906
|
const cdataMap = /* @__PURE__ */ new Map();
|
|
1892
1907
|
let counter = 0;
|
|
@@ -2324,6 +2339,11 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2324
2339
|
original: jsonFeed
|
|
2325
2340
|
};
|
|
2326
2341
|
}
|
|
2342
|
+
case "sitemap": {
|
|
2343
|
+
throw new Error(
|
|
2344
|
+
"Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
|
|
2345
|
+
);
|
|
2346
|
+
}
|
|
2327
2347
|
}
|
|
2328
2348
|
}
|
|
2329
2349
|
|
|
@@ -2405,6 +2425,10 @@ function detectEncoding(buffer, contentType) {
|
|
|
2405
2425
|
}
|
|
2406
2426
|
const preview = new Uint8Array(buffer.slice(0, 1024));
|
|
2407
2427
|
const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
|
|
2428
|
+
const xmlEncoding = parseCharsetFromXml(previewText);
|
|
2429
|
+
if (xmlEncoding) {
|
|
2430
|
+
return xmlEncoding;
|
|
2431
|
+
}
|
|
2408
2432
|
const metaEncoding = parseCharsetFromHtml(previewText);
|
|
2409
2433
|
if (metaEncoding) {
|
|
2410
2434
|
return metaEncoding;
|
|
@@ -2445,6 +2469,13 @@ function parseCharsetFromHtml(html) {
|
|
|
2445
2469
|
}
|
|
2446
2470
|
return null;
|
|
2447
2471
|
}
|
|
2472
|
+
function parseCharsetFromXml(xml) {
|
|
2473
|
+
const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
|
|
2474
|
+
if (xmlDeclMatch) {
|
|
2475
|
+
return normalizeEncoding(xmlDeclMatch[1]);
|
|
2476
|
+
}
|
|
2477
|
+
return null;
|
|
2478
|
+
}
|
|
2448
2479
|
function normalizeEncoding(encoding) {
|
|
2449
2480
|
const normalized = encoding.toLowerCase().trim();
|
|
2450
2481
|
const aliases = {
|