magpie-html 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -2
- package/dist/index.cjs +24 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +24 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
|
|
|
454
454
|
* - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
|
|
455
455
|
* - `'atom'` - Atom 1.0
|
|
456
456
|
* - `'json-feed'` - JSON Feed 1.0 or 1.1
|
|
457
|
+
* - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
|
|
457
458
|
* - `'unknown'` - Format could not be determined
|
|
458
459
|
*/
|
|
459
|
-
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
|
|
460
|
+
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
|
|
460
461
|
/**
|
|
461
462
|
* Detect feed format from content string.
|
|
462
463
|
*
|
package/dist/index.d.ts
CHANGED
|
@@ -454,9 +454,10 @@ declare function isProbablyReaderable(doc: Document, options?: {
|
|
|
454
454
|
* - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
|
|
455
455
|
* - `'atom'` - Atom 1.0
|
|
456
456
|
* - `'json-feed'` - JSON Feed 1.0 or 1.1
|
|
457
|
+
* - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
|
|
457
458
|
* - `'unknown'` - Format could not be determined
|
|
458
459
|
*/
|
|
459
|
-
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'unknown';
|
|
460
|
+
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
|
|
460
461
|
/**
|
|
461
462
|
* Detect feed format from content string.
|
|
462
463
|
*
|
package/dist/index.js
CHANGED
|
@@ -1544,6 +1544,9 @@ function detectFormat(content) {
|
|
|
1544
1544
|
if (cleaned.match(/<channel[\s>]/i)) {
|
|
1545
1545
|
return "rss";
|
|
1546
1546
|
}
|
|
1547
|
+
if ((cleaned.match(/<urlset[\s>]/i) || cleaned.match(/<sitemapindex[\s>]/i)) && cleaned.includes("sitemaps.org")) {
|
|
1548
|
+
return "sitemap";
|
|
1549
|
+
}
|
|
1547
1550
|
return "unknown";
|
|
1548
1551
|
}
|
|
1549
1552
|
function isFeed(content) {
|
|
@@ -1881,12 +1884,16 @@ function parseRSSDate(dateString) {
|
|
|
1881
1884
|
// src/feed/rss/xml-parser.ts
|
|
1882
1885
|
function parseRSSXML(xml) {
|
|
1883
1886
|
const cleaned = cleanXMLDeclaration(xml);
|
|
1884
|
-
const
|
|
1887
|
+
const withoutComments = removeComments2(cleaned);
|
|
1888
|
+
const root = parseElement2(withoutComments, 0).element;
|
|
1885
1889
|
return root;
|
|
1886
1890
|
}
|
|
1887
1891
|
function cleanXMLDeclaration(xml) {
|
|
1888
1892
|
return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
|
|
1889
1893
|
}
|
|
1894
|
+
function removeComments2(xml) {
|
|
1895
|
+
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1896
|
+
}
|
|
1890
1897
|
function extractCDATA2(text) {
|
|
1891
1898
|
const cdataMap = /* @__PURE__ */ new Map();
|
|
1892
1899
|
let counter = 0;
|
|
@@ -2324,6 +2331,11 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2324
2331
|
original: jsonFeed
|
|
2325
2332
|
};
|
|
2326
2333
|
}
|
|
2334
|
+
case "sitemap": {
|
|
2335
|
+
throw new Error(
|
|
2336
|
+
"Sitemaps cannot be parsed with parseFeed(). Use parseSitemap() from the sitemap module instead."
|
|
2337
|
+
);
|
|
2338
|
+
}
|
|
2327
2339
|
}
|
|
2328
2340
|
}
|
|
2329
2341
|
|
|
@@ -2405,6 +2417,10 @@ function detectEncoding(buffer, contentType) {
|
|
|
2405
2417
|
}
|
|
2406
2418
|
const preview = new Uint8Array(buffer.slice(0, 1024));
|
|
2407
2419
|
const previewText = new TextDecoder("utf-8", { fatal: false }).decode(preview);
|
|
2420
|
+
const xmlEncoding = parseCharsetFromXml(previewText);
|
|
2421
|
+
if (xmlEncoding) {
|
|
2422
|
+
return xmlEncoding;
|
|
2423
|
+
}
|
|
2408
2424
|
const metaEncoding = parseCharsetFromHtml(previewText);
|
|
2409
2425
|
if (metaEncoding) {
|
|
2410
2426
|
return metaEncoding;
|
|
@@ -2445,6 +2461,13 @@ function parseCharsetFromHtml(html) {
|
|
|
2445
2461
|
}
|
|
2446
2462
|
return null;
|
|
2447
2463
|
}
|
|
2464
|
+
function parseCharsetFromXml(xml) {
|
|
2465
|
+
const xmlDeclMatch = /<\?xml[^?]*encoding\s*=\s*["']([^"']+)["'][^?]*\?>/i.exec(xml);
|
|
2466
|
+
if (xmlDeclMatch) {
|
|
2467
|
+
return normalizeEncoding(xmlDeclMatch[1]);
|
|
2468
|
+
}
|
|
2469
|
+
return null;
|
|
2470
|
+
}
|
|
2448
2471
|
function normalizeEncoding(encoding) {
|
|
2449
2472
|
const normalized = encoding.toLowerCase().trim();
|
|
2450
2473
|
const aliases = {
|