magpie-html 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +442 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +442 -8
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -603,7 +603,7 @@ interface FeedItem {
|
|
|
603
603
|
*/
|
|
604
604
|
interface Feed {
|
|
605
605
|
/** Original feed format */
|
|
606
|
-
format: 'rss' | 'atom' | 'json-feed';
|
|
606
|
+
format: 'rss' | 'atom' | 'json-feed' | 'sitemap';
|
|
607
607
|
/** Feed title (required) */
|
|
608
608
|
title: string;
|
|
609
609
|
/** Feed description or subtitle */
|
|
@@ -952,6 +952,7 @@ declare function gatherArticle(url: string | URL): Promise<Article>;
|
|
|
952
952
|
* @remarks
|
|
953
953
|
* This is a high-level convenience method that combines fetching and parsing.
|
|
954
954
|
* It handles encoding detection, redirects, and feed format detection automatically.
|
|
955
|
+
* Falls back to sitemap parsing when standard feed formats aren't detected.
|
|
955
956
|
*
|
|
956
957
|
* @param url - Feed URL as string or URL object
|
|
957
958
|
* @returns Normalized feed data
|
package/dist/index.d.ts
CHANGED
|
@@ -603,7 +603,7 @@ interface FeedItem {
|
|
|
603
603
|
*/
|
|
604
604
|
interface Feed {
|
|
605
605
|
/** Original feed format */
|
|
606
|
-
format: 'rss' | 'atom' | 'json-feed';
|
|
606
|
+
format: 'rss' | 'atom' | 'json-feed' | 'sitemap';
|
|
607
607
|
/** Feed title (required) */
|
|
608
608
|
title: string;
|
|
609
609
|
/** Feed description or subtitle */
|
|
@@ -952,6 +952,7 @@ declare function gatherArticle(url: string | URL): Promise<Article>;
|
|
|
952
952
|
* @remarks
|
|
953
953
|
* This is a high-level convenience method that combines fetching and parsing.
|
|
954
954
|
* It handles encoding detection, redirects, and feed format detection automatically.
|
|
955
|
+
* Falls back to sitemap parsing when standard feed formats aren't detected.
|
|
955
956
|
*
|
|
956
957
|
* @param url - Feed URL as string or URL object
|
|
957
958
|
* @returns Normalized feed data
|
package/dist/index.js
CHANGED
|
@@ -1075,8 +1075,12 @@ function extractEntry(entryElement) {
|
|
|
1075
1075
|
function removeComments(xml) {
|
|
1076
1076
|
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1077
1077
|
}
|
|
1078
|
+
function removeDoctype(xml) {
|
|
1079
|
+
return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
|
|
1080
|
+
}
|
|
1078
1081
|
function parseAtomXML(xml) {
|
|
1079
|
-
const withoutComments = removeComments(xml);
|
|
1082
|
+
const withoutDoctype = removeDoctype(xml);
|
|
1083
|
+
const withoutComments = removeComments(withoutDoctype);
|
|
1080
1084
|
const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
|
|
1081
1085
|
const root = parseElement(cleanedXML, 0, null, cdataMap).element;
|
|
1082
1086
|
return root;
|
|
@@ -1884,13 +1888,17 @@ function parseRSSDate(dateString) {
|
|
|
1884
1888
|
// src/feed/rss/xml-parser.ts
|
|
1885
1889
|
function parseRSSXML(xml) {
|
|
1886
1890
|
const cleaned = cleanXMLDeclaration(xml);
|
|
1887
|
-
const withoutComments = removeComments2(cleaned);
|
|
1891
|
+
const withoutDoctype = removeDoctype2(cleaned);
|
|
1892
|
+
const withoutComments = removeComments2(withoutDoctype);
|
|
1888
1893
|
const root = parseElement2(withoutComments, 0).element;
|
|
1889
1894
|
return root;
|
|
1890
1895
|
}
|
|
1891
1896
|
function cleanXMLDeclaration(xml) {
|
|
1892
1897
|
return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
|
|
1893
1898
|
}
|
|
1899
|
+
function removeDoctype2(xml) {
|
|
1900
|
+
return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
|
|
1901
|
+
}
|
|
1894
1902
|
function removeComments2(xml) {
|
|
1895
1903
|
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
1896
1904
|
}
|
|
@@ -2339,6 +2347,378 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2339
2347
|
}
|
|
2340
2348
|
}
|
|
2341
2349
|
|
|
2350
|
+
// src/feed/sitemap/xml-parser.ts
|
|
2351
|
+
function parseSitemapXML(xml) {
|
|
2352
|
+
const cleaned = cleanXMLDeclaration2(xml);
|
|
2353
|
+
const withoutDoctype = removeDoctype3(cleaned);
|
|
2354
|
+
const withoutComments = removeComments3(withoutDoctype);
|
|
2355
|
+
const root = parseElement3(withoutComments, 0).element;
|
|
2356
|
+
return root;
|
|
2357
|
+
}
|
|
2358
|
+
function cleanXMLDeclaration2(xml) {
|
|
2359
|
+
return xml.replace(/<\?xml[^?]*\?>/g, "").trim();
|
|
2360
|
+
}
|
|
2361
|
+
function removeDoctype3(xml) {
|
|
2362
|
+
return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
|
|
2363
|
+
}
|
|
2364
|
+
function removeComments3(xml) {
|
|
2365
|
+
return xml.replace(/<!--[\s\S]*?-->/g, "");
|
|
2366
|
+
}
|
|
2367
|
+
function extractCDATA3(text) {
|
|
2368
|
+
const cdataMap = /* @__PURE__ */ new Map();
|
|
2369
|
+
let counter = 0;
|
|
2370
|
+
const processed = text.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, (_match, content) => {
|
|
2371
|
+
const placeholder = `__CDATA_${counter}__`;
|
|
2372
|
+
cdataMap.set(placeholder, content);
|
|
2373
|
+
counter++;
|
|
2374
|
+
return placeholder;
|
|
2375
|
+
});
|
|
2376
|
+
return { text: processed, cdataMap };
|
|
2377
|
+
}
|
|
2378
|
+
function restoreCDATA3(text, cdataMap) {
|
|
2379
|
+
let result = text;
|
|
2380
|
+
for (const [placeholder, content] of cdataMap.entries()) {
|
|
2381
|
+
result = result.replace(placeholder, content);
|
|
2382
|
+
}
|
|
2383
|
+
return result;
|
|
2384
|
+
}
|
|
2385
|
+
function parseAttributes3(tagContent) {
|
|
2386
|
+
const attributes = {};
|
|
2387
|
+
const attrRegex = /(\S+)=["']([^"']*)["']/g;
|
|
2388
|
+
let match = attrRegex.exec(tagContent);
|
|
2389
|
+
while (match !== null) {
|
|
2390
|
+
attributes[match[1]] = match[2];
|
|
2391
|
+
match = attrRegex.exec(tagContent);
|
|
2392
|
+
}
|
|
2393
|
+
return attributes;
|
|
2394
|
+
}
|
|
2395
|
+
function findClosingTag3(xml, tagName, startPos) {
|
|
2396
|
+
const openTag = `<${tagName}`;
|
|
2397
|
+
const closeTag = `</${tagName}>`;
|
|
2398
|
+
let depth = 1;
|
|
2399
|
+
let pos = startPos;
|
|
2400
|
+
while (pos < xml.length && depth > 0) {
|
|
2401
|
+
const nextOpen = xml.indexOf(openTag, pos);
|
|
2402
|
+
const nextClose = xml.indexOf(closeTag, pos);
|
|
2403
|
+
if (nextClose === -1) {
|
|
2404
|
+
return -1;
|
|
2405
|
+
}
|
|
2406
|
+
if (nextOpen !== -1 && nextOpen < nextClose) {
|
|
2407
|
+
depth++;
|
|
2408
|
+
pos = nextOpen + openTag.length;
|
|
2409
|
+
} else {
|
|
2410
|
+
depth--;
|
|
2411
|
+
if (depth === 0) {
|
|
2412
|
+
return nextClose;
|
|
2413
|
+
}
|
|
2414
|
+
pos = nextClose + closeTag.length;
|
|
2415
|
+
}
|
|
2416
|
+
}
|
|
2417
|
+
return -1;
|
|
2418
|
+
}
|
|
2419
|
+
function parseElement3(xml, startPos, parent = null, cdataMap) {
|
|
2420
|
+
const extracted = cdataMap ? { text: xml, cdataMap } : extractCDATA3(xml);
|
|
2421
|
+
const cleanedXML = extracted.text;
|
|
2422
|
+
const currentCdataMap = extracted.cdataMap;
|
|
2423
|
+
const openTagStart = cleanedXML.indexOf("<", startPos);
|
|
2424
|
+
if (openTagStart === -1) {
|
|
2425
|
+
throw new Error("No opening tag found");
|
|
2426
|
+
}
|
|
2427
|
+
const openTagEnd = cleanedXML.indexOf(">", openTagStart);
|
|
2428
|
+
if (openTagEnd === -1) {
|
|
2429
|
+
throw new Error("Unclosed opening tag");
|
|
2430
|
+
}
|
|
2431
|
+
const openTagContent = cleanedXML.substring(openTagStart + 1, openTagEnd);
|
|
2432
|
+
const isSelfClosing = openTagContent.endsWith("/");
|
|
2433
|
+
const tagContent = isSelfClosing ? openTagContent.slice(0, -1).trim() : openTagContent;
|
|
2434
|
+
const spaceIndex = tagContent.indexOf(" ");
|
|
2435
|
+
const tagName = spaceIndex === -1 ? tagContent : tagContent.substring(0, spaceIndex);
|
|
2436
|
+
const attributes = spaceIndex === -1 ? {} : parseAttributes3(tagContent.substring(spaceIndex));
|
|
2437
|
+
const element = {
|
|
2438
|
+
tagName,
|
|
2439
|
+
attributes,
|
|
2440
|
+
text: "",
|
|
2441
|
+
children: [],
|
|
2442
|
+
parent
|
|
2443
|
+
};
|
|
2444
|
+
if (isSelfClosing) {
|
|
2445
|
+
return { element, endPos: openTagEnd + 1, cdataMap: currentCdataMap };
|
|
2446
|
+
}
|
|
2447
|
+
const closingTagPos = findClosingTag3(cleanedXML, tagName, openTagEnd + 1);
|
|
2448
|
+
if (closingTagPos === -1) {
|
|
2449
|
+
throw new Error(`No closing tag found for <${tagName}>`);
|
|
2450
|
+
}
|
|
2451
|
+
const content = cleanedXML.substring(openTagEnd + 1, closingTagPos);
|
|
2452
|
+
if (content.includes("<")) {
|
|
2453
|
+
let pos = 0;
|
|
2454
|
+
const trimmedContent = content.trim();
|
|
2455
|
+
while (pos < trimmedContent.length) {
|
|
2456
|
+
const nextTag = trimmedContent.indexOf("<", pos);
|
|
2457
|
+
if (nextTag === -1) break;
|
|
2458
|
+
if (trimmedContent[nextTag + 1] === "/" || trimmedContent[nextTag + 1] === "!") {
|
|
2459
|
+
pos = nextTag + 1;
|
|
2460
|
+
continue;
|
|
2461
|
+
}
|
|
2462
|
+
try {
|
|
2463
|
+
const { element: child, endPos } = parseElement3(
|
|
2464
|
+
trimmedContent,
|
|
2465
|
+
nextTag,
|
|
2466
|
+
element,
|
|
2467
|
+
currentCdataMap
|
|
2468
|
+
);
|
|
2469
|
+
element.children.push(child);
|
|
2470
|
+
pos = endPos;
|
|
2471
|
+
} catch {
|
|
2472
|
+
pos = nextTag + 1;
|
|
2473
|
+
}
|
|
2474
|
+
}
|
|
2475
|
+
let textContent = content.replace(/<[^>]+>/g, "").trim();
|
|
2476
|
+
textContent = restoreCDATA3(textContent, currentCdataMap);
|
|
2477
|
+
element.text = textContent;
|
|
2478
|
+
} else {
|
|
2479
|
+
let textContent = content.trim();
|
|
2480
|
+
textContent = restoreCDATA3(textContent, currentCdataMap);
|
|
2481
|
+
element.text = textContent;
|
|
2482
|
+
}
|
|
2483
|
+
const closingTagEnd = closingTagPos + `</${tagName}>`.length;
|
|
2484
|
+
return { element, endPos: closingTagEnd, cdataMap: currentCdataMap };
|
|
2485
|
+
}
|
|
2486
|
+
function querySelector3(element, selector, caseSensitive = false) {
|
|
2487
|
+
const tagName = caseSensitive ? selector : selector.toLowerCase();
|
|
2488
|
+
const elementTag = caseSensitive ? element.tagName : element.tagName.toLowerCase();
|
|
2489
|
+
if (elementTag === tagName) {
|
|
2490
|
+
return element;
|
|
2491
|
+
}
|
|
2492
|
+
for (const child of element.children) {
|
|
2493
|
+
const found = querySelector3(child, selector, caseSensitive);
|
|
2494
|
+
if (found) return found;
|
|
2495
|
+
}
|
|
2496
|
+
return null;
|
|
2497
|
+
}
|
|
2498
|
+
function querySelectorAll3(element, selector, caseSensitive = false) {
|
|
2499
|
+
const results = [];
|
|
2500
|
+
const tagName = caseSensitive ? selector : selector.toLowerCase();
|
|
2501
|
+
const elementTag = caseSensitive ? element.tagName : element.tagName.toLowerCase();
|
|
2502
|
+
if (elementTag === tagName) {
|
|
2503
|
+
results.push(element);
|
|
2504
|
+
}
|
|
2505
|
+
for (const child of element.children) {
|
|
2506
|
+
results.push(...querySelectorAll3(child, selector, caseSensitive));
|
|
2507
|
+
}
|
|
2508
|
+
return results;
|
|
2509
|
+
}
|
|
2510
|
+
function getText2(element) {
|
|
2511
|
+
return element?.text || "";
|
|
2512
|
+
}
|
|
2513
|
+
function getChild(element, tagName) {
|
|
2514
|
+
const lowerTag = tagName.toLowerCase();
|
|
2515
|
+
return element.children.find((c) => c.tagName.toLowerCase() === lowerTag) || null;
|
|
2516
|
+
}
|
|
2517
|
+
function getChildren(element, tagName) {
|
|
2518
|
+
const lowerTag = tagName.toLowerCase();
|
|
2519
|
+
return element.children.filter((c) => c.tagName.toLowerCase() === lowerTag);
|
|
2520
|
+
}
|
|
2521
|
+
|
|
2522
|
+
// src/feed/sitemap/parse.ts
|
|
2523
|
+
function parseSitemap(xml, baseUrl) {
|
|
2524
|
+
const doc = parseSitemapXML(xml);
|
|
2525
|
+
const sitemapIndex = querySelector3(doc, "sitemapindex");
|
|
2526
|
+
if (sitemapIndex) {
|
|
2527
|
+
return parseSitemapIndex(sitemapIndex, baseUrl);
|
|
2528
|
+
}
|
|
2529
|
+
const urlset = querySelector3(doc, "urlset");
|
|
2530
|
+
if (urlset) {
|
|
2531
|
+
return parseUrlset(urlset, baseUrl);
|
|
2532
|
+
}
|
|
2533
|
+
const urls = querySelectorAll3(doc, "url");
|
|
2534
|
+
if (urls.length > 0) {
|
|
2535
|
+
return {
|
|
2536
|
+
sitemap: {
|
|
2537
|
+
type: "urlset",
|
|
2538
|
+
urls: urls.map((url) => extractUrl(url, baseUrl)),
|
|
2539
|
+
sitemaps: []
|
|
2540
|
+
},
|
|
2541
|
+
isIndex: false
|
|
2542
|
+
};
|
|
2543
|
+
}
|
|
2544
|
+
return {
|
|
2545
|
+
sitemap: {
|
|
2546
|
+
type: "urlset",
|
|
2547
|
+
urls: [],
|
|
2548
|
+
sitemaps: []
|
|
2549
|
+
},
|
|
2550
|
+
isIndex: false
|
|
2551
|
+
};
|
|
2552
|
+
}
|
|
2553
|
+
function parseSitemapIndex(element, baseUrl) {
|
|
2554
|
+
const sitemapElements = getChildren(element, "sitemap");
|
|
2555
|
+
const sitemaps = sitemapElements.map((el) => {
|
|
2556
|
+
const loc = getText2(getChild(el, "loc"));
|
|
2557
|
+
const lastmod = getText2(getChild(el, "lastmod")) || void 0;
|
|
2558
|
+
return {
|
|
2559
|
+
loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
|
|
2560
|
+
lastmod
|
|
2561
|
+
};
|
|
2562
|
+
});
|
|
2563
|
+
return {
|
|
2564
|
+
sitemap: {
|
|
2565
|
+
type: "sitemapindex",
|
|
2566
|
+
urls: [],
|
|
2567
|
+
sitemaps
|
|
2568
|
+
},
|
|
2569
|
+
isIndex: true
|
|
2570
|
+
};
|
|
2571
|
+
}
|
|
2572
|
+
function parseUrlset(element, baseUrl) {
|
|
2573
|
+
const urlElements = getChildren(element, "url");
|
|
2574
|
+
const urls = urlElements.map((el) => extractUrl(el, baseUrl));
|
|
2575
|
+
return {
|
|
2576
|
+
sitemap: {
|
|
2577
|
+
type: "urlset",
|
|
2578
|
+
urls,
|
|
2579
|
+
sitemaps: []
|
|
2580
|
+
},
|
|
2581
|
+
isIndex: false
|
|
2582
|
+
};
|
|
2583
|
+
}
|
|
2584
|
+
function extractUrl(element, baseUrl) {
|
|
2585
|
+
const rawLoc = getText2(getChild(element, "loc"));
|
|
2586
|
+
const loc = decodeXmlEntities(rawLoc);
|
|
2587
|
+
const lastmod = getText2(getChild(element, "lastmod")) || void 0;
|
|
2588
|
+
const changefreq = getText2(getChild(element, "changefreq")) || void 0;
|
|
2589
|
+
const priorityText = getText2(getChild(element, "priority"));
|
|
2590
|
+
const priority = priorityText ? Number.parseFloat(priorityText) : void 0;
|
|
2591
|
+
const result = {
|
|
2592
|
+
loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
|
|
2593
|
+
lastmod,
|
|
2594
|
+
changefreq,
|
|
2595
|
+
priority: priority && !Number.isNaN(priority) ? priority : void 0
|
|
2596
|
+
};
|
|
2597
|
+
const news = extractNews(element);
|
|
2598
|
+
if (news) {
|
|
2599
|
+
result.news = news;
|
|
2600
|
+
}
|
|
2601
|
+
const images = extractImages(element, baseUrl);
|
|
2602
|
+
if (images.length > 0) {
|
|
2603
|
+
result.images = images;
|
|
2604
|
+
}
|
|
2605
|
+
const videos = extractVideos(element, baseUrl);
|
|
2606
|
+
if (videos.length > 0) {
|
|
2607
|
+
result.videos = videos;
|
|
2608
|
+
}
|
|
2609
|
+
return result;
|
|
2610
|
+
}
|
|
2611
|
+
function extractNews(urlElement) {
|
|
2612
|
+
const newsEl = getChild(urlElement, "news:news") || getChild(urlElement, "news") || urlElement.children.find((c) => c.tagName.toLowerCase().endsWith(":news"));
|
|
2613
|
+
if (!newsEl) {
|
|
2614
|
+
return void 0;
|
|
2615
|
+
}
|
|
2616
|
+
const news = {};
|
|
2617
|
+
const pubEl = getChild(newsEl, "news:publication") || getChild(newsEl, "publication") || newsEl.children.find((c) => c.tagName.toLowerCase().endsWith(":publication"));
|
|
2618
|
+
if (pubEl) {
|
|
2619
|
+
const name = getText2(getChild(pubEl, "news:name")) || getText2(getChild(pubEl, "name")) || getText2(pubEl.children.find((c) => c.tagName.toLowerCase().endsWith(":name")));
|
|
2620
|
+
const language = getText2(getChild(pubEl, "news:language")) || getText2(getChild(pubEl, "language")) || getText2(pubEl.children.find((c) => c.tagName.toLowerCase().endsWith(":language")));
|
|
2621
|
+
if (name || language) {
|
|
2622
|
+
news.publication = {
|
|
2623
|
+
name: name || void 0,
|
|
2624
|
+
language: language || void 0
|
|
2625
|
+
};
|
|
2626
|
+
}
|
|
2627
|
+
}
|
|
2628
|
+
const pubDate = getText2(getChild(newsEl, "news:publication_date")) || getText2(getChild(newsEl, "publication_date")) || getText2(newsEl.children.find((c) => c.tagName.toLowerCase().endsWith(":publication_date")));
|
|
2629
|
+
if (pubDate) {
|
|
2630
|
+
news.publicationDate = pubDate;
|
|
2631
|
+
}
|
|
2632
|
+
const title = getText2(getChild(newsEl, "news:title")) || getText2(getChild(newsEl, "title")) || getText2(newsEl.children.find((c) => c.tagName.toLowerCase().endsWith(":title")));
|
|
2633
|
+
if (title) {
|
|
2634
|
+
news.title = decodeXmlEntities(title);
|
|
2635
|
+
}
|
|
2636
|
+
const keywords = getText2(getChild(newsEl, "news:keywords")) || getText2(getChild(newsEl, "keywords")) || getText2(newsEl.children.find((c) => c.tagName.toLowerCase().endsWith(":keywords")));
|
|
2637
|
+
if (keywords) {
|
|
2638
|
+
news.keywords = keywords.split(",").map((k) => k.trim());
|
|
2639
|
+
}
|
|
2640
|
+
const stockTickers = getText2(getChild(newsEl, "news:stock_tickers")) || getText2(getChild(newsEl, "stock_tickers")) || getText2(newsEl.children.find((c) => c.tagName.toLowerCase().endsWith(":stock_tickers")));
|
|
2641
|
+
if (stockTickers) {
|
|
2642
|
+
news.stockTickers = stockTickers.split(",").map((t) => t.trim());
|
|
2643
|
+
}
|
|
2644
|
+
return Object.keys(news).length > 0 ? news : void 0;
|
|
2645
|
+
}
|
|
2646
|
+
function extractImages(urlElement, baseUrl) {
|
|
2647
|
+
const imageElements = urlElement.children.filter(
|
|
2648
|
+
(c) => c.tagName.toLowerCase() === "image:image" || c.tagName.toLowerCase() === "image" || c.tagName.toLowerCase().endsWith(":image")
|
|
2649
|
+
);
|
|
2650
|
+
return imageElements.map((imgEl) => {
|
|
2651
|
+
const loc = getText2(getChild(imgEl, "image:loc")) || getText2(getChild(imgEl, "loc")) || getText2(imgEl.children.find((c) => c.tagName.toLowerCase().endsWith(":loc")));
|
|
2652
|
+
if (!loc) return null;
|
|
2653
|
+
const image = {
|
|
2654
|
+
loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc
|
|
2655
|
+
};
|
|
2656
|
+
const caption = getText2(getChild(imgEl, "image:caption")) || getText2(getChild(imgEl, "caption")) || getText2(imgEl.children.find((c) => c.tagName.toLowerCase().endsWith(":caption")));
|
|
2657
|
+
if (caption) image.caption = decodeXmlEntities(caption);
|
|
2658
|
+
const geoLocation = getText2(getChild(imgEl, "image:geo_location")) || getText2(getChild(imgEl, "geo_location")) || getText2(imgEl.children.find((c) => c.tagName.toLowerCase().endsWith(":geo_location")));
|
|
2659
|
+
if (geoLocation) image.geoLocation = geoLocation;
|
|
2660
|
+
const title = getText2(getChild(imgEl, "image:title")) || getText2(getChild(imgEl, "title")) || getText2(imgEl.children.find((c) => c.tagName.toLowerCase().endsWith(":title")));
|
|
2661
|
+
if (title) image.title = decodeXmlEntities(title);
|
|
2662
|
+
const license = getText2(getChild(imgEl, "image:license")) || getText2(getChild(imgEl, "license")) || getText2(imgEl.children.find((c) => c.tagName.toLowerCase().endsWith(":license")));
|
|
2663
|
+
if (license) image.license = baseUrl ? normalizeUrlHttps(baseUrl, license) : license;
|
|
2664
|
+
return image;
|
|
2665
|
+
}).filter((img) => img !== null);
|
|
2666
|
+
}
|
|
2667
|
+
function extractVideos(urlElement, baseUrl) {
|
|
2668
|
+
const videoElements = urlElement.children.filter(
|
|
2669
|
+
(c) => c.tagName.toLowerCase() === "video:video" || c.tagName.toLowerCase() === "video" || c.tagName.toLowerCase().endsWith(":video")
|
|
2670
|
+
);
|
|
2671
|
+
return videoElements.map((vidEl) => {
|
|
2672
|
+
const thumbnailLoc = getText2(getChild(vidEl, "video:thumbnail_loc")) || getText2(getChild(vidEl, "thumbnail_loc")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":thumbnail_loc")));
|
|
2673
|
+
const title = getText2(getChild(vidEl, "video:title")) || getText2(getChild(vidEl, "title")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":title")));
|
|
2674
|
+
const description = getText2(getChild(vidEl, "video:description")) || getText2(getChild(vidEl, "description")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":description")));
|
|
2675
|
+
if (!thumbnailLoc || !title || !description) return null;
|
|
2676
|
+
const video = {
|
|
2677
|
+
thumbnailLoc: baseUrl ? normalizeUrlHttps(baseUrl, thumbnailLoc) : thumbnailLoc,
|
|
2678
|
+
title: decodeXmlEntities(title),
|
|
2679
|
+
description: decodeXmlEntities(description)
|
|
2680
|
+
};
|
|
2681
|
+
const contentLoc = getText2(getChild(vidEl, "video:content_loc")) || getText2(getChild(vidEl, "content_loc")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":content_loc")));
|
|
2682
|
+
if (contentLoc)
|
|
2683
|
+
video.contentLoc = baseUrl ? normalizeUrlHttps(baseUrl, contentLoc) : contentLoc;
|
|
2684
|
+
const playerLoc = getText2(getChild(vidEl, "video:player_loc")) || getText2(getChild(vidEl, "player_loc")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":player_loc")));
|
|
2685
|
+
if (playerLoc) video.playerLoc = baseUrl ? normalizeUrlHttps(baseUrl, playerLoc) : playerLoc;
|
|
2686
|
+
const duration = getText2(getChild(vidEl, "video:duration")) || getText2(getChild(vidEl, "duration")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":duration")));
|
|
2687
|
+
if (duration) {
|
|
2688
|
+
const dur = Number.parseInt(duration, 10);
|
|
2689
|
+
if (!Number.isNaN(dur)) video.duration = dur;
|
|
2690
|
+
}
|
|
2691
|
+
const rating = getText2(getChild(vidEl, "video:rating")) || getText2(getChild(vidEl, "rating")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":rating")));
|
|
2692
|
+
if (rating) {
|
|
2693
|
+
const r = Number.parseFloat(rating);
|
|
2694
|
+
if (!Number.isNaN(r)) video.rating = r;
|
|
2695
|
+
}
|
|
2696
|
+
const viewCount = getText2(getChild(vidEl, "video:view_count")) || getText2(getChild(vidEl, "view_count")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":view_count")));
|
|
2697
|
+
if (viewCount) {
|
|
2698
|
+
const vc = Number.parseInt(viewCount, 10);
|
|
2699
|
+
if (!Number.isNaN(vc)) video.viewCount = vc;
|
|
2700
|
+
}
|
|
2701
|
+
const publicationDate = getText2(getChild(vidEl, "video:publication_date")) || getText2(getChild(vidEl, "publication_date")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":publication_date")));
|
|
2702
|
+
if (publicationDate) video.publicationDate = publicationDate;
|
|
2703
|
+
const familyFriendly = getText2(getChild(vidEl, "video:family_friendly")) || getText2(getChild(vidEl, "family_friendly")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":family_friendly")));
|
|
2704
|
+
if (familyFriendly) {
|
|
2705
|
+
video.familyFriendly = familyFriendly.toLowerCase() === "yes";
|
|
2706
|
+
}
|
|
2707
|
+
const category = getText2(getChild(vidEl, "video:category")) || getText2(getChild(vidEl, "category")) || getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(":category")));
|
|
2708
|
+
if (category) video.category = category;
|
|
2709
|
+
const tagElements = vidEl.children.filter(
|
|
2710
|
+
(c) => c.tagName.toLowerCase() === "video:tag" || c.tagName.toLowerCase() === "tag" || c.tagName.toLowerCase().endsWith(":tag")
|
|
2711
|
+
);
|
|
2712
|
+
if (tagElements.length > 0) {
|
|
2713
|
+
video.tags = tagElements.map((t) => getText2(t)).filter(Boolean);
|
|
2714
|
+
}
|
|
2715
|
+
return video;
|
|
2716
|
+
}).filter((vid) => vid !== null);
|
|
2717
|
+
}
|
|
2718
|
+
function decodeXmlEntities(text) {
|
|
2719
|
+
return text.replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&amp;/g, "&").replace(/&quot;/g, '"').replace(/&apos;/g, "'").replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number.parseInt(code, 10))).replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(Number.parseInt(code, 16)));
|
|
2720
|
+
}
|
|
2721
|
+
|
|
2342
2722
|
// src/pluck/types.ts
|
|
2343
2723
|
var PluckError = class extends Error {
|
|
2344
2724
|
constructor(message) {
|
|
@@ -2767,7 +3147,7 @@ function extractOpenGraph(doc) {
|
|
|
2767
3147
|
if (Object.keys(audio).length > 0) {
|
|
2768
3148
|
metadata.audio = audio;
|
|
2769
3149
|
}
|
|
2770
|
-
const images = extractImages(doc);
|
|
3150
|
+
const images = extractImages2(doc);
|
|
2771
3151
|
if (images.length > 0) {
|
|
2772
3152
|
metadata.images = images;
|
|
2773
3153
|
}
|
|
@@ -2836,7 +3216,7 @@ function extractAudio(doc) {
|
|
|
2836
3216
|
Object.entries(audio).filter(([_, value]) => value !== void 0)
|
|
2837
3217
|
);
|
|
2838
3218
|
}
|
|
2839
|
-
function extractImages(doc) {
|
|
3219
|
+
function extractImages2(doc) {
|
|
2840
3220
|
const images = [];
|
|
2841
3221
|
const imageUrls = getAllMetaPropertyValues(doc, "og:image");
|
|
2842
3222
|
const imageSecureUrls = getAllMetaPropertyValues(doc, "og:image:secure_url");
|
|
@@ -3930,9 +4310,63 @@ async function gatherFeed(url) {
|
|
|
3930
4310
|
}
|
|
3931
4311
|
const response = await pluck(feedUrl);
|
|
3932
4312
|
const content = await response.textUtf8();
|
|
4313
|
+
const format = detectFormat(content);
|
|
4314
|
+
if (format === "sitemap") {
|
|
4315
|
+
return normalizeSitemapToFeed(content, response.finalUrl);
|
|
4316
|
+
}
|
|
3933
4317
|
const result = parseFeed(content, response.finalUrl);
|
|
3934
4318
|
return result.feed;
|
|
3935
4319
|
}
|
|
4320
|
+
function normalizeSitemapToFeed(content, baseUrl) {
|
|
4321
|
+
const result = parseSitemap(content, baseUrl);
|
|
4322
|
+
if (result.isIndex) {
|
|
4323
|
+
const items2 = result.sitemap.sitemaps.map((sitemap, index) => ({
|
|
4324
|
+
id: sitemap.loc || `sitemap-${index}`,
|
|
4325
|
+
url: sitemap.loc,
|
|
4326
|
+
title: `Sitemap: ${sitemap.loc}`,
|
|
4327
|
+
modified: sitemap.lastmod
|
|
4328
|
+
}));
|
|
4329
|
+
return {
|
|
4330
|
+
format: "sitemap",
|
|
4331
|
+
title: "Sitemap Index",
|
|
4332
|
+
url: baseUrl,
|
|
4333
|
+
items: items2
|
|
4334
|
+
};
|
|
4335
|
+
}
|
|
4336
|
+
const items = result.sitemap.urls.map((url, index) => {
|
|
4337
|
+
const item = {
|
|
4338
|
+
id: url.loc || `url-${index}`,
|
|
4339
|
+
url: url.loc,
|
|
4340
|
+
modified: url.lastmod
|
|
4341
|
+
};
|
|
4342
|
+
if (url.news) {
|
|
4343
|
+
item.title = url.news.title;
|
|
4344
|
+
item.published = url.news.publicationDate;
|
|
4345
|
+
if (url.news.publication?.name) {
|
|
4346
|
+
item.authors = [{ name: url.news.publication.name }];
|
|
4347
|
+
}
|
|
4348
|
+
if (url.news.keywords) {
|
|
4349
|
+
item.tags = url.news.keywords;
|
|
4350
|
+
}
|
|
4351
|
+
}
|
|
4352
|
+
if (url.images && url.images.length > 0) {
|
|
4353
|
+
item.image = url.images[0].loc;
|
|
4354
|
+
}
|
|
4355
|
+
return item;
|
|
4356
|
+
});
|
|
4357
|
+
let title = "Sitemap";
|
|
4358
|
+
try {
|
|
4359
|
+
const urlObj = new URL(baseUrl);
|
|
4360
|
+
title = `${urlObj.hostname} Sitemap`;
|
|
4361
|
+
} catch {
|
|
4362
|
+
}
|
|
4363
|
+
return {
|
|
4364
|
+
format: "sitemap",
|
|
4365
|
+
title,
|
|
4366
|
+
url: baseUrl,
|
|
4367
|
+
items
|
|
4368
|
+
};
|
|
4369
|
+
}
|
|
3936
4370
|
|
|
3937
4371
|
// src/metadata/feed-discovery/heuristics.ts
|
|
3938
4372
|
var COMMON_FEED_PATHS = [
|
|
@@ -4239,7 +4673,7 @@ function extractAnalytics(doc) {
|
|
|
4239
4673
|
function extractAssets(doc, baseUrl) {
|
|
4240
4674
|
const metadata = {};
|
|
4241
4675
|
const effectiveBaseUrl = getEffectiveBaseUrl2(doc, baseUrl);
|
|
4242
|
-
const images = extractImages2(doc, effectiveBaseUrl);
|
|
4676
|
+
const images = extractImages3(doc, effectiveBaseUrl);
|
|
4243
4677
|
if (images.length > 0) {
|
|
4244
4678
|
metadata.images = images;
|
|
4245
4679
|
}
|
|
@@ -4293,7 +4727,7 @@ function getEffectiveBaseUrl2(doc, baseUrl) {
|
|
|
4293
4727
|
}
|
|
4294
4728
|
return null;
|
|
4295
4729
|
}
|
|
4296
|
-
function extractImages2(doc, baseUrl) {
|
|
4730
|
+
function extractImages3(doc, baseUrl) {
|
|
4297
4731
|
const urls = /* @__PURE__ */ new Set();
|
|
4298
4732
|
const imgElements = doc.querySelectorAll("img[src]");
|
|
4299
4733
|
for (const img of Array.from(imgElements)) {
|
|
@@ -4711,7 +5145,7 @@ function extractMonetization(doc) {
|
|
|
4711
5145
|
}
|
|
4712
5146
|
|
|
4713
5147
|
// src/metadata/news/extract.ts
|
|
4714
|
-
function extractNews(doc) {
|
|
5148
|
+
function extractNews2(doc) {
|
|
4715
5149
|
const metadata = {};
|
|
4716
5150
|
const newsKeywords = getMetaContent(doc, "news_keywords");
|
|
4717
5151
|
if (newsKeywords) {
|
|
@@ -6926,6 +7360,6 @@ async function swoop(url, init) {
|
|
|
6926
7360
|
* @packageDocumentation
|
|
6927
7361
|
*/
|
|
6928
7362
|
|
|
6929
|
-
export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|
|
7363
|
+
export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews2 as extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|
|
6930
7364
|
//# sourceMappingURL=index.js.map
|
|
6931
7365
|
//# sourceMappingURL=index.js.map
|