magpie-html 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +481 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +158 -109
- package/dist/index.d.ts +158 -109
- package/dist/index.js +481 -27
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -2347,6 +2347,378 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2347
2347
|
}
|
|
2348
2348
|
}
|
|
2349
2349
|
|
|
2350
|
+
// src/feed/sitemap/xml-parser.ts
|
|
2351
|
+
/**
 * Parse a sitemap XML string into an element tree.
 *
 * The XML declaration, DOCTYPE and comments are stripped (in that order)
 * before the first element of the remaining markup is parsed.
 *
 * @param {string} xml - Raw XML text.
 * @returns {object} Root element ({ tagName, attributes, text, children, parent }).
 * @throws {Error} Propagated from parseElement3 when no element can be parsed.
 */
function parseSitemapXML(xml) {
  const stripped = removeComments3(removeDoctype3(cleanXMLDeclaration2(xml)));
  const { element } = parseElement3(stripped, 0);
  return element;
}
|
|
2358
|
+
/**
 * Remove every "<?xml ... ?>" sequence from the input and trim the result.
 *
 * The pattern matches any run starting with "<?xml" up to the next "?>",
 * so it also removes instructions such as "<?xml-stylesheet ...?>".
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} Text without those sequences and without surrounding whitespace.
 */
function cleanXMLDeclaration2(xml) {
  const declarationPattern = /<\?xml[^?]*\?>/g;
  const withoutDeclaration = xml.replace(declarationPattern, "");
  return withoutDeclaration.trim();
}
|
|
2361
|
+
/**
 * Strip DOCTYPE declarations (matched case-insensitively) from the input.
 *
 * The match ends at the first ">" after "<!DOCTYPE".
 *
 * @param {string} xml - XML text.
 * @returns {string} Text with the matched declarations removed.
 */
function removeDoctype3(xml) {
  const doctypePattern = /<!DOCTYPE[^>]*>/gi;
  const withoutDoctype = xml.replace(doctypePattern, "");
  return withoutDoctype;
}
|
|
2364
|
+
/**
 * Strip XML comments, including multi-line ones, from the input.
 *
 * @param {string} xml - XML text.
 * @returns {string} Text with every comment removed.
 */
function removeComments3(xml) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  const withoutComments = xml.replace(commentPattern, "");
  return withoutComments;
}
|
|
2367
|
+
/**
 * Replace each CDATA section with a numbered placeholder.
 *
 * Placeholders have the form "__CDATA_<n>__", numbered from 0 in document
 * order; the map stores placeholder -> inner CDATA content.
 *
 * @param {string} text - XML text that may contain CDATA sections.
 * @returns {{text: string, cdataMap: Map<string, string>}} Rewritten text and the placeholder map.
 */
function extractCDATA3(text) {
  const cdataMap = new Map();
  // The map size doubles as the running counter: every stored key is unique.
  const stash = (_whole, inner) => {
    const key = `__CDATA_${cdataMap.size}__`;
    cdataMap.set(key, inner);
    return key;
  };
  const rewritten = text.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, stash);
  return { text: rewritten, cdataMap };
}
|
|
2378
|
+
/**
 * Substitute CDATA placeholders in `text` with their original content.
 *
 * Done in a single pass with a function replacer so that:
 *  - "$"-sequences inside CDATA content ("$&", "$1", "$$") are inserted
 *    literally instead of being expanded as replacement patterns;
 *  - restored content is never re-scanned, so content that happens to look
 *    like another placeholder is left alone;
 *  - cost depends on the text length, not on the number of map entries.
 *
 * Assumes keys follow the "__CDATA_<n>__" format produced by extractCDATA3;
 * placeholder-shaped tokens that are not in the map are left untouched.
 *
 * @param {string} text - Text containing placeholders.
 * @param {Map<string, string>} cdataMap - Placeholder -> original content.
 * @returns {string} Text with known placeholders replaced.
 */
function restoreCDATA3(text, cdataMap) {
  if (cdataMap.size === 0) {
    return text;
  }
  return text.replace(/__CDATA_\d+__/g, (placeholder) =>
    cdataMap.has(placeholder) ? cdataMap.get(placeholder) : placeholder
  );
}
|
|
2385
|
+
/**
 * Parse `name="value"` / `name='value'` pairs out of a tag's attribute text.
 *
 * A value ends at the same quote character that opened it, so a
 * double-quoted value may contain apostrophes and vice versa. Whitespace
 * around "=" is accepted. Later duplicates overwrite earlier ones.
 *
 * @param {string} tagContent - Text after the tag name inside an opening tag.
 * @returns {Object<string, string>} Attribute name -> raw (undecoded) value.
 */
function parseAttributes3(tagContent) {
  const attributes = {};
  const attrPattern = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  for (const match of tagContent.matchAll(attrPattern)) {
    // Exactly one of the two value groups participates in a match.
    attributes[match[1]] = match[2] ?? match[3];
  }
  return attributes;
}
|
|
2395
|
+
/**
 * Find the index of the closing tag that matches an already-opened element.
 *
 * Nested elements with the same name are tracked by depth. Only exact
 * name matches count as nested opens: "<ab>" is not an opening of "a".
 * Self-closing same-name tags ("<a/>") do not increase the depth because
 * they have no closing tag of their own.
 *
 * @param {string} xml - Text to search.
 * @param {string} tagName - Name of the element whose end is wanted.
 * @param {number} startPos - Index just past the element's opening tag.
 * @returns {number} Index of the matching "</tagName>", or -1 if none.
 */
function findClosingTag3(xml, tagName, startPos) {
  const openTag = `<${tagName}`;
  const closeTag = `</${tagName}>`;

  // True when the "<tagName" hit at `index` is followed by a character that
  // terminates a tag name, i.e. it is not merely a prefix of a longer name.
  const isExactOpen = (index) => {
    const next = xml[index + openTag.length];
    return next !== undefined && (next === ">" || next === "/" || /\s/.test(next));
  };

  const findOpen = (from) => {
    let index = xml.indexOf(openTag, from);
    while (index !== -1 && !isExactOpen(index)) {
      index = xml.indexOf(openTag, index + openTag.length);
    }
    return index;
  };

  let depth = 1;
  let pos = startPos;
  while (pos < xml.length && depth > 0) {
    const nextClose = xml.indexOf(closeTag, pos);
    if (nextClose === -1) {
      return -1;
    }
    const nextOpen = findOpen(pos);
    if (nextOpen !== -1 && nextOpen < nextClose) {
      const tagEnd = xml.indexOf(">", nextOpen);
      // Only trust the ">" when it belongs to this opening tag.
      const wellFormed = tagEnd !== -1 && tagEnd < nextClose;
      const selfClosing = wellFormed && xml[tagEnd - 1] === "/";
      if (!selfClosing) {
        depth++;
      }
      pos = wellFormed ? tagEnd + 1 : nextOpen + openTag.length;
    } else {
      depth--;
      if (depth === 0) {
        return nextClose;
      }
      pos = nextClose + closeTag.length;
    }
  }
  return -1;
}
|
|
2419
|
+
/**
 * Parse the first element at or after `startPos` and, recursively, its children.
 *
 * The tag name ends at the first whitespace character of any kind, so
 * opening tags whose attributes start on a new line or after a tab are
 * parsed correctly.
 *
 * @param {string} xml - Markup to scan.
 * @param {number} startPos - Index from which to look for the opening "<".
 * @param {object|null} [parent=null] - Stored on the result as `parent`.
 * @param {Map<string, string>} [cdataMap] - Placeholder map from an earlier
 *   extraction. When omitted, CDATA sections in `xml` are replaced with
 *   placeholders here first, so positions refer to the rewritten text.
 * @returns {{element: object, endPos: number, cdataMap: Map<string, string>}}
 *   The parsed element, the index just past it, and the placeholder map in use.
 * @throws {Error} When no "<" is found, the opening tag has no ">", or no
 *   matching closing tag exists.
 */
function parseElement3(xml, startPos, parent = null, cdataMap) {
  const extracted = cdataMap ? { text: xml, cdataMap } : extractCDATA3(xml);
  const cleanedXML = extracted.text;
  const currentCdataMap = extracted.cdataMap;
  const openTagStart = cleanedXML.indexOf("<", startPos);
  if (openTagStart === -1) {
    throw new Error("No opening tag found");
  }
  const openTagEnd = cleanedXML.indexOf(">", openTagStart);
  if (openTagEnd === -1) {
    throw new Error("Unclosed opening tag");
  }
  const openTagContent = cleanedXML.substring(openTagStart + 1, openTagEnd);
  const isSelfClosing = openTagContent.endsWith("/");
  const tagContent = isSelfClosing ? openTagContent.slice(0, -1).trim() : openTagContent;
  // Split on any whitespace (space, tab, newline), not only a literal space.
  const nameEnd = tagContent.search(/\s/);
  const tagName = nameEnd === -1 ? tagContent : tagContent.substring(0, nameEnd);
  const attributes = nameEnd === -1 ? {} : parseAttributes3(tagContent.substring(nameEnd));
  const element = {
    tagName,
    attributes,
    text: "",
    children: [],
    parent
  };
  if (isSelfClosing) {
    return { element, endPos: openTagEnd + 1, cdataMap: currentCdataMap };
  }
  const closingTagPos = findClosingTag3(cleanedXML, tagName, openTagEnd + 1);
  if (closingTagPos === -1) {
    throw new Error(`No closing tag found for <${tagName}>`);
  }
  const content = cleanedXML.substring(openTagEnd + 1, closingTagPos);
  if (content.includes("<")) {
    let pos = 0;
    const trimmedContent = content.trim();
    while (pos < trimmedContent.length) {
      const nextTag = trimmedContent.indexOf("<", pos);
      if (nextTag === -1) break;
      // Skip stray closing tags and "<!"-style markup.
      if (trimmedContent[nextTag + 1] === "/" || trimmedContent[nextTag + 1] === "!") {
        pos = nextTag + 1;
        continue;
      }
      try {
        const { element: child, endPos } = parseElement3(
          trimmedContent,
          nextTag,
          element,
          currentCdataMap
        );
        element.children.push(child);
        pos = endPos;
      } catch {
        // Best effort: a child that cannot be parsed is skipped and scanning
        // resumes one character further on.
        pos = nextTag + 1;
      }
    }
    // Text of an element with children is its content with all tags removed.
    let textContent = content.replace(/<[^>]+>/g, "").trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  } else {
    let textContent = content.trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  }
  const closingTagEnd = closingTagPos + `</${tagName}>`.length;
  return { element, endPos: closingTagEnd, cdataMap: currentCdataMap };
}
|
|
2486
|
+
/**
 * Depth-first search for the first element whose tag name equals `selector`.
 *
 * The starting element itself is a candidate. Only plain tag names are
 * supported; comparison ignores case unless `caseSensitive` is true.
 *
 * @param {object} element - Root of the subtree to search.
 * @param {string} selector - Tag name to look for.
 * @param {boolean} [caseSensitive=false] - Compare names exactly.
 * @returns {object|null} First match in document order, or null.
 */
function querySelector3(element, selector, caseSensitive = false) {
  const normalize = (name) => (caseSensitive ? name : name.toLowerCase());
  if (normalize(element.tagName) === normalize(selector)) {
    return element;
  }
  for (const child of element.children) {
    const hit = querySelector3(child, selector, caseSensitive);
    if (hit) {
      return hit;
    }
  }
  return null;
}
|
|
2498
|
+
/**
 * Collect every element in the subtree whose tag name equals `selector`.
 *
 * Matches are appended to one shared array during a pre-order walk, so
 * results are in document order and the starting element is included when
 * it matches. Avoiding `push(...childResults)` keeps large result sets from
 * hitting the engine's argument-count limit and from being re-copied at
 * every tree level.
 *
 * @param {object} element - Root of the subtree to search.
 * @param {string} selector - Tag name to look for.
 * @param {boolean} [caseSensitive=false] - Compare names exactly.
 * @returns {object[]} All matching elements (possibly empty).
 */
function querySelectorAll3(element, selector, caseSensitive = false) {
  const wanted = caseSensitive ? selector : selector.toLowerCase();
  const results = [];
  const visit = (node) => {
    const name = caseSensitive ? node.tagName : node.tagName.toLowerCase();
    if (name === wanted) {
      results.push(node);
    }
    for (const child of node.children) {
      visit(child);
    }
  };
  visit(element);
  return results;
}
|
|
2510
|
+
/**
 * Read an element's text, tolerating a missing element.
 *
 * @param {object|null|undefined} element - Element to read from.
 * @returns {string} The element's `text`, or "" when the element or its text is absent/empty.
 */
function getText2(element) {
  const text = element?.text;
  return text ? text : "";
}
|
|
2513
|
+
/**
 * Find the first direct child with the given tag name (case-insensitive).
 *
 * @param {object} element - Parent element.
 * @param {string} tagName - Tag name to match.
 * @returns {object|null} The first matching child, or null.
 */
function getChild(element, tagName) {
  const wanted = tagName.toLowerCase();
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      return child;
    }
  }
  return null;
}
|
|
2517
|
+
/**
 * Collect all direct children with the given tag name (case-insensitive).
 *
 * @param {object} element - Parent element.
 * @param {string} tagName - Tag name to match.
 * @returns {object[]} Matching children in their original order.
 */
function getChildren(element, tagName) {
  const wanted = tagName.toLowerCase();
  const matches = [];
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      matches.push(child);
    }
  }
  return matches;
}
|
|
2521
|
+
|
|
2522
|
+
// src/feed/sitemap/parse.ts
|
|
2523
|
+
/**
 * Parse sitemap XML into a structured result.
 *
 * Resolution order: a <sitemapindex> element, then a <urlset> element, then
 * any <url> elements found anywhere in the document. If none are found, an
 * empty urlset result is returned.
 *
 * @param {string} xml - Raw sitemap XML.
 * @param {string} [baseUrl] - Passed through to the URL extraction helpers.
 * @returns {{sitemap: {type: string, urls: object[], sitemaps: object[]}, isIndex: boolean}}
 */
function parseSitemap(xml, baseUrl) {
  const doc = parseSitemapXML(xml);

  const indexRoot = querySelector3(doc, "sitemapindex");
  if (indexRoot) {
    return parseSitemapIndex(indexRoot, baseUrl);
  }

  const urlsetRoot = querySelector3(doc, "urlset");
  if (urlsetRoot) {
    return parseUrlset(urlsetRoot, baseUrl);
  }

  // No recognised container: fall back to whatever <url> elements exist.
  // With no matches this yields the same empty urlset result.
  const looseUrls = querySelectorAll3(doc, "url");
  return {
    sitemap: {
      type: "urlset",
      urls: looseUrls.map((urlEl) => extractUrl(urlEl, baseUrl)),
      sitemaps: []
    },
    isIndex: false
  };
}
|
|
2553
|
+
/**
 * Build a sitemap-index result from a <sitemapindex> element.
 *
 * Each direct <sitemap> child contributes its <loc> and optional <lastmod>.
 * The location is entity-decoded in the same way as page URLs in
 * extractUrl, so escaped query strings resolve to the real URL.
 *
 * @param {object} element - The <sitemapindex> element.
 * @param {string} [baseUrl] - When truthy, locations are passed through
 *   normalizeUrlHttps(baseUrl, loc); otherwise they are returned as-is.
 * @returns {{sitemap: {type: "sitemapindex", urls: [], sitemaps: object[]}, isIndex: true}}
 */
function parseSitemapIndex(element, baseUrl) {
  const sitemapElements = getChildren(element, "sitemap");
  const sitemaps = sitemapElements.map((el) => {
    const loc = decodeXmlEntities(getText2(getChild(el, "loc")));
    const lastmod = getText2(getChild(el, "lastmod")) || void 0;
    return {
      loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
      lastmod
    };
  });
  return {
    sitemap: {
      type: "sitemapindex",
      urls: [],
      sitemaps
    },
    isIndex: true
  };
}
|
|
2572
|
+
/**
 * Build a urlset result from a <urlset> element.
 *
 * Only direct <url> children are considered.
 *
 * @param {object} element - The <urlset> element.
 * @param {string} [baseUrl] - Forwarded to extractUrl.
 * @returns {{sitemap: {type: "urlset", urls: object[], sitemaps: []}, isIndex: false}}
 */
function parseUrlset(element, baseUrl) {
  const urls = [];
  for (const urlEl of getChildren(element, "url")) {
    urls.push(extractUrl(urlEl, baseUrl));
  }
  return {
    sitemap: { type: "urlset", urls, sitemaps: [] },
    isIndex: false
  };
}
|
|
2584
|
+
/**
 * Convert a <url> element into a URL entry.
 *
 * @param {object} element - The <url> element.
 * @param {string} [baseUrl] - When truthy, the location is passed through
 *   normalizeUrlHttps(baseUrl, loc); otherwise it is returned as-is.
 * @returns {object} Entry with `loc`, `lastmod`, `changefreq`, `priority`
 *   (each undefined when absent) plus `news`, `images` and `videos` when the
 *   corresponding extensions yield data.
 */
function extractUrl(element, baseUrl) {
  const rawLoc = getText2(getChild(element, "loc"));
  const loc = decodeXmlEntities(rawLoc);
  const lastmod = getText2(getChild(element, "lastmod")) || void 0;
  const changefreq = getText2(getChild(element, "changefreq")) || void 0;
  const priorityText = getText2(getChild(element, "priority"));
  const parsedPriority = priorityText ? Number.parseFloat(priorityText) : Number.NaN;
  const result = {
    loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
    lastmod,
    changefreq,
    // Test for NaN explicitly: 0 is a valid priority and must not be dropped.
    priority: Number.isNaN(parsedPriority) ? void 0 : parsedPriority
  };
  const news = extractNews(element);
  if (news) {
    result.news = news;
  }
  const images = extractImages(element, baseUrl);
  if (images.length > 0) {
    result.images = images;
  }
  const videos = extractVideos(element, baseUrl);
  if (videos.length > 0) {
    result.videos = videos;
  }
  return result;
}
|
|
2611
|
+
/**
 * Extract news-extension data from a <url> element.
 *
 * Every lookup tries, in order: the "news:"-prefixed name, the bare name,
 * and finally any child whose tag ends with ":<name>" (another prefix).
 *
 * @param {object} urlElement - The <url> element.
 * @returns {object|undefined} Object with any of `publication`,
 *   `publicationDate`, `title`, `keywords`, `stockTickers`; undefined when
 *   there is no news element or it yields no data.
 */
function extractNews(urlElement) {
  // First existing child element under any of the three name forms.
  const findNamespaced = (parentEl, localName) =>
    getChild(parentEl, `news:${localName}`) ||
    getChild(parentEl, localName) ||
    parentEl.children.find((c) => c.tagName.toLowerCase().endsWith(`:${localName}`));

  // First non-empty text under any of the three name forms.
  const textOf = (parentEl, localName) =>
    getText2(getChild(parentEl, `news:${localName}`)) ||
    getText2(getChild(parentEl, localName)) ||
    getText2(parentEl.children.find((c) => c.tagName.toLowerCase().endsWith(`:${localName}`)));

  const splitList = (value) => value.split(",").map((part) => part.trim());

  const newsEl = findNamespaced(urlElement, "news");
  if (!newsEl) {
    return void 0;
  }

  const news = {};

  const pubEl = findNamespaced(newsEl, "publication");
  if (pubEl) {
    const name = textOf(pubEl, "name");
    const language = textOf(pubEl, "language");
    if (name || language) {
      news.publication = {
        name: name || void 0,
        language: language || void 0
      };
    }
  }

  const pubDate = textOf(newsEl, "publication_date");
  if (pubDate) {
    news.publicationDate = pubDate;
  }

  const title = textOf(newsEl, "title");
  if (title) {
    news.title = decodeXmlEntities(title);
  }

  const keywords = textOf(newsEl, "keywords");
  if (keywords) {
    news.keywords = splitList(keywords);
  }

  const stockTickers = textOf(newsEl, "stock_tickers");
  if (stockTickers) {
    news.stockTickers = splitList(stockTickers);
  }

  return Object.keys(news).length > 0 ? news : void 0;
}
|
|
2646
|
+
/**
 * Extract image-extension entries from a <url> element.
 *
 * Direct children named "image" (bare or with any prefix) are considered;
 * entries without a location are dropped. Field lookups try the "image:"
 * prefix, then the bare name, then any other prefix.
 *
 * @param {object} urlElement - The <url> element.
 * @param {string} [baseUrl] - When truthy, `loc` and `license` are passed
 *   through normalizeUrlHttps(baseUrl, value).
 * @returns {object[]} Images with `loc` and optional `caption`,
 *   `geoLocation`, `title`, `license`.
 */
function extractImages(urlElement, baseUrl) {
  const textOf = (parentEl, localName) =>
    getText2(getChild(parentEl, `image:${localName}`)) ||
    getText2(getChild(parentEl, localName)) ||
    getText2(parentEl.children.find((c) => c.tagName.toLowerCase().endsWith(`:${localName}`)));

  const resolve = (href) => (baseUrl ? normalizeUrlHttps(baseUrl, href) : href);

  const images = [];
  for (const imgEl of urlElement.children) {
    const tag = imgEl.tagName.toLowerCase();
    // "image:image" is covered by the ":image" suffix test.
    if (tag !== "image" && !tag.endsWith(":image")) {
      continue;
    }

    const loc = textOf(imgEl, "loc");
    if (!loc) {
      continue;
    }
    const image = { loc: resolve(loc) };

    const caption = textOf(imgEl, "caption");
    if (caption) {
      image.caption = decodeXmlEntities(caption);
    }

    const geoLocation = textOf(imgEl, "geo_location");
    if (geoLocation) {
      image.geoLocation = geoLocation;
    }

    const title = textOf(imgEl, "title");
    if (title) {
      image.title = decodeXmlEntities(title);
    }

    const license = textOf(imgEl, "license");
    if (license) {
      image.license = resolve(license);
    }

    images.push(image);
  }
  return images;
}
|
|
2667
|
+
/**
 * Extract video-extension entries from a <url> element.
 *
 * Direct children named "video" (bare or with any prefix) are considered.
 * An entry is kept only when thumbnail location, title and description are
 * all present. Field lookups try the "video:" prefix, then the bare name,
 * then any other prefix.
 *
 * @param {object} urlElement - The <url> element.
 * @param {string} [baseUrl] - When truthy, location fields are passed
 *   through normalizeUrlHttps(baseUrl, value).
 * @returns {object[]} Videos with `thumbnailLoc`, `title`, `description`
 *   and optional `contentLoc`, `playerLoc`, `duration`, `rating`,
 *   `viewCount`, `publicationDate`, `familyFriendly`, `category`, `tags`.
 */
function extractVideos(urlElement, baseUrl) {
  const textOf = (parentEl, localName) =>
    getText2(getChild(parentEl, `video:${localName}`)) ||
    getText2(getChild(parentEl, localName)) ||
    getText2(parentEl.children.find((c) => c.tagName.toLowerCase().endsWith(`:${localName}`)));

  const resolve = (href) => (baseUrl ? normalizeUrlHttps(baseUrl, href) : href);

  // "video:x" is covered by the ":x" suffix test.
  const hasLocalName = (el, localName) => {
    const tag = el.tagName.toLowerCase();
    return tag === localName || tag.endsWith(`:${localName}`);
  };

  const videos = [];
  for (const vidEl of urlElement.children) {
    if (!hasLocalName(vidEl, "video")) {
      continue;
    }

    const thumbnailLoc = textOf(vidEl, "thumbnail_loc");
    const title = textOf(vidEl, "title");
    const description = textOf(vidEl, "description");
    if (!thumbnailLoc || !title || !description) {
      continue;
    }

    const video = {
      thumbnailLoc: resolve(thumbnailLoc),
      title: decodeXmlEntities(title),
      description: decodeXmlEntities(description)
    };

    const contentLoc = textOf(vidEl, "content_loc");
    if (contentLoc) {
      video.contentLoc = resolve(contentLoc);
    }

    const playerLoc = textOf(vidEl, "player_loc");
    if (playerLoc) {
      video.playerLoc = resolve(playerLoc);
    }

    // Missing fields come back as "", which parses to NaN and is skipped.
    const duration = Number.parseInt(textOf(vidEl, "duration"), 10);
    if (!Number.isNaN(duration)) {
      video.duration = duration;
    }

    const rating = Number.parseFloat(textOf(vidEl, "rating"));
    if (!Number.isNaN(rating)) {
      video.rating = rating;
    }

    const viewCount = Number.parseInt(textOf(vidEl, "view_count"), 10);
    if (!Number.isNaN(viewCount)) {
      video.viewCount = viewCount;
    }

    const publicationDate = textOf(vidEl, "publication_date");
    if (publicationDate) {
      video.publicationDate = publicationDate;
    }

    const familyFriendly = textOf(vidEl, "family_friendly");
    if (familyFriendly) {
      video.familyFriendly = familyFriendly.toLowerCase() === "yes";
    }

    const category = textOf(vidEl, "category");
    if (category) {
      video.category = category;
    }

    const tagElements = vidEl.children.filter((c) => hasLocalName(c, "tag"));
    if (tagElements.length > 0) {
      video.tags = tagElements.map((t) => getText2(t)).filter(Boolean);
    }

    videos.push(video);
  }
  return videos;
}
|
|
2718
|
+
/**
 * Decode XML character references in a string.
 *
 * Handles the five predefined named entities (lt, gt, amp, quot, apos) and
 * decimal / hexadecimal numeric references. Decoding happens in a single
 * pass, so an escaped ampersand followed by "lt;" decodes to the literal
 * entity text rather than being decoded twice. Numeric references are
 * converted by code point, so characters above U+FFFF are preserved; values
 * beyond U+10FFFF are left as written.
 *
 * @param {string} text - Text that may contain entity references.
 * @returns {string} Decoded text.
 */
function decodeXmlEntities(text) {
  const named = { lt: "<", gt: ">", amp: "&", quot: '"', apos: "'" };
  const entityPattern = /&(?:(lt|gt|amp|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g;
  return text.replace(entityPattern, (whole, name, dec, hex) => {
    if (name !== undefined) {
      return named[name];
    }
    const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  });
}
|
|
2721
|
+
|
|
2350
2722
|
// src/pluck/types.ts
|
|
2351
2723
|
var PluckError = class extends Error {
|
|
2352
2724
|
constructor(message) {
|
|
@@ -2723,6 +3095,12 @@ function parseHTML(html, baseUrl) {
|
|
|
2723
3095
|
});
|
|
2724
3096
|
return document;
|
|
2725
3097
|
}
|
|
3098
|
+
/**
 * Accept either an HTML string or an already-parsed document.
 *
 * @param {*} input - HTML source text, or a value to return unchanged.
 * @param {string} [baseUrl] - Forwarded to parseHTML when `input` is a string.
 * @returns {*} parseHTML(input, baseUrl) for strings; otherwise `input` itself.
 */
function ensureDocument(input, baseUrl) {
  return typeof input === "string" ? parseHTML(input, baseUrl) : input;
}
|
|
2726
3104
|
|
|
2727
3105
|
// src/utils/meta-helpers.ts
|
|
2728
3106
|
function getMetaContent(doc, name) {
|
|
@@ -2750,7 +3128,8 @@ function getMetaHttpEquiv(doc, httpEquiv) {
|
|
|
2750
3128
|
}
|
|
2751
3129
|
|
|
2752
3130
|
// src/metadata/opengraph/extract.ts
|
|
2753
|
-
function extractOpenGraph(
|
|
3131
|
+
function extractOpenGraph(input) {
|
|
3132
|
+
const doc = ensureDocument(input);
|
|
2754
3133
|
const metadata = {};
|
|
2755
3134
|
metadata.title = getMetaProperty(doc, "og:title");
|
|
2756
3135
|
metadata.type = getMetaProperty(doc, "og:type");
|
|
@@ -2775,7 +3154,7 @@ function extractOpenGraph(doc) {
|
|
|
2775
3154
|
if (Object.keys(audio).length > 0) {
|
|
2776
3155
|
metadata.audio = audio;
|
|
2777
3156
|
}
|
|
2778
|
-
const images =
|
|
3157
|
+
const images = extractImages2(doc);
|
|
2779
3158
|
if (images.length > 0) {
|
|
2780
3159
|
metadata.images = images;
|
|
2781
3160
|
}
|
|
@@ -2844,7 +3223,7 @@ function extractAudio(doc) {
|
|
|
2844
3223
|
Object.entries(audio).filter(([_, value]) => value !== void 0)
|
|
2845
3224
|
);
|
|
2846
3225
|
}
|
|
2847
|
-
function
|
|
3226
|
+
function extractImages2(doc) {
|
|
2848
3227
|
const images = [];
|
|
2849
3228
|
const imageUrls = getAllMetaPropertyValues(doc, "og:image");
|
|
2850
3229
|
const imageSecureUrls = getAllMetaPropertyValues(doc, "og:image:secure_url");
|
|
@@ -2963,7 +3342,8 @@ function matchesAnyType(obj, targetTypes) {
|
|
|
2963
3342
|
}
|
|
2964
3343
|
|
|
2965
3344
|
// src/metadata/schema-org/extract.ts
|
|
2966
|
-
function extractSchemaOrg(
|
|
3345
|
+
function extractSchemaOrg(input) {
|
|
3346
|
+
const doc = ensureDocument(input);
|
|
2967
3347
|
const metadata = {
|
|
2968
3348
|
jsonLd: []
|
|
2969
3349
|
};
|
|
@@ -3040,7 +3420,8 @@ function organizeByType(metadata) {
|
|
|
3040
3420
|
}
|
|
3041
3421
|
|
|
3042
3422
|
// src/metadata/seo/extract.ts
|
|
3043
|
-
function extractSEO(
|
|
3423
|
+
function extractSEO(input) {
|
|
3424
|
+
const doc = ensureDocument(input);
|
|
3044
3425
|
const metadata = {};
|
|
3045
3426
|
const titleElement = doc.querySelector("title");
|
|
3046
3427
|
if (titleElement?.textContent) {
|
|
@@ -3072,7 +3453,8 @@ function extractSEO(doc) {
|
|
|
3072
3453
|
}
|
|
3073
3454
|
|
|
3074
3455
|
// src/metadata/twitter-card/extract.ts
|
|
3075
|
-
function extractTwitterCard(
|
|
3456
|
+
function extractTwitterCard(input) {
|
|
3457
|
+
const doc = ensureDocument(input);
|
|
3076
3458
|
const metadata = {};
|
|
3077
3459
|
metadata.card = getMetaContent(doc, "twitter:card");
|
|
3078
3460
|
metadata.site = getMetaContent(doc, "twitter:site");
|
|
@@ -3229,7 +3611,8 @@ function getAllLinksByPrefix(doc, relPrefix) {
|
|
|
3229
3611
|
}
|
|
3230
3612
|
|
|
3231
3613
|
// src/metadata/icons/extract.ts
|
|
3232
|
-
function extractIcons(
|
|
3614
|
+
function extractIcons(input) {
|
|
3615
|
+
const doc = ensureDocument(input);
|
|
3233
3616
|
const metadata = {};
|
|
3234
3617
|
const iconLinks = getAllLinksByRels(doc, ["icon", "shortcut icon"]);
|
|
3235
3618
|
for (const link of iconLinks) {
|
|
@@ -3410,7 +3793,8 @@ function parseSizeString(sizeStr) {
|
|
|
3410
3793
|
}
|
|
3411
3794
|
|
|
3412
3795
|
// src/metadata/language/extract.ts
|
|
3413
|
-
function extractLanguage(
|
|
3796
|
+
function extractLanguage(input) {
|
|
3797
|
+
const doc = ensureDocument(input);
|
|
3414
3798
|
const metadata = {};
|
|
3415
3799
|
const htmlElement = doc.querySelector("html");
|
|
3416
3800
|
if (htmlElement) {
|
|
@@ -3462,7 +3846,8 @@ function extractBestLanguage(doc) {
|
|
|
3462
3846
|
}
|
|
3463
3847
|
|
|
3464
3848
|
// src/metadata/links/extract.ts
|
|
3465
|
-
function extractLinks3(
|
|
3849
|
+
function extractLinks3(input, baseUrl, options = {}) {
|
|
3850
|
+
const doc = ensureDocument(input);
|
|
3466
3851
|
const opts = normalizeOptions3(options);
|
|
3467
3852
|
const effectiveBaseUrl = getEffectiveBaseUrl(doc, baseUrl);
|
|
3468
3853
|
const baseOrigin = effectiveBaseUrl ? getOrigin(effectiveBaseUrl) : null;
|
|
@@ -3793,7 +4178,8 @@ function getStringProperty3(obj, prop) {
|
|
|
3793
4178
|
}
|
|
3794
4179
|
|
|
3795
4180
|
// src/metadata/canonical/extract.ts
|
|
3796
|
-
function extractCanonical(
|
|
4181
|
+
function extractCanonical(input) {
|
|
4182
|
+
const doc = ensureDocument(input);
|
|
3797
4183
|
const metadata = {};
|
|
3798
4184
|
metadata.canonical = getLinkHref(doc, "canonical");
|
|
3799
4185
|
const alternateLinks = getAllLinks(doc, "alternate");
|
|
@@ -3938,9 +4324,63 @@ async function gatherFeed(url) {
|
|
|
3938
4324
|
}
|
|
3939
4325
|
const response = await pluck(feedUrl);
|
|
3940
4326
|
const content = await response.textUtf8();
|
|
4327
|
+
const format = detectFormat(content);
|
|
4328
|
+
if (format === "sitemap") {
|
|
4329
|
+
return normalizeSitemapToFeed(content, response.finalUrl);
|
|
4330
|
+
}
|
|
3941
4331
|
const result = parseFeed(content, response.finalUrl);
|
|
3942
4332
|
return result.feed;
|
|
3943
4333
|
}
|
|
4334
|
+
/**
 * Convert sitemap XML into the feed shape ({ format, title, url, items }).
 *
 * A sitemap index becomes one item per child sitemap. A urlset becomes one
 * item per URL, enriched from news-extension data (title, published date,
 * publication name as author, keywords as tags) and the first image when
 * present.
 *
 * @param {string} content - Raw sitemap XML.
 * @param {string} baseUrl - URL of the sitemap; used as the feed `url` and,
 *   when it parses as a URL, for the "<hostname> Sitemap" title.
 * @returns {{format: "sitemap", title: string, url: string, items: object[]}}
 */
function normalizeSitemapToFeed(content, baseUrl) {
  const { sitemap, isIndex } = parseSitemap(content, baseUrl);

  if (isIndex) {
    return {
      format: "sitemap",
      title: "Sitemap Index",
      url: baseUrl,
      items: sitemap.sitemaps.map((entry, index) => ({
        id: entry.loc || `sitemap-${index}`,
        url: entry.loc,
        title: `Sitemap: ${entry.loc}`,
        modified: entry.lastmod
      }))
    };
  }

  const toItem = (entry, index) => {
    const item = {
      id: entry.loc || `url-${index}`,
      url: entry.loc,
      modified: entry.lastmod
    };
    const { news, images } = entry;
    if (news) {
      item.title = news.title;
      item.published = news.publicationDate;
      if (news.publication?.name) {
        item.authors = [{ name: news.publication.name }];
      }
      if (news.keywords) {
        item.tags = news.keywords;
      }
    }
    if (images && images.length > 0) {
      item.image = images[0].loc;
    }
    return item;
  };
  const items = sitemap.urls.map(toItem);

  // Fall back to the generic title when baseUrl is not a parseable URL.
  let title;
  try {
    title = `${new URL(baseUrl).hostname} Sitemap`;
  } catch {
    title = "Sitemap";
  }

  return {
    format: "sitemap",
    title,
    url: baseUrl,
    items
  };
}
|
|
3944
4384
|
|
|
3945
4385
|
// src/metadata/feed-discovery/heuristics.ts
|
|
3946
4386
|
var COMMON_FEED_PATHS = [
|
|
@@ -3975,7 +4415,8 @@ function generateFeedSuggestions(documentUrl) {
|
|
|
3975
4415
|
}
|
|
3976
4416
|
|
|
3977
4417
|
// src/metadata/feed-discovery/extract.ts
|
|
3978
|
-
function extractFeedDiscovery(
|
|
4418
|
+
function extractFeedDiscovery(input, documentUrl) {
|
|
4419
|
+
const doc = ensureDocument(input);
|
|
3979
4420
|
const metadata = {
|
|
3980
4421
|
feeds: []
|
|
3981
4422
|
};
|
|
@@ -4152,7 +4593,8 @@ async function gatherWebsite(url) {
|
|
|
4152
4593
|
}
|
|
4153
4594
|
|
|
4154
4595
|
// src/metadata/analytics/extract.ts
|
|
4155
|
-
function extractAnalytics(
|
|
4596
|
+
function extractAnalytics(input) {
|
|
4597
|
+
const doc = ensureDocument(input);
|
|
4156
4598
|
const metadata = {};
|
|
4157
4599
|
const scripts = doc.querySelectorAll("script");
|
|
4158
4600
|
const googleAnalytics = /* @__PURE__ */ new Set();
|
|
@@ -4244,10 +4686,11 @@ function extractAnalytics(doc) {
|
|
|
4244
4686
|
}
|
|
4245
4687
|
|
|
4246
4688
|
// src/metadata/assets/extract.ts
|
|
4247
|
-
function extractAssets(
|
|
4689
|
+
function extractAssets(input, baseUrl) {
|
|
4690
|
+
const doc = ensureDocument(input);
|
|
4248
4691
|
const metadata = {};
|
|
4249
4692
|
const effectiveBaseUrl = getEffectiveBaseUrl2(doc, baseUrl);
|
|
4250
|
-
const images =
|
|
4693
|
+
const images = extractImages3(doc, effectiveBaseUrl);
|
|
4251
4694
|
if (images.length > 0) {
|
|
4252
4695
|
metadata.images = images;
|
|
4253
4696
|
}
|
|
@@ -4301,7 +4744,7 @@ function getEffectiveBaseUrl2(doc, baseUrl) {
|
|
|
4301
4744
|
}
|
|
4302
4745
|
return null;
|
|
4303
4746
|
}
|
|
4304
|
-
function
|
|
4747
|
+
function extractImages3(doc, baseUrl) {
|
|
4305
4748
|
const urls = /* @__PURE__ */ new Set();
|
|
4306
4749
|
const imgElements = doc.querySelectorAll("img[src]");
|
|
4307
4750
|
for (const img of Array.from(imgElements)) {
|
|
@@ -4571,7 +5014,8 @@ function extractConnectionHints(doc, baseUrl) {
|
|
|
4571
5014
|
}
|
|
4572
5015
|
|
|
4573
5016
|
// src/metadata/copyright/extract.ts
|
|
4574
|
-
function extractCopyright(
|
|
5017
|
+
function extractCopyright(input) {
|
|
5018
|
+
const doc = ensureDocument(input);
|
|
4575
5019
|
const metadata = {};
|
|
4576
5020
|
metadata.copyright = getMetaContent(doc, "copyright");
|
|
4577
5021
|
metadata.license = getLinkHref(doc, "license");
|
|
@@ -4607,7 +5051,8 @@ function parseCopyright(copyrightString) {
|
|
|
4607
5051
|
}
|
|
4608
5052
|
|
|
4609
5053
|
// src/metadata/dublin-core/extract.ts
|
|
4610
|
-
function extractDublinCore(
|
|
5054
|
+
function extractDublinCore(input) {
|
|
5055
|
+
const doc = ensureDocument(input);
|
|
4611
5056
|
const metadata = {};
|
|
4612
5057
|
metadata.title = getMetaContent(doc, "DC.title") || getMetaContent(doc, "dcterms.title");
|
|
4613
5058
|
metadata.description = getMetaContent(doc, "DC.description") || getMetaContent(doc, "dcterms.description");
|
|
@@ -4648,7 +5093,8 @@ function extractMultiValue(doc, field) {
|
|
|
4648
5093
|
}
|
|
4649
5094
|
|
|
4650
5095
|
// src/metadata/geo/extract.ts
|
|
4651
|
-
function extractGeo(
|
|
5096
|
+
function extractGeo(input) {
|
|
5097
|
+
const doc = ensureDocument(input);
|
|
4652
5098
|
const metadata = {};
|
|
4653
5099
|
const geoPosition = getMetaContent(doc, "geo.position");
|
|
4654
5100
|
if (geoPosition) {
|
|
@@ -4705,7 +5151,8 @@ function parseICBM(icbm) {
|
|
|
4705
5151
|
}
|
|
4706
5152
|
|
|
4707
5153
|
// src/metadata/monetization/extract.ts
|
|
4708
|
-
function extractMonetization(
|
|
5154
|
+
function extractMonetization(input) {
|
|
5155
|
+
const doc = ensureDocument(input);
|
|
4709
5156
|
const metadata = {};
|
|
4710
5157
|
metadata.webMonetization = getMetaContent(doc, "monetization");
|
|
4711
5158
|
metadata.paypalVerification = getMetaContent(doc, "paypal-site-verification");
|
|
@@ -4719,7 +5166,8 @@ function extractMonetization(doc) {
|
|
|
4719
5166
|
}
|
|
4720
5167
|
|
|
4721
5168
|
// src/metadata/news/extract.ts
|
|
4722
|
-
function
|
|
5169
|
+
function extractNews2(input) {
|
|
5170
|
+
const doc = ensureDocument(input);
|
|
4723
5171
|
const metadata = {};
|
|
4724
5172
|
const newsKeywords = getMetaContent(doc, "news_keywords");
|
|
4725
5173
|
if (newsKeywords) {
|
|
@@ -4737,7 +5185,8 @@ function extractNews(doc) {
|
|
|
4737
5185
|
}
|
|
4738
5186
|
|
|
4739
5187
|
// src/metadata/pagination/extract.ts
|
|
4740
|
-
function extractPagination(
|
|
5188
|
+
function extractPagination(input) {
|
|
5189
|
+
const doc = ensureDocument(input);
|
|
4741
5190
|
const metadata = {};
|
|
4742
5191
|
metadata.prev = getLinkHref(doc, "prev") || getLinkHref(doc, "previous");
|
|
4743
5192
|
metadata.next = getLinkHref(doc, "next");
|
|
@@ -4836,7 +5285,8 @@ function parseKeyValueDirective(key, value, result) {
|
|
|
4836
5285
|
}
|
|
4837
5286
|
|
|
4838
5287
|
// src/metadata/robots/extract.ts
|
|
4839
|
-
function extractRobots(doc) {
|
|
5288
|
+
function extractRobots(input) {
|
|
5289
|
+
const doc = ensureDocument(input);
|
|
4840
5290
|
const metadata = {};
|
|
4841
5291
|
const robotsContent = getMetaContent(doc, "robots");
|
|
4842
5292
|
if (robotsContent) {
|
|
@@ -4870,7 +5320,8 @@ function extractRobots(doc) {
|
|
|
4870
5320
|
}
|
|
4871
5321
|
|
|
4872
5322
|
// src/metadata/security/extract.ts
|
|
4873
|
-
function extractSecurity(
|
|
5323
|
+
function extractSecurity(input) {
|
|
5324
|
+
const doc = ensureDocument(input);
|
|
4874
5325
|
const metadata = {};
|
|
4875
5326
|
metadata.referrerPolicy = getMetaContent(doc, "referrer");
|
|
4876
5327
|
metadata.contentSecurityPolicy = getMetaHttpEquiv(doc, "Content-Security-Policy");
|
|
@@ -4921,7 +5372,8 @@ function generateSitemapSuggestions(documentUrl) {
|
|
|
4921
5372
|
}
|
|
4922
5373
|
|
|
4923
5374
|
// src/metadata/sitemap-discovery/extract.ts
|
|
4924
|
-
function extractSitemapDiscovery(doc, documentUrl) {
|
|
5375
|
+
function extractSitemapDiscovery(input, documentUrl) {
|
|
5376
|
+
const doc = ensureDocument(input);
|
|
4925
5377
|
const metadata = {
|
|
4926
5378
|
sitemaps: []
|
|
4927
5379
|
};
|
|
@@ -4934,7 +5386,8 @@ function extractSitemapDiscovery(doc, documentUrl) {
|
|
|
4934
5386
|
}
|
|
4935
5387
|
|
|
4936
5388
|
// src/metadata/social-profiles/extract.ts
|
|
4937
|
-
function extractSocialProfiles(
|
|
5389
|
+
function extractSocialProfiles(input) {
|
|
5390
|
+
const doc = ensureDocument(input);
|
|
4938
5391
|
const metadata = {};
|
|
4939
5392
|
metadata.twitter = getMetaContent(doc, "twitter:site") || getMetaContent(doc, "twitter:creator") || extractFromProperty(doc, "twitter:site") || extractFromProperty(doc, "twitter:creator");
|
|
4940
5393
|
if (metadata.twitter) {
|
|
@@ -5087,7 +5540,8 @@ function categorizeSchemaProfile(url, metadata) {
|
|
|
5087
5540
|
}
|
|
5088
5541
|
|
|
5089
5542
|
// src/metadata/verification/extract.ts
|
|
5090
|
-
function extractVerification(
|
|
5543
|
+
function extractVerification(input) {
|
|
5544
|
+
const doc = ensureDocument(input);
|
|
5091
5545
|
const metadata = {};
|
|
5092
5546
|
metadata.googleSiteVerification = getMetaContent(doc, "google-site-verification");
|
|
5093
5547
|
metadata.msvalidate = getMetaContent(doc, "msvalidate.01");
|
|
@@ -6934,6 +7388,6 @@ async function swoop(url, init) {
|
|
|
6934
7388
|
* @packageDocumentation
|
|
6935
7389
|
*/
|
|
6936
7390
|
|
|
6937
|
-
export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|
|
7391
|
+
export { PluckContentTypeError, PluckEncodingError, PluckError, PluckHttpError, PluckNetworkError, PluckRedirectError, PluckSizeError, PluckTimeoutError, SwoopEnvironmentError, SwoopError, SwoopExecutionError, SwoopSecurityError, SwoopTimeoutError, assessContentQuality, calculateReadingTime, countWords, detectFormat, extractAnalytics, extractAssets, extractCanonical, extractContent, extractCopyright, extractDublinCore, extractFeedDiscovery, extractGeo, extractIcons, extractLanguage, extractLinks3 as extractLinks, extractMonetization, extractNews2 as extractNews, extractOpenGraph, extractPagination, extractRobots, extractSEO, extractSchemaOrg, extractSecurity, extractSitemapDiscovery, extractSocialProfiles, extractTwitterCard, extractVerification, gatherArticle, gatherFeed, gatherWebsite, htmlToText, isAtom, isFeed, isJSONFeed, isProbablyReaderable, isRSS, parseFeed, parseHTML, pluck, swoop };
|
|
6938
7392
|
//# sourceMappingURL=index.js.map
|
|
6939
7393
|
//# sourceMappingURL=index.js.map
|