magpie-html 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +481 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +158 -109
- package/dist/index.d.ts +158 -109
- package/dist/index.js +481 -27
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -2353,6 +2353,378 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2353
2353
|
}
|
|
2354
2354
|
}
|
|
2355
2355
|
|
|
2356
|
+
// src/feed/sitemap/xml-parser.ts
|
|
2357
|
+
/**
 * Parse a sitemap XML string into a lightweight element tree.
 *
 * The input is passed through the declaration, DOCTYPE and comment strippers
 * (in that order) and the first element found from offset 0 is parsed.
 *
 * @param {string} xml - Raw XML text.
 * @returns {object} The root element produced by `parseElement3`.
 * @throws {Error} Whatever `parseElement3` throws for malformed markup.
 */
function parseSitemapXML(xml) {
  const stripped = removeComments3(removeDoctype3(cleanXMLDeclaration2(xml)));
  const { element } = parseElement3(stripped, 0);
  return element;
}
|
|
2364
|
+
/**
 * Remove the XML declaration and any other `<?xml…?>` processing instruction
 * (e.g. `<?xml-stylesheet …?>`), then trim surrounding whitespace.
 *
 * The match runs lazily up to the first `?>` so that a `?` inside an
 * attribute value (such as a query string in an `href`) does not prevent the
 * instruction from being removed.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The text without `<?xml…?>` instructions, trimmed.
 */
function cleanXMLDeclaration2(xml) {
  return xml.replace(/<\?xml[\s\S]*?\?>/g, "").trim();
}
|
|
2367
|
+
/**
 * Remove DOCTYPE declarations from an XML string.
 *
 * Also handles a DOCTYPE that carries an internal subset
 * (`<!DOCTYPE x [ <!ENTITY …> ]>`): the bracketed part is consumed as a unit
 * so the `>` characters inside it do not end the match early.
 *
 * @param {string} xml - XML text.
 * @returns {string} The text with every DOCTYPE declaration removed.
 */
function removeDoctype3(xml) {
  return xml.replace(/<!DOCTYPE[^>[]*(?:\[[\s\S]*?\])?[^>]*>/gi, "");
}
|
|
2370
|
+
/**
 * Strip every `<!-- … -->` comment from an XML string.
 *
 * An opening marker with no closing `-->` after it is left in place.
 *
 * @param {string} xml - XML text.
 * @returns {string} The text without comments.
 */
function removeComments3(xml) {
  const OPEN = "<!--";
  const CLOSE = "-->";
  let output = "";
  let cursor = 0;
  for (;;) {
    const start = xml.indexOf(OPEN, cursor);
    if (start === -1) break;
    const end = xml.indexOf(CLOSE, start + OPEN.length);
    if (end === -1) break;
    output += xml.slice(cursor, start);
    cursor = end + CLOSE.length;
  }
  return output + xml.slice(cursor);
}
|
|
2373
|
+
/**
 * Replace every CDATA section with a numbered placeholder.
 *
 * Placeholders have the form `__CDATA_<n>__`, numbered from 0 in document
 * order. The section's inner text is stored in the returned map under its
 * placeholder.
 *
 * @param {string} text - XML text that may contain CDATA sections.
 * @returns {{text: string, cdataMap: Map<string, string>}} The rewritten text
 *   and the placeholder → original content map.
 */
function extractCDATA3(text) {
  const cdataMap = /* @__PURE__ */ new Map();
  let processed = "";
  let cursor = 0;
  for (const hit of text.matchAll(/<!\[CDATA\[([\s\S]*?)\]\]>/g)) {
    // Placeholders are unique, so the map size doubles as the running counter.
    const placeholder = `__CDATA_${cdataMap.size}__`;
    cdataMap.set(placeholder, hit[1]);
    processed += text.slice(cursor, hit.index) + placeholder;
    cursor = hit.index + hit[0].length;
  }
  processed += text.slice(cursor);
  return { text: processed, cdataMap };
}
|
|
2384
|
+
/**
 * Substitute CDATA placeholders in `text` with their original content.
 *
 * For each map entry the first occurrence of the placeholder is replaced.
 *
 * @param {string} text - Text containing placeholders.
 * @param {Map<string, string>} cdataMap - Placeholder → original content.
 * @returns {string} The text with placeholders replaced.
 */
function restoreCDATA3(text, cdataMap) {
  let result = text;
  for (const [placeholder, content] of cdataMap) {
    // A replacer function is used so `$&`, `$1`, `$$` etc. inside the CDATA
    // content are inserted literally instead of being treated as
    // replacement patterns.
    result = result.replace(placeholder, () => content);
  }
  return result;
}
|
|
2391
|
+
/**
 * Parse `name="value"` / `name='value'` pairs out of the inside of a tag.
 *
 * The closing quote must be the same kind as the opening one, so a value may
 * contain the other quote character. Whitespace around `=` is tolerated.
 * When a name repeats, the last occurrence wins.
 *
 * @param {string} tagContent - Text following the tag name.
 * @returns {Object<string, string>} Attribute name → raw (undecoded) value.
 */
function parseAttributes3(tagContent) {
  const attributes = {};
  const attrRegex = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  for (const match of tagContent.matchAll(attrRegex)) {
    // Exactly one of the two value groups participates in a match.
    attributes[match[1]] = match[2] ?? match[3];
  }
  return attributes;
}
|
|
2401
|
+
/**
 * Find the index of the `</tagName>` that closes an element whose opening tag
 * has already been consumed.
 *
 * Nested elements with the same name are depth-counted. An opening tag only
 * counts as nested when the name matches exactly (so `<urlx>` is not mistaken
 * for `<url>`) and the tag is not self-closing (`<url/>` has no close tag).
 *
 * @param {string} xml - Text to search.
 * @param {string} tagName - Name of the element being closed.
 * @param {number} startPos - Index just past the element's opening tag.
 * @returns {number} Index of the matching closing tag, or -1 if none.
 */
function findClosingTag3(xml, tagName, startPos) {
  const openTag = `<${tagName}`;
  const closeTag = `</${tagName}>`;

  // First genuine same-name opening tag in [from, limit), or -1.
  const findNestedOpen = (from, limit) => {
    let idx = xml.indexOf(openTag, from);
    while (idx !== -1 && idx < limit) {
      const after = xml[idx + openTag.length];
      const isExactName = after !== undefined && (after === ">" || after === "/" || /\s/.test(after));
      if (isExactName) {
        const tagEnd = xml.indexOf(">", idx);
        const isSelfClosing = tagEnd !== -1 && xml[tagEnd - 1] === "/";
        if (!isSelfClosing) {
          return idx;
        }
      }
      idx = xml.indexOf(openTag, idx + openTag.length);
    }
    return -1;
  };

  let depth = 1;
  let pos = startPos;
  while (pos < xml.length) {
    const nextClose = xml.indexOf(closeTag, pos);
    if (nextClose === -1) {
      return -1;
    }
    const nextOpen = findNestedOpen(pos, nextClose);
    if (nextOpen !== -1) {
      depth++;
      pos = nextOpen + openTag.length;
      continue;
    }
    depth--;
    if (depth === 0) {
      return nextClose;
    }
    pos = nextClose + closeTag.length;
  }
  return -1;
}
|
|
2425
|
+
/**
 * Parse one element (and, recursively, its children) starting at the first
 * `<` found at or after `startPos`.
 *
 * On the outermost call (no `cdataMap` given) CDATA sections are first swapped
 * for placeholders; recursive calls reuse that map. The element's `text` is
 * its content with all tags removed, trimmed, and with CDATA restored.
 *
 * @param {string} xml - Text to parse.
 * @param {number} startPos - Offset to start searching for the opening tag.
 * @param {object|null} [parent=null] - Parent element stored on the result.
 * @param {Map<string, string>} [cdataMap] - Existing CDATA placeholder map.
 * @returns {{element: object, endPos: number, cdataMap: Map<string, string>}}
 *   The parsed element, the offset just past it, and the CDATA map in use.
 * @throws {Error} When no opening tag is found, the opening tag is not
 *   terminated, or the matching closing tag is missing.
 */
function parseElement3(xml, startPos, parent = null, cdataMap) {
  const extracted = cdataMap ? { text: xml, cdataMap } : extractCDATA3(xml);
  const cleanedXML = extracted.text;
  const currentCdataMap = extracted.cdataMap;
  const openTagStart = cleanedXML.indexOf("<", startPos);
  if (openTagStart === -1) {
    throw new Error("No opening tag found");
  }
  const openTagEnd = cleanedXML.indexOf(">", openTagStart);
  if (openTagEnd === -1) {
    throw new Error("Unclosed opening tag");
  }
  const openTagContent = cleanedXML.substring(openTagStart + 1, openTagEnd);
  const isSelfClosing = openTagContent.endsWith("/");
  const tagContent = isSelfClosing ? openTagContent.slice(0, -1).trim() : openTagContent;
  // The name ends at the first whitespace of any kind: root tags are often
  // wrapped so that a newline or tab, not a space, follows the name.
  const nameEnd = tagContent.search(/\s/);
  const tagName = nameEnd === -1 ? tagContent : tagContent.substring(0, nameEnd);
  const attributes = nameEnd === -1 ? {} : parseAttributes3(tagContent.substring(nameEnd));
  const element = {
    tagName,
    attributes,
    text: "",
    children: [],
    parent
  };
  if (isSelfClosing) {
    return { element, endPos: openTagEnd + 1, cdataMap: currentCdataMap };
  }
  const closingTagPos = findClosingTag3(cleanedXML, tagName, openTagEnd + 1);
  if (closingTagPos === -1) {
    throw new Error(`No closing tag found for <${tagName}>`);
  }
  const content = cleanedXML.substring(openTagEnd + 1, closingTagPos);
  if (content.includes("<")) {
    let pos = 0;
    const trimmedContent = content.trim();
    while (pos < trimmedContent.length) {
      const nextTag = trimmedContent.indexOf("<", pos);
      if (nextTag === -1) break;
      // Closing tags and `<!…` constructs are not child elements; step over.
      if (trimmedContent[nextTag + 1] === "/" || trimmedContent[nextTag + 1] === "!") {
        pos = nextTag + 1;
        continue;
      }
      try {
        const { element: child, endPos } = parseElement3(
          trimmedContent,
          nextTag,
          element,
          currentCdataMap
        );
        element.children.push(child);
        pos = endPos;
      } catch {
        // Best effort: a child that cannot be parsed is skipped and scanning
        // resumes one character later.
        pos = nextTag + 1;
      }
    }
    let textContent = content.replace(/<[^>]+>/g, "").trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  } else {
    let textContent = content.trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  }
  const closingTagEnd = closingTagPos + `</${tagName}>`.length;
  return { element, endPos: closingTagEnd, cdataMap: currentCdataMap };
}
|
|
2492
|
+
/**
 * Return the first element, in document (pre-order) order starting with
 * `element` itself, whose tag name equals `selector`.
 *
 * @param {object} element - Root of the subtree to search.
 * @param {string} selector - Tag name to look for.
 * @param {boolean} [caseSensitive=false] - Compare names exactly when true;
 *   otherwise both sides are lower-cased first.
 * @returns {object|null} The matching element, or null when none matches.
 */
function querySelector3(element, selector, caseSensitive = false) {
  const normalize = (name) => (caseSensitive ? name : name.toLowerCase());
  const wanted = normalize(selector);
  const pending = [element];
  while (pending.length > 0) {
    const node = pending.pop();
    if (normalize(node.tagName) === wanted) {
      return node;
    }
    // Push in reverse so the first child is visited next.
    for (let i = node.children.length - 1; i >= 0; i--) {
      pending.push(node.children[i]);
    }
  }
  return null;
}
|
|
2504
|
+
/**
 * Collect every element in the subtree rooted at `element` (the root
 * included) whose tag name equals `selector`, in document (pre-order) order.
 *
 * @param {object} element - Root of the subtree to search.
 * @param {string} selector - Tag name to look for.
 * @param {boolean} [caseSensitive=false] - Compare names exactly when true;
 *   otherwise both sides are lower-cased first.
 * @returns {object[]} Matching elements; empty when there are none.
 */
function querySelectorAll3(element, selector, caseSensitive = false) {
  const normalize = (name) => (caseSensitive ? name : name.toLowerCase());
  const wanted = normalize(selector);
  const matches = [];
  const pending = [element];
  while (pending.length > 0) {
    const node = pending.pop();
    if (normalize(node.tagName) === wanted) {
      matches.push(node);
    }
    // Push in reverse so children are visited first-to-last.
    for (let i = node.children.length - 1; i >= 0; i--) {
      pending.push(node.children[i]);
    }
  }
  return matches;
}
|
|
2516
|
+
/**
 * Read an element's `text`, tolerating a missing element.
 *
 * @param {object|null|undefined} element - Element to read.
 * @returns {string} `element.text` when it is truthy, otherwise `""`.
 */
function getText2(element) {
  if (element === null || element === void 0) {
    return "";
  }
  const { text } = element;
  return text ? text : "";
}
|
|
2519
|
+
/**
 * Find the first direct child whose tag name matches, ignoring case.
 *
 * @param {object} element - Parent element.
 * @param {string} tagName - Tag name to match.
 * @returns {object|null} The first matching child, or null.
 */
function getChild(element, tagName) {
  const wanted = tagName.toLowerCase();
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      return child;
    }
  }
  return null;
}
|
|
2523
|
+
/**
 * Collect all direct children whose tag name matches, ignoring case.
 *
 * @param {object} element - Parent element.
 * @param {string} tagName - Tag name to match.
 * @returns {object[]} Matching children in their original order.
 */
function getChildren(element, tagName) {
  const wanted = tagName.toLowerCase();
  const matches = [];
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      matches.push(child);
    }
  }
  return matches;
}
|
|
2527
|
+
|
|
2528
|
+
// src/feed/sitemap/parse.ts
|
|
2529
|
+
/**
 * Parse sitemap XML into a normalized result.
 *
 * Resolution order: a `sitemapindex` element anywhere in the document, then a
 * `urlset` element, then any loose `url` elements. When none of these exist
 * an empty urlset is returned.
 *
 * @param {string} xml - Raw sitemap XML.
 * @param {string} [baseUrl] - Base URL forwarded to the entry extractors.
 * @returns {{sitemap: {type: string, urls: object[], sitemaps: object[]}, isIndex: boolean}}
 */
function parseSitemap(xml, baseUrl) {
  const doc = parseSitemapXML(xml);

  const indexEl = querySelector3(doc, "sitemapindex");
  if (indexEl) {
    return parseSitemapIndex(indexEl, baseUrl);
  }

  const urlsetEl = querySelector3(doc, "urlset");
  if (urlsetEl) {
    return parseUrlset(urlsetEl, baseUrl);
  }

  // No container element: fall back to whatever <url> elements exist.
  // Mapping an empty list yields the empty-urlset result.
  const looseUrls = querySelectorAll3(doc, "url");
  return {
    sitemap: {
      type: "urlset",
      urls: looseUrls.map((url) => extractUrl(url, baseUrl)),
      sitemaps: []
    },
    isIndex: false
  };
}
|
|
2559
|
+
/**
 * Convert a `sitemapindex` element into the normalized result shape.
 *
 * Each direct `sitemap` child contributes its `loc` and optional `lastmod`.
 * `loc` is entity-decoded (as `extractUrl` does for page URLs) so an escaped
 * `&` in a query string is restored before the URL is used.
 *
 * @param {object} element - The `sitemapindex` element.
 * @param {string} [baseUrl] - When given, `loc` is passed through
 *   `normalizeUrlHttps(baseUrl, loc)`.
 * @returns {{sitemap: {type: string, urls: object[], sitemaps: object[]}, isIndex: boolean}}
 */
function parseSitemapIndex(element, baseUrl) {
  const sitemapElements = getChildren(element, "sitemap");
  const sitemaps = sitemapElements.map((el) => {
    const loc = decodeXmlEntities(getText2(getChild(el, "loc")));
    const lastmod = getText2(getChild(el, "lastmod")) || void 0;
    return {
      loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
      lastmod
    };
  });
  return {
    sitemap: {
      type: "sitemapindex",
      urls: [],
      sitemaps
    },
    isIndex: true
  };
}
|
|
2578
|
+
/**
 * Convert a `urlset` element into the normalized result shape.
 *
 * Only direct `url` children are considered; each is run through `extractUrl`.
 *
 * @param {object} element - The `urlset` element.
 * @param {string} [baseUrl] - Base URL forwarded to `extractUrl`.
 * @returns {{sitemap: {type: string, urls: object[], sitemaps: object[]}, isIndex: boolean}}
 */
function parseUrlset(element, baseUrl) {
  const urls = [];
  for (const urlEl of getChildren(element, "url")) {
    urls.push(extractUrl(urlEl, baseUrl));
  }
  const sitemap = { type: "urlset", urls, sitemaps: [] };
  return { sitemap, isIndex: false };
}
|
|
2590
|
+
/**
 * Build a URL entry from a `url` element.
 *
 * Reads `loc` (entity-decoded), `lastmod`, `changefreq` and `priority`, then
 * attaches `news`, `images` and `videos` when the respective extractors
 * return data.
 *
 * @param {object} element - The `url` element.
 * @param {string} [baseUrl] - When given, `loc` is passed through
 *   `normalizeUrlHttps(baseUrl, loc)`.
 * @returns {object} Entry with `loc`, `lastmod`, `changefreq`, `priority` and
 *   optional `news` / `images` / `videos`.
 */
function extractUrl(element, baseUrl) {
  const childText = (name) => getText2(getChild(element, name));
  const loc = decodeXmlEntities(childText("loc"));
  const lastmod = childText("lastmod") || void 0;
  const changefreq = childText("changefreq") || void 0;
  const priorityText = childText("priority");
  const priority = priorityText ? Number.parseFloat(priorityText) : void 0;
  const result = {
    loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
    lastmod,
    changefreq,
    // Checked against undefined rather than by truthiness so that a declared
    // priority of 0 is kept instead of being dropped.
    priority: priority !== void 0 && !Number.isNaN(priority) ? priority : void 0
  };
  const news = extractNews(element);
  if (news) {
    result.news = news;
  }
  const images = extractImages(element, baseUrl);
  if (images.length > 0) {
    result.images = images;
  }
  const videos = extractVideos(element, baseUrl);
  if (videos.length > 0) {
    result.videos = videos;
  }
  return result;
}
|
|
2617
|
+
/**
 * Extract news-extension data from a `url` element.
 *
 * Every lookup tries the `news:`-prefixed name, then the bare name, then any
 * child whose tag ends with `:<name>` (a different namespace prefix).
 * Comma-separated lists (`keywords`, `stock_tickers`) are trimmed and empty
 * entries are discarded, so a trailing comma does not produce `""`.
 *
 * @param {object} urlElement - The `url` element.
 * @returns {object|undefined} Object with any of `publication`,
 *   `publicationDate`, `title`, `keywords`, `stockTickers`; undefined when the
 *   element has no news child or nothing could be read from it.
 */
function extractNews(urlElement) {
  // First direct child of `parent` matching news:<local>, <local> or *:<local>.
  const findNode = (parent, local) =>
    getChild(parent, `news:${local}`) ||
    getChild(parent, local) ||
    parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`));
  // Text of the first of the three name variants that yields non-empty text.
  const readText = (parent, local) =>
    getText2(getChild(parent, `news:${local}`)) ||
    getText2(getChild(parent, local)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));
  // Split a comma-separated list, trimming entries and dropping empty ones.
  const splitList = (value) => value.split(",").map((part) => part.trim()).filter(Boolean);

  const newsEl = findNode(urlElement, "news");
  if (!newsEl) {
    return void 0;
  }
  const news = {};
  const pubEl = findNode(newsEl, "publication");
  if (pubEl) {
    const name = readText(pubEl, "name");
    const language = readText(pubEl, "language");
    if (name || language) {
      news.publication = {
        name: name || void 0,
        language: language || void 0
      };
    }
  }
  const pubDate = readText(newsEl, "publication_date");
  if (pubDate) {
    news.publicationDate = pubDate;
  }
  const title = readText(newsEl, "title");
  if (title) {
    news.title = decodeXmlEntities(title);
  }
  const keywords = splitList(readText(newsEl, "keywords"));
  if (keywords.length > 0) {
    news.keywords = keywords;
  }
  const stockTickers = splitList(readText(newsEl, "stock_tickers"));
  if (stockTickers.length > 0) {
    news.stockTickers = stockTickers;
  }
  return Object.keys(news).length > 0 ? news : void 0;
}
|
|
2652
|
+
/**
 * Extract image-extension entries from a `url` element.
 *
 * Direct children named `image:image`, `image`, or `<prefix>:image` are
 * considered; entries without a `loc` are dropped. URL fields (`loc`,
 * `license`) are entity-decoded before use, matching how `extractUrl` treats
 * the page `loc`, so an escaped `&` in a query string is restored.
 *
 * @param {object} urlElement - The `url` element.
 * @param {string} [baseUrl] - When given, URL fields are passed through
 *   `normalizeUrlHttps(baseUrl, value)`.
 * @returns {object[]} Image entries with `loc` and optional `caption`,
 *   `geoLocation`, `title`, `license`.
 */
function extractImages(urlElement, baseUrl) {
  const isImageTag = (name) => name === "image:image" || name === "image" || name.endsWith(":image");
  // Decode entities, then resolve against baseUrl when one was supplied.
  const resolveUrl = (raw) => {
    const decoded = decodeXmlEntities(raw);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };
  const imageElements = urlElement.children.filter((c) => isImageTag(c.tagName.toLowerCase()));
  return imageElements.map((imgEl) => {
    // Text of image:<local>, <local> or *:<local>, first non-empty wins.
    const field = (local) =>
      getText2(getChild(imgEl, `image:${local}`)) ||
      getText2(getChild(imgEl, local)) ||
      getText2(imgEl.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));

    const loc = field("loc");
    if (!loc) return null;
    const image = {
      loc: resolveUrl(loc)
    };
    const caption = field("caption");
    if (caption) image.caption = decodeXmlEntities(caption);
    const geoLocation = field("geo_location");
    if (geoLocation) image.geoLocation = geoLocation;
    const title = field("title");
    if (title) image.title = decodeXmlEntities(title);
    const license = field("license");
    if (license) image.license = resolveUrl(license);
    return image;
  }).filter((img) => img !== null);
}
|
|
2673
|
+
/**
 * Extract video-extension entries from a `url` element.
 *
 * Direct children named `video:video`, `video`, or `<prefix>:video` are
 * considered. An entry is kept only when `thumbnail_loc`, `title` and
 * `description` are all present. URL fields (`thumbnail_loc`, `content_loc`,
 * `player_loc`) are entity-decoded before use, matching how `extractUrl`
 * treats the page `loc`.
 *
 * @param {object} urlElement - The `url` element.
 * @param {string} [baseUrl] - When given, URL fields are passed through
 *   `normalizeUrlHttps(baseUrl, value)`.
 * @returns {object[]} Video entries with `thumbnailLoc`, `title`,
 *   `description` and optional `contentLoc`, `playerLoc`, `duration`,
 *   `rating`, `viewCount`, `publicationDate`, `familyFriendly`, `category`,
 *   `tags`.
 */
function extractVideos(urlElement, baseUrl) {
  const hasLocalName = (name, local) =>
    name === `video:${local}` || name === local || name.endsWith(`:${local}`);
  // Decode entities, then resolve against baseUrl when one was supplied.
  const resolveUrl = (raw) => {
    const decoded = decodeXmlEntities(raw);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };
  const videoElements = urlElement.children.filter((c) => hasLocalName(c.tagName.toLowerCase(), "video"));
  return videoElements.map((vidEl) => {
    // Text of video:<local>, <local> or *:<local>, first non-empty wins.
    const field = (local) =>
      getText2(getChild(vidEl, `video:${local}`)) ||
      getText2(getChild(vidEl, local)) ||
      getText2(vidEl.children.find((c) => c.tagName.toLowerCase().endsWith(`:${local}`)));

    const thumbnailLoc = field("thumbnail_loc");
    const title = field("title");
    const description = field("description");
    if (!thumbnailLoc || !title || !description) return null;
    const video = {
      thumbnailLoc: resolveUrl(thumbnailLoc),
      title: decodeXmlEntities(title),
      description: decodeXmlEntities(description)
    };
    const contentLoc = field("content_loc");
    if (contentLoc) video.contentLoc = resolveUrl(contentLoc);
    const playerLoc = field("player_loc");
    if (playerLoc) video.playerLoc = resolveUrl(playerLoc);
    const duration = field("duration");
    if (duration) {
      const dur = Number.parseInt(duration, 10);
      if (!Number.isNaN(dur)) video.duration = dur;
    }
    const rating = field("rating");
    if (rating) {
      const r = Number.parseFloat(rating);
      if (!Number.isNaN(r)) video.rating = r;
    }
    const viewCount = field("view_count");
    if (viewCount) {
      const vc = Number.parseInt(viewCount, 10);
      if (!Number.isNaN(vc)) video.viewCount = vc;
    }
    const publicationDate = field("publication_date");
    if (publicationDate) video.publicationDate = publicationDate;
    const familyFriendly = field("family_friendly");
    if (familyFriendly) {
      video.familyFriendly = familyFriendly.toLowerCase() === "yes";
    }
    const category = field("category");
    if (category) video.category = category;
    const tagElements = vidEl.children.filter((c) => hasLocalName(c.tagName.toLowerCase(), "tag"));
    if (tagElements.length > 0) {
      video.tags = tagElements.map((t) => getText2(t)).filter(Boolean);
    }
    return video;
  }).filter((vid) => vid !== null);
}
|
|
2724
|
+
/**
 * Decode the five predefined XML entities (lt, gt, amp, quot, apos) and
 * decimal / hexadecimal character references.
 *
 * Decoding happens in a single pass, so text produced by one substitution is
 * never decoded a second time (an escaped ampersand followed by `lt;` yields
 * the literal entity text, not `<`). Unknown named entities and character
 * references above U+10FFFF are left untouched.
 *
 * The named entities are looked up by name in a table rather than spelled out
 * as literal patterns, which keeps this source intact if it is ever passed
 * through an HTML/XML-unescaping step.
 *
 * @param {string} text - Text that may contain entities.
 * @returns {string} The decoded text.
 */
function decodeXmlEntities(text) {
  const named = /* @__PURE__ */ new Map([
    ["lt", "<"],
    ["gt", ">"],
    ["amp", "&"],
    ["quot", '"'],
    ["apos", "'"]
  ]);
  return text.replace(/&(?:#(\d+)|#x([0-9a-fA-F]+)|([A-Za-z]+));/g, (entity, dec, hex, name) => {
    if (name !== void 0) {
      return named.has(name) ? named.get(name) : entity;
    }
    const codePoint = dec !== void 0 ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
    // fromCodePoint (unlike fromCharCode) handles astral characters; it throws
    // above U+10FFFF, so such references are passed through unchanged.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}
|
|
2727
|
+
|
|
2356
2728
|
// src/pluck/types.ts
|
|
2357
2729
|
var PluckError = class extends Error {
|
|
2358
2730
|
constructor(message) {
|
|
@@ -2729,6 +3101,12 @@ function parseHTML(html, baseUrl) {
|
|
|
2729
3101
|
});
|
|
2730
3102
|
return document;
|
|
2731
3103
|
}
|
|
3104
|
+
/**
 * Accept either an HTML string or an already-parsed document.
 *
 * @param {string|object} input - HTML source text, or a document object.
 * @param {string} [baseUrl] - Forwarded to `parseHTML` when `input` is a string.
 * @returns {object} The result of `parseHTML(input, baseUrl)` for strings;
 *   otherwise `input` itself, unchanged.
 */
function ensureDocument(input, baseUrl) {
  return typeof input === "string" ? parseHTML(input, baseUrl) : input;
}
|
|
2732
3110
|
|
|
2733
3111
|
// src/utils/meta-helpers.ts
|
|
2734
3112
|
function getMetaContent(doc, name) {
|
|
@@ -2756,7 +3134,8 @@ function getMetaHttpEquiv(doc, httpEquiv) {
|
|
|
2756
3134
|
}
|
|
2757
3135
|
|
|
2758
3136
|
// src/metadata/opengraph/extract.ts
|
|
2759
|
-
function extractOpenGraph(
|
|
3137
|
+
function extractOpenGraph(input) {
|
|
3138
|
+
const doc = ensureDocument(input);
|
|
2760
3139
|
const metadata = {};
|
|
2761
3140
|
metadata.title = getMetaProperty(doc, "og:title");
|
|
2762
3141
|
metadata.type = getMetaProperty(doc, "og:type");
|
|
@@ -2781,7 +3160,7 @@ function extractOpenGraph(doc) {
|
|
|
2781
3160
|
if (Object.keys(audio).length > 0) {
|
|
2782
3161
|
metadata.audio = audio;
|
|
2783
3162
|
}
|
|
2784
|
-
const images =
|
|
3163
|
+
const images = extractImages2(doc);
|
|
2785
3164
|
if (images.length > 0) {
|
|
2786
3165
|
metadata.images = images;
|
|
2787
3166
|
}
|
|
@@ -2850,7 +3229,7 @@ function extractAudio(doc) {
|
|
|
2850
3229
|
Object.entries(audio).filter(([_, value]) => value !== void 0)
|
|
2851
3230
|
);
|
|
2852
3231
|
}
|
|
2853
|
-
function
|
|
3232
|
+
function extractImages2(doc) {
|
|
2854
3233
|
const images = [];
|
|
2855
3234
|
const imageUrls = getAllMetaPropertyValues(doc, "og:image");
|
|
2856
3235
|
const imageSecureUrls = getAllMetaPropertyValues(doc, "og:image:secure_url");
|
|
@@ -2969,7 +3348,8 @@ function matchesAnyType(obj, targetTypes) {
|
|
|
2969
3348
|
}
|
|
2970
3349
|
|
|
2971
3350
|
// src/metadata/schema-org/extract.ts
|
|
2972
|
-
function extractSchemaOrg(
|
|
3351
|
+
function extractSchemaOrg(input) {
|
|
3352
|
+
const doc = ensureDocument(input);
|
|
2973
3353
|
const metadata = {
|
|
2974
3354
|
jsonLd: []
|
|
2975
3355
|
};
|
|
@@ -3046,7 +3426,8 @@ function organizeByType(metadata) {
|
|
|
3046
3426
|
}
|
|
3047
3427
|
|
|
3048
3428
|
// src/metadata/seo/extract.ts
|
|
3049
|
-
function extractSEO(
|
|
3429
|
+
function extractSEO(input) {
|
|
3430
|
+
const doc = ensureDocument(input);
|
|
3050
3431
|
const metadata = {};
|
|
3051
3432
|
const titleElement = doc.querySelector("title");
|
|
3052
3433
|
if (titleElement?.textContent) {
|
|
@@ -3078,7 +3459,8 @@ function extractSEO(doc) {
|
|
|
3078
3459
|
}
|
|
3079
3460
|
|
|
3080
3461
|
// src/metadata/twitter-card/extract.ts
|
|
3081
|
-
function extractTwitterCard(
|
|
3462
|
+
function extractTwitterCard(input) {
|
|
3463
|
+
const doc = ensureDocument(input);
|
|
3082
3464
|
const metadata = {};
|
|
3083
3465
|
metadata.card = getMetaContent(doc, "twitter:card");
|
|
3084
3466
|
metadata.site = getMetaContent(doc, "twitter:site");
|
|
@@ -3235,7 +3617,8 @@ function getAllLinksByPrefix(doc, relPrefix) {
|
|
|
3235
3617
|
}
|
|
3236
3618
|
|
|
3237
3619
|
// src/metadata/icons/extract.ts
|
|
3238
|
-
function extractIcons(
|
|
3620
|
+
function extractIcons(input) {
|
|
3621
|
+
const doc = ensureDocument(input);
|
|
3239
3622
|
const metadata = {};
|
|
3240
3623
|
const iconLinks = getAllLinksByRels(doc, ["icon", "shortcut icon"]);
|
|
3241
3624
|
for (const link of iconLinks) {
|
|
@@ -3416,7 +3799,8 @@ function parseSizeString(sizeStr) {
|
|
|
3416
3799
|
}
|
|
3417
3800
|
|
|
3418
3801
|
// src/metadata/language/extract.ts
|
|
3419
|
-
function extractLanguage(
|
|
3802
|
+
function extractLanguage(input) {
|
|
3803
|
+
const doc = ensureDocument(input);
|
|
3420
3804
|
const metadata = {};
|
|
3421
3805
|
const htmlElement = doc.querySelector("html");
|
|
3422
3806
|
if (htmlElement) {
|
|
@@ -3468,7 +3852,8 @@ function extractBestLanguage(doc) {
|
|
|
3468
3852
|
}
|
|
3469
3853
|
|
|
3470
3854
|
// src/metadata/links/extract.ts
|
|
3471
|
-
function extractLinks3(
|
|
3855
|
+
function extractLinks3(input, baseUrl, options = {}) {
|
|
3856
|
+
const doc = ensureDocument(input);
|
|
3472
3857
|
const opts = normalizeOptions3(options);
|
|
3473
3858
|
const effectiveBaseUrl = getEffectiveBaseUrl(doc, baseUrl);
|
|
3474
3859
|
const baseOrigin = effectiveBaseUrl ? getOrigin(effectiveBaseUrl) : null;
|
|
@@ -3799,7 +4184,8 @@ function getStringProperty3(obj, prop) {
|
|
|
3799
4184
|
}
|
|
3800
4185
|
|
|
3801
4186
|
// src/metadata/canonical/extract.ts
|
|
3802
|
-
function extractCanonical(
|
|
4187
|
+
function extractCanonical(input) {
|
|
4188
|
+
const doc = ensureDocument(input);
|
|
3803
4189
|
const metadata = {};
|
|
3804
4190
|
metadata.canonical = getLinkHref(doc, "canonical");
|
|
3805
4191
|
const alternateLinks = getAllLinks(doc, "alternate");
|
|
@@ -3944,9 +4330,63 @@ async function gatherFeed(url) {
|
|
|
3944
4330
|
}
|
|
3945
4331
|
const response = await pluck(feedUrl);
|
|
3946
4332
|
const content = await response.textUtf8();
|
|
4333
|
+
const format = detectFormat(content);
|
|
4334
|
+
if (format === "sitemap") {
|
|
4335
|
+
return normalizeSitemapToFeed(content, response.finalUrl);
|
|
4336
|
+
}
|
|
3947
4337
|
const result = parseFeed(content, response.finalUrl);
|
|
3948
4338
|
return result.feed;
|
|
3949
4339
|
}
|
|
4340
|
+
/**
 * Parse sitemap XML and reshape it into the feed structure.
 *
 * A sitemap index becomes a feed titled "Sitemap Index" with one item per
 * child sitemap. A urlset becomes a feed titled "<hostname> Sitemap" (or
 * just "Sitemap" when `baseUrl` cannot be parsed as a URL) with one item per
 * URL, enriched from news and image extension data when present.
 *
 * @param {string} content - Raw sitemap XML.
 * @param {string} baseUrl - URL the sitemap was fetched from; used as the
 *   feed `url` and forwarded to `parseSitemap`.
 * @returns {{format: string, title: string, url: string, items: object[]}}
 */
function normalizeSitemapToFeed(content, baseUrl) {
  const { isIndex, sitemap } = parseSitemap(content, baseUrl);
  const asFeed = (title, items) => ({ format: "sitemap", title, url: baseUrl, items });

  if (isIndex) {
    const entries = sitemap.sitemaps.map(({ loc, lastmod }, index) => ({
      id: loc || `sitemap-${index}`,
      url: loc,
      title: `Sitemap: ${loc}`,
      modified: lastmod
    }));
    return asFeed("Sitemap Index", entries);
  }

  const items = sitemap.urls.map((entry, index) => {
    const { loc, lastmod, news, images } = entry;
    const item = {
      id: loc || `url-${index}`,
      url: loc,
      modified: lastmod
    };
    if (news) {
      item.title = news.title;
      item.published = news.publicationDate;
      const publisher = news.publication?.name;
      if (publisher) {
        item.authors = [{ name: publisher }];
      }
      if (news.keywords) {
        item.tags = news.keywords;
      }
    }
    if (images && images.length > 0) {
      item.image = images[0].loc;
    }
    return item;
  });

  let title = "Sitemap";
  try {
    title = `${new URL(baseUrl).hostname} Sitemap`;
  } catch {
    // baseUrl is not a parseable URL: keep the generic title.
  }
  return asFeed(title, items);
}
|
|
3950
4390
|
|
|
3951
4391
|
// src/metadata/feed-discovery/heuristics.ts
|
|
3952
4392
|
var COMMON_FEED_PATHS = [
|
|
@@ -3981,7 +4421,8 @@ function generateFeedSuggestions(documentUrl) {
|
|
|
3981
4421
|
}
|
|
3982
4422
|
|
|
3983
4423
|
// src/metadata/feed-discovery/extract.ts
|
|
3984
|
-
function extractFeedDiscovery(
|
|
4424
|
+
function extractFeedDiscovery(input, documentUrl) {
|
|
4425
|
+
const doc = ensureDocument(input);
|
|
3985
4426
|
const metadata = {
|
|
3986
4427
|
feeds: []
|
|
3987
4428
|
};
|
|
@@ -4158,7 +4599,8 @@ async function gatherWebsite(url) {
|
|
|
4158
4599
|
}
|
|
4159
4600
|
|
|
4160
4601
|
// src/metadata/analytics/extract.ts
|
|
4161
|
-
function extractAnalytics(
|
|
4602
|
+
function extractAnalytics(input) {
|
|
4603
|
+
const doc = ensureDocument(input);
|
|
4162
4604
|
const metadata = {};
|
|
4163
4605
|
const scripts = doc.querySelectorAll("script");
|
|
4164
4606
|
const googleAnalytics = /* @__PURE__ */ new Set();
|
|
@@ -4250,10 +4692,11 @@ function extractAnalytics(doc) {
|
|
|
4250
4692
|
}
|
|
4251
4693
|
|
|
4252
4694
|
// src/metadata/assets/extract.ts
|
|
4253
|
-
function extractAssets(
|
|
4695
|
+
function extractAssets(input, baseUrl) {
|
|
4696
|
+
const doc = ensureDocument(input);
|
|
4254
4697
|
const metadata = {};
|
|
4255
4698
|
const effectiveBaseUrl = getEffectiveBaseUrl2(doc, baseUrl);
|
|
4256
|
-
const images =
|
|
4699
|
+
const images = extractImages3(doc, effectiveBaseUrl);
|
|
4257
4700
|
if (images.length > 0) {
|
|
4258
4701
|
metadata.images = images;
|
|
4259
4702
|
}
|
|
@@ -4307,7 +4750,7 @@ function getEffectiveBaseUrl2(doc, baseUrl) {
|
|
|
4307
4750
|
}
|
|
4308
4751
|
return null;
|
|
4309
4752
|
}
|
|
4310
|
-
function
|
|
4753
|
+
function extractImages3(doc, baseUrl) {
|
|
4311
4754
|
const urls = /* @__PURE__ */ new Set();
|
|
4312
4755
|
const imgElements = doc.querySelectorAll("img[src]");
|
|
4313
4756
|
for (const img of Array.from(imgElements)) {
|
|
@@ -4577,7 +5020,8 @@ function extractConnectionHints(doc, baseUrl) {
|
|
|
4577
5020
|
}
|
|
4578
5021
|
|
|
4579
5022
|
// src/metadata/copyright/extract.ts
|
|
4580
|
-
function extractCopyright(
|
|
5023
|
+
function extractCopyright(input) {
|
|
5024
|
+
const doc = ensureDocument(input);
|
|
4581
5025
|
const metadata = {};
|
|
4582
5026
|
metadata.copyright = getMetaContent(doc, "copyright");
|
|
4583
5027
|
metadata.license = getLinkHref(doc, "license");
|
|
@@ -4613,7 +5057,8 @@ function parseCopyright(copyrightString) {
|
|
|
4613
5057
|
}
|
|
4614
5058
|
|
|
4615
5059
|
// src/metadata/dublin-core/extract.ts
|
|
4616
|
-
function extractDublinCore(
|
|
5060
|
+
function extractDublinCore(input) {
|
|
5061
|
+
const doc = ensureDocument(input);
|
|
4617
5062
|
const metadata = {};
|
|
4618
5063
|
metadata.title = getMetaContent(doc, "DC.title") || getMetaContent(doc, "dcterms.title");
|
|
4619
5064
|
metadata.description = getMetaContent(doc, "DC.description") || getMetaContent(doc, "dcterms.description");
|
|
@@ -4654,7 +5099,8 @@ function extractMultiValue(doc, field) {
|
|
|
4654
5099
|
}
|
|
4655
5100
|
|
|
4656
5101
|
// src/metadata/geo/extract.ts
|
|
4657
|
-
function extractGeo(
|
|
5102
|
+
function extractGeo(input) {
|
|
5103
|
+
const doc = ensureDocument(input);
|
|
4658
5104
|
const metadata = {};
|
|
4659
5105
|
const geoPosition = getMetaContent(doc, "geo.position");
|
|
4660
5106
|
if (geoPosition) {
|
|
@@ -4711,7 +5157,8 @@ function parseICBM(icbm) {
|
|
|
4711
5157
|
}
|
|
4712
5158
|
|
|
4713
5159
|
// src/metadata/monetization/extract.ts
|
|
4714
|
-
function extractMonetization(
|
|
5160
|
+
function extractMonetization(input) {
|
|
5161
|
+
const doc = ensureDocument(input);
|
|
4715
5162
|
const metadata = {};
|
|
4716
5163
|
metadata.webMonetization = getMetaContent(doc, "monetization");
|
|
4717
5164
|
metadata.paypalVerification = getMetaContent(doc, "paypal-site-verification");
|
|
@@ -4725,7 +5172,8 @@ function extractMonetization(doc) {
|
|
|
4725
5172
|
}
|
|
4726
5173
|
|
|
4727
5174
|
// src/metadata/news/extract.ts
|
|
4728
|
-
function
|
|
5175
|
+
function extractNews2(input) {
|
|
5176
|
+
const doc = ensureDocument(input);
|
|
4729
5177
|
const metadata = {};
|
|
4730
5178
|
const newsKeywords = getMetaContent(doc, "news_keywords");
|
|
4731
5179
|
if (newsKeywords) {
|
|
@@ -4743,7 +5191,8 @@ function extractNews(doc) {
|
|
|
4743
5191
|
}
|
|
4744
5192
|
|
|
4745
5193
|
// src/metadata/pagination/extract.ts
|
|
4746
|
-
function extractPagination(
|
|
5194
|
+
function extractPagination(input) {
|
|
5195
|
+
const doc = ensureDocument(input);
|
|
4747
5196
|
const metadata = {};
|
|
4748
5197
|
metadata.prev = getLinkHref(doc, "prev") || getLinkHref(doc, "previous");
|
|
4749
5198
|
metadata.next = getLinkHref(doc, "next");
|
|
@@ -4842,7 +5291,8 @@ function parseKeyValueDirective(key, value, result) {
|
|
|
4842
5291
|
}
|
|
4843
5292
|
|
|
4844
5293
|
// src/metadata/robots/extract.ts
|
|
4845
|
-
function extractRobots(
|
|
5294
|
+
function extractRobots(input) {
|
|
5295
|
+
const doc = ensureDocument(input);
|
|
4846
5296
|
const metadata = {};
|
|
4847
5297
|
const robotsContent = getMetaContent(doc, "robots");
|
|
4848
5298
|
if (robotsContent) {
|
|
@@ -4876,7 +5326,8 @@ function extractRobots(doc) {
|
|
|
4876
5326
|
}
|
|
4877
5327
|
|
|
4878
5328
|
// src/metadata/security/extract.ts
|
|
4879
|
-
function extractSecurity(
|
|
5329
|
+
function extractSecurity(input) {
|
|
5330
|
+
const doc = ensureDocument(input);
|
|
4880
5331
|
const metadata = {};
|
|
4881
5332
|
metadata.referrerPolicy = getMetaContent(doc, "referrer");
|
|
4882
5333
|
metadata.contentSecurityPolicy = getMetaHttpEquiv(doc, "Content-Security-Policy");
|
|
@@ -4927,7 +5378,8 @@ function generateSitemapSuggestions(documentUrl) {
|
|
|
4927
5378
|
}
|
|
4928
5379
|
|
|
4929
5380
|
// src/metadata/sitemap-discovery/extract.ts
|
|
4930
|
-
function extractSitemapDiscovery(
|
|
5381
|
+
function extractSitemapDiscovery(input, documentUrl) {
|
|
5382
|
+
const doc = ensureDocument(input);
|
|
4931
5383
|
const metadata = {
|
|
4932
5384
|
sitemaps: []
|
|
4933
5385
|
};
|
|
@@ -4940,7 +5392,8 @@ function extractSitemapDiscovery(doc, documentUrl) {
|
|
|
4940
5392
|
}
|
|
4941
5393
|
|
|
4942
5394
|
// src/metadata/social-profiles/extract.ts
|
|
4943
|
-
function extractSocialProfiles(
|
|
5395
|
+
function extractSocialProfiles(input) {
|
|
5396
|
+
const doc = ensureDocument(input);
|
|
4944
5397
|
const metadata = {};
|
|
4945
5398
|
metadata.twitter = getMetaContent(doc, "twitter:site") || getMetaContent(doc, "twitter:creator") || extractFromProperty(doc, "twitter:site") || extractFromProperty(doc, "twitter:creator");
|
|
4946
5399
|
if (metadata.twitter) {
|
|
@@ -5093,7 +5546,8 @@ function categorizeSchemaProfile(url, metadata) {
|
|
|
5093
5546
|
}
|
|
5094
5547
|
|
|
5095
5548
|
// src/metadata/verification/extract.ts
|
|
5096
|
-
function extractVerification(
|
|
5549
|
+
function extractVerification(input) {
|
|
5550
|
+
const doc = ensureDocument(input);
|
|
5097
5551
|
const metadata = {};
|
|
5098
5552
|
metadata.googleSiteVerification = getMetaContent(doc, "google-site-verification");
|
|
5099
5553
|
metadata.msvalidate = getMetaContent(doc, "msvalidate.01");
|
|
@@ -6969,7 +7423,7 @@ exports.extractIcons = extractIcons;
|
|
|
6969
7423
|
exports.extractLanguage = extractLanguage;
|
|
6970
7424
|
exports.extractLinks = extractLinks3;
|
|
6971
7425
|
exports.extractMonetization = extractMonetization;
|
|
6972
|
-
exports.extractNews =
|
|
7426
|
+
exports.extractNews = extractNews2;
|
|
6973
7427
|
exports.extractOpenGraph = extractOpenGraph;
|
|
6974
7428
|
exports.extractPagination = extractPagination;
|
|
6975
7429
|
exports.extractRobots = extractRobots;
|