magpie-html 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +442 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +442 -8
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -1081,8 +1081,12 @@ function extractEntry(entryElement) {
|
|
|
1081
1081
|
/**
 * Strip every XML comment (`<!-- ... -->`) from a document string.
 *
 * The match is non-greedy and spans newlines, so each comment is removed
 * individually. An opening `<!--` with no matching `-->` is left untouched.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The text with all complete comments removed.
 */
function removeComments(xml) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  const stripped = xml.replace(commentPattern, "");
  return stripped;
}
|
|
1084
|
+
/**
 * Remove `<!DOCTYPE ...>` declarations (case-insensitive) from a document.
 *
 * A DOCTYPE may carry an internal subset in square brackets, e.g.
 * `<!DOCTYPE feed [<!ENTITY nbsp "&#160;">]>`. The subset contains `>`
 * characters of its own, so the pattern consumes an optional `[...]` section
 * before looking for the terminating `>`; otherwise a stray `]>` would be
 * left behind in the output.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The text without DOCTYPE declarations.
 */
function removeDoctype(xml) {
  const doctypePattern = /<!DOCTYPE[^>\[]*(?:\[[^\]]*\][^>]*)?>/gi;
  return xml.replace(doctypePattern, "");
}
|
|
1084
1087
|
function parseAtomXML(xml) {
|
|
1085
|
-
const
|
|
1088
|
+
const withoutDoctype = removeDoctype(xml);
|
|
1089
|
+
const withoutComments = removeComments(withoutDoctype);
|
|
1086
1090
|
const { text: cleanedXML, cdataMap } = extractCDATA(withoutComments);
|
|
1087
1091
|
const root = parseElement(cleanedXML, 0, null, cdataMap).element;
|
|
1088
1092
|
return root;
|
|
@@ -1890,13 +1894,17 @@ function parseRSSDate(dateString) {
|
|
|
1890
1894
|
// src/feed/rss/xml-parser.ts
|
|
1891
1895
|
/**
 * Parse an RSS document string into its root element.
 *
 * The input is passed through cleanXMLDeclaration, removeDoctype2 and
 * removeComments2 (in that order) before being handed to parseElement2
 * starting at offset 0.
 *
 * @param {string} xml - Raw RSS text.
 * @returns {*} The `element` property of parseElement2's result.
 */
function parseRSSXML(xml) {
  const prepared = removeComments2(removeDoctype2(cleanXMLDeclaration(xml)));
  const { element } = parseElement2(prepared, 0);
  return element;
}
|
|
1897
1902
|
/**
 * Drop `<?xml ... ?>` sequences and trim surrounding whitespace.
 *
 * The pattern matches any `<?xml` prefix followed by characters other than
 * `?`, so it only matches when no `?` appears before the closing `?>`.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} Trimmed text without the matched sequences.
 */
function cleanXMLDeclaration(xml) {
  const declarationPattern = /<\?xml[^?]*\?>/g;
  const withoutDeclaration = xml.replace(declarationPattern, "");
  return withoutDeclaration.trim();
}
|
|
1905
|
+
/**
 * Remove `<!DOCTYPE ...>` declarations (case-insensitive) from a document.
 *
 * Handles an optional internal subset (`[ ... ]`) before the closing `>`;
 * stopping at the first `>` would cut the declaration short inside the
 * subset and leave `]>` residue in the output.
 *
 * @param {string} xml - XML text.
 * @returns {string} The text without DOCTYPE declarations.
 */
function removeDoctype2(xml) {
  const doctypePattern = /<!DOCTYPE[^>\[]*(?:\[[^\]]*\][^>]*)?>/gi;
  return xml.replace(doctypePattern, "");
}
|
|
1900
1908
|
/**
 * Remove all complete `<!-- ... -->` comments (including multi-line ones).
 *
 * @param {string} xml - XML text.
 * @returns {string} The text with comments removed.
 */
function removeComments2(xml) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  const withoutComments = xml.replace(commentPattern, "");
  return withoutComments;
}
|
|
@@ -2345,6 +2353,378 @@ function parseFeedAs(content, format, baseUrl) {
|
|
|
2345
2353
|
}
|
|
2346
2354
|
}
|
|
2347
2355
|
|
|
2356
|
+
// src/feed/sitemap/xml-parser.ts
|
|
2357
|
+
/**
 * Parse a sitemap document string into its root element.
 *
 * Pre-processing order: cleanXMLDeclaration2, then removeDoctype3, then
 * removeComments3. The cleaned text is parsed by parseElement3 from offset 0.
 *
 * @param {string} xml - Raw sitemap XML.
 * @returns {object} The root element returned by parseElement3.
 */
function parseSitemapXML(xml) {
  const prepared = removeComments3(removeDoctype3(cleanXMLDeclaration2(xml)));
  const { element } = parseElement3(prepared, 0);
  return element;
}
|
|
2364
|
+
/**
 * Remove `<?xml ... ?>` sequences from the text and trim the result.
 *
 * Only sequences with no `?` between `<?xml` and the closing `?>` match.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} Trimmed text without the matched sequences.
 */
function cleanXMLDeclaration2(xml) {
  const declarationPattern = /<\?xml[^?]*\?>/g;
  const body = xml.replace(declarationPattern, "");
  return body.trim();
}
|
|
2367
|
+
/**
 * Remove `<!DOCTYPE ...>` declarations (case-insensitive) from a document.
 *
 * An optional internal subset in square brackets is consumed as part of the
 * declaration, so `<!DOCTYPE x [<!ENTITY a "b">]>` is removed whole instead
 * of leaving `]>` behind.
 *
 * @param {string} xml - XML text.
 * @returns {string} The text without DOCTYPE declarations.
 */
function removeDoctype3(xml) {
  const doctypePattern = /<!DOCTYPE[^>\[]*(?:\[[^\]]*\][^>]*)?>/gi;
  return xml.replace(doctypePattern, "");
}
|
|
2370
|
+
/**
 * Delete every complete XML comment from the text.
 *
 * @param {string} xml - XML text.
 * @returns {string} The text with `<!-- ... -->` spans removed.
 */
function removeComments3(xml) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  const cleaned = xml.replace(commentPattern, "");
  return cleaned;
}
|
|
2373
|
+
/**
 * Replace each `<![CDATA[...]]>` section with a placeholder token.
 *
 * Placeholders have the form `__CDATA_<n>__`, numbered from 0 in order of
 * appearance. The raw section content is stored in a Map keyed by
 * placeholder so it can be substituted back later.
 *
 * @param {string} text - XML text that may contain CDATA sections.
 * @returns {{ text: string, cdataMap: Map<string, string> }} The rewritten
 *   text and the placeholder-to-content map.
 */
function extractCDATA3(text) {
  const sections = /* @__PURE__ */ new Map();
  const stash = (_whole, inner) => {
    // Every key is new, so the map size doubles as the running counter.
    const key = `__CDATA_${sections.size}__`;
    sections.set(key, inner);
    return key;
  };
  const rewritten = text.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, stash);
  return { text: rewritten, cdataMap: sections };
}
|
|
2384
|
+
/**
 * Substitute CDATA placeholders in `text` with their original content.
 *
 * Each placeholder is replaced at its first occurrence only. The replacement
 * is supplied through a function so the content is inserted verbatim: with a
 * plain string argument, `String.prototype.replace` would interpret `$&`,
 * `$'`, `$$` and similar sequences inside the CDATA content.
 *
 * @param {string} text - Text containing placeholder tokens.
 * @param {Map<string, string>} cdataMap - Placeholder-to-content map.
 * @returns {string} The text with placeholders expanded.
 */
function restoreCDATA3(text, cdataMap) {
  let result = text;
  for (const [placeholder, content] of cdataMap) {
    result = result.replace(placeholder, () => content);
  }
  return result;
}
|
|
2391
|
+
/**
 * Parse `name="value"` / `name='value'` pairs from the inside of a tag.
 *
 * Double- and single-quoted values are matched separately so a value may
 * contain the other quote character (`title="it's"`). Whitespace around `=`
 * is tolerated. When a name appears more than once, the last value wins.
 *
 * @param {string} tagContent - The tag text following the tag name.
 * @returns {Object<string, string>} Attribute names mapped to raw values
 *   (entities are not decoded here).
 */
function parseAttributes3(tagContent) {
  const attributes = {};
  const attrRegex = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  for (const match of tagContent.matchAll(attrRegex)) {
    // Exactly one of the two value groups participates in a match.
    attributes[match[1]] = match[2] ?? match[3];
  }
  return attributes;
}
|
|
2401
|
+
/**
 * Locate the closing tag that balances an already-opened `<tagName>`.
 *
 * Scans forward from `startPos`, tracking nesting depth of same-named
 * elements. Two cases are handled so depth is not inflated incorrectly:
 *  - a candidate opening tag only counts when the name is followed by
 *    whitespace, `/` or `>` (so `<a` does not match `<ab>`);
 *  - a self-closing same-named tag (`<a/>`) does not increase depth.
 *
 * @param {string} xml - Text to search.
 * @param {string} tagName - Name of the element whose end is wanted.
 * @param {number} startPos - Offset just after the element's opening tag.
 * @returns {number} Index of the matching `</tagName>`, or -1 if none.
 */
function findClosingTag3(xml, tagName, startPos) {
  const openTag = `<${tagName}`;
  const closeTag = `</${tagName}>`;
  const isNameBoundary = (ch) => ch === ">" || ch === "/" || (ch !== undefined && /\s/.test(ch));
  // Next opening tag with exactly this name that starts before `limit`.
  const nextRealOpen = (from, limit) => {
    let idx = xml.indexOf(openTag, from);
    while (idx !== -1 && idx < limit) {
      if (isNameBoundary(xml[idx + openTag.length])) {
        return idx;
      }
      idx = xml.indexOf(openTag, idx + openTag.length);
    }
    return -1;
  };
  let depth = 1;
  let pos = startPos;
  while (pos < xml.length) {
    const nextClose = xml.indexOf(closeTag, pos);
    if (nextClose === -1) {
      return -1;
    }
    const nextOpen = nextRealOpen(pos, nextClose);
    if (nextOpen !== -1) {
      const tagEnd = xml.indexOf(">", nextOpen);
      const selfClosing = tagEnd !== -1 && xml[tagEnd - 1] === "/";
      if (!selfClosing) {
        depth++;
      }
      pos = nextOpen + openTag.length;
    } else {
      depth--;
      if (depth === 0) {
        return nextClose;
      }
      pos = nextClose + closeTag.length;
    }
  }
  return -1;
}
|
|
2425
|
+
/**
 * Parse one element (and, recursively, its children) starting at or after
 * `startPos`.
 *
 * When no `cdataMap` is supplied, CDATA sections are first swapped for
 * placeholders via extractCDATA3; recursive calls pass the map along so this
 * happens only once. Element text is the content with child markup stripped,
 * trimmed, and with CDATA placeholders restored.
 *
 * The tag name is separated from the attributes at the first whitespace
 * character of any kind, so an opening tag whose attributes start on a new
 * line (`<urlset\n  xmlns="...">`) is parsed correctly.
 *
 * @param {string} xml - Text to parse.
 * @param {number} startPos - Offset from which to look for the opening `<`.
 * @param {object|null} [parent=null] - Parent element to record on the result.
 * @param {Map<string, string>} [cdataMap] - Existing placeholder map, if any.
 * @returns {{ element: object, endPos: number, cdataMap: Map<string, string> }}
 *   The parsed element, the offset just past it, and the placeholder map.
 * @throws {Error} If no opening tag is found, the opening tag is unterminated,
 *   or no matching closing tag exists.
 */
function parseElement3(xml, startPos, parent = null, cdataMap) {
  const extracted = cdataMap ? { text: xml, cdataMap } : extractCDATA3(xml);
  const cleanedXML = extracted.text;
  const currentCdataMap = extracted.cdataMap;
  const openTagStart = cleanedXML.indexOf("<", startPos);
  if (openTagStart === -1) {
    throw new Error("No opening tag found");
  }
  const openTagEnd = cleanedXML.indexOf(">", openTagStart);
  if (openTagEnd === -1) {
    throw new Error("Unclosed opening tag");
  }
  const openTagContent = cleanedXML.substring(openTagStart + 1, openTagEnd);
  const isSelfClosing = openTagContent.endsWith("/");
  const tagContent = isSelfClosing ? openTagContent.slice(0, -1).trim() : openTagContent;
  // Any whitespace (space, tab, newline) ends the tag name.
  const spaceIndex = tagContent.search(/\s/);
  const tagName = spaceIndex === -1 ? tagContent : tagContent.substring(0, spaceIndex);
  const attributes = spaceIndex === -1 ? {} : parseAttributes3(tagContent.substring(spaceIndex));
  const element = {
    tagName,
    attributes,
    text: "",
    children: [],
    parent
  };
  if (isSelfClosing) {
    return { element, endPos: openTagEnd + 1, cdataMap: currentCdataMap };
  }
  const closingTagPos = findClosingTag3(cleanedXML, tagName, openTagEnd + 1);
  if (closingTagPos === -1) {
    throw new Error(`No closing tag found for <${tagName}>`);
  }
  const content = cleanedXML.substring(openTagEnd + 1, closingTagPos);
  if (content.includes("<")) {
    let pos = 0;
    const trimmedContent = content.trim();
    while (pos < trimmedContent.length) {
      const nextTag = trimmedContent.indexOf("<", pos);
      if (nextTag === -1) break;
      // Skip closing tags and `<!...>` constructs; they are not child elements.
      if (trimmedContent[nextTag + 1] === "/" || trimmedContent[nextTag + 1] === "!") {
        pos = nextTag + 1;
        continue;
      }
      try {
        const { element: child, endPos } = parseElement3(
          trimmedContent,
          nextTag,
          element,
          currentCdataMap
        );
        element.children.push(child);
        pos = endPos;
      } catch {
        // Best-effort parsing: a malformed child is skipped and scanning
        // resumes one character further on.
        pos = nextTag + 1;
      }
    }
    let textContent = content.replace(/<[^>]+>/g, "").trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  } else {
    let textContent = content.trim();
    textContent = restoreCDATA3(textContent, currentCdataMap);
    element.text = textContent;
  }
  const closingTagEnd = closingTagPos + `</${tagName}>`.length;
  return { element, endPos: closingTagEnd, cdataMap: currentCdataMap };
}
|
|
2492
|
+
/**
 * Find the first element, in document (pre-order) order, whose tag name
 * equals `selector`. The starting element itself is a candidate.
 *
 * @param {object} element - Root of the subtree to search.
 * @param {string} selector - Tag name to look for (not a CSS selector).
 * @param {boolean} [caseSensitive=false] - Compare names exactly when true;
 *   otherwise both sides are lower-cased.
 * @returns {object|null} The first matching element, or null.
 */
function querySelector3(element, selector, caseSensitive = false) {
  const normalize = (name) => (caseSensitive ? name : name.toLowerCase());
  const wanted = normalize(selector);
  // Explicit stack; children are pushed in reverse so they pop in order.
  const pending = [element];
  while (pending.length > 0) {
    const node = pending.pop();
    if (normalize(node.tagName) === wanted) {
      return node;
    }
    for (let i = node.children.length - 1; i >= 0; i--) {
      pending.push(node.children[i]);
    }
  }
  return null;
}
|
|
2504
|
+
/**
 * Collect every element in the subtree (including the root) whose tag name
 * equals `selector`, in document (pre-order) order.
 *
 * Matches are appended to a single shared array. Spreading each subtree's
 * result into `push(...)` would pass the whole result as call arguments,
 * which can exceed the engine's argument limit on very large documents and
 * re-copies results at every nesting level.
 *
 * @param {object} element - Root of the subtree to search.
 * @param {string} selector - Tag name to look for (not a CSS selector).
 * @param {boolean} [caseSensitive=false] - Compare names exactly when true;
 *   otherwise both sides are lower-cased.
 * @returns {object[]} All matching elements.
 */
function querySelectorAll3(element, selector, caseSensitive = false) {
  const wanted = caseSensitive ? selector : selector.toLowerCase();
  const results = [];
  const visit = (node) => {
    const tag = caseSensitive ? node.tagName : node.tagName.toLowerCase();
    if (tag === wanted) {
      results.push(node);
    }
    for (const child of node.children) {
      visit(child);
    }
  };
  visit(element);
  return results;
}
|
|
2516
|
+
/**
 * Return an element's `text`, or an empty string when the element is
 * null/undefined or its text is falsy.
 *
 * @param {object|null|undefined} element - Element to read.
 * @returns {string} The text, or "".
 */
function getText2(element) {
  const value = element?.text;
  return value ? value : "";
}
|
|
2519
|
+
/**
 * Return the first direct child whose tag name matches, ignoring case.
 *
 * @param {object} element - Parent element.
 * @param {string} tagName - Tag name to match.
 * @returns {object|null} The first matching child, or null if none.
 */
function getChild(element, tagName) {
  const wanted = tagName.toLowerCase();
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      return child;
    }
  }
  return null;
}
|
|
2523
|
+
/**
 * Return all direct children whose tag name matches, ignoring case.
 *
 * @param {object} element - Parent element.
 * @param {string} tagName - Tag name to match.
 * @returns {object[]} Matching children in their original order.
 */
function getChildren(element, tagName) {
  const wanted = tagName.toLowerCase();
  const matches = [];
  for (const child of element.children) {
    if (child.tagName.toLowerCase() === wanted) {
      matches.push(child);
    }
  }
  return matches;
}
|
|
2527
|
+
|
|
2528
|
+
// src/feed/sitemap/parse.ts
|
|
2529
|
+
/**
 * Parse sitemap XML into a normalized result.
 *
 * Resolution order:
 *  1. a `sitemapindex` element anywhere in the tree -> parseSitemapIndex;
 *  2. otherwise a `urlset` element -> parseUrlset;
 *  3. otherwise every `url` element found anywhere is extracted into a
 *     `urlset` result (an empty list when there are none).
 *
 * @param {string} xml - Raw sitemap XML.
 * @param {string} [baseUrl] - Base URL forwarded to the extractors.
 * @returns {{ sitemap: { type: string, urls: object[], sitemaps: object[] }, isIndex: boolean }}
 */
function parseSitemap(xml, baseUrl) {
  const doc = parseSitemapXML(xml);

  const indexEl = querySelector3(doc, "sitemapindex");
  if (indexEl) {
    return parseSitemapIndex(indexEl, baseUrl);
  }

  const urlsetEl = querySelector3(doc, "urlset");
  if (urlsetEl) {
    return parseUrlset(urlsetEl, baseUrl);
  }

  // No recognised container: fall back to any <url> elements in the tree.
  // Mapping an empty list yields the same empty urlset result.
  const looseUrls = querySelectorAll3(doc, "url");
  return {
    sitemap: {
      type: "urlset",
      urls: looseUrls.map((url) => extractUrl(url, baseUrl)),
      sitemaps: []
    },
    isIndex: false
  };
}
|
|
2559
|
+
/**
 * Convert a `sitemapindex` element into the normalized result shape.
 *
 * Each direct `sitemap` child contributes its `loc` and optional `lastmod`.
 * `loc` is entity-decoded before use, matching extractUrl, so an escaped
 * `&amp;` in a query string does not end up in the returned URL. When
 * `baseUrl` is given, `loc` is passed through normalizeUrlHttps (defined
 * elsewhere in this file).
 *
 * @param {object} element - The `sitemapindex` element.
 * @param {string} [baseUrl] - Base URL used to normalize locations.
 * @returns {{ sitemap: { type: "sitemapindex", urls: [], sitemaps: object[] }, isIndex: true }}
 */
function parseSitemapIndex(element, baseUrl) {
  const sitemapElements = getChildren(element, "sitemap");
  const sitemaps = sitemapElements.map((el) => {
    const loc = decodeXmlEntities(getText2(getChild(el, "loc")));
    const lastmod = getText2(getChild(el, "lastmod")) || void 0;
    return {
      loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
      lastmod
    };
  });
  return {
    sitemap: {
      type: "sitemapindex",
      urls: [],
      sitemaps
    },
    isIndex: true
  };
}
|
|
2578
|
+
/**
 * Convert a `urlset` element into the normalized result shape.
 *
 * Only direct `url` children are considered; each is converted by
 * extractUrl.
 *
 * @param {object} element - The `urlset` element.
 * @param {string} [baseUrl] - Base URL forwarded to extractUrl.
 * @returns {{ sitemap: { type: "urlset", urls: object[], sitemaps: [] }, isIndex: false }}
 */
function parseUrlset(element, baseUrl) {
  const entries = [];
  for (const urlEl of getChildren(element, "url")) {
    entries.push(extractUrl(urlEl, baseUrl));
  }
  return {
    sitemap: { type: "urlset", urls: entries, sitemaps: [] },
    isIndex: false
  };
}
|
|
2590
|
+
/**
 * Build a URL entry from a `url` element.
 *
 * Reads `loc` (entity-decoded), `lastmod`, `changefreq` and `priority` from
 * direct children, then attaches `news`, `images` and `videos` when the
 * corresponding extractors return data. When `baseUrl` is given, `loc` is
 * passed through normalizeUrlHttps (defined elsewhere in this file).
 *
 * `priority` is kept whenever it parses to a number, including `0`; a
 * truthiness check would discard that value.
 *
 * @param {object} element - The `url` element.
 * @param {string} [baseUrl] - Base URL used to normalize locations.
 * @returns {object} Entry with `loc`, `lastmod`, `changefreq`, `priority`
 *   and optional `news`, `images`, `videos`.
 */
function extractUrl(element, baseUrl) {
  const rawLoc = getText2(getChild(element, "loc"));
  const loc = decodeXmlEntities(rawLoc);
  const lastmod = getText2(getChild(element, "lastmod")) || void 0;
  const changefreq = getText2(getChild(element, "changefreq")) || void 0;
  const priorityText = getText2(getChild(element, "priority"));
  const parsedPriority = priorityText ? Number.parseFloat(priorityText) : Number.NaN;
  const result = {
    loc: baseUrl ? normalizeUrlHttps(baseUrl, loc) : loc,
    lastmod,
    changefreq,
    priority: Number.isNaN(parsedPriority) ? void 0 : parsedPriority
  };
  const news = extractNews(element);
  if (news) {
    result.news = news;
  }
  const images = extractImages(element, baseUrl);
  if (images.length > 0) {
    result.images = images;
  }
  const videos = extractVideos(element, baseUrl);
  if (videos.length > 0) {
    result.videos = videos;
  }
  return result;
}
|
|
2617
|
+
/**
 * Extract news-sitemap metadata from a `url` element.
 *
 * Every lookup tries, in order: the `news:`-prefixed tag, the bare tag, and
 * finally any direct child whose lower-cased tag name ends with `:<name>`.
 * For text lookups a candidate with empty text falls through to the next.
 *
 * @param {object} urlElement - The `url` element.
 * @returns {object|undefined} Object with any of `publication`,
 *   `publicationDate`, `title`, `keywords`, `stockTickers`; undefined when
 *   there is no news element or nothing was extracted.
 */
function extractNews(urlElement) {
  const anyPrefix = (parent, name) =>
    parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${name}`));
  const childOf = (parent, name) =>
    getChild(parent, `news:${name}`) || getChild(parent, name) || anyPrefix(parent, name);
  const textOf = (parent, name) =>
    getText2(getChild(parent, `news:${name}`)) ||
    getText2(getChild(parent, name)) ||
    getText2(anyPrefix(parent, name));
  const splitList = (value) => value.split(",").map((part) => part.trim());

  const newsEl = childOf(urlElement, "news");
  if (!newsEl) {
    return void 0;
  }

  const news = {};

  const pubEl = childOf(newsEl, "publication");
  if (pubEl) {
    const name = textOf(pubEl, "name");
    const language = textOf(pubEl, "language");
    if (name || language) {
      news.publication = {
        name: name || void 0,
        language: language || void 0
      };
    }
  }

  const pubDate = textOf(newsEl, "publication_date");
  if (pubDate) {
    news.publicationDate = pubDate;
  }

  const title = textOf(newsEl, "title");
  if (title) {
    news.title = decodeXmlEntities(title);
  }

  const keywords = textOf(newsEl, "keywords");
  if (keywords) {
    news.keywords = splitList(keywords);
  }

  const stockTickers = textOf(newsEl, "stock_tickers");
  if (stockTickers) {
    news.stockTickers = splitList(stockTickers);
  }

  return Object.keys(news).length > 0 ? news : void 0;
}
|
|
2652
|
+
/**
 * Extract image-sitemap entries from a `url` element.
 *
 * Image elements are direct children named `image:image`, `image`, or any
 * tag ending in `:image` (case-insensitive). Each field lookup tries the
 * `image:`-prefixed tag, then the bare tag, then any `:<name>` suffix.
 * Images without a `loc` are dropped.
 *
 * URL-valued fields (`loc`, `license`) are entity-decoded before use, in
 * line with how extractUrl treats `loc`, so escaped `&amp;` does not leak
 * into the returned URLs. When `baseUrl` is given they are then passed
 * through normalizeUrlHttps (defined elsewhere in this file).
 *
 * @param {object} urlElement - The `url` element.
 * @param {string} [baseUrl] - Base URL used to normalize URLs.
 * @returns {object[]} Images with `loc` and optional `caption`,
 *   `geoLocation`, `title`, `license`.
 */
function extractImages(urlElement, baseUrl) {
  const textOf = (parent, name) =>
    getText2(getChild(parent, `image:${name}`)) ||
    getText2(getChild(parent, name)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${name}`)));
  const resolveUrl = (raw) => {
    const decoded = decodeXmlEntities(raw);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };
  const imageElements = urlElement.children.filter((c) => {
    const tag = c.tagName.toLowerCase();
    return tag === "image:image" || tag === "image" || tag.endsWith(":image");
  });
  return imageElements.map((imgEl) => {
    const loc = textOf(imgEl, "loc");
    if (!loc) return null;
    const image = {
      loc: resolveUrl(loc)
    };
    const caption = textOf(imgEl, "caption");
    if (caption) image.caption = decodeXmlEntities(caption);
    const geoLocation = textOf(imgEl, "geo_location");
    if (geoLocation) image.geoLocation = geoLocation;
    const title = textOf(imgEl, "title");
    if (title) image.title = decodeXmlEntities(title);
    const license = textOf(imgEl, "license");
    if (license) image.license = resolveUrl(license);
    return image;
  }).filter((img) => img !== null);
}
|
|
2673
|
+
/**
 * Extract video-sitemap entries from a `url` element.
 *
 * Video elements are direct children named `video:video`, `video`, or any
 * tag ending in `:video` (case-insensitive). Each field lookup tries the
 * `video:`-prefixed tag, then the bare tag, then any `:<name>` suffix.
 * Entries missing any of `thumbnail_loc`, `title` or `description` are
 * dropped.
 *
 * URL-valued fields (`thumbnail_loc`, `content_loc`, `player_loc`) are
 * entity-decoded before use, in line with how extractUrl treats `loc`. When
 * `baseUrl` is given they are then passed through normalizeUrlHttps (defined
 * elsewhere in this file).
 *
 * @param {object} urlElement - The `url` element.
 * @param {string} [baseUrl] - Base URL used to normalize URLs.
 * @returns {object[]} Videos with `thumbnailLoc`, `title`, `description`
 *   and optional `contentLoc`, `playerLoc`, `duration`, `rating`,
 *   `viewCount`, `publicationDate`, `familyFriendly`, `category`, `tags`.
 */
function extractVideos(urlElement, baseUrl) {
  const textOf = (parent, name) =>
    getText2(getChild(parent, `video:${name}`)) ||
    getText2(getChild(parent, name)) ||
    getText2(parent.children.find((c) => c.tagName.toLowerCase().endsWith(`:${name}`)));
  const resolveUrl = (raw) => {
    const decoded = decodeXmlEntities(raw);
    return baseUrl ? normalizeUrlHttps(baseUrl, decoded) : decoded;
  };
  const hasName = (name) => (c) => {
    const tag = c.tagName.toLowerCase();
    return tag === `video:${name}` || tag === name || tag.endsWith(`:${name}`);
  };
  const videoElements = urlElement.children.filter(hasName("video"));
  return videoElements.map((vidEl) => {
    const thumbnailLoc = textOf(vidEl, "thumbnail_loc");
    const title = textOf(vidEl, "title");
    const description = textOf(vidEl, "description");
    if (!thumbnailLoc || !title || !description) return null;
    const video = {
      thumbnailLoc: resolveUrl(thumbnailLoc),
      title: decodeXmlEntities(title),
      description: decodeXmlEntities(description)
    };
    const contentLoc = textOf(vidEl, "content_loc");
    if (contentLoc) video.contentLoc = resolveUrl(contentLoc);
    const playerLoc = textOf(vidEl, "player_loc");
    if (playerLoc) video.playerLoc = resolveUrl(playerLoc);
    const duration = textOf(vidEl, "duration");
    if (duration) {
      const dur = Number.parseInt(duration, 10);
      if (!Number.isNaN(dur)) video.duration = dur;
    }
    const rating = textOf(vidEl, "rating");
    if (rating) {
      const r = Number.parseFloat(rating);
      if (!Number.isNaN(r)) video.rating = r;
    }
    const viewCount = textOf(vidEl, "view_count");
    if (viewCount) {
      const vc = Number.parseInt(viewCount, 10);
      if (!Number.isNaN(vc)) video.viewCount = vc;
    }
    const publicationDate = textOf(vidEl, "publication_date");
    if (publicationDate) video.publicationDate = publicationDate;
    const familyFriendly = textOf(vidEl, "family_friendly");
    if (familyFriendly) {
      video.familyFriendly = familyFriendly.toLowerCase() === "yes";
    }
    const category = textOf(vidEl, "category");
    if (category) video.category = category;
    const tagElements = vidEl.children.filter(hasName("tag"));
    if (tagElements.length > 0) {
      video.tags = tagElements.map((t) => getText2(t)).filter(Boolean);
    }
    return video;
  }).filter((vid) => vid !== null);
}
|
|
2724
|
+
/**
 * Decode the five predefined XML entities and numeric character references.
 *
 * Decoding happens in a single pass so already-decoded output is never
 * re-scanned: `&amp;quot;` becomes the literal text `&quot;`, not `"`.
 * Numeric references use `String.fromCodePoint`, so characters above U+FFFF
 * decode to one character; references beyond U+10FFFF are left as written.
 * Unknown named entities are left untouched.
 *
 * @param {string} text - Text that may contain XML entities.
 * @returns {string} The decoded text.
 */
function decodeXmlEntities(text) {
  const named = { lt: "<", gt: ">", amp: "&", quot: '"', apos: "'" };
  const entityPattern = /&(?:(lt|gt|amp|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g;
  return text.replace(entityPattern, (match, name, dec, hex) => {
    if (name !== undefined) {
      return named[name];
    }
    const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
    // fromCodePoint throws above U+10FFFF; keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}
|
|
2727
|
+
|
|
2348
2728
|
// src/pluck/types.ts
|
|
2349
2729
|
var PluckError = class extends Error {
|
|
2350
2730
|
constructor(message) {
|
|
@@ -2773,7 +3153,7 @@ function extractOpenGraph(doc) {
|
|
|
2773
3153
|
if (Object.keys(audio).length > 0) {
|
|
2774
3154
|
metadata.audio = audio;
|
|
2775
3155
|
}
|
|
2776
|
-
const images =
|
|
3156
|
+
const images = extractImages2(doc);
|
|
2777
3157
|
if (images.length > 0) {
|
|
2778
3158
|
metadata.images = images;
|
|
2779
3159
|
}
|
|
@@ -2842,7 +3222,7 @@ function extractAudio(doc) {
|
|
|
2842
3222
|
Object.entries(audio).filter(([_, value]) => value !== void 0)
|
|
2843
3223
|
);
|
|
2844
3224
|
}
|
|
2845
|
-
function
|
|
3225
|
+
function extractImages2(doc) {
|
|
2846
3226
|
const images = [];
|
|
2847
3227
|
const imageUrls = getAllMetaPropertyValues(doc, "og:image");
|
|
2848
3228
|
const imageSecureUrls = getAllMetaPropertyValues(doc, "og:image:secure_url");
|
|
@@ -3936,9 +4316,63 @@ async function gatherFeed(url) {
|
|
|
3936
4316
|
}
|
|
3937
4317
|
const response = await pluck(feedUrl);
|
|
3938
4318
|
const content = await response.textUtf8();
|
|
4319
|
+
const format = detectFormat(content);
|
|
4320
|
+
if (format === "sitemap") {
|
|
4321
|
+
return normalizeSitemapToFeed(content, response.finalUrl);
|
|
4322
|
+
}
|
|
3939
4323
|
const result = parseFeed(content, response.finalUrl);
|
|
3940
4324
|
return result.feed;
|
|
3941
4325
|
}
|
|
4326
|
+
/**
 * Parse sitemap XML and reshape it into a feed-like object.
 *
 * For a sitemap index, each child sitemap becomes an item titled
 * `Sitemap: <loc>`. For a URL set, each URL becomes an item; news metadata
 * (title, publication date, publication name, keywords) and the first image
 * are copied when present. The feed title is `<hostname> Sitemap` when
 * `baseUrl` parses as a URL, otherwise `Sitemap`.
 *
 * @param {string} content - Raw sitemap XML.
 * @param {string} baseUrl - URL the sitemap was fetched from.
 * @returns {{ format: "sitemap", title: string, url: string, items: object[] }}
 */
function normalizeSitemapToFeed(content, baseUrl) {
  const { isIndex, sitemap } = parseSitemap(content, baseUrl);

  if (isIndex) {
    const indexItems = sitemap.sitemaps.map(({ loc, lastmod }, index) => ({
      id: loc || `sitemap-${index}`,
      url: loc,
      title: `Sitemap: ${loc}`,
      modified: lastmod
    }));
    return {
      format: "sitemap",
      title: "Sitemap Index",
      url: baseUrl,
      items: indexItems
    };
  }

  const toItem = (entry, index) => {
    const item = {
      id: entry.loc || `url-${index}`,
      url: entry.loc,
      modified: entry.lastmod
    };
    const { news, images } = entry;
    if (news) {
      item.title = news.title;
      item.published = news.publicationDate;
      const publisher = news.publication?.name;
      if (publisher) {
        item.authors = [{ name: publisher }];
      }
      if (news.keywords) {
        item.tags = news.keywords;
      }
    }
    if (images && images.length > 0) {
      item.image = images[0].loc;
    }
    return item;
  };

  const deriveTitle = () => {
    try {
      return `${new URL(baseUrl).hostname} Sitemap`;
    } catch {
      // baseUrl is not a parseable URL: use the generic title.
      return "Sitemap";
    }
  };

  const items = sitemap.urls.map(toItem);
  return {
    format: "sitemap",
    title: deriveTitle(),
    url: baseUrl,
    items
  };
}
|
|
3942
4376
|
|
|
3943
4377
|
// src/metadata/feed-discovery/heuristics.ts
|
|
3944
4378
|
var COMMON_FEED_PATHS = [
|
|
@@ -4245,7 +4679,7 @@ function extractAnalytics(doc) {
|
|
|
4245
4679
|
function extractAssets(doc, baseUrl) {
|
|
4246
4680
|
const metadata = {};
|
|
4247
4681
|
const effectiveBaseUrl = getEffectiveBaseUrl2(doc, baseUrl);
|
|
4248
|
-
const images =
|
|
4682
|
+
const images = extractImages3(doc, effectiveBaseUrl);
|
|
4249
4683
|
if (images.length > 0) {
|
|
4250
4684
|
metadata.images = images;
|
|
4251
4685
|
}
|
|
@@ -4299,7 +4733,7 @@ function getEffectiveBaseUrl2(doc, baseUrl) {
|
|
|
4299
4733
|
}
|
|
4300
4734
|
return null;
|
|
4301
4735
|
}
|
|
4302
|
-
function
|
|
4736
|
+
function extractImages3(doc, baseUrl) {
|
|
4303
4737
|
const urls = /* @__PURE__ */ new Set();
|
|
4304
4738
|
const imgElements = doc.querySelectorAll("img[src]");
|
|
4305
4739
|
for (const img of Array.from(imgElements)) {
|
|
@@ -4717,7 +5151,7 @@ function extractMonetization(doc) {
|
|
|
4717
5151
|
}
|
|
4718
5152
|
|
|
4719
5153
|
// src/metadata/news/extract.ts
|
|
4720
|
-
function
|
|
5154
|
+
function extractNews2(doc) {
|
|
4721
5155
|
const metadata = {};
|
|
4722
5156
|
const newsKeywords = getMetaContent(doc, "news_keywords");
|
|
4723
5157
|
if (newsKeywords) {
|
|
@@ -6961,7 +7395,7 @@ exports.extractIcons = extractIcons;
|
|
|
6961
7395
|
exports.extractLanguage = extractLanguage;
|
|
6962
7396
|
exports.extractLinks = extractLinks3;
|
|
6963
7397
|
exports.extractMonetization = extractMonetization;
|
|
6964
|
-
exports.extractNews =
|
|
7398
|
+
exports.extractNews = extractNews2;
|
|
6965
7399
|
exports.extractOpenGraph = extractOpenGraph;
|
|
6966
7400
|
exports.extractPagination = extractPagination;
|
|
6967
7401
|
exports.extractRobots = extractRobots;
|