mx-cloud 0.0.14 → 0.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +72 -2
- package/package.json +1 -1
|
@@ -287,8 +287,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
287
287
|
return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
|
|
288
288
|
case 'innerHTML':
|
|
289
289
|
return element.innerHTML;
|
|
290
|
-
case 'outerHTML':
|
|
291
|
-
|
|
290
|
+
case 'outerHTML': {
|
|
291
|
+
const clonedElement = element.cloneNode(true);
|
|
292
|
+
const elementsWithMxId = clonedElement.querySelectorAll('[data-mx-id]');
|
|
293
|
+
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
|
294
|
+
if (clonedElement.hasAttribute && clonedElement.hasAttribute('data-mx-id')) {
|
|
295
|
+
clonedElement.removeAttribute('data-mx-id');
|
|
296
|
+
}
|
|
297
|
+
return clonedElement.outerHTML;
|
|
298
|
+
}
|
|
292
299
|
default:
|
|
293
300
|
return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
|
|
294
301
|
}
|
|
@@ -359,6 +366,69 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
359
366
|
*/
|
|
360
367
|
window.scrapeList = function (_a) {
|
|
361
368
|
return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
|
|
369
|
+
/**
 * Heuristically decides whether the current page is an XML sitemap by
 * looking for both "sitemap" and ".xml" in the page URL (case-insensitive).
 *
 * The "#fragment" part of the URL is ignored: it is never sent to the server
 * and therefore cannot influence which document was loaded, so matching it
 * would misclassify ordinary pages such as "/products#sitemap.xml".
 * The query string is still considered.
 *
 * @returns {boolean} true when the URL (without its fragment) contains both markers.
 */
const isSitemapUrl = () => {
    const { href } = window.location;
    const fragmentStart = href.indexOf('#');
    const withoutFragment = fragmentStart === -1 ? href : href.slice(0, fragmentStart);
    const url = withoutFragment.toLowerCase();
    return url.includes('sitemap') && url.includes('.xml');
};
|
|
373
|
+
/**
 * Extracts the `<url>` entries of the sitemap displayed in the current page.
 *
 * The XML is looked up in three places, in order:
 *   1. the live document, when its root element is `<urlset>`;
 *   2. the text of the first `<pre>` element, parsed as XML;
 *   3. the serialized document (`documentElement.outerHTML`), parsed as XML.
 *
 * `limit` is read from the enclosing `scrapeList` call. When it is truthy,
 * only `<url>` elements whose index is below `limit` are examined; entries
 * without a `<loc>` are dropped, so fewer than `limit` items may be returned.
 *
 * @returns {Array<{loc: string, lastmod?: string}>} One object per `<url>`
 *   element that has a non-empty `<loc>`; an empty array when no XML source
 *   could be parsed.
 */
const scrapeSitemapData = () => {
    // DOMParser does not throw on malformed XML: it returns a document that
    // contains a <parsererror> element. Treat that as a failed parse so the
    // next source is tried and the final error is actually reported.
    const parseXml = (source, failureMessage) => {
        try {
            const parsed = new DOMParser().parseFromString(source, 'text/xml');
            if (parsed.querySelector('parsererror')) {
                console.warn(failureMessage, 'XML is not well-formed');
                return null;
            }
            return parsed;
        }
        catch (e) {
            console.warn(failureMessage, e);
            return null;
        }
    };
    let xmlContent = null;
    // Method 1: the page is already parsed as XML
    if (document.documentElement.tagName.toLowerCase() === 'urlset') {
        xmlContent = document;
    }
    // Method 2: raw XML shown inside a <pre> element
    if (!xmlContent) {
        const preElement = document.querySelector('pre');
        if (preElement) {
            xmlContent = parseXml(preElement.textContent, 'Failed to parse XML from pre element:');
        }
    }
    // Method 3: re-parse the whole serialized document as XML
    if (!xmlContent) {
        xmlContent = parseXml(document.documentElement.outerHTML, 'Failed to parse document as XML:');
    }
    if (!xmlContent) {
        console.error('Could not parse sitemap XML');
        return [];
    }
    // Extract URL entries from the sitemap
    const urlElements = xmlContent.querySelectorAll('url');
    const sitemapData = [];
    for (let index = 0; index < urlElements.length; index++) {
        // Stop at the limit instead of visiting every remaining element.
        if (limit && index >= limit) {
            break;
        }
        const urlElement = urlElements[index];
        const locElement = urlElement.querySelector('loc');
        const lastmodElement = urlElement.querySelector('lastmod');
        const entry = {};
        if (locElement) {
            entry.loc = locElement.textContent.trim();
        }
        if (lastmodElement) {
            entry.lastmod = lastmodElement.textContent.trim();
        }
        // Only keep entries that have at least a loc field
        if (entry.loc) {
            sitemapData.push(entry);
        }
    }
    return sitemapData;
};
|
|
429
|
+
if (isSitemapUrl()) {
|
|
430
|
+
return scrapeSitemapData();
|
|
431
|
+
}
|
|
362
432
|
// XPath evaluation functions
|
|
363
433
|
const queryInsideContext = (context, part) => {
|
|
364
434
|
try {
|