mx-cloud 0.0.14 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -287,8 +287,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
287
287
  return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
288
288
  case 'innerHTML':
289
289
  return element.innerHTML;
290
- case 'outerHTML':
291
- return element.outerHTML;
290
+ case 'outerHTML': {
291
+ const clonedElement = element.cloneNode(true);
292
+ const elementsWithMxId = clonedElement.querySelectorAll('[data-mx-id]');
293
+ elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
294
+ if (clonedElement.hasAttribute && clonedElement.hasAttribute('data-mx-id')) {
295
+ clonedElement.removeAttribute('data-mx-id');
296
+ }
297
+ return clonedElement.outerHTML;
298
+ }
292
299
  default:
293
300
  return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
294
301
  }
@@ -359,6 +366,69 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
359
366
  */
360
367
  window.scrapeList = function (_a) {
361
368
  return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
369
/**
 * Heuristic check for whether the current page looks like an XML sitemap.
 *
 * The comparison is case-insensitive and is made against the whole href,
 * so both markers may appear anywhere in the URL.
 *
 * @returns {boolean} true when the lowercased URL contains both 'sitemap' and '.xml'.
 */
const isSitemapUrl = () => {
    const currentHref = window.location.href.toLowerCase();
    const requiredMarkers = ['sitemap', '.xml'];
    return requiredMarkers.every((marker) => currentHref.includes(marker));
};
373
/**
 * Extracts `<url>` entries from a sitemap shown in the current page.
 *
 * The XML is located by trying, in order:
 *   1. the live document, when its root element is `<urlset>`;
 *   2. the text of the first `<pre>` element, parsed as XML;
 *   3. the serialized document element, parsed as XML.
 *
 * Reads `limit` from the enclosing scope; a falsy `limit` means "no limit".
 *
 * @returns {Array<{loc: string, lastmod?: string}>} Entries that have a
 *   non-empty `<loc>`, at most `limit` of them. Empty when no XML could be
 *   parsed.
 */
const scrapeSitemapData = () => {
    // DOMParser signals malformed XML by returning a document that contains a
    // <parsererror> element instead of throwing, so the result has to be
    // inspected explicitly; otherwise a failed parse would look like success
    // and prevent the next method from being tried.
    const parseXml = (text, failureMessage) => {
        try {
            const parsed = new DOMParser().parseFromString(text, 'text/xml');
            const errorNode = parsed.querySelector('parsererror');
            if (errorNode) {
                console.warn(failureMessage, errorNode.textContent);
                return null;
            }
            return parsed;
        }
        catch (e) {
            console.warn(failureMessage, e);
            return null;
        }
    };
    let xmlContent = null;
    // Method 1: the page is already parsed as XML
    if (document.documentElement.tagName.toLowerCase() === 'urlset') {
        xmlContent = document;
    }
    // Method 2: raw XML shown inside a <pre> tag (common browser display)
    if (!xmlContent) {
        const preElement = document.querySelector('pre');
        if (preElement) {
            xmlContent = parseXml(preElement.textContent, 'Failed to parse XML from pre element:');
        }
    }
    // Method 3: parse the entire serialized document as XML
    if (!xmlContent) {
        xmlContent = parseXml(document.documentElement.outerHTML, 'Failed to parse document as XML:');
    }
    if (!xmlContent) {
        console.error('Could not parse sitemap XML');
        return [];
    }
    // Extract URL entries from the sitemap
    const sitemapData = [];
    for (const urlElement of xmlContent.querySelectorAll('url')) {
        // Count collected entries rather than element positions, so that
        // entries without a <loc> do not consume the limit, and stop as soon
        // as the limit is reached.
        if (limit && sitemapData.length >= limit) {
            break;
        }
        const locElement = urlElement.querySelector('loc');
        const lastmodElement = urlElement.querySelector('lastmod');
        const entry = {};
        if (locElement) {
            entry.loc = locElement.textContent.trim();
        }
        if (lastmodElement) {
            entry.lastmod = lastmodElement.textContent.trim();
        }
        // Only keep entries that have at least a loc field
        if (entry.loc) {
            sitemapData.push(entry);
        }
    }
    return sitemapData;
};
429
+ if (isSitemapUrl()) {
430
+ return scrapeSitemapData();
431
+ }
362
432
  // XPath evaluation functions
363
433
  const queryInsideContext = (context, part) => {
364
434
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.14",
3
+ "version": "0.0.15",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",