mx-cloud 0.0.14 → 0.0.16

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -287,8 +287,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
287
287
  return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
288
288
  case 'innerHTML':
289
289
  return element.innerHTML;
290
- case 'outerHTML':
291
- return element.outerHTML;
290
+ case 'outerHTML': {
291
+ const clonedElement = element.cloneNode(true);
292
+ const elementsWithMxId = clonedElement.querySelectorAll('[data-mx-id]');
293
+ elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
294
+ if (clonedElement.hasAttribute && clonedElement.hasAttribute('data-mx-id')) {
295
+ clonedElement.removeAttribute('data-mx-id');
296
+ }
297
+ return clonedElement.outerHTML;
298
+ }
292
299
  default:
293
300
  return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
294
301
  }
@@ -359,6 +366,69 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
359
366
  */
360
367
  window.scrapeList = function (_a) {
361
368
  return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
369
+ const isSitemapUrl = () => {
370
+ const url = window.location.href.toLowerCase();
371
+ return url.includes('sitemap') && url.includes('.xml');
372
+ };
373
+ const scrapeSitemapData = () => {
374
+ // Try to get the XML content from the page
375
+ let xmlContent = null;
376
+ // Method 1: Check if the page is already parsed as XML
377
+ if (document.documentElement.tagName.toLowerCase() === 'urlset') {
378
+ xmlContent = document;
379
+ }
380
+ // Method 2: Try to get raw XML from pre tags (common browser display)
381
+ if (!xmlContent) {
382
+ const preElement = document.querySelector('pre');
383
+ if (preElement) {
384
+ try {
385
+ const parser = new DOMParser();
386
+ xmlContent = parser.parseFromString(preElement.textContent, 'text/xml');
387
+ }
388
+ catch (e) {
389
+ console.warn('Failed to parse XML from pre element:', e);
390
+ }
391
+ }
392
+ }
393
+ // Method 3: Try to parse the entire document as XML
394
+ if (!xmlContent) {
395
+ try {
396
+ const parser = new DOMParser();
397
+ xmlContent = parser.parseFromString(document.documentElement.outerHTML, 'text/xml');
398
+ }
399
+ catch (e) {
400
+ console.warn('Failed to parse document as XML:', e);
401
+ }
402
+ }
403
+ if (!xmlContent) {
404
+ console.error('Could not parse sitemap XML');
405
+ return [];
406
+ }
407
+ // Extract URL entries from the sitemap
408
+ const urlElements = xmlContent.querySelectorAll('url');
409
+ const sitemapData = [];
410
+ urlElements.forEach((urlElement, index) => {
411
+ if (limit && index >= limit)
412
+ return;
413
+ const locElement = urlElement.querySelector('loc');
414
+ const lastmodElement = urlElement.querySelector('lastmod');
415
+ const entry = {};
416
+ if (locElement) {
417
+ entry.loc = locElement.textContent.trim();
418
+ }
419
+ if (lastmodElement) {
420
+ entry.lastmod = lastmodElement.textContent.trim();
421
+ }
422
+ // Only add entries that have at least a loc field
423
+ if (entry.loc) {
424
+ sitemapData.push(entry);
425
+ }
426
+ });
427
+ return sitemapData;
428
+ };
429
+ if (isSitemapUrl()) {
430
+ return scrapeSitemapData();
431
+ }
362
432
  // XPath evaluation functions
363
433
  const queryInsideContext = (context, part) => {
364
434
  try {
@@ -318,7 +318,7 @@ class Interpreter extends events_1.EventEmitter {
318
318
  */
319
319
  carryOutSteps(page, steps) {
320
320
  return __awaiter(this, void 0, void 0, function* () {
321
- var _a;
321
+ var _a, _b;
322
322
  /**
323
323
  * Defines overloaded (or added) methods/actions usable in the workflow.
324
324
  * If a method overloads any existing method of the Page class, it accepts the same set
@@ -388,18 +388,44 @@ class Interpreter extends events_1.EventEmitter {
388
388
  if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
389
389
  this.cumulativeResults = [];
390
390
  }
391
+ const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
391
392
  if (this.cumulativeResults.length === 0) {
392
- this.cumulativeResults.push({});
393
+ // First execution - create initial row
394
+ const newRow = {};
395
+ Object.entries(resultToProcess).forEach(([key, value]) => {
396
+ if (value !== undefined) {
397
+ newRow[key] = value;
398
+ }
399
+ });
400
+ this.cumulativeResults.push(newRow);
393
401
  }
394
- const mergedResult = this.cumulativeResults[0];
395
- const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
396
- Object.entries(resultToProcess).forEach(([key, value]) => {
397
- if (value !== undefined) {
398
- mergedResult[key] = value;
402
+ else {
403
+ // Check if any keys from new result already exist in the last row
404
+ const lastRow = this.cumulativeResults[this.cumulativeResults.length - 1];
405
+ const newResultKeys = Object.keys(resultToProcess).filter(key => resultToProcess[key] !== undefined);
406
+ const hasRepeatedKeys = newResultKeys.some(key => lastRow.hasOwnProperty(key));
407
+ if (hasRepeatedKeys) {
408
+ // Keys are repeated - create a new row
409
+ const newRow = {};
410
+ Object.entries(resultToProcess).forEach(([key, value]) => {
411
+ if (value !== undefined) {
412
+ newRow[key] = value;
413
+ }
414
+ });
415
+ this.cumulativeResults.push(newRow);
399
416
  }
400
- });
401
- console.log("Updated merged result:", mergedResult);
402
- yield this.options.serializableCallback([mergedResult]);
417
+ else {
418
+ // No repeated keys - merge with the last row
419
+ Object.entries(resultToProcess).forEach(([key, value]) => {
420
+ if (value !== undefined) {
421
+ lastRow[key] = value;
422
+ }
423
+ });
424
+ }
425
+ }
426
+ console.log("Total accumulated rows:", this.cumulativeResults.length);
427
+ console.log("Current results:", this.cumulativeResults);
428
+ yield this.options.serializableCallback(this.cumulativeResults);
403
429
  }),
404
430
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
405
431
  var _a, _b;
@@ -410,17 +436,37 @@ class Interpreter extends events_1.EventEmitter {
410
436
  yield this.options.serializableCallback({});
411
437
  return;
412
438
  }
413
- yield this.ensureScriptsLoaded(page);
414
- if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.incrementScrapeListIndex) {
415
- this.options.debugChannel.incrementScrapeListIndex();
416
- }
417
- if (!config.pagination) {
418
- const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
439
+ try {
440
+ yield this.ensureScriptsLoaded(page);
441
+ if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.incrementScrapeListIndex) {
442
+ this.options.debugChannel.incrementScrapeListIndex();
443
+ }
444
+ let scrapeResults = [];
445
+ if (!config.pagination) {
446
+ scrapeResults = yield page.evaluate((cfg) => {
447
+ try {
448
+ return window.scrapeList(cfg);
449
+ }
450
+ catch (error) {
451
+ console.warn('ScrapeList evaluation failed:', error.message);
452
+ return []; // Return empty array instead of failing
453
+ }
454
+ }, config);
455
+ }
456
+ else {
457
+ scrapeResults = yield this.handlePagination(page, config);
458
+ }
459
+ // Ensure we always have an array
460
+ if (!Array.isArray(scrapeResults)) {
461
+ scrapeResults = [];
462
+ }
463
+ console.log(`ScrapeList completed with ${scrapeResults.length} results`);
419
464
  yield this.options.serializableCallback(scrapeResults);
420
465
  }
421
- else {
422
- const scrapeResults = yield this.handlePagination(page, config);
423
- yield this.options.serializableCallback(scrapeResults);
466
+ catch (error) {
467
+ console.error('ScrapeList action failed completely:', error.message);
468
+ // Don't throw error, just return empty array
469
+ yield this.options.serializableCallback([]);
424
470
  }
425
471
  }),
426
472
  scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
@@ -482,46 +528,54 @@ class Interpreter extends events_1.EventEmitter {
482
528
  });
483
529
  for (const step of steps) {
484
530
  this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
485
- if (step.action in wawActions) {
486
- // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
487
- const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
488
- yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
489
- }
490
- else {
491
- if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
492
- this.options.debugChannel.setActionType(String(step.action));
493
- }
494
- // Implements the dot notation for the "method name" in the workflow
495
- const levels = String(step.action).split('.');
496
- const methodName = levels[levels.length - 1];
497
- let invokee = page;
498
- for (const level of levels.splice(0, levels.length - 1)) {
499
- invokee = invokee[level];
531
+ try {
532
+ if (step.action in wawActions) {
533
+ // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
534
+ const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
535
+ yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
500
536
  }
501
- if (methodName === 'waitForLoadState') {
502
- try {
503
- yield executeAction(invokee, methodName, step.args);
537
+ else {
538
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
539
+ this.options.debugChannel.setActionType(String(step.action));
504
540
  }
505
- catch (error) {
506
- yield executeAction(invokee, methodName, 'domcontentloaded');
541
+ // Implements the dot notation for the "method name" in the workflow
542
+ const levels = String(step.action).split('.');
543
+ const methodName = levels[levels.length - 1];
544
+ let invokee = page;
545
+ for (const level of levels.splice(0, levels.length - 1)) {
546
+ invokee = invokee[level];
507
547
  }
508
- }
509
- else if (methodName === 'click') {
510
- try {
511
- yield executeAction(invokee, methodName, step.args);
548
+ if (methodName === 'waitForLoadState') {
549
+ try {
550
+ yield executeAction(invokee, methodName, step.args);
551
+ }
552
+ catch (error) {
553
+ yield executeAction(invokee, methodName, 'domcontentloaded');
554
+ }
512
555
  }
513
- catch (error) {
556
+ else if (methodName === 'click') {
514
557
  try {
515
- yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
558
+ yield executeAction(invokee, methodName, step.args);
516
559
  }
517
560
  catch (error) {
518
- continue;
561
+ try {
562
+ yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
563
+ }
564
+ catch (error) {
565
+ this.log(`Click action failed for selector ${(_b = step.args) === null || _b === void 0 ? void 0 : _b[0]}: ${error.message}`, logger_1.Level.WARN);
566
+ continue; // Skip to next action
567
+ }
519
568
  }
520
569
  }
570
+ else {
571
+ yield executeAction(invokee, methodName, step.args);
572
+ }
521
573
  }
522
- else {
523
- yield executeAction(invokee, methodName, step.args);
524
- }
574
+ }
575
+ catch (error) {
576
+ this.log(`Action ${String(step.action)} failed: ${error.message}`, logger_1.Level.WARN);
577
+ // Continue to next action instead of breaking
578
+ continue;
525
579
  }
526
580
  yield new Promise((res) => { setTimeout(res, 500); });
527
581
  }
@@ -1895,26 +1949,29 @@ class Interpreter extends events_1.EventEmitter {
1895
1949
  yield p.close();
1896
1950
  return;
1897
1951
  }
1898
- const newSelectors = this.getSelectors(workflowCopy);
1899
- newSelectors.forEach(selector => {
1900
- if (!selectors.includes(selector)) {
1901
- selectors.push(selector);
1902
- }
1903
- });
1904
- let pageState = {};
1905
- let getStateTest = "Hello";
1906
- try {
1907
- pageState = yield this.getState(p, workflowCopy, selectors);
1908
- selectors = [];
1909
- console.log("Empty selectors:", selectors);
1910
- }
1911
- catch (e) {
1912
- this.log('The browser has been closed.');
1952
+ if (workflowCopy.length === 0) {
1953
+ this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
1913
1954
  return;
1914
1955
  }
1915
- if (this.options.debug) {
1916
- this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
1917
- }
1956
+ // const newSelectors = this.getSelectors(workflowCopy);
1957
+ // newSelectors.forEach(selector => {
1958
+ // if (!selectors.includes(selector)) {
1959
+ // selectors.push(selector);
1960
+ // }
1961
+ // });
1962
+ // let pageState = {};
1963
+ // let getStateTest = "Hello";
1964
+ // try {
1965
+ // pageState = await this.getState(p, workflowCopy, selectors);
1966
+ // selectors = [];
1967
+ // console.log("Empty selectors:", selectors)
1968
+ // } catch (e: any) {
1969
+ // this.log('The browser has been closed.');
1970
+ // return;
1971
+ // }
1972
+ // if (this.options.debug) {
1973
+ // this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
1974
+ // }
1918
1975
  // const actionId = workflow.findIndex((step) => {
1919
1976
  // const isApplicable = this.applicable(step.where, pageState, usedActions);
1920
1977
  // console.log("-------------------------------------------------------------");
@@ -1934,10 +1991,11 @@ class Interpreter extends events_1.EventEmitter {
1934
1991
  // console.log("SCHEMA CHANGES:", changes);
1935
1992
  // }
1936
1993
  // }
1937
- actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
1938
- if (actionId !== -1 && workflowCopy[actionId]) {
1939
- workflowCopy[actionId] = yield this.validateWorkflowAction(p, workflowCopy[actionId]);
1940
- }
1994
+ // actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
1995
+ // if (actionId !== -1 && workflowCopy[actionId]) {
1996
+ // workflowCopy[actionId] = await this.validateWorkflowAction(p, workflowCopy[actionId]);
1997
+ // }
1998
+ const actionId = workflowCopy.length - 1;
1941
1999
  const action = workflowCopy[actionId];
1942
2000
  console.log("MATCHED ACTION:", action);
1943
2001
  console.log("MATCHED ACTION ID:", actionId);
@@ -1963,6 +2021,10 @@ class Interpreter extends events_1.EventEmitter {
1963
2021
  }
1964
2022
  catch (e) {
1965
2023
  this.log(e, logger_1.Level.ERROR);
2024
+ console.log(`Action with ID ${action.id} failed: ${e.message}`);
2025
+ // Still remove the failed action to prevent infinite loops
2026
+ workflowCopy.splice(actionId, 1);
2027
+ console.log(`Failed action removed. Remaining actions: ${workflowCopy.length}`);
1966
2028
  }
1967
2029
  }
1968
2030
  else {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.14",
3
+ "version": "0.0.16",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",