mx-cloud 0.0.14 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +72 -2
- package/build/interpret.js +133 -71
- package/package.json +1 -1
package/build/browserSide/scraper.js
CHANGED
|
@@ -287,8 +287,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
287
287
|
return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
|
|
288
288
|
case 'innerHTML':
|
|
289
289
|
return element.innerHTML;
|
|
290
|
-
case 'outerHTML':
|
|
291
|
-
|
|
290
|
+
case 'outerHTML': {
|
|
291
|
+
const clonedElement = element.cloneNode(true);
|
|
292
|
+
const elementsWithMxId = clonedElement.querySelectorAll('[data-mx-id]');
|
|
293
|
+
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
|
294
|
+
if (clonedElement.hasAttribute && clonedElement.hasAttribute('data-mx-id')) {
|
|
295
|
+
clonedElement.removeAttribute('data-mx-id');
|
|
296
|
+
}
|
|
297
|
+
return clonedElement.outerHTML;
|
|
298
|
+
}
|
|
292
299
|
default:
|
|
293
300
|
return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
|
|
294
301
|
}
|
|
@@ -359,6 +366,69 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
359
366
|
*/
|
|
360
367
|
window.scrapeList = function (_a) {
|
|
361
368
|
return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
|
|
369
|
+
const isSitemapUrl = () => {
|
|
370
|
+
const url = window.location.href.toLowerCase();
|
|
371
|
+
return url.includes('sitemap') && url.includes('.xml');
|
|
372
|
+
};
|
|
373
|
+
const scrapeSitemapData = () => {
|
|
374
|
+
// Try to get the XML content from the page
|
|
375
|
+
let xmlContent = null;
|
|
376
|
+
// Method 1: Check if the page is already parsed as XML
|
|
377
|
+
if (document.documentElement.tagName.toLowerCase() === 'urlset') {
|
|
378
|
+
xmlContent = document;
|
|
379
|
+
}
|
|
380
|
+
// Method 2: Try to get raw XML from pre tags (common browser display)
|
|
381
|
+
if (!xmlContent) {
|
|
382
|
+
const preElement = document.querySelector('pre');
|
|
383
|
+
if (preElement) {
|
|
384
|
+
try {
|
|
385
|
+
const parser = new DOMParser();
|
|
386
|
+
xmlContent = parser.parseFromString(preElement.textContent, 'text/xml');
|
|
387
|
+
}
|
|
388
|
+
catch (e) {
|
|
389
|
+
console.warn('Failed to parse XML from pre element:', e);
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
// Method 3: Try to parse the entire document as XML
|
|
394
|
+
if (!xmlContent) {
|
|
395
|
+
try {
|
|
396
|
+
const parser = new DOMParser();
|
|
397
|
+
xmlContent = parser.parseFromString(document.documentElement.outerHTML, 'text/xml');
|
|
398
|
+
}
|
|
399
|
+
catch (e) {
|
|
400
|
+
console.warn('Failed to parse document as XML:', e);
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
if (!xmlContent) {
|
|
404
|
+
console.error('Could not parse sitemap XML');
|
|
405
|
+
return [];
|
|
406
|
+
}
|
|
407
|
+
// Extract URL entries from the sitemap
|
|
408
|
+
const urlElements = xmlContent.querySelectorAll('url');
|
|
409
|
+
const sitemapData = [];
|
|
410
|
+
urlElements.forEach((urlElement, index) => {
|
|
411
|
+
if (limit && index >= limit)
|
|
412
|
+
return;
|
|
413
|
+
const locElement = urlElement.querySelector('loc');
|
|
414
|
+
const lastmodElement = urlElement.querySelector('lastmod');
|
|
415
|
+
const entry = {};
|
|
416
|
+
if (locElement) {
|
|
417
|
+
entry.loc = locElement.textContent.trim();
|
|
418
|
+
}
|
|
419
|
+
if (lastmodElement) {
|
|
420
|
+
entry.lastmod = lastmodElement.textContent.trim();
|
|
421
|
+
}
|
|
422
|
+
// Only add entries that have at least a loc field
|
|
423
|
+
if (entry.loc) {
|
|
424
|
+
sitemapData.push(entry);
|
|
425
|
+
}
|
|
426
|
+
});
|
|
427
|
+
return sitemapData;
|
|
428
|
+
};
|
|
429
|
+
if (isSitemapUrl()) {
|
|
430
|
+
return scrapeSitemapData();
|
|
431
|
+
}
|
|
362
432
|
// XPath evaluation functions
|
|
363
433
|
const queryInsideContext = (context, part) => {
|
|
364
434
|
try {
|
package/build/interpret.js
CHANGED
|
@@ -318,7 +318,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
318
318
|
*/
|
|
319
319
|
carryOutSteps(page, steps) {
|
|
320
320
|
return __awaiter(this, void 0, void 0, function* () {
|
|
321
|
-
var _a;
|
|
321
|
+
var _a, _b;
|
|
322
322
|
/**
|
|
323
323
|
* Defines overloaded (or added) methods/actions usable in the workflow.
|
|
324
324
|
* If a method overloads any existing method of the Page class, it accepts the same set
|
|
@@ -388,18 +388,44 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
388
388
|
if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
|
|
389
389
|
this.cumulativeResults = [];
|
|
390
390
|
}
|
|
391
|
+
const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
|
|
391
392
|
if (this.cumulativeResults.length === 0) {
|
|
392
|
-
|
|
393
|
+
// First execution - create initial row
|
|
394
|
+
const newRow = {};
|
|
395
|
+
Object.entries(resultToProcess).forEach(([key, value]) => {
|
|
396
|
+
if (value !== undefined) {
|
|
397
|
+
newRow[key] = value;
|
|
398
|
+
}
|
|
399
|
+
});
|
|
400
|
+
this.cumulativeResults.push(newRow);
|
|
393
401
|
}
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
402
|
+
else {
|
|
403
|
+
// Check if any keys from new result already exist in the last row
|
|
404
|
+
const lastRow = this.cumulativeResults[this.cumulativeResults.length - 1];
|
|
405
|
+
const newResultKeys = Object.keys(resultToProcess).filter(key => resultToProcess[key] !== undefined);
|
|
406
|
+
const hasRepeatedKeys = newResultKeys.some(key => lastRow.hasOwnProperty(key));
|
|
407
|
+
if (hasRepeatedKeys) {
|
|
408
|
+
// Keys are repeated - create a new row
|
|
409
|
+
const newRow = {};
|
|
410
|
+
Object.entries(resultToProcess).forEach(([key, value]) => {
|
|
411
|
+
if (value !== undefined) {
|
|
412
|
+
newRow[key] = value;
|
|
413
|
+
}
|
|
414
|
+
});
|
|
415
|
+
this.cumulativeResults.push(newRow);
|
|
399
416
|
}
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
417
|
+
else {
|
|
418
|
+
// No repeated keys - merge with the last row
|
|
419
|
+
Object.entries(resultToProcess).forEach(([key, value]) => {
|
|
420
|
+
if (value !== undefined) {
|
|
421
|
+
lastRow[key] = value;
|
|
422
|
+
}
|
|
423
|
+
});
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
console.log("Total accumulated rows:", this.cumulativeResults.length);
|
|
427
|
+
console.log("Current results:", this.cumulativeResults);
|
|
428
|
+
yield this.options.serializableCallback(this.cumulativeResults);
|
|
403
429
|
}),
|
|
404
430
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
405
431
|
var _a, _b;
|
|
@@ -410,17 +436,37 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
410
436
|
yield this.options.serializableCallback({});
|
|
411
437
|
return;
|
|
412
438
|
}
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
this.options.debugChannel.incrementScrapeListIndex
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
439
|
+
try {
|
|
440
|
+
yield this.ensureScriptsLoaded(page);
|
|
441
|
+
if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.incrementScrapeListIndex) {
|
|
442
|
+
this.options.debugChannel.incrementScrapeListIndex();
|
|
443
|
+
}
|
|
444
|
+
let scrapeResults = [];
|
|
445
|
+
if (!config.pagination) {
|
|
446
|
+
scrapeResults = yield page.evaluate((cfg) => {
|
|
447
|
+
try {
|
|
448
|
+
return window.scrapeList(cfg);
|
|
449
|
+
}
|
|
450
|
+
catch (error) {
|
|
451
|
+
console.warn('ScrapeList evaluation failed:', error.message);
|
|
452
|
+
return []; // Return empty array instead of failing
|
|
453
|
+
}
|
|
454
|
+
}, config);
|
|
455
|
+
}
|
|
456
|
+
else {
|
|
457
|
+
scrapeResults = yield this.handlePagination(page, config);
|
|
458
|
+
}
|
|
459
|
+
// Ensure we always have an array
|
|
460
|
+
if (!Array.isArray(scrapeResults)) {
|
|
461
|
+
scrapeResults = [];
|
|
462
|
+
}
|
|
463
|
+
console.log(`ScrapeList completed with ${scrapeResults.length} results`);
|
|
419
464
|
yield this.options.serializableCallback(scrapeResults);
|
|
420
465
|
}
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
466
|
+
catch (error) {
|
|
467
|
+
console.error('ScrapeList action failed completely:', error.message);
|
|
468
|
+
// Don't throw error, just return empty array
|
|
469
|
+
yield this.options.serializableCallback([]);
|
|
424
470
|
}
|
|
425
471
|
}),
|
|
426
472
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -482,46 +528,54 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
482
528
|
});
|
|
483
529
|
for (const step of steps) {
|
|
484
530
|
this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
else {
|
|
491
|
-
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
492
|
-
this.options.debugChannel.setActionType(String(step.action));
|
|
493
|
-
}
|
|
494
|
-
// Implements the dot notation for the "method name" in the workflow
|
|
495
|
-
const levels = String(step.action).split('.');
|
|
496
|
-
const methodName = levels[levels.length - 1];
|
|
497
|
-
let invokee = page;
|
|
498
|
-
for (const level of levels.splice(0, levels.length - 1)) {
|
|
499
|
-
invokee = invokee[level];
|
|
531
|
+
try {
|
|
532
|
+
if (step.action in wawActions) {
|
|
533
|
+
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
|
|
534
|
+
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
|
|
535
|
+
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
|
|
500
536
|
}
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
537
|
+
else {
|
|
538
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
539
|
+
this.options.debugChannel.setActionType(String(step.action));
|
|
504
540
|
}
|
|
505
|
-
|
|
506
|
-
|
|
541
|
+
// Implements the dot notation for the "method name" in the workflow
|
|
542
|
+
const levels = String(step.action).split('.');
|
|
543
|
+
const methodName = levels[levels.length - 1];
|
|
544
|
+
let invokee = page;
|
|
545
|
+
for (const level of levels.splice(0, levels.length - 1)) {
|
|
546
|
+
invokee = invokee[level];
|
|
507
547
|
}
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
548
|
+
if (methodName === 'waitForLoadState') {
|
|
549
|
+
try {
|
|
550
|
+
yield executeAction(invokee, methodName, step.args);
|
|
551
|
+
}
|
|
552
|
+
catch (error) {
|
|
553
|
+
yield executeAction(invokee, methodName, 'domcontentloaded');
|
|
554
|
+
}
|
|
512
555
|
}
|
|
513
|
-
|
|
556
|
+
else if (methodName === 'click') {
|
|
514
557
|
try {
|
|
515
|
-
yield executeAction(invokee, methodName,
|
|
558
|
+
yield executeAction(invokee, methodName, step.args);
|
|
516
559
|
}
|
|
517
560
|
catch (error) {
|
|
518
|
-
|
|
561
|
+
try {
|
|
562
|
+
yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
|
|
563
|
+
}
|
|
564
|
+
catch (error) {
|
|
565
|
+
this.log(`Click action failed for selector ${(_b = step.args) === null || _b === void 0 ? void 0 : _b[0]}: ${error.message}`, logger_1.Level.WARN);
|
|
566
|
+
continue; // Skip to next action
|
|
567
|
+
}
|
|
519
568
|
}
|
|
520
569
|
}
|
|
570
|
+
else {
|
|
571
|
+
yield executeAction(invokee, methodName, step.args);
|
|
572
|
+
}
|
|
521
573
|
}
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
}
|
|
574
|
+
}
|
|
575
|
+
catch (error) {
|
|
576
|
+
this.log(`Action ${String(step.action)} failed: ${error.message}`, logger_1.Level.WARN);
|
|
577
|
+
// Continue to next action instead of breaking
|
|
578
|
+
continue;
|
|
525
579
|
}
|
|
526
580
|
yield new Promise((res) => { setTimeout(res, 500); });
|
|
527
581
|
}
|
|
@@ -1895,26 +1949,29 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1895
1949
|
yield p.close();
|
|
1896
1950
|
return;
|
|
1897
1951
|
}
|
|
1898
|
-
|
|
1899
|
-
|
|
1900
|
-
if (!selectors.includes(selector)) {
|
|
1901
|
-
selectors.push(selector);
|
|
1902
|
-
}
|
|
1903
|
-
});
|
|
1904
|
-
let pageState = {};
|
|
1905
|
-
let getStateTest = "Hello";
|
|
1906
|
-
try {
|
|
1907
|
-
pageState = yield this.getState(p, workflowCopy, selectors);
|
|
1908
|
-
selectors = [];
|
|
1909
|
-
console.log("Empty selectors:", selectors);
|
|
1910
|
-
}
|
|
1911
|
-
catch (e) {
|
|
1912
|
-
this.log('The browser has been closed.');
|
|
1952
|
+
if (workflowCopy.length === 0) {
|
|
1953
|
+
this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
|
|
1913
1954
|
return;
|
|
1914
1955
|
}
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1956
|
+
// const newSelectors = this.getSelectors(workflowCopy);
|
|
1957
|
+
// newSelectors.forEach(selector => {
|
|
1958
|
+
// if (!selectors.includes(selector)) {
|
|
1959
|
+
// selectors.push(selector);
|
|
1960
|
+
// }
|
|
1961
|
+
// });
|
|
1962
|
+
// let pageState = {};
|
|
1963
|
+
// let getStateTest = "Hello";
|
|
1964
|
+
// try {
|
|
1965
|
+
// pageState = await this.getState(p, workflowCopy, selectors);
|
|
1966
|
+
// selectors = [];
|
|
1967
|
+
// console.log("Empty selectors:", selectors)
|
|
1968
|
+
// } catch (e: any) {
|
|
1969
|
+
// this.log('The browser has been closed.');
|
|
1970
|
+
// return;
|
|
1971
|
+
// }
|
|
1972
|
+
// if (this.options.debug) {
|
|
1973
|
+
// this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
|
|
1974
|
+
// }
|
|
1918
1975
|
// const actionId = workflow.findIndex((step) => {
|
|
1919
1976
|
// const isApplicable = this.applicable(step.where, pageState, usedActions);
|
|
1920
1977
|
// console.log("-------------------------------------------------------------");
|
|
@@ -1934,10 +1991,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1934
1991
|
// console.log("SCHEMA CHANGES:", changes);
|
|
1935
1992
|
// }
|
|
1936
1993
|
// }
|
|
1937
|
-
actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
|
|
1938
|
-
if (actionId !== -1 && workflowCopy[actionId]) {
|
|
1939
|
-
|
|
1940
|
-
}
|
|
1994
|
+
// actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
|
|
1995
|
+
// if (actionId !== -1 && workflowCopy[actionId]) {
|
|
1996
|
+
// workflowCopy[actionId] = await this.validateWorkflowAction(p, workflowCopy[actionId]);
|
|
1997
|
+
// }
|
|
1998
|
+
const actionId = workflowCopy.length - 1;
|
|
1941
1999
|
const action = workflowCopy[actionId];
|
|
1942
2000
|
console.log("MATCHED ACTION:", action);
|
|
1943
2001
|
console.log("MATCHED ACTION ID:", actionId);
|
|
@@ -1963,6 +2021,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1963
2021
|
}
|
|
1964
2022
|
catch (e) {
|
|
1965
2023
|
this.log(e, logger_1.Level.ERROR);
|
|
2024
|
+
console.log(`Action with ID ${action.id} failed: ${e.message}`);
|
|
2025
|
+
// Still remove the failed action to prevent infinite loops
|
|
2026
|
+
workflowCopy.splice(actionId, 1);
|
|
2027
|
+
console.log(`Failed action removed. Remaining actions: ${workflowCopy.length}`);
|
|
1966
2028
|
}
|
|
1967
2029
|
}
|
|
1968
2030
|
else {
|