mx-cloud 0.0.17 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +4 -0
- package/build/interpret.js +160 -16
- package/package.json +2 -5
package/build/interpret.d.ts
CHANGED
|
@@ -85,6 +85,10 @@ export default class Interpreter extends EventEmitter {
|
|
|
85
85
|
* Sets the abort flag to immediately stop all operations
|
|
86
86
|
*/
|
|
87
87
|
abort(): void;
|
|
88
|
+
/**
|
|
89
|
+
* Returns the current abort status
|
|
90
|
+
*/
|
|
91
|
+
getIsAborted(): boolean;
|
|
88
92
|
/**
|
|
89
93
|
* Given a Playwright's page object and a "declarative" list of actions, this function
|
|
90
94
|
* calls all mentioned functions on the Page object.\
|
package/build/interpret.js
CHANGED
|
@@ -316,6 +316,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
316
316
|
abort() {
|
|
317
317
|
this.isAborted = true;
|
|
318
318
|
}
|
|
319
|
+
/**
|
|
320
|
+
* Returns the current abort status
|
|
321
|
+
*/
|
|
322
|
+
getIsAborted() {
|
|
323
|
+
return this.isAborted;
|
|
324
|
+
}
|
|
319
325
|
/**
|
|
320
326
|
* Given a Playwright's page object and a "declarative" list of actions, this function
|
|
321
327
|
* calls all mentioned functions on the Page object.\
|
|
@@ -582,7 +588,15 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
582
588
|
}
|
|
583
589
|
if (methodName === 'waitForLoadState') {
|
|
584
590
|
try {
|
|
585
|
-
|
|
591
|
+
// Add timeout if not already specified
|
|
592
|
+
let args = step.args;
|
|
593
|
+
if (Array.isArray(args) && args.length === 1) {
|
|
594
|
+
args = [args[0], { timeout: 30000 }];
|
|
595
|
+
}
|
|
596
|
+
else if (!Array.isArray(args)) {
|
|
597
|
+
args = [args, { timeout: 30000 }];
|
|
598
|
+
}
|
|
599
|
+
yield executeAction(invokee, methodName, args);
|
|
586
600
|
}
|
|
587
601
|
catch (error) {
|
|
588
602
|
yield executeAction(invokee, methodName, 'domcontentloaded');
|
|
@@ -639,7 +653,17 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
639
653
|
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
|
640
654
|
return;
|
|
641
655
|
}
|
|
642
|
-
|
|
656
|
+
// Add timeout to prevent hanging on page evaluation
|
|
657
|
+
const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
658
|
+
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Page evaluation timeout')), 10000));
|
|
659
|
+
let results;
|
|
660
|
+
try {
|
|
661
|
+
results = yield Promise.race([evaluationPromise, timeoutPromise]);
|
|
662
|
+
}
|
|
663
|
+
catch (error) {
|
|
664
|
+
debugLog(`Page evaluation failed: ${error.message}`);
|
|
665
|
+
return;
|
|
666
|
+
}
|
|
643
667
|
const newResults = results.filter(item => {
|
|
644
668
|
const uniqueKey = JSON.stringify(item);
|
|
645
669
|
if (scrapedItems.has(uniqueKey))
|
|
@@ -696,14 +720,22 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
696
720
|
});
|
|
697
721
|
// Enhanced button finder with retry mechanism for both CSS and XPath selectors
|
|
698
722
|
const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
|
|
723
|
+
const startTime = Date.now();
|
|
724
|
+
const MAX_BUTTON_SEARCH_TIME = 15000;
|
|
699
725
|
let updatedSelectors = [...selectors];
|
|
700
726
|
for (let i = 0; i < selectors.length; i++) {
|
|
727
|
+
// Check overall timeout
|
|
728
|
+
if (Date.now() - startTime > MAX_BUTTON_SEARCH_TIME) {
|
|
729
|
+
debugLog(`Button search timeout reached (${MAX_BUTTON_SEARCH_TIME}ms), aborting`);
|
|
730
|
+
break;
|
|
731
|
+
}
|
|
701
732
|
const selector = selectors[i];
|
|
702
733
|
let retryCount = 0;
|
|
703
734
|
let selectorSuccess = false;
|
|
704
735
|
while (retryCount < MAX_RETRIES && !selectorSuccess) {
|
|
705
736
|
try {
|
|
706
|
-
|
|
737
|
+
// Reduce timeout to prevent hanging on slow selectors
|
|
738
|
+
const button = yield waitForSelectorUniversal(selector, { timeout: 2000 });
|
|
707
739
|
if (button) {
|
|
708
740
|
debugLog('Found working selector:', selector);
|
|
709
741
|
return {
|
|
@@ -712,16 +744,30 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
712
744
|
updatedSelectors
|
|
713
745
|
};
|
|
714
746
|
}
|
|
747
|
+
else {
|
|
748
|
+
// Treat null result as failed attempt
|
|
749
|
+
retryCount++;
|
|
750
|
+
debugLog(`Selector "${selector}" not found: attempt ${retryCount}/${MAX_RETRIES}`);
|
|
751
|
+
if (retryCount < MAX_RETRIES) {
|
|
752
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
753
|
+
}
|
|
754
|
+
else {
|
|
755
|
+
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
|
756
|
+
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
|
757
|
+
selectorSuccess = true; // Exit retry loop for this selector
|
|
758
|
+
}
|
|
759
|
+
}
|
|
715
760
|
}
|
|
716
761
|
catch (error) {
|
|
717
762
|
retryCount++;
|
|
718
|
-
debugLog(`Selector "${selector}"
|
|
763
|
+
debugLog(`Selector "${selector}" error: attempt ${retryCount}/${MAX_RETRIES} - ${error.message}`);
|
|
719
764
|
if (retryCount < MAX_RETRIES) {
|
|
720
765
|
yield page.waitForTimeout(RETRY_DELAY);
|
|
721
766
|
}
|
|
722
767
|
else {
|
|
723
768
|
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
|
724
769
|
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
|
770
|
+
selectorSuccess = true; // Exit retry loop for this selector
|
|
725
771
|
}
|
|
726
772
|
}
|
|
727
773
|
}
|
|
@@ -748,6 +794,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
748
794
|
});
|
|
749
795
|
let availableSelectors = config.pagination.selector.split(',');
|
|
750
796
|
let unchangedResultCounter = 0;
|
|
797
|
+
let paginationIterations = 0;
|
|
798
|
+
const MAX_PAGINATION_ITERATIONS = 100; // Prevent infinite pagination
|
|
799
|
+
const paginationStartTime = Date.now();
|
|
800
|
+
const MAX_PAGINATION_TIME = 30 * 60 * 1000; // 30 minutes max for pagination
|
|
751
801
|
try {
|
|
752
802
|
while (true) {
|
|
753
803
|
// Check abort flag at start of each pagination iteration
|
|
@@ -755,6 +805,19 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
755
805
|
this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
|
|
756
806
|
return allResults;
|
|
757
807
|
}
|
|
808
|
+
// Pagination circuit breakers
|
|
809
|
+
if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
|
|
810
|
+
debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
|
|
811
|
+
return allResults;
|
|
812
|
+
}
|
|
813
|
+
if (Date.now() - paginationStartTime > MAX_PAGINATION_TIME) {
|
|
814
|
+
debugLog('Maximum pagination time reached (10 minutes), stopping');
|
|
815
|
+
return allResults;
|
|
816
|
+
}
|
|
817
|
+
// Add async yield every 5 iterations to prevent event loop blocking
|
|
818
|
+
if (paginationIterations % 5 === 0) {
|
|
819
|
+
yield new Promise(resolve => setImmediate(resolve));
|
|
820
|
+
}
|
|
758
821
|
switch (config.pagination.type) {
|
|
759
822
|
case 'scrollDown': {
|
|
760
823
|
let previousResultCount = allResults.length;
|
|
@@ -971,10 +1034,23 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
971
1034
|
if (checkLimit())
|
|
972
1035
|
return allResults;
|
|
973
1036
|
let loadMoreCounter = 0;
|
|
974
|
-
//
|
|
975
|
-
|
|
976
|
-
|
|
1037
|
+
const MAX_LOAD_MORE_ITERATIONS = 100; // Prevent infinite load more
|
|
1038
|
+
const loadMoreStartTime = Date.now();
|
|
1039
|
+
const MAX_LOAD_MORE_TIME = 30 * 60 * 1000; // 30 minutes max for load more
|
|
977
1040
|
while (true) {
|
|
1041
|
+
// Load more circuit breakers
|
|
1042
|
+
if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
|
|
1043
|
+
debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
|
|
1044
|
+
return allResults;
|
|
1045
|
+
}
|
|
1046
|
+
if (Date.now() - loadMoreStartTime > MAX_LOAD_MORE_TIME) {
|
|
1047
|
+
debugLog('Maximum load more time reached (5 minutes), stopping');
|
|
1048
|
+
return allResults;
|
|
1049
|
+
}
|
|
1050
|
+
// Add async yield every 3 iterations
|
|
1051
|
+
if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
|
|
1052
|
+
yield new Promise(resolve => setImmediate(resolve));
|
|
1053
|
+
}
|
|
978
1054
|
// Find working button with retry mechanism
|
|
979
1055
|
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
980
1056
|
availableSelectors = updatedSelectors;
|
|
@@ -1730,12 +1806,35 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1730
1806
|
selector.includes('@id=');
|
|
1731
1807
|
let count = 0;
|
|
1732
1808
|
if (isXPath) {
|
|
1809
|
+
// Add timeout to prevent XPath hanging
|
|
1733
1810
|
const locator = page.locator(`xpath=${selector}`);
|
|
1734
|
-
|
|
1811
|
+
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('XPath timeout')), 5000));
|
|
1812
|
+
try {
|
|
1813
|
+
count = yield Promise.race([
|
|
1814
|
+
locator.count(),
|
|
1815
|
+
timeoutPromise
|
|
1816
|
+
]);
|
|
1817
|
+
}
|
|
1818
|
+
catch (error) {
|
|
1819
|
+
// XPath timed out or failed
|
|
1820
|
+
return false;
|
|
1821
|
+
}
|
|
1735
1822
|
}
|
|
1736
1823
|
else {
|
|
1737
|
-
|
|
1738
|
-
|
|
1824
|
+
// Add timeout to CSS selector operations
|
|
1825
|
+
try {
|
|
1826
|
+
const elementsPromise = page.$$(selector);
|
|
1827
|
+
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('CSS selector timeout')), 5000));
|
|
1828
|
+
const elements = yield Promise.race([
|
|
1829
|
+
elementsPromise,
|
|
1830
|
+
timeoutPromise
|
|
1831
|
+
]);
|
|
1832
|
+
count = elements ? elements.length : 0;
|
|
1833
|
+
}
|
|
1834
|
+
catch (error) {
|
|
1835
|
+
// CSS selector timed out or failed
|
|
1836
|
+
return false;
|
|
1837
|
+
}
|
|
1739
1838
|
}
|
|
1740
1839
|
// For list selectors, we need multiple elements
|
|
1741
1840
|
if (isListSelector) {
|
|
@@ -1998,12 +2097,26 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1998
2097
|
/* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
|
|
1999
2098
|
let loopIterations = 0;
|
|
2000
2099
|
const MAX_LOOP_ITERATIONS = 1000; // Circuit breaker
|
|
2100
|
+
let consecutiveFailures = 0;
|
|
2101
|
+
const MAX_CONSECUTIVE_FAILURES = 10;
|
|
2102
|
+
const startTime = Date.now();
|
|
2103
|
+
const MAX_EXECUTION_TIME = 30 * 60 * 1000; // 30 minutes max
|
|
2001
2104
|
while (true) {
|
|
2002
|
-
//
|
|
2105
|
+
// Multiple circuit breakers to prevent infinite loops
|
|
2003
2106
|
if (++loopIterations > MAX_LOOP_ITERATIONS) {
|
|
2004
2107
|
this.log('Maximum loop iterations reached, terminating to prevent infinite loop', logger_1.Level.ERROR);
|
|
2005
2108
|
return;
|
|
2006
2109
|
}
|
|
2110
|
+
// Time-based circuit breaker
|
|
2111
|
+
if (Date.now() - startTime > MAX_EXECUTION_TIME) {
|
|
2112
|
+
this.log('Maximum execution time reached (30 minutes), terminating workflow', logger_1.Level.ERROR);
|
|
2113
|
+
return;
|
|
2114
|
+
}
|
|
2115
|
+
// Failure-based circuit breaker
|
|
2116
|
+
if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
|
|
2117
|
+
this.log('Too many consecutive failures, terminating to prevent hang', logger_1.Level.ERROR);
|
|
2118
|
+
return;
|
|
2119
|
+
}
|
|
2007
2120
|
// Check abort flag immediately
|
|
2008
2121
|
if (this.isAborted) {
|
|
2009
2122
|
this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
|
|
@@ -2094,11 +2207,19 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2094
2207
|
usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
|
|
2095
2208
|
workflowCopy.splice(actionId, 1);
|
|
2096
2209
|
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
|
2097
|
-
// Reset
|
|
2098
|
-
loopIterations = 0;
|
|
2210
|
+
// Reset counters on successful action (but keep some history to prevent infinite resets)
|
|
2211
|
+
loopIterations = Math.max(0, loopIterations - 10);
|
|
2212
|
+
consecutiveFailures = 0;
|
|
2213
|
+
// Add async yield to prevent event loop blocking
|
|
2214
|
+
if (loopIterations % 10 === 0) {
|
|
2215
|
+
yield new Promise(resolve => setImmediate(resolve));
|
|
2216
|
+
}
|
|
2099
2217
|
}
|
|
2100
2218
|
catch (e) {
|
|
2101
2219
|
this.log(e, logger_1.Level.ERROR);
|
|
2220
|
+
consecutiveFailures++;
|
|
2221
|
+
// Add delay on failures to prevent tight error loops
|
|
2222
|
+
yield new Promise(resolve => setTimeout(resolve, Math.min(1000, consecutiveFailures * 200)));
|
|
2102
2223
|
// Don't crash on individual action failures - continue with next iteration
|
|
2103
2224
|
continue;
|
|
2104
2225
|
}
|
|
@@ -2112,9 +2233,32 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2112
2233
|
}
|
|
2113
2234
|
ensureScriptsLoaded(page) {
|
|
2114
2235
|
return __awaiter(this, void 0, void 0, function* () {
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2236
|
+
try {
|
|
2237
|
+
// Add timeout to prevent hanging on script evaluation
|
|
2238
|
+
const evaluationPromise = page.evaluate(() => typeof window.scrape === 'function' &&
|
|
2239
|
+
typeof window.scrapeSchema === 'function' &&
|
|
2240
|
+
typeof window.scrapeList === 'function' &&
|
|
2241
|
+
typeof window.scrapeListAuto === 'function' &&
|
|
2242
|
+
typeof window.scrollDown === 'function' &&
|
|
2243
|
+
typeof window.scrollUp === 'function');
|
|
2244
|
+
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Script check timeout')), 3000));
|
|
2245
|
+
const isScriptLoaded = yield Promise.race([
|
|
2246
|
+
evaluationPromise,
|
|
2247
|
+
timeoutPromise
|
|
2248
|
+
]);
|
|
2249
|
+
if (!isScriptLoaded) {
|
|
2250
|
+
yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
|
|
2251
|
+
}
|
|
2252
|
+
}
|
|
2253
|
+
catch (error) {
|
|
2254
|
+
// If script check fails, try to add the script anyway
|
|
2255
|
+
this.log(`Script check failed, adding script anyway: ${error.message}`, logger_1.Level.WARN);
|
|
2256
|
+
try {
|
|
2257
|
+
yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
|
|
2258
|
+
}
|
|
2259
|
+
catch (scriptError) {
|
|
2260
|
+
this.log(`Failed to add script: ${scriptError.message}`, logger_1.Level.ERROR);
|
|
2261
|
+
}
|
|
2118
2262
|
}
|
|
2119
2263
|
});
|
|
2120
2264
|
}
|
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mx-cloud",
|
|
3
|
-
"version": "0.0.17",
|
|
3
|
+
"version": "0.0.18",
|
|
4
4
|
"description": "mx cloud",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"typings": "build/index.d.ts",
|
|
7
7
|
"scripts": {
|
|
8
8
|
"test": "jest",
|
|
9
|
-
"build": "tsc",
|
|
9
|
+
"build": "npm run clean && tsc",
|
|
10
10
|
"lint": "eslint .",
|
|
11
11
|
"clean": "rimraf ./build"
|
|
12
12
|
},
|
|
@@ -23,8 +23,5 @@
|
|
|
23
23
|
"playwright": "^1.50.0",
|
|
24
24
|
"playwright-extra": "^4.3.6",
|
|
25
25
|
"puppeteer-extra-plugin-stealth": "^2.11.2"
|
|
26
|
-
},
|
|
27
|
-
"devDependencies": {
|
|
28
|
-
"@types/node": "^24.3.1"
|
|
29
26
|
}
|
|
30
27
|
}
|