mx-cloud 0.0.25 → 0.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +16 -24
- package/build/interpret.js +1006 -1052
- package/build/selector.d.ts +1 -32
- package/build/selector.js +1 -839
- package/build/types/workflow.d.ts +1 -1
- package/build/utils/utils.d.ts +0 -4
- package/build/utils/utils.js +0 -7
- package/package.json +1 -1
package/build/interpret.js
CHANGED
|
@@ -49,7 +49,6 @@ const adblocker_playwright_1 = require("@cliqz/adblocker-playwright");
|
|
|
49
49
|
const cross_fetch_1 = __importDefault(require("cross-fetch"));
|
|
50
50
|
const path_1 = __importDefault(require("path"));
|
|
51
51
|
const events_1 = require("events");
|
|
52
|
-
const logic_1 = require("./types/logic");
|
|
53
52
|
const utils_1 = require("./utils/utils");
|
|
54
53
|
const concurrency_1 = __importDefault(require("./utils/concurrency"));
|
|
55
54
|
const preprocessor_1 = __importDefault(require("./preprocessor"));
|
|
@@ -72,10 +71,21 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
72
71
|
this.scrapeListCounter = 0;
|
|
73
72
|
this.serializableDataByType = {
|
|
74
73
|
scrapeList: {},
|
|
75
|
-
scrapeSchema: {}
|
|
74
|
+
scrapeSchema: {},
|
|
75
|
+
crawl: {},
|
|
76
|
+
search: {}
|
|
76
77
|
};
|
|
77
78
|
this.pendingDeepExtraction = null;
|
|
78
79
|
this.isInDeepExtractionPhase = false;
|
|
80
|
+
this.deepExtractionStats = {
|
|
81
|
+
totalUrlsFound: 0,
|
|
82
|
+
matchedUrls: 0,
|
|
83
|
+
successfulExtractions: 0,
|
|
84
|
+
failedExtractions: 0,
|
|
85
|
+
skippedDueToPattern: 0
|
|
86
|
+
};
|
|
87
|
+
this.totalActions = 0;
|
|
88
|
+
this.executedActions = 0;
|
|
79
89
|
this.workflow = workflow.workflow;
|
|
80
90
|
this.initializedWorkflow = null;
|
|
81
91
|
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
@@ -134,189 +144,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
134
144
|
}
|
|
135
145
|
});
|
|
136
146
|
}
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
// }
|
|
144
|
-
// // Iterate from the start up to (but not including) actionId
|
|
145
|
-
// for (let index = 0; index < actionId; index++) {
|
|
146
|
-
// const currentSelectors = workflow[index]?.where?.selectors;
|
|
147
|
-
// console.log(`Selectors at step ${index}:`, currentSelectors);
|
|
148
|
-
// if (currentSelectors && currentSelectors.length > 0) {
|
|
149
|
-
// currentSelectors.forEach((selector) => {
|
|
150
|
-
// if (!selectors.includes(selector)) {
|
|
151
|
-
// selectors.push(selector); // Avoid duplicates
|
|
152
|
-
// }
|
|
153
|
-
// });
|
|
154
|
-
// }
|
|
155
|
-
// }
|
|
156
|
-
// console.log("Collected Selectors:", selectors);
|
|
157
|
-
// return selectors;
|
|
158
|
-
// }
|
|
159
|
-
getSelectors(workflow) {
|
|
160
|
-
var _a, _b;
|
|
161
|
-
const selectorsSet = new Set();
|
|
162
|
-
if (workflow.length === 0) {
|
|
163
|
-
return [];
|
|
164
|
-
}
|
|
165
|
-
for (let index = workflow.length - 1; index >= 0; index--) {
|
|
166
|
-
const currentSelectors = (_b = (_a = workflow[index]) === null || _a === void 0 ? void 0 : _a.where) === null || _b === void 0 ? void 0 : _b.selectors;
|
|
167
|
-
if (currentSelectors && currentSelectors.length > 0) {
|
|
168
|
-
currentSelectors.forEach((selector) => selectorsSet.add(selector));
|
|
169
|
-
return Array.from(selectorsSet);
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
return [];
|
|
173
|
-
}
|
|
174
|
-
/**
|
|
175
|
-
* Returns the context object from given Page and the current workflow.\
|
|
176
|
-
* \
|
|
177
|
-
* `workflow` is used for selector extraction - function searches for used selectors to
|
|
178
|
-
* look for later in the page's context.
|
|
179
|
-
* @param page Playwright Page object
|
|
180
|
-
* @param workflow Current **initialized** workflow (array of where-what pairs).
|
|
181
|
-
* @returns {PageState} State of the current page.
|
|
182
|
-
*/
|
|
183
|
-
getState(page, workflowCopy, selectors) {
|
|
184
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
185
|
-
/**
|
|
186
|
-
* All the selectors present in the current Workflow
|
|
187
|
-
*/
|
|
188
|
-
// const selectors = Preprocessor.extractSelectors(workflow);
|
|
189
|
-
// console.log("Current selectors:", selectors);
|
|
190
|
-
/**
|
|
191
|
-
* Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
|
|
192
|
-
* @param selector Selector to be queried
|
|
193
|
-
* @returns True if the targetted element is actionable, false otherwise.
|
|
194
|
-
*/
|
|
195
|
-
// const actionable = async (selector: string): Promise<boolean> => {
|
|
196
|
-
// try {
|
|
197
|
-
// const proms = [
|
|
198
|
-
// page.isEnabled(selector, { timeout: 10000 }),
|
|
199
|
-
// page.isVisible(selector, { timeout: 10000 }),
|
|
200
|
-
// ];
|
|
201
|
-
// return await Promise.all(proms).then((bools) => bools.every((x) => x));
|
|
202
|
-
// } catch (e) {
|
|
203
|
-
// // log(<Error>e, Level.ERROR);
|
|
204
|
-
// return false;
|
|
205
|
-
// }
|
|
206
|
-
// };
|
|
207
|
-
/**
|
|
208
|
-
* Object of selectors present in the current page.
|
|
209
|
-
*/
|
|
210
|
-
// const presentSelectors: SelectorArray = await Promise.all(
|
|
211
|
-
// selectors.map(async (selector) => {
|
|
212
|
-
// if (await actionable(selector)) {
|
|
213
|
-
// return [selector];
|
|
214
|
-
// }
|
|
215
|
-
// return [];
|
|
216
|
-
// }),
|
|
217
|
-
// ).then((x) => x.flat());
|
|
218
|
-
const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
|
|
219
|
-
try {
|
|
220
|
-
yield page.waitForSelector(selector, { state: 'attached' });
|
|
221
|
-
return [selector];
|
|
222
|
-
}
|
|
223
|
-
catch (e) {
|
|
224
|
-
return [];
|
|
225
|
-
}
|
|
226
|
-
}))).then((x) => x.flat());
|
|
227
|
-
const action = workflowCopy[workflowCopy.length - 1];
|
|
228
|
-
// console.log("Next action:", action)
|
|
229
|
-
let url = page.url();
|
|
230
|
-
if (action && action.where.url !== url && action.where.url !== "about:blank") {
|
|
231
|
-
url = action.where.url;
|
|
232
|
-
}
|
|
233
|
-
return {
|
|
234
|
-
url,
|
|
235
|
-
cookies: (yield page.context().cookies([page.url()]))
|
|
236
|
-
.reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
|
|
237
|
-
selectors: presentSelectors,
|
|
238
|
-
};
|
|
239
|
-
});
|
|
240
|
-
}
|
|
241
|
-
/**
|
|
242
|
-
* Tests if the given action is applicable with the given context.
|
|
243
|
-
* @param where Tested *where* condition
|
|
244
|
-
* @param context Current browser context.
|
|
245
|
-
* @returns True if `where` is applicable in the given context, false otherwise
|
|
246
|
-
*/
|
|
247
|
-
applicable(where, context, usedActions = []) {
|
|
248
|
-
/**
|
|
249
|
-
* Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\
|
|
250
|
-
* \
|
|
251
|
-
* For every key in `subset`, there must be a corresponding key with equal scalar
|
|
252
|
-
* value in `superset`, or `inclusive(subset[key], superset[key])` must hold.
|
|
253
|
-
* @param subset Arbitrary non-cyclic JS object (where clause)
|
|
254
|
-
* @param superset Arbitrary non-cyclic JS object (browser context)
|
|
255
|
-
* @returns `true` if `subset <= superset`, `false` otherwise.
|
|
256
|
-
*/
|
|
257
|
-
const inclusive = (subset, superset) => (Object.entries(subset).every(([key, value]) => {
|
|
258
|
-
/**
|
|
259
|
-
* Arrays are compared without order (are transformed into objects before comparison).
|
|
260
|
-
*/
|
|
261
|
-
const parsedValue = Array.isArray(value) ? (0, utils_1.arrayToObject)(value) : value;
|
|
262
|
-
const parsedSuperset = {};
|
|
263
|
-
parsedSuperset[key] = Array.isArray(superset[key])
|
|
264
|
-
? (0, utils_1.arrayToObject)(superset[key])
|
|
265
|
-
: superset[key];
|
|
266
|
-
if ((key === 'url' || key === 'selectors') &&
|
|
267
|
-
Array.isArray(value) && Array.isArray(superset[key]) &&
|
|
268
|
-
value.length === 0 && superset[key].length === 0) {
|
|
269
|
-
return true;
|
|
270
|
-
}
|
|
271
|
-
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
|
|
272
|
-
return value.some(selector => superset[key].includes(selector));
|
|
273
|
-
}
|
|
274
|
-
// Every `subset` key must exist in the `superset` and
|
|
275
|
-
// have the same value (strict equality), or subset[key] <= superset[key]
|
|
276
|
-
return parsedSuperset[key]
|
|
277
|
-
&& ((parsedSuperset[key] === parsedValue)
|
|
278
|
-
|| ((parsedValue).constructor.name === 'RegExp' && parsedValue.test(parsedSuperset[key]))
|
|
279
|
-
|| ((parsedValue).constructor.name !== 'RegExp'
|
|
280
|
-
&& typeof parsedValue === 'object' && inclusive(parsedValue, parsedSuperset[key])));
|
|
281
|
-
}));
|
|
282
|
-
// Every value in the "where" object should be compliant to the current state.
|
|
283
|
-
return Object.entries(where).every(([key, value]) => {
|
|
284
|
-
if (logic_1.operators.includes(key)) {
|
|
285
|
-
const array = Array.isArray(value)
|
|
286
|
-
? value
|
|
287
|
-
: Object.entries(value).map((a) => Object.fromEntries([a]));
|
|
288
|
-
// every condition is treated as a single context
|
|
289
|
-
switch (key) {
|
|
290
|
-
case '$and':
|
|
291
|
-
return array === null || array === void 0 ? void 0 : array.every((x) => this.applicable(x, context));
|
|
292
|
-
case '$or':
|
|
293
|
-
return array === null || array === void 0 ? void 0 : array.some((x) => this.applicable(x, context));
|
|
294
|
-
case '$not':
|
|
295
|
-
return !this.applicable(value, context); // $not should be a unary operator
|
|
296
|
-
default:
|
|
297
|
-
throw new Error('Undefined logic operator.');
|
|
298
|
-
}
|
|
299
|
-
}
|
|
300
|
-
else if (logic_1.meta.includes(key)) {
|
|
301
|
-
const testRegexString = (x) => {
|
|
302
|
-
if (typeof value === 'string') {
|
|
303
|
-
return x === value;
|
|
304
|
-
}
|
|
305
|
-
return value.test(x);
|
|
306
|
-
};
|
|
307
|
-
switch (key) {
|
|
308
|
-
case '$before':
|
|
309
|
-
return !usedActions.find(testRegexString);
|
|
310
|
-
case '$after':
|
|
311
|
-
return !!usedActions.find(testRegexString);
|
|
312
|
-
default:
|
|
313
|
-
throw new Error('Undefined meta operator.');
|
|
314
|
-
}
|
|
315
|
-
}
|
|
316
|
-
else {
|
|
317
|
-
// Current key is a base condition (url, cookies, selectors)
|
|
318
|
-
return inclusive({ [key]: value }, context);
|
|
319
|
-
}
|
|
147
|
+
callWithTimeout(callback_1) {
|
|
148
|
+
return __awaiter(this, arguments, void 0, function* (callback, timeoutMs = 30000, operationName = 'callback') {
|
|
149
|
+
return Promise.race([
|
|
150
|
+
Promise.resolve(callback()),
|
|
151
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error(`${operationName} timeout after ${timeoutMs}ms`)), timeoutMs))
|
|
152
|
+
]);
|
|
320
153
|
});
|
|
321
154
|
}
|
|
322
155
|
/**
|
|
@@ -363,25 +196,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
363
196
|
this.options.debugChannel.setActionType("screenshot");
|
|
364
197
|
}
|
|
365
198
|
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
|
|
366
|
-
// Prefer explicit nameOverride (from workflow step.name or computed action name)
|
|
367
|
-
// If nameOverride is provided (non-empty) use it *as-is*.
|
|
368
|
-
// Only use counter-appended name when no nameOverride is available.
|
|
369
199
|
const explicitName = (typeof nameOverride === 'string' && nameOverride.trim().length > 0) ? nameOverride.trim() : null;
|
|
370
200
|
let screenshotName;
|
|
371
201
|
if (explicitName) {
|
|
372
202
|
screenshotName = explicitName;
|
|
373
203
|
}
|
|
374
204
|
else {
|
|
375
|
-
// If no explicit name, produce a readable generated name with a counter
|
|
376
205
|
this.screenshotCounter += 1;
|
|
377
206
|
screenshotName = `Screenshot ${this.screenshotCounter}`;
|
|
378
207
|
}
|
|
379
|
-
|
|
380
|
-
yield this.options.binaryCallback({
|
|
208
|
+
yield this.callWithTimeout(() => this.options.binaryCallback({
|
|
381
209
|
name: screenshotName,
|
|
382
210
|
data: screenshotBuffer,
|
|
383
211
|
mimeType: "image/png",
|
|
384
|
-
}, "image/png");
|
|
212
|
+
}, "image/png"), 30000, 'binaryCallback (screenshot)');
|
|
385
213
|
}),
|
|
386
214
|
enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
387
215
|
var _a;
|
|
@@ -430,7 +258,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
430
258
|
}
|
|
431
259
|
yield this.ensureScriptsLoaded(page);
|
|
432
260
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
433
|
-
yield this.options.serializableCallback(scrapeResults);
|
|
261
|
+
yield this.callWithTimeout(() => this.options.serializableCallback(scrapeResults), 30000, 'serializableCallback (scrape)');
|
|
434
262
|
}),
|
|
435
263
|
scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
|
|
436
264
|
var _a;
|
|
@@ -442,7 +270,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
442
270
|
this.options.debugChannel.setActionType('scrapeSchema');
|
|
443
271
|
}
|
|
444
272
|
if (this.options.mode && this.options.mode === 'editor') {
|
|
445
|
-
yield this.options.serializableCallback({});
|
|
273
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({}), 30000, 'serializableCallback (scrapeSchema editor mode)');
|
|
446
274
|
return;
|
|
447
275
|
}
|
|
448
276
|
yield this.ensureScriptsLoaded(page);
|
|
@@ -492,22 +320,32 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
492
320
|
this.serializableDataByType[actionType][name] = [];
|
|
493
321
|
}
|
|
494
322
|
this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
|
|
495
|
-
yield this.options.serializableCallback({
|
|
323
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({
|
|
496
324
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
497
325
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
498
|
-
});
|
|
326
|
+
}), 30000, 'serializableCallback (scrapeSchema)');
|
|
327
|
+
const MAX_CUMULATIVE_RESULTS = 1000;
|
|
328
|
+
if (this.cumulativeResults.length > MAX_CUMULATIVE_RESULTS) {
|
|
329
|
+
this.cumulativeResults = this.cumulativeResults.slice(-500);
|
|
330
|
+
}
|
|
331
|
+
const MAX_STORED_SCHEMAS = 50;
|
|
332
|
+
const schemaKeys = Object.keys(this.serializableDataByType[actionType]);
|
|
333
|
+
if (schemaKeys.length > MAX_STORED_SCHEMAS) {
|
|
334
|
+
const sortedKeys = schemaKeys.sort();
|
|
335
|
+
const keysToRemove = sortedKeys.slice(0, schemaKeys.length - MAX_STORED_SCHEMAS);
|
|
336
|
+
keysToRemove.forEach(key => {
|
|
337
|
+
delete this.serializableDataByType[actionType][key];
|
|
338
|
+
});
|
|
339
|
+
}
|
|
499
340
|
if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
|
|
500
341
|
if (!this.pendingDeepExtraction) {
|
|
501
|
-
console.log('DEBUG: Building hierarchical deep extraction plan from scrapeSchema...');
|
|
502
342
|
const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
|
|
503
343
|
if (hierarchyData.length > 0) {
|
|
504
|
-
const nextLevelIndex =
|
|
344
|
+
const nextLevelIndex = 0;
|
|
505
345
|
const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
|
|
506
346
|
this.log(`Root scrapeSchema will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
|
|
507
|
-
// Extract URLs from schema fields
|
|
508
347
|
const urls = yield this.extractHrefsFromPage(page, schema);
|
|
509
348
|
this.log(`scrapeSchema extracted ${urls.length} URLs from field selectors`, logger_1.Level.LOG);
|
|
510
|
-
// Filter URLs against pattern
|
|
511
349
|
const rootUrlMappings = urls
|
|
512
350
|
.map((url, index) => ({
|
|
513
351
|
scrapeListIndex: index,
|
|
@@ -524,9 +362,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
524
362
|
}))
|
|
525
363
|
};
|
|
526
364
|
}
|
|
527
|
-
else {
|
|
528
|
-
console.log('DEBUG: No goto actions found, deep extraction skipped');
|
|
529
|
-
}
|
|
530
365
|
}
|
|
531
366
|
else {
|
|
532
367
|
this.log(`[Deep Extract] scrapeSchema "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
|
|
@@ -585,7 +420,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
585
420
|
this.options.debugChannel.setActionType('scrapeList');
|
|
586
421
|
}
|
|
587
422
|
if (this.options.mode && this.options.mode === 'editor') {
|
|
588
|
-
yield this.options.serializableCallback({});
|
|
423
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({}), 30000, 'serializableCallback (scrapeList editor mode)');
|
|
589
424
|
return;
|
|
590
425
|
}
|
|
591
426
|
try {
|
|
@@ -613,10 +448,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
613
448
|
const paginationUrls = paginationResult.urls;
|
|
614
449
|
if (this.options.robotType === 'deep-extract' && this.initializedWorkflow && scrapeResults.length > 0) {
|
|
615
450
|
if (!this.pendingDeepExtraction) {
|
|
616
|
-
console.log('DEBUG: Building hierarchical deep extraction plan from pagination...');
|
|
617
451
|
const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
|
|
618
452
|
if (hierarchyData.length > 0) {
|
|
619
|
-
const nextLevelIndex =
|
|
453
|
+
const nextLevelIndex = 0;
|
|
620
454
|
const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
|
|
621
455
|
this.log(`Root scrapeList (pagination) will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
|
|
622
456
|
const rootUrlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextLevelGotoPattern);
|
|
@@ -625,6 +459,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
625
459
|
hierarchy: hierarchyData.map((level, idx) => ({
|
|
626
460
|
gotoPattern: level.gotoPattern,
|
|
627
461
|
actionsToExecute: level.actionsToExecute,
|
|
462
|
+
sourceActionName: level.sourceActionName,
|
|
463
|
+
sourceActionType: level.sourceActionType,
|
|
464
|
+
deepExtractionLimit: level.deepExtractionLimit,
|
|
628
465
|
urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
|
|
629
466
|
}))
|
|
630
467
|
};
|
|
@@ -634,12 +471,30 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
634
471
|
this.log(`[Deep Extract] scrapeList (pagination) "${actionName}" extracting URLs`, logger_1.Level.LOG);
|
|
635
472
|
const hierarchy = this.pendingDeepExtraction.hierarchy;
|
|
636
473
|
if (hierarchy && hierarchy.length > 0) {
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
474
|
+
let targetLevelIndex = -1;
|
|
475
|
+
for (let i = hierarchy.length - 1; i >= 0; i--) {
|
|
476
|
+
if (hierarchy[i].urlMappings.length === 0) {
|
|
477
|
+
targetLevelIndex = i;
|
|
478
|
+
break;
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
if (targetLevelIndex >= 0) {
|
|
482
|
+
const nextGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
|
|
640
483
|
this.log(`[Deep Extract] Extracting URLs for pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
|
|
641
484
|
const urlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextGotoPattern);
|
|
642
485
|
this.log(`[Deep Extract] Found ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
|
|
486
|
+
if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
|
|
487
|
+
const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
|
|
488
|
+
const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
|
|
489
|
+
if (newUrls.length > 0) {
|
|
490
|
+
const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
|
|
491
|
+
hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
|
|
492
|
+
this.log(`[Deep Extract] Merged ${newUrls.length} new URLs from pagination`, logger_1.Level.LOG);
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
else {
|
|
496
|
+
hierarchy[targetLevelIndex].urlMappings = urlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url }));
|
|
497
|
+
}
|
|
643
498
|
const validUrls = urlMappings.filter(m => m.url !== null);
|
|
644
499
|
if (validUrls.length > 0) {
|
|
645
500
|
const sampleSize = Math.min(3, validUrls.length);
|
|
@@ -669,17 +524,24 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
669
524
|
this.serializableDataByType[actionType][name] = [];
|
|
670
525
|
}
|
|
671
526
|
this.serializableDataByType[actionType][name].push(...scrapeResults);
|
|
672
|
-
yield this.options.serializableCallback({
|
|
527
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({
|
|
673
528
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
674
529
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
675
|
-
});
|
|
676
|
-
|
|
530
|
+
}), 30000, 'serializableCallback (scrapeList)');
|
|
531
|
+
const MAX_STORED_LISTS = 50;
|
|
532
|
+
const listKeys = Object.keys(this.serializableDataByType[actionType]);
|
|
533
|
+
if (listKeys.length > MAX_STORED_LISTS) {
|
|
534
|
+
const sortedKeys = listKeys.sort();
|
|
535
|
+
const keysToRemove = sortedKeys.slice(0, listKeys.length - MAX_STORED_LISTS);
|
|
536
|
+
keysToRemove.forEach(key => {
|
|
537
|
+
delete this.serializableDataByType[actionType][key];
|
|
538
|
+
});
|
|
539
|
+
}
|
|
677
540
|
if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
|
|
678
541
|
if (!this.pendingDeepExtraction) {
|
|
679
|
-
console.log('DEBUG: Building hierarchical deep extraction plan...');
|
|
680
542
|
const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
|
|
681
543
|
if (hierarchyData.length > 0) {
|
|
682
|
-
const nextLevelIndex =
|
|
544
|
+
const nextLevelIndex = 0;
|
|
683
545
|
const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
|
|
684
546
|
this.log(`Root scrapeList will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
|
|
685
547
|
const rootUrlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextLevelGotoPattern);
|
|
@@ -688,13 +550,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
688
550
|
hierarchy: hierarchyData.map((level, idx) => ({
|
|
689
551
|
gotoPattern: level.gotoPattern,
|
|
690
552
|
actionsToExecute: level.actionsToExecute,
|
|
553
|
+
sourceActionName: level.sourceActionName,
|
|
554
|
+
sourceActionType: level.sourceActionType,
|
|
555
|
+
deepExtractionLimit: level.deepExtractionLimit,
|
|
691
556
|
urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
|
|
692
557
|
}))
|
|
693
558
|
};
|
|
694
559
|
}
|
|
695
|
-
else {
|
|
696
|
-
console.log('DEBUG: No goto actions found, deep extraction skipped');
|
|
697
|
-
}
|
|
698
560
|
}
|
|
699
561
|
else {
|
|
700
562
|
this.log(`[Deep Extract] scrapeList "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
|
|
@@ -767,7 +629,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
767
629
|
const scrapeResults = yield page.evaluate((listSelector) => {
|
|
768
630
|
return window.scrapeListAuto(listSelector);
|
|
769
631
|
}, config.listSelector);
|
|
770
|
-
yield this.options.serializableCallback(scrapeResults);
|
|
632
|
+
yield this.callWithTimeout(() => this.options.serializableCallback(scrapeResults), 30000, 'serializableCallback (scrapeListAuto)');
|
|
771
633
|
}),
|
|
772
634
|
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
|
|
773
635
|
var _a;
|
|
@@ -805,11 +667,622 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
805
667
|
this.emit('flag', page, res);
|
|
806
668
|
});
|
|
807
669
|
}),
|
|
670
|
+
crawl: (crawlConfig) => __awaiter(this, void 0, void 0, function* () {
|
|
671
|
+
var _a;
|
|
672
|
+
if (this.isAborted) {
|
|
673
|
+
this.log('Workflow aborted, stopping crawl', logger_1.Level.WARN);
|
|
674
|
+
return;
|
|
675
|
+
}
|
|
676
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
677
|
+
this.options.debugChannel.setActionType('crawl');
|
|
678
|
+
}
|
|
679
|
+
this.log('Starting crawl operation', logger_1.Level.LOG);
|
|
680
|
+
try {
|
|
681
|
+
// Get current page URL and log it
|
|
682
|
+
const currentUrl = page.url();
|
|
683
|
+
this.log(`Current page URL: ${currentUrl}`, logger_1.Level.LOG);
|
|
684
|
+
// If page is on about:blank or empty, we need to wait for navigation
|
|
685
|
+
if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
|
|
686
|
+
this.log('Page not yet navigated, waiting for navigation...', logger_1.Level.WARN);
|
|
687
|
+
yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
|
|
688
|
+
}
|
|
689
|
+
const baseUrl = page.url();
|
|
690
|
+
this.log(`Using base URL for crawl: ${baseUrl}`, logger_1.Level.LOG);
|
|
691
|
+
const parsedBase = new URL(baseUrl);
|
|
692
|
+
const baseDomain = parsedBase.hostname;
|
|
693
|
+
let discoveredUrls = [];
|
|
694
|
+
// Step 1: Sitemap discovery using XMLHttpRequest to avoid polyfills
|
|
695
|
+
if (crawlConfig.useSitemap) {
|
|
696
|
+
this.log('Fetching sitemap URLs...', logger_1.Level.LOG);
|
|
697
|
+
try {
|
|
698
|
+
const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
|
|
699
|
+
// Use XMLHttpRequest instead of fetch to avoid polyfills
|
|
700
|
+
const sitemapUrls = yield page.evaluate((url) => {
|
|
701
|
+
return new Promise((resolve) => {
|
|
702
|
+
const xhr = new XMLHttpRequest();
|
|
703
|
+
xhr.open('GET', url, true);
|
|
704
|
+
xhr.onload = function () {
|
|
705
|
+
if (xhr.status === 200) {
|
|
706
|
+
const text = xhr.responseText;
|
|
707
|
+
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
|
|
708
|
+
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
|
|
709
|
+
resolve(urls);
|
|
710
|
+
}
|
|
711
|
+
else {
|
|
712
|
+
resolve([]);
|
|
713
|
+
}
|
|
714
|
+
};
|
|
715
|
+
xhr.onerror = function () {
|
|
716
|
+
resolve([]);
|
|
717
|
+
};
|
|
718
|
+
xhr.send();
|
|
719
|
+
});
|
|
720
|
+
}, sitemapUrl);
|
|
721
|
+
if (sitemapUrls.length > 0) {
|
|
722
|
+
const nestedSitemaps = sitemapUrls.filter(url => url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/'));
|
|
723
|
+
const regularUrls = sitemapUrls.filter(url => !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/'));
|
|
724
|
+
discoveredUrls.push(...regularUrls);
|
|
725
|
+
this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, logger_1.Level.LOG);
|
|
726
|
+
for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
|
|
727
|
+
try {
|
|
728
|
+
this.log(`Fetching nested sitemap: ${nestedUrl}`, logger_1.Level.LOG);
|
|
729
|
+
const nestedUrls = yield page.evaluate((url) => {
|
|
730
|
+
return new Promise((resolve) => {
|
|
731
|
+
const xhr = new XMLHttpRequest();
|
|
732
|
+
xhr.open('GET', url, true);
|
|
733
|
+
xhr.onload = function () {
|
|
734
|
+
if (xhr.status === 200) {
|
|
735
|
+
const text = xhr.responseText;
|
|
736
|
+
const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
|
|
737
|
+
const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
|
|
738
|
+
resolve(urls);
|
|
739
|
+
}
|
|
740
|
+
else {
|
|
741
|
+
resolve([]);
|
|
742
|
+
}
|
|
743
|
+
};
|
|
744
|
+
xhr.onerror = function () {
|
|
745
|
+
resolve([]);
|
|
746
|
+
};
|
|
747
|
+
xhr.send();
|
|
748
|
+
});
|
|
749
|
+
}, nestedUrl);
|
|
750
|
+
if (nestedUrls.length > 0) {
|
|
751
|
+
discoveredUrls.push(...nestedUrls);
|
|
752
|
+
this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
catch (error) {
|
|
756
|
+
this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, logger_1.Level.WARN);
|
|
757
|
+
}
|
|
758
|
+
}
|
|
759
|
+
this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, logger_1.Level.LOG);
|
|
760
|
+
}
|
|
761
|
+
else {
|
|
762
|
+
this.log('No URLs found in sitemap or sitemap not available', logger_1.Level.WARN);
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
catch (error) {
|
|
766
|
+
this.log(`Sitemap fetch failed: ${error.message}`, logger_1.Level.WARN);
|
|
767
|
+
}
|
|
768
|
+
}
|
|
769
|
+
if (crawlConfig.followLinks) {
|
|
770
|
+
this.log('Extracting links from current page...', logger_1.Level.LOG);
|
|
771
|
+
try {
|
|
772
|
+
yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
|
|
773
|
+
yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
|
|
774
|
+
this.log('Network did not become idle, continuing anyway', logger_1.Level.WARN);
|
|
775
|
+
});
|
|
776
|
+
yield new Promise(resolve => setTimeout(resolve, 5000));
|
|
777
|
+
const anchorCount = yield page.evaluate(() => {
|
|
778
|
+
return document.querySelectorAll('a').length;
|
|
779
|
+
});
|
|
780
|
+
this.log(`Page has ${anchorCount} total anchor tags`, logger_1.Level.LOG);
|
|
781
|
+
const pageLinks = yield page.evaluate(() => {
|
|
782
|
+
const links = [];
|
|
783
|
+
const allAnchors = document.querySelectorAll('a');
|
|
784
|
+
console.log('Total anchors found:', allAnchors.length);
|
|
785
|
+
for (let i = 0; i < allAnchors.length; i++) {
|
|
786
|
+
const anchor = allAnchors[i];
|
|
787
|
+
const href = anchor.getAttribute('href');
|
|
788
|
+
const fullHref = anchor.href;
|
|
789
|
+
if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
|
|
790
|
+
links.push(fullHref);
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
console.log('Links extracted:', links.length);
|
|
794
|
+
return links;
|
|
795
|
+
});
|
|
796
|
+
discoveredUrls.push(...pageLinks);
|
|
797
|
+
this.log(`Found ${pageLinks.length} links from page`, logger_1.Level.LOG);
|
|
798
|
+
}
|
|
799
|
+
catch (error) {
|
|
800
|
+
this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
const filteredUrls = discoveredUrls.filter(url => {
|
|
804
|
+
try {
|
|
805
|
+
const urlObj = new URL(url);
|
|
806
|
+
if (crawlConfig.mode === 'domain') {
|
|
807
|
+
if (urlObj.hostname !== baseDomain)
|
|
808
|
+
return false;
|
|
809
|
+
}
|
|
810
|
+
else if (crawlConfig.mode === 'subdomain') {
|
|
811
|
+
if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
|
|
812
|
+
return false;
|
|
813
|
+
}
|
|
814
|
+
else if (crawlConfig.mode === 'path') {
|
|
815
|
+
if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
|
|
816
|
+
return false;
|
|
817
|
+
}
|
|
818
|
+
if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
|
|
819
|
+
const matches = crawlConfig.includePaths.some(pattern => {
|
|
820
|
+
const regex = new RegExp(pattern);
|
|
821
|
+
return regex.test(url);
|
|
822
|
+
});
|
|
823
|
+
if (!matches)
|
|
824
|
+
return false;
|
|
825
|
+
}
|
|
826
|
+
if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
|
|
827
|
+
const matches = crawlConfig.excludePaths.some(pattern => {
|
|
828
|
+
const regex = new RegExp(pattern);
|
|
829
|
+
return regex.test(url);
|
|
830
|
+
});
|
|
831
|
+
if (matches)
|
|
832
|
+
return false;
|
|
833
|
+
}
|
|
834
|
+
return true;
|
|
835
|
+
}
|
|
836
|
+
catch (error) {
|
|
837
|
+
return false;
|
|
838
|
+
}
|
|
839
|
+
});
|
|
840
|
+
const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
|
|
841
|
+
return url.replace(/#.*$/, '').replace(/\/$/, '');
|
|
842
|
+
})));
|
|
843
|
+
const basePathname = parsedBase.pathname;
|
|
844
|
+
const prioritizedUrls = uniqueUrls.sort((a, b) => {
|
|
845
|
+
try {
|
|
846
|
+
const aUrl = new URL(a);
|
|
847
|
+
const bUrl = new URL(b);
|
|
848
|
+
const aMatchesBase = aUrl.pathname.startsWith(basePathname);
|
|
849
|
+
const bMatchesBase = bUrl.pathname.startsWith(basePathname);
|
|
850
|
+
if (aMatchesBase && !bMatchesBase)
|
|
851
|
+
return -1;
|
|
852
|
+
if (!aMatchesBase && bMatchesBase)
|
|
853
|
+
return 1;
|
|
854
|
+
return 0;
|
|
855
|
+
}
|
|
856
|
+
catch (error) {
|
|
857
|
+
return 0;
|
|
858
|
+
}
|
|
859
|
+
});
|
|
860
|
+
const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
|
|
861
|
+
this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, logger_1.Level.LOG);
|
|
862
|
+
this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, logger_1.Level.LOG);
|
|
863
|
+
const crawlResults = [];
|
|
864
|
+
for (let i = 0; i < finalUrls.length; i++) {
|
|
865
|
+
const url = finalUrls[i];
|
|
866
|
+
try {
|
|
867
|
+
this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, logger_1.Level.LOG);
|
|
868
|
+
yield page.goto(url, {
|
|
869
|
+
waitUntil: 'domcontentloaded',
|
|
870
|
+
timeout: 30000
|
|
871
|
+
}).catch(() => {
|
|
872
|
+
this.log(`Failed to navigate to ${url}, skipping...`, logger_1.Level.WARN);
|
|
873
|
+
});
|
|
874
|
+
yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
|
|
875
|
+
const pageData = yield page.evaluate(() => {
|
|
876
|
+
var _a, _b;
|
|
877
|
+
const getMeta = (name) => {
|
|
878
|
+
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
|
879
|
+
return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
|
|
880
|
+
};
|
|
881
|
+
const getAllMeta = () => {
|
|
882
|
+
const metadata = {};
|
|
883
|
+
const metaTags = document.querySelectorAll('meta');
|
|
884
|
+
metaTags.forEach(tag => {
|
|
885
|
+
const name = tag.getAttribute('name') || tag.getAttribute('property');
|
|
886
|
+
const content = tag.getAttribute('content');
|
|
887
|
+
if (name && content) {
|
|
888
|
+
metadata[name] = content;
|
|
889
|
+
}
|
|
890
|
+
});
|
|
891
|
+
return metadata;
|
|
892
|
+
};
|
|
893
|
+
const title = document.title || '';
|
|
894
|
+
const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
|
|
895
|
+
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
|
|
896
|
+
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
|
897
|
+
const html = document.documentElement.outerHTML;
|
|
898
|
+
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
|
|
899
|
+
const allMetadata = getAllMeta();
|
|
900
|
+
return {
|
|
901
|
+
title,
|
|
902
|
+
description: getMeta('description'),
|
|
903
|
+
text: bodyText,
|
|
904
|
+
html: html,
|
|
905
|
+
links: links,
|
|
906
|
+
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
|
|
907
|
+
metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
|
|
908
|
+
};
|
|
909
|
+
});
|
|
910
|
+
crawlResults.push({
|
|
911
|
+
metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
|
|
912
|
+
html: pageData.html,
|
|
913
|
+
text: pageData.text,
|
|
914
|
+
links: pageData.links,
|
|
915
|
+
wordCount: pageData.wordCount,
|
|
916
|
+
scrapedAt: new Date().toISOString()
|
|
917
|
+
});
|
|
918
|
+
this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
|
|
919
|
+
}
|
|
920
|
+
catch (error) {
|
|
921
|
+
this.log(`Failed to scrape ${url}: ${error.message}`, logger_1.Level.WARN);
|
|
922
|
+
crawlResults.push({
|
|
923
|
+
url: url,
|
|
924
|
+
error: error.message,
|
|
925
|
+
scrapedAt: new Date().toISOString()
|
|
926
|
+
});
|
|
927
|
+
}
|
|
928
|
+
}
|
|
929
|
+
this.log(`Successfully scraped ${crawlResults.length} pages`, logger_1.Level.LOG);
|
|
930
|
+
const actionType = "crawl";
|
|
931
|
+
const actionName = "Crawl Results";
|
|
932
|
+
if (!this.serializableDataByType[actionType]) {
|
|
933
|
+
this.serializableDataByType[actionType] = {};
|
|
934
|
+
}
|
|
935
|
+
if (!this.serializableDataByType[actionType][actionName]) {
|
|
936
|
+
this.serializableDataByType[actionType][actionName] = [];
|
|
937
|
+
}
|
|
938
|
+
this.serializableDataByType[actionType][actionName] = crawlResults;
|
|
939
|
+
yield this.options.serializableCallback({
|
|
940
|
+
scrapeList: this.serializableDataByType.scrapeList || {},
|
|
941
|
+
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
|
|
942
|
+
crawl: this.serializableDataByType.crawl || {}
|
|
943
|
+
});
|
|
944
|
+
}
|
|
945
|
+
catch (error) {
|
|
946
|
+
this.log(`Crawl action failed: ${error.message}`, logger_1.Level.ERROR);
|
|
947
|
+
throw new Error(`Crawl execution error: ${error.message}`);
|
|
948
|
+
}
|
|
949
|
+
}),
|
|
950
|
+
search: (searchConfig) => __awaiter(this, void 0, void 0, function* () {
|
|
951
|
+
var _a, _b;
|
|
952
|
+
if (this.isAborted) {
|
|
953
|
+
this.log('Workflow aborted, stopping search', logger_1.Level.WARN);
|
|
954
|
+
return;
|
|
955
|
+
}
|
|
956
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
957
|
+
this.options.debugChannel.setActionType('search');
|
|
958
|
+
}
|
|
959
|
+
searchConfig.provider = 'duckduckgo';
|
|
960
|
+
this.log(`Performing DuckDuckGo search for: ${searchConfig.query}`, logger_1.Level.LOG);
|
|
961
|
+
try {
|
|
962
|
+
let searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(searchConfig.query)}`;
|
|
963
|
+
if ((_b = searchConfig.filters) === null || _b === void 0 ? void 0 : _b.timeRange) {
|
|
964
|
+
const timeMap = {
|
|
965
|
+
'day': 'd',
|
|
966
|
+
'week': 'w',
|
|
967
|
+
'month': 'm',
|
|
968
|
+
'year': 'y'
|
|
969
|
+
};
|
|
970
|
+
searchUrl += `&df=${timeMap[searchConfig.filters.timeRange]}`;
|
|
971
|
+
}
|
|
972
|
+
const initialDelay = 500 + Math.random() * 1000;
|
|
973
|
+
yield new Promise(resolve => setTimeout(resolve, initialDelay));
|
|
974
|
+
yield page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
|
975
|
+
yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
|
|
976
|
+
this.log('Load state timeout, continuing anyway', logger_1.Level.WARN);
|
|
977
|
+
});
|
|
978
|
+
const pageLoadDelay = 2000 + Math.random() * 1500;
|
|
979
|
+
yield new Promise(resolve => setTimeout(resolve, pageLoadDelay));
|
|
980
|
+
let searchResults = [];
|
|
981
|
+
let retryCount = 0;
|
|
982
|
+
const maxRetries = 2;
|
|
983
|
+
while (searchResults.length === 0 && retryCount <= maxRetries) {
|
|
984
|
+
if (retryCount > 0) {
|
|
985
|
+
this.log(`Retry attempt ${retryCount}/${maxRetries} for DuckDuckGo search...`, logger_1.Level.LOG);
|
|
986
|
+
const retryDelay = 1000 * Math.pow(2, retryCount) + Math.random() * 1000;
|
|
987
|
+
yield new Promise(resolve => setTimeout(resolve, retryDelay));
|
|
988
|
+
}
|
|
989
|
+
this.log('Attempting to extract DuckDuckGo search results...', logger_1.Level.LOG);
|
|
990
|
+
yield page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
|
|
991
|
+
this.log('DuckDuckGo results not found on initial wait', logger_1.Level.WARN);
|
|
992
|
+
});
|
|
993
|
+
let currentResultCount = 0;
|
|
994
|
+
const maxLoadAttempts = Math.ceil(searchConfig.limit / 10) * 2;
|
|
995
|
+
let loadAttempts = 0;
|
|
996
|
+
let noNewResultsCount = 0;
|
|
997
|
+
while (currentResultCount < searchConfig.limit && loadAttempts < maxLoadAttempts && noNewResultsCount < 3) {
|
|
998
|
+
const previousCount = currentResultCount;
|
|
999
|
+
currentResultCount = yield page.evaluate(() => {
|
|
1000
|
+
const selectors = [
|
|
1001
|
+
'[data-testid="result"]',
|
|
1002
|
+
'article[data-testid="result"]',
|
|
1003
|
+
'li[data-layout="organic"]',
|
|
1004
|
+
'.result',
|
|
1005
|
+
'article[data-testid]'
|
|
1006
|
+
];
|
|
1007
|
+
for (const selector of selectors) {
|
|
1008
|
+
const elements = document.querySelectorAll(selector);
|
|
1009
|
+
if (elements.length > 0) {
|
|
1010
|
+
return elements.length;
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
return 0;
|
|
1014
|
+
});
|
|
1015
|
+
if (currentResultCount >= searchConfig.limit) {
|
|
1016
|
+
this.log(`Reached desired result count: ${currentResultCount}`, logger_1.Level.LOG);
|
|
1017
|
+
break;
|
|
1018
|
+
}
|
|
1019
|
+
if (currentResultCount === previousCount) {
|
|
1020
|
+
noNewResultsCount++;
|
|
1021
|
+
this.log(`No new results after load more (attempt ${noNewResultsCount}/3)`, logger_1.Level.WARN);
|
|
1022
|
+
if (noNewResultsCount >= 3)
|
|
1023
|
+
break;
|
|
1024
|
+
}
|
|
1025
|
+
else {
|
|
1026
|
+
noNewResultsCount = 0;
|
|
1027
|
+
this.log(`Current results count: ${currentResultCount}/${searchConfig.limit}`, logger_1.Level.LOG);
|
|
1028
|
+
}
|
|
1029
|
+
yield page.evaluate(() => {
|
|
1030
|
+
window.scrollTo(0, document.body.scrollHeight);
|
|
1031
|
+
});
|
|
1032
|
+
yield new Promise(resolve => setTimeout(resolve, 800));
|
|
1033
|
+
const loadMoreClicked = yield page.evaluate(() => {
|
|
1034
|
+
const selectors = [
|
|
1035
|
+
'#more-results',
|
|
1036
|
+
'button:has-text("More results")',
|
|
1037
|
+
'button:has-text("more results")',
|
|
1038
|
+
'button[id*="more"]',
|
|
1039
|
+
'button:has-text("Load more")'
|
|
1040
|
+
];
|
|
1041
|
+
for (const selector of selectors) {
|
|
1042
|
+
try {
|
|
1043
|
+
const button = document.querySelector(selector);
|
|
1044
|
+
if (button && button.offsetParent !== null) {
|
|
1045
|
+
button.click();
|
|
1046
|
+
console.log(`Clicked load more button with selector: ${selector}`);
|
|
1047
|
+
return true;
|
|
1048
|
+
}
|
|
1049
|
+
}
|
|
1050
|
+
catch (e) {
|
|
1051
|
+
continue;
|
|
1052
|
+
}
|
|
1053
|
+
}
|
|
1054
|
+
return false;
|
|
1055
|
+
});
|
|
1056
|
+
if (loadMoreClicked) {
|
|
1057
|
+
this.log('Clicked "More results" button', logger_1.Level.LOG);
|
|
1058
|
+
yield new Promise(resolve => setTimeout(resolve, 1500 + Math.random() * 1000));
|
|
1059
|
+
}
|
|
1060
|
+
else {
|
|
1061
|
+
this.log('No "More results" button found, results may be limited', logger_1.Level.WARN);
|
|
1062
|
+
break;
|
|
1063
|
+
}
|
|
1064
|
+
loadAttempts++;
|
|
1065
|
+
}
|
|
1066
|
+
this.log(`Finished pagination. Total results available: ${currentResultCount}`, logger_1.Level.LOG);
|
|
1067
|
+
searchResults = yield page.evaluate((limit) => {
|
|
1068
|
+
const results = [];
|
|
1069
|
+
const cleanDescription = (text) => {
|
|
1070
|
+
if (!text)
|
|
1071
|
+
return '';
|
|
1072
|
+
let cleaned = text.replace(/^\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago\s*/i, '');
|
|
1073
|
+
cleaned = cleaned.replace(/^[A-Z][a-z]{2}\s+\d{1,2},?\s+\d{4}\s*[—\-]\s*/i, '');
|
|
1074
|
+
cleaned = cleaned.replace(/^\d{4}-\d{2}-\d{2}\s*[—\-]\s*/i, '');
|
|
1075
|
+
cleaned = cleaned.trim().replace(/\s+/g, ' ');
|
|
1076
|
+
return cleaned;
|
|
1077
|
+
};
|
|
1078
|
+
const selectors = [
|
|
1079
|
+
'[data-testid="result"]',
|
|
1080
|
+
'article[data-testid="result"]',
|
|
1081
|
+
'li[data-layout="organic"]',
|
|
1082
|
+
'.result',
|
|
1083
|
+
'article[data-testid]'
|
|
1084
|
+
];
|
|
1085
|
+
let allElements = [];
|
|
1086
|
+
for (const selector of selectors) {
|
|
1087
|
+
const elements = Array.from(document.querySelectorAll(selector));
|
|
1088
|
+
if (elements.length > 0) {
|
|
1089
|
+
console.log(`Found ${elements.length} DDG elements with: ${selector}`);
|
|
1090
|
+
allElements = elements;
|
|
1091
|
+
break;
|
|
1092
|
+
}
|
|
1093
|
+
}
|
|
1094
|
+
for (let i = 0; i < Math.min(allElements.length, limit); i++) {
|
|
1095
|
+
const element = allElements[i];
|
|
1096
|
+
const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');
|
|
1097
|
+
let linkEl = titleEl === null || titleEl === void 0 ? void 0 : titleEl.querySelector('a[href]');
|
|
1098
|
+
if (!linkEl) {
|
|
1099
|
+
linkEl = element.querySelector('a[href]');
|
|
1100
|
+
}
|
|
1101
|
+
if (!linkEl || !linkEl.href)
|
|
1102
|
+
continue;
|
|
1103
|
+
let actualUrl = linkEl.href;
|
|
1104
|
+
if (actualUrl.includes('uddg=')) {
|
|
1105
|
+
try {
|
|
1106
|
+
const urlParams = new URLSearchParams(actualUrl.split('?')[1]);
|
|
1107
|
+
const uddgUrl = urlParams.get('uddg');
|
|
1108
|
+
if (uddgUrl) {
|
|
1109
|
+
actualUrl = decodeURIComponent(uddgUrl);
|
|
1110
|
+
}
|
|
1111
|
+
}
|
|
1112
|
+
catch (e) {
|
|
1113
|
+
console.log('Failed to parse uddg parameter:', e);
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
if (actualUrl.includes('duckduckgo.com')) {
|
|
1117
|
+
console.log(`Skipping DDG internal URL: ${actualUrl}`);
|
|
1118
|
+
continue;
|
|
1119
|
+
}
|
|
1120
|
+
const descEl = element.querySelector('[data-result="snippet"], .result__snippet, [data-testid="result-snippet"]');
|
|
1121
|
+
if (titleEl && titleEl.textContent && actualUrl) {
|
|
1122
|
+
const rawDescription = ((descEl === null || descEl === void 0 ? void 0 : descEl.textContent) || '').trim();
|
|
1123
|
+
const cleanedDescription = cleanDescription(rawDescription);
|
|
1124
|
+
results.push({
|
|
1125
|
+
url: actualUrl,
|
|
1126
|
+
title: titleEl.textContent.trim(),
|
|
1127
|
+
description: cleanedDescription,
|
|
1128
|
+
position: results.length + 1
|
|
1129
|
+
});
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
1132
|
+
console.log(`Extracted ${results.length} DuckDuckGo search results`);
|
|
1133
|
+
return results;
|
|
1134
|
+
}, searchConfig.limit);
|
|
1135
|
+
if (searchResults.length === 0) {
|
|
1136
|
+
this.log(`No DuckDuckGo results found (attempt ${retryCount + 1}/${maxRetries + 1})`, logger_1.Level.WARN);
|
|
1137
|
+
retryCount++;
|
|
1138
|
+
}
|
|
1139
|
+
else {
|
|
1140
|
+
this.log(`Successfully extracted ${searchResults.length} results`, logger_1.Level.LOG);
|
|
1141
|
+
break;
|
|
1142
|
+
}
|
|
1143
|
+
}
|
|
1144
|
+
this.log(`Search found ${searchResults.length} results`, logger_1.Level.LOG);
|
|
1145
|
+
if (searchConfig.mode === 'discover') {
|
|
1146
|
+
const actionType = "search";
|
|
1147
|
+
const actionName = "Search Results";
|
|
1148
|
+
if (!this.serializableDataByType[actionType]) {
|
|
1149
|
+
this.serializableDataByType[actionType] = {};
|
|
1150
|
+
}
|
|
1151
|
+
if (!this.serializableDataByType[actionType][actionName]) {
|
|
1152
|
+
this.serializableDataByType[actionType][actionName] = {};
|
|
1153
|
+
}
|
|
1154
|
+
const searchData = {
|
|
1155
|
+
query: searchConfig.query,
|
|
1156
|
+
provider: searchConfig.provider,
|
|
1157
|
+
filters: searchConfig.filters || {},
|
|
1158
|
+
resultsCount: searchResults.length,
|
|
1159
|
+
results: searchResults,
|
|
1160
|
+
searchedAt: new Date().toISOString()
|
|
1161
|
+
};
|
|
1162
|
+
this.serializableDataByType[actionType][actionName] = searchData;
|
|
1163
|
+
yield this.options.serializableCallback({
|
|
1164
|
+
scrapeList: this.serializableDataByType.scrapeList || {},
|
|
1165
|
+
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
|
|
1166
|
+
crawl: this.serializableDataByType.crawl || {},
|
|
1167
|
+
search: this.serializableDataByType.search || {}
|
|
1168
|
+
});
|
|
1169
|
+
this.log(`Search completed in discover mode with ${searchResults.length} results`, logger_1.Level.LOG);
|
|
1170
|
+
return;
|
|
1171
|
+
}
|
|
1172
|
+
this.log(`Starting to scrape content from ${searchResults.length} search results...`, logger_1.Level.LOG);
|
|
1173
|
+
const scrapedResults = [];
|
|
1174
|
+
for (let i = 0; i < searchResults.length; i++) {
|
|
1175
|
+
const result = searchResults[i];
|
|
1176
|
+
try {
|
|
1177
|
+
this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
|
|
1178
|
+
yield page.goto(result.url, {
|
|
1179
|
+
waitUntil: 'domcontentloaded',
|
|
1180
|
+
timeout: 30000
|
|
1181
|
+
}).catch(() => {
|
|
1182
|
+
this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
|
|
1183
|
+
});
|
|
1184
|
+
yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
|
|
1185
|
+
const pageData = yield page.evaluate(() => {
|
|
1186
|
+
var _a, _b;
|
|
1187
|
+
const getMeta = (name) => {
|
|
1188
|
+
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
|
1189
|
+
return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
|
|
1190
|
+
};
|
|
1191
|
+
const getAllMeta = () => {
|
|
1192
|
+
const metadata = {};
|
|
1193
|
+
const metaTags = document.querySelectorAll('meta');
|
|
1194
|
+
metaTags.forEach(tag => {
|
|
1195
|
+
const name = tag.getAttribute('name') || tag.getAttribute('property');
|
|
1196
|
+
const content = tag.getAttribute('content');
|
|
1197
|
+
if (name && content) {
|
|
1198
|
+
metadata[name] = content;
|
|
1199
|
+
}
|
|
1200
|
+
});
|
|
1201
|
+
return metadata;
|
|
1202
|
+
};
|
|
1203
|
+
const title = document.title || '';
|
|
1204
|
+
const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
|
|
1205
|
+
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
|
|
1206
|
+
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
|
1207
|
+
const html = document.documentElement.outerHTML;
|
|
1208
|
+
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
|
|
1209
|
+
const allMetadata = getAllMeta();
|
|
1210
|
+
return {
|
|
1211
|
+
title,
|
|
1212
|
+
description: getMeta('description'),
|
|
1213
|
+
text: bodyText,
|
|
1214
|
+
html: html,
|
|
1215
|
+
links: links,
|
|
1216
|
+
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
|
|
1217
|
+
metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
|
|
1218
|
+
};
|
|
1219
|
+
});
|
|
1220
|
+
scrapedResults.push({
|
|
1221
|
+
searchResult: {
|
|
1222
|
+
query: searchConfig.query,
|
|
1223
|
+
position: result.position,
|
|
1224
|
+
searchTitle: result.title,
|
|
1225
|
+
searchDescription: result.description,
|
|
1226
|
+
},
|
|
1227
|
+
metadata: Object.assign(Object.assign({}, pageData.metadata), { url: result.url, sourceURL: result.url }),
|
|
1228
|
+
html: pageData.html,
|
|
1229
|
+
text: pageData.text,
|
|
1230
|
+
links: pageData.links,
|
|
1231
|
+
wordCount: pageData.wordCount,
|
|
1232
|
+
scrapedAt: new Date().toISOString()
|
|
1233
|
+
});
|
|
1234
|
+
this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
|
|
1235
|
+
}
|
|
1236
|
+
catch (error) {
|
|
1237
|
+
this.log(`Failed to scrape ${result.url}: ${error.message}`, logger_1.Level.WARN);
|
|
1238
|
+
scrapedResults.push({
|
|
1239
|
+
searchResult: {
|
|
1240
|
+
query: searchConfig.query,
|
|
1241
|
+
position: result.position,
|
|
1242
|
+
searchTitle: result.title,
|
|
1243
|
+
searchDescription: result.description,
|
|
1244
|
+
},
|
|
1245
|
+
url: result.url,
|
|
1246
|
+
error: error.message,
|
|
1247
|
+
scrapedAt: new Date().toISOString()
|
|
1248
|
+
});
|
|
1249
|
+
}
|
|
1250
|
+
}
|
|
1251
|
+
this.log(`Successfully scraped ${scrapedResults.length} search results`, logger_1.Level.LOG);
|
|
1252
|
+
const actionType = "search";
|
|
1253
|
+
const actionName = "Search Results";
|
|
1254
|
+
if (!this.serializableDataByType[actionType]) {
|
|
1255
|
+
this.serializableDataByType[actionType] = {};
|
|
1256
|
+
}
|
|
1257
|
+
if (!this.serializableDataByType[actionType][actionName]) {
|
|
1258
|
+
this.serializableDataByType[actionType][actionName] = {};
|
|
1259
|
+
}
|
|
1260
|
+
const searchData = {
|
|
1261
|
+
query: searchConfig.query,
|
|
1262
|
+
provider: searchConfig.provider,
|
|
1263
|
+
filters: searchConfig.filters || {},
|
|
1264
|
+
mode: searchConfig.mode,
|
|
1265
|
+
resultsCount: scrapedResults.length,
|
|
1266
|
+
results: scrapedResults,
|
|
1267
|
+
searchedAt: new Date().toISOString()
|
|
1268
|
+
};
|
|
1269
|
+
this.serializableDataByType[actionType][actionName] = searchData;
|
|
1270
|
+
yield this.options.serializableCallback({
|
|
1271
|
+
scrapeList: this.serializableDataByType.scrapeList || {},
|
|
1272
|
+
scrapeSchema: this.serializableDataByType.scrapeSchema || {},
|
|
1273
|
+
crawl: this.serializableDataByType.crawl || {},
|
|
1274
|
+
search: this.serializableDataByType.search || {}
|
|
1275
|
+
});
|
|
1276
|
+
}
|
|
1277
|
+
catch (error) {
|
|
1278
|
+
this.log(`Search action failed: ${error.message}`, logger_1.Level.ERROR);
|
|
1279
|
+
throw new Error(`Search execution error: ${error.message}`);
|
|
1280
|
+
}
|
|
1281
|
+
}),
|
|
808
1282
|
};
|
|
809
1283
|
const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
|
|
810
1284
|
console.log("Executing action:", methodName, args);
|
|
811
1285
|
if (methodName === 'press' || methodName === 'type') {
|
|
812
|
-
// Extract only the first two arguments for these methods
|
|
813
1286
|
const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
|
|
814
1287
|
yield invokee[methodName](...limitedArgs);
|
|
815
1288
|
return;
|
|
@@ -822,7 +1295,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
822
1295
|
}
|
|
823
1296
|
});
|
|
824
1297
|
for (const step of steps) {
|
|
825
|
-
// Check abort flag before each step
|
|
826
1298
|
if (this.isAborted) {
|
|
827
1299
|
this.log('Workflow aborted during step execution', logger_1.Level.WARN);
|
|
828
1300
|
return;
|
|
@@ -844,7 +1316,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
844
1316
|
}
|
|
845
1317
|
try {
|
|
846
1318
|
if (step.action in wawActions) {
|
|
847
|
-
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
|
|
848
1319
|
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
|
|
849
1320
|
if (step.action === 'screenshot') {
|
|
850
1321
|
yield wawActions.screenshot(...(params !== null && params !== void 0 ? params : []), stepName !== null && stepName !== void 0 ? stepName : undefined);
|
|
@@ -861,7 +1332,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
861
1332
|
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
862
1333
|
this.options.debugChannel.setActionType(String(step.action));
|
|
863
1334
|
}
|
|
864
|
-
// Implements the dot notation for the "method name" in the workflow
|
|
865
1335
|
const levels = String(step.action).split('.');
|
|
866
1336
|
const methodName = levels[levels.length - 1];
|
|
867
1337
|
let invokee = page;
|
|
@@ -870,7 +1340,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
870
1340
|
}
|
|
871
1341
|
if (methodName === 'waitForLoadState') {
|
|
872
1342
|
try {
|
|
873
|
-
// Add timeout if not already specified
|
|
874
1343
|
let args = step.args;
|
|
875
1344
|
if (Array.isArray(args) && args.length === 1) {
|
|
876
1345
|
args = [args[0], { timeout: 30000 }];
|
|
@@ -894,7 +1363,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
894
1363
|
}
|
|
895
1364
|
catch (error) {
|
|
896
1365
|
this.log(`Click action failed for selector ${(_b = step.args) === null || _b === void 0 ? void 0 : _b[0]}: ${error.message}`, logger_1.Level.WARN);
|
|
897
|
-
continue;
|
|
1366
|
+
continue;
|
|
898
1367
|
}
|
|
899
1368
|
}
|
|
900
1369
|
}
|
|
@@ -905,7 +1374,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
905
1374
|
}
|
|
906
1375
|
catch (error) {
|
|
907
1376
|
this.log(`Action ${String(step.action)} failed: ${error.message}`, logger_1.Level.WARN);
|
|
908
|
-
// Continue to next action instead of breaking
|
|
909
1377
|
continue;
|
|
910
1378
|
}
|
|
911
1379
|
yield new Promise((res) => { setTimeout(res, 500); });
|
|
@@ -920,12 +1388,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
920
1388
|
}
|
|
921
1389
|
const actionType = "scrapeList";
|
|
922
1390
|
let actionName = providedActionName || "";
|
|
923
|
-
// During deep extraction, ALWAYS auto-increment to create separate lists for each URL
|
|
924
1391
|
if (!actionName || actionName.trim() === "" || this.isInDeepExtractionPhase) {
|
|
925
1392
|
this.scrapeListCounter++;
|
|
926
1393
|
actionName = `List ${this.scrapeListCounter}`;
|
|
927
1394
|
}
|
|
928
|
-
// Initialize storage for this action
|
|
929
1395
|
if (!this.serializableDataByType[actionType]) {
|
|
930
1396
|
this.serializableDataByType[actionType] = {};
|
|
931
1397
|
}
|
|
@@ -933,23 +1399,21 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
933
1399
|
this.serializableDataByType[actionType][actionName] = [];
|
|
934
1400
|
}
|
|
935
1401
|
let allResults = [];
|
|
936
|
-
let allUrls = [];
|
|
1402
|
+
let allUrls = [];
|
|
937
1403
|
let previousHeight = 0;
|
|
938
1404
|
let scrapedItems = new Set();
|
|
939
1405
|
let visitedUrls = new Set();
|
|
940
1406
|
const MAX_RETRIES = 3;
|
|
941
|
-
const RETRY_DELAY = 1000;
|
|
1407
|
+
const RETRY_DELAY = 1000;
|
|
942
1408
|
const MAX_UNCHANGED_RESULTS = 5;
|
|
943
1409
|
const debugLog = (message, ...args) => {
|
|
944
1410
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
945
1411
|
};
|
|
946
1412
|
const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
947
|
-
// Check abort flag before scraping current page
|
|
948
1413
|
if (this.isAborted) {
|
|
949
1414
|
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
|
950
1415
|
return;
|
|
951
1416
|
}
|
|
952
|
-
// Add timeout to prevent hanging on page evaluation
|
|
953
1417
|
const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
954
1418
|
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Page evaluation timeout')), 10000));
|
|
955
1419
|
let results;
|
|
@@ -960,10 +1424,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
960
1424
|
debugLog(`Page evaluation failed: ${error.message}`);
|
|
961
1425
|
return;
|
|
962
1426
|
}
|
|
963
|
-
// Extract URLs for ALL items BEFORE filtering duplicates
|
|
964
|
-
// This ensures URL indices match result indices
|
|
965
1427
|
const allItemUrls = yield this.extractUrlsFromCurrentPage(page, config.listSelector, results.length);
|
|
966
|
-
// Filter results AND URLs together using the same uniqueness logic
|
|
967
1428
|
const newResults = [];
|
|
968
1429
|
const newUrls = [];
|
|
969
1430
|
results.forEach((item, index) => {
|
|
@@ -971,28 +1432,35 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
971
1432
|
if (!scrapedItems.has(uniqueKey)) {
|
|
972
1433
|
scrapedItems.add(uniqueKey);
|
|
973
1434
|
newResults.push(item);
|
|
974
|
-
newUrls.push(allItemUrls[index] || []);
|
|
1435
|
+
newUrls.push(allItemUrls[index] || []);
|
|
975
1436
|
}
|
|
976
1437
|
});
|
|
977
1438
|
allResults = allResults.concat(newResults);
|
|
978
1439
|
allUrls = allUrls.concat(newUrls);
|
|
979
1440
|
debugLog("Results collected:", allResults.length);
|
|
980
|
-
// Store in serializableDataByType and send structured callback
|
|
981
1441
|
this.serializableDataByType[actionType][actionName] = [...allResults];
|
|
982
|
-
yield this.options.serializableCallback({
|
|
1442
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({
|
|
983
1443
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
984
1444
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
985
|
-
});
|
|
1445
|
+
}), 30000, 'serializableCallback (pagination scrapeList)');
|
|
1446
|
+
const MAX_STORED_LISTS = 50;
|
|
1447
|
+
const listKeys = Object.keys(this.serializableDataByType[actionType]);
|
|
1448
|
+
if (listKeys.length > MAX_STORED_LISTS) {
|
|
1449
|
+
const sortedKeys = listKeys.sort();
|
|
1450
|
+
const keysToRemove = sortedKeys.slice(0, listKeys.length - MAX_STORED_LISTS);
|
|
1451
|
+
keysToRemove.forEach(key => {
|
|
1452
|
+
delete this.serializableDataByType[actionType][key];
|
|
1453
|
+
});
|
|
1454
|
+
}
|
|
986
1455
|
});
|
|
987
1456
|
const checkLimit = () => {
|
|
988
1457
|
if (config.limit && allResults.length >= config.limit) {
|
|
989
1458
|
allResults = allResults.slice(0, config.limit);
|
|
990
|
-
allUrls = allUrls.slice(0, config.limit);
|
|
1459
|
+
allUrls = allUrls.slice(0, config.limit);
|
|
991
1460
|
return true;
|
|
992
1461
|
}
|
|
993
1462
|
return false;
|
|
994
1463
|
};
|
|
995
|
-
// Helper function to detect if a selector is XPath
|
|
996
1464
|
const isXPathSelector = (selector) => {
|
|
997
1465
|
return selector.startsWith('//') ||
|
|
998
1466
|
selector.startsWith('/') ||
|
|
@@ -1004,11 +1472,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1004
1472
|
selector.includes(' and ') ||
|
|
1005
1473
|
selector.includes(' or ');
|
|
1006
1474
|
};
|
|
1007
|
-
// Helper function to wait for selector (CSS or XPath)
|
|
1008
1475
|
const waitForSelectorUniversal = (selector_2, ...args_1) => __awaiter(this, [selector_2, ...args_1], void 0, function* (selector, options = {}) {
|
|
1009
1476
|
try {
|
|
1010
1477
|
if (isXPathSelector(selector)) {
|
|
1011
|
-
// Use XPath locator
|
|
1012
1478
|
const locator = page.locator(`xpath=${selector}`);
|
|
1013
1479
|
yield locator.waitFor({
|
|
1014
1480
|
state: 'attached',
|
|
@@ -1017,7 +1483,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1017
1483
|
return yield locator.elementHandle();
|
|
1018
1484
|
}
|
|
1019
1485
|
else {
|
|
1020
|
-
// Use CSS selector
|
|
1021
1486
|
return yield page.waitForSelector(selector, {
|
|
1022
1487
|
state: 'attached',
|
|
1023
1488
|
timeout: options.timeout || 10000
|
|
@@ -1028,13 +1493,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1028
1493
|
return null;
|
|
1029
1494
|
}
|
|
1030
1495
|
});
|
|
1031
|
-
// Enhanced button finder with retry mechanism for both CSS and XPath selectors
|
|
1032
1496
|
const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
|
|
1033
1497
|
const startTime = Date.now();
|
|
1034
1498
|
const MAX_BUTTON_SEARCH_TIME = 15000;
|
|
1035
1499
|
let updatedSelectors = [...selectors];
|
|
1036
1500
|
for (let i = 0; i < selectors.length; i++) {
|
|
1037
|
-
// Check overall timeout
|
|
1038
1501
|
if (Date.now() - startTime > MAX_BUTTON_SEARCH_TIME) {
|
|
1039
1502
|
debugLog(`Button search timeout reached (${MAX_BUTTON_SEARCH_TIME}ms), aborting`);
|
|
1040
1503
|
break;
|
|
@@ -1044,7 +1507,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1044
1507
|
let selectorSuccess = false;
|
|
1045
1508
|
while (retryCount < MAX_RETRIES && !selectorSuccess) {
|
|
1046
1509
|
try {
|
|
1047
|
-
// Reduce timeout to prevent hanging on slow selectors
|
|
1048
1510
|
const button = yield waitForSelectorUniversal(selector, { timeout: 2000 });
|
|
1049
1511
|
if (button) {
|
|
1050
1512
|
debugLog('Found working selector:', selector);
|
|
@@ -1055,7 +1517,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1055
1517
|
};
|
|
1056
1518
|
}
|
|
1057
1519
|
else {
|
|
1058
|
-
// Treat null result as failed attempt
|
|
1059
1520
|
retryCount++;
|
|
1060
1521
|
debugLog(`Selector "${selector}" not found: attempt ${retryCount}/${MAX_RETRIES}`);
|
|
1061
1522
|
if (retryCount < MAX_RETRIES) {
|
|
@@ -1064,7 +1525,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1064
1525
|
else {
|
|
1065
1526
|
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
|
1066
1527
|
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
|
1067
|
-
selectorSuccess = true;
|
|
1528
|
+
selectorSuccess = true;
|
|
1068
1529
|
}
|
|
1069
1530
|
}
|
|
1070
1531
|
}
|
|
@@ -1077,7 +1538,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1077
1538
|
else {
|
|
1078
1539
|
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
|
1079
1540
|
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
|
1080
|
-
selectorSuccess = true;
|
|
1541
|
+
selectorSuccess = true;
|
|
1081
1542
|
}
|
|
1082
1543
|
}
|
|
1083
1544
|
}
|
|
@@ -1105,17 +1566,15 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1105
1566
|
let availableSelectors = config.pagination.selector.split(',');
|
|
1106
1567
|
let unchangedResultCounter = 0;
|
|
1107
1568
|
let paginationIterations = 0;
|
|
1108
|
-
const MAX_PAGINATION_ITERATIONS = 100;
|
|
1569
|
+
const MAX_PAGINATION_ITERATIONS = 100;
|
|
1109
1570
|
const paginationStartTime = Date.now();
|
|
1110
|
-
const MAX_PAGINATION_TIME = 30 * 60 * 1000;
|
|
1571
|
+
const MAX_PAGINATION_TIME = 30 * 60 * 1000;
|
|
1111
1572
|
try {
|
|
1112
1573
|
while (true) {
|
|
1113
|
-
// Check abort flag at start of each pagination iteration
|
|
1114
1574
|
if (this.isAborted) {
|
|
1115
1575
|
this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
|
|
1116
1576
|
return { results: allResults, urls: allUrls };
|
|
1117
1577
|
}
|
|
1118
|
-
// Pagination circuit breakers
|
|
1119
1578
|
if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
|
|
1120
1579
|
debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
|
|
1121
1580
|
return { results: allResults, urls: allUrls };
|
|
@@ -1124,7 +1583,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1124
1583
|
debugLog('Maximum pagination time reached (10 minutes), stopping');
|
|
1125
1584
|
return { results: allResults, urls: allUrls };
|
|
1126
1585
|
}
|
|
1127
|
-
// Add async yield every 5 iterations to prevent event loop blocking
|
|
1128
1586
|
if (paginationIterations % 5 === 0) {
|
|
1129
1587
|
yield new Promise(resolve => setImmediate(resolve));
|
|
1130
1588
|
}
|
|
@@ -1144,12 +1602,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1144
1602
|
return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
|
|
1145
1603
|
});
|
|
1146
1604
|
const currentResultCount = allResults.length;
|
|
1147
|
-
|
|
1605
|
+
const newItemsFound = currentResultCount - previousResultCount;
|
|
1606
|
+
if (newItemsFound === 0) {
|
|
1148
1607
|
unchangedResultCounter++;
|
|
1149
1608
|
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
1150
1609
|
return { results: allResults, urls: allUrls };
|
|
1151
1610
|
}
|
|
1152
1611
|
}
|
|
1612
|
+
else if (newItemsFound < 3) {
|
|
1613
|
+
unchangedResultCounter++;
|
|
1614
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS * 2) {
|
|
1615
|
+
debugLog('Very slow pagination detected (< 3 items/scroll), stopping');
|
|
1616
|
+
return { results: allResults, urls: allUrls };
|
|
1617
|
+
}
|
|
1618
|
+
}
|
|
1153
1619
|
else {
|
|
1154
1620
|
unchangedResultCounter = 0;
|
|
1155
1621
|
}
|
|
@@ -1169,12 +1635,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1169
1635
|
yield page.waitForTimeout(2000);
|
|
1170
1636
|
const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
|
|
1171
1637
|
const currentResultCount = allResults.length;
|
|
1172
|
-
|
|
1638
|
+
const newItemsFound = currentResultCount - previousResultCount;
|
|
1639
|
+
if (newItemsFound === 0) {
|
|
1173
1640
|
unchangedResultCounter++;
|
|
1174
1641
|
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
1175
1642
|
return { results: allResults, urls: allUrls };
|
|
1176
1643
|
}
|
|
1177
1644
|
}
|
|
1645
|
+
else if (newItemsFound < 3) {
|
|
1646
|
+
unchangedResultCounter++;
|
|
1647
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS * 2) {
|
|
1648
|
+
debugLog('Very slow pagination detected (< 3 items/scroll), stopping');
|
|
1649
|
+
return { results: allResults, urls: allUrls };
|
|
1650
|
+
}
|
|
1651
|
+
}
|
|
1178
1652
|
else {
|
|
1179
1653
|
unchangedResultCounter = 0;
|
|
1180
1654
|
}
|
|
@@ -1191,9 +1665,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1191
1665
|
if (checkLimit())
|
|
1192
1666
|
return { results: allResults, urls: allUrls };
|
|
1193
1667
|
const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
1194
|
-
|
|
1668
|
+
if (workingSelector) {
|
|
1669
|
+
config.pagination.selector = workingSelector;
|
|
1670
|
+
availableSelectors = [workingSelector];
|
|
1671
|
+
}
|
|
1672
|
+
else {
|
|
1673
|
+
availableSelectors = updatedSelectors;
|
|
1674
|
+
}
|
|
1195
1675
|
if (!button || !workingSelector) {
|
|
1196
|
-
// Final retry for navigation when no selectors work
|
|
1197
1676
|
const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
|
|
1198
1677
|
try {
|
|
1199
1678
|
yield page.evaluate(() => window.history.forward());
|
|
@@ -1210,7 +1689,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1210
1689
|
}
|
|
1211
1690
|
let retryCount = 0;
|
|
1212
1691
|
let paginationSuccess = false;
|
|
1213
|
-
// Capture basic content signature before click - with XPath support
|
|
1214
1692
|
const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
|
|
1215
1693
|
return yield page.evaluate((listSelector) => {
|
|
1216
1694
|
const isXPath = (selector) => {
|
|
@@ -1219,7 +1697,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1219
1697
|
let items = [];
|
|
1220
1698
|
if (isXPath(listSelector)) {
|
|
1221
1699
|
try {
|
|
1222
|
-
// Use XPath to find elements
|
|
1223
1700
|
const xpathResult = document.evaluate(listSelector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
1224
1701
|
items = [];
|
|
1225
1702
|
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
@@ -1231,7 +1708,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1231
1708
|
}
|
|
1232
1709
|
catch (xpathError) {
|
|
1233
1710
|
console.warn('XPath evaluation failed, trying CSS selector as fallback:', xpathError);
|
|
1234
|
-
// Fallback to CSS selector
|
|
1235
1711
|
try {
|
|
1236
1712
|
items = document.querySelectorAll(listSelector);
|
|
1237
1713
|
}
|
|
@@ -1243,7 +1719,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1243
1719
|
}
|
|
1244
1720
|
else {
|
|
1245
1721
|
try {
|
|
1246
|
-
// Use CSS selector
|
|
1247
1722
|
items = document.querySelectorAll(listSelector);
|
|
1248
1723
|
}
|
|
1249
1724
|
catch (cssError) {
|
|
@@ -1344,11 +1819,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1344
1819
|
if (checkLimit())
|
|
1345
1820
|
return { results: allResults, urls: allUrls };
|
|
1346
1821
|
let loadMoreCounter = 0;
|
|
1347
|
-
const MAX_LOAD_MORE_ITERATIONS = 100;
|
|
1822
|
+
const MAX_LOAD_MORE_ITERATIONS = 100;
|
|
1348
1823
|
const loadMoreStartTime = Date.now();
|
|
1349
|
-
const MAX_LOAD_MORE_TIME = 30 * 60 * 1000;
|
|
1824
|
+
const MAX_LOAD_MORE_TIME = 30 * 60 * 1000;
|
|
1350
1825
|
while (true) {
|
|
1351
|
-
// Load more circuit breakers
|
|
1352
1826
|
if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
|
|
1353
1827
|
debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
|
|
1354
1828
|
return { results: allResults, urls: allUrls };
|
|
@@ -1357,18 +1831,21 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1357
1831
|
debugLog('Maximum load more time reached (5 minutes), stopping');
|
|
1358
1832
|
return { results: allResults, urls: allUrls };
|
|
1359
1833
|
}
|
|
1360
|
-
// Add async yield every 3 iterations
|
|
1361
1834
|
if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
|
|
1362
1835
|
yield new Promise(resolve => setImmediate(resolve));
|
|
1363
1836
|
}
|
|
1364
|
-
// Find working button with retry mechanism
|
|
1365
1837
|
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
1366
|
-
|
|
1838
|
+
if (workingSelector) {
|
|
1839
|
+
config.pagination.selector = workingSelector;
|
|
1840
|
+
availableSelectors = [workingSelector];
|
|
1841
|
+
}
|
|
1842
|
+
else {
|
|
1843
|
+
availableSelectors = updatedSelectors;
|
|
1844
|
+
}
|
|
1367
1845
|
if (!workingSelector || !loadMoreButton) {
|
|
1368
1846
|
debugLog('No working Load More selector found after retries');
|
|
1369
1847
|
return { results: allResults, urls: allUrls };
|
|
1370
1848
|
}
|
|
1371
|
-
// Implement retry mechanism for clicking the button
|
|
1372
1849
|
let retryCount = 0;
|
|
1373
1850
|
let clickSuccess = false;
|
|
1374
1851
|
while (retryCount < MAX_RETRIES && !clickSuccess) {
|
|
@@ -1379,14 +1856,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1379
1856
|
}
|
|
1380
1857
|
catch (error) {
|
|
1381
1858
|
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
|
1382
|
-
// If regular click fails, try dispatchEvent
|
|
1383
1859
|
try {
|
|
1384
1860
|
yield loadMoreButton.dispatchEvent('click');
|
|
1385
1861
|
clickSuccess = true;
|
|
1386
1862
|
}
|
|
1387
1863
|
catch (dispatchError) {
|
|
1388
1864
|
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
|
1389
|
-
throw dispatchError;
|
|
1865
|
+
throw dispatchError;
|
|
1390
1866
|
}
|
|
1391
1867
|
}
|
|
1392
1868
|
if (clickSuccess) {
|
|
@@ -1408,7 +1884,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1408
1884
|
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
|
1409
1885
|
return { results: allResults, urls: allUrls };
|
|
1410
1886
|
}
|
|
1411
|
-
// Wait for content to load and check scroll height
|
|
1412
1887
|
yield page.waitForTimeout(2000);
|
|
1413
1888
|
yield page.evaluate(() => {
|
|
1414
1889
|
const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
|
|
@@ -1458,644 +1933,15 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1458
1933
|
return { results: allResults, urls: allUrls };
|
|
1459
1934
|
});
|
|
1460
1935
|
}
|
|
1461
|
-
getMatchingActionId(workflow, pageState, usedActions) {
|
|
1462
|
-
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
1463
|
-
const step = workflow[actionId];
|
|
1464
|
-
const isApplicable = this.applicable(step.where, pageState, usedActions);
|
|
1465
|
-
console.log("-------------------------------------------------------------");
|
|
1466
|
-
console.log(`Where:`, step.where);
|
|
1467
|
-
console.log(`Page state:`, pageState);
|
|
1468
|
-
console.log(`Match result: ${isApplicable}`);
|
|
1469
|
-
console.log("-------------------------------------------------------------");
|
|
1470
|
-
if (isApplicable) {
|
|
1471
|
-
return actionId;
|
|
1472
|
-
}
|
|
1473
|
-
}
|
|
1474
|
-
}
|
|
1475
|
-
removeShadowSelectors(workflow) {
|
|
1476
|
-
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
1477
|
-
const step = workflow[actionId];
|
|
1478
|
-
// Check if step has where and selectors
|
|
1479
|
-
if (step.where && Array.isArray(step.where.selectors)) {
|
|
1480
|
-
// Filter out selectors that contain ">>"
|
|
1481
|
-
step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
|
|
1482
|
-
}
|
|
1483
|
-
}
|
|
1484
|
-
return workflow;
|
|
1485
|
-
}
|
|
1486
1936
|
removeSpecialSelectors(workflow) {
|
|
1487
1937
|
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
1488
1938
|
const step = workflow[actionId];
|
|
1489
1939
|
if (step.where && Array.isArray(step.where.selectors)) {
|
|
1490
|
-
// Filter out if selector has EITHER ":>>" OR ">>"
|
|
1491
1940
|
step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
|
|
1492
1941
|
}
|
|
1493
1942
|
}
|
|
1494
1943
|
return workflow;
|
|
1495
1944
|
}
|
|
1496
|
-
generatePageNodeInformation(page_1) {
|
|
1497
|
-
return __awaiter(this, arguments, void 0, function* (page, selector = 'body') {
|
|
1498
|
-
yield page.waitForTimeout(100);
|
|
1499
|
-
const nodeInfo = yield page.evaluate((sel) => {
|
|
1500
|
-
const serialize = (node) => {
|
|
1501
|
-
var _a, _b;
|
|
1502
|
-
// Handle text nodes
|
|
1503
|
-
if (node.nodeType === Node.TEXT_NODE) {
|
|
1504
|
-
// Get coordinates from parent element for text nodes
|
|
1505
|
-
let coordinates;
|
|
1506
|
-
if (node.parentElement) {
|
|
1507
|
-
const rect = node.parentElement.getBoundingClientRect();
|
|
1508
|
-
coordinates = {
|
|
1509
|
-
x: rect.x,
|
|
1510
|
-
y: rect.y + window.scrollY,
|
|
1511
|
-
};
|
|
1512
|
-
}
|
|
1513
|
-
return {
|
|
1514
|
-
type: 'TEXT_NODE',
|
|
1515
|
-
textContent: ((_a = node.textContent) === null || _a === void 0 ? void 0 : _a.trim()) || '',
|
|
1516
|
-
nodeType: node.nodeType,
|
|
1517
|
-
coordinates
|
|
1518
|
-
};
|
|
1519
|
-
}
|
|
1520
|
-
// Handle element nodes
|
|
1521
|
-
const element = node;
|
|
1522
|
-
const attributes = {};
|
|
1523
|
-
for (const attr of Array.from(element.attributes || [])) {
|
|
1524
|
-
attributes[attr.name] = attr.value;
|
|
1525
|
-
}
|
|
1526
|
-
// Get coordinates for element nodes
|
|
1527
|
-
let coordinates;
|
|
1528
|
-
if (node.nodeType === Node.ELEMENT_NODE) {
|
|
1529
|
-
const rect = element.getBoundingClientRect();
|
|
1530
|
-
coordinates = {
|
|
1531
|
-
x: rect.left + window.scrollX,
|
|
1532
|
-
y: rect.top + window.scrollY
|
|
1533
|
-
};
|
|
1534
|
-
}
|
|
1535
|
-
// Get all child nodes (including text nodes)
|
|
1536
|
-
const children = Array.from(node.childNodes)
|
|
1537
|
-
.map(child => serialize(child))
|
|
1538
|
-
.filter(child => {
|
|
1539
|
-
return !(child.type === 'TEXT_NODE' && child.textContent === '');
|
|
1540
|
-
});
|
|
1541
|
-
return {
|
|
1542
|
-
type: 'ELEMENT_NODE',
|
|
1543
|
-
tagName: ((_b = element.tagName) === null || _b === void 0 ? void 0 : _b.toLowerCase()) || '',
|
|
1544
|
-
nodeType: node.nodeType,
|
|
1545
|
-
attributes,
|
|
1546
|
-
coordinates,
|
|
1547
|
-
children
|
|
1548
|
-
};
|
|
1549
|
-
};
|
|
1550
|
-
const rootElement = document.querySelector(sel);
|
|
1551
|
-
if (!rootElement) {
|
|
1552
|
-
throw new Error(`Element not found: ${sel}`);
|
|
1553
|
-
}
|
|
1554
|
-
const exactRect = rootElement.getBoundingClientRect();
|
|
1555
|
-
const exactCoordinates = {
|
|
1556
|
-
x: exactRect.left + window.scrollX,
|
|
1557
|
-
y: exactRect.top + window.scrollY
|
|
1558
|
-
};
|
|
1559
|
-
const nodeInfo = serialize(rootElement);
|
|
1560
|
-
nodeInfo.coordinates = exactCoordinates;
|
|
1561
|
-
return nodeInfo;
|
|
1562
|
-
}, selector);
|
|
1563
|
-
return nodeInfo;
|
|
1564
|
-
});
|
|
1565
|
-
}
|
|
1566
|
-
detectElementChanges(page, schema) {
|
|
1567
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
1568
|
-
var _a;
|
|
1569
|
-
const currentDomTree = yield this.generatePageNodeInformation(page, 'body');
|
|
1570
|
-
const changes = [];
|
|
1571
|
-
const isScrapeList = 'listSelector' in schema;
|
|
1572
|
-
const findMatchingElement = (field, currentTree, targetInfo) => {
|
|
1573
|
-
if (currentTree.type !== 'ELEMENT_NODE')
|
|
1574
|
-
return { element: null, confidence: 0 };
|
|
1575
|
-
const directMatchScore = () => {
|
|
1576
|
-
let score = 0;
|
|
1577
|
-
const weights = {
|
|
1578
|
-
tag: 0.25,
|
|
1579
|
-
attributes: 0.35,
|
|
1580
|
-
content: 0.25,
|
|
1581
|
-
structure: 0.15
|
|
1582
|
-
};
|
|
1583
|
-
if ('tagName' in targetInfo &&
|
|
1584
|
-
currentTree.tagName === targetInfo.tagName) {
|
|
1585
|
-
score += weights.tag;
|
|
1586
|
-
}
|
|
1587
|
-
const targetAttrs = 'attributes' in targetInfo ? targetInfo.attributes || {} : {};
|
|
1588
|
-
const currentAttrs = currentTree.attributes || {};
|
|
1589
|
-
const targetClasses = (targetAttrs.class || '').split(/\s+/).filter(Boolean);
|
|
1590
|
-
const currentClasses = (currentAttrs.class || '').split(/\s+/).filter(Boolean);
|
|
1591
|
-
let classScore = 0;
|
|
1592
|
-
if (targetClasses.length === 0 && currentClasses.length === 0) {
|
|
1593
|
-
classScore = 1; // Both have no classes
|
|
1594
|
-
}
|
|
1595
|
-
else if (targetClasses.length > 0 && currentClasses.length > 0) {
|
|
1596
|
-
const commonClasses = targetClasses.filter(c => currentClasses.includes(c));
|
|
1597
|
-
if (commonClasses.length > 0) {
|
|
1598
|
-
classScore = commonClasses.length / Math.max(targetClasses.length, 1);
|
|
1599
|
-
}
|
|
1600
|
-
}
|
|
1601
|
-
let idScore = 0;
|
|
1602
|
-
if (targetAttrs.id && currentAttrs.id) {
|
|
1603
|
-
idScore = targetAttrs.id === currentAttrs.id ? 1 : 0;
|
|
1604
|
-
}
|
|
1605
|
-
else if (!targetAttrs.id && !currentAttrs.id) {
|
|
1606
|
-
idScore = 1; // Both have no id
|
|
1607
|
-
}
|
|
1608
|
-
const otherAttrs = Object.keys(Object.assign(Object.assign({}, targetAttrs), currentAttrs))
|
|
1609
|
-
.filter(key => key !== 'class' && key !== 'id');
|
|
1610
|
-
let otherAttrsScore = 0;
|
|
1611
|
-
if (otherAttrs.length > 0) {
|
|
1612
|
-
let matches = 0;
|
|
1613
|
-
for (const key of otherAttrs) {
|
|
1614
|
-
if (targetAttrs[key] === currentAttrs[key]) {
|
|
1615
|
-
matches++;
|
|
1616
|
-
}
|
|
1617
|
-
else if (key in targetAttrs && key in currentAttrs) {
|
|
1618
|
-
matches += 0.5;
|
|
1619
|
-
}
|
|
1620
|
-
}
|
|
1621
|
-
otherAttrsScore = matches / otherAttrs.length;
|
|
1622
|
-
}
|
|
1623
|
-
else {
|
|
1624
|
-
otherAttrsScore = 1;
|
|
1625
|
-
}
|
|
1626
|
-
const attributeWeights = { class: 0.5, id: 0.3, other: 0.2 };
|
|
1627
|
-
const attrScore = (classScore * attributeWeights.class) +
|
|
1628
|
-
(idScore * attributeWeights.id) +
|
|
1629
|
-
(otherAttrsScore * attributeWeights.other);
|
|
1630
|
-
score += weights.attributes * attrScore;
|
|
1631
|
-
if ('children' in targetInfo) {
|
|
1632
|
-
const targetText = targetInfo.children
|
|
1633
|
-
.filter(child => child.type === 'TEXT_NODE')
|
|
1634
|
-
.map(child => child.textContent)
|
|
1635
|
-
.join(' ')
|
|
1636
|
-
.trim();
|
|
1637
|
-
const currentText = currentTree.children
|
|
1638
|
-
.filter(child => child.type === 'TEXT_NODE')
|
|
1639
|
-
.map(child => child.textContent)
|
|
1640
|
-
.join(' ')
|
|
1641
|
-
.trim();
|
|
1642
|
-
let contentScore = 0;
|
|
1643
|
-
if (targetText === currentText) {
|
|
1644
|
-
contentScore = 1;
|
|
1645
|
-
}
|
|
1646
|
-
else if (!targetText && !currentText) {
|
|
1647
|
-
contentScore = 1;
|
|
1648
|
-
}
|
|
1649
|
-
else if (targetText && currentText) {
|
|
1650
|
-
if (targetText.includes(currentText) || currentText.includes(targetText)) {
|
|
1651
|
-
const ratio = Math.min(targetText.length, currentText.length) /
|
|
1652
|
-
Math.max(targetText.length, currentText.length);
|
|
1653
|
-
contentScore = 0.7 + (ratio * 0.3);
|
|
1654
|
-
}
|
|
1655
|
-
else {
|
|
1656
|
-
const targetWords = targetText.toLowerCase().split(/\s+/).filter(Boolean);
|
|
1657
|
-
const currentWords = currentText.toLowerCase().split(/\s+/).filter(Boolean);
|
|
1658
|
-
if (targetWords.length > 0 && currentWords.length > 0) {
|
|
1659
|
-
const commonWords = targetWords.filter(word => currentWords.includes(word));
|
|
1660
|
-
if (commonWords.length > 0) {
|
|
1661
|
-
contentScore = commonWords.length / Math.max(targetWords.length, currentWords.length);
|
|
1662
|
-
}
|
|
1663
|
-
}
|
|
1664
|
-
}
|
|
1665
|
-
const numericPattern = /^\s*\$?[\d.,]+\s*$/;
|
|
1666
|
-
if (numericPattern.test(targetText) && numericPattern.test(currentText)) {
|
|
1667
|
-
contentScore = Math.max(contentScore, 0.8);
|
|
1668
|
-
}
|
|
1669
|
-
}
|
|
1670
|
-
score += weights.content * contentScore;
|
|
1671
|
-
}
|
|
1672
|
-
if ('children' in targetInfo) {
|
|
1673
|
-
const targetElementChildren = targetInfo.children
|
|
1674
|
-
.filter(child => child.type === 'ELEMENT_NODE');
|
|
1675
|
-
const currentElementChildren = currentTree.children
|
|
1676
|
-
.filter(child => child.type === 'ELEMENT_NODE');
|
|
1677
|
-
let structureScore = 0;
|
|
1678
|
-
if (targetElementChildren.length === 0 && currentElementChildren.length === 0) {
|
|
1679
|
-
structureScore = 1;
|
|
1680
|
-
}
|
|
1681
|
-
else if (targetElementChildren.length > 0 && currentElementChildren.length > 0) {
|
|
1682
|
-
const targetChildTags = targetElementChildren
|
|
1683
|
-
.map(child => child.tagName.toLowerCase());
|
|
1684
|
-
const currentChildTags = currentElementChildren
|
|
1685
|
-
.map(child => child.tagName.toLowerCase());
|
|
1686
|
-
const tagMatches = targetChildTags.filter(tag => currentChildTags.includes(tag)).length;
|
|
1687
|
-
const tagCountDiff = Math.abs(targetChildTags.length - currentChildTags.length);
|
|
1688
|
-
const maxTags = Math.max(targetChildTags.length, currentChildTags.length);
|
|
1689
|
-
if (maxTags > 0) {
|
|
1690
|
-
structureScore =
|
|
1691
|
-
(0.7 * (tagMatches / maxTags)) +
|
|
1692
|
-
(0.3 * (1 - (tagCountDiff / maxTags)));
|
|
1693
|
-
}
|
|
1694
|
-
}
|
|
1695
|
-
score += weights.structure * structureScore;
|
|
1696
|
-
}
|
|
1697
|
-
return score;
|
|
1698
|
-
};
|
|
1699
|
-
const score = directMatchScore();
|
|
1700
|
-
if (field === 'listSelector') {
|
|
1701
|
-
if (score >= 0.85) {
|
|
1702
|
-
return { element: currentTree, confidence: score };
|
|
1703
|
-
}
|
|
1704
|
-
}
|
|
1705
|
-
else {
|
|
1706
|
-
if (score >= 0.7) {
|
|
1707
|
-
return { element: currentTree, confidence: score };
|
|
1708
|
-
}
|
|
1709
|
-
}
|
|
1710
|
-
let bestMatch = { element: null, confidence: 0 };
|
|
1711
|
-
for (const child of currentTree.children) {
|
|
1712
|
-
if (child.type === 'ELEMENT_NODE') {
|
|
1713
|
-
const childMatch = findMatchingElement(field, child, targetInfo);
|
|
1714
|
-
if (childMatch.confidence > bestMatch.confidence) {
|
|
1715
|
-
bestMatch = childMatch;
|
|
1716
|
-
}
|
|
1717
|
-
}
|
|
1718
|
-
}
|
|
1719
|
-
return bestMatch;
|
|
1720
|
-
};
|
|
1721
|
-
const findMatchingParentElement = (field, currentDomTree, parentNodeInfo) => {
|
|
1722
|
-
if (currentDomTree.type !== 'ELEMENT_NODE') {
|
|
1723
|
-
return { element: null, confidence: 0, changes: [] };
|
|
1724
|
-
}
|
|
1725
|
-
const matchesParentElement = (element) => {
|
|
1726
|
-
if (element.type !== 'ELEMENT_NODE' || parentNodeInfo.type !== 'ELEMENT_NODE') {
|
|
1727
|
-
return 0;
|
|
1728
|
-
}
|
|
1729
|
-
let score = 0;
|
|
1730
|
-
const weights = {
|
|
1731
|
-
tagName: 0.3,
|
|
1732
|
-
structure: 0.3,
|
|
1733
|
-
children: 0.4
|
|
1734
|
-
};
|
|
1735
|
-
if (element.tagName === parentNodeInfo.tagName) {
|
|
1736
|
-
score += weights.tagName;
|
|
1737
|
-
}
|
|
1738
|
-
const targetChildren = parentNodeInfo.children || [];
|
|
1739
|
-
const currentChildren = element.children || [];
|
|
1740
|
-
const targetChildTags = targetChildren
|
|
1741
|
-
.filter(child => child.type === 'ELEMENT_NODE')
|
|
1742
|
-
.map(child => child.tagName);
|
|
1743
|
-
const currentChildTags = currentChildren
|
|
1744
|
-
.filter(child => child.type === 'ELEMENT_NODE')
|
|
1745
|
-
.map(child => child.tagName);
|
|
1746
|
-
let matchingChildren = 0;
|
|
1747
|
-
for (const tag of targetChildTags) {
|
|
1748
|
-
if (currentChildTags.includes(tag)) {
|
|
1749
|
-
matchingChildren++;
|
|
1750
|
-
}
|
|
1751
|
-
}
|
|
1752
|
-
if (targetChildTags.length > 0) {
|
|
1753
|
-
score += weights.structure * (matchingChildren / targetChildTags.length);
|
|
1754
|
-
}
|
|
1755
|
-
const targetChildElements = targetChildren.filter(child => child.type === 'ELEMENT_NODE');
|
|
1756
|
-
const currentChildElements = currentChildren.filter(child => child.type === 'ELEMENT_NODE');
|
|
1757
|
-
let childContentScore = 0;
|
|
1758
|
-
let matchedChildrenCount = 0;
|
|
1759
|
-
for (const targetChild of targetChildElements) {
|
|
1760
|
-
for (const currentChild of currentChildElements) {
|
|
1761
|
-
if (targetChild.tagName === currentChild.tagName &&
|
|
1762
|
-
contentSimilarity(targetChild, currentChild) > 0.7) {
|
|
1763
|
-
matchedChildrenCount++;
|
|
1764
|
-
break;
|
|
1765
|
-
}
|
|
1766
|
-
}
|
|
1767
|
-
}
|
|
1768
|
-
if (targetChildElements.length > 0) {
|
|
1769
|
-
childContentScore = matchedChildrenCount / targetChildElements.length;
|
|
1770
|
-
score += weights.children * childContentScore;
|
|
1771
|
-
}
|
|
1772
|
-
return score;
|
|
1773
|
-
};
|
|
1774
|
-
const contentSimilarity = (element1, element2) => {
|
|
1775
|
-
var _a, _b;
|
|
1776
|
-
if (element1.type !== 'ELEMENT_NODE' || element2.type !== 'ELEMENT_NODE') {
|
|
1777
|
-
return 0;
|
|
1778
|
-
}
|
|
1779
|
-
const text1 = ((_a = element1.children) === null || _a === void 0 ? void 0 : _a.filter(child => child.type === 'TEXT_NODE').map(child => child.textContent).join(' ').trim()) || '';
|
|
1780
|
-
const text2 = ((_b = element2.children) === null || _b === void 0 ? void 0 : _b.filter(child => child.type === 'TEXT_NODE').map(child => child.textContent).join(' ').trim()) || '';
|
|
1781
|
-
if (!text1 && !text2)
|
|
1782
|
-
return 1;
|
|
1783
|
-
if (!text1 || !text2)
|
|
1784
|
-
return 0;
|
|
1785
|
-
if (text1 === text2)
|
|
1786
|
-
return 1;
|
|
1787
|
-
if (text1.includes(text2) || text2.includes(text1)) {
|
|
1788
|
-
return 0.8;
|
|
1789
|
-
}
|
|
1790
|
-
const words1 = text1.toLowerCase().split(/\s+/).filter(Boolean);
|
|
1791
|
-
const words2 = text2.toLowerCase().split(/\s+/).filter(Boolean);
|
|
1792
|
-
let commonCount = 0;
|
|
1793
|
-
for (const word of words1) {
|
|
1794
|
-
if (words2.includes(word)) {
|
|
1795
|
-
commonCount++;
|
|
1796
|
-
}
|
|
1797
|
-
}
|
|
1798
|
-
return commonCount / Math.max(words1.length, words2.length);
|
|
1799
|
-
};
|
|
1800
|
-
const detectAttributeChanges = (element) => {
|
|
1801
|
-
const changes = [];
|
|
1802
|
-
if (element.type !== 'ELEMENT_NODE' ||
|
|
1803
|
-
parentNodeInfo.type !== 'ELEMENT_NODE') {
|
|
1804
|
-
return changes;
|
|
1805
|
-
}
|
|
1806
|
-
const originalAttrs = parentNodeInfo.attributes || {};
|
|
1807
|
-
const currentAttrs = element.attributes || {};
|
|
1808
|
-
for (const key of Object.keys(Object.assign(Object.assign({}, originalAttrs), currentAttrs))) {
|
|
1809
|
-
if (originalAttrs[key] !== currentAttrs[key]) {
|
|
1810
|
-
changes.push({
|
|
1811
|
-
attribute: key,
|
|
1812
|
-
originalValue: originalAttrs[key] || '',
|
|
1813
|
-
currentValue: currentAttrs[key] || ''
|
|
1814
|
-
});
|
|
1815
|
-
}
|
|
1816
|
-
}
|
|
1817
|
-
return changes;
|
|
1818
|
-
};
|
|
1819
|
-
const matchScore = matchesParentElement(currentDomTree);
|
|
1820
|
-
if (field === 'listSelector') {
|
|
1821
|
-
if (matchScore >= 0.85) {
|
|
1822
|
-
const changes = detectAttributeChanges(currentDomTree);
|
|
1823
|
-
return {
|
|
1824
|
-
element: currentDomTree,
|
|
1825
|
-
confidence: matchScore,
|
|
1826
|
-
changes
|
|
1827
|
-
};
|
|
1828
|
-
}
|
|
1829
|
-
}
|
|
1830
|
-
else {
|
|
1831
|
-
if (matchScore >= 0.7) {
|
|
1832
|
-
const changes = detectAttributeChanges(currentDomTree);
|
|
1833
|
-
return {
|
|
1834
|
-
element: currentDomTree,
|
|
1835
|
-
confidence: matchScore,
|
|
1836
|
-
changes
|
|
1837
|
-
};
|
|
1838
|
-
}
|
|
1839
|
-
}
|
|
1840
|
-
let bestMatch = { element: null, confidence: 0, changes: [] };
|
|
1841
|
-
for (const child of currentDomTree.children) {
|
|
1842
|
-
if (child.type === 'ELEMENT_NODE') {
|
|
1843
|
-
const childResult = findMatchingParentElement(field, child, parentNodeInfo);
|
|
1844
|
-
if (childResult.confidence > bestMatch.confidence) {
|
|
1845
|
-
bestMatch = childResult;
|
|
1846
|
-
}
|
|
1847
|
-
}
|
|
1848
|
-
}
|
|
1849
|
-
return bestMatch;
|
|
1850
|
-
};
|
|
1851
|
-
const processElement = (field, elementConfig) => __awaiter(this, void 0, void 0, function* () {
|
|
1852
|
-
const checkChanges = (originalNode, currentNode, element, isParent = false) => {
|
|
1853
|
-
const originalAttrs = 'attributes' in originalNode ? originalNode.attributes : {};
|
|
1854
|
-
const currentAttrs = 'attributes' in currentNode ? currentNode.attributes : {};
|
|
1855
|
-
if ((Object.keys(originalAttrs).length > 0 && Object.keys(currentAttrs).length === 0) ||
|
|
1856
|
-
Object.keys(Object.assign(Object.assign({}, originalAttrs), currentAttrs)).some(key => {
|
|
1857
|
-
if (key === 'class') {
|
|
1858
|
-
const originalClasses = (originalAttrs[key] || '').split(/\s+/).filter(Boolean);
|
|
1859
|
-
const currentClasses = (currentAttrs[key] || '').split(/\s+/).filter(Boolean);
|
|
1860
|
-
return JSON.stringify(originalClasses.sort()) !== JSON.stringify(currentClasses.sort());
|
|
1861
|
-
}
|
|
1862
|
-
return originalAttrs[key] !== currentAttrs[key];
|
|
1863
|
-
})) {
|
|
1864
|
-
changes.push({
|
|
1865
|
-
type: 'ATTRIBUTES_CHANGED',
|
|
1866
|
-
field,
|
|
1867
|
-
originalState: originalAttrs,
|
|
1868
|
-
currentState: currentAttrs,
|
|
1869
|
-
coordinates: currentNode.coordinates,
|
|
1870
|
-
element: element.asElement(),
|
|
1871
|
-
isParent,
|
|
1872
|
-
confidence,
|
|
1873
|
-
});
|
|
1874
|
-
}
|
|
1875
|
-
if ('tagName' in originalNode && originalNode.tagName && currentNode.tagName &&
|
|
1876
|
-
originalNode.tagName.toLowerCase() !== currentNode.tagName.toLowerCase()) {
|
|
1877
|
-
changes.push({
|
|
1878
|
-
type: 'TAG_CHANGED',
|
|
1879
|
-
field,
|
|
1880
|
-
originalState: originalNode.tagName,
|
|
1881
|
-
currentState: currentNode.tagName,
|
|
1882
|
-
coordinates: currentNode.coordinates,
|
|
1883
|
-
element: element.asElement(),
|
|
1884
|
-
isParent,
|
|
1885
|
-
confidence,
|
|
1886
|
-
});
|
|
1887
|
-
}
|
|
1888
|
-
};
|
|
1889
|
-
const { element: matchedElement, confidence } = findMatchingElement(field, currentDomTree, elementConfig.nodeInfo);
|
|
1890
|
-
if (matchedElement) {
|
|
1891
|
-
const actualElement = yield page.evaluateHandle((element) => {
|
|
1892
|
-
function findExactElement(elementInfo) {
|
|
1893
|
-
const candidatesAtCoords = document.elementsFromPoint(elementInfo.coordinates.x, elementInfo.coordinates.y);
|
|
1894
|
-
const getNormalizedText = (el) => {
|
|
1895
|
-
var _a;
|
|
1896
|
-
if (el.nodeType === Node.TEXT_NODE)
|
|
1897
|
-
return ((_a = el.textContent) === null || _a === void 0 ? void 0 : _a.trim()) || '';
|
|
1898
|
-
const textNodes = Array.from(el.childNodes)
|
|
1899
|
-
.filter((node) => node.nodeType === Node.TEXT_NODE);
|
|
1900
|
-
return textNodes.map((node) => { var _a; return (_a = node.textContent) === null || _a === void 0 ? void 0 : _a.trim(); }).join('').trim();
|
|
1901
|
-
};
|
|
1902
|
-
const targetText = elementInfo.children
|
|
1903
|
-
.filter(child => child.type === 'TEXT_NODE')
|
|
1904
|
-
.map(child => child.textContent)
|
|
1905
|
-
.join('')
|
|
1906
|
-
.trim();
|
|
1907
|
-
const exactMatch = Array.from(candidatesAtCoords).find(candidate => {
|
|
1908
|
-
if (candidate.tagName.toLowerCase() !== elementInfo.tagName.toLowerCase()) {
|
|
1909
|
-
return false;
|
|
1910
|
-
}
|
|
1911
|
-
const candidateText = getNormalizedText(candidate);
|
|
1912
|
-
const textMatch = targetText === candidateText;
|
|
1913
|
-
if (Object.keys(elementInfo.attributes || {}).length > 0) {
|
|
1914
|
-
const attrsMatch = Object.entries(elementInfo.attributes).every(([key, value]) => {
|
|
1915
|
-
if (key === 'class') {
|
|
1916
|
-
const elementClasses = (value || '').split(/\s+/).filter(Boolean);
|
|
1917
|
-
const candidateClasses = (candidate.getAttribute('class') || '')
|
|
1918
|
-
.split(/\s+/)
|
|
1919
|
-
.filter(Boolean);
|
|
1920
|
-
return JSON.stringify(elementClasses.sort()) ===
|
|
1921
|
-
JSON.stringify(candidateClasses.sort());
|
|
1922
|
-
}
|
|
1923
|
-
return candidate.getAttribute(key) === value;
|
|
1924
|
-
});
|
|
1925
|
-
return textMatch && attrsMatch;
|
|
1926
|
-
}
|
|
1927
|
-
const elementChildTags = elementInfo.children
|
|
1928
|
-
.filter(child => child.type === 'ELEMENT_NODE')
|
|
1929
|
-
.map(child => child.tagName.toLowerCase());
|
|
1930
|
-
const candidateChildTags = Array.from(candidate.children)
|
|
1931
|
-
.map(child => child.tagName.toLowerCase());
|
|
1932
|
-
const structureMatch = JSON.stringify(elementChildTags) ===
|
|
1933
|
-
JSON.stringify(candidateChildTags);
|
|
1934
|
-
return textMatch && structureMatch;
|
|
1935
|
-
});
|
|
1936
|
-
if (exactMatch)
|
|
1937
|
-
return exactMatch;
|
|
1938
|
-
const allCandidates = document.getElementsByTagName(elementInfo.tagName);
|
|
1939
|
-
return Array.from(allCandidates).find(candidate => {
|
|
1940
|
-
const candidateText = getNormalizedText(candidate);
|
|
1941
|
-
const textMatch = targetText === candidateText;
|
|
1942
|
-
const rect = candidate.getBoundingClientRect();
|
|
1943
|
-
const coordsMatch = Math.abs(rect.x - elementInfo.coordinates.x) < 5 &&
|
|
1944
|
-
Math.abs(rect.y - elementInfo.coordinates.y) < 5;
|
|
1945
|
-
return textMatch && coordsMatch;
|
|
1946
|
-
});
|
|
1947
|
-
}
|
|
1948
|
-
return findExactElement(element);
|
|
1949
|
-
}, matchedElement);
|
|
1950
|
-
if (actualElement) {
|
|
1951
|
-
checkChanges(elementConfig.nodeInfo, matchedElement, actualElement);
|
|
1952
|
-
}
|
|
1953
|
-
if ('parent' in elementConfig.nodeInfo && elementConfig.nodeInfo.parent) {
|
|
1954
|
-
const { element: matchedParentElement, confidence: parentConfidence } = findMatchingParentElement(field, currentDomTree, elementConfig.nodeInfo.parent);
|
|
1955
|
-
if (matchedParentElement && parentConfidence >= 0.5) {
|
|
1956
|
-
checkChanges(elementConfig.nodeInfo.parent, matchedParentElement, actualElement, true);
|
|
1957
|
-
}
|
|
1958
|
-
}
|
|
1959
|
-
}
|
|
1960
|
-
});
|
|
1961
|
-
if (isScrapeList) {
|
|
1962
|
-
if (schema.listSelectorInfo) {
|
|
1963
|
-
yield processElement('listSelector', schema.listSelectorInfo);
|
|
1964
|
-
}
|
|
1965
|
-
if (schema.fields) {
|
|
1966
|
-
for (const [fieldName, fieldConfig] of Object.entries(schema.fields)) {
|
|
1967
|
-
yield processElement(`fields.${fieldName}`, {
|
|
1968
|
-
nodeInfo: fieldConfig.nodeInfo,
|
|
1969
|
-
coordinates: fieldConfig.coordinates,
|
|
1970
|
-
tag: fieldConfig.tag,
|
|
1971
|
-
});
|
|
1972
|
-
}
|
|
1973
|
-
}
|
|
1974
|
-
if ((_a = schema.pagination) === null || _a === void 0 ? void 0 : _a.nodeInfo) {
|
|
1975
|
-
yield processElement('pagination', {
|
|
1976
|
-
nodeInfo: schema.pagination.nodeInfo,
|
|
1977
|
-
coordinates: schema.pagination.coordinates,
|
|
1978
|
-
});
|
|
1979
|
-
}
|
|
1980
|
-
}
|
|
1981
|
-
else {
|
|
1982
|
-
const scrapeSchema = schema;
|
|
1983
|
-
for (const [field, config] of Object.entries(scrapeSchema)) {
|
|
1984
|
-
yield processElement(field, {
|
|
1985
|
-
nodeInfo: config.nodeInfo,
|
|
1986
|
-
coordinates: config.coordinates,
|
|
1987
|
-
tag: config.tag,
|
|
1988
|
-
});
|
|
1989
|
-
}
|
|
1990
|
-
}
|
|
1991
|
-
return changes;
|
|
1992
|
-
});
|
|
1993
|
-
}
|
|
1994
|
-
validateWorkflowAction(page, action) {
|
|
1995
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
1996
|
-
var _a, _b;
|
|
1997
|
-
const modifiedAction = JSON.parse(JSON.stringify(action));
|
|
1998
|
-
const actionIndex = modifiedAction.what.findIndex(item => item.action === "scrapeSchema" || item.action === "scrapeList");
|
|
1999
|
-
if (actionIndex !== -1) {
|
|
2000
|
-
const schema = modifiedAction.what[actionIndex].args[0];
|
|
2001
|
-
const isScrapeList = modifiedAction.what[actionIndex].action === "scrapeList";
|
|
2002
|
-
try {
|
|
2003
|
-
const changes = yield this.detectElementChanges(page, schema);
|
|
2004
|
-
const uniqueChanges = {};
|
|
2005
|
-
changes.forEach(change => {
|
|
2006
|
-
if (!uniqueChanges[change.field] || change.type === 'TAG_CHANGED') {
|
|
2007
|
-
uniqueChanges[change.field] = change;
|
|
2008
|
-
}
|
|
2009
|
-
});
|
|
2010
|
-
// console.log("Unique changes detected:", uniqueChanges);
|
|
2011
|
-
console.log("Changes detected:", changes);
|
|
2012
|
-
for (const fieldName in uniqueChanges) {
|
|
2013
|
-
let schemaField = isScrapeList ?
|
|
2014
|
-
fieldName === 'listSelector' ? schema[fieldName] :
|
|
2015
|
-
fieldName.startsWith('fields.') ? schema.fields[fieldName.split('.')[1]] :
|
|
2016
|
-
fieldName === 'pagination' ? schema.pagination : null
|
|
2017
|
-
: schema[fieldName];
|
|
2018
|
-
const change = uniqueChanges[fieldName];
|
|
2019
|
-
if (change.element && schemaField) {
|
|
2020
|
-
try {
|
|
2021
|
-
let newSelectors;
|
|
2022
|
-
if (fieldName === 'listSelector') {
|
|
2023
|
-
newSelectors = yield (0, selector_1.generateNonUniqueSelectors)(page, change.element, '');
|
|
2024
|
-
}
|
|
2025
|
-
else if (isScrapeList && fieldName.startsWith('fields.')) {
|
|
2026
|
-
newSelectors = yield (0, selector_1.generateNonUniqueSelectors)(page, change.element, schema.listSelector);
|
|
2027
|
-
}
|
|
2028
|
-
else {
|
|
2029
|
-
newSelectors = yield (0, selector_1.generateSelectors)(page, change.element);
|
|
2030
|
-
if (fieldName === 'pagination') {
|
|
2031
|
-
let chainedSelectors = [
|
|
2032
|
-
(_a = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.iframeSelector) === null || _a === void 0 ? void 0 : _a.full,
|
|
2033
|
-
(_b = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.shadowSelector) === null || _b === void 0 ? void 0 : _b.full,
|
|
2034
|
-
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.testIdSelector,
|
|
2035
|
-
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.id,
|
|
2036
|
-
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.hrefSelector,
|
|
2037
|
-
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.accessibilitySelector,
|
|
2038
|
-
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.attrSelector,
|
|
2039
|
-
]
|
|
2040
|
-
.filter(selector => selector !== null && selector !== undefined)
|
|
2041
|
-
.join(',');
|
|
2042
|
-
newSelectors = chainedSelectors;
|
|
2043
|
-
}
|
|
2044
|
-
}
|
|
2045
|
-
if (!newSelectors) {
|
|
2046
|
-
throw new Error('Failed to generate new selectors');
|
|
2047
|
-
}
|
|
2048
|
-
const bestSelector = fieldName !== 'pagination' ? yield (0, utils_1.getBestSelector)({
|
|
2049
|
-
selectors: newSelectors,
|
|
2050
|
-
tagName: change.type === 'TAG_CHANGED' ?
|
|
2051
|
-
change.currentState :
|
|
2052
|
-
schemaField.tag
|
|
2053
|
-
}) : newSelectors;
|
|
2054
|
-
if (!bestSelector) {
|
|
2055
|
-
throw new Error('Failed to determine best selector');
|
|
2056
|
-
}
|
|
2057
|
-
// Update selectors
|
|
2058
|
-
let previousSelector;
|
|
2059
|
-
if (fieldName === 'listSelector') {
|
|
2060
|
-
previousSelector = schema.listSelector;
|
|
2061
|
-
schema.listSelector = bestSelector;
|
|
2062
|
-
}
|
|
2063
|
-
else if (isScrapeList && fieldName.startsWith('fields.')) {
|
|
2064
|
-
const fieldKey = fieldName.split('.')[1];
|
|
2065
|
-
previousSelector = schema.fields[fieldKey].selector;
|
|
2066
|
-
schema.fields[fieldKey].selector = bestSelector;
|
|
2067
|
-
}
|
|
2068
|
-
else if (fieldName === 'pagination') {
|
|
2069
|
-
previousSelector = schema.pagination.selector;
|
|
2070
|
-
schema.pagination.selector = bestSelector;
|
|
2071
|
-
}
|
|
2072
|
-
else {
|
|
2073
|
-
previousSelector = schema[fieldName].selector;
|
|
2074
|
-
schema[fieldName].selector = bestSelector;
|
|
2075
|
-
}
|
|
2076
|
-
console.log(`Updated ${fieldName} from ${previousSelector} to ${bestSelector}`);
|
|
2077
|
-
if (modifiedAction.where.selectors) {
|
|
2078
|
-
const selectorIndex = modifiedAction.where.selectors.findIndex(s => s.includes(previousSelector));
|
|
2079
|
-
if (selectorIndex !== -1) {
|
|
2080
|
-
modifiedAction.where.selectors[selectorIndex] = bestSelector;
|
|
2081
|
-
}
|
|
2082
|
-
}
|
|
2083
|
-
}
|
|
2084
|
-
catch (error) {
|
|
2085
|
-
console.error(`Auto-heal failed for field ${fieldName}:`, error);
|
|
2086
|
-
this.trackAutohealFailure(`Failed to auto-heal field ${fieldName}: ${error.message}`);
|
|
2087
|
-
}
|
|
2088
|
-
}
|
|
2089
|
-
}
|
|
2090
|
-
}
|
|
2091
|
-
catch (error) {
|
|
2092
|
-
console.error('Complete auto-heal failure:', error);
|
|
2093
|
-
this.trackAutohealFailure(`Complete auto-heal failure: ${error.message}`);
|
|
2094
|
-
}
|
|
2095
|
-
}
|
|
2096
|
-
return modifiedAction;
|
|
2097
|
-
});
|
|
2098
|
-
}
|
|
2099
1945
|
/**
|
|
2100
1946
|
* Test if a selector is working on the current page
|
|
2101
1947
|
* @param {Page} page - Playwright page object
|
|
@@ -2116,7 +1962,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2116
1962
|
selector.includes('@id=');
|
|
2117
1963
|
let count = 0;
|
|
2118
1964
|
if (isXPath) {
|
|
2119
|
-
// Add timeout to prevent XPath hanging
|
|
2120
1965
|
const locator = page.locator(`xpath=${selector}`);
|
|
2121
1966
|
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('XPath timeout')), 5000));
|
|
2122
1967
|
try {
|
|
@@ -2126,12 +1971,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2126
1971
|
]);
|
|
2127
1972
|
}
|
|
2128
1973
|
catch (error) {
|
|
2129
|
-
// XPath timed out or failed
|
|
2130
1974
|
return false;
|
|
2131
1975
|
}
|
|
2132
1976
|
}
|
|
2133
1977
|
else {
|
|
2134
|
-
// Add timeout to CSS selector operations
|
|
2135
1978
|
try {
|
|
2136
1979
|
const elementsPromise = page.$$(selector);
|
|
2137
1980
|
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('CSS selector timeout')), 5000));
|
|
@@ -2142,15 +1985,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2142
1985
|
count = elements ? elements.length : 0;
|
|
2143
1986
|
}
|
|
2144
1987
|
catch (error) {
|
|
2145
|
-
// CSS selector timed out or failed
|
|
2146
1988
|
return false;
|
|
2147
1989
|
}
|
|
2148
1990
|
}
|
|
2149
|
-
// For list selectors, we need multiple elements
|
|
2150
1991
|
if (isListSelector) {
|
|
2151
1992
|
return count >= 2;
|
|
2152
1993
|
}
|
|
2153
|
-
// For field selectors, we need at least one element
|
|
2154
1994
|
return count >= 1;
|
|
2155
1995
|
}
|
|
2156
1996
|
catch (error) {
|
|
@@ -2170,12 +2010,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2170
2010
|
return __awaiter(this, arguments, void 0, function* (page, fallbackSelector, isListSelector = false, listContext = '', isPagination = false) {
|
|
2171
2011
|
var _a, _b;
|
|
2172
2012
|
try {
|
|
2173
|
-
// First check if fallback selector works
|
|
2174
2013
|
const fallbackWorks = yield this.testSelectorWorks(page, fallbackSelector, isListSelector);
|
|
2175
2014
|
if (!fallbackWorks) {
|
|
2176
2015
|
return null;
|
|
2177
2016
|
}
|
|
2178
|
-
// Get element using fallback selector
|
|
2179
2017
|
const isXPath = fallbackSelector.startsWith('//') ||
|
|
2180
2018
|
fallbackSelector.startsWith('/') ||
|
|
2181
2019
|
fallbackSelector.includes('contains(@');
|
|
@@ -2189,7 +2027,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2189
2027
|
if (!element) {
|
|
2190
2028
|
return null;
|
|
2191
2029
|
}
|
|
2192
|
-
// Generate new selectors
|
|
2193
2030
|
let newSelectors;
|
|
2194
2031
|
if (isListSelector) {
|
|
2195
2032
|
return yield (0, selector_1.generateListSelectorFromFallback)(page, fallbackSelector);
|
|
@@ -2200,7 +2037,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2200
2037
|
else {
|
|
2201
2038
|
newSelectors = yield (0, selector_1.generateFieldSelectorFromFallback)(page, fallbackSelector);
|
|
2202
2039
|
if (isPagination) {
|
|
2203
|
-
// For pagination, chain selectors in priority order
|
|
2204
2040
|
let chainedSelectors = [
|
|
2205
2041
|
(_a = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.iframeSelector) === null || _a === void 0 ? void 0 : _a.full,
|
|
2206
2042
|
(_b = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.shadowSelector) === null || _b === void 0 ? void 0 : _b.full,
|
|
@@ -2215,7 +2051,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2215
2051
|
return chainedSelectors;
|
|
2216
2052
|
}
|
|
2217
2053
|
else {
|
|
2218
|
-
// For non-pagination, use getBestSelector
|
|
2219
2054
|
const tagName = yield element.evaluate(el => el.tagName.toLowerCase());
|
|
2220
2055
|
return yield (0, utils_1.getBestSelector)({
|
|
2221
2056
|
selectors: newSelectors,
|
|
@@ -2240,7 +2075,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2240
2075
|
return __awaiter(this, void 0, void 0, function* () {
|
|
2241
2076
|
let hasChanges = false;
|
|
2242
2077
|
try {
|
|
2243
|
-
// Validate listSelector
|
|
2244
2078
|
const listSelectorWorks = yield this.testSelectorWorks(page, scrapeListConfig.listSelector, true);
|
|
2245
2079
|
if (!listSelectorWorks && scrapeListConfig.listFallbackSelector) {
|
|
2246
2080
|
console.log(`ListSelector "${scrapeListConfig.listSelector}" not working, trying fallback...`);
|
|
@@ -2251,11 +2085,17 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2251
2085
|
hasChanges = true;
|
|
2252
2086
|
}
|
|
2253
2087
|
}
|
|
2254
|
-
// Validate field selectors
|
|
2255
2088
|
if (scrapeListConfig.fields) {
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2089
|
+
const fieldEntries = Object.entries(scrapeListConfig.fields);
|
|
2090
|
+
const selectorTests = yield Promise.all(fieldEntries.map((_a) => __awaiter(this, [_a], void 0, function* ([fieldName, fieldConfig]) {
|
|
2091
|
+
return ({
|
|
2092
|
+
fieldName,
|
|
2093
|
+
fieldConfig,
|
|
2094
|
+
works: yield this.testSelectorWorks(page, fieldConfig.selector, false)
|
|
2095
|
+
});
|
|
2096
|
+
})));
|
|
2097
|
+
for (const { fieldName, fieldConfig, works } of selectorTests) {
|
|
2098
|
+
if (!works && fieldConfig.fallbackSelector) {
|
|
2259
2099
|
console.log(`Field selector "${fieldConfig.selector}" for ${fieldName} not working, trying fallback...`);
|
|
2260
2100
|
const newFieldSelector = yield this.generateSelectorFromFallback(page, fieldConfig.fallbackSelector, false, scrapeListConfig.listSelector);
|
|
2261
2101
|
if (newFieldSelector) {
|
|
@@ -2263,14 +2103,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2263
2103
|
fieldConfig.selector = newFieldSelector;
|
|
2264
2104
|
hasChanges = true;
|
|
2265
2105
|
}
|
|
2106
|
+
else {
|
|
2107
|
+
this.log(`WARNING: Failed to autoheal field "${fieldName}" - selector broken and fallback failed. Data for this field may be incomplete.`, logger_1.Level.WARN);
|
|
2108
|
+
this.trackAutohealFailure(`Field "${fieldName}" autoheal failed - both primary selector and fallback selector failed`);
|
|
2109
|
+
}
|
|
2110
|
+
}
|
|
2111
|
+
else if (!works) {
|
|
2112
|
+
this.log(`WARNING: Field "${fieldName}" selector not working and no fallback available. Data for this field may be incomplete.`, logger_1.Level.WARN);
|
|
2113
|
+
this.trackAutohealFailure(`Field "${fieldName}" selector broken with no fallback defined`);
|
|
2266
2114
|
}
|
|
2267
2115
|
}
|
|
2268
2116
|
}
|
|
2269
|
-
// Validate pagination selector if it exists and is not empty
|
|
2270
2117
|
if (scrapeListConfig.pagination &&
|
|
2271
2118
|
scrapeListConfig.pagination.selector &&
|
|
2272
2119
|
scrapeListConfig.pagination.selector.trim() !== '') {
|
|
2273
|
-
// Handle comma-separated pagination selectors
|
|
2274
2120
|
const paginationSelectors = scrapeListConfig.pagination.selector.split(',').map(s => s.trim());
|
|
2275
2121
|
let workingSelector = null;
|
|
2276
2122
|
for (const selector of paginationSelectors) {
|
|
@@ -2311,19 +2157,34 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2311
2157
|
return __awaiter(this, void 0, void 0, function* () {
|
|
2312
2158
|
let hasChanges = false;
|
|
2313
2159
|
try {
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2160
|
+
const schemaEntries = Object.entries(scrapeSchemaConfig);
|
|
2161
|
+
const selectorTests = yield Promise.all(schemaEntries
|
|
2162
|
+
.filter(([_, fieldConfig]) => fieldConfig.selector)
|
|
2163
|
+
.map((_a) => __awaiter(this, [_a], void 0, function* ([fieldName, fieldConfig]) {
|
|
2164
|
+
return ({
|
|
2165
|
+
fieldName,
|
|
2166
|
+
fieldConfig,
|
|
2167
|
+
works: yield this.testSelectorWorks(page, fieldConfig.selector, false)
|
|
2168
|
+
});
|
|
2169
|
+
})));
|
|
2170
|
+
for (const { fieldName, fieldConfig, works } of selectorTests) {
|
|
2171
|
+
if (!works && fieldConfig.fallbackSelector) {
|
|
2172
|
+
console.log(`Schema field selector "${fieldConfig.selector}" for ${fieldName} not working, trying fallback...`);
|
|
2173
|
+
const newSelector = yield this.generateSelectorFromFallback(page, fieldConfig.fallbackSelector, false);
|
|
2174
|
+
if (newSelector) {
|
|
2175
|
+
console.log(`Updated schema field selector for ${fieldName}: ${fieldConfig.selector} -> ${newSelector}`);
|
|
2176
|
+
fieldConfig.selector = newSelector;
|
|
2177
|
+
hasChanges = true;
|
|
2178
|
+
}
|
|
2179
|
+
else {
|
|
2180
|
+
this.log(`WARNING: Failed to autoheal schema field "${fieldName}" - selector broken and fallback failed. Data for this field may be incomplete.`, logger_1.Level.WARN);
|
|
2181
|
+
this.trackAutohealFailure(`Schema field "${fieldName}" autoheal failed - both primary selector and fallback selector failed`);
|
|
2325
2182
|
}
|
|
2326
2183
|
}
|
|
2184
|
+
else if (!works) {
|
|
2185
|
+
this.log(`WARNING: Schema field "${fieldName}" selector not working and no fallback available. Data for this field may be incomplete.`, logger_1.Level.WARN);
|
|
2186
|
+
this.trackAutohealFailure(`Schema field "${fieldName}" selector broken with no fallback defined`);
|
|
2187
|
+
}
|
|
2327
2188
|
}
|
|
2328
2189
|
}
|
|
2329
2190
|
catch (error) {
|
|
@@ -2343,10 +2204,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2343
2204
|
const modifiedAction = JSON.parse(JSON.stringify(action));
|
|
2344
2205
|
let totalChanges = 0;
|
|
2345
2206
|
try {
|
|
2346
|
-
// Process each action in the 'what' array
|
|
2347
2207
|
for (let i = 0; i < modifiedAction.what.length; i++) {
|
|
2348
2208
|
const whatAction = modifiedAction.what[i];
|
|
2349
|
-
// Handle scrapeList actions
|
|
2350
2209
|
if (whatAction.action === 'scrapeList' && whatAction.args && whatAction.args[0]) {
|
|
2351
2210
|
console.log(`Validating scrapeList action...`);
|
|
2352
2211
|
const hasChanges = yield this.validateScrapeListAction(whatAction.args[0], page);
|
|
@@ -2355,7 +2214,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2355
2214
|
console.log(`Fixed scrapeList selectors`);
|
|
2356
2215
|
}
|
|
2357
2216
|
}
|
|
2358
|
-
// Handle scrapeSchema actions
|
|
2359
2217
|
if (whatAction.action === 'scrapeSchema' && whatAction.args && whatAction.args[0]) {
|
|
2360
2218
|
console.log(`Validating scrapeSchema action...`);
|
|
2361
2219
|
const hasChanges = yield this.validateScrapeSchemaAction(whatAction.args[0], page);
|
|
@@ -2405,8 +2263,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2405
2263
|
else {
|
|
2406
2264
|
listElements = Array.from(document.querySelectorAll(selector));
|
|
2407
2265
|
}
|
|
2408
|
-
// Extract URLs from the first 'limit' elements that match the selector
|
|
2409
|
-
// The limit corresponds to the number of items that were scraped
|
|
2410
2266
|
const elementsToProcess = listElements.slice(0, limit);
|
|
2411
2267
|
elementsToProcess.forEach(element => {
|
|
2412
2268
|
const urls = [];
|
|
@@ -2433,9 +2289,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2433
2289
|
* Workflow is bottom-to-top, so we scan from end to start.
|
|
2434
2290
|
*/
|
|
2435
2291
|
buildDeepExtractionHierarchy(currentWorkflow) {
|
|
2436
|
-
var _a, _b;
|
|
2292
|
+
var _a, _b, _c;
|
|
2437
2293
|
const hierarchy = [];
|
|
2438
|
-
// Find all goto action indices with their patterns
|
|
2439
2294
|
const gotoData = [];
|
|
2440
2295
|
currentWorkflow.forEach((pair, index) => {
|
|
2441
2296
|
var _a;
|
|
@@ -2482,12 +2337,16 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2482
2337
|
}
|
|
2483
2338
|
let sourceActionName = '';
|
|
2484
2339
|
let sourceActionType = 'scrapeList';
|
|
2340
|
+
let deepExtractionLimit = undefined;
|
|
2485
2341
|
if (i === uniqueGotos.length - 1) {
|
|
2486
|
-
const scrapeListBefore = currentWorkflow.slice(gotoIndex
|
|
2342
|
+
const scrapeListBefore = currentWorkflow.slice(0, gotoIndex).reverse().find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
|
|
2487
2343
|
if (scrapeListBefore) {
|
|
2488
2344
|
const scrapeListAction = scrapeListBefore.what.find(action => action.action === 'scrapeList');
|
|
2489
2345
|
sourceActionName = ((_b = (_a = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.name) || (scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.name) || '';
|
|
2490
2346
|
sourceActionType = 'scrapeList';
|
|
2347
|
+
if (((_c = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _c === void 0 ? void 0 : _c[0]) && typeof scrapeListAction.args[0] === 'object') {
|
|
2348
|
+
deepExtractionLimit = scrapeListAction.args[0].deepExtractionLimit;
|
|
2349
|
+
}
|
|
2491
2350
|
}
|
|
2492
2351
|
}
|
|
2493
2352
|
else {
|
|
@@ -2499,9 +2358,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2499
2358
|
gotoPattern: String(gotoPattern),
|
|
2500
2359
|
actionsToExecute: dataExtractionActions,
|
|
2501
2360
|
sourceActionName,
|
|
2502
|
-
sourceActionType
|
|
2361
|
+
sourceActionType,
|
|
2362
|
+
deepExtractionLimit
|
|
2503
2363
|
});
|
|
2504
|
-
this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}`, logger_1.Level.LOG);
|
|
2364
|
+
this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}${deepExtractionLimit ? `, deepLimit=${deepExtractionLimit}` : ''}`, logger_1.Level.LOG);
|
|
2505
2365
|
}
|
|
2506
2366
|
return hierarchy;
|
|
2507
2367
|
}
|
|
@@ -2663,9 +2523,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2663
2523
|
scrapeListIndex,
|
|
2664
2524
|
url: matchingUrl
|
|
2665
2525
|
});
|
|
2526
|
+
if (!matchingUrl) {
|
|
2527
|
+
this.deepExtractionStats.skippedDueToPattern++;
|
|
2528
|
+
}
|
|
2666
2529
|
});
|
|
2667
2530
|
const matchedCount = urlMappings.filter(m => m.url !== null).length;
|
|
2668
|
-
this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
|
|
2669
2531
|
if (matchedCount > 0) {
|
|
2670
2532
|
const matchedMappings = urlMappings.filter(m => m.url !== null);
|
|
2671
2533
|
const sampleSize = Math.min(5, matchedMappings.length);
|
|
@@ -2727,20 +2589,17 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2727
2589
|
continue;
|
|
2728
2590
|
if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
|
|
2729
2591
|
continue;
|
|
2730
|
-
let pathMatches = true;
|
|
2731
|
-
for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
|
|
2732
|
-
if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
|
|
2733
|
-
pathMatches = false;
|
|
2734
|
-
break;
|
|
2735
|
-
}
|
|
2736
|
-
}
|
|
2737
|
-
if (!pathMatches)
|
|
2738
|
-
continue;
|
|
2739
2592
|
const urlNormalized = url.replace(/\/$/, '').toLowerCase();
|
|
2740
2593
|
if (urlNormalized === targetNormalized) {
|
|
2741
2594
|
this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
|
|
2742
2595
|
continue;
|
|
2743
2596
|
}
|
|
2597
|
+
const matched = this.matchesGotoPattern(url, String(gotoTargetPattern));
|
|
2598
|
+
if (!matched) {
|
|
2599
|
+
this.log(` ❌ Pattern mismatch for: ${url}`, logger_1.Level.LOG);
|
|
2600
|
+
continue;
|
|
2601
|
+
}
|
|
2602
|
+
this.log(` ✅ MATCHED: ${url}`, logger_1.Level.LOG);
|
|
2744
2603
|
matchingUrl = url;
|
|
2745
2604
|
break;
|
|
2746
2605
|
}
|
|
@@ -2748,9 +2607,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2748
2607
|
scrapeListIndex,
|
|
2749
2608
|
url: matchingUrl
|
|
2750
2609
|
});
|
|
2610
|
+
if (!matchingUrl) {
|
|
2611
|
+
this.deepExtractionStats.skippedDueToPattern++;
|
|
2612
|
+
}
|
|
2751
2613
|
});
|
|
2752
|
-
const matchedCount =
|
|
2753
|
-
this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
|
|
2614
|
+
const matchedCount = extractedUrls.filter(urls => urls.length > 0).length;
|
|
2754
2615
|
if (matchedCount > 0) {
|
|
2755
2616
|
const matchedMappings = urlMappings.filter(m => m.url !== null);
|
|
2756
2617
|
const sampleSize = Math.min(5, matchedMappings.length);
|
|
@@ -2773,6 +2634,16 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2773
2634
|
/**
|
|
2774
2635
|
* Helper function to check if a URL matches a goto pattern.
|
|
2775
2636
|
*/
|
|
2637
|
+
/**
|
|
2638
|
+
* Generic pattern matching for deep extraction URLs.
|
|
2639
|
+
* Works across any website by analyzing URL structure rather than relying on keywords.
|
|
2640
|
+
*
|
|
2641
|
+
* Strategy:
|
|
2642
|
+
* 1. Match URLs with same origin and path length
|
|
2643
|
+
* 2. Identify "structural" segments (numbers, short words, etc.) that should match exactly
|
|
2644
|
+
* 3. Allow other segments to vary (dynamic content like IDs, slugs, names)
|
|
2645
|
+
* 4. Skip exact matches to avoid duplicates
|
|
2646
|
+
*/
|
|
2776
2647
|
matchesGotoPattern(url, gotoPattern) {
|
|
2777
2648
|
try {
|
|
2778
2649
|
const getUrlPattern = (urlStr) => {
|
|
@@ -2797,11 +2668,42 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2797
2668
|
if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
|
|
2798
2669
|
return false;
|
|
2799
2670
|
if (urlNormalized === targetNormalized)
|
|
2800
|
-
return false;
|
|
2801
|
-
|
|
2802
|
-
|
|
2671
|
+
return false;
|
|
2672
|
+
/**
|
|
2673
|
+
* Generic heuristic to identify "structural" segments that should match exactly.
|
|
2674
|
+
* These are segments that define the URL structure, not the content.
|
|
2675
|
+
*/
|
|
2676
|
+
const isStructuralSegment = (segment, index, totalSegments) => {
|
|
2677
|
+
const normalized = segment.toLowerCase();
|
|
2678
|
+
if (/^\d+$/.test(normalized)) {
|
|
2679
|
+
return true;
|
|
2680
|
+
}
|
|
2681
|
+
if (normalized.length >= 2 && normalized.length <= 5 && /^[a-z0-9-]+$/.test(normalized)) {
|
|
2682
|
+
return true;
|
|
2683
|
+
}
|
|
2684
|
+
const wordCount = normalized.split(/[-_]/).length;
|
|
2685
|
+
if (wordCount === 2 && normalized.length <= 15) {
|
|
2686
|
+
return true;
|
|
2687
|
+
}
|
|
2688
|
+
if (/^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$/i.test(normalized)) {
|
|
2689
|
+
return false;
|
|
2690
|
+
}
|
|
2691
|
+
if (normalized.length > 20 && /^[a-z0-9-]+$/.test(normalized)) {
|
|
2803
2692
|
return false;
|
|
2804
2693
|
}
|
|
2694
|
+
if (normalized.length >= 6 && normalized.length <= 20) {
|
|
2695
|
+
return false;
|
|
2696
|
+
}
|
|
2697
|
+
return index < Math.ceil(totalSegments / 2);
|
|
2698
|
+
};
|
|
2699
|
+
for (let i = 0; i < targetPattern.pathSegments.length; i++) {
|
|
2700
|
+
const targetSegment = targetPattern.pathSegments[i];
|
|
2701
|
+
const urlSegment = urlPattern.pathSegments[i];
|
|
2702
|
+
if (isStructuralSegment(targetSegment, i, targetPattern.pathSegments.length)) {
|
|
2703
|
+
if (targetSegment.toLowerCase() !== urlSegment.toLowerCase()) {
|
|
2704
|
+
return false;
|
|
2705
|
+
}
|
|
2706
|
+
}
|
|
2805
2707
|
}
|
|
2806
2708
|
return true;
|
|
2807
2709
|
}
|
|
@@ -2822,24 +2724,52 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2822
2724
|
}
|
|
2823
2725
|
this.log(`\n=== Starting Hierarchical Deep Extraction (${hierarchy.length} level${hierarchy.length > 1 ? 's' : ''}) ===`, logger_1.Level.LOG);
|
|
2824
2726
|
this.isInDeepExtractionPhase = true;
|
|
2825
|
-
const startLevel =
|
|
2826
|
-
for (let levelIndex = startLevel; levelIndex
|
|
2727
|
+
const startLevel = 0;
|
|
2728
|
+
for (let levelIndex = startLevel; levelIndex < hierarchy.length; levelIndex++) {
|
|
2827
2729
|
const level = hierarchy[levelIndex];
|
|
2828
2730
|
const currentLevelUrls = level.urlMappings;
|
|
2829
|
-
|
|
2731
|
+
const parentLevel = levelIndex + 1 < hierarchy.length ? hierarchy[levelIndex + 1] : null;
|
|
2732
|
+
const effectiveLimit = (parentLevel === null || parentLevel === void 0 ? void 0 : parentLevel.deepExtractionLimit) || level.deepExtractionLimit;
|
|
2733
|
+
this.log(`\n=== Processing Deep Extraction Level ${levelIndex + 1}/${hierarchy.length} ===`, logger_1.Level.LOG);
|
|
2830
2734
|
this.log(`Goto pattern: ${level.gotoPattern}`, logger_1.Level.LOG);
|
|
2831
2735
|
this.log(`Actions to execute: ${level.actionsToExecute.length}`, logger_1.Level.LOG);
|
|
2832
2736
|
this.log(`URLs to process: ${currentLevelUrls.filter(m => m.url !== null).length}`, logger_1.Level.LOG);
|
|
2737
|
+
if (effectiveLimit) {
|
|
2738
|
+
this.log(`Deep extraction limit: ${effectiveLimit}`, logger_1.Level.LOG);
|
|
2739
|
+
}
|
|
2833
2740
|
if (currentLevelUrls.length === 0 || currentLevelUrls.every(u => !u.url)) {
|
|
2834
2741
|
this.log('No valid URLs at this level - stopping here', logger_1.Level.LOG);
|
|
2835
2742
|
break;
|
|
2836
2743
|
}
|
|
2837
|
-
yield this.executeDeepExtractionLevel(page, level, currentLevelUrls);
|
|
2744
|
+
yield this.executeDeepExtractionLevel(page, Object.assign(Object.assign({}, level), { deepExtractionLimit: effectiveLimit }), currentLevelUrls);
|
|
2838
2745
|
}
|
|
2839
2746
|
this.log('\n=== Hierarchical Deep Extraction Completed ===', logger_1.Level.LOG);
|
|
2747
|
+
if (this.deepExtractionStats.totalUrlsFound > 0) {
|
|
2748
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({
|
|
2749
|
+
deepExtractionStats: {
|
|
2750
|
+
totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
|
|
2751
|
+
successfulExtractions: this.deepExtractionStats.successfulExtractions,
|
|
2752
|
+
failedExtractions: this.deepExtractionStats.failedExtractions
|
|
2753
|
+
}
|
|
2754
|
+
}), 30000, 'serializableCallback (deep extraction stats)');
|
|
2755
|
+
}
|
|
2840
2756
|
}
|
|
2841
2757
|
catch (error) {
|
|
2842
2758
|
this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2759
|
+
if (this.deepExtractionStats.totalUrlsFound > 0) {
|
|
2760
|
+
try {
|
|
2761
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({
|
|
2762
|
+
deepExtractionStats: {
|
|
2763
|
+
totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
|
|
2764
|
+
successfulExtractions: this.deepExtractionStats.successfulExtractions,
|
|
2765
|
+
failedExtractions: this.deepExtractionStats.failedExtractions
|
|
2766
|
+
}
|
|
2767
|
+
}), 30000, 'serializableCallback (deep extraction stats error)');
|
|
2768
|
+
}
|
|
2769
|
+
catch (callbackError) {
|
|
2770
|
+
this.log(`Failed to send stats on error: ${callbackError.message}`, logger_1.Level.ERROR);
|
|
2771
|
+
}
|
|
2772
|
+
}
|
|
2843
2773
|
}
|
|
2844
2774
|
finally {
|
|
2845
2775
|
this.isInDeepExtractionPhase = false;
|
|
@@ -2854,17 +2784,54 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2854
2784
|
executeDeepExtractionLevel(page, level, urlMappings) {
|
|
2855
2785
|
return __awaiter(this, void 0, void 0, function* () {
|
|
2856
2786
|
try {
|
|
2857
|
-
|
|
2787
|
+
let validMappings = urlMappings.filter(m => m.url !== null);
|
|
2788
|
+
const deepExtractionLimit = level.deepExtractionLimit;
|
|
2789
|
+
if (deepExtractionLimit && validMappings.length > deepExtractionLimit) {
|
|
2790
|
+
this.log(`Found deepExtractionLimit: ${deepExtractionLimit} from parent action`, logger_1.Level.LOG);
|
|
2791
|
+
this.log(`Deep extraction limit applied: ${validMappings.length} URLs found, limiting to ${deepExtractionLimit}`, logger_1.Level.LOG);
|
|
2792
|
+
validMappings = validMappings.slice(0, deepExtractionLimit);
|
|
2793
|
+
}
|
|
2794
|
+
else if (deepExtractionLimit) {
|
|
2795
|
+
this.log(`Deep extraction limit: ${deepExtractionLimit} URLs configured (found ${validMappings.length} URLs, all within limit)`, logger_1.Level.LOG);
|
|
2796
|
+
}
|
|
2797
|
+
else {
|
|
2798
|
+
this.log(`No deep extraction limit configured, processing all ${validMappings.length} URLs`, logger_1.Level.LOG);
|
|
2799
|
+
}
|
|
2858
2800
|
if (validMappings.length === 0) {
|
|
2859
2801
|
this.log('No URLs to process for this level', logger_1.Level.LOG);
|
|
2860
2802
|
return;
|
|
2861
2803
|
}
|
|
2862
|
-
this.
|
|
2804
|
+
const isFirstLevel = this.deepExtractionStats.totalUrlsFound === 0;
|
|
2805
|
+
if (isFirstLevel) {
|
|
2806
|
+
this.deepExtractionStats.totalUrlsFound = validMappings.length;
|
|
2807
|
+
}
|
|
2808
|
+
else {
|
|
2809
|
+
this.deepExtractionStats.totalUrlsFound += validMappings.length;
|
|
2810
|
+
}
|
|
2811
|
+
this.log(`Processing ${validMappings.length} URLs for deep extraction`, logger_1.Level.LOG);
|
|
2863
2812
|
for (const mapping of validMappings) {
|
|
2864
2813
|
try {
|
|
2865
2814
|
this.log(`[${mapping.index}] Navigating to: ${mapping.url}`, logger_1.Level.LOG);
|
|
2866
2815
|
yield page.goto(mapping.url);
|
|
2867
2816
|
yield page.waitForLoadState('networkidle', { timeout: 30000 });
|
|
2817
|
+
const getTotalItems = () => {
|
|
2818
|
+
let total = 0;
|
|
2819
|
+
for (const key in this.serializableDataByType.scrapeSchema || {}) {
|
|
2820
|
+
const schemaData = this.serializableDataByType.scrapeSchema[key];
|
|
2821
|
+
if (Array.isArray(schemaData)) {
|
|
2822
|
+
total += schemaData.length;
|
|
2823
|
+
}
|
|
2824
|
+
else if (schemaData && typeof schemaData === 'object') {
|
|
2825
|
+
total += Object.keys(schemaData).length > 0 ? 1 : 0;
|
|
2826
|
+
}
|
|
2827
|
+
}
|
|
2828
|
+
for (const key in this.serializableDataByType.scrapeList || {}) {
|
|
2829
|
+
const listData = this.serializableDataByType.scrapeList[key];
|
|
2830
|
+
total += Array.isArray(listData) ? listData.length : 0;
|
|
2831
|
+
}
|
|
2832
|
+
return total;
|
|
2833
|
+
};
|
|
2834
|
+
const itemCountBefore = getTotalItems();
|
|
2868
2835
|
for (let i = level.actionsToExecute.length - 1; i >= 0; i--) {
|
|
2869
2836
|
const actionPair = level.actionsToExecute[i];
|
|
2870
2837
|
if (this.isAborted) {
|
|
@@ -2879,10 +2846,34 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2879
2846
|
yield this.carryOutSteps(page, filteredActions);
|
|
2880
2847
|
}
|
|
2881
2848
|
}
|
|
2882
|
-
|
|
2849
|
+
const itemCountAfter = getTotalItems();
|
|
2850
|
+
const dataWasExtracted = itemCountAfter > itemCountBefore;
|
|
2851
|
+
if (dataWasExtracted) {
|
|
2852
|
+
this.log(`[${mapping.index}] Completed - Data extracted successfully (${itemCountAfter - itemCountBefore} items)`, logger_1.Level.LOG);
|
|
2853
|
+
this.deepExtractionStats.successfulExtractions++;
|
|
2854
|
+
}
|
|
2855
|
+
else {
|
|
2856
|
+
this.log(`[${mapping.index}] Completed - No data extracted`, logger_1.Level.WARN);
|
|
2857
|
+
this.deepExtractionStats.failedExtractions++;
|
|
2858
|
+
}
|
|
2859
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({
|
|
2860
|
+
deepExtractionStats: {
|
|
2861
|
+
totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
|
|
2862
|
+
successfulExtractions: this.deepExtractionStats.successfulExtractions,
|
|
2863
|
+
failedExtractions: this.deepExtractionStats.failedExtractions
|
|
2864
|
+
}
|
|
2865
|
+
}), 30000, 'serializableCallback (deep extraction item stats)');
|
|
2883
2866
|
}
|
|
2884
2867
|
catch (error) {
|
|
2885
2868
|
this.log(`[${mapping.index}] Failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2869
|
+
this.deepExtractionStats.failedExtractions++;
|
|
2870
|
+
yield this.callWithTimeout(() => this.options.serializableCallback({
|
|
2871
|
+
deepExtractionStats: {
|
|
2872
|
+
totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
|
|
2873
|
+
successfulExtractions: this.deepExtractionStats.successfulExtractions,
|
|
2874
|
+
failedExtractions: this.deepExtractionStats.failedExtractions
|
|
2875
|
+
}
|
|
2876
|
+
}), 30000, 'serializableCallback (deep extraction failed item stats)');
|
|
2886
2877
|
}
|
|
2887
2878
|
}
|
|
2888
2879
|
}
|
|
@@ -2903,10 +2894,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2903
2894
|
catch (error) {
|
|
2904
2895
|
this.log(`Failed to apply ad-blocker: ${error.message}`, logger_1.Level.ERROR);
|
|
2905
2896
|
}
|
|
2906
|
-
const usedActions = [];
|
|
2907
|
-
let selectors = [];
|
|
2908
2897
|
let lastAction = null;
|
|
2909
|
-
let actionId = -1;
|
|
2910
2898
|
let repeatCount = 0;
|
|
2911
2899
|
/**
|
|
2912
2900
|
* Enables the interpreter functionality for popup windows.
|
|
@@ -2919,12 +2907,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2919
2907
|
p.on('popup', popupHandler);
|
|
2920
2908
|
/* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
|
|
2921
2909
|
let loopIterations = 0;
|
|
2922
|
-
const MAX_LOOP_ITERATIONS = 1000;
|
|
2910
|
+
const MAX_LOOP_ITERATIONS = 1000;
|
|
2923
2911
|
let consecutiveFailures = 0;
|
|
2924
2912
|
const MAX_CONSECUTIVE_FAILURES = 10;
|
|
2925
2913
|
const startTime = Date.now();
|
|
2926
|
-
const MAX_EXECUTION_TIME = 30 * 60 * 1000;
|
|
2927
|
-
// Cleanup function to remove popup listener
|
|
2914
|
+
const MAX_EXECUTION_TIME = 30 * 60 * 1000;
|
|
2928
2915
|
const cleanup = () => {
|
|
2929
2916
|
try {
|
|
2930
2917
|
if (!p.isClosed()) {
|
|
@@ -2935,32 +2922,26 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2935
2922
|
}
|
|
2936
2923
|
};
|
|
2937
2924
|
while (true) {
|
|
2938
|
-
// Multiple circuit breakers to prevent infinite loops
|
|
2939
2925
|
if (++loopIterations > MAX_LOOP_ITERATIONS) {
|
|
2940
2926
|
this.log('Maximum loop iterations reached, terminating to prevent infinite loop', logger_1.Level.ERROR);
|
|
2941
2927
|
cleanup();
|
|
2942
2928
|
return;
|
|
2943
2929
|
}
|
|
2944
|
-
// Time-based circuit breaker
|
|
2945
2930
|
if (Date.now() - startTime > MAX_EXECUTION_TIME) {
|
|
2946
2931
|
this.log('Maximum execution time reached (30 minutes), terminating workflow', logger_1.Level.ERROR);
|
|
2947
2932
|
cleanup();
|
|
2948
2933
|
return;
|
|
2949
2934
|
}
|
|
2950
|
-
// Failure-based circuit breaker
|
|
2951
2935
|
if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
|
|
2952
2936
|
this.log('Too many consecutive failures, terminating to prevent hang', logger_1.Level.ERROR);
|
|
2953
2937
|
cleanup();
|
|
2954
2938
|
return;
|
|
2955
2939
|
}
|
|
2956
|
-
// Check abort flag immediately
|
|
2957
2940
|
if (this.isAborted) {
|
|
2958
2941
|
this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
|
|
2959
2942
|
cleanup();
|
|
2960
2943
|
return;
|
|
2961
2944
|
}
|
|
2962
|
-
// Checks whether the page was closed from outside,
|
|
2963
|
-
// or the workflow execution has been stopped via `interpreter.stop()`
|
|
2964
2945
|
if (p.isClosed() || !this.stopper) {
|
|
2965
2946
|
cleanup();
|
|
2966
2947
|
return;
|
|
@@ -2992,55 +2973,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2992
2973
|
cleanup();
|
|
2993
2974
|
return;
|
|
2994
2975
|
}
|
|
2995
|
-
// const newSelectors = this.getSelectors(workflowCopy);
|
|
2996
|
-
// newSelectors.forEach(selector => {
|
|
2997
|
-
// if (!selectors.includes(selector)) {
|
|
2998
|
-
// selectors.push(selector);
|
|
2999
|
-
// }
|
|
3000
|
-
// });
|
|
3001
|
-
// let pageState = {};
|
|
3002
|
-
// let getStateTest = "Hello";
|
|
3003
|
-
// try {
|
|
3004
|
-
// pageState = await this.getState(p, workflowCopy, selectors);
|
|
3005
|
-
// selectors = [];
|
|
3006
|
-
// console.log("Empty selectors:", selectors)
|
|
3007
|
-
// } catch (e: any) {
|
|
3008
|
-
// this.log('The browser has been closed.');
|
|
3009
|
-
// return;
|
|
3010
|
-
// }
|
|
3011
|
-
// if (this.options.debug) {
|
|
3012
|
-
// this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
|
|
3013
|
-
// }
|
|
3014
|
-
// const actionId = workflow.findIndex((step) => {
|
|
3015
|
-
// const isApplicable = this.applicable(step.where, pageState, usedActions);
|
|
3016
|
-
// console.log("-------------------------------------------------------------");
|
|
3017
|
-
// console.log(`Where:`, step.where);
|
|
3018
|
-
// console.log(`Page state:`, pageState);
|
|
3019
|
-
// console.log(`Match result: ${isApplicable}`);
|
|
3020
|
-
// console.log("-------------------------------------------------------------");
|
|
3021
|
-
// return isApplicable;
|
|
3022
|
-
// });
|
|
3023
|
-
// if (workflowCopy[0]) {
|
|
3024
|
-
// if (workflowCopy[0].what[1].action === 'scrapeSchema') {
|
|
3025
|
-
// const schema = workflowCopy[0].what[1].args[0];
|
|
3026
|
-
// await p.goto(workflowCopy[0].where.url.toString())
|
|
3027
|
-
// await p.waitForLoadState();
|
|
3028
|
-
// const changes = await this.detectElementChanges(p, schema);
|
|
3029
|
-
// console.log("Page URL: ", workflowCopy[0].where.url.toString());
|
|
3030
|
-
// console.log("SCHEMA CHANGES:", changes);
|
|
3031
|
-
// }
|
|
3032
|
-
// }
|
|
3033
|
-
// actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
|
|
3034
|
-
// if (actionId !== -1 && workflowCopy[actionId]) {
|
|
3035
|
-
// workflowCopy[actionId] = await this.validateWorkflowAction(p, workflowCopy[actionId]);
|
|
3036
|
-
// }
|
|
3037
2976
|
const actionId = workflowCopy.length - 1;
|
|
3038
2977
|
const action = workflowCopy[actionId];
|
|
3039
2978
|
console.log("MATCHED ACTION:", action);
|
|
3040
2979
|
console.log("MATCHED ACTION ID:", actionId);
|
|
3041
2980
|
this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
|
|
3042
|
-
if (action) {
|
|
3043
|
-
// Check abort flag before executing action
|
|
2981
|
+
if (action) {
|
|
3044
2982
|
if (this.isAborted) {
|
|
3045
2983
|
this.log('Workflow aborted before action execution', logger_1.Level.WARN);
|
|
3046
2984
|
return;
|
|
@@ -3054,27 +2992,43 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3054
2992
|
return;
|
|
3055
2993
|
}
|
|
3056
2994
|
lastAction = action;
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
|
|
3060
|
-
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3066
|
-
|
|
3067
|
-
|
|
3068
|
-
|
|
3069
|
-
|
|
3070
|
-
|
|
3071
|
-
|
|
3072
|
-
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
2995
|
+
const MAX_ACTION_RETRIES = 3;
|
|
2996
|
+
let actionRetries = 0;
|
|
2997
|
+
let actionSucceeded = false;
|
|
2998
|
+
while (actionRetries < MAX_ACTION_RETRIES && !actionSucceeded) {
|
|
2999
|
+
try {
|
|
3000
|
+
const validatedAction = yield this.validateAndFixSelectors(p, action);
|
|
3001
|
+
console.log("Carrying out:", validatedAction.what);
|
|
3002
|
+
yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
|
|
3003
|
+
workflowCopy.splice(actionId, 1);
|
|
3004
|
+
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
|
3005
|
+
this.executedActions++;
|
|
3006
|
+
const percentage = Math.round((this.executedActions / this.totalActions) * 100);
|
|
3007
|
+
if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.progressUpdate) {
|
|
3008
|
+
this.options.debugChannel.progressUpdate(this.executedActions, this.totalActions, percentage);
|
|
3009
|
+
}
|
|
3010
|
+
actionSucceeded = true;
|
|
3011
|
+
consecutiveFailures = 0;
|
|
3012
|
+
loopIterations = Math.max(0, loopIterations - 10);
|
|
3013
|
+
if (loopIterations % 10 === 0) {
|
|
3014
|
+
yield new Promise(resolve => setImmediate(resolve));
|
|
3015
|
+
}
|
|
3016
|
+
}
|
|
3017
|
+
catch (e) {
|
|
3018
|
+
actionRetries++;
|
|
3019
|
+
this.log(e, logger_1.Level.ERROR);
|
|
3020
|
+
if (actionRetries < MAX_ACTION_RETRIES) {
|
|
3021
|
+
this.log(`Retrying action (attempt ${actionRetries + 1}/${MAX_ACTION_RETRIES})`, logger_1.Level.WARN);
|
|
3022
|
+
yield new Promise(resolve => setTimeout(resolve, 1000 * actionRetries));
|
|
3023
|
+
}
|
|
3024
|
+
else {
|
|
3025
|
+
this.log(`Action failed after ${MAX_ACTION_RETRIES} retries`, logger_1.Level.ERROR);
|
|
3026
|
+
consecutiveFailures++;
|
|
3027
|
+
yield new Promise(resolve => setTimeout(resolve, Math.min(1000, consecutiveFailures * 200)));
|
|
3028
|
+
}
|
|
3029
|
+
}
|
|
3030
|
+
}
|
|
3031
|
+
if (!actionSucceeded) {
|
|
3078
3032
|
continue;
|
|
3079
3033
|
}
|
|
3080
3034
|
}
|
|
@@ -3088,8 +3042,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3088
3042
|
}
|
|
3089
3043
|
ensureScriptsLoaded(page) {
|
|
3090
3044
|
return __awaiter(this, void 0, void 0, function* () {
|
|
3045
|
+
let scriptsLoaded = false;
|
|
3091
3046
|
try {
|
|
3092
|
-
// Add timeout to prevent hanging on script evaluation
|
|
3093
3047
|
const evaluationPromise = page.evaluate(() => typeof window.scrape === 'function' &&
|
|
3094
3048
|
typeof window.scrapeSchema === 'function' &&
|
|
3095
3049
|
typeof window.scrapeList === 'function' &&
|
|
@@ -3097,22 +3051,21 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3097
3051
|
typeof window.scrollDown === 'function' &&
|
|
3098
3052
|
typeof window.scrollUp === 'function');
|
|
3099
3053
|
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Script check timeout')), 3000));
|
|
3100
|
-
|
|
3054
|
+
scriptsLoaded = yield Promise.race([
|
|
3101
3055
|
evaluationPromise,
|
|
3102
3056
|
timeoutPromise
|
|
3103
3057
|
]);
|
|
3104
|
-
if (!isScriptLoaded) {
|
|
3105
|
-
yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
|
|
3106
|
-
}
|
|
3107
3058
|
}
|
|
3108
3059
|
catch (error) {
|
|
3109
|
-
|
|
3110
|
-
|
|
3060
|
+
this.log(`Script check failed or timed out: ${error.message}`, logger_1.Level.WARN);
|
|
3061
|
+
scriptsLoaded = false;
|
|
3062
|
+
}
|
|
3063
|
+
if (!scriptsLoaded) {
|
|
3111
3064
|
try {
|
|
3112
3065
|
yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
|
|
3113
3066
|
}
|
|
3114
3067
|
catch (scriptError) {
|
|
3115
|
-
this.log(`Failed to add script: ${scriptError.message}`, logger_1.Level.ERROR);
|
|
3068
|
+
this.log(`Failed to add scraper script: ${scriptError.message}`, logger_1.Level.ERROR);
|
|
3116
3069
|
}
|
|
3117
3070
|
}
|
|
3118
3071
|
});
|
|
@@ -3127,10 +3080,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3127
3080
|
*/
|
|
3128
3081
|
run(page, params) {
|
|
3129
3082
|
return __awaiter(this, void 0, void 0, function* () {
|
|
3083
|
+
var _a;
|
|
3130
3084
|
this.log('Starting the workflow.', logger_1.Level.LOG);
|
|
3131
3085
|
const context = page.context();
|
|
3132
3086
|
page.setDefaultNavigationTimeout(100000);
|
|
3133
|
-
// Check proxy settings from context options
|
|
3134
3087
|
const contextOptions = context._options;
|
|
3135
3088
|
const hasProxy = !!(contextOptions === null || contextOptions === void 0 ? void 0 : contextOptions.proxy);
|
|
3136
3089
|
this.log(`Proxy settings: ${hasProxy ? `Proxy is configured...` : 'No proxy configured...'}`);
|
|
@@ -3146,6 +3099,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3146
3099
|
* `this.workflow` with the parameters initialized.
|
|
3147
3100
|
*/
|
|
3148
3101
|
this.initializedWorkflow = preprocessor_1.default.initWorkflow(this.workflow, params);
|
|
3102
|
+
this.totalActions = this.initializedWorkflow.length;
|
|
3103
|
+
this.executedActions = 0;
|
|
3104
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.progressUpdate) {
|
|
3105
|
+
this.options.debugChannel.progressUpdate(0, this.totalActions, 0);
|
|
3106
|
+
}
|
|
3149
3107
|
yield this.ensureScriptsLoaded(page);
|
|
3150
3108
|
this.stopper = () => {
|
|
3151
3109
|
this.stopper = null;
|
|
@@ -3173,7 +3131,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3173
3131
|
cleanup() {
|
|
3174
3132
|
return __awaiter(this, void 0, void 0, function* () {
|
|
3175
3133
|
try {
|
|
3176
|
-
// Stop any running workflows first
|
|
3177
3134
|
if (this.stopper) {
|
|
3178
3135
|
try {
|
|
3179
3136
|
yield this.stop();
|
|
@@ -3182,7 +3139,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3182
3139
|
this.log(`Error stopping workflow during cleanup: ${error.message}`, logger_1.Level.WARN);
|
|
3183
3140
|
}
|
|
3184
3141
|
}
|
|
3185
|
-
// Clear ad-blocker resources
|
|
3186
3142
|
if (this.blocker) {
|
|
3187
3143
|
try {
|
|
3188
3144
|
this.blocker = null;
|
|
@@ -3192,12 +3148,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
3192
3148
|
this.log(`Error cleaning up ad-blocker: ${error.message}`, logger_1.Level.WARN);
|
|
3193
3149
|
}
|
|
3194
3150
|
}
|
|
3195
|
-
// Clear accumulated data to free memory
|
|
3196
3151
|
this.cumulativeResults = [];
|
|
3197
3152
|
this.autohealFailures = [];
|
|
3198
3153
|
this.namedResults = {};
|
|
3199
3154
|
this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} };
|
|
3200
|
-
// Reset state
|
|
3201
3155
|
this.isAborted = false;
|
|
3202
3156
|
this.initializedWorkflow = null;
|
|
3203
3157
|
this.log('Interpreter cleanup completed', logger_1.Level.DEBUG);
|