mx-cloud 0.0.23 → 0.0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +53 -0
- package/build/interpret.js +784 -92
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
@@ -38,6 +38,7 @@ interface InterpreterOptions {
 serializableCallback: (output: any) => (void | Promise<void>);
 binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
 debug: boolean;
+robotType?: 'extract' | 'scrape' | 'deep-extract';
 debugChannel: Partial<{
 activeId: (id: number) => void;
 debugMessage: (msg: string) => void;
@@ -63,6 +64,8 @@ export default class Interpreter extends EventEmitter {
 private screenshotCounter;
 private scrapeListCounter;
 private serializableDataByType;
+private pendingDeepExtraction;
+private isInDeepExtractionPhase;
 constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
 trackAutohealFailure(error: string): void;
 private applyAdBlocker;
@@ -148,6 +151,56 @@ export default class Interpreter extends EventEmitter {
 * @returns {Promise<WhereWhatPair>} - The potentially modified action
 */
 private validateAndFixSelectors;
+/**
+* Extracts URLs from the current page's list elements.
+* Used during pagination to maintain sync between scraped results and extracted URLs.
+*
+* @param page - Playwright page object
+* @param listSelector - The selector used to identify list elements
+* @param limit - Maximum number of elements to process (should match number of scraped items)
+* @returns Array of URL arrays, one per list element
+*/
+private extractUrlsFromCurrentPage;
+/**
+* Builds a hierarchical deep extraction plan by analyzing the workflow structure.
+* Identifies goto actions and determines what actions to execute at each level.
+* Workflow is bottom-to-top, so we scan from end to start.
+*/
+private buildDeepExtractionHierarchy;
+/**
+* Extracts hrefs directly from the page based on scrapeSchema selectors.
+* Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
+* This is called after scrapeSchema executes to capture hrefs for deep extraction.
+*/
+private extractHrefsFromPage;
+/**
+* Filters URLs for deep extraction based on the goto action pattern.
+* This is called immediately after the first capture action (scrapeList).
+* Returns the filtered URL mappings that should be processed after workflow completion.
+* Each mapping maintains alignment with the original scrapeList index.
+*/
+private filterDeepExtractionUrls;
+/**
+* Filters pre-extracted URLs for deep extraction based on the goto action pattern.
+* This is used for paginated lists where URLs were extracted during pagination.
+* Returns the filtered URL mappings that maintain alignment with scrapeList indices.
+*/
+private filterDeepExtractionUrlsFromExtracted;
+/**
+* Helper function to check if a URL matches a goto pattern.
+*/
+private matchesGotoPattern;
+/**
+* Executes hierarchical deep extraction by processing each level recursively.
+* URLs are already stored in each hierarchy level's urlMappings during workflow execution.
+*/
+private executeHierarchicalDeepExtraction;
+/**
+* Executes deep extraction for a single level.
+* URLs are already extracted and stored in hierarchy during workflow execution.
+* This function just navigates to URLs and executes the capture actions.
+*/
+private executeDeepExtractionLevel;
 private runLoop;
 private ensureScriptsLoaded;
 /**
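The only public API change in interpret.d.ts is the optional robotType flag on InterpreterOptions; everything else added here is a private member. A minimal usage sketch, assuming a WorkflowFile value obtained elsewhere (the import path below is illustrative and not taken from this diff):

    import Interpreter from 'mx-cloud/build/interpret'; // hypothetical path

    const interpreter = new Interpreter(workflow, {
        // new in this release; omit it to keep pre-0.0.25 behaviour
        robotType: 'deep-extract',
        serializableCallback: (output) => console.log(output),
        binaryCallback: async (output, mimeType) => { /* persist binary output */ },
        debug: false,
    });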
package/build/interpret.js
CHANGED
@@ -74,6 +74,8 @@ class Interpreter extends events_1.EventEmitter {
 scrapeList: {},
 scrapeSchema: {}
 };
+this.pendingDeepExtraction = null;
+this.isInDeepExtractionPhase = false;
 this.workflow = workflow.workflow;
 this.initializedWorkflow = null;
 this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
@@ -338,7 +340,7 @@ class Interpreter extends events_1.EventEmitter {
 * @param page Playwright Page object
 * @param steps Array of actions.
 */
-carryOutSteps(page, steps) {
+carryOutSteps(page, steps, currentWorkflow) {
 return __awaiter(this, void 0, void 0, function* () {
 var _a, _b;
 // Check abort flag at start of execution
@@ -430,9 +432,8 @@ class Interpreter extends events_1.EventEmitter {
 const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
 yield this.options.serializableCallback(scrapeResults);
 }),
-scrapeSchema: (
+scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
 var _a;
-// Check abort flag at start of scraping
 if (this.isAborted) {
 this.log('Workflow aborted, stopping scrapeSchema', logger_1.Level.WARN);
 return;
@@ -451,7 +452,6 @@ class Interpreter extends events_1.EventEmitter {
 }
 const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
 if (this.cumulativeResults.length === 0) {
-// First execution - create initial row
 const newRow = {};
 Object.entries(resultToProcess).forEach(([key, value]) => {
 if (value !== undefined) {
@@ -461,12 +461,10 @@ class Interpreter extends events_1.EventEmitter {
 this.cumulativeResults.push(newRow);
 }
 else {
-// Check if any keys from new result already exist in the last row
 const lastRow = this.cumulativeResults[this.cumulativeResults.length - 1];
 const newResultKeys = Object.keys(resultToProcess).filter(key => resultToProcess[key] !== undefined);
 const hasRepeatedKeys = newResultKeys.some(key => lastRow.hasOwnProperty(key));
 if (hasRepeatedKeys) {
-// Keys are repeated - create a new row
 const newRow = {};
 Object.entries(resultToProcess).forEach(([key, value]) => {
 if (value !== undefined) {
@@ -476,7 +474,6 @@ class Interpreter extends events_1.EventEmitter {
 this.cumulativeResults.push(newRow);
 }
 else {
-// No repeated keys - merge with the last row
 Object.entries(resultToProcess).forEach(([key, value]) => {
 if (value !== undefined) {
 lastRow[key] = value;
@@ -484,30 +481,102 @@ class Interpreter extends events_1.EventEmitter {
 });
 }
 }
-console.log("Total accumulated rows:", this.cumulativeResults.length);
-console.log("Current results:", this.cumulativeResults);
-// ✅ Append schema results under "scrapeSchema" → name
 const actionType = "scrapeSchema";
-const
+const name = actionName || "Texts";
 if (!this.namedResults[actionType])
 this.namedResults[actionType] = {};
-this.namedResults[actionType][
+this.namedResults[actionType][name] = this.cumulativeResults;
 if (!this.serializableDataByType[actionType])
 this.serializableDataByType[actionType] = {};
-if (!this.serializableDataByType[actionType][
-this.serializableDataByType[actionType][
+if (!this.serializableDataByType[actionType][name]) {
+this.serializableDataByType[actionType][name] = [];
 }
-
-this.serializableDataByType[actionType][actionName] = [...this.cumulativeResults];
-// now emit full structured object
+this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
 yield this.options.serializableCallback({
 scrapeList: this.serializableDataByType.scrapeList,
 scrapeSchema: this.serializableDataByType.scrapeSchema
 });
+if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+if (!this.pendingDeepExtraction) {
+console.log('DEBUG: Building hierarchical deep extraction plan from scrapeSchema...');
+const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+if (hierarchyData.length > 0) {
+const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+this.log(`Root scrapeSchema will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+// Extract URLs from schema fields
+const urls = yield this.extractHrefsFromPage(page, schema);
+this.log(`scrapeSchema extracted ${urls.length} URLs from field selectors`, logger_1.Level.LOG);
+// Filter URLs against pattern
+const rootUrlMappings = urls
+.map((url, index) => ({
+scrapeListIndex: index,
+url: this.matchesGotoPattern(url, nextLevelGotoPattern) ? url : null
+}))
+.filter(m => m.url !== null);
+this.log(`Matched ${rootUrlMappings.length} URLs against pattern ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+this.pendingDeepExtraction = {
+page,
+hierarchy: hierarchyData.map((level, idx) => ({
+gotoPattern: level.gotoPattern,
+actionsToExecute: level.actionsToExecute,
+urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+}))
+};
+}
+else {
+console.log('DEBUG: No goto actions found, deep extraction skipped');
+}
+}
+else {
+this.log(`[Deep Extract] scrapeSchema "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+const hierarchy = this.pendingDeepExtraction.hierarchy;
+if (hierarchy && hierarchy.length > 0) {
+let targetLevelIndex = -1;
+for (let i = hierarchy.length - 1; i >= 0; i--) {
+if (hierarchy[i].urlMappings.length === 0) {
+targetLevelIndex = i;
+break;
+}
+}
+if (targetLevelIndex >= 0) {
+const targetGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${targetGotoPattern}`, logger_1.Level.LOG);
+const urls = yield this.extractHrefsFromPage(page, schema);
+this.log(`[Deep Extract] Extracted ${urls.length} URLs from scrapeSchema field selectors`, logger_1.Level.LOG);
+const urlMappings = urls
+.map((url, index) => ({
+index,
+url: this.matchesGotoPattern(url, targetGotoPattern) ? url : null
+}))
+.filter(m => m.url !== null);
+if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+if (newUrls.length > 0) {
+const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+this.log(`[Deep Extract] Merged ${newUrls.length} new URLs from scrapeSchema`, logger_1.Level.LOG);
+}
+}
+else {
+hierarchy[targetLevelIndex].urlMappings = urlMappings;
+}
+this.log(`[Deep Extract] Stored ${urlMappings.length} matching URLs`, logger_1.Level.LOG);
+if (urlMappings.length > 0) {
+const sampleSize = Math.min(3, urlMappings.length);
+this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${urlMappings.length}):`, logger_1.Level.LOG);
+urlMappings.slice(0, sampleSize).forEach((mapping, idx) => {
+this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+});
+}
+}
+}
+}
+}
 }),
-scrapeList: (
+scrapeList: (config_1, ...args_1) => __awaiter(this, [config_1, ...args_1], void 0, function* (config, actionName = "") {
 var _a, _b;
-// Check abort flag at start of scraping
 if (this.isAborted) {
 this.log('Workflow aborted, stopping scrapeList', logger_1.Level.WARN);
 return;
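For readers following the new scrapeSchema branch above: the pending deep-extraction state it builds is a plain object whose shape can be read off the assignments in the diff. A rough sketch for orientation only (these types are not exported by the package):

    interface UrlMapping { index: number; url: string | null; }

    interface DeepExtractionLevel {
        gotoPattern: string;          // URL pattern taken from the goto action
        actionsToExecute: unknown[];  // where-what pairs to run at this level
        urlMappings: UrlMapping[];    // filled in as capture actions run
    }

    interface PendingDeepExtraction {
        page: unknown;                // Playwright Page the URLs were captured from
        hierarchy: DeepExtractionLevel[];
    }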
@@ -533,53 +602,156 @@ class Interpreter extends events_1.EventEmitter {
 }
 catch (error) {
 console.warn('ScrapeList evaluation failed:', error.message);
-return [];
+return [];
 }
 }, config);
 }
 else {
 paginationUsed = true;
-
+const paginationResult = yield this.handlePagination(page, config, actionName);
+scrapeResults = paginationResult.results;
+const paginationUrls = paginationResult.urls;
+if (this.options.robotType === 'deep-extract' && this.initializedWorkflow && scrapeResults.length > 0) {
+if (!this.pendingDeepExtraction) {
+console.log('DEBUG: Building hierarchical deep extraction plan from pagination...');
+const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+if (hierarchyData.length > 0) {
+const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+this.log(`Root scrapeList (pagination) will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+const rootUrlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextLevelGotoPattern);
+this.pendingDeepExtraction = {
+page,
+hierarchy: hierarchyData.map((level, idx) => ({
+gotoPattern: level.gotoPattern,
+actionsToExecute: level.actionsToExecute,
+urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+}))
+};
+}
+}
+else {
+this.log(`[Deep Extract] scrapeList (pagination) "${actionName}" extracting URLs`, logger_1.Level.LOG);
+const hierarchy = this.pendingDeepExtraction.hierarchy;
+if (hierarchy && hierarchy.length > 0) {
+const nextLevelIndex = hierarchy.length >= 3 ? hierarchy.length - 3 : 0;
+if (nextLevelIndex >= 0 && hierarchy[nextLevelIndex]) {
+const nextGotoPattern = hierarchy[nextLevelIndex].gotoPattern;
+this.log(`[Deep Extract] Extracting URLs for pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+const urlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextGotoPattern);
+this.log(`[Deep Extract] Found ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+const validUrls = urlMappings.filter(m => m.url !== null);
+if (validUrls.length > 0) {
+const sampleSize = Math.min(3, validUrls.length);
+this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+});
+}
+}
+}
+}
+}
 }
-// Ensure we always have an array
 if (!Array.isArray(scrapeResults)) {
 scrapeResults = [];
 }
-console.log(`ScrapeList completed with ${scrapeResults.length} results`);
-// Only process and callback if pagination wasn't used
-// (handlePagination already handles storage and callbacks internally)
 if (!paginationUsed) {
-// ✅ Append list results under "scrapeList" → name
 const actionType = "scrapeList";
-let
-
-if (!actionName || actionName.trim() === "") {
+let name = actionName || "";
+if (!name || name.trim() === "" || this.isInDeepExtractionPhase) {
 this.scrapeListCounter++;
-
+name = `List ${this.scrapeListCounter}`;
 }
 if (!this.serializableDataByType[actionType])
 this.serializableDataByType[actionType] = {};
-if (!this.serializableDataByType[actionType][
-this.serializableDataByType[actionType][
+if (!this.serializableDataByType[actionType][name]) {
+this.serializableDataByType[actionType][name] = [];
 }
-this.serializableDataByType[actionType][
+this.serializableDataByType[actionType][name].push(...scrapeResults);
 yield this.options.serializableCallback({
 scrapeList: this.serializableDataByType.scrapeList,
 scrapeSchema: this.serializableDataByType.scrapeSchema
 });
+console.log(`DEBUG: Checking deep extract condition: robotType=${this.options.robotType}, hasWorkflow=${!!currentWorkflow}, alreadyPending=${!!this.pendingDeepExtraction}`);
+if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+if (!this.pendingDeepExtraction) {
+console.log('DEBUG: Building hierarchical deep extraction plan...');
+const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+if (hierarchyData.length > 0) {
+const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+this.log(`Root scrapeList will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+const rootUrlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextLevelGotoPattern);
+this.pendingDeepExtraction = {
+page,
+hierarchy: hierarchyData.map((level, idx) => ({
+gotoPattern: level.gotoPattern,
+actionsToExecute: level.actionsToExecute,
+urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+}))
+};
+}
+else {
+console.log('DEBUG: No goto actions found, deep extraction skipped');
+}
+}
+else {
+this.log(`[Deep Extract] scrapeList "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+const hierarchy = this.pendingDeepExtraction.hierarchy;
+if (hierarchy && hierarchy.length > 0) {
+let targetLevelIndex = -1;
+for (let i = hierarchy.length - 1; i >= 0; i--) {
+if (hierarchy[i].urlMappings.length === 0) {
+targetLevelIndex = i;
+break;
+}
+}
+if (targetLevelIndex >= 0) {
+const nextGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+const urlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextGotoPattern);
+if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+if (newUrls.length > 0) {
+const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+this.log(`[Deep Extract] Merged ${newUrls.length} new URLs`, logger_1.Level.LOG);
+}
+}
+else {
+hierarchy[targetLevelIndex].urlMappings = urlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url }));
+}
+this.log(`[Deep Extract] Stored ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+const validUrls = urlMappings.filter(m => m.url !== null);
+if (validUrls.length > 0) {
+const sampleSize = Math.min(3, validUrls.length);
+this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+});
+}
+}
+}
+}
+}
 }
 }
 catch (error) {
 console.error('ScrapeList action failed completely:', error.message);
-// Don't throw error, just return empty array
 const actionType = "scrapeList";
-
+let name = actionName || "";
+if (!name || name.trim() === "") {
+this.scrapeListCounter++;
+name = `List ${this.scrapeListCounter}`;
+}
 if (!this.namedResults[actionType])
 this.namedResults[actionType] = {};
-this.namedResults[actionType][
+this.namedResults[actionType][name] = [];
 if (!this.serializableDataByType[actionType])
 this.serializableDataByType[actionType] = {};
-this.serializableDataByType[actionType][
+this.serializableDataByType[actionType][name] = [];
 yield this.options.serializableCallback({
 scrapeList: this.serializableDataByType.scrapeList,
 scrapeSchema: this.serializableDataByType.scrapeSchema
@@ -662,25 +834,7 @@ class Interpreter extends events_1.EventEmitter {
 if (debug === null || debug === void 0 ? void 0 : debug.setActionType) {
 debug.setActionType(String(step.action));
 }
-
-if (step === null || step === void 0 ? void 0 : step.name) {
-stepName = step.name;
-}
-else if (Array.isArray(step === null || step === void 0 ? void 0 : step.args) &&
-step.args.length > 0 &&
-typeof step.args[0] === "object" &&
-"__name" in step.args[0]) {
-stepName = step.args[0].__name;
-}
-else if (typeof (step === null || step === void 0 ? void 0 : step.args) === "object" &&
-(step === null || step === void 0 ? void 0 : step.args) !== null &&
-"__name" in step.args) {
-stepName = step.args.__name;
-}
-// Default fallback
-if (!stepName) {
-stepName = String(step.action);
-}
+stepName = (step === null || step === void 0 ? void 0 : step.name) || String(step.action);
 if (debug && typeof debug.setActionName === "function") {
 debug.setActionName(stepName);
 }
@@ -693,9 +847,12 @@ class Interpreter extends events_1.EventEmitter {
 // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
 const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
 if (step.action === 'screenshot') {
-// call the screenshot handler directly to allow the extra name parameter
 yield wawActions.screenshot(...(params !== null && params !== void 0 ? params : []), stepName !== null && stepName !== void 0 ? stepName : undefined);
 }
+else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
+const actionName = step.name || "";
+yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []), actionName);
+}
 else {
 yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
 }
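With this change, scrapeList and scrapeSchema steps receive the step's name as a trailing argument, and their results are grouped under that name in the structured callback. A sketch of the payload passed to serializableCallback, inferred from the calls in the hunks above (the keys "List 1" and "Texts" are the defaults used when a step has no name):

    type StructuredResults = {
        scrapeList: Record<string, unknown[]>;   // e.g. { "List 1": [...] }
        scrapeSchema: Record<string, unknown[]>; // e.g. { "Texts": [...] }
    };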
@@ -755,17 +912,16 @@ class Interpreter extends events_1.EventEmitter {
 }
 });
 }
-handlePagination(
-return __awaiter(this,
-// Check abort flag at start of pagination
+handlePagination(page_1, config_1) {
+return __awaiter(this, arguments, void 0, function* (page, config, providedActionName = "") {
 if (this.isAborted) {
 this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
-return [];
+return { results: [], urls: [] };
 }
-// Generate action name for this scrapeList
 const actionType = "scrapeList";
-let actionName =
-
+let actionName = providedActionName || "";
+// During deep extraction, ALWAYS auto-increment to create separate lists for each URL
+if (!actionName || actionName.trim() === "" || this.isInDeepExtractionPhase) {
 this.scrapeListCounter++;
 actionName = `List ${this.scrapeListCounter}`;
 }
@@ -777,6 +933,7 @@ class Interpreter extends events_1.EventEmitter {
 this.serializableDataByType[actionType][actionName] = [];
 }
 let allResults = [];
+let allUrls = []; // Track URLs alongside results for deep-extract
 let previousHeight = 0;
 let scrapedItems = new Set();
 let visitedUrls = new Set();
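handlePagination now resolves to an object instead of a bare results array, so callers can keep extracted URLs aligned with the scraped rows. The shape implied by the return statements in the following hunks (illustrative names, not exported types):

    interface PaginationOutcome {
        results: unknown[]; // scraped items, in collection order
        urls: string[][];   // per-item URL lists, index-aligned with results
    }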
@@ -803,14 +960,22 @@ class Interpreter extends events_1.EventEmitter {
 debugLog(`Page evaluation failed: ${error.message}`);
 return;
 }
-
+// Extract URLs for ALL items BEFORE filtering duplicates
+// This ensures URL indices match result indices
+const allItemUrls = yield this.extractUrlsFromCurrentPage(page, config.listSelector, results.length);
+// Filter results AND URLs together using the same uniqueness logic
+const newResults = [];
+const newUrls = [];
+results.forEach((item, index) => {
 const uniqueKey = JSON.stringify(item);
-if (scrapedItems.has(uniqueKey))
-
-
-
+if (!scrapedItems.has(uniqueKey)) {
+scrapedItems.add(uniqueKey);
+newResults.push(item);
+newUrls.push(allItemUrls[index] || []); // Add corresponding URLs
+}
 });
 allResults = allResults.concat(newResults);
+allUrls = allUrls.concat(newUrls);
 debugLog("Results collected:", allResults.length);
 // Store in serializableDataByType and send structured callback
 this.serializableDataByType[actionType][actionName] = [...allResults];
@@ -822,6 +987,7 @@ class Interpreter extends events_1.EventEmitter {
 const checkLimit = () => {
 if (config.limit && allResults.length >= config.limit) {
 allResults = allResults.slice(0, config.limit);
+allUrls = allUrls.slice(0, config.limit); // Also trim URLs to maintain sync
 return true;
 }
 return false;
@@ -947,16 +1113,16 @@ class Interpreter extends events_1.EventEmitter {
 // Check abort flag at start of each pagination iteration
 if (this.isAborted) {
 this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Pagination circuit breakers
 if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
 debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 if (Date.now() - paginationStartTime > MAX_PAGINATION_TIME) {
 debugLog('Maximum pagination time reached (10 minutes), stopping');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Add async yield every 5 iterations to prevent event loop blocking
 if (paginationIterations % 5 === 0) {
@@ -967,7 +1133,7 @@ class Interpreter extends events_1.EventEmitter {
 let previousResultCount = allResults.length;
 yield scrapeCurrentPage();
 if (checkLimit()) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 yield page.evaluate(() => {
 const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
@@ -981,14 +1147,14 @@ class Interpreter extends events_1.EventEmitter {
 if (currentResultCount === previousResultCount) {
 unchangedResultCounter++;
 if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 else {
 unchangedResultCounter = 0;
 }
 if (currentHeight === previousHeight) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 previousHeight = currentHeight;
 break;
@@ -997,7 +1163,7 @@ class Interpreter extends events_1.EventEmitter {
 let previousResultCount = allResults.length;
 yield scrapeCurrentPage();
 if (checkLimit()) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 yield page.evaluate(() => window.scrollTo(0, 0));
 yield page.waitForTimeout(2000);
@@ -1006,14 +1172,14 @@ class Interpreter extends events_1.EventEmitter {
 if (currentResultCount === previousResultCount) {
 unchangedResultCounter++;
 if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 else {
 unchangedResultCounter = 0;
 }
 if (currentTopHeight === 0) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 previousHeight = currentTopHeight;
 break;
@@ -1023,7 +1189,7 @@ class Interpreter extends events_1.EventEmitter {
 visitedUrls.add(currentUrl);
 yield scrapeCurrentPage();
 if (checkLimit())
-return allResults;
+return { results: allResults, urls: allUrls };
 const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
 availableSelectors = updatedSelectors;
 if (!button || !workingSelector) {
@@ -1039,7 +1205,7 @@ class Interpreter extends events_1.EventEmitter {
 }
 }));
 if (!success)
-return allResults;
+return { results: allResults, urls: allUrls };
 break;
 }
 let retryCount = 0;
@@ -1169,14 +1335,14 @@ class Interpreter extends events_1.EventEmitter {
 }
 if (!paginationSuccess) {
 debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 break;
 }
 case 'clickLoadMore': {
 yield scrapeCurrentPage();
 if (checkLimit())
-return allResults;
+return { results: allResults, urls: allUrls };
 let loadMoreCounter = 0;
 const MAX_LOAD_MORE_ITERATIONS = 100; // Prevent infinite load more
 const loadMoreStartTime = Date.now();
@@ -1185,11 +1351,11 @@ class Interpreter extends events_1.EventEmitter {
 // Load more circuit breakers
 if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
 debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 if (Date.now() - loadMoreStartTime > MAX_LOAD_MORE_TIME) {
 debugLog('Maximum load more time reached (5 minutes), stopping');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Add async yield every 3 iterations
 if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
@@ -1200,7 +1366,7 @@ class Interpreter extends events_1.EventEmitter {
 availableSelectors = updatedSelectors;
 if (!workingSelector || !loadMoreButton) {
 debugLog('No working Load More selector found after retries');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Implement retry mechanism for clicking the button
 let retryCount = 0;
@@ -1240,7 +1406,7 @@ class Interpreter extends events_1.EventEmitter {
 }
 if (!clickSuccess) {
 debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Wait for content to load and check scroll height
 yield page.waitForTimeout(2000);
@@ -1269,16 +1435,16 @@ class Interpreter extends events_1.EventEmitter {
 // previousResultCount = currentResultCount;
 // }
 if (checkLimit())
-return allResults;
+return { results: allResults, urls: allUrls };
 if (!heightChanged) {
 debugLog('No more items loaded after Load More');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 }
 default: {
 yield scrapeCurrentPage();
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 if (checkLimit())
@@ -1287,9 +1453,9 @@ class Interpreter extends events_1.EventEmitter {
 }
 catch (error) {
 debugLog(`Fatal error: ${error.message}`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
-return allResults;
+return { results: allResults, urls: allUrls };
 });
 }
 getMatchingActionId(workflow, pageState, usedActions) {
@@ -2213,6 +2379,518 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2213
2379
|
return modifiedAction;
|
|
2214
2380
|
});
|
|
2215
2381
|
}
|
|
2382
|
+
/**
|
|
2383
|
+
* Extracts URLs from the current page's list elements.
|
|
2384
|
+
* Used during pagination to maintain sync between scraped results and extracted URLs.
|
|
2385
|
+
*
|
|
2386
|
+
* @param page - Playwright page object
|
|
2387
|
+
* @param listSelector - The selector used to identify list elements
|
|
2388
|
+
* @param limit - Maximum number of elements to process (should match number of scraped items)
|
|
2389
|
+
* @returns Array of URL arrays, one per list element
|
|
2390
|
+
*/
|
|
2391
|
+
extractUrlsFromCurrentPage(page, listSelector, limit) {
|
|
2392
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2393
|
+
const extractedUrls = yield page.evaluate(({ selector, limit }) => {
|
|
2394
|
+
const urlsByElement = [];
|
|
2395
|
+
let listElements = [];
|
|
2396
|
+
if (selector.startsWith('//') || selector.startsWith('(//')) {
|
|
2397
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
2398
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
2399
|
+
const node = xpathResult.snapshotItem(i);
|
|
2400
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
2401
|
+
listElements.push(node);
|
|
2402
|
+
}
|
|
2403
|
+
}
|
|
2404
|
+
}
|
|
2405
|
+
else {
|
|
2406
|
+
listElements = Array.from(document.querySelectorAll(selector));
|
|
2407
|
+
}
|
|
2408
|
+
// Extract URLs from the first 'limit' elements that match the selector
|
|
2409
|
+
// The limit corresponds to the number of items that were scraped
|
|
2410
|
+
const elementsToProcess = listElements.slice(0, limit);
|
|
2411
|
+
elementsToProcess.forEach(element => {
|
|
2412
|
+
const urls = [];
|
|
2413
|
+
if (element.tagName === 'A' && element.href) {
|
|
2414
|
+
urls.push(element.href);
|
|
2415
|
+
}
|
|
2416
|
+
const anchors = element.querySelectorAll('a[href]');
|
|
2417
|
+
anchors.forEach(anchor => {
|
|
2418
|
+
const href = anchor.href;
|
|
2419
|
+
if (href && !urls.includes(href)) {
|
|
2420
|
+
urls.push(href);
|
|
2421
|
+
}
|
|
2422
|
+
});
|
|
2423
|
+
urlsByElement.push(urls);
|
|
2424
|
+
});
|
|
2425
|
+
return urlsByElement;
|
|
2426
|
+
}, { selector: listSelector, limit });
|
|
2427
|
+
return extractedUrls;
|
|
2428
|
+
});
|
|
2429
|
+
}
|
|
2430
|
+
/**
|
|
2431
|
+
* Builds a hierarchical deep extraction plan by analyzing the workflow structure.
|
|
2432
|
+
* Identifies goto actions and determines what actions to execute at each level.
|
|
2433
|
+
* Workflow is bottom-to-top, so we scan from end to start.
|
|
2434
|
+
*/
|
|
2435
|
+
buildDeepExtractionHierarchy(currentWorkflow) {
|
|
2436
|
+
var _a, _b;
|
|
2437
|
+
const hierarchy = [];
|
|
2438
|
+
// Find all goto action indices with their patterns
|
|
2439
|
+
const gotoData = [];
|
|
2440
|
+
currentWorkflow.forEach((pair, index) => {
|
|
2441
|
+
var _a;
|
|
2442
|
+
if (pair.what && pair.what.some(action => action.action === 'goto')) {
|
|
2443
|
+
const gotoAction = pair.what.find(action => action.action === 'goto');
|
|
2444
|
+
const pattern = (_a = gotoAction === null || gotoAction === void 0 ? void 0 : gotoAction.args) === null || _a === void 0 ? void 0 : _a[0];
|
|
2445
|
+
if (pattern) {
|
|
2446
|
+
gotoData.push({ index, pattern: String(pattern) });
|
|
2447
|
+
}
|
|
2448
|
+
}
|
|
2449
|
+
});
|
|
2450
|
+
if (gotoData.length === 0) {
|
|
2451
|
+
this.log('No goto actions found in workflow', logger_1.Level.WARN);
|
|
2452
|
+
return [];
|
|
2453
|
+
}
|
|
2454
|
+
this.log(`Found ${gotoData.length} goto action(s) at indices: ${gotoData.map(g => g.index).join(', ')}`, logger_1.Level.LOG);
|
|
2455
|
+
const uniqueGotos = [];
|
|
2456
|
+
for (let i = 0; i < gotoData.length; i++) {
|
|
2457
|
+
const current = gotoData[i];
|
|
2458
|
+
const next = gotoData[i + 1];
|
|
2459
|
+
if (next && current.pattern === next.pattern) {
|
|
2460
|
+
this.log(`Skipping duplicate goto at index ${next.index} (same as ${current.index})`, logger_1.Level.LOG);
|
|
2461
|
+
i++;
|
|
2462
|
+
}
|
|
2463
|
+
uniqueGotos.push(current);
|
|
2464
|
+
}
|
|
2465
|
+
this.log(`After deduplication: ${uniqueGotos.length} unique goto(s)`, logger_1.Level.LOG);
|
|
2466
|
+
for (let i = 0; i < uniqueGotos.length; i++) {
|
|
2467
|
+
const gotoIndex = uniqueGotos[i].index;
|
|
2468
|
+
const gotoPattern = uniqueGotos[i].pattern;
|
|
2469
|
+
const nextGotoIndex = i > 0 ? uniqueGotos[i - 1].index : 0;
|
|
2470
|
+
let actionsToExecute = currentWorkflow.slice(nextGotoIndex, gotoIndex);
|
|
2471
|
+
actionsToExecute = actionsToExecute.filter(pair => {
|
|
2472
|
+
return !pair.what || !pair.what.some(action => action.action === 'goto');
|
|
2473
|
+
});
|
|
2474
|
+
const dataExtractionActions = actionsToExecute.filter(pair => {
|
|
2475
|
+
return pair.what && pair.what.some(action => action.action === 'scrapeSchema' ||
|
|
2476
|
+
action.action === 'scrapeList' ||
|
|
2477
|
+
action.action === 'screenshot');
|
|
2478
|
+
});
|
|
2479
|
+
if (dataExtractionActions.length === 0) {
|
|
2480
|
+
this.log(`No data extraction actions found between goto at ${gotoIndex} and next level`, logger_1.Level.WARN);
|
|
2481
|
+
continue;
|
|
2482
|
+
}
|
|
2483
|
+
let sourceActionName = '';
|
|
2484
|
+
let sourceActionType = 'scrapeList';
|
|
2485
|
+
if (i === uniqueGotos.length - 1) {
|
|
2486
|
+
const scrapeListBefore = currentWorkflow.slice(gotoIndex + 1).find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
|
|
2487
|
+
if (scrapeListBefore) {
|
|
2488
|
+
const scrapeListAction = scrapeListBefore.what.find(action => action.action === 'scrapeList');
|
|
2489
|
+
sourceActionName = ((_b = (_a = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.name) || (scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.name) || '';
|
|
2490
|
+
sourceActionType = 'scrapeList';
|
|
2491
|
+
}
|
|
2492
|
+
}
|
|
2493
|
+
else {
|
|
2494
|
+
sourceActionName = '';
|
|
2495
|
+
sourceActionType = 'scrapeSchema';
|
|
2496
|
+
}
|
|
2497
|
+
hierarchy.push({
|
|
2498
|
+
gotoActionIndex: gotoIndex,
|
|
2499
|
+
gotoPattern: String(gotoPattern),
|
|
2500
|
+
actionsToExecute: dataExtractionActions,
|
|
2501
|
+
sourceActionName,
|
|
2502
|
+
sourceActionType
|
|
2503
|
+
});
|
|
2504
|
+
this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}`, logger_1.Level.LOG);
|
|
2505
|
+
}
|
|
2506
|
+
return hierarchy;
|
|
2507
|
+
}
|
|
2508
|
+
/**
|
|
2509
|
+
* Extracts hrefs directly from the page based on scrapeSchema selectors.
|
|
2510
|
+
* Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
|
|
2511
|
+
* This is called after scrapeSchema executes to capture hrefs for deep extraction.
|
|
2512
|
+
*/
|
|
2513
|
+
extractHrefsFromPage(page, schemaConfig) {
|
|
2514
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2515
|
+
try {
|
|
2516
|
+
const fields = schemaConfig.fields || schemaConfig;
|
|
2517
|
+
const selectors = [];
|
|
2518
|
+
for (const [fieldName, fieldConfig] of Object.entries(fields)) {
|
|
2519
|
+
if (fieldConfig && typeof fieldConfig === 'object' && fieldConfig.selector) {
|
|
2520
|
+
selectors.push(String(fieldConfig.selector));
|
|
2521
|
+
}
|
|
2522
|
+
}
|
|
2523
|
+
if (selectors.length === 0) {
|
|
2524
|
+
return [];
|
|
2525
|
+
}
|
|
2526
|
+
const extractedUrls = yield page.evaluate((selectorList) => {
|
|
2527
|
+
const urls = [];
|
|
2528
|
+
for (const selector of selectorList) {
|
|
2529
|
+
if (!selector)
|
|
2530
|
+
continue;
|
|
2531
|
+
try {
|
|
2532
|
+
let elements = [];
|
|
2533
|
+
if (selector.startsWith('//') || selector.startsWith('(//')) {
|
|
2534
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
2535
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
2536
|
+
const node = xpathResult.snapshotItem(i);
|
|
2537
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
2538
|
+
elements.push(node);
|
|
2539
|
+
}
|
|
2540
|
+
}
|
|
2541
|
+
}
|
|
2542
|
+
else {
|
|
2543
|
+
elements = Array.from(document.querySelectorAll(selector));
|
|
2544
|
+
}
|
|
2545
|
+
for (const element of elements) {
|
|
2546
|
+
if (element.tagName === 'A' && element.href) {
|
|
2547
|
+
const href = element.href;
|
|
2548
|
+
if (href && !urls.includes(href)) {
|
|
2549
|
+
urls.push(href);
|
|
2550
|
+
}
|
|
2551
|
+
}
|
|
2552
|
+
}
|
|
2553
|
+
}
|
|
2554
|
+
catch (error) {
|
|
2555
|
+
console.warn(`Failed to extract hrefs for selector ${selector}:`, error);
|
|
2556
|
+
}
|
|
2557
|
+
}
|
|
2558
|
+
return urls;
|
|
2559
|
+
}, selectors);
|
|
2560
|
+
this.log(`Extracted ${extractedUrls.length} hrefs from page for schema selectors`, logger_1.Level.LOG);
|
|
2561
|
+
return extractedUrls;
|
|
2562
|
+
}
|
|
2563
|
+
catch (error) {
|
|
2564
|
+
this.log(`Failed to extract hrefs from page: ${error.message}`, logger_1.Level.ERROR);
|
|
2565
|
+
return [];
|
|
2566
|
+
}
|
|
2567
|
+
});
|
|
2568
|
+
}
|
|
2569
|
+
/**
|
|
2570
|
+
* Filters URLs for deep extraction based on the goto action pattern.
|
|
2571
|
+
* This is called immediately after the first capture action (scrapeList).
|
|
2572
|
+
* Returns the filtered URL mappings that should be processed after workflow completion.
|
|
2573
|
+
* Each mapping maintains alignment with the original scrapeList index.
|
|
2574
|
+
*/
|
|
2575
|
+
filterDeepExtractionUrls(page, listSelector, scrapeResults, gotoTargetPattern) {
|
|
2576
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2577
|
+
try {
|
|
2578
|
+
this.log(`Deep extraction: Filtering URLs from list structure (${scrapeResults.length} items)`, logger_1.Level.LOG);
|
|
2579
|
+
const extractedUrls = yield page.evaluate(({ selector, limit }) => {
|
|
2580
|
+
const urlsByElement = [];
|
|
2581
|
+
let listElements = [];
|
|
2582
|
+
if (selector.startsWith('//') || selector.startsWith('(//')) {
|
|
2583
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
2584
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
2585
|
+
const node = xpathResult.snapshotItem(i);
|
|
2586
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
2587
|
+
listElements.push(node);
|
|
2588
|
+
}
|
|
2589
|
+
}
|
|
2590
|
+
}
|
|
2591
|
+
else {
|
|
2592
|
+
listElements = Array.from(document.querySelectorAll(selector));
|
|
2593
|
+
}
|
|
2594
|
+
const elementsToProcess = listElements.slice(0, limit);
|
|
2595
|
+
elementsToProcess.forEach(element => {
|
|
2596
|
+
const urls = [];
|
|
2597
|
+
if (element.tagName === 'A' && element.href) {
|
|
2598
|
+
urls.push(element.href);
|
|
2599
|
+
}
|
|
2600
|
+
const anchors = element.querySelectorAll('a[href]');
|
|
2601
|
+
anchors.forEach(anchor => {
|
|
2602
|
+
const href = anchor.href;
|
|
2603
|
+
if (href && !urls.includes(href)) {
|
|
2604
|
+
urls.push(href);
|
|
2605
|
+
}
|
|
2606
|
+
});
|
|
2607
|
+
urlsByElement.push(urls);
|
|
2608
|
+
});
|
|
2609
|
+
return urlsByElement;
|
|
2610
|
+
}, { selector: listSelector, limit: scrapeResults.length });
|
|
2611
|
+
const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
|
|
2612
|
+
this.log(`Extracted ${totalUrlCount} total URLs from ${scrapeResults.length} list items (avg ${(totalUrlCount / scrapeResults.length).toFixed(1)} URLs per item)`, logger_1.Level.LOG);
|
|
2613
|
+
const getUrlPattern = (url) => {
|
|
2614
|
+
try {
|
|
2615
|
+
const urlObj = new URL(url);
|
|
2616
|
+
const pathname = urlObj.pathname.replace(/\/$/, '');
|
|
2617
|
+
const segments = pathname.split('/').filter(s => s.length > 0);
|
|
2618
|
+
return {
|
|
2619
|
+
origin: urlObj.origin,
|
|
2620
|
+
pathSegments: segments
|
|
2621
|
+
};
|
|
2622
|
+
}
|
|
2623
|
+
catch (_a) {
|
|
2624
|
+
return null;
|
|
2625
|
+
}
|
|
2626
|
+
};
|
|
2627
|
+
const targetPattern = getUrlPattern(String(gotoTargetPattern));
|
|
2628
|
+
const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
|
|
2629
|
+
if (!targetPattern) {
|
|
2630
|
+
this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
|
|
2631
|
+
return [];
|
|
2632
|
+
}
|
|
2633
|
+
this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
|
|
2634
|
+
const urlMappings = [];
|
|
2635
|
+
extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
|
|
2636
|
+
let matchingUrl = null;
|
|
2637
|
+
for (const url of urlsFromElement) {
|
|
2638
|
+
const urlPattern = getUrlPattern(url);
|
|
2639
|
+
if (!urlPattern)
|
|
2640
|
+
continue;
|
|
2641
|
+
if (urlPattern.origin !== targetPattern.origin)
|
|
2642
|
+
continue;
|
|
2643
|
+
if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
|
|
2644
|
+
continue;
|
|
2645
|
+
let pathMatches = true;
|
|
2646
|
+
for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
|
|
2647
|
+
if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
|
|
2648
|
+
pathMatches = false;
|
|
2649
|
+
break;
|
|
2650
|
+
}
|
|
2651
|
+
}
|
|
2652
|
+
if (!pathMatches)
|
|
2653
|
+
continue;
|
|
2654
|
+
const urlNormalized = url.replace(/\/$/, '').toLowerCase();
|
|
2655
|
+
if (urlNormalized === targetNormalized) {
|
|
2656
|
+
this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
|
|
2657
|
+
continue;
|
|
2658
|
+
}
|
|
2659
|
+
matchingUrl = url;
|
|
2660
|
+
break;
|
|
2661
|
+
}
|
|
2662
|
+
urlMappings.push({
|
|
2663
|
+
scrapeListIndex,
|
|
2664
|
+
url: matchingUrl
|
|
2665
|
+
});
|
|
2666
|
+
});
|
|
2667
|
+
const matchedCount = urlMappings.filter(m => m.url !== null).length;
|
|
2668
|
+
this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
|
|
2669
|
+
if (matchedCount > 0) {
|
|
2670
|
+
const matchedMappings = urlMappings.filter(m => m.url !== null);
|
|
2671
|
+
const sampleSize = Math.min(5, matchedMappings.length);
|
|
2672
|
+
const sample = matchedMappings.slice(0, sampleSize);
|
|
2673
|
+
this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
|
|
2674
|
+
sample.forEach((mapping, idx) => {
|
|
2675
|
+
this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
|
|
2676
|
+
});
|
|
2677
|
+
}
|
|
2678
|
+
else {
|
|
2679
|
+
this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
|
|
2680
|
+
}
|
|
2681
|
+
return urlMappings;
|
|
2682
|
+
}
|
|
2683
|
+
catch (error) {
|
|
2684
|
+
this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2685
|
+
return [];
|
|
2686
|
+
}
|
|
2687
|
+
});
|
|
2688
|
+
}
|
|
2689
|
+
/**
|
|
2690
|
+
* Filters pre-extracted URLs for deep extraction based on the goto action pattern.
|
|
2691
|
+
* This is used for paginated lists where URLs were extracted during pagination.
|
|
2692
|
+
* Returns the filtered URL mappings that maintain alignment with scrapeList indices.
|
|
2693
|
+
*/
|
|
2694
|
+
filterDeepExtractionUrlsFromExtracted(extractedUrls, scrapeResults, gotoTargetPattern) {
|
|
2695
|
+
try {
|
|
2696
|
+
const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
|
|
2697
|
+
this.log(`Deep extraction: Filtering ${totalUrlCount} pre-extracted URLs from ${scrapeResults.length} items`, logger_1.Level.LOG);
|
|
2698
|
+
const getUrlPattern = (url) => {
|
|
2699
|
+
try {
|
|
2700
|
+
const urlObj = new URL(url);
|
|
2701
|
+
const pathname = urlObj.pathname.replace(/\/$/, '');
|
|
2702
|
+
const segments = pathname.split('/').filter(s => s.length > 0);
|
|
2703
|
+
return {
|
|
2704
|
+
origin: urlObj.origin,
|
|
2705
|
+
pathSegments: segments
|
|
2706
|
+
};
|
|
2707
|
+
}
|
|
2708
|
+
catch (_a) {
|
|
2709
|
+
return null;
|
|
2710
|
+
}
|
|
2711
|
+
};
|
|
2712
|
+
const targetPattern = getUrlPattern(String(gotoTargetPattern));
|
|
2713
|
+
const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
|
|
2714
|
+
if (!targetPattern) {
|
|
2715
|
+
this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
|
|
2716
|
+
return [];
|
|
2717
|
+
}
|
|
2718
|
+
this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
|
|
2719
|
+
const urlMappings = [];
|
|
2720
|
+
extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
|
|
2721
|
+
let matchingUrl = null;
|
|
2722
|
+
for (const url of urlsFromElement) {
|
|
2723
|
+
const urlPattern = getUrlPattern(url);
|
|
2724
|
+
if (!urlPattern)
|
|
2725
|
+
continue;
|
|
2726
|
+
if (urlPattern.origin !== targetPattern.origin)
|
|
2727
|
+
continue;
|
|
2728
|
+
if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
|
|
2729
|
+
continue;
|
|
2730
|
+
let pathMatches = true;
|
|
2731
|
+
for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
|
|
2732
|
+
if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
|
|
2733
|
+
pathMatches = false;
|
|
2734
|
+
break;
|
|
2735
|
+
}
|
|
2736
|
+
}
|
|
2737
|
+
if (!pathMatches)
|
|
2738
|
+
continue;
|
|
2739
|
+
const urlNormalized = url.replace(/\/$/, '').toLowerCase();
|
|
2740
|
+
if (urlNormalized === targetNormalized) {
|
|
2741
|
+
this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
|
|
2742
|
+
continue;
|
|
2743
|
+
}
|
|
2744
|
+
matchingUrl = url;
|
|
2745
|
+
break;
|
|
2746
|
+
}
|
|
2747
|
+
urlMappings.push({
|
|
2748
|
+
scrapeListIndex,
|
|
2749
|
+
url: matchingUrl
|
|
2750
|
+
});
|
|
2751
|
+
});
|
|
2752
|
+
const matchedCount = urlMappings.filter(m => m.url !== null).length;
|
|
2753
|
+
this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
|
|
2754
|
+
if (matchedCount > 0) {
|
|
2755
|
+
const matchedMappings = urlMappings.filter(m => m.url !== null);
|
|
2756
|
+
const sampleSize = Math.min(5, matchedMappings.length);
|
|
2757
|
+
const sample = matchedMappings.slice(0, sampleSize);
|
|
2758
|
+
this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
|
|
2759
|
+
sample.forEach((mapping, idx) => {
|
|
2760
|
+
this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
|
|
2761
|
+
});
|
|
2762
|
+
}
|
|
2763
|
+
else {
|
|
2764
|
+
this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
|
|
2765
|
+
}
|
|
2766
|
+
return urlMappings;
|
|
2767
|
+
}
|
|
2768
|
+
catch (error) {
|
|
2769
|
+
this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2770
|
+
return [];
|
|
2771
|
+
}
|
|
2772
|
+
}
+    /**
+     * Helper function to check if a URL matches a goto pattern.
+     */
+    matchesGotoPattern(url, gotoPattern) {
+        try {
+            const getUrlPattern = (urlStr) => {
+                try {
+                    const urlObj = new URL(urlStr);
+                    const pathname = urlObj.pathname.replace(/\/$/, '');
+                    const segments = pathname.split('/').filter(s => s.length > 0);
+                    return { origin: urlObj.origin, pathSegments: segments };
+                }
+                catch (_a) {
+                    return null;
+                }
+            };
+            const urlPattern = getUrlPattern(url);
+            const targetPattern = getUrlPattern(gotoPattern);
+            const targetNormalized = gotoPattern.replace(/\/$/, '').toLowerCase();
+            const urlNormalized = url.replace(/\/$/, '').toLowerCase();
+            if (!urlPattern || !targetPattern)
+                return false;
+            if (urlPattern.origin !== targetPattern.origin)
+                return false;
+            if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
+                return false;
+            if (urlNormalized === targetNormalized)
+                return false; // Skip exact matches
+            for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
+                if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+        catch (_a) {
+            return false;
+        }
+    }
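
Note the normalization used by the "skip exact matches" check above: both URLs are compared with a trailing slash stripped and lower-cased, so case and trailing-slash variants of the goto URL are still treated as already visited. A tiny illustration of what that normalization equates (the URLs are made up):

// Normalization used before the "already visited" comparison: trailing slash removed, case-insensitive.
const normalize = (u: string): string => u.replace(/\/$/, '').toLowerCase();

console.log(normalize('https://Example.com/Products/Item-1/') === normalize('https://example.com/products/item-1')); // true
console.log(normalize('https://example.com/products/item-2') === normalize('https://example.com/products/item-1')); // false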
+    /**
+     * Executes hierarchical deep extraction by processing each level recursively.
+     * URLs are already stored in each hierarchy level's urlMappings during workflow execution.
+     */
+    executeHierarchicalDeepExtraction(page, hierarchy) {
+        return __awaiter(this, void 0, void 0, function* () {
+            try {
+                if (hierarchy.length === 0) {
+                    this.log('No hierarchy levels to process', logger_1.Level.LOG);
+                    return;
+                }
+                this.log(`\n=== Starting Hierarchical Deep Extraction (${hierarchy.length} level${hierarchy.length > 1 ? 's' : ''}) ===`, logger_1.Level.LOG);
+                this.isInDeepExtractionPhase = true;
+                const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
+                for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
+                    const level = hierarchy[levelIndex];
+                    const currentLevelUrls = level.urlMappings;
+                    this.log(`\n=== Processing Deep Extraction Level ${startLevel - levelIndex + 1}/${startLevel + 1} ===`, logger_1.Level.LOG);
+                    this.log(`Goto pattern: ${level.gotoPattern}`, logger_1.Level.LOG);
+                    this.log(`Actions to execute: ${level.actionsToExecute.length}`, logger_1.Level.LOG);
+                    this.log(`URLs to process: ${currentLevelUrls.filter(m => m.url !== null).length}`, logger_1.Level.LOG);
+                    if (currentLevelUrls.length === 0 || currentLevelUrls.every(u => !u.url)) {
+                        this.log('No valid URLs at this level - stopping here', logger_1.Level.LOG);
+                        break;
+                    }
+                    yield this.executeDeepExtractionLevel(page, level, currentLevelUrls);
+                }
+                this.log('\n=== Hierarchical Deep Extraction Completed ===', logger_1.Level.LOG);
+            }
+            catch (error) {
+                this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
+            }
+            finally {
+                this.isInDeepExtractionPhase = false;
+            }
+        });
+    }
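
For readers skimming the diff: the hierarchy is walked bottom-up. The loop starts at hierarchy.length - 2 when there are two or more levels (which appears to skip the level already handled by the main workflow), moves toward index 0, and stops at the first level with no usable URLs. A minimal sketch of the data each level is assumed to carry and of that traversal order; the DeepExtractionLevel interface and levelsToProcess helper are illustrative, inferred from the compiled code above.

// Illustrative shape of one deep-extraction level, inferred from the compiled code.
interface DeepExtractionLevel {
  gotoPattern: string;                                  // goto URL template for this level
  actionsToExecute: { what: { action: string }[] }[];   // where-what pairs replayed per URL
  urlMappings: { url: string | null }[];                // filled in during workflow execution
}

// Bottom-up traversal order used by executeHierarchicalDeepExtraction.
function levelsToProcess(hierarchy: DeepExtractionLevel[]): number[] {
  if (hierarchy.length === 0) return [];
  const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
  const order: number[] = [];
  for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
    const level = hierarchy[levelIndex];
    // Stop as soon as a level has no usable URLs, mirroring the break above.
    if (level.urlMappings.length === 0 || level.urlMappings.every(u => !u.url)) break;
    order.push(levelIndex);
  }
  return order;
}

// With three levels, processing starts at index 1 and then 0; index 2 is never visited here.
console.log(levelsToProcess([
  { gotoPattern: 'https://example.com/a', actionsToExecute: [], urlMappings: [{ url: 'https://example.com/a/1' }] },
  { gotoPattern: 'https://example.com/b', actionsToExecute: [], urlMappings: [{ url: 'https://example.com/b/1' }] },
  { gotoPattern: 'https://example.com/c', actionsToExecute: [], urlMappings: [{ url: null }] },
])); // [1, 0]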
+    /**
+     * Executes deep extraction for a single level.
+     * URLs are already extracted and stored in hierarchy during workflow execution.
+     * This function just navigates to URLs and executes the capture actions.
+     */
+    executeDeepExtractionLevel(page, level, urlMappings) {
+        return __awaiter(this, void 0, void 0, function* () {
+            try {
+                const validMappings = urlMappings.filter(m => m.url !== null);
+                if (validMappings.length === 0) {
+                    this.log('No URLs to process for this level', logger_1.Level.LOG);
+                    return;
+                }
+                this.log(`Processing ${validMappings.length} URLs`, logger_1.Level.LOG);
+                for (const mapping of validMappings) {
+                    try {
+                        this.log(`[${mapping.index}] Navigating to: ${mapping.url}`, logger_1.Level.LOG);
+                        yield page.goto(mapping.url);
+                        yield page.waitForLoadState('networkidle', { timeout: 30000 });
+                        for (let i = level.actionsToExecute.length - 1; i >= 0; i--) {
+                            const actionPair = level.actionsToExecute[i];
+                            if (this.isAborted) {
+                                this.log('Workflow aborted during deep extraction', logger_1.Level.WARN);
+                                return;
+                            }
+                            const validatedAction = yield this.validateAndFixSelectors(page, actionPair);
+                            const filteredActions = validatedAction.what.filter(action => action.action === 'scrapeSchema' ||
+                                action.action === 'scrapeList' ||
+                                action.action === 'screenshot');
+                            if (filteredActions.length > 0) {
+                                yield this.carryOutSteps(page, filteredActions);
+                            }
+                        }
+                        this.log(`[${mapping.index}] Completed`, logger_1.Level.LOG);
+                    }
+                    catch (error) {
+                        this.log(`[${mapping.index}] Failed: ${error.message}`, logger_1.Level.ERROR);
+                    }
+                }
+            }
+            catch (error) {
+                this.log(`Level execution failed: ${error.message}`, logger_1.Level.ERROR);
+            }
+        });
+    }
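
The per-URL loop above is plain Playwright: navigate, wait for network idle, then replay only the capture actions (scrapeSchema, scrapeList, screenshot) from the level's where-what pairs. Below is a hedged standalone sketch of that navigate-and-capture skeleton; processLevel and runCapture are stand-ins for the interpreter's internals, and the URLs are illustrative.

import { chromium, Page } from 'playwright';

type CaptureAction = { action: string };
const CAPTURE_ACTIONS = new Set(['scrapeSchema', 'scrapeList', 'screenshot']);

// Stand-in for Interpreter.carryOutSteps; here it only reports what would run.
async function runCapture(page: Page, actions: CaptureAction[]): Promise<void> {
  console.log(`would run ${actions.length} capture action(s) on ${page.url()}`);
}

async function processLevel(page: Page, urls: (string | null)[], actions: CaptureAction[][]): Promise<void> {
  for (const url of urls) {
    if (!url) continue;                         // nulls are skipped, keeping index alignment intact
    try {
      await page.goto(url);
      await page.waitForLoadState('networkidle', { timeout: 30000 });
      for (const pair of actions) {
        const captures = pair.filter(a => CAPTURE_ACTIONS.has(a.action));
        if (captures.length > 0) await runCapture(page, captures);   // non-capture actions are dropped
      }
    } catch (err) {
      console.error(`failed on ${url}:`, (err as Error).message);
    }
  }
}

// Usage (illustrative detail-page URLs):
(async () => {
  const browser = await chromium.launch();
  const page = await browser.newPage();
  await processLevel(page, ['https://example.com/products/item-2', null],
    [[{ action: 'scrapeSchema' }, { action: 'click' }]]);
  await browser.close();
})();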
     runLoop(p, workflow) {
         return __awaiter(this, void 0, void 0, function* () {
             var _a, _b;
@@ -2297,6 +2975,20 @@ class Interpreter extends events_1.EventEmitter {
             }
             if (workflowCopy.length === 0) {
                 this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
+                if (this.pendingDeepExtraction) {
+                    this.log('Starting deferred hierarchical deep extraction now that workflow has completed...', logger_1.Level.LOG);
+                    const { page, hierarchy } = this.pendingDeepExtraction;
+                    try {
+                        yield this.executeHierarchicalDeepExtraction(page, hierarchy);
+                        this.log('Hierarchical deep extraction completed successfully', logger_1.Level.LOG);
+                    }
+                    catch (error) {
+                        this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
+                    }
+                    finally {
+                        this.pendingDeepExtraction = null;
+                    }
+                }
                 cleanup();
                 return;
             }
@@ -2365,7 +3057,7 @@ class Interpreter extends events_1.EventEmitter {
             try {
                 const validatedAction = yield this.validateAndFixSelectors(p, action);
                 console.log("Carrying out:", validatedAction.what);
-                yield this.carryOutSteps(p, validatedAction.what);
+                yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
                 usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
                 workflowCopy.splice(actionId, 1);
                 console.log(`Action with ID ${action.id} removed from the workflow copy.`);
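
The runLoop change above defers deep extraction rather than running it inline: pendingDeepExtraction is consumed only once workflowCopy is empty, so the main workflow (including any pagination) finishes before detail pages are visited, and the job is cleared whether or not it succeeds. A minimal sketch of that defer-then-drain pattern; drainWhenDone and the job type are illustrative, not the interpreter's API.

// Defer-then-drain: the deep-extraction job is parked while actions remain,
// and executed exactly once after the last action, then always cleared.
type DeepExtractionJob<P, H> = { page: P; hierarchy: H } | null;

async function drainWhenDone<P, H>(
  remainingActions: number,
  pending: DeepExtractionJob<P, H>,
  execute: (page: P, hierarchy: H) => Promise<void>,
): Promise<DeepExtractionJob<P, H>> {
  if (remainingActions > 0 || !pending) return pending;   // workflow not finished yet
  try {
    await execute(pending.page, pending.hierarchy);
  } catch (error) {
    console.error(`deep extraction failed: ${(error as Error).message}`);
  }
  return null;                                            // cleared in all cases, mirroring the finally block
}

// Usage: nothing happens while 2 actions remain; the job runs (and is cleared) at 0.
(async () => {
  const job = { page: 'page', hierarchy: ['level-0'] };
  console.log(await drainWhenDone(2, job, async () => console.log('extract')));  // job returned untouched
  console.log(await drainWhenDone(0, job, async () => console.log('extract')));  // logs "extract", then null
})();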
|