mx-cloud 0.0.24 → 0.0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +53 -0
- package/build/interpret.js +756 -34
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
|
@@ -38,6 +38,7 @@ interface InterpreterOptions {
|
|
|
38
38
|
serializableCallback: (output: any) => (void | Promise<void>);
|
|
39
39
|
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
|
40
40
|
debug: boolean;
|
|
41
|
+
robotType?: 'extract' | 'scrape' | 'deep-extract';
|
|
41
42
|
debugChannel: Partial<{
|
|
42
43
|
activeId: (id: number) => void;
|
|
43
44
|
debugMessage: (msg: string) => void;
|
|
@@ -63,6 +64,8 @@ export default class Interpreter extends EventEmitter {
|
|
|
63
64
|
private screenshotCounter;
|
|
64
65
|
private scrapeListCounter;
|
|
65
66
|
private serializableDataByType;
|
|
67
|
+
private pendingDeepExtraction;
|
|
68
|
+
private isInDeepExtractionPhase;
|
|
66
69
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
67
70
|
trackAutohealFailure(error: string): void;
|
|
68
71
|
private applyAdBlocker;
|
|
@@ -148,6 +151,56 @@ export default class Interpreter extends EventEmitter {
|
|
|
148
151
|
* @returns {Promise<WhereWhatPair>} - The potentially modified action
|
|
149
152
|
*/
|
|
150
153
|
private validateAndFixSelectors;
|
|
154
|
+
/**
|
|
155
|
+
* Extracts URLs from the current page's list elements.
|
|
156
|
+
* Used during pagination to maintain sync between scraped results and extracted URLs.
|
|
157
|
+
*
|
|
158
|
+
* @param page - Playwright page object
|
|
159
|
+
* @param listSelector - The selector used to identify list elements
|
|
160
|
+
* @param limit - Maximum number of elements to process (should match number of scraped items)
|
|
161
|
+
* @returns Array of URL arrays, one per list element
|
|
162
|
+
*/
|
|
163
|
+
private extractUrlsFromCurrentPage;
|
|
164
|
+
/**
|
|
165
|
+
* Builds a hierarchical deep extraction plan by analyzing the workflow structure.
|
|
166
|
+
* Identifies goto actions and determines what actions to execute at each level.
|
|
167
|
+
* Workflow is bottom-to-top, so we scan from end to start.
|
|
168
|
+
*/
|
|
169
|
+
private buildDeepExtractionHierarchy;
|
|
170
|
+
/**
|
|
171
|
+
* Extracts hrefs directly from the page based on scrapeSchema selectors.
|
|
172
|
+
* Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
|
|
173
|
+
* This is called after scrapeSchema executes to capture hrefs for deep extraction.
|
|
174
|
+
*/
|
|
175
|
+
private extractHrefsFromPage;
|
|
176
|
+
/**
|
|
177
|
+
* Filters URLs for deep extraction based on the goto action pattern.
|
|
178
|
+
* This is called immediately after the first capture action (scrapeList).
|
|
179
|
+
* Returns the filtered URL mappings that should be processed after workflow completion.
|
|
180
|
+
* Each mapping maintains alignment with the original scrapeList index.
|
|
181
|
+
*/
|
|
182
|
+
private filterDeepExtractionUrls;
|
|
183
|
+
/**
|
|
184
|
+
* Filters pre-extracted URLs for deep extraction based on the goto action pattern.
|
|
185
|
+
* This is used for paginated lists where URLs were extracted during pagination.
|
|
186
|
+
* Returns the filtered URL mappings that maintain alignment with scrapeList indices.
|
|
187
|
+
*/
|
|
188
|
+
private filterDeepExtractionUrlsFromExtracted;
|
|
189
|
+
/**
|
|
190
|
+
* Helper function to check if a URL matches a goto pattern.
|
|
191
|
+
*/
|
|
192
|
+
private matchesGotoPattern;
|
|
193
|
+
/**
|
|
194
|
+
* Executes hierarchical deep extraction by processing each level recursively.
|
|
195
|
+
* URLs are already stored in each hierarchy level's urlMappings during workflow execution.
|
|
196
|
+
*/
|
|
197
|
+
private executeHierarchicalDeepExtraction;
|
|
198
|
+
/**
|
|
199
|
+
* Executes deep extraction for a single level.
|
|
200
|
+
* URLs are already extracted and stored in hierarchy during workflow execution.
|
|
201
|
+
* This function just navigates to URLs and executes the capture actions.
|
|
202
|
+
*/
|
|
203
|
+
private executeDeepExtractionLevel;
|
|
151
204
|
private runLoop;
|
|
152
205
|
private ensureScriptsLoaded;
|
|
153
206
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -74,6 +74,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
74
74
|
scrapeList: {},
|
|
75
75
|
scrapeSchema: {}
|
|
76
76
|
};
|
|
77
|
+
this.pendingDeepExtraction = null;
|
|
78
|
+
this.isInDeepExtractionPhase = false;
|
|
77
79
|
this.workflow = workflow.workflow;
|
|
78
80
|
this.initializedWorkflow = null;
|
|
79
81
|
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
@@ -338,7 +340,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
338
340
|
* @param page Playwright Page object
|
|
339
341
|
* @param steps Array of actions.
|
|
340
342
|
*/
|
|
341
|
-
carryOutSteps(page, steps) {
|
|
343
|
+
carryOutSteps(page, steps, currentWorkflow) {
|
|
342
344
|
return __awaiter(this, void 0, void 0, function* () {
|
|
343
345
|
var _a, _b;
|
|
344
346
|
// Check abort flag at start of execution
|
|
@@ -494,6 +496,84 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
494
496
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
495
497
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
496
498
|
});
|
|
499
|
+
if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
|
|
500
|
+
if (!this.pendingDeepExtraction) {
|
|
501
|
+
console.log('DEBUG: Building hierarchical deep extraction plan from scrapeSchema...');
|
|
502
|
+
const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
|
|
503
|
+
if (hierarchyData.length > 0) {
|
|
504
|
+
const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
|
|
505
|
+
const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
|
|
506
|
+
this.log(`Root scrapeSchema will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
|
|
507
|
+
// Extract URLs from schema fields
|
|
508
|
+
const urls = yield this.extractHrefsFromPage(page, schema);
|
|
509
|
+
this.log(`scrapeSchema extracted ${urls.length} URLs from field selectors`, logger_1.Level.LOG);
|
|
510
|
+
// Filter URLs against pattern
|
|
511
|
+
const rootUrlMappings = urls
|
|
512
|
+
.map((url, index) => ({
|
|
513
|
+
scrapeListIndex: index,
|
|
514
|
+
url: this.matchesGotoPattern(url, nextLevelGotoPattern) ? url : null
|
|
515
|
+
}))
|
|
516
|
+
.filter(m => m.url !== null);
|
|
517
|
+
this.log(`Matched ${rootUrlMappings.length} URLs against pattern ${nextLevelGotoPattern}`, logger_1.Level.LOG);
|
|
518
|
+
this.pendingDeepExtraction = {
|
|
519
|
+
page,
|
|
520
|
+
hierarchy: hierarchyData.map((level, idx) => ({
|
|
521
|
+
gotoPattern: level.gotoPattern,
|
|
522
|
+
actionsToExecute: level.actionsToExecute,
|
|
523
|
+
urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
|
|
524
|
+
}))
|
|
525
|
+
};
|
|
526
|
+
}
|
|
527
|
+
else {
|
|
528
|
+
console.log('DEBUG: No goto actions found, deep extraction skipped');
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
else {
|
|
532
|
+
this.log(`[Deep Extract] scrapeSchema "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
|
|
533
|
+
const hierarchy = this.pendingDeepExtraction.hierarchy;
|
|
534
|
+
if (hierarchy && hierarchy.length > 0) {
|
|
535
|
+
let targetLevelIndex = -1;
|
|
536
|
+
for (let i = hierarchy.length - 1; i >= 0; i--) {
|
|
537
|
+
if (hierarchy[i].urlMappings.length === 0) {
|
|
538
|
+
targetLevelIndex = i;
|
|
539
|
+
break;
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
if (targetLevelIndex >= 0) {
|
|
543
|
+
const targetGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
|
|
544
|
+
this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${targetGotoPattern}`, logger_1.Level.LOG);
|
|
545
|
+
const urls = yield this.extractHrefsFromPage(page, schema);
|
|
546
|
+
this.log(`[Deep Extract] Extracted ${urls.length} URLs from scrapeSchema field selectors`, logger_1.Level.LOG);
|
|
547
|
+
const urlMappings = urls
|
|
548
|
+
.map((url, index) => ({
|
|
549
|
+
index,
|
|
550
|
+
url: this.matchesGotoPattern(url, targetGotoPattern) ? url : null
|
|
551
|
+
}))
|
|
552
|
+
.filter(m => m.url !== null);
|
|
553
|
+
if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
|
|
554
|
+
const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
|
|
555
|
+
const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
|
|
556
|
+
if (newUrls.length > 0) {
|
|
557
|
+
const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
|
|
558
|
+
hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
|
|
559
|
+
this.log(`[Deep Extract] Merged ${newUrls.length} new URLs from scrapeSchema`, logger_1.Level.LOG);
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
else {
|
|
563
|
+
hierarchy[targetLevelIndex].urlMappings = urlMappings;
|
|
564
|
+
}
|
|
565
|
+
this.log(`[Deep Extract] Stored ${urlMappings.length} matching URLs`, logger_1.Level.LOG);
|
|
566
|
+
if (urlMappings.length > 0) {
|
|
567
|
+
const sampleSize = Math.min(3, urlMappings.length);
|
|
568
|
+
this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${urlMappings.length}):`, logger_1.Level.LOG);
|
|
569
|
+
urlMappings.slice(0, sampleSize).forEach((mapping, idx) => {
|
|
570
|
+
this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
|
|
571
|
+
});
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
}
|
|
497
577
|
}),
|
|
498
578
|
scrapeList: (config_1, ...args_1) => __awaiter(this, [config_1, ...args_1], void 0, function* (config, actionName = "") {
|
|
499
579
|
var _a, _b;
|
|
@@ -528,16 +608,58 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
528
608
|
}
|
|
529
609
|
else {
|
|
530
610
|
paginationUsed = true;
|
|
531
|
-
|
|
611
|
+
const paginationResult = yield this.handlePagination(page, config, actionName);
|
|
612
|
+
scrapeResults = paginationResult.results;
|
|
613
|
+
const paginationUrls = paginationResult.urls;
|
|
614
|
+
if (this.options.robotType === 'deep-extract' && this.initializedWorkflow && scrapeResults.length > 0) {
|
|
615
|
+
if (!this.pendingDeepExtraction) {
|
|
616
|
+
console.log('DEBUG: Building hierarchical deep extraction plan from pagination...');
|
|
617
|
+
const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
|
|
618
|
+
if (hierarchyData.length > 0) {
|
|
619
|
+
const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
|
|
620
|
+
const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
|
|
621
|
+
this.log(`Root scrapeList (pagination) will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
|
|
622
|
+
const rootUrlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextLevelGotoPattern);
|
|
623
|
+
this.pendingDeepExtraction = {
|
|
624
|
+
page,
|
|
625
|
+
hierarchy: hierarchyData.map((level, idx) => ({
|
|
626
|
+
gotoPattern: level.gotoPattern,
|
|
627
|
+
actionsToExecute: level.actionsToExecute,
|
|
628
|
+
urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
|
|
629
|
+
}))
|
|
630
|
+
};
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
else {
|
|
634
|
+
this.log(`[Deep Extract] scrapeList (pagination) "${actionName}" extracting URLs`, logger_1.Level.LOG);
|
|
635
|
+
const hierarchy = this.pendingDeepExtraction.hierarchy;
|
|
636
|
+
if (hierarchy && hierarchy.length > 0) {
|
|
637
|
+
const nextLevelIndex = hierarchy.length >= 3 ? hierarchy.length - 3 : 0;
|
|
638
|
+
if (nextLevelIndex >= 0 && hierarchy[nextLevelIndex]) {
|
|
639
|
+
const nextGotoPattern = hierarchy[nextLevelIndex].gotoPattern;
|
|
640
|
+
this.log(`[Deep Extract] Extracting URLs for pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
|
|
641
|
+
const urlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextGotoPattern);
|
|
642
|
+
this.log(`[Deep Extract] Found ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
|
|
643
|
+
const validUrls = urlMappings.filter(m => m.url !== null);
|
|
644
|
+
if (validUrls.length > 0) {
|
|
645
|
+
const sampleSize = Math.min(3, validUrls.length);
|
|
646
|
+
this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
|
|
647
|
+
validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
|
|
648
|
+
this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
|
|
649
|
+
});
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
}
|
|
532
655
|
}
|
|
533
656
|
if (!Array.isArray(scrapeResults)) {
|
|
534
657
|
scrapeResults = [];
|
|
535
658
|
}
|
|
536
|
-
console.log(`ScrapeList completed with ${scrapeResults.length} results`);
|
|
537
659
|
if (!paginationUsed) {
|
|
538
660
|
const actionType = "scrapeList";
|
|
539
661
|
let name = actionName || "";
|
|
540
|
-
if (!name || name.trim() === "") {
|
|
662
|
+
if (!name || name.trim() === "" || this.isInDeepExtractionPhase) {
|
|
541
663
|
this.scrapeListCounter++;
|
|
542
664
|
name = `List ${this.scrapeListCounter}`;
|
|
543
665
|
}
|
|
@@ -551,6 +673,69 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
551
673
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
552
674
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
553
675
|
});
|
|
676
|
+
console.log(`DEBUG: Checking deep extract condition: robotType=${this.options.robotType}, hasWorkflow=${!!currentWorkflow}, alreadyPending=${!!this.pendingDeepExtraction}`);
|
|
677
|
+
if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
|
|
678
|
+
if (!this.pendingDeepExtraction) {
|
|
679
|
+
console.log('DEBUG: Building hierarchical deep extraction plan...');
|
|
680
|
+
const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
|
|
681
|
+
if (hierarchyData.length > 0) {
|
|
682
|
+
const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
|
|
683
|
+
const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
|
|
684
|
+
this.log(`Root scrapeList will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
|
|
685
|
+
const rootUrlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextLevelGotoPattern);
|
|
686
|
+
this.pendingDeepExtraction = {
|
|
687
|
+
page,
|
|
688
|
+
hierarchy: hierarchyData.map((level, idx) => ({
|
|
689
|
+
gotoPattern: level.gotoPattern,
|
|
690
|
+
actionsToExecute: level.actionsToExecute,
|
|
691
|
+
urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
|
|
692
|
+
}))
|
|
693
|
+
};
|
|
694
|
+
}
|
|
695
|
+
else {
|
|
696
|
+
console.log('DEBUG: No goto actions found, deep extraction skipped');
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
else {
|
|
700
|
+
this.log(`[Deep Extract] scrapeList "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
|
|
701
|
+
const hierarchy = this.pendingDeepExtraction.hierarchy;
|
|
702
|
+
if (hierarchy && hierarchy.length > 0) {
|
|
703
|
+
let targetLevelIndex = -1;
|
|
704
|
+
for (let i = hierarchy.length - 1; i >= 0; i--) {
|
|
705
|
+
if (hierarchy[i].urlMappings.length === 0) {
|
|
706
|
+
targetLevelIndex = i;
|
|
707
|
+
break;
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
if (targetLevelIndex >= 0) {
|
|
711
|
+
const nextGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
|
|
712
|
+
this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
|
|
713
|
+
const urlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextGotoPattern);
|
|
714
|
+
if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
|
|
715
|
+
const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
|
|
716
|
+
const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
|
|
717
|
+
if (newUrls.length > 0) {
|
|
718
|
+
const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
|
|
719
|
+
hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
|
|
720
|
+
this.log(`[Deep Extract] Merged ${newUrls.length} new URLs`, logger_1.Level.LOG);
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
else {
|
|
724
|
+
hierarchy[targetLevelIndex].urlMappings = urlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url }));
|
|
725
|
+
}
|
|
726
|
+
this.log(`[Deep Extract] Stored ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
|
|
727
|
+
const validUrls = urlMappings.filter(m => m.url !== null);
|
|
728
|
+
if (validUrls.length > 0) {
|
|
729
|
+
const sampleSize = Math.min(3, validUrls.length);
|
|
730
|
+
this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
|
|
731
|
+
validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
|
|
732
|
+
this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
|
|
733
|
+
});
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
}
|
|
554
739
|
}
|
|
555
740
|
}
|
|
556
741
|
catch (error) {
|
|
@@ -731,11 +916,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
731
916
|
return __awaiter(this, arguments, void 0, function* (page, config, providedActionName = "") {
|
|
732
917
|
if (this.isAborted) {
|
|
733
918
|
this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
|
|
734
|
-
return [];
|
|
919
|
+
return { results: [], urls: [] };
|
|
735
920
|
}
|
|
736
921
|
const actionType = "scrapeList";
|
|
737
922
|
let actionName = providedActionName || "";
|
|
738
|
-
|
|
923
|
+
// During deep extraction, ALWAYS auto-increment to create separate lists for each URL
|
|
924
|
+
if (!actionName || actionName.trim() === "" || this.isInDeepExtractionPhase) {
|
|
739
925
|
this.scrapeListCounter++;
|
|
740
926
|
actionName = `List ${this.scrapeListCounter}`;
|
|
741
927
|
}
|
|
@@ -747,6 +933,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
747
933
|
this.serializableDataByType[actionType][actionName] = [];
|
|
748
934
|
}
|
|
749
935
|
let allResults = [];
|
|
936
|
+
let allUrls = []; // Track URLs alongside results for deep-extract
|
|
750
937
|
let previousHeight = 0;
|
|
751
938
|
let scrapedItems = new Set();
|
|
752
939
|
let visitedUrls = new Set();
|
|
@@ -773,14 +960,22 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
773
960
|
debugLog(`Page evaluation failed: ${error.message}`);
|
|
774
961
|
return;
|
|
775
962
|
}
|
|
776
|
-
|
|
963
|
+
// Extract URLs for ALL items BEFORE filtering duplicates
|
|
964
|
+
// This ensures URL indices match result indices
|
|
965
|
+
const allItemUrls = yield this.extractUrlsFromCurrentPage(page, config.listSelector, results.length);
|
|
966
|
+
// Filter results AND URLs together using the same uniqueness logic
|
|
967
|
+
const newResults = [];
|
|
968
|
+
const newUrls = [];
|
|
969
|
+
results.forEach((item, index) => {
|
|
777
970
|
const uniqueKey = JSON.stringify(item);
|
|
778
|
-
if (scrapedItems.has(uniqueKey))
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
971
|
+
if (!scrapedItems.has(uniqueKey)) {
|
|
972
|
+
scrapedItems.add(uniqueKey);
|
|
973
|
+
newResults.push(item);
|
|
974
|
+
newUrls.push(allItemUrls[index] || []); // Add corresponding URLs
|
|
975
|
+
}
|
|
782
976
|
});
|
|
783
977
|
allResults = allResults.concat(newResults);
|
|
978
|
+
allUrls = allUrls.concat(newUrls);
|
|
784
979
|
debugLog("Results collected:", allResults.length);
|
|
785
980
|
// Store in serializableDataByType and send structured callback
|
|
786
981
|
this.serializableDataByType[actionType][actionName] = [...allResults];
|
|
@@ -792,6 +987,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
792
987
|
const checkLimit = () => {
|
|
793
988
|
if (config.limit && allResults.length >= config.limit) {
|
|
794
989
|
allResults = allResults.slice(0, config.limit);
|
|
990
|
+
allUrls = allUrls.slice(0, config.limit); // Also trim URLs to maintain sync
|
|
795
991
|
return true;
|
|
796
992
|
}
|
|
797
993
|
return false;
|
|
@@ -917,16 +1113,16 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
917
1113
|
// Check abort flag at start of each pagination iteration
|
|
918
1114
|
if (this.isAborted) {
|
|
919
1115
|
this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
|
|
920
|
-
return allResults;
|
|
1116
|
+
return { results: allResults, urls: allUrls };
|
|
921
1117
|
}
|
|
922
1118
|
// Pagination circuit breakers
|
|
923
1119
|
if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
|
|
924
1120
|
debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
|
|
925
|
-
return allResults;
|
|
1121
|
+
return { results: allResults, urls: allUrls };
|
|
926
1122
|
}
|
|
927
1123
|
if (Date.now() - paginationStartTime > MAX_PAGINATION_TIME) {
|
|
928
1124
|
debugLog('Maximum pagination time reached (10 minutes), stopping');
|
|
929
|
-
return allResults;
|
|
1125
|
+
return { results: allResults, urls: allUrls };
|
|
930
1126
|
}
|
|
931
1127
|
// Add async yield every 5 iterations to prevent event loop blocking
|
|
932
1128
|
if (paginationIterations % 5 === 0) {
|
|
@@ -937,7 +1133,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
937
1133
|
let previousResultCount = allResults.length;
|
|
938
1134
|
yield scrapeCurrentPage();
|
|
939
1135
|
if (checkLimit()) {
|
|
940
|
-
return allResults;
|
|
1136
|
+
return { results: allResults, urls: allUrls };
|
|
941
1137
|
}
|
|
942
1138
|
yield page.evaluate(() => {
|
|
943
1139
|
const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
|
|
@@ -951,14 +1147,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
951
1147
|
if (currentResultCount === previousResultCount) {
|
|
952
1148
|
unchangedResultCounter++;
|
|
953
1149
|
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
954
|
-
return allResults;
|
|
1150
|
+
return { results: allResults, urls: allUrls };
|
|
955
1151
|
}
|
|
956
1152
|
}
|
|
957
1153
|
else {
|
|
958
1154
|
unchangedResultCounter = 0;
|
|
959
1155
|
}
|
|
960
1156
|
if (currentHeight === previousHeight) {
|
|
961
|
-
return allResults;
|
|
1157
|
+
return { results: allResults, urls: allUrls };
|
|
962
1158
|
}
|
|
963
1159
|
previousHeight = currentHeight;
|
|
964
1160
|
break;
|
|
@@ -967,7 +1163,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
967
1163
|
let previousResultCount = allResults.length;
|
|
968
1164
|
yield scrapeCurrentPage();
|
|
969
1165
|
if (checkLimit()) {
|
|
970
|
-
return allResults;
|
|
1166
|
+
return { results: allResults, urls: allUrls };
|
|
971
1167
|
}
|
|
972
1168
|
yield page.evaluate(() => window.scrollTo(0, 0));
|
|
973
1169
|
yield page.waitForTimeout(2000);
|
|
@@ -976,14 +1172,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
976
1172
|
if (currentResultCount === previousResultCount) {
|
|
977
1173
|
unchangedResultCounter++;
|
|
978
1174
|
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
979
|
-
return allResults;
|
|
1175
|
+
return { results: allResults, urls: allUrls };
|
|
980
1176
|
}
|
|
981
1177
|
}
|
|
982
1178
|
else {
|
|
983
1179
|
unchangedResultCounter = 0;
|
|
984
1180
|
}
|
|
985
1181
|
if (currentTopHeight === 0) {
|
|
986
|
-
return allResults;
|
|
1182
|
+
return { results: allResults, urls: allUrls };
|
|
987
1183
|
}
|
|
988
1184
|
previousHeight = currentTopHeight;
|
|
989
1185
|
break;
|
|
@@ -993,7 +1189,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
993
1189
|
visitedUrls.add(currentUrl);
|
|
994
1190
|
yield scrapeCurrentPage();
|
|
995
1191
|
if (checkLimit())
|
|
996
|
-
return allResults;
|
|
1192
|
+
return { results: allResults, urls: allUrls };
|
|
997
1193
|
const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
998
1194
|
availableSelectors = updatedSelectors;
|
|
999
1195
|
if (!button || !workingSelector) {
|
|
@@ -1009,7 +1205,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1009
1205
|
}
|
|
1010
1206
|
}));
|
|
1011
1207
|
if (!success)
|
|
1012
|
-
return allResults;
|
|
1208
|
+
return { results: allResults, urls: allUrls };
|
|
1013
1209
|
break;
|
|
1014
1210
|
}
|
|
1015
1211
|
let retryCount = 0;
|
|
@@ -1139,14 +1335,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1139
1335
|
}
|
|
1140
1336
|
if (!paginationSuccess) {
|
|
1141
1337
|
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
|
|
1142
|
-
return allResults;
|
|
1338
|
+
return { results: allResults, urls: allUrls };
|
|
1143
1339
|
}
|
|
1144
1340
|
break;
|
|
1145
1341
|
}
|
|
1146
1342
|
case 'clickLoadMore': {
|
|
1147
1343
|
yield scrapeCurrentPage();
|
|
1148
1344
|
if (checkLimit())
|
|
1149
|
-
return allResults;
|
|
1345
|
+
return { results: allResults, urls: allUrls };
|
|
1150
1346
|
let loadMoreCounter = 0;
|
|
1151
1347
|
const MAX_LOAD_MORE_ITERATIONS = 100; // Prevent infinite load more
|
|
1152
1348
|
const loadMoreStartTime = Date.now();
|
|
@@ -1155,11 +1351,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1155
1351
|
// Load more circuit breakers
|
|
1156
1352
|
if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
|
|
1157
1353
|
debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
|
|
1158
|
-
return allResults;
|
|
1354
|
+
return { results: allResults, urls: allUrls };
|
|
1159
1355
|
}
|
|
1160
1356
|
if (Date.now() - loadMoreStartTime > MAX_LOAD_MORE_TIME) {
|
|
1161
1357
|
debugLog('Maximum load more time reached (5 minutes), stopping');
|
|
1162
|
-
return allResults;
|
|
1358
|
+
return { results: allResults, urls: allUrls };
|
|
1163
1359
|
}
|
|
1164
1360
|
// Add async yield every 3 iterations
|
|
1165
1361
|
if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
|
|
@@ -1170,7 +1366,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1170
1366
|
availableSelectors = updatedSelectors;
|
|
1171
1367
|
if (!workingSelector || !loadMoreButton) {
|
|
1172
1368
|
debugLog('No working Load More selector found after retries');
|
|
1173
|
-
return allResults;
|
|
1369
|
+
return { results: allResults, urls: allUrls };
|
|
1174
1370
|
}
|
|
1175
1371
|
// Implement retry mechanism for clicking the button
|
|
1176
1372
|
let retryCount = 0;
|
|
@@ -1210,7 +1406,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1210
1406
|
}
|
|
1211
1407
|
if (!clickSuccess) {
|
|
1212
1408
|
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
|
1213
|
-
return allResults;
|
|
1409
|
+
return { results: allResults, urls: allUrls };
|
|
1214
1410
|
}
|
|
1215
1411
|
// Wait for content to load and check scroll height
|
|
1216
1412
|
yield page.waitForTimeout(2000);
|
|
@@ -1239,16 +1435,16 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1239
1435
|
// previousResultCount = currentResultCount;
|
|
1240
1436
|
// }
|
|
1241
1437
|
if (checkLimit())
|
|
1242
|
-
return allResults;
|
|
1438
|
+
return { results: allResults, urls: allUrls };
|
|
1243
1439
|
if (!heightChanged) {
|
|
1244
1440
|
debugLog('No more items loaded after Load More');
|
|
1245
|
-
return allResults;
|
|
1441
|
+
return { results: allResults, urls: allUrls };
|
|
1246
1442
|
}
|
|
1247
1443
|
}
|
|
1248
1444
|
}
|
|
1249
1445
|
default: {
|
|
1250
1446
|
yield scrapeCurrentPage();
|
|
1251
|
-
return allResults;
|
|
1447
|
+
return { results: allResults, urls: allUrls };
|
|
1252
1448
|
}
|
|
1253
1449
|
}
|
|
1254
1450
|
if (checkLimit())
|
|
@@ -1257,9 +1453,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1257
1453
|
}
|
|
1258
1454
|
catch (error) {
|
|
1259
1455
|
debugLog(`Fatal error: ${error.message}`);
|
|
1260
|
-
return allResults;
|
|
1456
|
+
return { results: allResults, urls: allUrls };
|
|
1261
1457
|
}
|
|
1262
|
-
return allResults;
|
|
1458
|
+
return { results: allResults, urls: allUrls };
|
|
1263
1459
|
});
|
|
1264
1460
|
}
|
|
1265
1461
|
getMatchingActionId(workflow, pageState, usedActions) {
|
|
@@ -2183,6 +2379,518 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2183
2379
|
return modifiedAction;
|
|
2184
2380
|
});
|
|
2185
2381
|
}
|
|
2382
|
+
/**
|
|
2383
|
+
* Extracts URLs from the current page's list elements.
|
|
2384
|
+
* Used during pagination to maintain sync between scraped results and extracted URLs.
|
|
2385
|
+
*
|
|
2386
|
+
* @param page - Playwright page object
|
|
2387
|
+
* @param listSelector - The selector used to identify list elements
|
|
2388
|
+
* @param limit - Maximum number of elements to process (should match number of scraped items)
|
|
2389
|
+
* @returns Array of URL arrays, one per list element
|
|
2390
|
+
*/
|
|
2391
|
+
extractUrlsFromCurrentPage(page, listSelector, limit) {
|
|
2392
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2393
|
+
const extractedUrls = yield page.evaluate(({ selector, limit }) => {
|
|
2394
|
+
const urlsByElement = [];
|
|
2395
|
+
let listElements = [];
|
|
2396
|
+
if (selector.startsWith('//') || selector.startsWith('(//')) {
|
|
2397
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
2398
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
2399
|
+
const node = xpathResult.snapshotItem(i);
|
|
2400
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
2401
|
+
listElements.push(node);
|
|
2402
|
+
}
|
|
2403
|
+
}
|
|
2404
|
+
}
|
|
2405
|
+
else {
|
|
2406
|
+
listElements = Array.from(document.querySelectorAll(selector));
|
|
2407
|
+
}
|
|
2408
|
+
// Extract URLs from the first 'limit' elements that match the selector
|
|
2409
|
+
// The limit corresponds to the number of items that were scraped
|
|
2410
|
+
const elementsToProcess = listElements.slice(0, limit);
|
|
2411
|
+
elementsToProcess.forEach(element => {
|
|
2412
|
+
const urls = [];
|
|
2413
|
+
if (element.tagName === 'A' && element.href) {
|
|
2414
|
+
urls.push(element.href);
|
|
2415
|
+
}
|
|
2416
|
+
const anchors = element.querySelectorAll('a[href]');
|
|
2417
|
+
anchors.forEach(anchor => {
|
|
2418
|
+
const href = anchor.href;
|
|
2419
|
+
if (href && !urls.includes(href)) {
|
|
2420
|
+
urls.push(href);
|
|
2421
|
+
}
|
|
2422
|
+
});
|
|
2423
|
+
urlsByElement.push(urls);
|
|
2424
|
+
});
|
|
2425
|
+
return urlsByElement;
|
|
2426
|
+
}, { selector: listSelector, limit });
|
|
2427
|
+
return extractedUrls;
|
|
2428
|
+
});
|
|
2429
|
+
}
|
|
2430
|
+
/**
|
|
2431
|
+
* Builds a hierarchical deep extraction plan by analyzing the workflow structure.
|
|
2432
|
+
* Identifies goto actions and determines what actions to execute at each level.
|
|
2433
|
+
* Workflow is bottom-to-top, so we scan from end to start.
|
|
2434
|
+
*/
|
|
2435
|
+
buildDeepExtractionHierarchy(currentWorkflow) {
|
|
2436
|
+
var _a, _b;
|
|
2437
|
+
const hierarchy = [];
|
|
2438
|
+
// Find all goto action indices with their patterns
|
|
2439
|
+
const gotoData = [];
|
|
2440
|
+
currentWorkflow.forEach((pair, index) => {
|
|
2441
|
+
var _a;
|
|
2442
|
+
if (pair.what && pair.what.some(action => action.action === 'goto')) {
|
|
2443
|
+
const gotoAction = pair.what.find(action => action.action === 'goto');
|
|
2444
|
+
const pattern = (_a = gotoAction === null || gotoAction === void 0 ? void 0 : gotoAction.args) === null || _a === void 0 ? void 0 : _a[0];
|
|
2445
|
+
if (pattern) {
|
|
2446
|
+
gotoData.push({ index, pattern: String(pattern) });
|
|
2447
|
+
}
|
|
2448
|
+
}
|
|
2449
|
+
});
|
|
2450
|
+
if (gotoData.length === 0) {
|
|
2451
|
+
this.log('No goto actions found in workflow', logger_1.Level.WARN);
|
|
2452
|
+
return [];
|
|
2453
|
+
}
|
|
2454
|
+
this.log(`Found ${gotoData.length} goto action(s) at indices: ${gotoData.map(g => g.index).join(', ')}`, logger_1.Level.LOG);
|
|
2455
|
+
const uniqueGotos = [];
|
|
2456
|
+
for (let i = 0; i < gotoData.length; i++) {
|
|
2457
|
+
const current = gotoData[i];
|
|
2458
|
+
const next = gotoData[i + 1];
|
|
2459
|
+
if (next && current.pattern === next.pattern) {
|
|
2460
|
+
this.log(`Skipping duplicate goto at index ${next.index} (same as ${current.index})`, logger_1.Level.LOG);
|
|
2461
|
+
i++;
|
|
2462
|
+
}
|
|
2463
|
+
uniqueGotos.push(current);
|
|
2464
|
+
}
|
|
2465
|
+
this.log(`After deduplication: ${uniqueGotos.length} unique goto(s)`, logger_1.Level.LOG);
|
|
2466
|
+
for (let i = 0; i < uniqueGotos.length; i++) {
|
|
2467
|
+
const gotoIndex = uniqueGotos[i].index;
|
|
2468
|
+
const gotoPattern = uniqueGotos[i].pattern;
|
|
2469
|
+
const nextGotoIndex = i > 0 ? uniqueGotos[i - 1].index : 0;
|
|
2470
|
+
let actionsToExecute = currentWorkflow.slice(nextGotoIndex, gotoIndex);
|
|
2471
|
+
actionsToExecute = actionsToExecute.filter(pair => {
|
|
2472
|
+
return !pair.what || !pair.what.some(action => action.action === 'goto');
|
|
2473
|
+
});
|
|
2474
|
+
const dataExtractionActions = actionsToExecute.filter(pair => {
|
|
2475
|
+
return pair.what && pair.what.some(action => action.action === 'scrapeSchema' ||
|
|
2476
|
+
action.action === 'scrapeList' ||
|
|
2477
|
+
action.action === 'screenshot');
|
|
2478
|
+
});
|
|
2479
|
+
if (dataExtractionActions.length === 0) {
|
|
2480
|
+
this.log(`No data extraction actions found between goto at ${gotoIndex} and next level`, logger_1.Level.WARN);
|
|
2481
|
+
continue;
|
|
2482
|
+
}
|
|
2483
|
+
let sourceActionName = '';
|
|
2484
|
+
let sourceActionType = 'scrapeList';
|
|
2485
|
+
if (i === uniqueGotos.length - 1) {
|
|
2486
|
+
const scrapeListBefore = currentWorkflow.slice(gotoIndex + 1).find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
|
|
2487
|
+
if (scrapeListBefore) {
|
|
2488
|
+
const scrapeListAction = scrapeListBefore.what.find(action => action.action === 'scrapeList');
|
|
2489
|
+
sourceActionName = ((_b = (_a = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.name) || (scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.name) || '';
|
|
2490
|
+
sourceActionType = 'scrapeList';
|
|
2491
|
+
}
|
|
2492
|
+
}
|
|
2493
|
+
else {
|
|
2494
|
+
sourceActionName = '';
|
|
2495
|
+
sourceActionType = 'scrapeSchema';
|
|
2496
|
+
}
|
|
2497
|
+
hierarchy.push({
|
|
2498
|
+
gotoActionIndex: gotoIndex,
|
|
2499
|
+
gotoPattern: String(gotoPattern),
|
|
2500
|
+
actionsToExecute: dataExtractionActions,
|
|
2501
|
+
sourceActionName,
|
|
2502
|
+
sourceActionType
|
|
2503
|
+
});
|
|
2504
|
+
this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}`, logger_1.Level.LOG);
|
|
2505
|
+
}
|
|
2506
|
+
return hierarchy;
|
|
2507
|
+
}
|
|
2508
|
+
/**
 * Extracts hrefs directly from the page based on scrapeSchema selectors.
 * Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
 * This is called after scrapeSchema executes to capture hrefs for deep extraction.
 *
 * @param page - Playwright page object
 * @param schemaConfig - schema config; either { fields: {...} } or the field map itself
 * @returns promise of a de-duplicated array of absolute href strings ([] on any failure)
 */
extractHrefsFromPage(page, schemaConfig) {
    return __awaiter(this, void 0, void 0, function* () {
        try {
            // Schema config may either wrap field definitions under `.fields`
            // or be the field map itself.
            const fields = schemaConfig.fields || schemaConfig;
            // Collect every per-field selector that is present.
            const selectors = [];
            for (const [fieldName, fieldConfig] of Object.entries(fields)) {
                if (fieldConfig && typeof fieldConfig === 'object' && fieldConfig.selector) {
                    selectors.push(String(fieldConfig.selector));
                }
            }
            if (selectors.length === 0) {
                return [];
            }
            // Browser-side pass: resolve each selector (XPath when it starts
            // with '//' or '(//', otherwise CSS) and collect the hrefs of
            // matched <a> elements, de-duplicated, in selector order.
            // Note: only elements that are themselves anchors contribute —
            // descendants of matched elements are not searched here.
            const extractedUrls = yield page.evaluate((selectorList) => {
                const urls = [];
                for (const selector of selectorList) {
                    if (!selector)
                        continue;
                    try {
                        let elements = [];
                        if (selector.startsWith('//') || selector.startsWith('(//')) {
                            const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
                            for (let i = 0; i < xpathResult.snapshotLength; i++) {
                                const node = xpathResult.snapshotItem(i);
                                if (node && node.nodeType === Node.ELEMENT_NODE) {
                                    elements.push(node);
                                }
                            }
                        }
                        else {
                            elements = Array.from(document.querySelectorAll(selector));
                        }
                        for (const element of elements) {
                            // `element.href` yields the resolved absolute URL.
                            if (element.tagName === 'A' && element.href) {
                                const href = element.href;
                                if (href && !urls.includes(href)) {
                                    urls.push(href);
                                }
                            }
                        }
                    }
                    catch (error) {
                        // A bad selector only skips that field; others still run.
                        console.warn(`Failed to extract hrefs for selector ${selector}:`, error);
                    }
                }
                return urls;
            }, selectors);
            this.log(`Extracted ${extractedUrls.length} hrefs from page for schema selectors`, logger_1.Level.LOG);
            return extractedUrls;
        }
        catch (error) {
            // Best-effort: href capture failure must not break the workflow.
            this.log(`Failed to extract hrefs from page: ${error.message}`, logger_1.Level.ERROR);
            return [];
        }
    });
}
|
|
2569
|
+
/**
 * Filters URLs for deep extraction based on the goto action pattern.
 * This is called immediately after the first capture action (scrapeList).
 * Returns the filtered URL mappings that should be processed after workflow completion.
 * Each mapping maintains alignment with the original scrapeList index.
 *
 * @param page - Playwright page object
 * @param listSelector - selector (XPath or CSS) identifying the list elements
 * @param scrapeResults - items produced by scrapeList; only its length is used
 * @param gotoTargetPattern - the goto URL used as the match template
 * @returns promise of { scrapeListIndex, url } mappings (url is null when no match)
 */
filterDeepExtractionUrls(page, listSelector, scrapeResults, gotoTargetPattern) {
    return __awaiter(this, void 0, void 0, function* () {
        try {
            this.log(`Deep extraction: Filtering URLs from list structure (${scrapeResults.length} items)`, logger_1.Level.LOG);
            // Browser-side pass: for each of the first `limit` list elements,
            // collect the element's own href (if it is an <a>) plus every
            // descendant anchor href, de-duplicated per element.
            const extractedUrls = yield page.evaluate(({ selector, limit }) => {
                const urlsByElement = [];
                let listElements = [];
                // Selectors starting with '//' or '(//' are treated as XPath.
                if (selector.startsWith('//') || selector.startsWith('(//')) {
                    const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
                    for (let i = 0; i < xpathResult.snapshotLength; i++) {
                        const node = xpathResult.snapshotItem(i);
                        if (node && node.nodeType === Node.ELEMENT_NODE) {
                            listElements.push(node);
                        }
                    }
                }
                else {
                    listElements = Array.from(document.querySelectorAll(selector));
                }
                // Only process as many elements as there are scraped items so
                // the result stays index-aligned with scrapeResults.
                const elementsToProcess = listElements.slice(0, limit);
                elementsToProcess.forEach(element => {
                    const urls = [];
                    if (element.tagName === 'A' && element.href) {
                        urls.push(element.href);
                    }
                    const anchors = element.querySelectorAll('a[href]');
                    anchors.forEach(anchor => {
                        const href = anchor.href;
                        if (href && !urls.includes(href)) {
                            urls.push(href);
                        }
                    });
                    urlsByElement.push(urls);
                });
                return urlsByElement;
            }, { selector: listSelector, limit: scrapeResults.length });
            const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
            this.log(`Extracted ${totalUrlCount} total URLs from ${scrapeResults.length} list items (avg ${(totalUrlCount / scrapeResults.length).toFixed(1)} URLs per item)`, logger_1.Level.LOG);
            // Decompose a URL into origin + non-empty path segments;
            // returns null when the URL cannot be parsed.
            const getUrlPattern = (url) => {
                try {
                    const urlObj = new URL(url);
                    const pathname = urlObj.pathname.replace(/\/$/, '');
                    const segments = pathname.split('/').filter(s => s.length > 0);
                    return {
                        origin: urlObj.origin,
                        pathSegments: segments
                    };
                }
                catch (_a) {
                    return null;
                }
            };
            const targetPattern = getUrlPattern(String(gotoTargetPattern));
            const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
            if (!targetPattern) {
                this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
                return [];
            }
            this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
            const urlMappings = [];
            extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
                // Pick the first URL from this element that matches the goto
                // pattern: same origin, same path depth, all path segments
                // except the last identical, and not the already-visited
                // target URL itself.
                let matchingUrl = null;
                for (const url of urlsFromElement) {
                    const urlPattern = getUrlPattern(url);
                    if (!urlPattern)
                        continue;
                    if (urlPattern.origin !== targetPattern.origin)
                        continue;
                    if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
                        continue;
                    let pathMatches = true;
                    // Compare all segments except the last; the last segment is
                    // the per-item variable part (e.g. a slug or id).
                    for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
                        if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
                            pathMatches = false;
                            break;
                        }
                    }
                    if (!pathMatches)
                        continue;
                    const urlNormalized = url.replace(/\/$/, '').toLowerCase();
                    if (urlNormalized === targetNormalized) {
                        this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
                        continue;
                    }
                    matchingUrl = url;
                    break;
                }
                // Push even when no match (url: null) so the mapping stays
                // index-aligned with the scrapeList results.
                urlMappings.push({
                    scrapeListIndex,
                    url: matchingUrl
                });
            });
            const matchedCount = urlMappings.filter(m => m.url !== null).length;
            this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
            if (matchedCount > 0) {
                // Log up to 5 matches as a spot-check sample.
                const matchedMappings = urlMappings.filter(m => m.url !== null);
                const sampleSize = Math.min(5, matchedMappings.length);
                const sample = matchedMappings.slice(0, sampleSize);
                this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
                sample.forEach((mapping, idx) => {
                    this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
                });
            }
            else {
                this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
            }
            return urlMappings;
        }
        catch (error) {
            // Best-effort: filtering failure yields no deep extraction rather
            // than aborting the workflow.
            this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
            return [];
        }
    });
}
|
|
2689
|
+
/**
 * Filters pre-extracted URLs for deep extraction based on the goto action pattern.
 * This is used for paginated lists where URLs were extracted during pagination.
 * Returns the filtered URL mappings that maintain alignment with scrapeList indices.
 *
 * NOTE(review): this duplicates the matching logic of filterDeepExtractionUrls
 * minus the browser-side URL collection step; keep the two in sync if the
 * matching rules change.
 *
 * @param extractedUrls - per-list-item arrays of candidate URLs
 * @param scrapeResults - items produced by scrapeList; only its length is logged
 * @param gotoTargetPattern - the goto URL used as the match template
 * @returns array of { scrapeListIndex, url } mappings (url is null when no match)
 */
filterDeepExtractionUrlsFromExtracted(extractedUrls, scrapeResults, gotoTargetPattern) {
    try {
        const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
        this.log(`Deep extraction: Filtering ${totalUrlCount} pre-extracted URLs from ${scrapeResults.length} items`, logger_1.Level.LOG);
        // Decompose a URL into origin + non-empty path segments;
        // returns null when the URL cannot be parsed.
        const getUrlPattern = (url) => {
            try {
                const urlObj = new URL(url);
                const pathname = urlObj.pathname.replace(/\/$/, '');
                const segments = pathname.split('/').filter(s => s.length > 0);
                return {
                    origin: urlObj.origin,
                    pathSegments: segments
                };
            }
            catch (_a) {
                return null;
            }
        };
        const targetPattern = getUrlPattern(String(gotoTargetPattern));
        const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
        if (!targetPattern) {
            this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
            return [];
        }
        this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
        const urlMappings = [];
        extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
            // Pick the first URL for this item that matches the goto pattern:
            // same origin, same path depth, all path segments except the last
            // identical, and not the already-visited target URL itself.
            let matchingUrl = null;
            for (const url of urlsFromElement) {
                const urlPattern = getUrlPattern(url);
                if (!urlPattern)
                    continue;
                if (urlPattern.origin !== targetPattern.origin)
                    continue;
                if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
                    continue;
                let pathMatches = true;
                // Compare all segments except the last; the last segment is
                // the per-item variable part (e.g. a slug or id).
                for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
                    if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
                        pathMatches = false;
                        break;
                    }
                }
                if (!pathMatches)
                    continue;
                const urlNormalized = url.replace(/\/$/, '').toLowerCase();
                if (urlNormalized === targetNormalized) {
                    this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
                    continue;
                }
                matchingUrl = url;
                break;
            }
            // Push even when no match (url: null) so the mapping stays
            // index-aligned with the scrapeList results.
            urlMappings.push({
                scrapeListIndex,
                url: matchingUrl
            });
        });
        const matchedCount = urlMappings.filter(m => m.url !== null).length;
        this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
        if (matchedCount > 0) {
            // Log up to 5 matches as a spot-check sample.
            const matchedMappings = urlMappings.filter(m => m.url !== null);
            const sampleSize = Math.min(5, matchedMappings.length);
            const sample = matchedMappings.slice(0, sampleSize);
            this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
            sample.forEach((mapping, idx) => {
                this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
            });
        }
        else {
            this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
        }
        return urlMappings;
    }
    catch (error) {
        // Best-effort: filtering failure yields no deep extraction rather
        // than aborting the workflow.
        this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
        return [];
    }
}
|
|
2773
|
+
/**
|
|
2774
|
+
* Helper function to check if a URL matches a goto pattern.
|
|
2775
|
+
*/
|
|
2776
|
+
matchesGotoPattern(url, gotoPattern) {
|
|
2777
|
+
try {
|
|
2778
|
+
const getUrlPattern = (urlStr) => {
|
|
2779
|
+
try {
|
|
2780
|
+
const urlObj = new URL(urlStr);
|
|
2781
|
+
const pathname = urlObj.pathname.replace(/\/$/, '');
|
|
2782
|
+
const segments = pathname.split('/').filter(s => s.length > 0);
|
|
2783
|
+
return { origin: urlObj.origin, pathSegments: segments };
|
|
2784
|
+
}
|
|
2785
|
+
catch (_a) {
|
|
2786
|
+
return null;
|
|
2787
|
+
}
|
|
2788
|
+
};
|
|
2789
|
+
const urlPattern = getUrlPattern(url);
|
|
2790
|
+
const targetPattern = getUrlPattern(gotoPattern);
|
|
2791
|
+
const targetNormalized = gotoPattern.replace(/\/$/, '').toLowerCase();
|
|
2792
|
+
const urlNormalized = url.replace(/\/$/, '').toLowerCase();
|
|
2793
|
+
if (!urlPattern || !targetPattern)
|
|
2794
|
+
return false;
|
|
2795
|
+
if (urlPattern.origin !== targetPattern.origin)
|
|
2796
|
+
return false;
|
|
2797
|
+
if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
|
|
2798
|
+
return false;
|
|
2799
|
+
if (urlNormalized === targetNormalized)
|
|
2800
|
+
return false; // Skip exact matches
|
|
2801
|
+
for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
|
|
2802
|
+
if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
|
|
2803
|
+
return false;
|
|
2804
|
+
}
|
|
2805
|
+
}
|
|
2806
|
+
return true;
|
|
2807
|
+
}
|
|
2808
|
+
catch (_a) {
|
|
2809
|
+
return false;
|
|
2810
|
+
}
|
|
2811
|
+
}
|
|
2812
|
+
/**
|
|
2813
|
+
* Executes hierarchical deep extraction by processing each level recursively.
|
|
2814
|
+
* URLs are already stored in each hierarchy level's urlMappings during workflow execution.
|
|
2815
|
+
*/
|
|
2816
|
+
executeHierarchicalDeepExtraction(page, hierarchy) {
|
|
2817
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2818
|
+
try {
|
|
2819
|
+
if (hierarchy.length === 0) {
|
|
2820
|
+
this.log('No hierarchy levels to process', logger_1.Level.LOG);
|
|
2821
|
+
return;
|
|
2822
|
+
}
|
|
2823
|
+
this.log(`\n=== Starting Hierarchical Deep Extraction (${hierarchy.length} level${hierarchy.length > 1 ? 's' : ''}) ===`, logger_1.Level.LOG);
|
|
2824
|
+
this.isInDeepExtractionPhase = true;
|
|
2825
|
+
const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
|
|
2826
|
+
for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
|
|
2827
|
+
const level = hierarchy[levelIndex];
|
|
2828
|
+
const currentLevelUrls = level.urlMappings;
|
|
2829
|
+
this.log(`\n=== Processing Deep Extraction Level ${startLevel - levelIndex + 1}/${startLevel + 1} ===`, logger_1.Level.LOG);
|
|
2830
|
+
this.log(`Goto pattern: ${level.gotoPattern}`, logger_1.Level.LOG);
|
|
2831
|
+
this.log(`Actions to execute: ${level.actionsToExecute.length}`, logger_1.Level.LOG);
|
|
2832
|
+
this.log(`URLs to process: ${currentLevelUrls.filter(m => m.url !== null).length}`, logger_1.Level.LOG);
|
|
2833
|
+
if (currentLevelUrls.length === 0 || currentLevelUrls.every(u => !u.url)) {
|
|
2834
|
+
this.log('No valid URLs at this level - stopping here', logger_1.Level.LOG);
|
|
2835
|
+
break;
|
|
2836
|
+
}
|
|
2837
|
+
yield this.executeDeepExtractionLevel(page, level, currentLevelUrls);
|
|
2838
|
+
}
|
|
2839
|
+
this.log('\n=== Hierarchical Deep Extraction Completed ===', logger_1.Level.LOG);
|
|
2840
|
+
}
|
|
2841
|
+
catch (error) {
|
|
2842
|
+
this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2843
|
+
}
|
|
2844
|
+
finally {
|
|
2845
|
+
this.isInDeepExtractionPhase = false;
|
|
2846
|
+
}
|
|
2847
|
+
});
|
|
2848
|
+
}
|
|
2849
|
+
/**
|
|
2850
|
+
* Executes deep extraction for a single level.
|
|
2851
|
+
* URLs are already extracted and stored in hierarchy during workflow execution.
|
|
2852
|
+
* This function just navigates to URLs and executes the capture actions.
|
|
2853
|
+
*/
|
|
2854
|
+
executeDeepExtractionLevel(page, level, urlMappings) {
|
|
2855
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2856
|
+
try {
|
|
2857
|
+
const validMappings = urlMappings.filter(m => m.url !== null);
|
|
2858
|
+
if (validMappings.length === 0) {
|
|
2859
|
+
this.log('No URLs to process for this level', logger_1.Level.LOG);
|
|
2860
|
+
return;
|
|
2861
|
+
}
|
|
2862
|
+
this.log(`Processing ${validMappings.length} URLs`, logger_1.Level.LOG);
|
|
2863
|
+
for (const mapping of validMappings) {
|
|
2864
|
+
try {
|
|
2865
|
+
this.log(`[${mapping.index}] Navigating to: ${mapping.url}`, logger_1.Level.LOG);
|
|
2866
|
+
yield page.goto(mapping.url);
|
|
2867
|
+
yield page.waitForLoadState('networkidle', { timeout: 30000 });
|
|
2868
|
+
for (let i = level.actionsToExecute.length - 1; i >= 0; i--) {
|
|
2869
|
+
const actionPair = level.actionsToExecute[i];
|
|
2870
|
+
if (this.isAborted) {
|
|
2871
|
+
this.log('Workflow aborted during deep extraction', logger_1.Level.WARN);
|
|
2872
|
+
return;
|
|
2873
|
+
}
|
|
2874
|
+
const validatedAction = yield this.validateAndFixSelectors(page, actionPair);
|
|
2875
|
+
const filteredActions = validatedAction.what.filter(action => action.action === 'scrapeSchema' ||
|
|
2876
|
+
action.action === 'scrapeList' ||
|
|
2877
|
+
action.action === 'screenshot');
|
|
2878
|
+
if (filteredActions.length > 0) {
|
|
2879
|
+
yield this.carryOutSteps(page, filteredActions);
|
|
2880
|
+
}
|
|
2881
|
+
}
|
|
2882
|
+
this.log(`[${mapping.index}] Completed`, logger_1.Level.LOG);
|
|
2883
|
+
}
|
|
2884
|
+
catch (error) {
|
|
2885
|
+
this.log(`[${mapping.index}] Failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2886
|
+
}
|
|
2887
|
+
}
|
|
2888
|
+
}
|
|
2889
|
+
catch (error) {
|
|
2890
|
+
this.log(`Level execution failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2891
|
+
}
|
|
2892
|
+
});
|
|
2893
|
+
}
|
|
2186
2894
|
runLoop(p, workflow) {
|
|
2187
2895
|
return __awaiter(this, void 0, void 0, function* () {
|
|
2188
2896
|
var _a, _b;
|
|
@@ -2267,6 +2975,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2267
2975
|
}
|
|
2268
2976
|
if (workflowCopy.length === 0) {
|
|
2269
2977
|
this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
|
|
2978
|
+
if (this.pendingDeepExtraction) {
|
|
2979
|
+
this.log('Starting deferred hierarchical deep extraction now that workflow has completed...', logger_1.Level.LOG);
|
|
2980
|
+
const { page, hierarchy } = this.pendingDeepExtraction;
|
|
2981
|
+
try {
|
|
2982
|
+
yield this.executeHierarchicalDeepExtraction(page, hierarchy);
|
|
2983
|
+
this.log('Hierarchical deep extraction completed successfully', logger_1.Level.LOG);
|
|
2984
|
+
}
|
|
2985
|
+
catch (error) {
|
|
2986
|
+
this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2987
|
+
}
|
|
2988
|
+
finally {
|
|
2989
|
+
this.pendingDeepExtraction = null;
|
|
2990
|
+
}
|
|
2991
|
+
}
|
|
2270
2992
|
cleanup();
|
|
2271
2993
|
return;
|
|
2272
2994
|
}
|
|
@@ -2335,7 +3057,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2335
3057
|
try {
|
|
2336
3058
|
const validatedAction = yield this.validateAndFixSelectors(p, action);
|
|
2337
3059
|
console.log("Carrying out:", validatedAction.what);
|
|
2338
|
-
yield this.carryOutSteps(p, validatedAction.what);
|
|
3060
|
+
yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
|
|
2339
3061
|
usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
|
|
2340
3062
|
workflowCopy.splice(actionId, 1);
|
|
2341
3063
|
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|