maxun-core 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +2 -2
- package/build/interpret.d.ts +2 -0
- package/build/interpret.js +145 -41
- package/package.json +1 -1
|
@@ -240,12 +240,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
240
240
|
else if (attribute === 'src') {
|
|
241
241
|
// Handle relative 'src' URLs
|
|
242
242
|
const src = fieldElement.getAttribute('src');
|
|
243
|
-
record[label] = src ? new URL(src,
|
|
243
|
+
record[label] = src ? new URL(src, window.location.origin).href : null;
|
|
244
244
|
}
|
|
245
245
|
else if (attribute === 'href') {
|
|
246
246
|
// Handle relative 'href' URLs
|
|
247
247
|
const href = fieldElement.getAttribute('href');
|
|
248
|
-
record[label] = href ? new URL(href,
|
|
248
|
+
record[label] = href ? new URL(href, window.location.origin).href : null;
|
|
249
249
|
}
|
|
250
250
|
else {
|
|
251
251
|
record[label] = fieldElement.getAttribute(attribute);
|
package/build/interpret.d.ts
CHANGED
|
@@ -56,6 +56,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
56
56
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
57
57
|
private applyAdBlocker;
|
|
58
58
|
private disableAdBlocker;
|
|
59
|
+
private getSelectors;
|
|
59
60
|
/**
|
|
60
61
|
* Returns the context object from given Page and the current workflow.\
|
|
61
62
|
* \
|
|
@@ -84,6 +85,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
84
85
|
*/
|
|
85
86
|
private carryOutSteps;
|
|
86
87
|
private handlePagination;
|
|
88
|
+
private getMatchingActionId;
|
|
87
89
|
private runLoop;
|
|
88
90
|
private ensureScriptsLoaded;
|
|
89
91
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -75,7 +75,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
75
75
|
oldLog(...args);
|
|
76
76
|
};
|
|
77
77
|
}
|
|
78
|
-
adblocker_playwright_1.PlaywrightBlocker.
|
|
78
|
+
adblocker_playwright_1.PlaywrightBlocker.fromLists(cross_fetch_1.default, ['https://easylist.to/easylist/easylist.txt']).then(blocker => {
|
|
79
79
|
this.blocker = blocker;
|
|
80
80
|
}).catch(err => {
|
|
81
81
|
this.log(`Failed to initialize ad-blocker:`, logger_1.Level.ERROR);
|
|
@@ -95,6 +95,43 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
95
95
|
}
|
|
96
96
|
});
|
|
97
97
|
}
|
|
98
|
+
// private getSelectors(workflow: Workflow, actionId: number): string[] {
|
|
99
|
+
// const selectors: string[] = [];
|
|
100
|
+
// // Validate actionId
|
|
101
|
+
// if (actionId <= 0) {
|
|
102
|
+
// console.log("No previous selectors to collect.");
|
|
103
|
+
// return selectors; // Empty array as there are no previous steps
|
|
104
|
+
// }
|
|
105
|
+
// // Iterate from the start up to (but not including) actionId
|
|
106
|
+
// for (let index = 0; index < actionId; index++) {
|
|
107
|
+
// const currentSelectors = workflow[index]?.where?.selectors;
|
|
108
|
+
// console.log(`Selectors at step ${index}:`, currentSelectors);
|
|
109
|
+
// if (currentSelectors && currentSelectors.length > 0) {
|
|
110
|
+
// currentSelectors.forEach((selector) => {
|
|
111
|
+
// if (!selectors.includes(selector)) {
|
|
112
|
+
// selectors.push(selector); // Avoid duplicates
|
|
113
|
+
// }
|
|
114
|
+
// });
|
|
115
|
+
// }
|
|
116
|
+
// }
|
|
117
|
+
// console.log("Collected Selectors:", selectors);
|
|
118
|
+
// return selectors;
|
|
119
|
+
// }
|
|
120
|
+
getSelectors(workflow) {
|
|
121
|
+
var _a, _b;
|
|
122
|
+
const selectorsSet = new Set();
|
|
123
|
+
if (workflow.length === 0) {
|
|
124
|
+
return [];
|
|
125
|
+
}
|
|
126
|
+
for (let index = workflow.length - 1; index >= 0; index--) {
|
|
127
|
+
const currentSelectors = (_b = (_a = workflow[index]) === null || _a === void 0 ? void 0 : _a.where) === null || _b === void 0 ? void 0 : _b.selectors;
|
|
128
|
+
if (currentSelectors && currentSelectors.length > 0) {
|
|
129
|
+
currentSelectors.forEach((selector) => selectorsSet.add(selector));
|
|
130
|
+
return Array.from(selectorsSet);
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
return [];
|
|
134
|
+
}
|
|
98
135
|
/**
|
|
99
136
|
* Returns the context object from given Page and the current workflow.\
|
|
100
137
|
* \
|
|
@@ -104,44 +141,52 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
104
141
|
* @param workflow Current **initialized** workflow (array of where-what pairs).
|
|
105
142
|
* @returns {PageState} State of the current page.
|
|
106
143
|
*/
|
|
107
|
-
getState(page,
|
|
144
|
+
getState(page, workflowCopy, selectors) {
|
|
108
145
|
return __awaiter(this, void 0, void 0, function* () {
|
|
109
146
|
/**
|
|
110
147
|
* All the selectors present in the current Workflow
|
|
111
148
|
*/
|
|
112
|
-
const selectors =
|
|
149
|
+
// const selectors = Preprocessor.extractSelectors(workflow);
|
|
150
|
+
// console.log("Current selectors:", selectors);
|
|
113
151
|
/**
|
|
114
152
|
* Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
|
|
115
153
|
* @param selector Selector to be queried
|
|
116
154
|
* @returns True if the targetted element is actionable, false otherwise.
|
|
117
155
|
*/
|
|
118
|
-
const actionable = (selector) =>
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
});
|
|
156
|
+
// const actionable = async (selector: string): Promise<boolean> => {
|
|
157
|
+
// try {
|
|
158
|
+
// const proms = [
|
|
159
|
+
// page.isEnabled(selector, { timeout: 5000 }),
|
|
160
|
+
// page.isVisible(selector, { timeout: 5000 }),
|
|
161
|
+
// ];
|
|
162
|
+
// return await Promise.all(proms).then((bools) => bools.every((x) => x));
|
|
163
|
+
// } catch (e) {
|
|
164
|
+
// // log(<Error>e, Level.ERROR);
|
|
165
|
+
// return false;
|
|
166
|
+
// }
|
|
167
|
+
// };
|
|
131
168
|
/**
|
|
132
169
|
* Object of selectors present in the current page.
|
|
133
170
|
*/
|
|
134
|
-
const presentSelectors =
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
171
|
+
// const presentSelectors: SelectorArray = await Promise.all(
|
|
172
|
+
// selectors.map(async (selector) => {
|
|
173
|
+
// if (await actionable(selector)) {
|
|
174
|
+
// return [selector];
|
|
175
|
+
// }
|
|
176
|
+
// return [];
|
|
177
|
+
// }),
|
|
178
|
+
// ).then((x) => x.flat());
|
|
179
|
+
const action = workflowCopy[workflowCopy.length - 1];
|
|
180
|
+
// console.log("Next action:", action)
|
|
181
|
+
let url = page.url();
|
|
182
|
+
if (action && action.where.url !== url && action.where.url !== "about:blank") {
|
|
183
|
+
url = action.where.url;
|
|
184
|
+
}
|
|
140
185
|
return {
|
|
141
|
-
url
|
|
186
|
+
url,
|
|
142
187
|
cookies: (yield page.context().cookies([page.url()]))
|
|
143
188
|
.reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
|
|
144
|
-
selectors
|
|
189
|
+
selectors,
|
|
145
190
|
};
|
|
146
191
|
});
|
|
147
192
|
}
|
|
@@ -228,7 +273,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
228
273
|
* @param steps Array of actions.
|
|
229
274
|
*/
|
|
230
275
|
carryOutSteps(page, steps) {
|
|
231
|
-
var _a;
|
|
232
276
|
return __awaiter(this, void 0, void 0, function* () {
|
|
233
277
|
/**
|
|
234
278
|
* Defines overloaded (or added) methods/actions usable in the workflow.
|
|
@@ -300,6 +344,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
300
344
|
console.log("CUMULATIVE results:", this.cumulativeResults);
|
|
301
345
|
console.log("MERGED results:", mergedResult);
|
|
302
346
|
yield this.options.serializableCallback(mergedResult);
|
|
347
|
+
// await this.options.serializableCallback(scrapeResult);
|
|
303
348
|
}),
|
|
304
349
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
305
350
|
yield this.ensureScriptsLoaded(page);
|
|
@@ -338,6 +383,15 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
338
383
|
});
|
|
339
384
|
}),
|
|
340
385
|
};
|
|
386
|
+
const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
|
|
387
|
+
console.log("Executing action:", methodName, args);
|
|
388
|
+
if (!args || Array.isArray(args)) {
|
|
389
|
+
yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
|
|
390
|
+
}
|
|
391
|
+
else {
|
|
392
|
+
yield invokee[methodName](args);
|
|
393
|
+
}
|
|
394
|
+
});
|
|
341
395
|
for (const step of steps) {
|
|
342
396
|
this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
|
|
343
397
|
if (step.action in wawActions) {
|
|
@@ -353,11 +407,24 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
353
407
|
for (const level of levels.splice(0, levels.length - 1)) {
|
|
354
408
|
invokee = invokee[level];
|
|
355
409
|
}
|
|
356
|
-
if (
|
|
357
|
-
|
|
410
|
+
if (methodName === 'waitForLoadState') {
|
|
411
|
+
try {
|
|
412
|
+
yield executeAction(invokee, methodName, step.args);
|
|
413
|
+
}
|
|
414
|
+
catch (error) {
|
|
415
|
+
yield executeAction(invokee, methodName, 'domcontentloaded');
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
else if (methodName === 'click') {
|
|
419
|
+
try {
|
|
420
|
+
yield executeAction(invokee, methodName, step.args);
|
|
421
|
+
}
|
|
422
|
+
catch (error) {
|
|
423
|
+
yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
|
|
424
|
+
}
|
|
358
425
|
}
|
|
359
426
|
else {
|
|
360
|
-
yield invokee
|
|
427
|
+
yield executeAction(invokee, methodName, step.args);
|
|
361
428
|
}
|
|
362
429
|
}
|
|
363
430
|
yield new Promise((res) => { setTimeout(res, 500); });
|
|
@@ -396,6 +463,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
396
463
|
break;
|
|
397
464
|
case 'clickNext':
|
|
398
465
|
const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
466
|
+
// console.log("Page results:", pageResults);
|
|
399
467
|
// Filter out already scraped items
|
|
400
468
|
const newResults = pageResults.filter(item => {
|
|
401
469
|
const uniqueKey = JSON.stringify(item);
|
|
@@ -413,7 +481,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
413
481
|
return allResults; // No more pages to scrape
|
|
414
482
|
}
|
|
415
483
|
yield Promise.all([
|
|
416
|
-
nextButton.click
|
|
484
|
+
nextButton.dispatchEvent('click'),
|
|
417
485
|
page.waitForNavigation({ waitUntil: 'networkidle' })
|
|
418
486
|
]);
|
|
419
487
|
yield page.waitForTimeout(1000);
|
|
@@ -428,7 +496,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
428
496
|
return allResults;
|
|
429
497
|
}
|
|
430
498
|
// Click the 'Load More' button to load additional items
|
|
431
|
-
yield loadMoreButton.click
|
|
499
|
+
yield loadMoreButton.dispatchEvent('click');
|
|
432
500
|
yield page.waitForTimeout(2000); // Wait for new items to load
|
|
433
501
|
// After clicking 'Load More', scroll down to load more items
|
|
434
502
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
@@ -462,13 +530,30 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
462
530
|
return allResults;
|
|
463
531
|
});
|
|
464
532
|
}
|
|
533
|
+
getMatchingActionId(workflow, pageState, usedActions) {
|
|
534
|
+
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
535
|
+
const step = workflow[actionId];
|
|
536
|
+
const isApplicable = this.applicable(step.where, pageState, usedActions);
|
|
537
|
+
console.log("-------------------------------------------------------------");
|
|
538
|
+
console.log(`Where:`, step.where);
|
|
539
|
+
console.log(`Page state:`, pageState);
|
|
540
|
+
console.log(`Match result: ${isApplicable}`);
|
|
541
|
+
console.log("-------------------------------------------------------------");
|
|
542
|
+
if (isApplicable) {
|
|
543
|
+
return actionId;
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
}
|
|
465
547
|
runLoop(p, workflow) {
|
|
466
548
|
var _a, _b;
|
|
467
549
|
return __awaiter(this, void 0, void 0, function* () {
|
|
550
|
+
const workflowCopy = JSON.parse(JSON.stringify(workflow));
|
|
468
551
|
// apply ad-blocker to the current page
|
|
469
552
|
yield this.applyAdBlocker(p);
|
|
470
553
|
const usedActions = [];
|
|
554
|
+
let selectors = [];
|
|
471
555
|
let lastAction = null;
|
|
556
|
+
let actionId = -1;
|
|
472
557
|
let repeatCount = 0;
|
|
473
558
|
/**
|
|
474
559
|
* Enables the interpreter functionality for popup windows.
|
|
@@ -476,7 +561,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
476
561
|
* e.g. via `enqueueLinks`.
|
|
477
562
|
*/
|
|
478
563
|
p.on('popup', (popup) => {
|
|
479
|
-
this.concurrency.addJob(() => this.runLoop(popup,
|
|
564
|
+
this.concurrency.addJob(() => this.runLoop(popup, workflowCopy));
|
|
480
565
|
});
|
|
481
566
|
/* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
|
|
482
567
|
while (true) {
|
|
@@ -493,8 +578,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
493
578
|
return;
|
|
494
579
|
}
|
|
495
580
|
let pageState = {};
|
|
581
|
+
let getStateTest = "Hello";
|
|
496
582
|
try {
|
|
497
|
-
pageState = yield this.getState(p,
|
|
583
|
+
pageState = yield this.getState(p, workflowCopy, selectors);
|
|
584
|
+
selectors = [];
|
|
585
|
+
console.log("Empty selectors:", selectors);
|
|
498
586
|
}
|
|
499
587
|
catch (e) {
|
|
500
588
|
this.log('The browser has been closed.');
|
|
@@ -503,27 +591,43 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
503
591
|
if (this.options.debug) {
|
|
504
592
|
this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
|
|
505
593
|
}
|
|
506
|
-
const actionId = workflow.findIndex((step) => {
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
594
|
+
// const actionId = workflow.findIndex((step) => {
|
|
595
|
+
// const isApplicable = this.applicable(step.where, pageState, usedActions);
|
|
596
|
+
// console.log("-------------------------------------------------------------");
|
|
597
|
+
// console.log(`Where:`, step.where);
|
|
598
|
+
// console.log(`Page state:`, pageState);
|
|
599
|
+
// console.log(`Match result: ${isApplicable}`);
|
|
600
|
+
// console.log("-------------------------------------------------------------");
|
|
601
|
+
// return isApplicable;
|
|
602
|
+
// });
|
|
603
|
+
actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
|
|
604
|
+
const action = workflowCopy[actionId];
|
|
605
|
+
console.log("MATCHED ACTION:", action);
|
|
606
|
+
console.log("MATCHED ACTION ID:", actionId);
|
|
514
607
|
this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
|
|
515
608
|
if (action) { // action is matched
|
|
516
609
|
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.activeId) {
|
|
517
610
|
this.options.debugChannel.activeId(actionId);
|
|
518
611
|
}
|
|
519
612
|
repeatCount = action === lastAction ? repeatCount + 1 : 0;
|
|
520
|
-
|
|
613
|
+
console.log("REPEAT COUNT", repeatCount);
|
|
614
|
+
if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
|
|
521
615
|
return;
|
|
522
616
|
}
|
|
523
617
|
lastAction = action;
|
|
524
618
|
try {
|
|
619
|
+
console.log("Carrying out:", action.what);
|
|
525
620
|
yield this.carryOutSteps(p, action.what);
|
|
526
621
|
usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
|
|
622
|
+
workflowCopy.splice(actionId, 1);
|
|
623
|
+
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
|
624
|
+
// const newSelectors = this.getPreviousSelectors(workflow, actionId);
|
|
625
|
+
const newSelectors = this.getSelectors(workflowCopy);
|
|
626
|
+
newSelectors.forEach(selector => {
|
|
627
|
+
if (!selectors.includes(selector)) {
|
|
628
|
+
selectors.push(selector);
|
|
629
|
+
}
|
|
630
|
+
});
|
|
527
631
|
}
|
|
528
632
|
catch (e) {
|
|
529
633
|
this.log(e, logger_1.Level.ERROR);
|