maxun-core 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +2 -1
- package/build/interpret.d.ts +0 -9
- package/build/interpret.js +16 -58
- package/build/utils/concurrency.d.ts +1 -1
- package/build/utils/concurrency.js +1 -1
- package/package.json +7 -2
- package/build/proxy.d.ts +0 -5
- package/build/proxy.js +0 -2
|
@@ -192,7 +192,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
192
192
|
return undefined;
|
|
193
193
|
switch (attribute) {
|
|
194
194
|
case 'href':
|
|
195
|
-
|
|
195
|
+
const relativeHref = elem.getAttribute('href'); // Get the href attribute
|
|
196
|
+
return relativeHref ? new URL(relativeHref, window.location.origin).href : null; // Convert to full URL
|
|
196
197
|
case 'src':
|
|
197
198
|
return elem.getAttribute('src');
|
|
198
199
|
case 'innerText':
|
package/build/interpret.d.ts
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
import { Page } from 'playwright';
|
|
3
3
|
import { EventEmitter } from 'events';
|
|
4
4
|
import { WorkflowFile, ParamType } from './types/workflow';
|
|
5
|
-
import { ProxyConfig } from './proxy';
|
|
6
5
|
/**
|
|
7
6
|
* Defines optional interpreter options (passed in constructor)
|
|
8
7
|
*/
|
|
@@ -16,8 +15,6 @@ interface InterpreterOptions {
|
|
|
16
15
|
activeId: Function;
|
|
17
16
|
debugMessage: Function;
|
|
18
17
|
}>;
|
|
19
|
-
proxy?: ProxyConfig | null;
|
|
20
|
-
onProxyError?: (error: Error, proxy: ProxyConfig) => Promise<ProxyConfig | null>;
|
|
21
18
|
}
|
|
22
19
|
/**
|
|
23
20
|
* Class for running the Smart Workflows.
|
|
@@ -30,13 +27,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
30
27
|
private stopper;
|
|
31
28
|
private log;
|
|
32
29
|
private blocker;
|
|
33
|
-
private browser;
|
|
34
|
-
private contexts;
|
|
35
|
-
private currentProxy;
|
|
36
30
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
37
|
-
updateProxy(proxyConfig: ProxyConfig | null): void;
|
|
38
|
-
private createProxyContext;
|
|
39
|
-
private createProxyPage;
|
|
40
31
|
private applyAdBlocker;
|
|
41
32
|
private disableAdBlocker;
|
|
42
33
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -53,16 +53,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
53
53
|
super();
|
|
54
54
|
this.stopper = null;
|
|
55
55
|
this.blocker = null;
|
|
56
|
-
this.browser = null;
|
|
57
|
-
this.contexts = [];
|
|
58
|
-
this.currentProxy = null;
|
|
59
56
|
this.workflow = workflow.workflow;
|
|
60
57
|
this.initializedWorkflow = null;
|
|
61
|
-
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {}
|
|
62
|
-
this.log(`Proxy error: ${error.message}`, logger_1.Level.ERROR);
|
|
63
|
-
return null;
|
|
64
|
-
}) }, options);
|
|
65
|
-
this.currentProxy = this.options.proxy;
|
|
58
|
+
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
|
|
66
59
|
this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
|
|
67
60
|
this.log = (...args) => (0, logger_1.default)(...args);
|
|
68
61
|
const error = preprocessor_1.default.validateWorkflow(workflow);
|
|
@@ -85,42 +78,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
85
78
|
this.log(`Failed to initialize ad-blocker:`, logger_1.Level.ERROR);
|
|
86
79
|
});
|
|
87
80
|
}
|
|
88
|
-
updateProxy(proxyConfig) {
|
|
89
|
-
this.currentProxy = proxyConfig;
|
|
90
|
-
this.log(`Proxy configuration updated`, logger_1.Level.LOG);
|
|
91
|
-
}
|
|
92
|
-
createProxyContext(browser) {
|
|
93
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
94
|
-
if (!this.currentProxy) {
|
|
95
|
-
return browser.newContext();
|
|
96
|
-
}
|
|
97
|
-
try {
|
|
98
|
-
const context = yield browser.newContext({
|
|
99
|
-
proxy: this.currentProxy
|
|
100
|
-
});
|
|
101
|
-
this.contexts.push(context);
|
|
102
|
-
return context;
|
|
103
|
-
}
|
|
104
|
-
catch (error) {
|
|
105
|
-
if (this.options.onProxyError) {
|
|
106
|
-
const newProxy = yield this.options.onProxyError(error, this.currentProxy);
|
|
107
|
-
if (newProxy) {
|
|
108
|
-
this.currentProxy = newProxy;
|
|
109
|
-
return this.createProxyContext(browser);
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
throw error;
|
|
113
|
-
}
|
|
114
|
-
});
|
|
115
|
-
}
|
|
116
|
-
// create a new page with proxy
|
|
117
|
-
createProxyPage(context) {
|
|
118
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
119
|
-
const page = yield context.newPage();
|
|
120
|
-
yield page.setViewportSize({ width: 900, height: 400 });
|
|
121
|
-
return page;
|
|
122
|
-
});
|
|
123
|
-
}
|
|
124
81
|
applyAdBlocker(page) {
|
|
125
82
|
return __awaiter(this, void 0, void 0, function* () {
|
|
126
83
|
if (this.blocker) {
|
|
@@ -146,7 +103,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
146
103
|
*/
|
|
147
104
|
getState(page, workflow) {
|
|
148
105
|
return __awaiter(this, void 0, void 0, function* () {
|
|
149
|
-
yield page.setViewportSize({ width: 900, height: 400 });
|
|
150
106
|
/**
|
|
151
107
|
* All the selectors present in the current Workflow
|
|
152
108
|
*/
|
|
@@ -289,13 +245,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
289
245
|
.evaluateAll(
|
|
290
246
|
// @ts-ignore
|
|
291
247
|
(elements) => elements.map((a) => a.href).filter((x) => x));
|
|
292
|
-
const context =
|
|
248
|
+
const context = page.context();
|
|
293
249
|
for (const link of links) {
|
|
294
250
|
// eslint-disable-next-line
|
|
295
251
|
this.concurrency.addJob(() => __awaiter(this, void 0, void 0, function* () {
|
|
296
252
|
try {
|
|
297
|
-
const newPage = yield
|
|
298
|
-
yield newPage.setViewportSize({ width: 900, height: 400 });
|
|
253
|
+
const newPage = yield context.newPage();
|
|
299
254
|
yield newPage.goto(link);
|
|
300
255
|
yield newPage.waitForLoadState('networkidle');
|
|
301
256
|
yield this.runLoop(newPage, this.initializedWorkflow);
|
|
@@ -567,6 +522,17 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
567
522
|
*/
|
|
568
523
|
run(page, params) {
|
|
569
524
|
return __awaiter(this, void 0, void 0, function* () {
|
|
525
|
+
this.log('Starting the workflow.', logger_1.Level.LOG);
|
|
526
|
+
const context = page.context();
|
|
527
|
+
// Check proxy settings from context options
|
|
528
|
+
const contextOptions = context._options;
|
|
529
|
+
const hasProxy = !!(contextOptions === null || contextOptions === void 0 ? void 0 : contextOptions.proxy);
|
|
530
|
+
this.log(`Proxy settings: ${hasProxy ? `Proxy is configured...` : 'No proxy configured...'}`);
|
|
531
|
+
if (hasProxy) {
|
|
532
|
+
if (contextOptions.proxy.username) {
|
|
533
|
+
this.log(`Proxy authenticated...`);
|
|
534
|
+
}
|
|
535
|
+
}
|
|
570
536
|
if (this.stopper) {
|
|
571
537
|
throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.');
|
|
572
538
|
}
|
|
@@ -574,20 +540,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
574
540
|
* `this.workflow` with the parameters initialized.
|
|
575
541
|
*/
|
|
576
542
|
this.initializedWorkflow = preprocessor_1.default.initWorkflow(this.workflow, params);
|
|
577
|
-
|
|
578
|
-
const context = yield this.createProxyContext(page.context().browser());
|
|
579
|
-
// Create a new page with proxy
|
|
580
|
-
const proxyPage = yield this.createProxyPage(context);
|
|
581
|
-
// Copy over the current page's URL and state
|
|
582
|
-
yield proxyPage.goto(page.url());
|
|
583
|
-
yield this.ensureScriptsLoaded(proxyPage);
|
|
543
|
+
yield this.ensureScriptsLoaded(page);
|
|
584
544
|
this.stopper = () => {
|
|
585
545
|
this.stopper = null;
|
|
586
546
|
};
|
|
587
|
-
this.concurrency.addJob(() => this.runLoop(proxyPage, this.initializedWorkflow));
|
|
547
|
+
this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow));
|
|
588
548
|
yield this.concurrency.waitForCompletion();
|
|
589
|
-
yield Promise.all(this.contexts.map(ctx => ctx.close()));
|
|
590
|
-
this.contexts = [];
|
|
591
549
|
this.stopper = null;
|
|
592
550
|
});
|
|
593
551
|
}
|
|
@@ -38,7 +38,7 @@ export default class Concurrency {
|
|
|
38
38
|
/**
|
|
39
39
|
* Waits until there is no running nor waiting job. \
|
|
40
40
|
* If the concurrency manager is idle at the time of calling this function,
|
|
41
|
-
* it waits until at least one job is
|
|
41
|
+
* it waits until at least one job is completed (can be "presubscribed").
|
|
42
42
|
* @returns Promise, resolved after there is no running/waiting worker.
|
|
43
43
|
*/
|
|
44
44
|
waitForCompletion(): Promise<void>;
|
|
@@ -69,7 +69,7 @@ class Concurrency {
|
|
|
69
69
|
/**
|
|
70
70
|
* Waits until there is no running nor waiting job. \
|
|
71
71
|
* If the concurrency manager is idle at the time of calling this function,
|
|
72
|
-
* it waits until at least one job is
|
|
72
|
+
* it waits until at least one job is completed (can be "presubscribed").
|
|
73
73
|
* @returns Promise, resolved after there is no running/waiting worker.
|
|
74
74
|
*/
|
|
75
75
|
waitForCompletion() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "maxun-core",
|
|
3
|
-
"version": "0.0.2",
|
|
3
|
+
"version": "0.0.4",
|
|
4
4
|
"description": "Core package for Maxun, responsible for data extraction",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"typings": "build/index.d.ts",
|
|
@@ -20,7 +20,12 @@
|
|
|
20
20
|
"automation",
|
|
21
21
|
"workflow",
|
|
22
22
|
"data extraction",
|
|
23
|
-
"scraping"
|
|
23
|
+
"scraping",
|
|
24
|
+
"web scraper",
|
|
25
|
+
"web scraping",
|
|
26
|
+
"data scraping",
|
|
27
|
+
"no-code web scraper",
|
|
28
|
+
"no-code web scraping"
|
|
24
29
|
],
|
|
25
30
|
"author": "Maxun",
|
|
26
31
|
"license": "AGPL-3.0-or-later",
|
package/build/proxy.d.ts
DELETED
package/build/proxy.js
DELETED