maxun-core 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -192,7 +192,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
192
192
  return undefined;
193
193
  switch (attribute) {
194
194
  case 'href':
195
- return elem.getAttribute('href');
195
+ const relativeHref = elem.getAttribute('href'); // Get the href attribute
196
+ return relativeHref ? new URL(relativeHref, window.location.origin).href : null; // Convert to full URL
196
197
  case 'src':
197
198
  return elem.getAttribute('src');
198
199
  case 'innerText':
@@ -2,7 +2,6 @@
2
2
  import { Page } from 'playwright';
3
3
  import { EventEmitter } from 'events';
4
4
  import { WorkflowFile, ParamType } from './types/workflow';
5
- import { ProxyConfig } from './proxy';
6
5
  /**
7
6
  * Defines optional interpreter options (passed in constructor)
8
7
  */
@@ -16,8 +15,6 @@ interface InterpreterOptions {
16
15
  activeId: Function;
17
16
  debugMessage: Function;
18
17
  }>;
19
- proxy?: ProxyConfig | null;
20
- onProxyError?: (error: Error, proxy: ProxyConfig) => Promise<ProxyConfig | null>;
21
18
  }
22
19
  /**
23
20
  * Class for running the Smart Workflows.
@@ -30,13 +27,7 @@ export default class Interpreter extends EventEmitter {
30
27
  private stopper;
31
28
  private log;
32
29
  private blocker;
33
- private browser;
34
- private contexts;
35
- private currentProxy;
36
30
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
37
- updateProxy(proxyConfig: ProxyConfig | null): void;
38
- private createProxyContext;
39
- private createProxyPage;
40
31
  private applyAdBlocker;
41
32
  private disableAdBlocker;
42
33
  /**
@@ -53,16 +53,9 @@ class Interpreter extends events_1.EventEmitter {
53
53
  super();
54
54
  this.stopper = null;
55
55
  this.blocker = null;
56
- this.browser = null;
57
- this.contexts = [];
58
- this.currentProxy = null;
59
56
  this.workflow = workflow.workflow;
60
57
  this.initializedWorkflow = null;
61
- this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {}, proxy: null, onProxyError: (error, proxy) => __awaiter(this, void 0, void 0, function* () {
62
- this.log(`Proxy error: ${error.message}`, logger_1.Level.ERROR);
63
- return null;
64
- }) }, options);
65
- this.currentProxy = this.options.proxy;
58
+ this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
66
59
  this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
67
60
  this.log = (...args) => (0, logger_1.default)(...args);
68
61
  const error = preprocessor_1.default.validateWorkflow(workflow);
@@ -85,42 +78,6 @@ class Interpreter extends events_1.EventEmitter {
85
78
  this.log(`Failed to initialize ad-blocker:`, logger_1.Level.ERROR);
86
79
  });
87
80
  }
88
- updateProxy(proxyConfig) {
89
- this.currentProxy = proxyConfig;
90
- this.log(`Proxy configuration updated`, logger_1.Level.LOG);
91
- }
92
- createProxyContext(browser) {
93
- return __awaiter(this, void 0, void 0, function* () {
94
- if (!this.currentProxy) {
95
- return browser.newContext();
96
- }
97
- try {
98
- const context = yield browser.newContext({
99
- proxy: this.currentProxy
100
- });
101
- this.contexts.push(context);
102
- return context;
103
- }
104
- catch (error) {
105
- if (this.options.onProxyError) {
106
- const newProxy = yield this.options.onProxyError(error, this.currentProxy);
107
- if (newProxy) {
108
- this.currentProxy = newProxy;
109
- return this.createProxyContext(browser);
110
- }
111
- }
112
- throw error;
113
- }
114
- });
115
- }
116
- // create a new page with proxy
117
- createProxyPage(context) {
118
- return __awaiter(this, void 0, void 0, function* () {
119
- const page = yield context.newPage();
120
- yield page.setViewportSize({ width: 900, height: 400 });
121
- return page;
122
- });
123
- }
124
81
  applyAdBlocker(page) {
125
82
  return __awaiter(this, void 0, void 0, function* () {
126
83
  if (this.blocker) {
@@ -146,7 +103,6 @@ class Interpreter extends events_1.EventEmitter {
146
103
  */
147
104
  getState(page, workflow) {
148
105
  return __awaiter(this, void 0, void 0, function* () {
149
- yield page.setViewportSize({ width: 900, height: 400 });
150
106
  /**
151
107
  * All the selectors present in the current Workflow
152
108
  */
@@ -289,13 +245,12 @@ class Interpreter extends events_1.EventEmitter {
289
245
  .evaluateAll(
290
246
  // @ts-ignore
291
247
  (elements) => elements.map((a) => a.href).filter((x) => x));
292
- const context = yield this.createProxyContext(page.context().browser());
248
+ const context = page.context();
293
249
  for (const link of links) {
294
250
  // eslint-disable-next-line
295
251
  this.concurrency.addJob(() => __awaiter(this, void 0, void 0, function* () {
296
252
  try {
297
- const newPage = yield this.createProxyPage(context);
298
- yield newPage.setViewportSize({ width: 900, height: 400 });
253
+ const newPage = yield context.newPage();
299
254
  yield newPage.goto(link);
300
255
  yield newPage.waitForLoadState('networkidle');
301
256
  yield this.runLoop(newPage, this.initializedWorkflow);
@@ -567,6 +522,17 @@ class Interpreter extends events_1.EventEmitter {
567
522
  */
568
523
  run(page, params) {
569
524
  return __awaiter(this, void 0, void 0, function* () {
525
+ this.log('Starting the workflow.', logger_1.Level.LOG);
526
+ const context = page.context();
527
+ // Check proxy settings from context options
528
+ const contextOptions = context._options;
529
+ const hasProxy = !!(contextOptions === null || contextOptions === void 0 ? void 0 : contextOptions.proxy);
530
+ this.log(`Proxy settings: ${hasProxy ? `Proxy is configured...` : 'No proxy configured...'}`);
531
+ if (hasProxy) {
532
+ if (contextOptions.proxy.username) {
533
+ this.log(`Proxy authenticated...`);
534
+ }
535
+ }
570
536
  if (this.stopper) {
571
537
  throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.');
572
538
  }
@@ -574,20 +540,12 @@ class Interpreter extends events_1.EventEmitter {
574
540
  * `this.workflow` with the parameters initialized.
575
541
  */
576
542
  this.initializedWorkflow = preprocessor_1.default.initWorkflow(this.workflow, params);
577
- // Create a new context with proxy configuration
578
- const context = yield this.createProxyContext(page.context().browser());
579
- // Create a new page with proxy
580
- const proxyPage = yield this.createProxyPage(context);
581
- // Copy over the current page's URL and state
582
- yield proxyPage.goto(page.url());
583
- yield this.ensureScriptsLoaded(proxyPage);
543
+ yield this.ensureScriptsLoaded(page);
584
544
  this.stopper = () => {
585
545
  this.stopper = null;
586
546
  };
587
- this.concurrency.addJob(() => this.runLoop(proxyPage, this.initializedWorkflow));
547
+ this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow));
588
548
  yield this.concurrency.waitForCompletion();
589
- yield Promise.all(this.contexts.map(ctx => ctx.close()));
590
- this.contexts = [];
591
549
  this.stopper = null;
592
550
  });
593
551
  }
@@ -38,7 +38,7 @@ export default class Concurrency {
38
38
  /**
39
39
  * Waits until there is no running nor waiting job. \
40
40
  * If the concurrency manager is idle at the time of calling this function,
41
- * it waits until at least one job is compeleted (can be "presubscribed").
41
+ * it waits until at least one job is completed (can be "presubscribed").
42
42
  * @returns Promise, resolved after there is no running/waiting worker.
43
43
  */
44
44
  waitForCompletion(): Promise<void>;
@@ -69,7 +69,7 @@ class Concurrency {
69
69
  /**
70
70
  * Waits until there is no running nor waiting job. \
71
71
  * If the concurrency manager is idle at the time of calling this function,
72
- * it waits until at least one job is compeleted (can be "presubscribed").
72
+ * it waits until at least one job is completed (can be "presubscribed").
73
73
  * @returns Promise, resolved after there is no running/waiting worker.
74
74
  */
75
75
  waitForCompletion() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.2",
3
+ "version": "0.0.4",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",
@@ -20,7 +20,12 @@
20
20
  "automation",
21
21
  "workflow",
22
22
  "data extraction",
23
- "scraping"
23
+ "scraping",
24
+ "web scraper",
25
+ "web scraping",
26
+ "data scraping",
27
+ "no-code web scraper",
28
+ "no-code web scraping"
24
29
  ],
25
30
  "author": "Maxun",
26
31
  "license": "AGPL-3.0-or-later",
package/build/proxy.d.ts DELETED
@@ -1,5 +0,0 @@
1
- export interface ProxyConfig {
2
- server: string;
3
- username?: string;
4
- password?: string;
5
- }
package/build/proxy.js DELETED
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });