maxun-core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ const joi_1 = __importDefault(require("joi"));
7
+ const logic_1 = require("./types/logic");
8
/**
 * Static helpers for validating and preprocessing workflow files/objects.
 */
class Preprocessor {
    /**
     * Validates the structure of a workflow file against the Joi schema.
     * @param {WorkflowFile} workflow Object to validate.
     * @returns The `error` member of the Joi validation result
     * (`undefined` when the object matches the schema).
     */
    static validateWorkflow(workflow) {
        // `{ $regex: '...' }` may stand in for a plain string in several fields.
        const regex = joi_1.default.object({
            $regex: joi_1.default.string().required(),
        });
        // The schema is given an id so that $and / $or / $not can link back to it recursively.
        const whereSchema = joi_1.default.object({
            url: [joi_1.default.string().uri(), regex],
            selectors: joi_1.default.array().items(joi_1.default.string()),
            cookies: joi_1.default.object({}).pattern(joi_1.default.string(), joi_1.default.string()),
            $after: [joi_1.default.string(), regex],
            $before: [joi_1.default.string(), regex],
            $and: joi_1.default.array().items(joi_1.default.link('#whereSchema')),
            $or: joi_1.default.array().items(joi_1.default.link('#whereSchema')),
            $not: joi_1.default.link('#whereSchema'),
        }).id('whereSchema');
        const schema = joi_1.default.object({
            meta: joi_1.default.object({
                name: joi_1.default.string(),
                desc: joi_1.default.string(),
            }),
            workflow: joi_1.default.array().items(joi_1.default.object({
                id: joi_1.default.string(),
                where: whereSchema.required(),
                what: joi_1.default.array().items({
                    action: joi_1.default.string().required(),
                    args: joi_1.default.array().items(joi_1.default.any()),
                }).required(),
            })).required(),
        });
        const { error } = schema.validate(workflow);
        return error;
    }
    /**
     * Extracts parameter names from the workflow.
     * A parameter is any object with a truthy `$param` member; its value is the name.
     * @param {WorkflowFile} workflow The given workflow
     * @returns {String[]} Parameter names in depth-first order. A name used in
     * several places is listed once per occurrence.
     */
    static getParams(workflow) {
        const names = [];
        const visit = (node) => {
            // `typeof null === 'object'`, so null has to be excluded explicitly
            // (it is a legal JSON value, e.g. inside `args`).
            if (node === null || typeof node !== 'object') {
                return;
            }
            // Recursion base case
            if (node.$param) {
                names.push(node.$param);
                return;
            }
            // Recursion general case
            Object.values(node).forEach(visit);
        };
        visit(workflow.workflow);
        return names;
    }
    /**
     * Lists all the selectors used in the given workflow (only the literal
     * `selectors` field of WHERE clauses, including clauses nested under the
     * logic operators).
     * @param {Workflow} workflow Array of where-what pairs.
     * @returns {String[]} Selectors in order of first appearance, without duplicates.
     */
    // TODO : add recursive selector search (also in click/fill etc. events?)
    static extractSelectors(workflow) {
        /**
         * Given a Where condition, this function extracts
         * all the existing selectors from it (recursively).
         */
        const selectorsFromCondition = (where) => {
            // the `selectors` field is either on the top level
            const own = where.selectors != null ? where.selectors : [];
            const found = Array.isArray(own) ? [...own] : [own];
            // or nested in the "operator" array
            logic_1.operators.forEach((op) => {
                const nested = where[op];
                if (nested) {
                    // unary operators hold a single condition, n-ary ones hold an array
                    (Array.isArray(nested) ? nested : [nested]).forEach((subWhere) => {
                        found.push(...selectorsFromCondition(subWhere));
                    });
                }
            });
            return found;
        };
        // Iterate through all the steps and keep the first occurrence of every selector
        // (duplicates are dropped both across steps and within a single step).
        const unique = [];
        workflow.forEach((step) => {
            selectorsFromCondition(step.where).forEach((selector) => {
                if (!unique.includes(selector)) {
                    unique.push(selector);
                }
            });
        });
        return unique;
    }
    /**
     * Recursively crawls the workflow and initializes it: replaces the
     * `{$param : paramName}` objects with the provided values and the
     * `{$regex : source}` objects with `RegExp` instances.
     * @param {Workflow} workflow Array of where-what pairs.
     * @param {Object} params Parameter values keyed by parameter name.
     * @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched).
     * @throws {Error} When the provided parameter names differ from the names used in the workflow.
     */
    static initWorkflow(workflow, params) {
        // A parameter may be referenced several times; compare distinct names only.
        const expected = [...new Set(this.getParams({ workflow }))].sort().join(',');
        const provided = Object.keys(params != null ? params : {}).sort().join(',');
        if (provided !== expected) {
            throw new Error(`Provided parameters do not match the workflow parameters
                provided: ${provided},
                expected: ${expected}
            `);
        }
        /**
         * A recursive method for initializing special `{key: value}` syntax objects in the workflow.
         * Mutates `object` in place.
         * @param object Workflow to initialize (or a part of it).
         * @param k key to look for ($regex, $param)
         * @param f function mutating the special `{}` syntax into
         * its true representation (RegExp...)
         * @returns Updated object
         */
        const initSpecialRecurse = (object, k, f) => {
            if (!object || typeof object !== 'object') {
                return object;
            }
            // for every key (child) of the object
            Object.keys(object).forEach((key) => {
                const child = object[key];
                // Only non-null objects can carry the special syntax
                // (`Object.keys(null)` would throw).
                const isSpecial = child !== null
                    && typeof child === 'object'
                    && Object.keys(child).length === 1
                    && Boolean(child[k]);
                if (isSpecial) {
                    // process the current special tag (init param, hydrate regex...)
                    object[key] = f(child[k]);
                }
                else {
                    initSpecialRecurse(child, k, f);
                }
            });
            return object;
        };
        // TODO: do better deep copy, this is hideous.
        let workflowCopy = JSON.parse(JSON.stringify(workflow));
        if (params) {
            workflowCopy = initSpecialRecurse(workflowCopy, '$param', (paramName) => {
                // Test for presence, not truthiness: 0, '' and false are valid values.
                if (Object.prototype.hasOwnProperty.call(params, paramName)) {
                    return params[paramName];
                }
                throw new SyntaxError(`Unspecified parameter found ${paramName}.`);
            });
        }
        workflowCopy = initSpecialRecurse(workflowCopy, '$regex', (regex) => new RegExp(regex));
        return workflowCopy;
    }
}
151
+ exports.default = Preprocessor;
@@ -0,0 +1,4 @@
1
+ export declare const unaryOperators: readonly ["$not"]; // logic operators that wrap a single condition
2
+ export declare const naryOperators: readonly ["$and", "$or"]; // logic operators that wrap a list of conditions
3
+ export declare const operators: readonly ["$not", "$and", "$or"]; // every logic operator: the unary ones followed by the n-ary ones
4
+ export declare const meta: readonly ["$before", "$after"]; // non-logic condition keys; NOTE(review): presumably ordering hints between workflow steps - confirm
@@ -0,0 +1,7 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.meta = exports.operators = exports.naryOperators = exports.unaryOperators = void 0;
4
// Keys with a special meaning inside a workflow `where` condition.
// The combined `operators` list is derived from the two specific lists,
// unary operators first.
const unary = ['$not'];
const nary = ['$and', '$or'];
exports.unaryOperators = unary;
exports.naryOperators = nary;
exports.operators = unary.concat(nary);
exports.meta = ['$before', '$after'];
@@ -0,0 +1,47 @@
1
+ import { Page } from 'playwright';
2
+ import { naryOperators, unaryOperators, operators, meta } from './logic';
3
+ export type Operator = typeof operators[number]; // union of the names listed in `operators`
4
+ export type UnaryOperator = typeof unaryOperators[number]; // union of the names listed in `unaryOperators`
5
+ export type NAryOperator = typeof naryOperators[number]; // union of the names listed in `naryOperators`
6
+ export type Meta = typeof meta[number]; // union of the names listed in `meta`
7
+ export type SelectorArray = string[]; // list of selector strings
8
+ type RegexableString = string | { // either a literal string or a `{ $regex }` object holding a pattern source
9
+ '$regex': string;
10
+ };
11
+ type BaseConditions = { // leaf conditions of a `where` clause
12
+ 'url': RegexableString;
13
+ 'cookies': Record<string, RegexableString>;
14
+ 'selectors': SelectorArray;
15
+ } & Record<Meta, RegexableString>;
16
+ export type Where = Partial<{ // recursive condition: every member is optional
17
+ [key in NAryOperator]: Where[];
18
+ }> & // either a logic operator (arity N)
19
+ Partial<{
20
+ [key in UnaryOperator]: Where;
21
+ }> & // or a unary operator
22
+ Partial<BaseConditions>;
23
+ type MethodNames<T> = { // keys of T whose values are functions
24
+ [K in keyof T]: T[K] extends Function ? K : never;
25
+ }[keyof T];
26
+ export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto';
27
+ export type What = { // a single action: a Playwright `Page` method name or one of the custom function names
28
+ action: MethodNames<Page> | CustomFunctions;
29
+ args?: any[];
30
+ };
31
+ export type PageState = Partial<BaseConditions>;
32
+ export type ParamType = Record<string, any>; // parameter values keyed by parameter name
33
+ export type MetaData = {
34
+ name?: string;
35
+ desc?: string;
36
+ };
37
+ export interface WhereWhatPair { // one workflow step: a condition (`where`) and the actions (`what`) attached to it
38
+ id?: string;
39
+ where: Where;
40
+ what: What[];
41
+ }
42
+ export type Workflow = WhereWhatPair[];
43
+ export type WorkflowFile = { // a workflow together with its optional metadata
44
+ meta?: MetaData;
45
+ workflow: Workflow;
46
+ };
47
+ export {};
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Concurrency class for running concurrent tasks while managing a limited amount of resources.
3
+ */
4
+ export default class Concurrency {
5
+ /**
6
+ * Maximum number of workers running in parallel. A falsy value (e.g. `null` or `0`) means there is no limit.
7
+ */
8
+ maxConcurrency: number;
9
+ /**
10
+ * Number of currently active workers.
11
+ */
12
+ activeWorkers: number;
13
+ /**
14
+ * Queue of jobs waiting to be completed.
15
+ */
16
+ private jobQueue;
17
+ /**
18
+ * "Resolve" callbacks of the waitForCompletion() promises.
19
+ */
20
+ private waiting;
21
+ /**
22
+ * Constructs a new instance of concurrency manager.
23
+ * @param {number} maxConcurrency Maximum number of workers running in parallel.
24
+ */
25
+ constructor(maxConcurrency: number);
26
+ /**
27
+ * Takes a waiting job out of the queue and runs it.
28
+ */
29
+ private runNextJob;
30
+ /**
31
+ * Pass a job (a time-demanding async function) to the concurrency manager. \
32
+ * The time of the job's execution depends on the concurrency manager itself
33
+ * (given a generous enough `maxConcurrency` value, it might be immediate,
34
+ * but this is not guaranteed).
35
+ * @param job Async function to be executed (job to be processed).
36
+ */
37
+ addJob(job: () => Promise<any>): void;
38
+ /**
39
+ * Waits until there is no running nor waiting job. \
40
+ * If the concurrency manager is idle at the time of calling this function,
41
+ * it waits until at least one job is completed (can be "presubscribed").
42
+ * @returns Promise, resolved after there is no running/waiting worker.
43
+ */
44
+ waitForCompletion(): Promise<void>;
45
+ }
@@ -0,0 +1,81 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
/**
 * Concurrency manager: runs the submitted async jobs while keeping at most
 * `maxConcurrency` of them in flight at any time.
 */
class Concurrency {
    /**
     * Constructs a new instance of concurrency manager.
     * @param {number} maxConcurrency Maximum number of workers running in parallel.
     * A falsy value (e.g. `null` or `0`) disables the limit.
     */
    constructor(maxConcurrency) {
        /**
         * Maximum number of workers running in parallel. A falsy value means there is no limit.
         */
        this.maxConcurrency = 1;
        /**
         * Number of currently active workers.
         */
        this.activeWorkers = 0;
        /**
         * Jobs waiting to be started.
         */
        this.jobQueue = [];
        /**
         * "Resolve" callbacks of the pending waitForCompletion() promises.
         */
        this.waiting = [];
        this.maxConcurrency = maxConcurrency;
    }
    /**
     * Takes a waiting job out of the queue and runs it. When no job is left,
     * the calling worker retires; once the last worker has retired, every
     * pending waitForCompletion() promise is resolved.
     */
    runNextJob() {
        // NOTE: pop() takes the most recently added job first.
        const job = this.jobQueue.pop();
        if (!job) {
            this.activeWorkers -= 1;
            if (this.activeWorkers === 0) {
                // Hand over the current subscribers and start a fresh list, so that
                // already-resolved callbacks are not kept (and re-invoked) forever.
                const resolvers = this.waiting;
                this.waiting = [];
                resolvers.forEach((resolve) => resolve());
            }
            return;
        }
        let pending;
        try {
            pending = job();
        }
        catch (err) {
            // The job failed before returning a promise: free the slot, then report.
            this.runNextJob();
            throw err;
        }
        pending.then(() => {
            this.runNextJob();
        }, (err) => {
            // A failed job must release its worker slot as well, otherwise
            // waitForCompletion() would never resolve. The error is rethrown so
            // that it surfaces exactly as an unhandled job failure did before.
            this.runNextJob();
            throw err;
        });
    }
    /**
     * Pass a job (a time-demanding async function) to the concurrency manager. \
     * The time of the job's execution depends on the concurrency manager itself
     * (given a generous enough `maxConcurrency` value, it might be immediate,
     * but this is not guaranteed).
     * @param job Async function to be executed (job to be processed).
     */
    addJob(job) {
        this.jobQueue.push(job);
        if (!this.maxConcurrency || this.activeWorkers < this.maxConcurrency) {
            // Count the worker before starting it, so that a job which settles
            // right away sees a consistent `activeWorkers` value.
            this.activeWorkers += 1;
            this.runNextJob();
        }
        // Otherwise there is no free capacity: the job stays queued until a worker picks it up.
    }
    /**
     * Waits until there is no running nor waiting job. \
     * If the concurrency manager is idle at the time of calling this function,
     * it waits until at least one job is completed (can be "presubscribed").
     * @returns Promise, resolved after there is no running/waiting worker.
     */
    waitForCompletion() {
        return new Promise((res) => {
            this.waiting.push(res);
        });
    }
}
81
+ exports.default = Concurrency;
@@ -0,0 +1,9 @@
1
+ export declare enum Level {
2
+ DATE = 36, // ANSI SGR code (cyan) used for the timestamp prefix
3
+ LOG = 0, // ANSI SGR reset code: plain, uncoloured message
4
+ WARN = 93, // ANSI SGR code (bright yellow)
5
+ ERROR = 31, // ANSI SGR code (red)
6
+ DEBUG = 95, // ANSI SGR code (bright magenta)
7
+ RESET = 0 // ANSI SGR reset code written after every message; shares the value 0 with LOG
8
+ }
9
+ export default function logger(message: string | Error, level?: (Level.LOG | Level.WARN | Level.ERROR | Level.DEBUG)): void; // prints `message` with a timestamp, coloured by `level` (defaults to Level.LOG)
@@ -0,0 +1,31 @@
1
+ "use strict";
2
+ /*
3
+ * Logger class for more detailed and comprehensible logs (with colors and timestamps)
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.Level = void 0;
7
// Log levels. Every value is the number the logger interpolates into an
// `\x1b[<value>m` escape sequence. Besides the name -> value members, the
// value -> name reverse mapping is stored as well; LOG and RESET share the
// value 0, so the reverse entry for 0 ends up being "RESET".
var Level = exports.Level || (exports.Level = {});
[
    ['DATE', 36],
    ['LOG', 0],
    ['WARN', 93],
    ['ERROR', 31],
    ['DEBUG', 95],
    ['RESET', 0],
].forEach(([name, code]) => {
    Level[name] = code;
    Level[code] = name;
});
16
/**
 * Prints a timestamped, coloured log line.
 *
 * The whole line (timestamp, colour codes, message and trailing newline) is
 * written to a single stream: stderr for `Level.ERROR` and `Level.WARN`,
 * stdout for everything else. This keeps the line intact when the two
 * streams are redirected separately.
 *
 * @param message Text to print. For an `Error` (or an object whose constructor
 * name contains "Error") its `message` member is printed; any other non-string
 * value is converted with `String()`.
 * @param level One of the `Level` colour codes; defaults to `Level.LOG`.
 */
function logger(message, level = Level.LOG) {
    let text;
    if (typeof message === 'string') {
        text = message;
    }
    else if (message instanceof Error
        || (message && message.constructor && String(message.constructor.name).includes('Error'))) {
        text = String(message.message);
    }
    else {
        // null, undefined, numbers, plain objects...: never hand a non-string to write()
        text = String(message);
    }
    const stream = (level === Level.ERROR || level === Level.WARN) ? process.stderr : process.stdout;
    const stamp = `\x1b[${Level.DATE}m[${(new Date()).toLocaleString()}]\x1b[0m `;
    stream.write(`${stamp}\x1b[${level}m${text}\x1b[${Level.RESET}m\n`);
}
31
+ exports.default = logger;
@@ -0,0 +1,8 @@
1
+ /**
2
+ * ESLint rule in case there is only one util function
3
+ * (it still does not represent the "utils" file)
4
+ */
5
+ /**
6
+ * Converts an array of scalars to an object with **items** of the array **for keys**. Every key maps to a new empty array.
7
+ */
8
+ export declare function arrayToObject(array: any[]): any;
@@ -0,0 +1,15 @@
1
+ "use strict";
2
+ /**
3
+ * ESLint rule in case there is only one util function
4
+ * (it still does not represent the "utils" file)
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.arrayToObject = void 0;
8
+ /* eslint-disable import/prefer-default-export */
9
/**
 * Converts an array of scalars to an object with **items** of the array **for keys**.
 * Every key maps to its own new empty array; an item occurring several times yields a single key.
 * @param array Items to be used as keys.
 * @returns Object with one (empty array) entry per distinct item.
 */
function arrayToObject(array) {
    // Fill one object in a single pass instead of copying the accumulator for every item.
    const out = {};
    array.forEach((item) => {
        out[item] = [];
    });
    return out;
}
15
+ exports.arrayToObject = arrayToObject;
package/package.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "name": "maxun-core",
3
+ "version": "0.0.1",
4
+ "description": "Core package for Maxun, responsible for data extraction",
5
+ "main": "build/index.js",
6
+ "typings": "build/index.d.ts",
7
+ "scripts": {
8
+ "test": "jest",
9
+ "build": "npm run clean && tsc",
10
+ "lint": "eslint .",
11
+ "clean": "rimraf ./build"
12
+ },
13
+ "files": [
14
+ "build/*"
15
+ ],
16
+ "keywords": [
17
+ "maxun",
18
+ "no-code scraping",
19
+ "web",
20
+ "automation",
21
+ "workflow",
22
+ "data extraction",
23
+ "scraping"
24
+ ],
25
+ "author": "Maxun",
26
+ "license": "AGPL-3.0-or-later",
27
+ "dependencies": {
28
+ "@cliqz/adblocker-playwright": "^1.31.3",
29
+ "cross-fetch": "^4.0.0",
30
+ "joi": "^17.6.0",
31
+ "playwright": "^1.20.1",
32
+ "playwright-extra": "^4.3.6",
33
+ "puppeteer-extra-plugin-stealth": "^2.11.2"
34
+ }
35
+ }