maxun-core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1 @@
1
+ ### Maxun-Core
@@ -0,0 +1,17 @@
1
+ declare function getBiggestElement(selector: any): any; // The implementation returns the match of `selector` with the largest offsetHeight * offsetWidth, or a zero-sized placeholder object when no match has a positive area.
2
+ /**
3
+ * Generates structural selector (describing element by its DOM tree location).
4
+ *
5
+ * **The generated selector is not guaranteed to be unique!** (In fact, this is
6
+ * the desired behaviour in here.)
7
+ * @param {HTMLElement} element Element being described.
8
+ * @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
9
+ */
10
+ declare function GetSelectorStructural(element: HTMLElement): string;
11
+ /**
12
+ * Heuristic method to find collections of "interesting" items on the page. All four parameters are optional; `metricType` is compared against 'total_area' and 'size_deviation' by the implementation.
13
+ * @returns {Array<HTMLElement>} A collection of interesting DOM nodes
14
+ * (online store products, plane tickets, list items... and many more?)
15
+ */
16
+ declare function scrapableHeuristics(maxCountPerPage?: number, minArea?: number, scrolls?: number, metricType?: string): Array<HTMLElement>;
17
+ declare function area(element: any): number; // Product of the element's offsetHeight and offsetWidth.
@@ -0,0 +1,294 @@
1
+ /* eslint-disable @typescript-eslint/no-unused-vars */
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Runs a generator function as a promise chain; an already defined `this.__awaiter` is reused instead.
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // Wraps values that are not instances of P in a new P.
4
+ return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the built-in Promise when not supplied.
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // Resolves when the generator is done, otherwise waits on the yielded value.
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ const area = (element) => element.offsetHeight * element.offsetWidth;
12
+ function getBiggestElement(selector) {
13
+ const elements = Array.from(document.querySelectorAll(selector));
14
+ const biggest = elements.reduce((max, elem) => (area(elem) > area(max) ? elem : max), { offsetHeight: 0, offsetWidth: 0 });
15
+ return biggest;
16
+ }
17
+ /**
18
+ * Generates structural selector (describing element by its DOM tree location).
19
+ *
20
+ * **The generated selector is not guaranteed to be unique!** (In fact, this is
21
+ * the desired behaviour in here.)
22
+ * @param {HTMLElement} element Element being described.
23
+ * @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
24
+ */
25
+ function GetSelectorStructural(element) {
26
+ // Base conditions for the recursive approach.
27
+ if (element.tagName === 'BODY') {
28
+ return 'BODY';
29
+ }
30
+ const selector = element.tagName;
31
+ if (element.parentElement) {
32
+ return `${GetSelectorStructural(element.parentElement)} > ${selector}`;
33
+ }
34
+ return selector;
35
+ }
36
+ /**
37
+ * Heuristic method to find collections of "interesting" items on the page.
38
+ * @returns {Array<HTMLElement>} A collection of interesting DOM nodes
39
+ * (online store products, plane tickets, list items... and many more?)
40
+ */
41
+ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') {
42
+ const restoreScroll = (() => {
43
+ const { scrollX, scrollY } = window;
44
+ return () => {
45
+ window.scrollTo(scrollX, scrollY);
46
+ };
47
+ })();
48
+ /**
49
+ * @typedef {Array<{x: number, y: number}>} Grid
50
+ */
51
+ /**
52
+ * Returns an array of grid-aligned {x,y} points.
53
+ * @param {number} [granularity=0.005] sets the number of generated points
54
+ * (the higher the granularity, the more points).
55
+ * @returns {Grid} Array of {x, y} objects.
56
+ */
57
+ function getGrid(startX = 0, startY = 0, granularity = 0.005) {
58
+ const width = window.innerWidth;
59
+ const height = window.innerHeight;
60
+ const out = [];
61
+ for (let x = 0; x < width; x += 1 / granularity) {
62
+ for (let y = 0; y < height; y += 1 / granularity) {
63
+ out.push({ x: startX + x, y: startY + y });
64
+ }
65
+ }
66
+ return out;
67
+ }
68
+ let maxSelector = { selector: 'body', metric: 0 };
69
+ const updateMaximumWithPoint = (point) => {
70
+ const currentElement = document.elementFromPoint(point.x, point.y);
71
+ const selector = GetSelectorStructural(currentElement);
72
+ const elements = Array.from(document.querySelectorAll(selector))
73
+ .filter((element) => area(element) > minArea);
74
+ // If the current selector targets less than three elements,
75
+ // we consider it not interesting (would be a very underwhelming scraper)
76
+ if (elements.length < 3) {
77
+ return;
78
+ }
79
+ let metric = null;
80
+ if (metricType === 'total_area') {
81
+ metric = elements
82
+ .reduce((p, x) => p + area(x), 0);
83
+ }
84
+ else if (metricType === 'size_deviation') {
85
+ // This could use a proper "statistics" approach... but meh, so far so good!
86
+ const sizes = elements
87
+ .map((element) => area(element));
88
+ metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes));
89
+ }
90
+ if (metric > maxSelector.metric && elements.length < maxCountPerPage) {
91
+ maxSelector = { selector, metric };
92
+ }
93
+ };
94
+ for (let scroll = 0; scroll < scrolls; scroll += 1) {
95
+ window.scrollTo(0, scroll * window.innerHeight);
96
+ const grid = getGrid();
97
+ grid.forEach(updateMaximumWithPoint);
98
+ }
99
+ restoreScroll();
100
+ let out = Array.from(document.querySelectorAll(maxSelector.selector));
101
+ const different = (x, i, a) => a.findIndex((e) => e === x) === i;
102
+ // as long as we don't merge any two elements by substituing them for their parents,
103
+ // we substitute.
104
+ while (out.map((x) => x.parentElement).every(different)
105
+ && out.forEach((x) => x.parentElement !== null)) {
106
+ out = out.map((x) => { var _a; return (_a = x.parentElement) !== null && _a !== void 0 ? _a : x; });
107
+ }
108
+ return out;
109
+ }
110
+ /**
111
+ * Returns a "scrape" result from the current page.
112
+ * @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
113
+ */
114
+ // Wrap the entire function in an IIFE (Immediately Invoked Function Expression)
115
+ // and attach it to the window object
116
+ (function (window) {
117
+ /**
118
+ * Returns a "scrape" result from the current page: one object per element matched by `selector`, or per element returned by scrapableHeuristics() when no selector is given.
119
+ * @returns {Array<Object>} Uncurated records holding `img_<n>` keys (image URLs) and `record_<nnnn>` keys (trimmed lines of innerText); no rows are removed here.
120
+ */
121
+ window.scrape = function (selector = null) {
122
+ /**
123
+ * **crudeRecords** contains uncurated rundowns of "scrapable" elements
124
+ * @type {Array<Object>}
125
+ */
126
+ const crudeRecords = (selector
127
+ ? Array.from(document.querySelectorAll(selector))
128
+ : scrapableHeuristics())
129
+ .map((record) => (Object.assign(Object.assign({}, Array.from(record.querySelectorAll('img'))
130
+ .reduce((p, x, i) => {
131
+ let url = null;
132
+ if (x.srcset) {
133
+ const urls = x.srcset.split(', ');
134
+ [url] = urls[urls.length - 1].split(' ');
135
+ }
136
+ /**
137
+ * Contains the URL of the last candidate listed in `srcset` - if `srcset` is not present, contains
138
+ * URL from the `src` attribute
139
+ *
140
+ * If the `src` attribute contains a data url, imgUrl contains `undefined`.
141
+ */
142
+ let imgUrl;
143
+ if (x.srcset) {
144
+ imgUrl = url;
145
+ }
146
+ else if (x.src.indexOf('data:') === -1) {
147
+ imgUrl = x.src;
148
+ }
149
+ return (Object.assign(Object.assign({}, p), (imgUrl ? { [`img_${i}`]: imgUrl } : {})));
150
+ }, {})), record.innerText.split('\n')
151
+ .reduce((p, x, i) => (Object.assign(Object.assign({}, p), { [`record_${String(i).padStart(4, '0')}`]: x.trim() })), {}))));
152
+ return crudeRecords;
153
+ };
154
+ /**
155
+ * TODO: Simplify.
156
+ * Given an object with named lists of elements,
157
+ * groups the elements by their distance in the DOM tree.
158
+ * @param {Object.<string, {selector: string, tag: string}>} lists The named lists of HTML elements.
159
+ * @returns {Array.<Object.<string, string>>}
160
+ */
161
+ window.scrapeSchema = function (lists) {
162
+ function omap(object, f, kf = (x) => x) {
163
+ return Object.fromEntries(Object.entries(object)
164
+ .map(([k, v]) => [kf(k), f(v)]));
165
+ }
166
+ function ofilter(object, f) {
167
+ return Object.fromEntries(Object.entries(object)
168
+ .filter(([k, v]) => f(k, v)));
169
+ }
170
+ function getSeedKey(listObj) {
171
+ const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length)));
172
+ return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0];
173
+ }
174
+ function getMBEs(elements) {
175
+ return elements.map((element) => {
176
+ let candidate = element;
177
+ const isUniqueChild = (e) => elements
178
+ .filter((elem) => { var _a; return (_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem); })
179
+ .length === 1;
180
+ while (candidate && isUniqueChild(candidate)) {
181
+ candidate = candidate.parentNode;
182
+ }
183
+ return candidate;
184
+ });
185
+ }
186
+ const seedName = getSeedKey(lists);
187
+ const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector));
188
+ const MBEs = getMBEs(seedElements);
189
+ return MBEs.map((mbe) => omap(lists, ({ selector, attribute }, key) => {
190
+ const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem));
191
+ if (!elem)
192
+ return undefined;
193
+ switch (attribute) {
194
+ case 'href':
195
+ return elem.getAttribute('href');
196
+ case 'src':
197
+ return elem.getAttribute('src');
198
+ case 'innerText':
199
+ return elem.innerText;
200
+ case 'textContent':
201
+ return elem.textContent;
202
+ default:
203
+ return elem.innerText;
204
+ }
205
+ }, (key) => key // Use the original key in the output
206
+ ));
207
+ };
208
+ /**
209
+ * Scrapes multiple lists of similar items based on a template item.
210
+ * @param {Object} config - Configuration object
211
+ * @param {string} config.listSelector - Selector for the list container(s)
212
+ * @param {Object.<string, {selector: string, attribute?: string}>} config.fields - Fields to scrape
213
+ * @param {number} [config.limit] - Maximum number of items to scrape per list (optional)
214
+ * @param {boolean} [config.flexible=false] - Whether to use flexible matching for field selectors
215
+ * @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
216
+ */
217
+ window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
218
+ return __awaiter(this, void 0, void 0, function* () {
219
+ const scrapedData = [];
220
+ while (scrapedData.length < limit) {
221
+ // Get all parent elements matching the listSelector
222
+ const parentElements = Array.from(document.querySelectorAll(listSelector));
223
+ // Iterate through each parent element
224
+ for (const parent of parentElements) {
225
+ if (scrapedData.length >= limit)
226
+ break;
227
+ const record = {};
228
+ // For each field, select the corresponding element within the parent
229
+ for (const [label, { selector, attribute }] of Object.entries(fields)) {
230
+ const fieldElement = parent.querySelector(selector);
231
+ if (fieldElement) {
232
+ if (attribute === 'innerText') {
233
+ record[label] = fieldElement.innerText.trim();
234
+ }
235
+ else if (attribute === 'innerHTML') {
236
+ record[label] = fieldElement.innerHTML.trim();
237
+ }
238
+ else if (attribute === 'src') {
239
+ record[label] = fieldElement.src;
240
+ }
241
+ else if (attribute === 'href') {
242
+ record[label] = fieldElement.href;
243
+ }
244
+ else {
245
+ record[label] = fieldElement.getAttribute(attribute);
246
+ }
247
+ }
248
+ }
249
+ scrapedData.push(record);
250
+ }
251
+ }
252
+ return scrapedData;
253
+ });
254
+ };
255
+ /**
256
+ * Gets all children of the elements matching the listSelector,
257
+ * returning their CSS selectors and innerText.
258
+ * @param {string} listSelector - Selector for the list container(s)
259
+ * @returns {Array.<Object>} Array of objects, each containing the CSS selector and innerText of the children
260
+ */
261
+ window.scrapeListAuto = function (listSelector) {
262
+ const lists = Array.from(document.querySelectorAll(listSelector));
263
+ const results = [];
264
+ lists.forEach(list => {
265
+ const children = Array.from(list.children);
266
+ children.forEach(child => {
267
+ const selectors = [];
268
+ let element = child;
269
+ // Traverse up to gather the CSS selector for the element
270
+ while (element && element !== document) {
271
+ let selector = element.nodeName.toLowerCase();
272
+ if (element.id) {
273
+ selector += `#${element.id}`;
274
+ selectors.push(selector);
275
+ break;
276
+ }
277
+ else {
278
+ const className = element.className.trim().split(/\s+/).join('.');
279
+ if (className) {
280
+ selector += `.${className}`;
281
+ }
282
+ selectors.push(selector);
283
+ element = element.parentElement;
284
+ }
285
+ }
286
+ results.push({
287
+ selector: selectors.reverse().join(' > '),
288
+ innerText: child.innerText.trim()
289
+ });
290
+ });
291
+ });
292
+ return results;
293
+ };
294
+ })(window);
@@ -0,0 +1,5 @@
1
+ import Interpreter from './interpret';
2
+ export default Interpreter;
3
+ export { default as Preprocessor } from './preprocessor';
4
+ export type { WorkflowFile, WhereWhatPair, Where, What, } from './types/workflow';
5
+ export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';
package/build/index.js ADDED
@@ -0,0 +1,14 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) { // Interop helper: returns modules flagged `__esModule` as-is and wraps anything else as `{ default: mod }`.
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.metaOperators = exports.naryOperators = exports.unaryOperators = exports.Preprocessor = void 0;
7
+ const interpret_1 = __importDefault(require("./interpret"));
8
+ exports.default = interpret_1.default;
9
+ var preprocessor_1 = require("./preprocessor");
10
+ Object.defineProperty(exports, "Preprocessor", { enumerable: true, get: function () { return __importDefault(preprocessor_1).default; } });
11
+ var logic_1 = require("./types/logic");
12
+ Object.defineProperty(exports, "unaryOperators", { enumerable: true, get: function () { return logic_1.unaryOperators; } });
13
+ Object.defineProperty(exports, "naryOperators", { enumerable: true, get: function () { return logic_1.naryOperators; } });
14
+ Object.defineProperty(exports, "metaOperators", { enumerable: true, get: function () { return logic_1.meta; } });
@@ -0,0 +1,74 @@
1
+ /// <reference types="node" />
2
+ import { Page } from 'playwright';
3
+ import { EventEmitter } from 'events';
4
+ import { WorkflowFile, ParamType } from './types/workflow';
5
+ /**
6
+ * Defines optional interpreter options (passed in constructor as `Partial<InterpreterOptions>`, so every field may be omitted there)
7
+ */
8
+ interface InterpreterOptions {
9
+ maxRepeats: number;
10
+ maxConcurrency: number;
11
+ serializableCallback: (output: any) => (void | Promise<void>);
12
+ binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
13
+ debug: boolean;
14
+ debugChannel: Partial<{
15
+ activeId: Function;
16
+ debugMessage: Function;
17
+ }>;
18
+ }
19
+ /**
20
+ * Class for running the Smart Workflows.
21
+ */
22
+ export default class Interpreter extends EventEmitter {
23
+ private workflow;
24
+ private initializedWorkflow;
25
+ private options;
26
+ private concurrency;
27
+ private stopper;
28
+ private log;
29
+ private blocker;
30
+ constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
31
+ private applyAdBlocker;
32
+ private disableAdBlocker;
33
+ /**
34
+ * Returns the context object from given Page and the current workflow.\
35
+ * \
36
+ * `workflow` is used for selector extraction - function searches for used selectors to
37
+ * look for later in the page's context.
38
+ * @param page Playwright Page object
39
+ * @param workflow Current **initialized** workflow (array of where-what pairs).
40
+ * @returns {PageState} State of the current page.
41
+ */
42
+ private getState;
43
+ /**
44
+ * Tests if the given action is applicable with the given context.
45
+ * @param where Tested *where* condition
46
+ * @param context Current browser context.
47
+ * @returns True if `where` is applicable in the given context, false otherwise
48
+ */
49
+ private applicable;
50
+ /**
51
+ * Given a Playwright's page object and a "declarative" list of actions, this function
52
+ * calls all mentioned functions on the Page object.\
53
+ * \
54
+ * Manipulates the iterator indexes (experimental feature, likely to be removed in
55
+ * the following versions of maxun-core)
56
+ * @param page Playwright Page object
57
+ * @param steps Array of actions.
58
+ */
59
+ private carryOutSteps;
60
+ private handlePagination;
61
+ private runLoop;
62
+ private ensureScriptsLoaded;
63
+ /**
64
+ * Spawns a browser context and runs given workflow.
65
+ * \
66
+ * Resolves after the playback is finished.
67
+ * @param {Page} page Page to run the workflow on (required by the signature below).
68
+ * @param {ParamType} [params] Workflow specific, set of parameters
69
+ * for the `{$param: nameofparam}` fields.
70
+ */
71
+ run(page: Page, params?: ParamType): Promise<void>;
72
+ stop(): Promise<void>; // NOTE(review): presumably halts a running workflow - implementation not visible in this declaration file, confirm.
73
+ }
74
+ export {};