npm - maxun-core - Versions diffs - 0.0.1 - Mend

maxun-core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/README.md +1 -0
package/build/browserSide/scraper.d.ts +17 -0
package/build/browserSide/scraper.js +294 -0
package/build/index.d.ts +5 -0
package/build/index.js +14 -0
package/build/interpret.d.ts +74 -0
package/build/interpret.js +555 -0
package/build/preprocessor.d.ts +24 -0
package/build/preprocessor.js +151 -0
package/build/types/logic.d.ts +4 -0
package/build/types/logic.js +7 -0
package/build/types/workflow.d.ts +47 -0
package/build/types/workflow.js +2 -0
package/build/utils/concurrency.d.ts +45 -0
package/build/utils/concurrency.js +81 -0
package/build/utils/logger.d.ts +9 -0
package/build/utils/logger.js +31 -0
package/build/utils/utils.d.ts +8 -0
package/build/utils/utils.js +15 -0
package/package.json +35 -0

package/README.md ADDED Viewed

	@@ -0,0 +1 @@
1	+ ### Maxun-Core

package/build/browserSide/scraper.d.ts ADDED Viewed

@@ -0,0 +1,17 @@
+/**
+ * Returns the element with the largest area among those matching `selector`.
+ * @param selector Selector used to look up the candidate elements.
+ * @returns The largest matching element.
+ */
+declare function getBiggestElement(selector: any): any;
+/**
+ * Generates structural selector (describing element by its DOM tree location).
+ *
+ * **The generated selector is not guaranteed to be unique!** (In fact, this is
+ *    the desired behaviour in here.)
+ * @param {HTMLElement} element Element being described.
+ * @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
+ */
+declare function GetSelectorStructural(element: HTMLElement): string;
+/**
+ * Heuristic method to find collections of "interesting" items on the page.
+ * @param maxCountPerPage Optional upper bound related to the number of items per page.
+ * @param minArea Optional minimal area of an element to be taken into account.
+ * @param scrolls Optional number of scroll steps.
+ * @param metricType Optional name of the metric used to rank candidates.
+ * @returns {Array<HTMLElement>} A collection of interesting DOM nodes
+ *  (online store products, plane tickets, list items... and many more?)
+ */
+declare function scrapableHeuristics(maxCountPerPage?: number, minArea?: number, scrolls?: number, metricType?: string): Array<HTMLElement>;
+/**
+ * Returns the area of the given element as a number.
+ * @param element Element being measured.
+ */
+declare function area(element: any): number;

package/build/browserSide/scraper.js ADDED Viewed

@@ -0,0 +1,294 @@
+/* eslint-disable @typescript-eslint/no-unused-vars */
+// Helper that runs a generator function to completion and returns a promise
+// (constructed with `P`, defaulting to `Promise`). Every yielded value is
+// adopted into a `P` instance; its fulfilment is fed back through
+// `generator.next`, its rejection through `generator.throw`. The returned
+// promise resolves with the generator's final value and rejects when a step throws.
+// An already existing `this.__awaiter` is reused instead of this definition.
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    // Wraps a plain value in `P`; values that already are `P` instances pass through.
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        // Start the generator with the supplied `this` and arguments.
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+const area = (element) => element.offsetHeight * element.offsetWidth;
+function getBiggestElement(selector) {
+    const elements = Array.from(document.querySelectorAll(selector));
+    const biggest = elements.reduce((max, elem) => (area(elem) > area(max) ? elem : max), { offsetHeight: 0, offsetWidth: 0 });
+    return biggest;
+}
+/**
+ * Generates structural selector (describing element by its DOM tree location).
+ *
+ * **The generated selector is not guaranteed to be unique!** (In fact, this is
+ *    the desired behaviour in here.)
+ * @param {HTMLElement} element Element being described.
+ * @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
+ */
+function GetSelectorStructural(element) {
+    // Base conditions for the recursive approach.
+    if (element.tagName === 'BODY') {
+        return 'BODY';
+    }
+    const selector = element.tagName;
+    if (element.parentElement) {
+        return `${GetSelectorStructural(element.parentElement)} > ${selector}`;
+    }
+    return selector;
+}
+/**
+ * Heuristic method to find collections of "interesting" items on the page.
+ * @returns {Array<HTMLElement>} A collection of interesting DOM nodes
+ *  (online store products, plane tickets, list items... and many more?)
+ */
+function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') {
+    const restoreScroll = (() => {
+        const { scrollX, scrollY } = window;
+        return () => {
+            window.scrollTo(scrollX, scrollY);
+        };
+    })();
+    /**
+  * @typedef {Array<{x: number, y: number}>} Grid
+  */
+    /**
+   * Returns an array of grid-aligned {x,y} points.
+   * @param {number} [granularity=0.005] sets the number of generated points
+   *  (the higher the granularity, the more points).
+   * @returns {Grid} Array of {x, y} objects.
+   */
+    function getGrid(startX = 0, startY = 0, granularity = 0.005) {
+        const width = window.innerWidth;
+        const height = window.innerHeight;
+        const out = [];
+        for (let x = 0; x < width; x += 1 / granularity) {
+            for (let y = 0; y < height; y += 1 / granularity) {
+                out.push({ x: startX + x, y: startY + y });
+            }
+        }
+        return out;
+    }
+    let maxSelector = { selector: 'body', metric: 0 };
+    const updateMaximumWithPoint = (point) => {
+        const currentElement = document.elementFromPoint(point.x, point.y);
+        const selector = GetSelectorStructural(currentElement);
+        const elements = Array.from(document.querySelectorAll(selector))
+            .filter((element) => area(element) > minArea);
+        // If the current selector targets less than three elements,
+        // we consider it not interesting (would be a very underwhelming scraper)
+        if (elements.length < 3) {
+            return;
+        }
+        let metric = null;
+        if (metricType === 'total_area') {
+            metric = elements
+                .reduce((p, x) => p + area(x), 0);
+        }
+        else if (metricType === 'size_deviation') {
+            // This could use a proper "statistics" approach... but meh, so far so good!
+            const sizes = elements
+                .map((element) => area(element));
+            metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes));
+        }
+        if (metric > maxSelector.metric && elements.length < maxCountPerPage) {
+            maxSelector = { selector, metric };
+        }
+    };
+    for (let scroll = 0; scroll < scrolls; scroll += 1) {
+        window.scrollTo(0, scroll * window.innerHeight);
+        const grid = getGrid();
+        grid.forEach(updateMaximumWithPoint);
+    }
+    restoreScroll();
+    let out = Array.from(document.querySelectorAll(maxSelector.selector));
+    const different = (x, i, a) => a.findIndex((e) => e === x) === i;
+    // as long as we don't merge any two elements by substituing them for their parents,
+    // we substitute.
+    while (out.map((x) => x.parentElement).every(different)
+        && out.forEach((x) => x.parentElement !== null)) {
+        out = out.map((x) => { var _a; return (_a = x.parentElement) !== null && _a !== void 0 ? _a : x; });
+    }
+    return out;
+}
+// Browser-side entry points.
+// The functions below are defined inside an IIFE (Immediately Invoked Function
+// Expression) and attached to the `window` object.
+(function (window) {
+    /**
+     * Returns a "scrape" result from the current page.
+     * @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
+     */
+    window.scrape = function (selector = null) {
+        /**
+         * **crudeRecords** contains uncurated rundowns of "scrapable" elements
+         * @type {Array<Object>}
+         */
+        const crudeRecords = (selector
+            ? Array.from(document.querySelectorAll(selector))
+            : scrapableHeuristics())
+            .map((record) => (Object.assign(Object.assign({}, Array.from(record.querySelectorAll('img'))
+            .reduce((p, x, i) => {
+            let url = null;
+            if (x.srcset) {
+                const urls = x.srcset.split(', ');
+                [url] = urls[urls.length - 1].split(' ');
+            }
+            /**
+               * Contains the largest elements from `srcset` - if `srcset` is not present, contains
+               * URL from the `src` attribute
+               *
+               * If the `src` attribute contains a data url, imgUrl contains `undefined`.
+               */
+            let imgUrl;
+            if (x.srcset) {
+                imgUrl = url;
+            }
+            else if (x.src.indexOf('data:') === -1) {
+                imgUrl = x.src;
+            }
+            return (Object.assign(Object.assign({}, p), (imgUrl ? { [`img_${i}`]: imgUrl } : {})));
+        }, {})), record.innerText.split('\n')
+            .reduce((p, x, i) => (Object.assign(Object.assign({}, p), { [`record_${String(i).padStart(4, '0')}`]: x.trim() })), {}))));
+        return crudeRecords;
+    };
+    /**
+     * TODO: Simplify.
+     * Given an object with named lists of elements,
+     *  groups the elements by their distance in the DOM tree.
+     * @param {Object.<string, {selector: string, tag: string}>} lists The named lists of HTML elements.
+     * @returns {Array.<Object.<string, string>>}
+     */
+    window.scrapeSchema = function (lists) {
+        function omap(object, f, kf = (x) => x) {
+            return Object.fromEntries(Object.entries(object)
+                .map(([k, v]) => [kf(k), f(v)]));
+        }
+        function ofilter(object, f) {
+            return Object.fromEntries(Object.entries(object)
+                .filter(([k, v]) => f(k, v)));
+        }
+        function getSeedKey(listObj) {
+            const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length)));
+            return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0];
+        }
+        function getMBEs(elements) {
+            return elements.map((element) => {
+                let candidate = element;
+                const isUniqueChild = (e) => elements
+                    .filter((elem) => { var _a; return (_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem); })
+                    .length === 1;
+                while (candidate && isUniqueChild(candidate)) {
+                    candidate = candidate.parentNode;
+                }
+                return candidate;
+            });
+        }
+        const seedName = getSeedKey(lists);
+        const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector));
+        const MBEs = getMBEs(seedElements);
+        return MBEs.map((mbe) => omap(lists, ({ selector, attribute }, key) => {
+            const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem));
+            if (!elem)
+                return undefined;
+            switch (attribute) {
+                case 'href':
+                    return elem.getAttribute('href');
+                case 'src':
+                    return elem.getAttribute('src');
+                case 'innerText':
+                    return elem.innerText;
+                case 'textContent':
+                    return elem.textContent;
+                default:
+                    return elem.innerText;
+            }
+        }, (key) => key // Use the original key in the output
+        ));
+    };
+    /**
+   * Scrapes multiple lists of similar items based on a template item.
+   * @param {Object} config - Configuration object
+   * @param {string} config.listSelector - Selector for the list container(s)
+   * @param {Object.<string, {selector: string, attribute?: string}>} config.fields - Fields to scrape
+   * @param {number} [config.limit] - Maximum number of items to scrape per list (optional)
+   * @param {boolean} [config.flexible=false] - Whether to use flexible matching for field selectors
+   * @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
+   */
+    window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const scrapedData = [];
+            while (scrapedData.length < limit) {
+                // Get all parent elements matching the listSelector
+                const parentElements = Array.from(document.querySelectorAll(listSelector));
+                // Iterate through each parent element
+                for (const parent of parentElements) {
+                    if (scrapedData.length >= limit)
+                        break;
+                    const record = {};
+                    // For each field, select the corresponding element within the parent
+                    for (const [label, { selector, attribute }] of Object.entries(fields)) {
+                        const fieldElement = parent.querySelector(selector);
+                        if (fieldElement) {
+                            if (attribute === 'innerText') {
+                                record[label] = fieldElement.innerText.trim();
+                            }
+                            else if (attribute === 'innerHTML') {
+                                record[label] = fieldElement.innerHTML.trim();
+                            }
+                            else if (attribute === 'src') {
+                                record[label] = fieldElement.src;
+                            }
+                            else if (attribute === 'href') {
+                                record[label] = fieldElement.href;
+                            }
+                            else {
+                                record[label] = fieldElement.getAttribute(attribute);
+                            }
+                        }
+                    }
+                    scrapedData.push(record);
+                }
+            }
+            return scrapedData;
+        });
+    };
+    /**
+   * Gets all children of the elements matching the listSelector,
+   * returning their CSS selectors and innerText.
+   * @param {string} listSelector - Selector for the list container(s)
+   * @returns {Array.<Object>} Array of objects, each containing the CSS selector and innerText of the children
+   */
+    window.scrapeListAuto = function (listSelector) {
+        const lists = Array.from(document.querySelectorAll(listSelector));
+        const results = [];
+        lists.forEach(list => {
+            const children = Array.from(list.children);
+            children.forEach(child => {
+                const selectors = [];
+                let element = child;
+                // Traverse up to gather the CSS selector for the element
+                while (element && element !== document) {
+                    let selector = element.nodeName.toLowerCase();
+                    if (element.id) {
+                        selector += `#${element.id}`;
+                        selectors.push(selector);
+                        break;
+                    }
+                    else {
+                        const className = element.className.trim().split(/\s+/).join('.');
+                        if (className) {
+                            selector += `.${className}`;
+                        }
+                        selectors.push(selector);
+                        element = element.parentElement;
+                    }
+                }
+                results.push({
+                    selector: selectors.reverse().join(' > '),
+                    innerText: child.innerText.trim()
+                });
+            });
+        });
+        return results;
+    };
+})(window);

package/build/index.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+// Package entry point (type declarations).
+import Interpreter from './interpret';
+// The interpreter is the default export of the package.
+export default Interpreter;
+// Named export of the default export of `./preprocessor`.
+export { default as Preprocessor } from './preprocessor';
+// Type-only re-exports of the workflow types.
+export type { WorkflowFile, WhereWhatPair, Where, What, } from './types/workflow';
+// Operator definitions; `meta` is published under the name `metaOperators`.
+export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';

package/build/index.js ADDED Viewed

@@ -0,0 +1,14 @@
+"use strict";
+// Interop helper: a module flagged with `__esModule` is used as it is, anything else
+// is wrapped so that it becomes the `default` property.
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+// Pre-declare the named exports; the getters below provide the values.
+exports.metaOperators = exports.naryOperators = exports.unaryOperators = exports.Preprocessor = void 0;
+const interpret_1 = __importDefault(require("./interpret"));
+// Default export: the default export of `./interpret`.
+exports.default = interpret_1.default;
+var preprocessor_1 = require("./preprocessor");
+// `Preprocessor` is the default export of `./preprocessor`, resolved on access.
+Object.defineProperty(exports, "Preprocessor", { enumerable: true, get: function () { return __importDefault(preprocessor_1).default; } });
+var logic_1 = require("./types/logic");
+Object.defineProperty(exports, "unaryOperators", { enumerable: true, get: function () { return logic_1.unaryOperators; } });
+Object.defineProperty(exports, "naryOperators", { enumerable: true, get: function () { return logic_1.naryOperators; } });
+// `meta` from `./types/logic` is exported under the name `metaOperators`.
+Object.defineProperty(exports, "metaOperators", { enumerable: true, get: function () { return logic_1.meta; } });

package/build/interpret.d.ts ADDED Viewed

@@ -0,0 +1,74 @@
+/// <reference types="node" />
+import { Page } from 'playwright';
+import { EventEmitter } from 'events';
+import { WorkflowFile, ParamType } from './types/workflow';
+/**
+ * Defines optional interpreter options (passed in constructor).
+ *
+ * The constructor accepts a `Partial` of this interface, so every field may be omitted there.
+ */
+interface InterpreterOptions {
+    // NOTE(review): presumably an upper bound on repeated executions - confirm in interpret.js.
+    maxRepeats: number;
+    // NOTE(review): presumably the maximal number of concurrently running jobs - confirm in interpret.js.
+    maxConcurrency: number;
+    // Callback receiving an `output` value; may be synchronous or return a promise.
+    serializableCallback: (output: any) => (void | Promise<void>);
+    // Callback receiving an `output` value together with its MIME type; may be synchronous or return a promise.
+    binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
+    // NOTE(review): presumably toggles debug output - confirm in interpret.js.
+    debug: boolean;
+    // Optional debugging hooks; both functions may be left out.
+    debugChannel: Partial<{
+        activeId: Function;
+        debugMessage: Function;
+    }>;
+}
+/**
+ * Class for running the Smart Workflows.
+ *
+ * Extends Node's `EventEmitter`.
+ */
+export default class Interpreter extends EventEmitter {
+    private workflow;
+    private initializedWorkflow;
+    private options;
+    private concurrency;
+    private stopper;
+    private log;
+    private blocker;
+    /**
+     * @param workflow Workflow file to be run.
+     * @param options Optional interpreter options; any subset of `InterpreterOptions`.
+     */
+    constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
+    private applyAdBlocker;
+    private disableAdBlocker;
+    /**
+     * Returns the context object from given Page and the current workflow.\
+     * \
+     * `workflow` is used for selector extraction - function searches for used selectors to
+     * look for later in the page's context.
+     * @param page Playwright Page object
+     * @param workflow Current **initialized** workflow (array of where-what pairs).
+     * @returns {PageState} State of the current page.
+     */
+    private getState;
+    /**
+     * Tests if the given action is applicable with the given context.
+     * @param where Tested *where* condition
+     * @param context Current browser context.
+     * @returns True if `where` is applicable in the given context, false otherwise
+     */
+    private applicable;
+    /**
+     * Given a Playwright's page object and a "declarative" list of actions, this function
+     * calls all mentioned functions on the Page object.\
+     * \
+     * Manipulates the iterator indexes (experimental feature, likely to be removed in
+     * the following versions of maxun-core)
+     * @param page Playwright Page object
+     * @param steps Array of actions.
+     */
+    private carryOutSteps;
+    private handlePagination;
+    private runLoop;
+    private ensureScriptsLoaded;
+    /**
+     * Spawns a browser context and runs given workflow.
+     * \
+     * Resolves after the playback is finished.
+     * @param {Page} [page] Page to run the workflow on.
+     * @param {ParamType} params Workflow specific, set of parameters
+     *  for the `{$param: nameofparam}` fields.
+     */
+    run(page: Page, params?: ParamType): Promise<void>;
+    /**
+     * NOTE(review): presumably stops a running workflow - confirm in interpret.js.
+     * @returns Promise resolving with no value.
+     */
+    stop(): Promise<void>;
+}
+export {};