npm - maxun-core - Versions diffs - 0.0.3 → 0.0.5 - Mend

maxun-core 0.0.3 → 0.0.5

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Files changed (6)

package/build/browserSide/scraper.js +11 -5
package/build/interpret.d.ts +26 -0
package/build/interpret.js +37 -5
package/build/utils/concurrency.d.ts +1 -1
package/build/utils/concurrency.js +1 -1
package/package.json +7 -2

package/build/browserSide/scraper.js CHANGED Viewed

@@ -192,9 +192,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                 return undefined;
             switch (attribute) {
                 case 'href':
-                    return elem.getAttribute('href');
+                    const relativeHref = elem.getAttribute('href');
+                    return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
                 case 'src':
-                    return elem.getAttribute('src');
+                    const relativeSrc = elem.getAttribute('src');
+                    return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
                 case 'innerText':
                     return elem.innerText;
                 case 'textContent':
@@ -203,7 +205,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                     return elem.innerText;
             }
         }, (key) => key // Use the original key in the output
-        ));
+        )) || [];
     };
     /**
    * Scrapes multiple lists of similar items based on a template item.
@@ -236,10 +238,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                                 record[label] = fieldElement.innerHTML.trim();
                             }
                             else if (attribute === 'src') {
-                                record[label] = fieldElement.src;
+                                // Handle relative 'src' URLs
+                                const src = fieldElement.getAttribute('src');
+                                record[label] = src ? new URL(src, baseUrl).href : null;
                             }
                             else if (attribute === 'href') {
-                                record[label] = fieldElement.href;
+                                // Handle relative 'href' URLs
+                                const href = fieldElement.getAttribute('href');
+                                record[label] = href ? new URL(href, baseUrl).href : null;
                             }
                             else {
                                 record[label] = fieldElement.getAttribute(attribute);

package/build/interpret.d.ts CHANGED Viewed

@@ -2,6 +2,31 @@
 import { Page } from 'playwright';
 import { EventEmitter } from 'events';
 import { WorkflowFile, ParamType } from './types/workflow';
+/**
+ * Extending the Window interface for custom scraping functions.
+ */
+declare global {
+    interface Window {
+        scrape: (selector: string | null) => Record<string, string>[];
+        scrapeSchema: (schema: Record<string, {
+            selector: string;
+            tag: string;
+            attribute: string;
+        }>) => Record<string, any>;
+        scrapeList: (config: {
+            listSelector: string;
+            fields: any;
+            limit?: number;
+            pagination: any;
+        }) => Record<string, any>[];
+        scrapeListAuto: (listSelector: string) => {
+            selector: string;
+            innerText: string;
+        }[];
+        scrollDown: (pages?: number) => void;
+        scrollUp: (pages?: number) => void;
+    }
+}
 /**
  * Defines optional intepreter options (passed in constructor)
  */
@@ -27,6 +52,7 @@ export default class Interpreter extends EventEmitter {
     private stopper;
     private log;
     private blocker;
+    private cumulativeResults;
     constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
     private applyAdBlocker;
     private disableAdBlocker;

package/build/interpret.js CHANGED Viewed

@@ -53,9 +53,12 @@ class Interpreter extends events_1.EventEmitter {
         super();
         this.stopper = null;
         this.blocker = null;
+        this.cumulativeResults = [];
         this.workflow = workflow.workflow;
         this.initializedWorkflow = null;
-        this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
+        this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
+                (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
+            }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
         this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
         this.log = (...args) => (0, logger_1.default)(...args);
         const error = preprocessor_1.default.validateWorkflow(workflow);
@@ -273,7 +276,30 @@ class Interpreter extends events_1.EventEmitter {
                 scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
                     yield this.ensureScriptsLoaded(page);
                     const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
-                    yield this.options.serializableCallback(scrapeResult);
+                    const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];
+                    newResults.forEach((result) => {
+                        Object.entries(result).forEach(([key, value]) => {
+                            const keyExists = this.cumulativeResults.some((item) => key in item && item[key] !== undefined);
+                            if (!keyExists) {
+                                this.cumulativeResults.push({ [key]: value });
+                            }
+                        });
+                    });
+                    const mergedResult = [
+                        Object.fromEntries(Object.entries(this.cumulativeResults.reduce((acc, curr) => {
+                            Object.entries(curr).forEach(([key, value]) => {
+                                // If the key doesn't exist or the current value is not undefined, add/update it
+                                if (value !== undefined) {
+                                    acc[key] = value;
+                                }
+                            });
+                            return acc;
+                        }, {})))
+                    ];
+                    // Log cumulative results after each action
+                    console.log("CUMULATIVE results:", this.cumulativeResults);
+                    console.log("MERGED results:", mergedResult);
+                    yield this.options.serializableCallback(mergedResult);
                 }),
                 scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
                     yield this.ensureScriptsLoaded(page);
@@ -313,7 +339,7 @@ class Interpreter extends events_1.EventEmitter {
                 }),
             };
             for (const step of steps) {
-                this.log(`Launching ${step.action}`, logger_1.Level.LOG);
+                this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
                 if (step.action in wawActions) {
                     // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
                     const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
@@ -321,7 +347,7 @@ class Interpreter extends events_1.EventEmitter {
                 }
                 else {
                     // Implements the dot notation for the "method name" in the workflow
-                    const levels = step.action.split('.');
+                    const levels = String(step.action).split('.');
                     const methodName = levels[levels.length - 1];
                     let invokee = page;
                     for (const level of levels.splice(0, levels.length - 1)) {
@@ -477,7 +503,13 @@ class Interpreter extends events_1.EventEmitter {
                 if (this.options.debug) {
                     this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
                 }
-                const actionId = workflow.findIndex((step) => this.applicable(step.where, pageState, usedActions));
+                const actionId = workflow.findIndex((step) => {
+                    const isApplicable = this.applicable(step.where, pageState, usedActions);
+                    console.log(`Where:`, step.where);
+                    console.log(`Page state:`, pageState);
+                    console.log(`Match result: ${isApplicable}`);
+                    return isApplicable;
+                });
                 const action = workflow[actionId];
                 this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
                 if (action) { // action is matched

package/build/utils/concurrency.d.ts CHANGED Viewed

@@ -38,7 +38,7 @@ export default class Concurrency {
     /**
        * Waits until there is no running nor waiting job. \
        * If the concurrency manager is idle at the time of calling this function,
-       * it waits until at least one job is compeleted (can be "presubscribed").
+       * it waits until at least one job is completed (can be "presubscribed").
        * @returns Promise, resolved after there is no running/waiting worker.
        */
     waitForCompletion(): Promise<void>;

package/build/utils/concurrency.js CHANGED Viewed

@@ -69,7 +69,7 @@ class Concurrency {
     /**
        * Waits until there is no running nor waiting job. \
        * If the concurrency manager is idle at the time of calling this function,
-       * it waits until at least one job is compeleted (can be "presubscribed").
+       * it waits until at least one job is completed (can be "presubscribed").
        * @returns Promise, resolved after there is no running/waiting worker.
        */
     waitForCompletion() {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "maxun-core",
-  "version": "0.0.3",
+  "version": "0.0.5",
   "description": "Core package for Maxun, responsible for data extraction",
   "main": "build/index.js",
   "typings": "build/index.d.ts",
@@ -20,7 +20,12 @@
     "automation",
     "workflow",
     "data extraction",
-    "scraping"
+    "scraping",
+    "web scraper",
+    "web scraping",
+    "data scraping",
+    "no-code web scraper",
+    "no-code web scraping"
   ],
   "author": "Maxun",
   "license": "AGPL-3.0-or-later",