maxun-core 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -192,9 +192,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
192
192
|
return undefined;
|
|
193
193
|
switch (attribute) {
|
|
194
194
|
case 'href':
|
|
195
|
-
|
|
195
|
+
const relativeHref = elem.getAttribute('href');
|
|
196
|
+
return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
|
|
196
197
|
case 'src':
|
|
197
|
-
|
|
198
|
+
const relativeSrc = elem.getAttribute('src');
|
|
199
|
+
return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
|
|
198
200
|
case 'innerText':
|
|
199
201
|
return elem.innerText;
|
|
200
202
|
case 'textContent':
|
|
@@ -203,7 +205,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
203
205
|
return elem.innerText;
|
|
204
206
|
}
|
|
205
207
|
}, (key) => key // Use the original key in the output
|
|
206
|
-
));
|
|
208
|
+
)) || [];
|
|
207
209
|
};
|
|
208
210
|
/**
|
|
209
211
|
* Scrapes multiple lists of similar items based on a template item.
|
|
@@ -236,10 +238,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
236
238
|
record[label] = fieldElement.innerHTML.trim();
|
|
237
239
|
}
|
|
238
240
|
else if (attribute === 'src') {
|
|
239
|
-
|
|
241
|
+
// Handle relative 'src' URLs
|
|
242
|
+
const src = fieldElement.getAttribute('src');
|
|
243
|
+
record[label] = src ? new URL(src, baseUrl).href : null;
|
|
240
244
|
}
|
|
241
245
|
else if (attribute === 'href') {
|
|
242
|
-
|
|
246
|
+
// Handle relative 'href' URLs
|
|
247
|
+
const href = fieldElement.getAttribute('href');
|
|
248
|
+
record[label] = href ? new URL(href, baseUrl).href : null;
|
|
243
249
|
}
|
|
244
250
|
else {
|
|
245
251
|
record[label] = fieldElement.getAttribute(attribute);
|
package/build/interpret.d.ts
CHANGED
|
@@ -2,6 +2,31 @@
|
|
|
2
2
|
import { Page } from 'playwright';
|
|
3
3
|
import { EventEmitter } from 'events';
|
|
4
4
|
import { WorkflowFile, ParamType } from './types/workflow';
|
|
5
|
+
/**
|
|
6
|
+
* Extending the Window interface for custom scraping functions.
|
|
7
|
+
*/
|
|
8
|
+
declare global {
|
|
9
|
+
interface Window {
|
|
10
|
+
scrape: (selector: string | null) => Record<string, string>[];
|
|
11
|
+
scrapeSchema: (schema: Record<string, {
|
|
12
|
+
selector: string;
|
|
13
|
+
tag: string;
|
|
14
|
+
attribute: string;
|
|
15
|
+
}>) => Record<string, any>;
|
|
16
|
+
scrapeList: (config: {
|
|
17
|
+
listSelector: string;
|
|
18
|
+
fields: any;
|
|
19
|
+
limit?: number;
|
|
20
|
+
pagination: any;
|
|
21
|
+
}) => Record<string, any>[];
|
|
22
|
+
scrapeListAuto: (listSelector: string) => {
|
|
23
|
+
selector: string;
|
|
24
|
+
innerText: string;
|
|
25
|
+
}[];
|
|
26
|
+
scrollDown: (pages?: number) => void;
|
|
27
|
+
scrollUp: (pages?: number) => void;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
5
30
|
/**
|
|
6
31
|
* Defines optional intepreter options (passed in constructor)
|
|
7
32
|
*/
|
|
@@ -27,6 +52,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
27
52
|
private stopper;
|
|
28
53
|
private log;
|
|
29
54
|
private blocker;
|
|
55
|
+
private cumulativeResults;
|
|
30
56
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
31
57
|
private applyAdBlocker;
|
|
32
58
|
private disableAdBlocker;
|
package/build/interpret.js
CHANGED
|
@@ -53,9 +53,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
53
53
|
super();
|
|
54
54
|
this.stopper = null;
|
|
55
55
|
this.blocker = null;
|
|
56
|
+
this.cumulativeResults = [];
|
|
56
57
|
this.workflow = workflow.workflow;
|
|
57
58
|
this.initializedWorkflow = null;
|
|
58
|
-
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
59
|
+
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
60
|
+
(0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
|
|
61
|
+
}, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
|
|
59
62
|
this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
|
|
60
63
|
this.log = (...args) => (0, logger_1.default)(...args);
|
|
61
64
|
const error = preprocessor_1.default.validateWorkflow(workflow);
|
|
@@ -273,7 +276,30 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
273
276
|
scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
|
|
274
277
|
yield this.ensureScriptsLoaded(page);
|
|
275
278
|
const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
|
|
276
|
-
|
|
279
|
+
const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];
|
|
280
|
+
newResults.forEach((result) => {
|
|
281
|
+
Object.entries(result).forEach(([key, value]) => {
|
|
282
|
+
const keyExists = this.cumulativeResults.some((item) => key in item && item[key] !== undefined);
|
|
283
|
+
if (!keyExists) {
|
|
284
|
+
this.cumulativeResults.push({ [key]: value });
|
|
285
|
+
}
|
|
286
|
+
});
|
|
287
|
+
});
|
|
288
|
+
const mergedResult = [
|
|
289
|
+
Object.fromEntries(Object.entries(this.cumulativeResults.reduce((acc, curr) => {
|
|
290
|
+
Object.entries(curr).forEach(([key, value]) => {
|
|
291
|
+
// If the key doesn't exist or the current value is not undefined, add/update it
|
|
292
|
+
if (value !== undefined) {
|
|
293
|
+
acc[key] = value;
|
|
294
|
+
}
|
|
295
|
+
});
|
|
296
|
+
return acc;
|
|
297
|
+
}, {})))
|
|
298
|
+
];
|
|
299
|
+
// Log cumulative results after each action
|
|
300
|
+
console.log("CUMULATIVE results:", this.cumulativeResults);
|
|
301
|
+
console.log("MERGED results:", mergedResult);
|
|
302
|
+
yield this.options.serializableCallback(mergedResult);
|
|
277
303
|
}),
|
|
278
304
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
279
305
|
yield this.ensureScriptsLoaded(page);
|
|
@@ -313,7 +339,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
313
339
|
}),
|
|
314
340
|
};
|
|
315
341
|
for (const step of steps) {
|
|
316
|
-
this.log(`Launching ${step.action}`, logger_1.Level.LOG);
|
|
342
|
+
this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
|
|
317
343
|
if (step.action in wawActions) {
|
|
318
344
|
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
|
|
319
345
|
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
|
|
@@ -321,7 +347,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
321
347
|
}
|
|
322
348
|
else {
|
|
323
349
|
// Implements the dot notation for the "method name" in the workflow
|
|
324
|
-
const levels = step.action.split('.');
|
|
350
|
+
const levels = String(step.action).split('.');
|
|
325
351
|
const methodName = levels[levels.length - 1];
|
|
326
352
|
let invokee = page;
|
|
327
353
|
for (const level of levels.splice(0, levels.length - 1)) {
|
|
@@ -477,7 +503,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
477
503
|
if (this.options.debug) {
|
|
478
504
|
this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
|
|
479
505
|
}
|
|
480
|
-
const actionId = workflow.findIndex((step) =>
|
|
506
|
+
const actionId = workflow.findIndex((step) => {
|
|
507
|
+
const isApplicable = this.applicable(step.where, pageState, usedActions);
|
|
508
|
+
console.log(`Where:`, step.where);
|
|
509
|
+
console.log(`Page state:`, pageState);
|
|
510
|
+
console.log(`Match result: ${isApplicable}`);
|
|
511
|
+
return isApplicable;
|
|
512
|
+
});
|
|
481
513
|
const action = workflow[actionId];
|
|
482
514
|
this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
|
|
483
515
|
if (action) { // action is matched
|
|
@@ -38,7 +38,7 @@ export default class Concurrency {
|
|
|
38
38
|
/**
|
|
39
39
|
* Waits until there is no running nor waiting job. \
|
|
40
40
|
* If the concurrency manager is idle at the time of calling this function,
|
|
41
|
-
* it waits until at least one job is
|
|
41
|
+
* it waits until at least one job is completed (can be "presubscribed").
|
|
42
42
|
* @returns Promise, resolved after there is no running/waiting worker.
|
|
43
43
|
*/
|
|
44
44
|
waitForCompletion(): Promise<void>;
|
|
@@ -69,7 +69,7 @@ class Concurrency {
|
|
|
69
69
|
/**
|
|
70
70
|
* Waits until there is no running nor waiting job. \
|
|
71
71
|
* If the concurrency manager is idle at the time of calling this function,
|
|
72
|
-
* it waits until at least one job is
|
|
72
|
+
* it waits until at least one job is completed (can be "presubscribed").
|
|
73
73
|
* @returns Promise, resolved after there is no running/waiting worker.
|
|
74
74
|
*/
|
|
75
75
|
waitForCompletion() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "maxun-core",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.5",
|
|
4
4
|
"description": "Core package for Maxun, responsible for data extraction",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"typings": "build/index.d.ts",
|
|
@@ -20,7 +20,12 @@
|
|
|
20
20
|
"automation",
|
|
21
21
|
"workflow",
|
|
22
22
|
"data extraction",
|
|
23
|
-
"scraping"
|
|
23
|
+
"scraping",
|
|
24
|
+
"web scraper",
|
|
25
|
+
"web scraping",
|
|
26
|
+
"data scraping",
|
|
27
|
+
"no-code web scraper",
|
|
28
|
+
"no-code web scraping"
|
|
24
29
|
],
|
|
25
30
|
"author": "Maxun",
|
|
26
31
|
"license": "AGPL-3.0-or-later",
|