unprint 0.17.3 → 0.17.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -2
- package/package.json +2 -1
- package/src/app.js +39 -12
- package/tests/browser.js +17 -4
package/README.md
CHANGED
|
@@ -18,6 +18,10 @@ unprint.options({
|
|
|
18
18
|
concurrency: 10,
|
|
19
19
|
interval: 10, // ms
|
|
20
20
|
},
|
|
21
|
+
browser: {
|
|
22
|
+
concurrency: 5,
|
|
23
|
+
interval: 20,
|
|
24
|
+
},
|
|
21
25
|
[hostname]: {
|
|
22
26
|
enable: true, // enabled by default
|
|
23
27
|
concurrency: 1,
|
|
@@ -215,9 +219,9 @@ Use Playwright with Chromium (experimental)
|
|
|
215
219
|
Additional options
|
|
216
220
|
* `control`: Async function to interface with Playwright page passed as argument
|
|
217
221
|
* `scope`: Browser instance to (re)use, set to `null` to force new scope every request, default `main`.
|
|
218
|
-
* `browser`: Options object passed to Playwright's `launch
|
|
222
|
+
* `browser`: Options object passed to Playwright's `launch`.
|
|
219
223
|
* `browser.headless`: Headless mode, set to `false` to launch visible browser, default `true`.
|
|
220
|
-
* `context`: Options object passed to Playwright's `newContext
|
|
224
|
+
* `context`: Options object passed to Playwright's `newContext`.
|
|
221
225
|
* `page`: Options object passed to Playwright's `goto`.
|
|
222
226
|
|
|
223
227
|
This requires you to install the Chromium executable:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.17.3",
|
|
3
|
+
"version": "0.17.5",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
6
|
"scripts": {},
|
|
@@ -28,6 +28,7 @@
|
|
|
28
28
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
29
29
|
"jsdom": "^17.0.0",
|
|
30
30
|
"moment-timezone": "^0.5.34",
|
|
31
|
+
"object-hash": "^3.0.0",
|
|
31
32
|
"patchright": "^1.56.1",
|
|
32
33
|
"srcset": "^4.0.0",
|
|
33
34
|
"tunnel": "^0.0.6"
|
package/src/app.js
CHANGED
|
@@ -10,6 +10,7 @@ const axios = require('axios').default;
|
|
|
10
10
|
const Bottleneck = require('bottleneck');
|
|
11
11
|
const moment = require('moment-timezone');
|
|
12
12
|
const merge = require('deepmerge');
|
|
13
|
+
const hashObject = require('object-hash');
|
|
13
14
|
const srcset = require('srcset');
|
|
14
15
|
|
|
15
16
|
const settings = {
|
|
@@ -21,6 +22,10 @@ const settings = {
|
|
|
21
22
|
interval: 10,
|
|
22
23
|
concurrency: 10,
|
|
23
24
|
},
|
|
25
|
+
browser: {
|
|
26
|
+
interval: 20,
|
|
27
|
+
concurrency: 5,
|
|
28
|
+
},
|
|
24
29
|
},
|
|
25
30
|
};
|
|
26
31
|
|
|
@@ -988,7 +993,7 @@ function getLimiterValue(prop, options, hostname) {
|
|
|
988
993
|
return options.limits[hostname][prop];
|
|
989
994
|
}
|
|
990
995
|
|
|
991
|
-
return options.limits.default[prop];
|
|
996
|
+
return options.limits[options?.limiter || 'default'][prop];
|
|
992
997
|
}
|
|
993
998
|
|
|
994
999
|
function getLimiter(url, options) {
|
|
@@ -1020,7 +1025,6 @@ function setProxy(instance, options, url) {
|
|
|
1020
1025
|
|
|
1021
1026
|
if (options.proxy
|
|
1022
1027
|
&& options.proxy.enable !== false
|
|
1023
|
-
&& options.proxy.use !== false // use is a local override for enable
|
|
1024
1028
|
&& (options.proxy.use
|
|
1025
1029
|
|| options.proxy.hostnames?.includes(hostname))
|
|
1026
1030
|
) {
|
|
@@ -1031,14 +1035,18 @@ function setProxy(instance, options, url) {
|
|
|
1031
1035
|
},
|
|
1032
1036
|
});
|
|
1033
1037
|
|
|
1034
|
-
instance
|
|
1035
|
-
|
|
1038
|
+
if (instance) {
|
|
1039
|
+
instance.defaults.httpAgent = proxyAgent;
|
|
1040
|
+
instance.defaults.httpsAgent = proxyAgent;
|
|
1041
|
+
}
|
|
1036
1042
|
|
|
1037
1043
|
return true;
|
|
1038
1044
|
}
|
|
1039
1045
|
|
|
1040
|
-
instance
|
|
1041
|
-
|
|
1046
|
+
if (instance) {
|
|
1047
|
+
instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
|
|
1048
|
+
instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
|
|
1049
|
+
}
|
|
1042
1050
|
|
|
1043
1051
|
return false;
|
|
1044
1052
|
}
|
|
@@ -1046,9 +1054,11 @@ function setProxy(instance, options, url) {
|
|
|
1046
1054
|
const clients = new Map();
|
|
1047
1055
|
|
|
1048
1056
|
/* eslint-enable no-param-reassign */
|
|
1049
|
-
async function getBrowserInstance(scope, options) {
|
|
1050
|
-
|
|
1051
|
-
|
|
1057
|
+
async function getBrowserInstance(scope, options, useProxy = false) {
|
|
1058
|
+
const scopeKey = `${scope}_${useProxy ? 'proxy' : 'direct'}_${options.browser ? hashObject(options.browser) : 'default'}_${options.context ? hashObject(options.context) : 'default'}`;
|
|
1059
|
+
|
|
1060
|
+
if (clients.has(scopeKey)) {
|
|
1061
|
+
const client = clients.get(scopeKey);
|
|
1052
1062
|
|
|
1053
1063
|
await client.launchers;
|
|
1054
1064
|
|
|
@@ -1064,13 +1074,18 @@ async function getBrowserInstance(scope, options) {
|
|
|
1064
1074
|
const contextLauncher = browserLauncher.then((browser) => browser.newContext({
|
|
1065
1075
|
userAgent: 'unprint',
|
|
1066
1076
|
...options.context,
|
|
1077
|
+
...(useProxy && {
|
|
1078
|
+
proxy: {
|
|
1079
|
+
server: `${options.proxy.host}:${options.proxy.port}`,
|
|
1080
|
+
},
|
|
1081
|
+
}),
|
|
1067
1082
|
}));
|
|
1068
1083
|
|
|
1069
1084
|
const launchers = Promise.all([browserLauncher, contextLauncher]);
|
|
1070
1085
|
const client = { launchers };
|
|
1071
1086
|
|
|
1072
1087
|
if (scope) {
|
|
1073
|
-
clients.set(scope, client);
|
|
1088
|
+
clients.set(scopeKey, client);
|
|
1074
1089
|
}
|
|
1075
1090
|
|
|
1076
1091
|
client.browser = await browserLauncher;
|
|
@@ -1125,22 +1140,27 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1125
1140
|
timeout: 1000,
|
|
1126
1141
|
extract: true,
|
|
1127
1142
|
scope: 'main',
|
|
1143
|
+
limiter: 'browser',
|
|
1128
1144
|
url,
|
|
1129
1145
|
}, globalOptions, customOptions]);
|
|
1130
1146
|
|
|
1131
1147
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1148
|
+
const useProxy = setProxy(null, options, url);
|
|
1132
1149
|
|
|
1133
1150
|
const feedbackBase = {
|
|
1134
1151
|
url,
|
|
1135
1152
|
method: 'get',
|
|
1136
1153
|
interval,
|
|
1137
1154
|
concurrency,
|
|
1138
|
-
isProxied:
|
|
1155
|
+
isProxied: useProxy,
|
|
1156
|
+
isBrowser: true,
|
|
1139
1157
|
options,
|
|
1140
1158
|
};
|
|
1141
1159
|
|
|
1160
|
+
events.emit('requestInit', feedbackBase);
|
|
1161
|
+
|
|
1142
1162
|
return limiter.schedule(async () => {
|
|
1143
|
-
const { context, browser } = await getBrowserInstance(options.scope, options);
|
|
1163
|
+
const { context, browser } = await getBrowserInstance(options.scope, options, useProxy);
|
|
1144
1164
|
const page = await context.newPage();
|
|
1145
1165
|
|
|
1146
1166
|
const res = await page.goto(url, {
|
|
@@ -1191,6 +1211,12 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1191
1211
|
await browser.close();
|
|
1192
1212
|
}
|
|
1193
1213
|
|
|
1214
|
+
events.emit('requestSuccess', {
|
|
1215
|
+
...feedbackBase,
|
|
1216
|
+
status,
|
|
1217
|
+
statusText,
|
|
1218
|
+
});
|
|
1219
|
+
|
|
1194
1220
|
return curateResponse({
|
|
1195
1221
|
data,
|
|
1196
1222
|
status,
|
|
@@ -1231,6 +1257,7 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1231
1257
|
interval,
|
|
1232
1258
|
concurrency,
|
|
1233
1259
|
isProxied,
|
|
1260
|
+
isBrowser: false,
|
|
1234
1261
|
options,
|
|
1235
1262
|
};
|
|
1236
1263
|
|
package/tests/browser.js
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
const unprint = require('../src/app');
|
|
4
4
|
|
|
5
|
+
unprint.options({ // or unprint.options();
|
|
6
|
+
proxy: {
|
|
7
|
+
enable: true,
|
|
8
|
+
use: false, // don't use for all requests by default
|
|
9
|
+
host: '192.168.1.25',
|
|
10
|
+
port: 8888,
|
|
11
|
+
hostnames: [
|
|
12
|
+
'tools-httpstatus.pickup-services.com',
|
|
13
|
+
],
|
|
14
|
+
},
|
|
15
|
+
});
|
|
16
|
+
|
|
5
17
|
async function initTest() {
|
|
6
18
|
// concurrency
|
|
7
19
|
await Promise.all([
|
|
@@ -41,17 +53,18 @@ async function initTest() {
|
|
|
41
53
|
headless: false,
|
|
42
54
|
},
|
|
43
55
|
async control(_page) {
|
|
44
|
-
//
|
|
56
|
+
// return new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
|
|
45
57
|
},
|
|
46
58
|
});
|
|
47
59
|
}),
|
|
48
60
|
]);
|
|
49
61
|
|
|
50
62
|
const res = await unprint.browser('https://www.scrapingcourse.com/', {
|
|
51
|
-
|
|
52
|
-
|
|
63
|
+
browser: {
|
|
64
|
+
headless: false,
|
|
65
|
+
},
|
|
53
66
|
async control(_page) {
|
|
54
|
-
|
|
67
|
+
// await new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
|
|
55
68
|
},
|
|
56
69
|
});
|
|
57
70
|
|