unprint 0.17.4 → 0.17.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/package.json +2 -1
- package/src/app.js +33 -11
- package/tests/browser.js +17 -4
package/README.md
CHANGED
|
@@ -219,9 +219,9 @@ Use Playwright with Chromium (experimental)
|
|
|
219
219
|
Additional options
|
|
220
220
|
* `control`: Async function to interface with Playwright page passed as argument
|
|
221
221
|
* `scope`: Browser instance to (re)use, set to `null` to force new scope every request, default `main`.
|
|
222
|
-
* `browser`: Options object passed to Playwright's `launch
|
|
222
|
+
* `browser`: Options object passed to Playwright's `launch`.
|
|
223
223
|
* `browser.headless`: Headless mode, set to `false` to launch visible browser, default `true`.
|
|
224
|
-
* `context`: Options object passed to Playwright's `newContext
|
|
224
|
+
* `context`: Options object passed to Playwright's `newContext`.
|
|
225
225
|
* `page`: Options object passed to Playwright's `goto`.
|
|
226
226
|
|
|
227
227
|
This requires you to install the Chromium executable:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.17.4",
|
|
3
|
+
"version": "0.17.5",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
6
|
"scripts": {},
|
|
@@ -28,6 +28,7 @@
|
|
|
28
28
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
29
29
|
"jsdom": "^17.0.0",
|
|
30
30
|
"moment-timezone": "^0.5.34",
|
|
31
|
+
"object-hash": "^3.0.0",
|
|
31
32
|
"patchright": "^1.56.1",
|
|
32
33
|
"srcset": "^4.0.0",
|
|
33
34
|
"tunnel": "^0.0.6"
|
package/src/app.js
CHANGED
|
@@ -10,6 +10,7 @@ const axios = require('axios').default;
|
|
|
10
10
|
const Bottleneck = require('bottleneck');
|
|
11
11
|
const moment = require('moment-timezone');
|
|
12
12
|
const merge = require('deepmerge');
|
|
13
|
+
const hashObject = require('object-hash');
|
|
13
14
|
const srcset = require('srcset');
|
|
14
15
|
|
|
15
16
|
const settings = {
|
|
@@ -1024,7 +1025,6 @@ function setProxy(instance, options, url) {
|
|
|
1024
1025
|
|
|
1025
1026
|
if (options.proxy
|
|
1026
1027
|
&& options.proxy.enable !== false
|
|
1027
|
-
&& options.proxy.use !== false // use is a local override for enable
|
|
1028
1028
|
&& (options.proxy.use
|
|
1029
1029
|
|| options.proxy.hostnames?.includes(hostname))
|
|
1030
1030
|
) {
|
|
@@ -1035,14 +1035,18 @@ function setProxy(instance, options, url) {
|
|
|
1035
1035
|
},
|
|
1036
1036
|
});
|
|
1037
1037
|
|
|
1038
|
-
instance
|
|
1039
|
-
|
|
1038
|
+
if (instance) {
|
|
1039
|
+
instance.defaults.httpAgent = proxyAgent;
|
|
1040
|
+
instance.defaults.httpsAgent = proxyAgent;
|
|
1041
|
+
}
|
|
1040
1042
|
|
|
1041
1043
|
return true;
|
|
1042
1044
|
}
|
|
1043
1045
|
|
|
1044
|
-
instance
|
|
1045
|
-
|
|
1046
|
+
if (instance) {
|
|
1047
|
+
instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
|
|
1048
|
+
instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
|
|
1049
|
+
}
|
|
1046
1050
|
|
|
1047
1051
|
return false;
|
|
1048
1052
|
}
|
|
@@ -1050,9 +1054,11 @@ function setProxy(instance, options, url) {
|
|
|
1050
1054
|
const clients = new Map();
|
|
1051
1055
|
|
|
1052
1056
|
/* eslint-enable no-param-reassign */
|
|
1053
|
-
async function getBrowserInstance(scope, options) {
|
|
1054
|
-
|
|
1055
|
-
|
|
1057
|
+
async function getBrowserInstance(scope, options, useProxy = false) {
|
|
1058
|
+
const scopeKey = `${scope}_${useProxy ? 'proxy' : 'direct'}_${options.browser ? hashObject(options.browser) : 'default'}_${options.context ? hashObject(options.context) : 'default'}`;
|
|
1059
|
+
|
|
1060
|
+
if (clients.has(scopeKey)) {
|
|
1061
|
+
const client = clients.get(scopeKey);
|
|
1056
1062
|
|
|
1057
1063
|
await client.launchers;
|
|
1058
1064
|
|
|
@@ -1068,13 +1074,18 @@ async function getBrowserInstance(scope, options) {
|
|
|
1068
1074
|
const contextLauncher = browserLauncher.then((browser) => browser.newContext({
|
|
1069
1075
|
userAgent: 'unprint',
|
|
1070
1076
|
...options.context,
|
|
1077
|
+
...(useProxy && {
|
|
1078
|
+
proxy: {
|
|
1079
|
+
server: `${options.proxy.host}:${options.proxy.port}`,
|
|
1080
|
+
},
|
|
1081
|
+
}),
|
|
1071
1082
|
}));
|
|
1072
1083
|
|
|
1073
1084
|
const launchers = Promise.all([browserLauncher, contextLauncher]);
|
|
1074
1085
|
const client = { launchers };
|
|
1075
1086
|
|
|
1076
1087
|
if (scope) {
|
|
1077
|
-
clients.set(
|
|
1088
|
+
clients.set(scopeKey, client);
|
|
1078
1089
|
}
|
|
1079
1090
|
|
|
1080
1091
|
client.browser = await browserLauncher;
|
|
@@ -1134,18 +1145,22 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1134
1145
|
}, globalOptions, customOptions]);
|
|
1135
1146
|
|
|
1136
1147
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1148
|
+
const useProxy = setProxy(null, options, url);
|
|
1137
1149
|
|
|
1138
1150
|
const feedbackBase = {
|
|
1139
1151
|
url,
|
|
1140
1152
|
method: 'get',
|
|
1141
1153
|
interval,
|
|
1142
1154
|
concurrency,
|
|
1143
|
-
isProxied:
|
|
1155
|
+
isProxied: useProxy,
|
|
1156
|
+
isBrowser: true,
|
|
1144
1157
|
options,
|
|
1145
1158
|
};
|
|
1146
1159
|
|
|
1160
|
+
events.emit('requestInit', feedbackBase);
|
|
1161
|
+
|
|
1147
1162
|
return limiter.schedule(async () => {
|
|
1148
|
-
const { context, browser } = await getBrowserInstance(options.scope, options);
|
|
1163
|
+
const { context, browser } = await getBrowserInstance(options.scope, options, useProxy);
|
|
1149
1164
|
const page = await context.newPage();
|
|
1150
1165
|
|
|
1151
1166
|
const res = await page.goto(url, {
|
|
@@ -1196,6 +1211,12 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1196
1211
|
await browser.close();
|
|
1197
1212
|
}
|
|
1198
1213
|
|
|
1214
|
+
events.emit('requestSuccess', {
|
|
1215
|
+
...feedbackBase,
|
|
1216
|
+
status,
|
|
1217
|
+
statusText,
|
|
1218
|
+
});
|
|
1219
|
+
|
|
1199
1220
|
return curateResponse({
|
|
1200
1221
|
data,
|
|
1201
1222
|
status,
|
|
@@ -1236,6 +1257,7 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1236
1257
|
interval,
|
|
1237
1258
|
concurrency,
|
|
1238
1259
|
isProxied,
|
|
1260
|
+
isBrowser: false,
|
|
1239
1261
|
options,
|
|
1240
1262
|
};
|
|
1241
1263
|
|
package/tests/browser.js
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
const unprint = require('../src/app');
|
|
4
4
|
|
|
5
|
+
unprint.options({ // or unprint.options();
|
|
6
|
+
proxy: {
|
|
7
|
+
enable: true,
|
|
8
|
+
use: false, // don't use for all requests by default
|
|
9
|
+
host: '192.168.1.25',
|
|
10
|
+
port: 8888,
|
|
11
|
+
hostnames: [
|
|
12
|
+
'tools-httpstatus.pickup-services.com',
|
|
13
|
+
],
|
|
14
|
+
},
|
|
15
|
+
});
|
|
16
|
+
|
|
5
17
|
async function initTest() {
|
|
6
18
|
// concurrency
|
|
7
19
|
await Promise.all([
|
|
@@ -41,17 +53,18 @@ async function initTest() {
|
|
|
41
53
|
headless: false,
|
|
42
54
|
},
|
|
43
55
|
async control(_page) {
|
|
44
|
-
//
|
|
56
|
+
// return new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
|
|
45
57
|
},
|
|
46
58
|
});
|
|
47
59
|
}),
|
|
48
60
|
]);
|
|
49
61
|
|
|
50
62
|
const res = await unprint.browser('https://www.scrapingcourse.com/', {
|
|
51
|
-
|
|
52
|
-
|
|
63
|
+
browser: {
|
|
64
|
+
headless: false,
|
|
65
|
+
},
|
|
53
66
|
async control(_page) {
|
|
54
|
-
|
|
67
|
+
// await new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
|
|
55
68
|
},
|
|
56
69
|
});
|
|
57
70
|
|