unprint 0.17.4 → 0.17.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +2 -1
- package/src/app.js +46 -12
- package/tests/browser.js +17 -4
package/README.md
CHANGED
|
@@ -139,7 +139,7 @@ Returns the `src` from an image element (or any other specified target) as a str
|
|
|
139
139
|
Return the contents of a `data-` attribute.
|
|
140
140
|
|
|
141
141
|
#### Query a source set
|
|
142
|
-
`query.sourceSet([
|
|
142
|
+
`query.sourceSet(selector, [property], [options])` or `query.srcSet()`
|
|
143
143
|
|
|
144
144
|
Options:
|
|
145
145
|
* `includeDescriptor`: Produce an array of `{ descriptor, url }` instead of URL strings.
|
|
@@ -219,9 +219,9 @@ Use Playwright with Chromium (experimental)
|
|
|
219
219
|
Additional options
|
|
220
220
|
* `control`: Async function to interface with Playwright page passed as argument
|
|
221
221
|
* `scope`: Browser instance to (re)use, set to `null` to force new scope every request, default `main`.
|
|
222
|
-
* `browser`: Options object passed to Playwright's `launch
|
|
222
|
+
* `browser`: Options object passed to Playwright's `launch`.
|
|
223
223
|
* `browser.headless`: Headless mode, set to `false` to launch visible browser, default `true`.
|
|
224
|
-
* `context`: Options object passed to Playwright's `newContext
|
|
224
|
+
* `context`: Options object passed to Playwright's `newContext`.
|
|
225
225
|
* `page`: Options object passed to Playwright's `goto`.
|
|
226
226
|
|
|
227
227
|
This requires you to install the Chromium executable:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.17.4",
|
|
3
|
+
"version": "0.17.6",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
6
|
"scripts": {},
|
|
@@ -28,6 +28,7 @@
|
|
|
28
28
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
29
29
|
"jsdom": "^17.0.0",
|
|
30
30
|
"moment-timezone": "^0.5.34",
|
|
31
|
+
"object-hash": "^3.0.0",
|
|
31
32
|
"patchright": "^1.56.1",
|
|
32
33
|
"srcset": "^4.0.0",
|
|
33
34
|
"tunnel": "^0.0.6"
|
package/src/app.js
CHANGED
|
@@ -10,6 +10,7 @@ const axios = require('axios').default;
|
|
|
10
10
|
const Bottleneck = require('bottleneck');
|
|
11
11
|
const moment = require('moment-timezone');
|
|
12
12
|
const merge = require('deepmerge');
|
|
13
|
+
const hashObject = require('object-hash');
|
|
13
14
|
const srcset = require('srcset');
|
|
14
15
|
|
|
15
16
|
const settings = {
|
|
@@ -1024,7 +1025,6 @@ function setProxy(instance, options, url) {
|
|
|
1024
1025
|
|
|
1025
1026
|
if (options.proxy
|
|
1026
1027
|
&& options.proxy.enable !== false
|
|
1027
|
-
&& options.proxy.use !== false // use is a local override for enable
|
|
1028
1028
|
&& (options.proxy.use
|
|
1029
1029
|
|| options.proxy.hostnames?.includes(hostname))
|
|
1030
1030
|
) {
|
|
@@ -1035,14 +1035,18 @@ function setProxy(instance, options, url) {
|
|
|
1035
1035
|
},
|
|
1036
1036
|
});
|
|
1037
1037
|
|
|
1038
|
-
instance
|
|
1039
|
-
|
|
1038
|
+
if (instance) {
|
|
1039
|
+
instance.defaults.httpAgent = proxyAgent;
|
|
1040
|
+
instance.defaults.httpsAgent = proxyAgent;
|
|
1041
|
+
}
|
|
1040
1042
|
|
|
1041
1043
|
return true;
|
|
1042
1044
|
}
|
|
1043
1045
|
|
|
1044
|
-
instance
|
|
1045
|
-
|
|
1046
|
+
if (instance) {
|
|
1047
|
+
instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
|
|
1048
|
+
instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
|
|
1049
|
+
}
|
|
1046
1050
|
|
|
1047
1051
|
return false;
|
|
1048
1052
|
}
|
|
@@ -1050,9 +1054,11 @@ function setProxy(instance, options, url) {
|
|
|
1050
1054
|
const clients = new Map();
|
|
1051
1055
|
|
|
1052
1056
|
/* eslint-enable no-param-reassign */
|
|
1053
|
-
async function getBrowserInstance(scope, options) {
|
|
1054
|
-
|
|
1055
|
-
|
|
1057
|
+
async function getBrowserInstance(scope, options, useProxy = false) {
|
|
1058
|
+
const scopeKey = `${scope}_${useProxy ? 'proxy' : 'direct'}_${options.browser ? hashObject(options.browser) : 'default'}_${options.context ? hashObject(options.context) : 'default'}`;
|
|
1059
|
+
|
|
1060
|
+
if (clients.has(scopeKey)) {
|
|
1061
|
+
const client = clients.get(scopeKey);
|
|
1056
1062
|
|
|
1057
1063
|
await client.launchers;
|
|
1058
1064
|
|
|
@@ -1068,13 +1074,18 @@ async function getBrowserInstance(scope, options) {
|
|
|
1068
1074
|
const contextLauncher = browserLauncher.then((browser) => browser.newContext({
|
|
1069
1075
|
userAgent: 'unprint',
|
|
1070
1076
|
...options.context,
|
|
1077
|
+
...(useProxy && {
|
|
1078
|
+
proxy: {
|
|
1079
|
+
server: `${options.proxy.host}:${options.proxy.port}`,
|
|
1080
|
+
},
|
|
1081
|
+
}),
|
|
1071
1082
|
}));
|
|
1072
1083
|
|
|
1073
1084
|
const launchers = Promise.all([browserLauncher, contextLauncher]);
|
|
1074
1085
|
const client = { launchers };
|
|
1075
1086
|
|
|
1076
1087
|
if (scope) {
|
|
1077
|
-
clients.set(
|
|
1088
|
+
clients.set(scopeKey, client);
|
|
1078
1089
|
}
|
|
1079
1090
|
|
|
1080
1091
|
client.browser = await browserLauncher;
|
|
@@ -1134,18 +1145,22 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1134
1145
|
}, globalOptions, customOptions]);
|
|
1135
1146
|
|
|
1136
1147
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1148
|
+
const useProxy = setProxy(null, options, url);
|
|
1137
1149
|
|
|
1138
1150
|
const feedbackBase = {
|
|
1139
1151
|
url,
|
|
1140
1152
|
method: 'get',
|
|
1141
1153
|
interval,
|
|
1142
1154
|
concurrency,
|
|
1143
|
-
isProxied:
|
|
1155
|
+
isProxied: useProxy,
|
|
1156
|
+
isBrowser: true,
|
|
1144
1157
|
options,
|
|
1145
1158
|
};
|
|
1146
1159
|
|
|
1160
|
+
events.emit('requestInit', feedbackBase);
|
|
1161
|
+
|
|
1147
1162
|
return limiter.schedule(async () => {
|
|
1148
|
-
const { context, browser } = await getBrowserInstance(options.scope, options);
|
|
1163
|
+
const { context, browser } = await getBrowserInstance(options.scope, options, useProxy);
|
|
1149
1164
|
const page = await context.newPage();
|
|
1150
1165
|
|
|
1151
1166
|
const res = await page.goto(url, {
|
|
@@ -1182,7 +1197,19 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1182
1197
|
let control = null;
|
|
1183
1198
|
|
|
1184
1199
|
if (customOptions.control) {
|
|
1185
|
-
|
|
1200
|
+
try {
|
|
1201
|
+
control = await customOptions.control(page, { context, browser });
|
|
1202
|
+
} catch (error) {
|
|
1203
|
+
return {
|
|
1204
|
+
ok: false,
|
|
1205
|
+
controlError: error.message,
|
|
1206
|
+
status,
|
|
1207
|
+
statusText,
|
|
1208
|
+
headers,
|
|
1209
|
+
response: res,
|
|
1210
|
+
res,
|
|
1211
|
+
};
|
|
1212
|
+
}
|
|
1186
1213
|
}
|
|
1187
1214
|
|
|
1188
1215
|
events.emit('controlSuccess', feedbackBase);
|
|
@@ -1196,6 +1223,12 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1196
1223
|
await browser.close();
|
|
1197
1224
|
}
|
|
1198
1225
|
|
|
1226
|
+
events.emit('requestSuccess', {
|
|
1227
|
+
...feedbackBase,
|
|
1228
|
+
status,
|
|
1229
|
+
statusText,
|
|
1230
|
+
});
|
|
1231
|
+
|
|
1199
1232
|
return curateResponse({
|
|
1200
1233
|
data,
|
|
1201
1234
|
status,
|
|
@@ -1236,6 +1269,7 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1236
1269
|
interval,
|
|
1237
1270
|
concurrency,
|
|
1238
1271
|
isProxied,
|
|
1272
|
+
isBrowser: false,
|
|
1239
1273
|
options,
|
|
1240
1274
|
};
|
|
1241
1275
|
|
package/tests/browser.js
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
const unprint = require('../src/app');
|
|
4
4
|
|
|
5
|
+
unprint.options({ // or unprint.options();
|
|
6
|
+
proxy: {
|
|
7
|
+
enable: true,
|
|
8
|
+
use: false, // don't use for all requests by default
|
|
9
|
+
host: '192.168.1.25',
|
|
10
|
+
port: 8888,
|
|
11
|
+
hostnames: [
|
|
12
|
+
'tools-httpstatus.pickup-services.com',
|
|
13
|
+
],
|
|
14
|
+
},
|
|
15
|
+
});
|
|
16
|
+
|
|
5
17
|
async function initTest() {
|
|
6
18
|
// concurrency
|
|
7
19
|
await Promise.all([
|
|
@@ -41,17 +53,18 @@ async function initTest() {
|
|
|
41
53
|
headless: false,
|
|
42
54
|
},
|
|
43
55
|
async control(_page) {
|
|
44
|
-
//
|
|
56
|
+
// return new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
|
|
45
57
|
},
|
|
46
58
|
});
|
|
47
59
|
}),
|
|
48
60
|
]);
|
|
49
61
|
|
|
50
62
|
const res = await unprint.browser('https://www.scrapingcourse.com/', {
|
|
51
|
-
|
|
52
|
-
|
|
63
|
+
browser: {
|
|
64
|
+
headless: false,
|
|
65
|
+
},
|
|
53
66
|
async control(_page) {
|
|
54
|
-
|
|
67
|
+
// await new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
|
|
55
68
|
},
|
|
56
69
|
});
|
|
57
70
|
|