unprint 0.17.9 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc +1 -1
- package/package.json +4 -3
- package/src/app.js +165 -87
- package/tests/init.js +16 -3
package/.eslintrc
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.17.9",
|
|
3
|
+
"version": "0.18.0",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
6
|
"scripts": {},
|
|
@@ -20,8 +20,8 @@
|
|
|
20
20
|
},
|
|
21
21
|
"homepage": "https://github.com/ThePendulum/unprint#readme",
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"axios": "^0.27.2",
|
|
24
23
|
"bottleneck": "^2.19.5",
|
|
24
|
+
"cookie": "^1.1.1",
|
|
25
25
|
"deepmerge": "^4.2.2",
|
|
26
26
|
"eslint": "^8.17.0",
|
|
27
27
|
"eslint-config-airbnb": "^19.0.4",
|
|
@@ -31,7 +31,8 @@
|
|
|
31
31
|
"object-hash": "^3.0.0",
|
|
32
32
|
"patchright": "^1.56.1",
|
|
33
33
|
"srcset": "^4.0.0",
|
|
34
|
-
"tunnel": "^0.0.6"
|
|
34
|
+
"tunnel": "^0.0.6",
|
|
35
|
+
"undici": "^7.18.2"
|
|
35
36
|
},
|
|
36
37
|
"devDependencies": {
|
|
37
38
|
"@playwright/test": "^1.56.1",
|
package/src/app.js
CHANGED
|
@@ -3,10 +3,9 @@
|
|
|
3
3
|
const { JSDOM, VirtualConsole } = require('jsdom');
|
|
4
4
|
const { chromium } = require('patchright');
|
|
5
5
|
const EventEmitter = require('events');
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const axios = require('axios').default;
|
|
6
|
+
const undici = require('undici');
|
|
7
|
+
const qs = require('node:querystring');
|
|
8
|
+
const cookie = require('cookie');
|
|
10
9
|
const Bottleneck = require('bottleneck');
|
|
11
10
|
const moment = require('moment-timezone');
|
|
12
11
|
const merge = require('deepmerge');
|
|
@@ -1034,38 +1033,88 @@ function getLimiter(url, options) {
|
|
|
1034
1033
|
};
|
|
1035
1034
|
}
|
|
1036
1035
|
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
const
|
|
1036
|
+
function getCookie(options) {
|
|
1037
|
+
const headerCookieData = options.headers?.cookie || options.headers?.Cookie || null;
|
|
1038
|
+
const headerCookies = headerCookieData && cookie.parseCookie(headerCookieData);
|
|
1040
1039
|
|
|
1041
|
-
if (options.
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
) {
|
|
1046
|
-
const proxyAgent = tunnel.httpsOverHttp({
|
|
1047
|
-
proxy: {
|
|
1048
|
-
host: options.proxy.host,
|
|
1049
|
-
port: options.proxy.port,
|
|
1050
|
-
},
|
|
1040
|
+
if (typeof options.cookies === 'object') {
|
|
1041
|
+
return cookie.stringifyCookie({
|
|
1042
|
+
...headerCookies,
|
|
1043
|
+
...options.cookies,
|
|
1051
1044
|
});
|
|
1045
|
+
}
|
|
1046
|
+
|
|
1047
|
+
if (typeof options.cookies === 'string') {
|
|
1048
|
+
const cookieData = cookie.parseCookie(options.cookies);
|
|
1049
|
+
|
|
1050
|
+
return cookie.stringifyCookie({
|
|
1051
|
+
...headerCookies,
|
|
1052
|
+
...cookieData,
|
|
1053
|
+
});
|
|
1054
|
+
}
|
|
1055
|
+
|
|
1056
|
+
return headerCookieData;
|
|
1057
|
+
}
|
|
1052
1058
|
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1059
|
+
function curateResponse(res, data, options, { url, control, customOptions }) {
|
|
1060
|
+
const base = {
|
|
1061
|
+
ok: true,
|
|
1062
|
+
data,
|
|
1063
|
+
status: res.statusCode || res.status,
|
|
1064
|
+
statusText: res.statusText,
|
|
1065
|
+
headers: res.headers,
|
|
1066
|
+
response: res,
|
|
1067
|
+
res,
|
|
1068
|
+
control,
|
|
1069
|
+
};
|
|
1070
|
+
|
|
1071
|
+
if (['application/json', 'application/javascript'].some((type) => {
|
|
1072
|
+
if (typeof res.headers.get === 'function') {
|
|
1073
|
+
return res.headers.get('content-type')?.includes(type);
|
|
1056
1074
|
}
|
|
1057
1075
|
|
|
1058
|
-
return
|
|
1076
|
+
return res.headers['content-type']?.includes(type);
|
|
1077
|
+
})) {
|
|
1078
|
+
if (typeof data === 'object') {
|
|
1079
|
+
return {
|
|
1080
|
+
...base,
|
|
1081
|
+
data,
|
|
1082
|
+
};
|
|
1083
|
+
}
|
|
1084
|
+
|
|
1085
|
+
try {
|
|
1086
|
+
return {
|
|
1087
|
+
...base,
|
|
1088
|
+
data: JSON.parse(data),
|
|
1089
|
+
};
|
|
1090
|
+
} catch (error) {
|
|
1091
|
+
return {
|
|
1092
|
+
...base,
|
|
1093
|
+
data,
|
|
1094
|
+
};
|
|
1095
|
+
}
|
|
1059
1096
|
}
|
|
1060
1097
|
|
|
1061
|
-
if (
|
|
1062
|
-
|
|
1063
|
-
instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
|
|
1098
|
+
if (!options.extract) {
|
|
1099
|
+
return base;
|
|
1064
1100
|
}
|
|
1065
1101
|
|
|
1066
|
-
|
|
1102
|
+
const contextOptions = {
|
|
1103
|
+
...customOptions,
|
|
1104
|
+
origin: url,
|
|
1105
|
+
};
|
|
1106
|
+
|
|
1107
|
+
const context = options.selectAll
|
|
1108
|
+
? initAll(data, options.selectAll, contextOptions)
|
|
1109
|
+
: init(data, options.select, contextOptions);
|
|
1110
|
+
|
|
1111
|
+
return {
|
|
1112
|
+
...base,
|
|
1113
|
+
context,
|
|
1114
|
+
};
|
|
1067
1115
|
}
|
|
1068
1116
|
|
|
1117
|
+
/* eslint-disable no-param-reassign */
|
|
1069
1118
|
const clients = new Map();
|
|
1070
1119
|
|
|
1071
1120
|
/* eslint-enable no-param-reassign */
|
|
@@ -1127,43 +1176,6 @@ async function closeAllBrowsers() {
|
|
|
1127
1176
|
await Promise.all(Array.from(clients.values()).map(async (client) => client.browser.close()));
|
|
1128
1177
|
}
|
|
1129
1178
|
|
|
1130
|
-
function curateResponse(res, options, { url, control, customOptions }) {
|
|
1131
|
-
const base = {
|
|
1132
|
-
ok: true,
|
|
1133
|
-
status: res.status,
|
|
1134
|
-
statusText: res.statusText,
|
|
1135
|
-
headers: res.headers,
|
|
1136
|
-
response: res,
|
|
1137
|
-
res,
|
|
1138
|
-
control,
|
|
1139
|
-
};
|
|
1140
|
-
|
|
1141
|
-
if (['application/json', 'application/javascript'].some((type) => res.headers['content-type']?.includes(type)) && typeof res.data === 'object') {
|
|
1142
|
-
return {
|
|
1143
|
-
...base,
|
|
1144
|
-
data: res.data,
|
|
1145
|
-
};
|
|
1146
|
-
}
|
|
1147
|
-
|
|
1148
|
-
if (!options.extract) {
|
|
1149
|
-
return base;
|
|
1150
|
-
}
|
|
1151
|
-
|
|
1152
|
-
const contextOptions = {
|
|
1153
|
-
...customOptions,
|
|
1154
|
-
origin: url,
|
|
1155
|
-
};
|
|
1156
|
-
|
|
1157
|
-
const context = options.selectAll
|
|
1158
|
-
? initAll(res.data, options.selectAll, contextOptions)
|
|
1159
|
-
: init(res.data, options.select, contextOptions);
|
|
1160
|
-
|
|
1161
|
-
return {
|
|
1162
|
-
...base,
|
|
1163
|
-
context,
|
|
1164
|
-
};
|
|
1165
|
-
}
|
|
1166
|
-
|
|
1167
1179
|
async function closeBrowser(client, options) {
|
|
1168
1180
|
if (options.client === null // this browser is single-use
|
|
1169
1181
|
|| (client.retired && client.active === 0)) { // this browser is retired to minimize garbage build-up
|
|
@@ -1172,9 +1184,28 @@ async function closeBrowser(client, options) {
|
|
|
1172
1184
|
}
|
|
1173
1185
|
}
|
|
1174
1186
|
|
|
1187
|
+
function getAgent(options, url) {
|
|
1188
|
+
const { hostname } = new URL(url);
|
|
1189
|
+
|
|
1190
|
+
if (options.proxy
|
|
1191
|
+
&& options.proxy.enable !== false
|
|
1192
|
+
&& (options.useProxy // defined locally
|
|
1193
|
+
|| options.proxy.use // defined globally
|
|
1194
|
+
|| options.proxy.hostnames?.includes(hostname))
|
|
1195
|
+
) {
|
|
1196
|
+
return new undici.ProxyAgent(`http://${options.proxy.host}:${options.proxy.port}/`, {
|
|
1197
|
+
bodyTimeout: options.timeout,
|
|
1198
|
+
});
|
|
1199
|
+
}
|
|
1200
|
+
|
|
1201
|
+
return new undici.Agent({
|
|
1202
|
+
bodyTimeout: options.timeout,
|
|
1203
|
+
});
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1175
1206
|
async function browserRequest(url, customOptions = {}) {
|
|
1176
1207
|
const options = merge.all([{
|
|
1177
|
-
timeout:
|
|
1208
|
+
timeout: 10000,
|
|
1178
1209
|
extract: true,
|
|
1179
1210
|
client: 'main',
|
|
1180
1211
|
limiter: 'browser',
|
|
@@ -1182,14 +1213,14 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1182
1213
|
}, globalOptions, customOptions]);
|
|
1183
1214
|
|
|
1184
1215
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1185
|
-
const
|
|
1216
|
+
const agent = getAgent(options, url);
|
|
1186
1217
|
|
|
1187
1218
|
const feedbackBase = {
|
|
1188
1219
|
url,
|
|
1189
1220
|
method: 'get',
|
|
1190
1221
|
interval,
|
|
1191
1222
|
concurrency,
|
|
1192
|
-
isProxied:
|
|
1223
|
+
isProxied: agent instanceof undici.ProxyAgent,
|
|
1193
1224
|
isBrowser: true,
|
|
1194
1225
|
options,
|
|
1195
1226
|
};
|
|
@@ -1197,12 +1228,24 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1197
1228
|
events.emit('requestInit', feedbackBase);
|
|
1198
1229
|
|
|
1199
1230
|
return limiter.schedule(async () => {
|
|
1200
|
-
const client = await getBrowserInstance(options.client, options,
|
|
1231
|
+
const client = await getBrowserInstance(options.client, options, agent instanceof undici.ProxyAgent);
|
|
1201
1232
|
|
|
1202
1233
|
client.active += 1;
|
|
1203
1234
|
|
|
1204
1235
|
const page = await client.context.newPage();
|
|
1205
1236
|
|
|
1237
|
+
await page.route(url, async (route) => {
|
|
1238
|
+
const headers = route.request().headers();
|
|
1239
|
+
|
|
1240
|
+
route.continue({
|
|
1241
|
+
headers: {
|
|
1242
|
+
...headers,
|
|
1243
|
+
...options.headers,
|
|
1244
|
+
cookie: getCookie(options),
|
|
1245
|
+
},
|
|
1246
|
+
});
|
|
1247
|
+
});
|
|
1248
|
+
|
|
1206
1249
|
const res = await page.goto(url, {
|
|
1207
1250
|
...options.page,
|
|
1208
1251
|
}).catch((error) => error);
|
|
@@ -1220,7 +1263,9 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1220
1263
|
const headers = await res.allHeaders();
|
|
1221
1264
|
|
|
1222
1265
|
if (!(status >= 200 && status < 300)) {
|
|
1223
|
-
|
|
1266
|
+
const data = await page.content();
|
|
1267
|
+
|
|
1268
|
+
handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${data}`), 'HTTP_NOT_OK');
|
|
1224
1269
|
|
|
1225
1270
|
events.emit('requestError', {
|
|
1226
1271
|
...feedbackBase,
|
|
@@ -1285,11 +1330,10 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1285
1330
|
await closeBrowser(client, options);
|
|
1286
1331
|
|
|
1287
1332
|
return curateResponse({
|
|
1288
|
-
data,
|
|
1289
1333
|
status,
|
|
1290
1334
|
statusText,
|
|
1291
1335
|
headers,
|
|
1292
|
-
}, options, {
|
|
1336
|
+
}, data, options, {
|
|
1293
1337
|
url,
|
|
1294
1338
|
customOptions,
|
|
1295
1339
|
control,
|
|
@@ -1297,46 +1341,78 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1297
1341
|
});
|
|
1298
1342
|
}
|
|
1299
1343
|
|
|
1344
|
+
function curateRequestBody(body) {
|
|
1345
|
+
if (!body) {
|
|
1346
|
+
return { body };
|
|
1347
|
+
}
|
|
1348
|
+
|
|
1349
|
+
if (body instanceof undici.FormData) {
|
|
1350
|
+
return {
|
|
1351
|
+
body: qs.stringify(body),
|
|
1352
|
+
headers: {
|
|
1353
|
+
'content-type': 'application/x-www-form-urlencoded',
|
|
1354
|
+
},
|
|
1355
|
+
};
|
|
1356
|
+
}
|
|
1357
|
+
|
|
1358
|
+
if (typeof body === 'object') {
|
|
1359
|
+
return {
|
|
1360
|
+
body: JSON.stringify(body),
|
|
1361
|
+
headers: {
|
|
1362
|
+
'content-type': 'application/json',
|
|
1363
|
+
},
|
|
1364
|
+
};
|
|
1365
|
+
}
|
|
1366
|
+
|
|
1367
|
+
return { body };
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1300
1370
|
async function request(url, body, customOptions = {}, method = 'GET') {
|
|
1301
1371
|
const options = merge.all([{
|
|
1302
|
-
timeout:
|
|
1372
|
+
timeout: 10000,
|
|
1303
1373
|
extract: true,
|
|
1304
1374
|
url,
|
|
1305
1375
|
}, globalOptions, customOptions]);
|
|
1306
1376
|
|
|
1307
1377
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1308
1378
|
|
|
1309
|
-
const
|
|
1310
|
-
method,
|
|
1311
|
-
validateStatus: null,
|
|
1312
|
-
headers: options.headers,
|
|
1313
|
-
timeout: options.timeout,
|
|
1314
|
-
signal: options.abortSignal,
|
|
1315
|
-
// ...options,
|
|
1316
|
-
// httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
|
|
1317
|
-
});
|
|
1318
|
-
|
|
1319
|
-
const isProxied = setProxy(instance, options, url);
|
|
1379
|
+
const agent = getAgent(options, url);
|
|
1320
1380
|
|
|
1321
1381
|
const feedbackBase = {
|
|
1322
1382
|
url,
|
|
1323
1383
|
method,
|
|
1324
1384
|
interval,
|
|
1325
1385
|
concurrency,
|
|
1326
|
-
isProxied,
|
|
1386
|
+
isProxied: agent instanceof undici.ProxyAgent,
|
|
1327
1387
|
isBrowser: false,
|
|
1328
1388
|
options,
|
|
1329
1389
|
};
|
|
1330
1390
|
|
|
1331
1391
|
events.emit('requestInit', feedbackBase);
|
|
1332
1392
|
|
|
1333
|
-
const
|
|
1334
|
-
|
|
1335
|
-
|
|
1393
|
+
const curatedBody = curateRequestBody(body);
|
|
1394
|
+
const curatedCookie = getCookie(options);
|
|
1395
|
+
|
|
1396
|
+
const res = await limiter.schedule(async () => undici.fetch(url, {
|
|
1397
|
+
dispatcher: agent,
|
|
1398
|
+
method,
|
|
1399
|
+
body: curatedBody.body,
|
|
1400
|
+
headers: {
|
|
1401
|
+
...curatedBody.headers,
|
|
1402
|
+
...options.headers,
|
|
1403
|
+
cookie: curatedCookie,
|
|
1404
|
+
},
|
|
1405
|
+
signal: options.abortSignal,
|
|
1406
|
+
})).catch((error) => ({ // tends to happen when proxy can't reach host
|
|
1407
|
+
status: 500,
|
|
1408
|
+
statusText: 'Request aborted',
|
|
1409
|
+
async text() { return error.cause?.cause?.message || 'Request aborted'; },
|
|
1336
1410
|
}));
|
|
1337
1411
|
|
|
1338
1412
|
if (!(res.status >= 200 && res.status < 300)) {
|
|
1339
|
-
|
|
1413
|
+
const data = await res.text();
|
|
1414
|
+
|
|
1415
|
+
handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
|
|
1340
1416
|
|
|
1341
1417
|
events.emit('requestError', {
|
|
1342
1418
|
...feedbackBase,
|
|
@@ -1360,7 +1436,9 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1360
1436
|
statusText: res.statusText,
|
|
1361
1437
|
});
|
|
1362
1438
|
|
|
1363
|
-
|
|
1439
|
+
const data = await res.text();
|
|
1440
|
+
|
|
1441
|
+
return curateResponse(res, data, options, { url, customOptions });
|
|
1364
1442
|
}
|
|
1365
1443
|
|
|
1366
1444
|
async function get(url, options) {
|
package/tests/init.js
CHANGED
|
@@ -19,7 +19,7 @@ async function initTest() {
|
|
|
19
19
|
},
|
|
20
20
|
},
|
|
21
21
|
proxy: {
|
|
22
|
-
host: '192.168.
|
|
22
|
+
host: '192.168.1.25',
|
|
23
23
|
port: 8888,
|
|
24
24
|
hostnames: ['127.0.0.2'],
|
|
25
25
|
},
|
|
@@ -31,8 +31,21 @@ async function initTest() {
|
|
|
31
31
|
// unprint.on('query', (queryData) => console.log('query', queryData));
|
|
32
32
|
|
|
33
33
|
const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
|
|
35
|
+
const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
|
|
36
|
+
const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
|
|
37
|
+
const cookiesRes = await unprint.get(`http://127.0.0.1:${port}/json`, {
|
|
38
|
+
headers: {
|
|
39
|
+
cookie: 'foo=bar',
|
|
40
|
+
},
|
|
41
|
+
cookies: {
|
|
42
|
+
hello: 'world',
|
|
43
|
+
},
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
console.log('JSON RES', jsonRes);
|
|
47
|
+
console.log('ERROR RES', errorRes);
|
|
48
|
+
console.log('COOKIES RES', cookiesRes);
|
|
36
49
|
|
|
37
50
|
console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
|
|
38
51
|
console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));
|