unprint 0.17.9 → 0.18.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc +1 -1
- package/package.json +4 -3
- package/src/app.js +175 -87
- package/tests/init.js +17 -3
package/.eslintrc
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.17.9",
|
|
3
|
+
"version": "0.18.1",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
6
|
"scripts": {},
|
|
@@ -20,8 +20,8 @@
|
|
|
20
20
|
},
|
|
21
21
|
"homepage": "https://github.com/ThePendulum/unprint#readme",
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"axios": "^0.27.2",
|
|
24
23
|
"bottleneck": "^2.19.5",
|
|
24
|
+
"cookie": "^1.1.1",
|
|
25
25
|
"deepmerge": "^4.2.2",
|
|
26
26
|
"eslint": "^8.17.0",
|
|
27
27
|
"eslint-config-airbnb": "^19.0.4",
|
|
@@ -31,7 +31,8 @@
|
|
|
31
31
|
"object-hash": "^3.0.0",
|
|
32
32
|
"patchright": "^1.56.1",
|
|
33
33
|
"srcset": "^4.0.0",
|
|
34
|
-
"tunnel": "^0.0.6"
|
|
34
|
+
"tunnel": "^0.0.6",
|
|
35
|
+
"undici": "^7.18.2"
|
|
35
36
|
},
|
|
36
37
|
"devDependencies": {
|
|
37
38
|
"@playwright/test": "^1.56.1",
|
package/src/app.js
CHANGED
|
@@ -3,10 +3,9 @@
|
|
|
3
3
|
const { JSDOM, VirtualConsole } = require('jsdom');
|
|
4
4
|
const { chromium } = require('patchright');
|
|
5
5
|
const EventEmitter = require('events');
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const axios = require('axios').default;
|
|
6
|
+
const undici = require('undici');
|
|
7
|
+
const qs = require('node:querystring');
|
|
8
|
+
const cookie = require('cookie');
|
|
10
9
|
const Bottleneck = require('bottleneck');
|
|
11
10
|
const moment = require('moment-timezone');
|
|
12
11
|
const merge = require('deepmerge');
|
|
@@ -1034,38 +1033,96 @@ function getLimiter(url, options) {
|
|
|
1034
1033
|
};
|
|
1035
1034
|
}
|
|
1036
1035
|
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
const
|
|
1036
|
+
/**
 * Build the value of the Cookie request header.
 *
 * Cookies found in `options.headers.cookie` (or `options.headers.Cookie`) are
 * merged with those given through `options.cookies`; on a name clash the entry
 * from `options.cookies` wins because it is spread last.
 *
 * @param {object} options - Request options.
 * @param {object} [options.headers] - Request headers, optionally holding a cookie string.
 * @param {object|string|null} [options.cookies] - Extra cookies, as a name/value map or a cookie string.
 * @returns {string|null} The serialized cookie string, or the header cookie
 *   (null when absent) if no usable `options.cookies` was supplied.
 */
function getCookie(options) {
	const headerCookieData = options.headers?.cookie || options.headers?.Cookie || null;
	const headerCookies = headerCookieData && cookie.parseCookie(headerCookieData);
	const extraCookies = options.cookies;

	// typeof null is 'object'; without the explicit null check `cookies: null`
	// would be serialized to an empty cookie string instead of falling back
	// to the header cookie below.
	if (extraCookies !== null && typeof extraCookies === 'object') {
		return cookie.stringifyCookie({
			...headerCookies,
			...extraCookies,
		});
	}

	if (typeof extraCookies === 'string') {
		return cookie.stringifyCookie({
			...headerCookies,
			...cookie.parseCookie(extraCookies),
		});
	}

	return headerCookieData;
}
|
|
1052
1058
|
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1059
|
+
/**
 * Strip unset entries from a headers object before it is sent.
 *
 * Unless `options.defaultHeaders` is exactly `false`, every header whose value
 * is `null` or `undefined` is removed and a new object is returned. A missing
 * `headers` value is returned unchanged.
 *
 * @param {object|null|undefined} headers - Header name/value map.
 * @param {object} options - Request options; only `defaultHeaders` is read.
 * @returns {object|null|undefined} The filtered copy, or `headers` itself when
 *   filtering is skipped.
 */
function filterHeaders(headers, options) {
	// NOTE(review): with defaultHeaders === false, null values are passed
	// through untouched — confirm this is intended.
	if (!headers || options?.defaultHeaders === false) {
		return headers;
	}

	// undefined is dropped alongside null; otherwise it would be stringified
	// to the literal header value "undefined" further down the line.
	const setEntries = Object.entries(headers)
		.filter(([_key, value]) => value !== null && value !== undefined);

	return Object.fromEntries(setEntries);
}
|
|
1066
|
+
|
|
1067
|
+
/**
 * Normalize a raw response into the object handed back to callers.
 *
 * - For an `application/json` or `application/javascript` content type the
 *   body is returned as data: as-is when it already is an object, otherwise
 *   parsed with JSON.parse, falling back to the raw body if parsing fails.
 * - Otherwise, when `options.extract` is falsy, the plain result is returned.
 * - Otherwise a query context is built from the body with `initAll`
 *   (when `options.selectAll` is set) or `init`.
 *
 * @param {object} res - Response; `headers` may expose `get()` or be a plain object.
 * @param {*} data - Response body.
 * @param {object} options - Merged request options (`extract`, `select`, `selectAll`).
 * @param {object} meta - `url`, `control` and the caller's `customOptions`.
 * @returns {object} Result with ok/data/status/statusText/headers/response/res/control
 *   and, when extracted, `context`.
 */
function curateResponse(res, data, options, { url, control, customOptions }) {
	const curated = {
		ok: true,
		data,
		status: res.statusCode || res.status,
		statusText: res.statusText,
		headers: res.headers,
		response: res,
		res,
		control,
	};

	// headers are either a Headers-like object with get() or a plain map
	const contentType = typeof res.headers.get === 'function'
		? res.headers.get('content-type')
		: res.headers['content-type'];

	const isJson = ['application/json', 'application/javascript']
		.some((type) => contentType?.includes(type));

	if (isJson) {
		let payload = data;

		if (typeof data !== 'object') {
			try {
				payload = JSON.parse(data);
			} catch (error) {
				// body is not valid JSON, hand back the raw body
				payload = data;
			}
		}

		return { ...curated, data: payload };
	}

	if (!options.extract) {
		return curated;
	}

	const contextOptions = { ...customOptions, origin: url };

	if (options.selectAll) {
		return {
			...curated,
			context: initAll(data, options.selectAll, contextOptions),
		};
	}

	return {
		...curated,
		context: init(data, options.select, contextOptions),
	};
}
|
|
1068
1124
|
|
|
1125
|
+
/* eslint-disable no-param-reassign */
|
|
1069
1126
|
const clients = new Map();
|
|
1070
1127
|
|
|
1071
1128
|
/* eslint-enable no-param-reassign */
|
|
@@ -1127,43 +1184,6 @@ async function closeAllBrowsers() {
|
|
|
1127
1184
|
await Promise.all(Array.from(clients.values()).map(async (client) => client.browser.close()));
|
|
1128
1185
|
}
|
|
1129
1186
|
|
|
1130
|
-
function curateResponse(res, options, { url, control, customOptions }) {
|
|
1131
|
-
const base = {
|
|
1132
|
-
ok: true,
|
|
1133
|
-
status: res.status,
|
|
1134
|
-
statusText: res.statusText,
|
|
1135
|
-
headers: res.headers,
|
|
1136
|
-
response: res,
|
|
1137
|
-
res,
|
|
1138
|
-
control,
|
|
1139
|
-
};
|
|
1140
|
-
|
|
1141
|
-
if (['application/json', 'application/javascript'].some((type) => res.headers['content-type']?.includes(type)) && typeof res.data === 'object') {
|
|
1142
|
-
return {
|
|
1143
|
-
...base,
|
|
1144
|
-
data: res.data,
|
|
1145
|
-
};
|
|
1146
|
-
}
|
|
1147
|
-
|
|
1148
|
-
if (!options.extract) {
|
|
1149
|
-
return base;
|
|
1150
|
-
}
|
|
1151
|
-
|
|
1152
|
-
const contextOptions = {
|
|
1153
|
-
...customOptions,
|
|
1154
|
-
origin: url,
|
|
1155
|
-
};
|
|
1156
|
-
|
|
1157
|
-
const context = options.selectAll
|
|
1158
|
-
? initAll(res.data, options.selectAll, contextOptions)
|
|
1159
|
-
: init(res.data, options.select, contextOptions);
|
|
1160
|
-
|
|
1161
|
-
return {
|
|
1162
|
-
...base,
|
|
1163
|
-
context,
|
|
1164
|
-
};
|
|
1165
|
-
}
|
|
1166
|
-
|
|
1167
1187
|
async function closeBrowser(client, options) {
|
|
1168
1188
|
if (options.client === null // this browser is single-use
|
|
1169
1189
|
|| (client.retired && client.active === 0)) { // this browser is retired to minimize garbage build-up
|
|
@@ -1172,9 +1192,28 @@ async function closeBrowser(client, options) {
|
|
|
1172
1192
|
}
|
|
1173
1193
|
}
|
|
1174
1194
|
|
|
1195
|
+
/**
 * Create the undici dispatcher for a request to `url`.
 *
 * A ProxyAgent is returned when a proxy is configured, not disabled through
 * `proxy.enable === false`, and selected by `options.useProxy`,
 * `options.proxy.use`, or the URL's hostname being listed in
 * `options.proxy.hostnames`. Otherwise a plain Agent is returned.
 *
 * @param {object} options - Merged request options (`proxy`, `useProxy`, `timeout`).
 * @param {string} url - Target URL; only its hostname is inspected.
 * @returns {object} An `undici.ProxyAgent` or `undici.Agent`.
 * @throws {TypeError} When `url` cannot be parsed by `new URL()`.
 */
function getAgent(options, url) {
	const { hostname } = new URL(url);
	const { proxy } = options;

	const isProxied = Boolean(proxy)
		&& proxy.enable !== false
		&& Boolean(options.useProxy // defined locally
			|| proxy.use // defined globally
			|| proxy.hostnames?.includes(hostname));

	if (isProxied) {
		// ProxyAgent takes a single options argument; a second argument is
		// ignored, which silently dropped bodyTimeout for proxied requests.
		return new undici.ProxyAgent({
			uri: `http://${proxy.host}:${proxy.port}/`,
			bodyTimeout: options.timeout,
		});
	}

	return new undici.Agent({
		bodyTimeout: options.timeout,
	});
}
|
|
1213
|
+
|
|
1175
1214
|
async function browserRequest(url, customOptions = {}) {
|
|
1176
1215
|
const options = merge.all([{
|
|
1177
|
-
timeout:
|
|
1216
|
+
timeout: 10000,
|
|
1178
1217
|
extract: true,
|
|
1179
1218
|
client: 'main',
|
|
1180
1219
|
limiter: 'browser',
|
|
@@ -1182,14 +1221,14 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1182
1221
|
}, globalOptions, customOptions]);
|
|
1183
1222
|
|
|
1184
1223
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1185
|
-
const
|
|
1224
|
+
const agent = getAgent(options, url);
|
|
1186
1225
|
|
|
1187
1226
|
const feedbackBase = {
|
|
1188
1227
|
url,
|
|
1189
1228
|
method: 'get',
|
|
1190
1229
|
interval,
|
|
1191
1230
|
concurrency,
|
|
1192
|
-
isProxied:
|
|
1231
|
+
isProxied: agent instanceof undici.ProxyAgent,
|
|
1193
1232
|
isBrowser: true,
|
|
1194
1233
|
options,
|
|
1195
1234
|
};
|
|
@@ -1197,12 +1236,24 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1197
1236
|
events.emit('requestInit', feedbackBase);
|
|
1198
1237
|
|
|
1199
1238
|
return limiter.schedule(async () => {
|
|
1200
|
-
const client = await getBrowserInstance(options.client, options,
|
|
1239
|
+
const client = await getBrowserInstance(options.client, options, agent instanceof undici.ProxyAgent);
|
|
1201
1240
|
|
|
1202
1241
|
client.active += 1;
|
|
1203
1242
|
|
|
1204
1243
|
const page = await client.context.newPage();
|
|
1205
1244
|
|
|
1245
|
+
await page.route(url, async (route) => {
|
|
1246
|
+
const headers = route.request().headers();
|
|
1247
|
+
|
|
1248
|
+
route.continue({
|
|
1249
|
+
headers: filterHeaders({
|
|
1250
|
+
...headers,
|
|
1251
|
+
...options.headers,
|
|
1252
|
+
cookie: getCookie(options),
|
|
1253
|
+
}, options),
|
|
1254
|
+
});
|
|
1255
|
+
});
|
|
1256
|
+
|
|
1206
1257
|
const res = await page.goto(url, {
|
|
1207
1258
|
...options.page,
|
|
1208
1259
|
}).catch((error) => error);
|
|
@@ -1220,7 +1271,9 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1220
1271
|
const headers = await res.allHeaders();
|
|
1221
1272
|
|
|
1222
1273
|
if (!(status >= 200 && status < 300)) {
|
|
1223
|
-
|
|
1274
|
+
const data = await page.content();
|
|
1275
|
+
|
|
1276
|
+
handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${data}`), 'HTTP_NOT_OK');
|
|
1224
1277
|
|
|
1225
1278
|
events.emit('requestError', {
|
|
1226
1279
|
...feedbackBase,
|
|
@@ -1285,11 +1338,10 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1285
1338
|
await closeBrowser(client, options);
|
|
1286
1339
|
|
|
1287
1340
|
return curateResponse({
|
|
1288
|
-
data,
|
|
1289
1341
|
status,
|
|
1290
1342
|
statusText,
|
|
1291
1343
|
headers,
|
|
1292
|
-
}, options, {
|
|
1344
|
+
}, data, options, {
|
|
1293
1345
|
url,
|
|
1294
1346
|
customOptions,
|
|
1295
1347
|
control,
|
|
@@ -1297,46 +1349,80 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1297
1349
|
});
|
|
1298
1350
|
}
|
|
1299
1351
|
|
|
1352
|
+
/**
 * Tell whether a body is a binary or pre-encoded type that can be handed to
 * fetch unchanged instead of being JSON-encoded.
 */
function isPassthroughBody(body) {
	return ArrayBuffer.isView(body) // Buffer, typed arrays, DataView
		|| body instanceof ArrayBuffer
		|| body instanceof URLSearchParams
		|| (typeof Blob !== 'undefined' && body instanceof Blob);
}

/**
 * Turn a caller-supplied request body into something fetch can send, together
 * with the content-type header that describes it.
 *
 * - Falsy bodies and non-object bodies (e.g. strings) are returned untouched.
 * - FormData is serialized as `application/x-www-form-urlencoded`.
 * - Buffers, typed arrays, ArrayBuffers, URLSearchParams and Blobs are passed
 *   through without a content-type override.
 * - Any other object is serialized as JSON.
 *
 * @param {*} body - Request body as given by the caller.
 * @returns {{ body: *, headers?: object }} The body to send and, when the body
 *   was encoded here, the matching `content-type` header.
 */
function curateRequestBody(body) {
	if (!body) {
		return { body };
	}

	// Node's global FormData is a different class than the one exported by the
	// undici package, so check both.
	const isFormData = body instanceof undici.FormData
		|| (typeof FormData !== 'undefined' && body instanceof FormData);

	if (isFormData) {
		return {
			// FormData keeps its entries internally, so querystring.stringify()
			// sees no enumerable fields and returns ''; iterate the entries
			// instead. Non-string (file) entries are coerced to strings here.
			body: new URLSearchParams(body).toString(),
			headers: {
				'content-type': 'application/x-www-form-urlencoded',
			},
		};
	}

	if (typeof body !== 'object' || isPassthroughBody(body)) {
		// JSON.stringify would mangle these (e.g. URLSearchParams becomes "{}")
		return { body };
	}

	return {
		body: JSON.stringify(body),
		headers: {
			'content-type': 'application/json',
		},
	};
}
|
|
1377
|
+
|
|
1300
1378
|
async function request(url, body, customOptions = {}, method = 'GET') {
|
|
1301
1379
|
const options = merge.all([{
|
|
1302
|
-
timeout:
|
|
1380
|
+
timeout: 10000,
|
|
1303
1381
|
extract: true,
|
|
1304
1382
|
url,
|
|
1305
1383
|
}, globalOptions, customOptions]);
|
|
1306
1384
|
|
|
1307
1385
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1308
1386
|
|
|
1309
|
-
const
|
|
1310
|
-
method,
|
|
1311
|
-
validateStatus: null,
|
|
1312
|
-
headers: options.headers,
|
|
1313
|
-
timeout: options.timeout,
|
|
1314
|
-
signal: options.abortSignal,
|
|
1315
|
-
// ...options,
|
|
1316
|
-
// httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
|
|
1317
|
-
});
|
|
1318
|
-
|
|
1319
|
-
const isProxied = setProxy(instance, options, url);
|
|
1387
|
+
const agent = getAgent(options, url);
|
|
1320
1388
|
|
|
1321
1389
|
const feedbackBase = {
|
|
1322
1390
|
url,
|
|
1323
1391
|
method,
|
|
1324
1392
|
interval,
|
|
1325
1393
|
concurrency,
|
|
1326
|
-
isProxied,
|
|
1394
|
+
isProxied: agent instanceof undici.ProxyAgent,
|
|
1327
1395
|
isBrowser: false,
|
|
1328
1396
|
options,
|
|
1329
1397
|
};
|
|
1330
1398
|
|
|
1331
1399
|
events.emit('requestInit', feedbackBase);
|
|
1332
1400
|
|
|
1333
|
-
const
|
|
1334
|
-
|
|
1335
|
-
|
|
1401
|
+
const curatedBody = curateRequestBody(body);
|
|
1402
|
+
const curatedCookie = getCookie(options);
|
|
1403
|
+
|
|
1404
|
+
const headers = filterHeaders({
|
|
1405
|
+
...curatedBody.headers,
|
|
1406
|
+
...options.headers,
|
|
1407
|
+
cookie: curatedCookie,
|
|
1408
|
+
}, options);
|
|
1409
|
+
|
|
1410
|
+
const res = await limiter.schedule(async () => undici.fetch(url, {
|
|
1411
|
+
dispatcher: agent,
|
|
1412
|
+
method,
|
|
1413
|
+
body: curatedBody.body,
|
|
1414
|
+
headers,
|
|
1415
|
+
signal: options.abortSignal,
|
|
1416
|
+
})).catch((error) => ({ // tends to happen when proxy can't reach host
|
|
1417
|
+
status: 500,
|
|
1418
|
+
statusText: 'Request aborted',
|
|
1419
|
+
async text() { return error.cause?.cause?.message || 'Request aborted'; },
|
|
1336
1420
|
}));
|
|
1337
1421
|
|
|
1338
1422
|
if (!(res.status >= 200 && res.status < 300)) {
|
|
1339
|
-
|
|
1423
|
+
const data = await res.text();
|
|
1424
|
+
|
|
1425
|
+
handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
|
|
1340
1426
|
|
|
1341
1427
|
events.emit('requestError', {
|
|
1342
1428
|
...feedbackBase,
|
|
@@ -1360,7 +1446,9 @@ async function request(url, body, customOptions = {}, method = 'GET') {
|
|
|
1360
1446
|
statusText: res.statusText,
|
|
1361
1447
|
});
|
|
1362
1448
|
|
|
1363
|
-
|
|
1449
|
+
const data = await res.text();
|
|
1450
|
+
|
|
1451
|
+
return curateResponse(res, data, options, { url, customOptions });
|
|
1364
1452
|
}
|
|
1365
1453
|
|
|
1366
1454
|
async function get(url, options) {
|
package/tests/init.js
CHANGED
|
@@ -19,7 +19,7 @@ async function initTest() {
|
|
|
19
19
|
},
|
|
20
20
|
},
|
|
21
21
|
proxy: {
|
|
22
|
-
host: '192.168.
|
|
22
|
+
host: '192.168.1.25',
|
|
23
23
|
port: 8888,
|
|
24
24
|
hostnames: ['127.0.0.2'],
|
|
25
25
|
},
|
|
@@ -31,8 +31,22 @@ async function initTest() {
|
|
|
31
31
|
// unprint.on('query', (queryData) => console.log('query', queryData));
|
|
32
32
|
|
|
33
33
|
const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
|
|
35
|
+
const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
|
|
36
|
+
const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
|
|
37
|
+
const cookiesRes = await unprint.get(`http://127.0.0.1:${port}/json`, {
|
|
38
|
+
headers: {
|
|
39
|
+
cookie: 'foo=bar',
|
|
40
|
+
'User-Agent': null,
|
|
41
|
+
},
|
|
42
|
+
cookies: {
|
|
43
|
+
hello: 'world',
|
|
44
|
+
},
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
console.log('JSON RES', jsonRes);
|
|
48
|
+
console.log('ERROR RES', errorRes);
|
|
49
|
+
console.log('COOKIES RES', cookiesRes);
|
|
36
50
|
|
|
37
51
|
console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
|
|
38
52
|
console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));
|