unprint 0.17.9 → 0.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc CHANGED
@@ -2,7 +2,7 @@
2
2
  "extends": "airbnb-base",
3
3
  "parserOptions": {
4
4
  "sourceType": "script",
5
- "ecmaVersion": 2020
5
+ "ecmaVersion": "latest"
6
6
  },
7
7
  "rules": {
8
8
  "strict": 0,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.17.9",
3
+ "version": "0.18.1",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
@@ -20,8 +20,8 @@
20
20
  },
21
21
  "homepage": "https://github.com/ThePendulum/unprint#readme",
22
22
  "dependencies": {
23
- "axios": "^0.27.2",
24
23
  "bottleneck": "^2.19.5",
24
+ "cookie": "^1.1.1",
25
25
  "deepmerge": "^4.2.2",
26
26
  "eslint": "^8.17.0",
27
27
  "eslint-config-airbnb": "^19.0.4",
@@ -31,7 +31,8 @@
31
31
  "object-hash": "^3.0.0",
32
32
  "patchright": "^1.56.1",
33
33
  "srcset": "^4.0.0",
34
- "tunnel": "^0.0.6"
34
+ "tunnel": "^0.0.6",
35
+ "undici": "^7.18.2"
35
36
  },
36
37
  "devDependencies": {
37
38
  "@playwright/test": "^1.56.1",
package/src/app.js CHANGED
@@ -3,10 +3,9 @@
3
3
  const { JSDOM, VirtualConsole } = require('jsdom');
4
4
  const { chromium } = require('patchright');
5
5
  const EventEmitter = require('events');
6
- const http = require('http');
7
- const https = require('https');
8
- const tunnel = require('tunnel');
9
- const axios = require('axios').default;
6
+ const undici = require('undici');
7
+ const qs = require('node:querystring');
8
+ const cookie = require('cookie');
10
9
  const Bottleneck = require('bottleneck');
11
10
  const moment = require('moment-timezone');
12
11
  const merge = require('deepmerge');
@@ -1034,38 +1033,96 @@ function getLimiter(url, options) {
1034
1033
  };
1035
1034
  }
1036
1035
 
1037
- /* eslint-disable no-param-reassign */
1038
- function setProxy(instance, options, url) {
1039
- const { hostname } = new URL(url);
1036
+ function getCookie(options) {
1037
+ const headerCookieData = options.headers?.cookie || options.headers?.Cookie || null;
1038
+ const headerCookies = headerCookieData && cookie.parseCookie(headerCookieData);
1040
1039
 
1041
- if (options.proxy
1042
- && options.proxy.enable !== false
1043
- && (options.proxy.use
1044
- || options.proxy.hostnames?.includes(hostname))
1045
- ) {
1046
- const proxyAgent = tunnel.httpsOverHttp({
1047
- proxy: {
1048
- host: options.proxy.host,
1049
- port: options.proxy.port,
1050
- },
1040
+ if (typeof options.cookies === 'object') {
1041
+ return cookie.stringifyCookie({
1042
+ ...headerCookies,
1043
+ ...options.cookies,
1051
1044
  });
1045
+ }
1046
+
1047
+ if (typeof options.cookies === 'string') {
1048
+ const cookieData = cookie.parseCookie(options.cookies);
1049
+
1050
+ return cookie.stringifyCookie({
1051
+ ...headerCookies,
1052
+ ...cookieData,
1053
+ });
1054
+ }
1055
+
1056
+ return headerCookieData;
1057
+ }
1052
1058
 
1053
- if (instance) {
1054
- instance.defaults.httpAgent = proxyAgent;
1055
- instance.defaults.httpsAgent = proxyAgent;
1059
+ function filterHeaders(headers, options) {
1060
+ if (headers && options.defaultHeaders !== false) {
1061
+ return Object.fromEntries(Object.entries(headers).filter(([_key, value]) => value !== null));
1062
+ }
1063
+
1064
+ return headers;
1065
+ }
1066
+
1067
+ function curateResponse(res, data, options, { url, control, customOptions }) {
1068
+ const base = {
1069
+ ok: true,
1070
+ data,
1071
+ status: res.statusCode || res.status,
1072
+ statusText: res.statusText,
1073
+ headers: res.headers,
1074
+ response: res,
1075
+ res,
1076
+ control,
1077
+ };
1078
+
1079
+ if (['application/json', 'application/javascript'].some((type) => {
1080
+ if (typeof res.headers.get === 'function') {
1081
+ return res.headers.get('content-type')?.includes(type);
1082
+ }
1083
+
1084
+ return res.headers['content-type']?.includes(type);
1085
+ })) {
1086
+ if (typeof data === 'object') {
1087
+ return {
1088
+ ...base,
1089
+ data,
1090
+ };
1056
1091
  }
1057
1092
 
1058
- return true;
1093
+ try {
1094
+ return {
1095
+ ...base,
1096
+ data: JSON.parse(data),
1097
+ };
1098
+ } catch (error) {
1099
+ return {
1100
+ ...base,
1101
+ data,
1102
+ };
1103
+ }
1059
1104
  }
1060
1105
 
1061
- if (instance) {
1062
- instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
1063
- instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
1106
+ if (!options.extract) {
1107
+ return base;
1064
1108
  }
1065
1109
 
1066
- return false;
1110
+ const contextOptions = {
1111
+ ...customOptions,
1112
+ origin: url,
1113
+ };
1114
+
1115
+ const context = options.selectAll
1116
+ ? initAll(data, options.selectAll, contextOptions)
1117
+ : init(data, options.select, contextOptions);
1118
+
1119
+ return {
1120
+ ...base,
1121
+ context,
1122
+ };
1067
1123
  }
1068
1124
 
1125
+ /* eslint-disable no-param-reassign */
1069
1126
  const clients = new Map();
1070
1127
 
1071
1128
  /* eslint-enable no-param-reassign */
@@ -1127,43 +1184,6 @@ async function closeAllBrowsers() {
1127
1184
  await Promise.all(Array.from(clients.values()).map(async (client) => client.browser.close()));
1128
1185
  }
1129
1186
 
1130
- function curateResponse(res, options, { url, control, customOptions }) {
1131
- const base = {
1132
- ok: true,
1133
- status: res.status,
1134
- statusText: res.statusText,
1135
- headers: res.headers,
1136
- response: res,
1137
- res,
1138
- control,
1139
- };
1140
-
1141
- if (['application/json', 'application/javascript'].some((type) => res.headers['content-type']?.includes(type)) && typeof res.data === 'object') {
1142
- return {
1143
- ...base,
1144
- data: res.data,
1145
- };
1146
- }
1147
-
1148
- if (!options.extract) {
1149
- return base;
1150
- }
1151
-
1152
- const contextOptions = {
1153
- ...customOptions,
1154
- origin: url,
1155
- };
1156
-
1157
- const context = options.selectAll
1158
- ? initAll(res.data, options.selectAll, contextOptions)
1159
- : init(res.data, options.select, contextOptions);
1160
-
1161
- return {
1162
- ...base,
1163
- context,
1164
- };
1165
- }
1166
-
1167
1187
  async function closeBrowser(client, options) {
1168
1188
  if (options.client === null // this browser is single-use
1169
1189
  || (client.retired && client.active === 0)) { // this browser is retired to minimize garbage build-up
@@ -1172,9 +1192,28 @@ async function closeBrowser(client, options) {
1172
1192
  }
1173
1193
  }
1174
1194
 
1195
+ function getAgent(options, url) {
1196
+ const { hostname } = new URL(url);
1197
+
1198
+ if (options.proxy
1199
+ && options.proxy.enable !== false
1200
+ && (options.useProxy // defined locally
1201
+ || options.proxy.use // defined globally
1202
+ || options.proxy.hostnames?.includes(hostname))
1203
+ ) {
1204
+ return new undici.ProxyAgent(`http://${options.proxy.host}:${options.proxy.port}/`, {
1205
+ bodyTimeout: options.timeout,
1206
+ });
1207
+ }
1208
+
1209
+ return new undici.Agent({
1210
+ bodyTimeout: options.timeout,
1211
+ });
1212
+ }
1213
+
1175
1214
  async function browserRequest(url, customOptions = {}) {
1176
1215
  const options = merge.all([{
1177
- timeout: 1000,
1216
+ timeout: 10000,
1178
1217
  extract: true,
1179
1218
  client: 'main',
1180
1219
  limiter: 'browser',
@@ -1182,14 +1221,14 @@ async function browserRequest(url, customOptions = {}) {
1182
1221
  }, globalOptions, customOptions]);
1183
1222
 
1184
1223
  const { limiter, interval, concurrency } = getLimiter(url, options);
1185
- const useProxy = setProxy(null, options, url);
1224
+ const agent = getAgent(options, url);
1186
1225
 
1187
1226
  const feedbackBase = {
1188
1227
  url,
1189
1228
  method: 'get',
1190
1229
  interval,
1191
1230
  concurrency,
1192
- isProxied: useProxy,
1231
+ isProxied: agent instanceof undici.ProxyAgent,
1193
1232
  isBrowser: true,
1194
1233
  options,
1195
1234
  };
@@ -1197,12 +1236,24 @@ async function browserRequest(url, customOptions = {}) {
1197
1236
  events.emit('requestInit', feedbackBase);
1198
1237
 
1199
1238
  return limiter.schedule(async () => {
1200
- const client = await getBrowserInstance(options.client, options, useProxy);
1239
+ const client = await getBrowserInstance(options.client, options, agent instanceof undici.ProxyAgent);
1201
1240
 
1202
1241
  client.active += 1;
1203
1242
 
1204
1243
  const page = await client.context.newPage();
1205
1244
 
1245
+ await page.route(url, async (route) => {
1246
+ const headers = route.request().headers();
1247
+
1248
+ route.continue({
1249
+ headers: filterHeaders({
1250
+ ...headers,
1251
+ ...options.headers,
1252
+ cookie: getCookie(options),
1253
+ }, options),
1254
+ });
1255
+ });
1256
+
1206
1257
  const res = await page.goto(url, {
1207
1258
  ...options.page,
1208
1259
  }).catch((error) => error);
@@ -1220,7 +1271,9 @@ async function browserRequest(url, customOptions = {}) {
1220
1271
  const headers = await res.allHeaders();
1221
1272
 
1222
1273
  if (!(status >= 200 && status < 300)) {
1223
- handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${res.data}`), 'HTTP_NOT_OK');
1274
+ const data = await page.content();
1275
+
1276
+ handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${data}`), 'HTTP_NOT_OK');
1224
1277
 
1225
1278
  events.emit('requestError', {
1226
1279
  ...feedbackBase,
@@ -1285,11 +1338,10 @@ async function browserRequest(url, customOptions = {}) {
1285
1338
  await closeBrowser(client, options);
1286
1339
 
1287
1340
  return curateResponse({
1288
- data,
1289
1341
  status,
1290
1342
  statusText,
1291
1343
  headers,
1292
- }, options, {
1344
+ }, data, options, {
1293
1345
  url,
1294
1346
  customOptions,
1295
1347
  control,
@@ -1297,46 +1349,80 @@ async function browserRequest(url, customOptions = {}) {
1297
1349
  });
1298
1350
  }
1299
1351
 
1352
+ function curateRequestBody(body) {
1353
+ if (!body) {
1354
+ return { body };
1355
+ }
1356
+
1357
+ if (body instanceof undici.FormData) {
1358
+ return {
1359
+ body: qs.stringify(body),
1360
+ headers: {
1361
+ 'content-type': 'application/x-www-form-urlencoded',
1362
+ },
1363
+ };
1364
+ }
1365
+
1366
+ if (typeof body === 'object') {
1367
+ return {
1368
+ body: JSON.stringify(body),
1369
+ headers: {
1370
+ 'content-type': 'application/json',
1371
+ },
1372
+ };
1373
+ }
1374
+
1375
+ return { body };
1376
+ }
1377
+
1300
1378
  async function request(url, body, customOptions = {}, method = 'GET') {
1301
1379
  const options = merge.all([{
1302
- timeout: 1000,
1380
+ timeout: 10000,
1303
1381
  extract: true,
1304
1382
  url,
1305
1383
  }, globalOptions, customOptions]);
1306
1384
 
1307
1385
  const { limiter, interval, concurrency } = getLimiter(url, options);
1308
1386
 
1309
- const instance = axios.create({
1310
- method,
1311
- validateStatus: null,
1312
- headers: options.headers,
1313
- timeout: options.timeout,
1314
- signal: options.abortSignal,
1315
- // ...options,
1316
- // httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
1317
- });
1318
-
1319
- const isProxied = setProxy(instance, options, url);
1387
+ const agent = getAgent(options, url);
1320
1388
 
1321
1389
  const feedbackBase = {
1322
1390
  url,
1323
1391
  method,
1324
1392
  interval,
1325
1393
  concurrency,
1326
- isProxied,
1394
+ isProxied: agent instanceof undici.ProxyAgent,
1327
1395
  isBrowser: false,
1328
1396
  options,
1329
1397
  };
1330
1398
 
1331
1399
  events.emit('requestInit', feedbackBase);
1332
1400
 
1333
- const res = await limiter.schedule(async () => instance.request({
1334
- url,
1335
- data: body,
1401
+ const curatedBody = curateRequestBody(body);
1402
+ const curatedCookie = getCookie(options);
1403
+
1404
+ const headers = filterHeaders({
1405
+ ...curatedBody.headers,
1406
+ ...options.headers,
1407
+ cookie: curatedCookie,
1408
+ }, options);
1409
+
1410
+ const res = await limiter.schedule(async () => undici.fetch(url, {
1411
+ dispatcher: agent,
1412
+ method,
1413
+ body: curatedBody.body,
1414
+ headers,
1415
+ signal: options.abortSignal,
1416
+ })).catch((error) => ({ // tends to happen when proxy can't reach host
1417
+ status: 500,
1418
+ statusText: 'Request aborted',
1419
+ async text() { return error.cause?.cause?.message || 'Request aborted'; },
1336
1420
  }));
1337
1421
 
1338
1422
  if (!(res.status >= 200 && res.status < 300)) {
1339
- handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${res.data}`), 'HTTP_NOT_OK');
1423
+ const data = await res.text();
1424
+
1425
+ handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
1340
1426
 
1341
1427
  events.emit('requestError', {
1342
1428
  ...feedbackBase,
@@ -1360,7 +1446,9 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1360
1446
  statusText: res.statusText,
1361
1447
  });
1362
1448
 
1363
- return curateResponse(res, options, { url, customOptions });
1449
+ const data = await res.text();
1450
+
1451
+ return curateResponse(res, data, options, { url, customOptions });
1364
1452
  }
1365
1453
 
1366
1454
  async function get(url, options) {
package/tests/init.js CHANGED
@@ -19,7 +19,7 @@ async function initTest() {
19
19
  },
20
20
  },
21
21
  proxy: {
22
- host: '192.168.178.25',
22
+ host: '192.168.1.25',
23
23
  port: 8888,
24
24
  hostnames: ['127.0.0.2'],
25
25
  },
@@ -31,8 +31,22 @@ async function initTest() {
31
31
  // unprint.on('query', (queryData) => console.log('query', queryData));
32
32
 
33
33
  const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
34
- // const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
35
- // const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
34
+
35
+ const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
36
+ const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
37
+ const cookiesRes = await unprint.get(`http://127.0.0.1:${port}/json`, {
38
+ headers: {
39
+ cookie: 'foo=bar',
40
+ 'User-Agent': null,
41
+ },
42
+ cookies: {
43
+ hello: 'world',
44
+ },
45
+ });
46
+
47
+ console.log('JSON RES', jsonRes);
48
+ console.log('ERROR RES', errorRes);
49
+ console.log('COOKIES RES', cookiesRes);
36
50
 
37
51
  console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
38
52
  console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));