unprint 0.17.9 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc CHANGED
@@ -2,7 +2,7 @@
2
2
  "extends": "airbnb-base",
3
3
  "parserOptions": {
4
4
  "sourceType": "script",
5
- "ecmaVersion": 2020
5
+ "ecmaVersion": "latest"
6
6
  },
7
7
  "rules": {
8
8
  "strict": 0,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.17.9",
3
+ "version": "0.18.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
@@ -20,8 +20,8 @@
20
20
  },
21
21
  "homepage": "https://github.com/ThePendulum/unprint#readme",
22
22
  "dependencies": {
23
- "axios": "^0.27.2",
24
23
  "bottleneck": "^2.19.5",
24
+ "cookie": "^1.1.1",
25
25
  "deepmerge": "^4.2.2",
26
26
  "eslint": "^8.17.0",
27
27
  "eslint-config-airbnb": "^19.0.4",
@@ -31,7 +31,8 @@
31
31
  "object-hash": "^3.0.0",
32
32
  "patchright": "^1.56.1",
33
33
  "srcset": "^4.0.0",
34
- "tunnel": "^0.0.6"
34
+ "tunnel": "^0.0.6",
35
+ "undici": "^7.18.2"
35
36
  },
36
37
  "devDependencies": {
37
38
  "@playwright/test": "^1.56.1",
package/src/app.js CHANGED
@@ -3,10 +3,9 @@
3
3
  const { JSDOM, VirtualConsole } = require('jsdom');
4
4
  const { chromium } = require('patchright');
5
5
  const EventEmitter = require('events');
6
- const http = require('http');
7
- const https = require('https');
8
- const tunnel = require('tunnel');
9
- const axios = require('axios').default;
6
+ const undici = require('undici');
7
+ const qs = require('node:querystring');
8
+ const cookie = require('cookie');
10
9
  const Bottleneck = require('bottleneck');
11
10
  const moment = require('moment-timezone');
12
11
  const merge = require('deepmerge');
@@ -1034,38 +1033,88 @@ function getLimiter(url, options) {
1034
1033
  };
1035
1034
  }
1036
1035
 
1037
- /* eslint-disable no-param-reassign */
1038
- function setProxy(instance, options, url) {
1039
- const { hostname } = new URL(url);
1036
+ function getCookie(options) {
1037
+ const headerCookieData = options.headers?.cookie || options.headers?.Cookie || null;
1038
+ const headerCookies = headerCookieData && cookie.parseCookie(headerCookieData);
1040
1039
 
1041
- if (options.proxy
1042
- && options.proxy.enable !== false
1043
- && (options.proxy.use
1044
- || options.proxy.hostnames?.includes(hostname))
1045
- ) {
1046
- const proxyAgent = tunnel.httpsOverHttp({
1047
- proxy: {
1048
- host: options.proxy.host,
1049
- port: options.proxy.port,
1050
- },
1040
+ if (typeof options.cookies === 'object') {
1041
+ return cookie.stringifyCookie({
1042
+ ...headerCookies,
1043
+ ...options.cookies,
1051
1044
  });
1045
+ }
1046
+
1047
+ if (typeof options.cookies === 'string') {
1048
+ const cookieData = cookie.parseCookie(options.cookies);
1049
+
1050
+ return cookie.stringifyCookie({
1051
+ ...headerCookies,
1052
+ ...cookieData,
1053
+ });
1054
+ }
1055
+
1056
+ return headerCookieData;
1057
+ }
1052
1058
 
1053
- if (instance) {
1054
- instance.defaults.httpAgent = proxyAgent;
1055
- instance.defaults.httpsAgent = proxyAgent;
1059
+ function curateResponse(res, data, options, { url, control, customOptions }) {
1060
+ const base = {
1061
+ ok: true,
1062
+ data,
1063
+ status: res.statusCode || res.status,
1064
+ statusText: res.statusText,
1065
+ headers: res.headers,
1066
+ response: res,
1067
+ res,
1068
+ control,
1069
+ };
1070
+
1071
+ if (['application/json', 'application/javascript'].some((type) => {
1072
+ if (typeof res.headers.get === 'function') {
1073
+ return res.headers.get('content-type')?.includes(type);
1056
1074
  }
1057
1075
 
1058
- return true;
1076
+ return res.headers['content-type']?.includes(type);
1077
+ })) {
1078
+ if (typeof data === 'object') {
1079
+ return {
1080
+ ...base,
1081
+ data,
1082
+ };
1083
+ }
1084
+
1085
+ try {
1086
+ return {
1087
+ ...base,
1088
+ data: JSON.parse(data),
1089
+ };
1090
+ } catch (error) {
1091
+ return {
1092
+ ...base,
1093
+ data,
1094
+ };
1095
+ }
1059
1096
  }
1060
1097
 
1061
- if (instance) {
1062
- instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
1063
- instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
1098
+ if (!options.extract) {
1099
+ return base;
1064
1100
  }
1065
1101
 
1066
- return false;
1102
+ const contextOptions = {
1103
+ ...customOptions,
1104
+ origin: url,
1105
+ };
1106
+
1107
+ const context = options.selectAll
1108
+ ? initAll(data, options.selectAll, contextOptions)
1109
+ : init(data, options.select, contextOptions);
1110
+
1111
+ return {
1112
+ ...base,
1113
+ context,
1114
+ };
1067
1115
  }
1068
1116
 
1117
+ /* eslint-disable no-param-reassign */
1069
1118
  const clients = new Map();
1070
1119
 
1071
1120
  /* eslint-enable no-param-reassign */
@@ -1127,43 +1176,6 @@ async function closeAllBrowsers() {
1127
1176
  await Promise.all(Array.from(clients.values()).map(async (client) => client.browser.close()));
1128
1177
  }
1129
1178
 
1130
- function curateResponse(res, options, { url, control, customOptions }) {
1131
- const base = {
1132
- ok: true,
1133
- status: res.status,
1134
- statusText: res.statusText,
1135
- headers: res.headers,
1136
- response: res,
1137
- res,
1138
- control,
1139
- };
1140
-
1141
- if (['application/json', 'application/javascript'].some((type) => res.headers['content-type']?.includes(type)) && typeof res.data === 'object') {
1142
- return {
1143
- ...base,
1144
- data: res.data,
1145
- };
1146
- }
1147
-
1148
- if (!options.extract) {
1149
- return base;
1150
- }
1151
-
1152
- const contextOptions = {
1153
- ...customOptions,
1154
- origin: url,
1155
- };
1156
-
1157
- const context = options.selectAll
1158
- ? initAll(res.data, options.selectAll, contextOptions)
1159
- : init(res.data, options.select, contextOptions);
1160
-
1161
- return {
1162
- ...base,
1163
- context,
1164
- };
1165
- }
1166
-
1167
1179
  async function closeBrowser(client, options) {
1168
1180
  if (options.client === null // this browser is single-use
1169
1181
  || (client.retired && client.active === 0)) { // this browser is retired to minimize garbage build-up
@@ -1172,9 +1184,28 @@ async function closeBrowser(client, options) {
1172
1184
  }
1173
1185
  }
1174
1186
 
1187
+ function getAgent(options, url) {
1188
+ const { hostname } = new URL(url);
1189
+
1190
+ if (options.proxy
1191
+ && options.proxy.enable !== false
1192
+ && (options.useProxy // defined locally
1193
+ || options.proxy.use // defined globally
1194
+ || options.proxy.hostnames?.includes(hostname))
1195
+ ) {
1196
+ return new undici.ProxyAgent(`http://${options.proxy.host}:${options.proxy.port}/`, {
1197
+ bodyTimeout: options.timeout,
1198
+ });
1199
+ }
1200
+
1201
+ return new undici.Agent({
1202
+ bodyTimeout: options.timeout,
1203
+ });
1204
+ }
1205
+
1175
1206
  async function browserRequest(url, customOptions = {}) {
1176
1207
  const options = merge.all([{
1177
- timeout: 1000,
1208
+ timeout: 10000,
1178
1209
  extract: true,
1179
1210
  client: 'main',
1180
1211
  limiter: 'browser',
@@ -1182,14 +1213,14 @@ async function browserRequest(url, customOptions = {}) {
1182
1213
  }, globalOptions, customOptions]);
1183
1214
 
1184
1215
  const { limiter, interval, concurrency } = getLimiter(url, options);
1185
- const useProxy = setProxy(null, options, url);
1216
+ const agent = getAgent(options, url);
1186
1217
 
1187
1218
  const feedbackBase = {
1188
1219
  url,
1189
1220
  method: 'get',
1190
1221
  interval,
1191
1222
  concurrency,
1192
- isProxied: useProxy,
1223
+ isProxied: agent instanceof undici.ProxyAgent,
1193
1224
  isBrowser: true,
1194
1225
  options,
1195
1226
  };
@@ -1197,12 +1228,24 @@ async function browserRequest(url, customOptions = {}) {
1197
1228
  events.emit('requestInit', feedbackBase);
1198
1229
 
1199
1230
  return limiter.schedule(async () => {
1200
- const client = await getBrowserInstance(options.client, options, useProxy);
1231
+ const client = await getBrowserInstance(options.client, options, agent instanceof undici.ProxyAgent);
1201
1232
 
1202
1233
  client.active += 1;
1203
1234
 
1204
1235
  const page = await client.context.newPage();
1205
1236
 
1237
+ await page.route(url, async (route) => {
1238
+ const headers = route.request().headers();
1239
+
1240
+ route.continue({
1241
+ headers: {
1242
+ ...headers,
1243
+ ...options.headers,
1244
+ cookie: getCookie(options),
1245
+ },
1246
+ });
1247
+ });
1248
+
1206
1249
  const res = await page.goto(url, {
1207
1250
  ...options.page,
1208
1251
  }).catch((error) => error);
@@ -1220,7 +1263,9 @@ async function browserRequest(url, customOptions = {}) {
1220
1263
  const headers = await res.allHeaders();
1221
1264
 
1222
1265
  if (!(status >= 200 && status < 300)) {
1223
- handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${res.data}`), 'HTTP_NOT_OK');
1266
+ const data = await page.content();
1267
+
1268
+ handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${data}`), 'HTTP_NOT_OK');
1224
1269
 
1225
1270
  events.emit('requestError', {
1226
1271
  ...feedbackBase,
@@ -1285,11 +1330,10 @@ async function browserRequest(url, customOptions = {}) {
1285
1330
  await closeBrowser(client, options);
1286
1331
 
1287
1332
  return curateResponse({
1288
- data,
1289
1333
  status,
1290
1334
  statusText,
1291
1335
  headers,
1292
- }, options, {
1336
+ }, data, options, {
1293
1337
  url,
1294
1338
  customOptions,
1295
1339
  control,
@@ -1297,46 +1341,78 @@ async function browserRequest(url, customOptions = {}) {
1297
1341
  });
1298
1342
  }
1299
1343
 
1344
+ function curateRequestBody(body) {
1345
+ if (!body) {
1346
+ return { body };
1347
+ }
1348
+
1349
+ if (body instanceof undici.FormData) {
1350
+ return {
1351
+ body: qs.stringify(body),
1352
+ headers: {
1353
+ 'content-type': 'application/x-www-form-urlencoded',
1354
+ },
1355
+ };
1356
+ }
1357
+
1358
+ if (typeof body === 'object') {
1359
+ return {
1360
+ body: JSON.stringify(body),
1361
+ headers: {
1362
+ 'content-type': 'application/json',
1363
+ },
1364
+ };
1365
+ }
1366
+
1367
+ return { body };
1368
+ }
1369
+
1300
1370
  async function request(url, body, customOptions = {}, method = 'GET') {
1301
1371
  const options = merge.all([{
1302
- timeout: 1000,
1372
+ timeout: 10000,
1303
1373
  extract: true,
1304
1374
  url,
1305
1375
  }, globalOptions, customOptions]);
1306
1376
 
1307
1377
  const { limiter, interval, concurrency } = getLimiter(url, options);
1308
1378
 
1309
- const instance = axios.create({
1310
- method,
1311
- validateStatus: null,
1312
- headers: options.headers,
1313
- timeout: options.timeout,
1314
- signal: options.abortSignal,
1315
- // ...options,
1316
- // httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
1317
- });
1318
-
1319
- const isProxied = setProxy(instance, options, url);
1379
+ const agent = getAgent(options, url);
1320
1380
 
1321
1381
  const feedbackBase = {
1322
1382
  url,
1323
1383
  method,
1324
1384
  interval,
1325
1385
  concurrency,
1326
- isProxied,
1386
+ isProxied: agent instanceof undici.ProxyAgent,
1327
1387
  isBrowser: false,
1328
1388
  options,
1329
1389
  };
1330
1390
 
1331
1391
  events.emit('requestInit', feedbackBase);
1332
1392
 
1333
- const res = await limiter.schedule(async () => instance.request({
1334
- url,
1335
- data: body,
1393
+ const curatedBody = curateRequestBody(body);
1394
+ const curatedCookie = getCookie(options);
1395
+
1396
+ const res = await limiter.schedule(async () => undici.fetch(url, {
1397
+ dispatcher: agent,
1398
+ method,
1399
+ body: curatedBody.body,
1400
+ headers: {
1401
+ ...curatedBody.headers,
1402
+ ...options.headers,
1403
+ cookie: curatedCookie,
1404
+ },
1405
+ signal: options.abortSignal,
1406
+ })).catch((error) => ({ // tends to happen when proxy can't reach host
1407
+ status: 500,
1408
+ statusText: 'Request aborted',
1409
+ async text() { return error.cause?.cause?.message || 'Request aborted'; },
1336
1410
  }));
1337
1411
 
1338
1412
  if (!(res.status >= 200 && res.status < 300)) {
1339
- handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${res.data}`), 'HTTP_NOT_OK');
1413
+ const data = await res.text();
1414
+
1415
+ handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
1340
1416
 
1341
1417
  events.emit('requestError', {
1342
1418
  ...feedbackBase,
@@ -1360,7 +1436,9 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1360
1436
  statusText: res.statusText,
1361
1437
  });
1362
1438
 
1363
- return curateResponse(res, options, { url, customOptions });
1439
+ const data = await res.text();
1440
+
1441
+ return curateResponse(res, data, options, { url, customOptions });
1364
1442
  }
1365
1443
 
1366
1444
  async function get(url, options) {
package/tests/init.js CHANGED
@@ -19,7 +19,7 @@ async function initTest() {
19
19
  },
20
20
  },
21
21
  proxy: {
22
- host: '192.168.178.25',
22
+ host: '192.168.1.25',
23
23
  port: 8888,
24
24
  hostnames: ['127.0.0.2'],
25
25
  },
@@ -31,8 +31,21 @@ async function initTest() {
31
31
  // unprint.on('query', (queryData) => console.log('query', queryData));
32
32
 
33
33
  const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
34
- // const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
35
- // const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
34
+
35
+ const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
36
+ const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
37
+ const cookiesRes = await unprint.get(`http://127.0.0.1:${port}/json`, {
38
+ headers: {
39
+ cookie: 'foo=bar',
40
+ },
41
+ cookies: {
42
+ hello: 'world',
43
+ },
44
+ });
45
+
46
+ console.log('JSON RES', jsonRes);
47
+ console.log('ERROR RES', errorRes);
48
+ console.log('COOKIES RES', cookiesRes);
36
49
 
37
50
  console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
38
51
  console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));