unprint 0.17.8 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc CHANGED
@@ -2,7 +2,7 @@
2
2
  "extends": "airbnb-base",
3
3
  "parserOptions": {
4
4
  "sourceType": "script",
5
- "ecmaVersion": 2020
5
+ "ecmaVersion": "latest"
6
6
  },
7
7
  "rules": {
8
8
  "strict": 0,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.17.8",
3
+ "version": "0.18.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
@@ -20,8 +20,8 @@
20
20
  },
21
21
  "homepage": "https://github.com/ThePendulum/unprint#readme",
22
22
  "dependencies": {
23
- "axios": "^0.27.2",
24
23
  "bottleneck": "^2.19.5",
24
+ "cookie": "^1.1.1",
25
25
  "deepmerge": "^4.2.2",
26
26
  "eslint": "^8.17.0",
27
27
  "eslint-config-airbnb": "^19.0.4",
@@ -31,7 +31,8 @@
31
31
  "object-hash": "^3.0.0",
32
32
  "patchright": "^1.56.1",
33
33
  "srcset": "^4.0.0",
34
- "tunnel": "^0.0.6"
34
+ "tunnel": "^0.0.6",
35
+ "undici": "^7.18.2"
35
36
  },
36
37
  "devDependencies": {
37
38
  "@playwright/test": "^1.56.1",
package/src/app.js CHANGED
@@ -3,10 +3,9 @@
3
3
  const { JSDOM, VirtualConsole } = require('jsdom');
4
4
  const { chromium } = require('patchright');
5
5
  const EventEmitter = require('events');
6
- const http = require('http');
7
- const https = require('https');
8
- const tunnel = require('tunnel');
9
- const axios = require('axios').default;
6
+ const undici = require('undici');
7
+ const qs = require('node:querystring');
8
+ const cookie = require('cookie');
10
9
  const Bottleneck = require('bottleneck');
11
10
  const moment = require('moment-timezone');
12
11
  const merge = require('deepmerge');
@@ -1034,38 +1033,88 @@ function getLimiter(url, options) {
1034
1033
  };
1035
1034
  }
1036
1035
 
1037
- /* eslint-disable no-param-reassign */
1038
- function setProxy(instance, options, url) {
1039
- const { hostname } = new URL(url);
1036
+ function getCookie(options) {
1037
+ const headerCookieData = options.headers?.cookie || options.headers?.Cookie || null;
1038
+ const headerCookies = headerCookieData && cookie.parseCookie(headerCookieData);
1040
1039
 
1041
- if (options.proxy
1042
- && options.proxy.enable !== false
1043
- && (options.proxy.use
1044
- || options.proxy.hostnames?.includes(hostname))
1045
- ) {
1046
- const proxyAgent = tunnel.httpsOverHttp({
1047
- proxy: {
1048
- host: options.proxy.host,
1049
- port: options.proxy.port,
1050
- },
1040
+ if (typeof options.cookies === 'object') {
1041
+ return cookie.stringifyCookie({
1042
+ ...headerCookies,
1043
+ ...options.cookies,
1051
1044
  });
1045
+ }
1046
+
1047
+ if (typeof options.cookies === 'string') {
1048
+ const cookieData = cookie.parseCookie(options.cookies);
1049
+
1050
+ return cookie.stringifyCookie({
1051
+ ...headerCookies,
1052
+ ...cookieData,
1053
+ });
1054
+ }
1055
+
1056
+ return headerCookieData;
1057
+ }
1052
1058
 
1053
- if (instance) {
1054
- instance.defaults.httpAgent = proxyAgent;
1055
- instance.defaults.httpsAgent = proxyAgent;
1059
+ function curateResponse(res, data, options, { url, control, customOptions }) {
1060
+ const base = {
1061
+ ok: true,
1062
+ data,
1063
+ status: res.statusCode || res.status,
1064
+ statusText: res.statusText,
1065
+ headers: res.headers,
1066
+ response: res,
1067
+ res,
1068
+ control,
1069
+ };
1070
+
1071
+ if (['application/json', 'application/javascript'].some((type) => {
1072
+ if (typeof res.headers.get === 'function') {
1073
+ return res.headers.get('content-type')?.includes(type);
1056
1074
  }
1057
1075
 
1058
- return true;
1076
+ return res.headers['content-type']?.includes(type);
1077
+ })) {
1078
+ if (typeof data === 'object') {
1079
+ return {
1080
+ ...base,
1081
+ data,
1082
+ };
1083
+ }
1084
+
1085
+ try {
1086
+ return {
1087
+ ...base,
1088
+ data: JSON.parse(data),
1089
+ };
1090
+ } catch (error) {
1091
+ return {
1092
+ ...base,
1093
+ data,
1094
+ };
1095
+ }
1059
1096
  }
1060
1097
 
1061
- if (instance) {
1062
- instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
1063
- instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
1098
+ if (!options.extract) {
1099
+ return base;
1064
1100
  }
1065
1101
 
1066
- return false;
1102
+ const contextOptions = {
1103
+ ...customOptions,
1104
+ origin: url,
1105
+ };
1106
+
1107
+ const context = options.selectAll
1108
+ ? initAll(data, options.selectAll, contextOptions)
1109
+ : init(data, options.select, contextOptions);
1110
+
1111
+ return {
1112
+ ...base,
1113
+ context,
1114
+ };
1067
1115
  }
1068
1116
 
1117
+ /* eslint-disable no-param-reassign */
1069
1118
  const clients = new Map();
1070
1119
 
1071
1120
  /* eslint-enable no-param-reassign */
@@ -1127,46 +1176,36 @@ async function closeAllBrowsers() {
1127
1176
  await Promise.all(Array.from(clients.values()).map(async (client) => client.browser.close()));
1128
1177
  }
1129
1178
 
1130
- function curateResponse(res, options, { url, control, customOptions }) {
1131
- const base = {
1132
- ok: true,
1133
- status: res.status,
1134
- statusText: res.statusText,
1135
- headers: res.headers,
1136
- response: res,
1137
- res,
1138
- control,
1139
- };
1140
-
1141
- if (['application/json', 'application/javascript'].some((type) => res.headers['content-type']?.includes(type)) && typeof res.data === 'object') {
1142
- return {
1143
- ...base,
1144
- data: res.data,
1145
- };
1146
- }
1147
-
1148
- if (!options.extract) {
1149
- return base;
1179
+ async function closeBrowser(client, options) {
1180
+ if (options.client === null // this browser is single-use
1181
+ || (client.retired && client.active === 0)) { // this browser is retired to minimize garbage build-up
1182
+ // this browser won't be reused
1183
+ await client.browser.close();
1150
1184
  }
1185
+ }
1151
1186
 
1152
- const contextOptions = {
1153
- ...customOptions,
1154
- origin: url,
1155
- };
1187
+ function getAgent(options, url) {
1188
+ const { hostname } = new URL(url);
1156
1189
 
1157
- const context = options.selectAll
1158
- ? initAll(res.data, options.selectAll, contextOptions)
1159
- : init(res.data, options.select, contextOptions);
1190
+ if (options.proxy
1191
+ && options.proxy.enable !== false
1192
+ && (options.useProxy // defined locally
1193
+ || options.proxy.use // defined globally
1194
+ || options.proxy.hostnames?.includes(hostname))
1195
+ ) {
1196
+ return new undici.ProxyAgent(`http://${options.proxy.host}:${options.proxy.port}/`, {
1197
+ bodyTimeout: options.timeout,
1198
+ });
1199
+ }
1160
1200
 
1161
- return {
1162
- ...base,
1163
- context,
1164
- };
1201
+ return new undici.Agent({
1202
+ bodyTimeout: options.timeout,
1203
+ });
1165
1204
  }
1166
1205
 
1167
1206
  async function browserRequest(url, customOptions = {}) {
1168
1207
  const options = merge.all([{
1169
- timeout: 1000,
1208
+ timeout: 10000,
1170
1209
  extract: true,
1171
1210
  client: 'main',
1172
1211
  limiter: 'browser',
@@ -1174,14 +1213,14 @@ async function browserRequest(url, customOptions = {}) {
1174
1213
  }, globalOptions, customOptions]);
1175
1214
 
1176
1215
  const { limiter, interval, concurrency } = getLimiter(url, options);
1177
- const useProxy = setProxy(null, options, url);
1216
+ const agent = getAgent(options, url);
1178
1217
 
1179
1218
  const feedbackBase = {
1180
1219
  url,
1181
1220
  method: 'get',
1182
1221
  interval,
1183
1222
  concurrency,
1184
- isProxied: useProxy,
1223
+ isProxied: agent instanceof undici.ProxyAgent,
1185
1224
  isBrowser: true,
1186
1225
  options,
1187
1226
  };
@@ -1189,22 +1228,44 @@ async function browserRequest(url, customOptions = {}) {
1189
1228
  events.emit('requestInit', feedbackBase);
1190
1229
 
1191
1230
  return limiter.schedule(async () => {
1192
- const client = await getBrowserInstance(options.client, options, useProxy);
1231
+ const client = await getBrowserInstance(options.client, options, agent instanceof undici.ProxyAgent);
1193
1232
 
1194
1233
  client.active += 1;
1195
1234
 
1196
1235
  const page = await client.context.newPage();
1197
1236
 
1237
+ await page.route(url, async (route) => {
1238
+ const headers = route.request().headers();
1239
+
1240
+ route.continue({
1241
+ headers: {
1242
+ ...headers,
1243
+ ...options.headers,
1244
+ cookie: getCookie(options),
1245
+ },
1246
+ });
1247
+ });
1248
+
1198
1249
  const res = await page.goto(url, {
1199
1250
  ...options.page,
1200
- });
1251
+ }).catch((error) => error);
1252
+
1253
+ if (res instanceof Error) {
1254
+ return {
1255
+ ok: false,
1256
+ status: null,
1257
+ statusText: res.name,
1258
+ };
1259
+ }
1201
1260
 
1202
1261
  const status = res.status();
1203
1262
  const statusText = res.statusText();
1204
1263
  const headers = await res.allHeaders();
1205
1264
 
1206
1265
  if (!(status >= 200 && status < 300)) {
1207
- handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${res.data}`), 'HTTP_NOT_OK');
1266
+ const data = await page.content();
1267
+
1268
+ handleError(new Error(`HTTP response from ${url} not OK (${status} ${statusText}): ${data}`), 'HTTP_NOT_OK');
1208
1269
 
1209
1270
  events.emit('requestError', {
1210
1271
  ...feedbackBase,
@@ -1214,6 +1275,8 @@ async function browserRequest(url, customOptions = {}) {
1214
1275
 
1215
1276
  client.active -= 1;
1216
1277
 
1278
+ await closeBrowser(client, options);
1279
+
1217
1280
  return {
1218
1281
  ok: false,
1219
1282
  status,
@@ -1236,6 +1299,8 @@ async function browserRequest(url, customOptions = {}) {
1236
1299
  } catch (error) {
1237
1300
  client.active -= 1;
1238
1301
 
1302
+ await closeBrowser(client, options);
1303
+
1239
1304
  return {
1240
1305
  ok: false,
1241
1306
  controlError: error.message,
@@ -1262,18 +1327,13 @@ async function browserRequest(url, customOptions = {}) {
1262
1327
 
1263
1328
  client.active -= 1;
1264
1329
 
1265
- if (options.client === null // this browser is single-use
1266
- || (client.retired && client.active === 0)) { // this browser is retired to minimize garbage build-up
1267
- // this browser won't be reused
1268
- await client.browser.close();
1269
- }
1330
+ await closeBrowser(client, options);
1270
1331
 
1271
1332
  return curateResponse({
1272
- data,
1273
1333
  status,
1274
1334
  statusText,
1275
1335
  headers,
1276
- }, options, {
1336
+ }, data, options, {
1277
1337
  url,
1278
1338
  customOptions,
1279
1339
  control,
@@ -1281,46 +1341,78 @@ async function browserRequest(url, customOptions = {}) {
1281
1341
  });
1282
1342
  }
1283
1343
 
1344
+ function curateRequestBody(body) {
1345
+ if (!body) {
1346
+ return { body };
1347
+ }
1348
+
1349
+ if (body instanceof undici.FormData) {
1350
+ return {
1351
+ body: qs.stringify(body),
1352
+ headers: {
1353
+ 'content-type': 'application/x-www-form-urlencoded',
1354
+ },
1355
+ };
1356
+ }
1357
+
1358
+ if (typeof body === 'object') {
1359
+ return {
1360
+ body: JSON.stringify(body),
1361
+ headers: {
1362
+ 'content-type': 'application/json',
1363
+ },
1364
+ };
1365
+ }
1366
+
1367
+ return { body };
1368
+ }
1369
+
1284
1370
  async function request(url, body, customOptions = {}, method = 'GET') {
1285
1371
  const options = merge.all([{
1286
- timeout: 1000,
1372
+ timeout: 10000,
1287
1373
  extract: true,
1288
1374
  url,
1289
1375
  }, globalOptions, customOptions]);
1290
1376
 
1291
1377
  const { limiter, interval, concurrency } = getLimiter(url, options);
1292
1378
 
1293
- const instance = axios.create({
1294
- method,
1295
- validateStatus: null,
1296
- headers: options.headers,
1297
- timeout: options.timeout,
1298
- signal: options.abortSignal,
1299
- // ...options,
1300
- // httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
1301
- });
1302
-
1303
- const isProxied = setProxy(instance, options, url);
1379
+ const agent = getAgent(options, url);
1304
1380
 
1305
1381
  const feedbackBase = {
1306
1382
  url,
1307
1383
  method,
1308
1384
  interval,
1309
1385
  concurrency,
1310
- isProxied,
1386
+ isProxied: agent instanceof undici.ProxyAgent,
1311
1387
  isBrowser: false,
1312
1388
  options,
1313
1389
  };
1314
1390
 
1315
1391
  events.emit('requestInit', feedbackBase);
1316
1392
 
1317
- const res = await limiter.schedule(async () => instance.request({
1318
- url,
1319
- data: body,
1393
+ const curatedBody = curateRequestBody(body);
1394
+ const curatedCookie = getCookie(options);
1395
+
1396
+ const res = await limiter.schedule(async () => undici.fetch(url, {
1397
+ dispatcher: agent,
1398
+ method,
1399
+ body: curatedBody.body,
1400
+ headers: {
1401
+ ...curatedBody.headers,
1402
+ ...options.headers,
1403
+ cookie: curatedCookie,
1404
+ },
1405
+ signal: options.abortSignal,
1406
+ })).catch((error) => ({ // tends to happen when proxy can't reach host
1407
+ status: 500,
1408
+ statusText: 'Request aborted',
1409
+ async text() { return error.cause?.cause?.message || 'Request aborted'; },
1320
1410
  }));
1321
1411
 
1322
1412
  if (!(res.status >= 200 && res.status < 300)) {
1323
- handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${res.data}`), 'HTTP_NOT_OK');
1413
+ const data = await res.text();
1414
+
1415
+ handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
1324
1416
 
1325
1417
  events.emit('requestError', {
1326
1418
  ...feedbackBase,
@@ -1344,7 +1436,9 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1344
1436
  statusText: res.statusText,
1345
1437
  });
1346
1438
 
1347
- return curateResponse(res, options, { url, customOptions });
1439
+ const data = await res.text();
1440
+
1441
+ return curateResponse(res, data, options, { url, customOptions });
1348
1442
  }
1349
1443
 
1350
1444
  async function get(url, options) {
package/tests/browser.js CHANGED
@@ -16,6 +16,8 @@ unprint.options({ // or unprint.options();
16
16
 
17
17
  async function initTest() {
18
18
  // concurrency
19
+ /*
20
+ console.log('TEST CONCURRENCY');
19
21
  await Promise.all(Array.from({ length: 20 }).map(async () => {
20
22
  // await unprint.browser(`https://tools-httpstatus.pickup-services.com/${Math.random() < 0.2 ? '404' : '200'}?sleep=${Math.round(Math.random() * 500)}`, {
21
23
  await unprint.browser(`https://tools-httpstatus.pickup-services.com/200?sleep=${Math.round(Math.random() * 5000)}`, {
@@ -26,11 +28,24 @@ async function initTest() {
26
28
  },
27
29
  });
28
30
  }));
31
+ */
29
32
 
30
33
  // console.log('Requests done, waiting...');
31
34
 
32
35
  // await new Promise((resolve) => { setTimeout(() => resolve(), 60 * 60 * 1000); });
36
+ // timeout
37
+ console.log('TEST TIMEOUT');
38
+ await unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=30000', {
39
+ // client: null,
40
+ browser: {
41
+ headless: true,
42
+ },
43
+ page: {
44
+ timeout: 5000,
45
+ },
46
+ });
33
47
 
48
+ /*
34
49
  await Promise.all([
35
50
  unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=5000', {
36
51
  browser: {
@@ -56,6 +71,7 @@ async function initTest() {
56
71
  }),
57
72
  ]);
58
73
 
74
+ console.log('TEST SCRAPE');
59
75
  const res = await unprint.browser('https://www.scrapingcourse.com/', {
60
76
  browser: {
61
77
  headless: false,
@@ -69,6 +85,8 @@ async function initTest() {
69
85
 
70
86
  console.log('CARD TITLES', cards);
71
87
  console.log('CONTROL OUT', res.control);
88
+ */
89
+ console.log('CLOSING ALL BROWSERS');
72
90
 
73
91
  await unprint.closeAllBrowsers();
74
92
  }
package/tests/init.js CHANGED
@@ -19,7 +19,7 @@ async function initTest() {
19
19
  },
20
20
  },
21
21
  proxy: {
22
- host: '192.168.178.25',
22
+ host: '192.168.1.25',
23
23
  port: 8888,
24
24
  hostnames: ['127.0.0.2'],
25
25
  },
@@ -31,8 +31,21 @@ async function initTest() {
31
31
  // unprint.on('query', (queryData) => console.log('query', queryData));
32
32
 
33
33
  const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
34
- // const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
35
- // const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
34
+
35
+ const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
36
+ const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
37
+ const cookiesRes = await unprint.get(`http://127.0.0.1:${port}/json`, {
38
+ headers: {
39
+ cookie: 'foo=bar',
40
+ },
41
+ cookies: {
42
+ hello: 'world',
43
+ },
44
+ });
45
+
46
+ console.log('JSON RES', jsonRes);
47
+ console.log('ERROR RES', errorRes);
48
+ console.log('COOKIES RES', cookiesRes);
36
49
 
37
50
  console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
38
51
  console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));