unprint 0.18.1 → 0.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -211,9 +211,13 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
211
211
  Options
212
212
  * `select`: Pre-query and initialize a specific element on the page.
213
213
  * `selectAll`: Pre-query and initialize multiple specific elements on the page.
214
+ * `interface`: Which undici API to use for the request: `fetch` (browser-like, default) or `request` (raw).
215
+ * `userAgent`: The default `user-agent` header, used when no more specific user agent option is set.
216
+ * `browserUserAgent`: The default `user-agent` header for browser-like requests (`get` with interface `fetch`, and `browserRequest`).
217
+ * `apiUserAgent`: The default `user-agent` header for raw requests (`get` with interface `request`).
214
218
 
215
219
  Use Playwright with Chromium (experimental)
216
- * `unprint.browserRequest(url, [options])`
220
+ * `unprint.browserRequest(url, [options])` or `unprint.browser(url, [options])`
217
221
  * `unprint.closeAllBrowsers()`: Close reused browser instances.
218
222
 
219
223
  Additional options
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.18.1",
3
+ "version": "0.18.3",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
package/src/app.js CHANGED
@@ -16,6 +16,7 @@ const settings = {
16
16
  throwErrors: false,
17
17
  logErrors: true,
18
18
  requestTimeout: 30000,
19
+ userAgent: 'unprint',
19
20
  limits: {
20
21
  default: {
21
22
  interval: 10,
@@ -1056,9 +1057,11 @@ function getCookie(options) {
1056
1057
  return headerCookieData;
1057
1058
  }
1058
1059
 
1059
- function filterHeaders(headers, options) {
1060
+ function curateHeaders(headers, options) {
1060
1061
  if (headers && options.defaultHeaders !== false) {
1061
- return Object.fromEntries(Object.entries(headers).filter(([_key, value]) => value !== null));
1062
+ return Object.fromEntries(Object.entries(headers)
1063
+ .map(([key, value]) => [key.toLowerCase(), value])
1064
+ .filter(([_key, value]) => value !== null));
1062
1065
  }
1063
1066
 
1064
1067
  return headers;
@@ -1246,8 +1249,9 @@ async function browserRequest(url, customOptions = {}) {
1246
1249
  const headers = route.request().headers();
1247
1250
 
1248
1251
  route.continue({
1249
- headers: filterHeaders({
1252
+ headers: curateHeaders({
1250
1253
  ...headers,
1254
+ 'user-agent': options.browserUserAgent || options.userAgent,
1251
1255
  ...options.headers,
1252
1256
  cookie: getCookie(options),
1253
1257
  }, options),
@@ -1377,6 +1381,7 @@ function curateRequestBody(body) {
1377
1381
 
1378
1382
  async function request(url, body, customOptions = {}, method = 'GET') {
1379
1383
  const options = merge.all([{
1384
+ interface: 'fetch', // fetch or request
1380
1385
  timeout: 10000,
1381
1386
  extract: true,
1382
1387
  url,
@@ -1401,13 +1406,14 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1401
1406
  const curatedBody = curateRequestBody(body);
1402
1407
  const curatedCookie = getCookie(options);
1403
1408
 
1404
- const headers = filterHeaders({
1409
+ const headers = curateHeaders({
1405
1410
  ...curatedBody.headers,
1411
+ 'user-agent': (options.interface === 'fetch' ? options.browserUserAgent : options.apiUserAgent) || options.userAgent,
1406
1412
  ...options.headers,
1407
1413
  cookie: curatedCookie,
1408
1414
  }, options);
1409
1415
 
1410
- const res = await limiter.schedule(async () => undici.fetch(url, {
1416
+ const res = await limiter.schedule(async () => undici[options.interface](url, {
1411
1417
  dispatcher: agent,
1412
1418
  method,
1413
1419
  body: curatedBody.body,
@@ -1419,20 +1425,24 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1419
1425
  async text() { return error.cause?.cause?.message || 'Request aborted'; },
1420
1426
  }));
1421
1427
 
1422
- if (!(res.status >= 200 && res.status < 300)) {
1423
- const data = await res.text();
1428
+ const data = options.interface === 'fetch'
1429
+ ? await res.text()
1430
+ : await res.body.text();
1424
1431
 
1425
- handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
1432
+ const status = res.statusCode || res.status;
1433
+
1434
+ if (!(status >= 200 && status < 300)) {
1435
+ handleError(new Error(`HTTP response from ${url} not OK (${status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
1426
1436
 
1427
1437
  events.emit('requestError', {
1428
1438
  ...feedbackBase,
1429
- status: res.status,
1439
+ status,
1430
1440
  statusText: res.statusText,
1431
1441
  });
1432
1442
 
1433
1443
  return {
1434
1444
  ok: false,
1435
- status: res.status,
1445
+ status,
1436
1446
  statusText: res.statusText,
1437
1447
  headers: res.headers,
1438
1448
  response: res,
@@ -1442,12 +1452,10 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1442
1452
 
1443
1453
  events.emit('requestSuccess', {
1444
1454
  ...feedbackBase,
1445
- status: res.status,
1455
+ status,
1446
1456
  statusText: res.statusText,
1447
1457
  });
1448
1458
 
1449
- const data = await res.text();
1450
-
1451
1459
  return curateResponse(res, data, options, { url, customOptions });
1452
1460
  }
1453
1461
 
package/tests/init.js CHANGED
@@ -11,7 +11,9 @@ const port = process.env.PORT || 3101;
11
11
 
12
12
  async function initTest() {
13
13
  unprint.options({
14
- headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' },
14
+ userAgent: 'unprint',
15
+ apiUserAgent: 'unprint',
16
+ browserUserAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
15
17
  limits: {
16
18
  default: {
17
19
  concurrency: 1,
@@ -30,7 +32,7 @@ async function initTest() {
30
32
  unprint.on('requestSuccess', (successData) => console.log('success', successData));
31
33
  // unprint.on('query', (queryData) => console.log('query', queryData));
32
34
 
33
- const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
35
+ const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body', interface: 'request' });
34
36
 
35
37
  const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
36
38
  const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
@@ -44,9 +46,15 @@ async function initTest() {
44
46
  },
45
47
  });
46
48
 
49
+ const proxyRes = await unprint.get('https://api.ipify.org?format=json', {
50
+ interface: 'request',
51
+ useProxy: true,
52
+ });
53
+
47
54
  console.log('JSON RES', jsonRes);
48
55
  console.log('ERROR RES', errorRes);
49
56
  console.log('COOKIES RES', cookiesRes);
57
+ console.log('PROXY RES', proxyRes.data);
50
58
 
51
59
  console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
52
60
  console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));