unprint 0.18.0 → 0.18.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -211,9 +211,10 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
211
211
  Options
212
212
  * `select`: Pre-query and initialize a specific element on the page.
213
213
  * `selectAll`: Pre-query and initialize multiple specific elements on the page.
214
+ * `interface`: Use undici `fetch` (browser-like, default) or `request` (raw)
214
215
 
215
216
  Use Playwright with Chromium (experimental)
216
- * `unprint.browserRequest(url, [options])`
217
+ * `unprint.browserRequest(url, [options])` or `unprint.browser(url, [options])`
217
218
  * `unprint.closeAllBrowsers()`: Close reused browser instances.
218
219
 
219
220
  Additional options
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.18.0",
3
+ "version": "0.18.2",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
package/src/app.js CHANGED
@@ -1056,6 +1056,14 @@ function getCookie(options) {
1056
1056
  return headerCookieData;
1057
1057
  }
1058
1058
 
1059
+ function filterHeaders(headers, options) {
1060
+ if (headers && options.defaultHeaders !== false) {
1061
+ return Object.fromEntries(Object.entries(headers).filter(([_key, value]) => value !== null));
1062
+ }
1063
+
1064
+ return headers;
1065
+ }
1066
+
1059
1067
  function curateResponse(res, data, options, { url, control, customOptions }) {
1060
1068
  const base = {
1061
1069
  ok: true,
@@ -1238,11 +1246,11 @@ async function browserRequest(url, customOptions = {}) {
1238
1246
  const headers = route.request().headers();
1239
1247
 
1240
1248
  route.continue({
1241
- headers: {
1249
+ headers: filterHeaders({
1242
1250
  ...headers,
1243
1251
  ...options.headers,
1244
1252
  cookie: getCookie(options),
1245
- },
1253
+ }, options),
1246
1254
  });
1247
1255
  });
1248
1256
 
@@ -1369,6 +1377,7 @@ function curateRequestBody(body) {
1369
1377
 
1370
1378
  async function request(url, body, customOptions = {}, method = 'GET') {
1371
1379
  const options = merge.all([{
1380
+ interface: 'fetch', // fetch or request
1372
1381
  timeout: 10000,
1373
1382
  extract: true,
1374
1383
  url,
@@ -1393,15 +1402,17 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1393
1402
  const curatedBody = curateRequestBody(body);
1394
1403
  const curatedCookie = getCookie(options);
1395
1404
 
1396
- const res = await limiter.schedule(async () => undici.fetch(url, {
1405
+ const headers = filterHeaders({
1406
+ ...curatedBody.headers,
1407
+ ...options.headers,
1408
+ cookie: curatedCookie,
1409
+ }, options);
1410
+
1411
+ const res = await limiter.schedule(async () => undici[options.interface](url, {
1397
1412
  dispatcher: agent,
1398
1413
  method,
1399
1414
  body: curatedBody.body,
1400
- headers: {
1401
- ...curatedBody.headers,
1402
- ...options.headers,
1403
- cookie: curatedCookie,
1404
- },
1415
+ headers,
1405
1416
  signal: options.abortSignal,
1406
1417
  })).catch((error) => ({ // tends to happen when proxy can't reach host
1407
1418
  status: 500,
@@ -1409,20 +1420,24 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1409
1420
  async text() { return error.cause?.cause?.message || 'Request aborted'; },
1410
1421
  }));
1411
1422
 
1412
- if (!(res.status >= 200 && res.status < 300)) {
1413
- const data = await res.text();
1423
+ const data = options.interface === 'fetch'
1424
+ ? await res.text()
1425
+ : await res.body.text();
1426
+
1427
+ const status = res.statusCode || res.status;
1414
1428
 
1415
- handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
1429
+ if (!(status >= 200 && status < 300)) {
1430
+ handleError(new Error(`HTTP response from ${url} not OK (${status} ${res.statusText}): ${data}`), 'HTTP_NOT_OK');
1416
1431
 
1417
1432
  events.emit('requestError', {
1418
1433
  ...feedbackBase,
1419
- status: res.status,
1434
+ status,
1420
1435
  statusText: res.statusText,
1421
1436
  });
1422
1437
 
1423
1438
  return {
1424
1439
  ok: false,
1425
- status: res.status,
1440
+ status,
1426
1441
  statusText: res.statusText,
1427
1442
  headers: res.headers,
1428
1443
  response: res,
@@ -1432,12 +1447,10 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1432
1447
 
1433
1448
  events.emit('requestSuccess', {
1434
1449
  ...feedbackBase,
1435
- status: res.status,
1450
+ status,
1436
1451
  statusText: res.statusText,
1437
1452
  });
1438
1453
 
1439
- const data = await res.text();
1440
-
1441
1454
  return curateResponse(res, data, options, { url, customOptions });
1442
1455
  }
1443
1456
 
package/tests/init.js CHANGED
@@ -30,23 +30,31 @@ async function initTest() {
30
30
  unprint.on('requestSuccess', (successData) => console.log('success', successData));
31
31
  // unprint.on('query', (queryData) => console.log('query', queryData));
32
32
 
33
- const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
33
+ const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body', interface: 'request' });
34
34
 
35
35
  const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
36
36
  const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
37
37
  const cookiesRes = await unprint.get(`http://127.0.0.1:${port}/json`, {
38
38
  headers: {
39
39
  cookie: 'foo=bar',
40
+ 'User-Agent': null,
40
41
  },
41
42
  cookies: {
42
43
  hello: 'world',
43
44
  },
44
45
  });
45
46
 
47
+ const proxyRes = await unprint.get('https://api.ipify.org?format=json', {
48
+ interface: 'request',
49
+ useProxy: true,
50
+ });
51
+
46
52
  console.log('JSON RES', jsonRes);
47
53
  console.log('ERROR RES', errorRes);
48
54
  console.log('COOKIES RES', cookiesRes);
49
55
 
56
+ console.log('PROXY RES', proxyRes.data);
57
+
50
58
  console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
51
59
  console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));
52
60
  console.log('date xpath', res.context.query.date('//div[contains(text(), "Today:")]', 'MMM DD, YYYY'));