unprint 0.19.0 → 0.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -223,9 +223,11 @@ Extracts the CSS `url()` background from a style attribute. Alias for `query.sty
223
223
  ### HTTP request
224
224
  * `unprint.get(url, [options])`
225
225
  * `unprint.post(url, body, [options])`
226
- * `unprint.request(url, body, [options], [method])`
226
+ * `unprint.request(url, [options])`
227
227
 
228
228
  Options
229
+ * `method`: `get` (default), `post`, `browser` (same as `useBrowser`)
230
+ * `body`: POST body, ignored for `get` and `browser` requests
229
231
  * `select`: Pre-query and initialize a specific element on the page.
230
232
  * `selectAll`: Pre-query and initialize multiple specific elements on the page.
231
233
  * `interface`: Use undici `fetch` (browser-like, default) or `request` (raw)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.19.0",
3
+ "version": "0.19.2",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "repository": {
package/src/app.js CHANGED
@@ -22,7 +22,7 @@ const settings = {
22
22
  requestTimeout: 30000,
23
23
  userAgent: 'unprint',
24
24
  remote: {
25
- enabled: false,
25
+ enable: false,
26
26
  address: 'http://127.0.0.1:3333/browser',
27
27
  methods: ['browser'],
28
28
  },
@@ -46,9 +46,10 @@ let globalOptions = {
46
46
  };
47
47
 
48
48
  const events = new EventEmitter();
49
+ const arrayMerge = (_destinationArray, sourceArray) => sourceArray;
49
50
 
50
51
  function configure(newOptions) {
51
- globalOptions = merge(globalOptions, newOptions);
52
+ globalOptions = merge(globalOptions, newOptions, { arrayMerge });
52
53
  }
53
54
 
54
55
  function handleError(error, code) {
@@ -1338,7 +1339,7 @@ function getAgent(options, url) {
1338
1339
  });
1339
1340
  }
1340
1341
 
1341
- async function remoteRequest(url, method, options) {
1342
+ async function remoteRequest(url, method, options, feedbackBase) {
1342
1343
  const control = typeof options.control === 'function' ? options.control.toString() : null;
1343
1344
 
1344
1345
  const res = await undici.fetch(`${options.remote.address}/request`, {
@@ -1368,6 +1369,12 @@ async function remoteRequest(url, method, options) {
1368
1369
  const body = await res.text();
1369
1370
  const data = JSON.parse(body);
1370
1371
 
1372
+ events.emit('requestSuccess', {
1373
+ ...feedbackBase,
1374
+ status: data.status,
1375
+ statusText: data.statusText,
1376
+ });
1377
+
1371
1378
  return curateResponse({
1372
1379
  status: data.status,
1373
1380
  statusText: data.statusText,
@@ -1380,7 +1387,7 @@ async function remoteRequest(url, method, options) {
1380
1387
  }
1381
1388
 
1382
1389
  function useRemoteRequest(options, method) {
1383
- if (options.remote.enabled) {
1390
+ if (options.remote.enable) {
1384
1391
  if (options.useRemote) {
1385
1392
  return true;
1386
1393
  }
@@ -1402,12 +1409,9 @@ async function browserRequest(url, customOptions = {}) {
1402
1409
  url,
1403
1410
  }, globalOptions, customOptions]);
1404
1411
 
1405
- if (useRemoteRequest(options, 'browser')) {
1406
- return remoteRequest(url, 'browser', options);
1407
- }
1408
-
1409
1412
  const { limiter, interval, concurrency } = getLimiter(url, options);
1410
1413
  const agent = getAgent(options, url);
1414
+ const isRemote = useRemoteRequest(options, 'browser');
1411
1415
 
1412
1416
  const feedbackBase = {
1413
1417
  url,
@@ -1416,11 +1420,16 @@ async function browserRequest(url, customOptions = {}) {
1416
1420
  concurrency,
1417
1421
  isProxied: agent instanceof undici.ProxyAgent,
1418
1422
  isBrowser: true,
1423
+ isRemote,
1419
1424
  options,
1420
1425
  };
1421
1426
 
1422
1427
  events.emit('requestInit', feedbackBase);
1423
1428
 
1429
+ if (isRemote) {
1430
+ return remoteRequest(url, 'browser', options, feedbackBase);
1431
+ }
1432
+
1424
1433
  return limiter.schedule(async () => {
1425
1434
  const client = await getBrowserInstance(options.client, options, agent instanceof undici.ProxyAgent);
1426
1435
 
@@ -1576,8 +1585,10 @@ function curateRequestBody(body, options) {
1576
1585
  return { body };
1577
1586
  }
1578
1587
 
1579
- async function request(url, body, customOptions = {}, method = 'GET', redirects = 0) {
1588
+ async function request(url, customOptions = {}, redirects = 0) {
1580
1589
  const options = merge.all([{
1590
+ method: 'get',
1591
+ body: null,
1581
1592
  interface: 'fetch', // fetch or request
1582
1593
  timeout: 10000,
1583
1594
  extract: true,
@@ -1586,17 +1597,17 @@ async function request(url, body, customOptions = {}, method = 'GET', redirects
1586
1597
  url,
1587
1598
  }, globalOptions, customOptions]);
1588
1599
 
1589
- if (options.useBrowser) {
1590
- return browserRequest(url, options);
1591
- }
1600
+ const method = options.method.toUpperCase(); // uppercase required by undici
1601
+ const body = options.body;
1592
1602
 
1593
- if (useRemoteRequest(options, method)) {
1594
- return remoteRequest(url, method, options);
1603
+ if (options.useBrowser || options.method === 'browser') {
1604
+ return browserRequest(url, options);
1595
1605
  }
1596
1606
 
1597
1607
  const { limiter, interval, concurrency } = getLimiter(url, options);
1598
1608
 
1599
1609
  const agent = getAgent(options, url);
1610
+ const isRemote = useRemoteRequest(options, method);
1600
1611
 
1601
1612
  const feedbackBase = {
1602
1613
  url,
@@ -1605,11 +1616,16 @@ async function request(url, body, customOptions = {}, method = 'GET', redirects
1605
1616
  concurrency,
1606
1617
  isProxied: agent instanceof undici.ProxyAgent,
1607
1618
  isBrowser: false,
1619
+ isRemote,
1608
1620
  options,
1609
1621
  };
1610
1622
 
1611
1623
  events.emit('requestInit', feedbackBase);
1612
1624
 
1625
+ if (isRemote) {
1626
+ return remoteRequest(url, method, options, feedbackBase);
1627
+ }
1628
+
1613
1629
  const curatedBody = curateRequestBody(body, options);
1614
1630
  const curatedCookie = getCookie(options);
1615
1631
 
@@ -1681,11 +1697,18 @@ async function request(url, body, customOptions = {}, method = 'GET', redirects
1681
1697
  }
1682
1698
 
1683
1699
  async function get(url, options) {
1684
- return request(url, null, options, 'GET');
1700
+ return request(url, {
1701
+ ...options,
1702
+ method: 'GET',
1703
+ });
1685
1704
  }
1686
1705
 
1687
1706
  async function post(url, body, options) {
1688
- return request(url, body, options, 'POST');
1707
+ return request(url, {
1708
+ ...options,
1709
+ method: 'POST',
1710
+ body,
1711
+ });
1689
1712
  }
1690
1713
 
1691
1714
  function on(trigger, fn) {
package/src/server.js CHANGED
@@ -42,7 +42,7 @@ function curateOptions(options) {
42
42
  return {
43
43
  ...options,
44
44
  remote: {
45
- enabled: false,
45
+ enable: false,
46
46
  },
47
47
  useRemote: false,
48
48
  control: options.control
@@ -60,9 +60,11 @@ async function handleRequest(req, res, unprint, method) {
60
60
 
61
61
  const options = curateOptions(req.body.options);
62
62
 
63
- const unprintRes = req.body.method === 'post'
64
- ? await unprint.post(req.body.url, req.body.data, options)
65
- : await unprint[(method || req.body.method || 'get').toLowerCase()](req.body.url, options);
63
+ const unprintRes = await unprint.request(req.body.url, {
64
+ ...options,
65
+ method: req.body.method,
66
+ body: req.body.data,
67
+ });
66
68
 
67
69
  res.send({
68
70
  ok: unprintRes.ok,
@@ -124,7 +126,12 @@ async function initServer(address, unprint) {
124
126
  });
125
127
  });
126
128
 
127
- app.listen(port, host, () => {
129
+ app.listen(port, host, (error) => {
130
+ if (error) {
131
+ logger.error(`Failed to start server: ${error.message}`);
132
+ return;
133
+ }
134
+
128
135
  logger.info(`Started unprint server on ${host}:${port}`);
129
136
  });
130
137
  }
package/tests/init.js CHANGED
@@ -34,6 +34,7 @@ async function initTest() {
34
34
 
35
35
  const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body', interface: 'request' });
36
36
 
37
+ /*
37
38
  const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);
38
39
  const errorRes = await unprint.get(`http://127.0.0.1:${port}/error/404`);
39
40
  const cookiesRes = await unprint.get(`http://127.0.0.1:${port}/json`, {
@@ -58,6 +59,7 @@ async function initTest() {
58
59
  console.log('COOKIES RES', cookiesRes);
59
60
  console.log('PROXY RES', proxyRes.data);
60
61
  console.log('SET COOKIES RES', setCookiesRes.cookies);
62
+ */
61
63
 
62
64
  console.log('title', res.context.query.content('//*[contains(text(), "Test")]'));
63
65
  console.log('date', res.context.query.date('#date', 'DD-MM-YYYY HH:mm'));
@@ -137,7 +139,12 @@ async function initServer() {
137
139
  res.status(Number(req.params.code)).send();
138
140
  });
139
141
 
140
- const server = app.listen(port, async () => {
142
+ const server = app.listen(port, async (error) => {
143
+ if (error) {
144
+ console.error(error);
145
+ return;
146
+ }
147
+
141
148
  const { address } = server.address();
142
149
 
143
150
  console.log(`Test server listening on ${address}:${port}`);
package/tests/remote.js CHANGED
@@ -4,15 +4,18 @@ const unprint = require('../src/app');
4
4
 
5
5
  unprint.configure({
6
6
  remote: {
7
- enabled: true,
7
+ enable: true,
8
8
  address: 'http://127.0.0.1:3333',
9
9
  key: 'foobar',
10
+ methods: [],
10
11
  },
11
12
  });
12
13
 
13
14
  async function init() {
15
+ unprint.on('requestInit', (event) => console.log('INIT', event));
16
+
14
17
  const res = await unprint.browser('https://www.google.com', {
15
- useRemote: true,
18
+ useRemote: false,
16
19
  async control(page) {
17
20
  const form = await page.locator('form');
18
21