unprint 0.18.34 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc CHANGED
@@ -15,6 +15,7 @@
15
15
  "template-curly-spacing": "off",
16
16
  "object-curly-newline": "off",
17
17
  "default-param-last": "off",
18
- "max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
18
+ "max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}],
19
+ "import/no-extraneous-dependencies": ["error", {"devDependencies": true}]
19
20
  }
20
21
  }
package/README.md CHANGED
@@ -272,17 +272,6 @@ Returns
272
272
  }
273
273
  ```
274
274
 
275
- ### Helpers
276
- * `initialize(source, [selector], [options])` (`init`): Initialize element or HTML as unprint context
277
- * `initializeAll(source, [selector], [options])` (`initAll`): Initialize element or HTML as multiple contexts
278
- * `extractDate(string, [format], [options])`: Parse date with moment and some curation
279
- * `extractDateAgo(string, [options])`: Extract relative date (e.g. 4 months ago)
280
- * `extractDuration(timestamp, [matchRegex])`: Parse duration (e.g. 04:11:05) to seconds
281
- * `extractTimestamp(string)`: Parse timestamp (e.g. 4H11M5S) to seconds
282
- * `extractNumber(string, [options])`: Parse string as number
283
- * `extractSourceSet(string, [options])`: Parse source set to object
284
- * `formatDate(date, format, inputFormat)`: Format date with moment
285
-
286
275
  ### Proxy
287
276
  ```javascript
288
277
  unprint.options({ // or unprint.options();
@@ -299,13 +288,61 @@ unprint.options({ // or unprint.options();
299
288
  });
300
289
 
301
290
  unprint.get({
302
- proxy: {
303
- use: true, // use proxy for this request
304
- // all other proxy options can be supplied here
291
+ useProxy: true, // use proxy for this request
292
+ });
293
+ ```
294
+
295
+ ### Request server
296
+ You can run unprint as a server to execute web requests remotely. This is mainly useful when you need expensive browser requests on clients with limited resources.
297
+ If you don't need unprint-specific features, you should probably set up a dedicated proxy server instead.
298
+
299
+ #### Server
300
+ * Ensure optional dependencies are installed
301
+ * `UNPRINT_KEY=[random] node src/app.js --server [port|address:port]`
302
+
303
+ #### Client
304
+ ```
305
+ unprint.options({
306
+ remote: {
307
+ enabled: true,
308
+ address: 'http://10.0.0.1:3333',
309
+ key: 'YOUR_UNPRINT_KEY',
310
+ methods: ['browser'], // browser, get, post
305
311
  },
306
312
  });
307
313
  ```
308
314
 
315
+ If the remote is enabled, you can utilize it for individual requests regardless of method configuration:
316
+ ```
317
+ unprint.get([url], {
318
+ useRemote: true,
319
+ })
320
+ ```
321
+
322
+ #### API
323
+ ```
324
+ POST /request {
325
+ url: 'https://example.com', // required
326
+ method: 'get', // get (default), post or browser
327
+ options: {}, // request options as passed to native method
328
+ }
329
+
330
+ POST /browser // same as /request, but with browser method implied
331
+
332
+ * `options.control` is passed as a function body string, i.e. `control: "return 'hello world';"`
333
+ ```
334
+
335
+ ### Helpers
336
+ * `initialize(source, [selector], [options])` (`init`): Initialize element or HTML as unprint context
337
+ * `initializeAll(source, [selector], [options])` (`initAll`): Initialize element or HTML as multiple contexts
338
+ * `extractDate(string, [format], [options])`: Parse date with moment and some curation
339
+ * `extractDateAgo(string, [options])`: Extract relative date (e.g. 4 months ago)
340
+ * `extractDuration(timestamp, [matchRegex])`: Parse duration (e.g. 04:11:05) to seconds
341
+ * `extractTimestamp(string)`: Parse timestamp (e.g. 4H11M5S) to seconds
342
+ * `extractNumber(string, [options])`: Parse string as number
343
+ * `extractSourceSet(string, [options])`: Parse source set to object
344
+ * `formatDate(date, format, inputFormat)`: Format date with moment
345
+
309
346
  ### Feedback events
310
347
  Usage:
311
348
  * `unprint.on('trigger', callbackFn)`
package/package.json CHANGED
@@ -1,9 +1,8 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.18.34",
3
+ "version": "0.19.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
- "scripts": {},
7
6
  "repository": {
8
7
  "type": "git",
9
8
  "url": "git+https://github.com/ThePendulum/unprint.git"
@@ -23,10 +22,12 @@
23
22
  "bottleneck": "^2.19.5",
24
23
  "cookie": "^1.1.1",
25
24
  "deepmerge": "^4.2.2",
25
+ "dotenv": "^17.3.1",
26
26
  "eslint": "^8.17.0",
27
27
  "eslint-config-airbnb": "^19.0.4",
28
28
  "eslint-config-airbnb-base": "^15.0.0",
29
29
  "jsdom": "^17.0.0",
30
+ "minimist": "^1.2.8",
30
31
  "moment-timezone": "^0.5.34",
31
32
  "object-hash": "^3.0.0",
32
33
  "patchright": "^1.56.1",
@@ -38,6 +39,9 @@
38
39
  "devDependencies": {
39
40
  "@playwright/test": "^1.56.1",
40
41
  "@types/node": "^24.10.0",
41
- "express": "^4.18.1"
42
+ "express": "^5.2.1"
43
+ },
44
+ "optionalDependencies": {
45
+ "express": "^5.2.1"
42
46
  }
43
47
  }
package/src/app.js CHANGED
@@ -12,12 +12,20 @@ const moment = require('moment-timezone');
12
12
  const merge = require('deepmerge');
13
13
  const hashObject = require('object-hash');
14
14
  const srcset = require('srcset');
15
+ const argv = require('minimist')(process.argv.slice(2));
16
+
17
+ const initServer = require('./server');
15
18
 
16
19
  const settings = {
17
20
  throwErrors: false,
18
21
  logErrors: true,
19
22
  requestTimeout: 30000,
20
23
  userAgent: 'unprint',
24
+ remote: {
25
+ enabled: false,
26
+ address: 'http://127.0.0.1:3333/browser',
27
+ methods: ['browser'],
28
+ },
21
29
  limits: {
22
30
  default: {
23
31
  interval: 10,
@@ -1149,7 +1157,7 @@ function curateCookies(res, options) {
1149
1157
 
1150
1158
  function curateResponse(res, data, options, { url, control, customOptions }) {
1151
1159
  const base = {
1152
- ok: true,
1160
+ ok: res.ok ?? true,
1153
1161
  data,
1154
1162
  body: data,
1155
1163
  status: res.statusCode || res.status,
@@ -1330,6 +1338,61 @@ function getAgent(options, url) {
1330
1338
  });
1331
1339
  }
1332
1340
 
1341
/**
 * Execute a request on a remote unprint server instead of locally.
 *
 * The request is posted as JSON to the server's `/request` endpoint and the
 * server's answer is turned back into a regular unprint response through
 * `curateResponse`.
 *
 * @param {string} url - URL the remote server should request.
 * @param {string} method - `get`, `post` or `browser`, in any letter case.
 * @param {object} options - Merged request options; `options.remote.address`
 *   and `options.remote.key` select and authenticate the server.
 * @returns {Promise<object>} The curated response, or a bare
 *   `{ ok: false, status, statusText }` object when the server itself did not
 *   answer with HTTP 200.
 */
async function remoteRequest(url, method, options) {
	// Functions do not survive JSON, so the control function travels as the source of its body.
	// NOTE(review): slicing between the first '{' and the last '}' assumes a block body and
	// brace-free parameters; expression-bodied arrows or destructured parameters are not handled.
	const controlSource = typeof options.control === 'function' ? options.control.toString() : null;
	const controlBody = controlSource && controlSource.slice(controlSource.indexOf('{') + 1, controlSource.lastIndexOf('}'));

	// fetch needs an absolute URL, so a bare host:port address gets a scheme.
	const rawAddress = String(options.remote.address);
	const address = /^[a-z][a-z\d+.-]*:\/\//i.test(rawAddress) ? rawAddress : `http://${rawAddress}`;

	// The default address in settings ends in /browser; tolerate an endpoint suffix
	// and trailing slashes so the request always lands on <server>/request.
	const endpoint = `${address.replace(/(?:\/(?:browser|request))?\/*$/, '')}/request`;

	const headers = {
		'content-type': 'application/json',
	};

	// Only send the key header when a key is actually configured.
	if (options.remote.key) {
		headers['unprint-key'] = options.remote.key;
	}

	const res = await undici.fetch(endpoint, {
		method: 'post',
		body: JSON.stringify({
			url,
			// The server compares the method in lowercase, while local callers may pass 'GET' or 'POST'.
			method: String(method).toLowerCase(),
			options: {
				...options,
				control: controlBody,
			},
		}),
		headers,
	});

	if (res.status !== 200) {
		return {
			ok: false,
			status: res.status,
			statusText: res.statusText,
		};
	}

	const body = await res.text();
	const data = JSON.parse(body);

	return curateResponse({
		// Forward the outcome reported by the server; when it is absent, curateResponse applies its own default.
		ok: data.ok,
		status: data.status,
		statusText: data.statusText,
		headers: data.headers,
	}, data.data, options, {
		url,
		customOptions: options,
		control: data.control,
	});
}
1381
+
1382
/**
 * Decide whether a request should be delegated to the remote unprint server.
 *
 * Nothing is delegated unless the remote is enabled. When it is enabled, a
 * request goes remote if it asks for it explicitly (`options.useRemote`) or if
 * its method is listed in `options.remote.methods`.
 *
 * @param {object} options - Merged request options.
 * @param {string} method - Request method; compared in lowercase.
 * @returns {boolean} True when the request should be sent to the remote server.
 */
function useRemoteRequest(options, method) {
	const { remote } = options;

	if (!remote.enabled) {
		return false;
	}

	return Boolean(options.useRemote) || remote.methods.includes(method.toLowerCase());
}
1395
+
1333
1396
  async function browserRequest(url, customOptions = {}) {
1334
1397
  const options = merge.all([{
1335
1398
  timeout: 10000,
@@ -1339,6 +1402,10 @@ async function browserRequest(url, customOptions = {}) {
1339
1402
  url,
1340
1403
  }, globalOptions, customOptions]);
1341
1404
 
1405
+ if (useRemoteRequest(options, 'browser')) {
1406
+ return remoteRequest(url, 'browser', options);
1407
+ }
1408
+
1342
1409
  const { limiter, interval, concurrency } = getLimiter(url, options);
1343
1410
  const agent = getAgent(options, url);
1344
1411
 
@@ -1523,6 +1590,10 @@ async function request(url, body, customOptions = {}, method = 'GET', redirects
1523
1590
  return browserRequest(url, options);
1524
1591
  }
1525
1592
 
1593
+ if (useRemoteRequest(options, method)) {
1594
+ return remoteRequest(url, method, options);
1595
+ }
1596
+
1526
1597
  const { limiter, interval, concurrency } = getLimiter(url, options);
1527
1598
 
1528
1599
  const agent = getAgent(options, url);
@@ -1625,7 +1696,7 @@ function off(trigger, fn) {
1625
1696
  events.off(trigger, fn);
1626
1697
  }
1627
1698
 
1628
- module.exports = {
1699
+ const unprint = {
1629
1700
  configure,
1630
1701
  on,
1631
1702
  off,
@@ -1655,3 +1726,9 @@ module.exports = {
1655
1726
  options: configure,
1656
1727
  query: initQueryFns(queryFns),
1657
1728
  };
1729
+
1730
// Start the request server only when this file is the process entry point
// (`node src/app.js --server [port|address:port]`). Without the require.main
// check, any application that merely requires unprint and happens to accept a
// --server argument of its own would unexpectedly start a server.
if (require.main === module && argv.server) {
	initServer(argv.server, unprint);
}

module.exports = unprint;
package/src/server.js ADDED
@@ -0,0 +1,132 @@
1
+ 'use strict';
2
+
3
+ const crypto = require('crypto');
4
+ const express = require('express');
5
+
6
+ require('dotenv').config();
7
+
8
+ const pkg = require('../package.json');
9
+
10
/**
 * Error that carries the HTTP status code it should be answered with.
 *
 * `friendlyMessage` and `data` are only set as properties when a truthy value
 * is supplied for them.
 */
class HttpError extends Error {
	/**
	 * @param {string} message - Error message.
	 * @param {number} httpCode - HTTP status code associated with the error.
	 * @param {string} [friendlyMessage] - Optional alternative message.
	 * @param {*} [data] - Optional additional payload.
	 */
	constructor(message, httpCode, friendlyMessage, data) {
		super(message);

		Object.assign(this, {
			name: 'HttpError',
			httpCode,
			...(friendlyMessage && { friendlyMessage }),
			...(data && { data }),
		});
	}
}
26
+
27
/**
 * Print a single log line to stdout, prefixed with an ISO timestamp and the
 * level truncated and left-padded to five characters.
 *
 * @param {string} level - Log level label, e.g. `info` or `error`.
 * @param {...*} data - Values to print, joined with single spaces.
 */
function log(level, ...data) {
	const timestamp = new Date().toISOString();
	const label = String(level).slice(0, 5).padStart(5, ' ');

	// Joining an Error only yields "Name: message"; print the stack instead so logged failures stay traceable.
	const message = data
		.map((item) => (item instanceof Error && item.stack ? item.stack : item))
		.join(' ');

	console.log(`${timestamp} [${label}] ${message}`);
}
32
+
33
// Level-specific shorthands for log(): logger.info(...), logger.debug(...),
// logger.error(...) and logger.warn(...).
const logger = {};

['info', 'debug', 'error', 'warn'].forEach((level) => {
	logger[level] = (...data) => log(level, ...data);
});
39
+
40
/**
 * Prepare client-supplied options for use by the server-side unprint instance.
 *
 * Remote delegation is always switched off, so the server cannot be configured
 * to send requests to itself. A `control` function body string is compiled
 * into an async function taking `page` and `client`.
 *
 * SECURITY: `control` is arbitrary code supplied by the client and is executed
 * in this process. Only expose the server to trusted, authenticated clients.
 *
 * @param {object} [options] - Options from the request body; may be missing.
 * @returns {object} A copy of the options that is safe to pass to unprint.
 */
function curateOptions(options) {
	// A request body without `options` must not crash the handler.
	const suppliedOptions = options ?? {};
	const AsyncFunction = (async () => {}).constructor;

	return {
		...suppliedOptions,
		// make sure remote unprint doesn't get configured to make request to itself
		remote: {
			enabled: false,
		},
		useRemote: false,
		control: suppliedOptions.control
			? new AsyncFunction('page', 'client', suppliedOptions.control) // eslint-disable-line no-new-func
			: null,
	};
}
53
+
54
/**
 * Execute the request described by the JSON body and send back the result.
 *
 * @param {object} req - Express request; `req.body.url` is required, while
 *   `req.body.method`, `req.body.data` (post body) and `req.body.options` are optional.
 * @param {object} res - Express response used to send the serialized result.
 * @param {object} unprint - unprint instance providing `get`, `post` and `browser`.
 * @param {string} [method] - Method implied by the route; takes precedence over
 *   the method in the request body.
 * @throws {HttpError} 400 when the URL is missing or the method is not supported.
 */
async function handleRequest(req, res, unprint, method) {
	if (!req.body?.url) {
		throw new HttpError('No URL provided', 400);
	}

	// Resolve the method once, case-insensitively, so logging and dispatch always agree.
	const requestMethod = String(method || req.body.method || 'get').toLowerCase();

	// Only dispatch to the documented request methods. The name comes from the client
	// and must not be able to reach other members of the unprint object.
	if (!['get', 'post', 'browser'].includes(requestMethod)) {
		throw new HttpError(`Unsupported method '${requestMethod}'`, 400);
	}

	logger.info(`${requestMethod} ${req.body.url}`);

	// `options` is optional in the request body.
	const options = curateOptions(req.body.options ?? {});

	const unprintRes = requestMethod === 'post'
		? await unprint.post(req.body.url, req.body.data, options)
		: await unprint[requestMethod](req.body.url, options);

	res.send({
		ok: unprintRes.ok,
		status: unprintRes.status,
		statusText: unprintRes.statusText,
		data: unprintRes.data || null,
		body: unprintRes.body || null,
		html: unprintRes.context?.html || null,
		headers: unprintRes.headers,
		cookies: unprintRes.cookies,
		control: unprintRes.control,
	});
}
79
+
80
/**
 * Start the unprint request server.
 *
 * Routes: `GET /` (version banner), `POST /request`, `POST /browser` and
 * `POST /options`. When the `UNPRINT_KEY` environment variable is set, every
 * request must carry the same value in its `unprint-key` header.
 *
 * @param {boolean|string|number} address - `true` (bare flag) listens on
 *   127.0.0.1:3000, a lone port listens on 127.0.0.1, and `host:port` sets both.
 * @param {object} unprint - unprint instance used to execute the requests.
 * @returns {Promise<void>}
 */
async function initServer(address, unprint) {
	const app = express();
	const addressComponents = typeof address === 'boolean' ? [] : String(address).split(':');

	const host = addressComponents[1] ? addressComponents[0] : '127.0.0.1';
	const port = addressComponents[1] || addressComponents[0] || 3000;

	if (!process.env.UNPRINT_KEY) {
		// Without a key, anyone who can reach the server can run requests and control code on it.
		logger.warn('UNPRINT_KEY is not set, requests are not authenticated');
	}

	app.use(express.json());

	app.use(async (req, _res, next) => {
		const expectedKey = process.env.UNPRINT_KEY;

		if (expectedKey) {
			const providedKey = req.headers['unprint-key'];

			// timingSafeEqual requires buffers of equal length. utf16le encodes every
			// string unit as two bytes, so equal string lengths give equal buffer lengths.
			const isValidKey = typeof providedKey === 'string'
				&& providedKey.length === expectedKey.length
				&& crypto.timingSafeEqual(Buffer.from(expectedKey, 'utf16le'), Buffer.from(providedKey, 'utf16le'));

			if (!isValidKey) {
				logger.warn(`Invalid key used by ${req.ip}`);
				throw new HttpError('Invalid key', 401);
			}
		}

		next();
	});

	app.get('/', (_req, res) => {
		res.send(`unprint ${pkg.version}`);
	});

	app.post('/request', async (req, res) => handleRequest(req, res, unprint));
	app.post('/browser', async (req, res) => handleRequest(req, res, unprint, 'browser'));

	app.post('/options', async (req, res) => {
		if (!req.body) {
			throw new HttpError('No options provided', 400);
		}

		unprint.options(curateOptions(req.body));

		res.status(204).send();
	});

	app.use((error, _req, res, _next) => {
		logger.error(error);

		// Errors raised by express.json() carry status/statusCode rather than httpCode;
		// honour those so a malformed body is answered with 400 instead of 500.
		const statusCode = error.httpCode || error.statusCode || error.status || 500;

		res.status(statusCode).send({
			statusCode,
			statusMessage: error.message,
		});
	});

	app.listen(port, host, (error) => {
		// Express 5 passes server startup errors (e.g. port in use) to this callback instead of throwing them.
		if (error) {
			logger.error(`Failed to start unprint server on ${host}:${port}: ${error.message}`);
			throw error;
		}

		logger.info(`Started unprint server on ${host}:${port}`);
	});
}
131
+
132
+ module.exports = initServer;
@@ -0,0 +1,34 @@
1
+ 'use strict';
2
+
3
+ const unprint = require('../src/app');
4
+
5
// Manual check of remote requests: expects an unprint request server
// listening on 127.0.0.1:3333 that accepts the key 'foobar'.
const remote = {
	enabled: true,
	address: 'http://127.0.0.1:3333',
	key: 'foobar',
};

unprint.configure({ remote });

/**
 * Fetch a page through the remote browser, run a control function that counts
 * the forms on the page, and print the outcome.
 */
async function init() {
	const res = await unprint.browser('https://www.google.com', {
		useRemote: true,
		// Serialized and executed on the server, so it may only rely on `page`.
		async control(page) {
			const form = await page.locator('form');

			return form.count();
		},
	});

	if (res.ok) {
		const formElement = res.context.query.element('form');

		console.log('control', res.control);
		console.log('form', formElement);

		return;
	}

	// Failed request: dump the whole response for inspection.
	console.log(res);
}

init();