unprint 0.18.34 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc +2 -1
- package/README.md +51 -14
- package/package.json +7 -3
- package/src/app.js +79 -2
- package/src/server.js +132 -0
- package/tests/remote.js +34 -0
package/.eslintrc
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
"template-curly-spacing": "off",
|
|
16
16
|
"object-curly-newline": "off",
|
|
17
17
|
"default-param-last": "off",
|
|
18
|
-
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
|
|
18
|
+
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}],
|
|
19
|
+
"import/no-extraneous-dependencies": ["error", {"devDependencies": true}]
|
|
19
20
|
}
|
|
20
21
|
}
|
package/README.md
CHANGED
|
@@ -272,17 +272,6 @@ Returns
|
|
|
272
272
|
}
|
|
273
273
|
```
|
|
274
274
|
|
|
275
|
-
### Helpers
|
|
276
|
-
* `initialize(source, [selector], [options])` (`init`): Initialize element or HTML as unprint context
|
|
277
|
-
* `initializeAll(source, [selector], [options])` (`initAll`): Initialize element or HTML as multiple contexts
|
|
278
|
-
* `extractDate(string, [format], [options])`: Parse date with moment and some curation
|
|
279
|
-
* `extractDateAgo(string, [options])`: Extract relative date (e.g. 4 months ago)
|
|
280
|
-
* `extractDuration(timestamp, [matchRegex])`: Parse duration (e.g. 04:11:05) to seconds
|
|
281
|
-
* `extractTimestamp(string)`: Parse timestamp (e.g. 4H11M5S) to seconds
|
|
282
|
-
* `extractNumber(string, [options])`: Parse string as number
|
|
283
|
-
* `extractSourceSet(string, [options])`: Parse source set to object
|
|
284
|
-
* `formatDate(date, format, inputFormat)`: Format date with moment
|
|
285
|
-
|
|
286
275
|
### Proxy
|
|
287
276
|
```javascript
|
|
288
277
|
unprint.options({ // or unprint.options();
|
|
@@ -299,13 +288,61 @@ unprint.options({ // or unprint.options();
|
|
|
299
288
|
});
|
|
300
289
|
|
|
301
290
|
unprint.get({
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
291
|
+
useProxy: true, // use proxy for this request
|
|
292
|
+
});
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Request server
|
|
296
|
+
You can run unprint as a server to execute web requests remotely. This is mainly useful when you need expensive browser requests on clients with limited resources.
|
|
297
|
+
If you don't need unprint-specific features, you should probably set up a dedicated proxy server instead.
|
|
298
|
+
|
|
299
|
+
#### Server
|
|
300
|
+
* Ensure optional dependencies are installed
|
|
301
|
+
* `UNPRINT_KEY=[random] node src/app.js --server [port|address:port]`
|
|
302
|
+
|
|
303
|
+
#### Client
|
|
304
|
+
```
|
|
305
|
+
unprint.options({
|
|
306
|
+
remote: {
|
|
307
|
+
enabled: true,
|
|
308
|
+
address: 'http://10.0.0.1:3333', // include the scheme; the client appends /request to this URL
|
|
309
|
+
key: 'YOUR_UNPRINT_KEY',
|
|
310
|
+
methods: ['browser'], // browser, get, post
|
|
305
311
|
},
|
|
306
312
|
});
|
|
307
313
|
```
|
|
308
314
|
|
|
315
|
+
If the remote is enabled, you can utilize it for individual requests regardless of method configuration:
|
|
316
|
+
```
|
|
317
|
+
unprint.get([url], {
|
|
318
|
+
useRemote: true,
|
|
319
|
+
})
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
#### API
|
|
323
|
+
```
|
|
324
|
+
POST /request {
|
|
325
|
+
url: 'https://example.com', // required
|
|
326
|
+
method: 'get', // one of get (default), post, browser
|
|
327
|
+
options: {}, // request options as passed to native method
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
POST /browser // same as /request, but with browser method implied
|
|
331
|
+
|
|
332
|
+
* `options.control` is passed as a function body string, e.g. `control: "return 'hello world';"`
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
### Helpers
|
|
336
|
+
* `initialize(source, [selector], [options])` (`init`): Initialize element or HTML as unprint context
|
|
337
|
+
* `initializeAll(source, [selector], [options])` (`initAll`): Initialize element or HTML as multiple contexts
|
|
338
|
+
* `extractDate(string, [format], [options])`: Parse date with moment and some curation
|
|
339
|
+
* `extractDateAgo(string, [options])`: Extract relative date (e.g. 4 months ago)
|
|
340
|
+
* `extractDuration(timestamp, [matchRegex])`: Parse duration (e.g. 04:11:05) to seconds
|
|
341
|
+
* `extractTimestamp(string)`: Parse timestamp (e.g. 4H11M5S) to seconds
|
|
342
|
+
* `extractNumber(string, [options])`: Parse string as number
|
|
343
|
+
* `extractSourceSet(string, [options])`: Parse source set to object
|
|
344
|
+
* `formatDate(date, format, inputFormat)`: Format date with moment
|
|
345
|
+
|
|
309
346
|
### Feedback events
|
|
310
347
|
Usage:
|
|
311
348
|
* `unprint.on('trigger', callbackFn)`
|
package/package.json
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unprint",
|
|
3
|
-
"version": "0.18.34",
|
|
3
|
+
"version": "0.19.0",
|
|
4
4
|
"description": "Simplify common web scraping tasks while staying in control of the data.",
|
|
5
5
|
"main": "src/app.js",
|
|
6
|
-
"scripts": {},
|
|
7
6
|
"repository": {
|
|
8
7
|
"type": "git",
|
|
9
8
|
"url": "git+https://github.com/ThePendulum/unprint.git"
|
|
@@ -23,10 +22,12 @@
|
|
|
23
22
|
"bottleneck": "^2.19.5",
|
|
24
23
|
"cookie": "^1.1.1",
|
|
25
24
|
"deepmerge": "^4.2.2",
|
|
25
|
+
"dotenv": "^17.3.1",
|
|
26
26
|
"eslint": "^8.17.0",
|
|
27
27
|
"eslint-config-airbnb": "^19.0.4",
|
|
28
28
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
29
29
|
"jsdom": "^17.0.0",
|
|
30
|
+
"minimist": "^1.2.8",
|
|
30
31
|
"moment-timezone": "^0.5.34",
|
|
31
32
|
"object-hash": "^3.0.0",
|
|
32
33
|
"patchright": "^1.56.1",
|
|
@@ -38,6 +39,9 @@
|
|
|
38
39
|
"devDependencies": {
|
|
39
40
|
"@playwright/test": "^1.56.1",
|
|
40
41
|
"@types/node": "^24.10.0",
|
|
41
|
-
"express": "^
|
|
42
|
+
"express": "^5.2.1"
|
|
43
|
+
},
|
|
44
|
+
"optionalDependencies": {
|
|
45
|
+
"express": "^5.2.1"
|
|
42
46
|
}
|
|
43
47
|
}
|
package/src/app.js
CHANGED
|
@@ -12,12 +12,20 @@ const moment = require('moment-timezone');
|
|
|
12
12
|
const merge = require('deepmerge');
|
|
13
13
|
const hashObject = require('object-hash');
|
|
14
14
|
const srcset = require('srcset');
|
|
15
|
+
const argv = require('minimist')(process.argv.slice(2));
|
|
16
|
+
|
|
17
|
+
const initServer = require('./server');
|
|
15
18
|
|
|
16
19
|
const settings = {
|
|
17
20
|
throwErrors: false,
|
|
18
21
|
logErrors: true,
|
|
19
22
|
requestTimeout: 30000,
|
|
20
23
|
userAgent: 'unprint',
|
|
24
|
+
remote: {
|
|
25
|
+
enabled: false,
|
|
26
|
+
address: 'http://127.0.0.1:3333/browser',
|
|
27
|
+
methods: ['browser'],
|
|
28
|
+
},
|
|
21
29
|
limits: {
|
|
22
30
|
default: {
|
|
23
31
|
interval: 10,
|
|
@@ -1149,7 +1157,7 @@ function curateCookies(res, options) {
|
|
|
1149
1157
|
|
|
1150
1158
|
function curateResponse(res, data, options, { url, control, customOptions }) {
|
|
1151
1159
|
const base = {
|
|
1152
|
-
ok: true,
|
|
1160
|
+
ok: res.ok ?? true,
|
|
1153
1161
|
data,
|
|
1154
1162
|
body: data,
|
|
1155
1163
|
status: res.statusCode || res.status,
|
|
@@ -1330,6 +1338,61 @@ function getAgent(options, url) {
|
|
|
1330
1338
|
});
|
|
1331
1339
|
}
|
|
1332
1340
|
|
|
1341
|
+
async function remoteRequest(url, method, options) {
|
|
1342
|
+
const control = typeof options.control === 'function' ? options.control.toString() : null;
|
|
1343
|
+
|
|
1344
|
+
const res = await undici.fetch(`${options.remote.address}/request`, {
|
|
1345
|
+
method: 'post',
|
|
1346
|
+
body: JSON.stringify({
|
|
1347
|
+
url,
|
|
1348
|
+
method,
|
|
1349
|
+
options: {
|
|
1350
|
+
...options,
|
|
1351
|
+
control: control && control.slice(control.indexOf('{') + 1, control.lastIndexOf('}')),
|
|
1352
|
+
},
|
|
1353
|
+
}),
|
|
1354
|
+
headers: {
|
|
1355
|
+
'content-type': 'application/json',
|
|
1356
|
+
'unprint-key': options.remote.key,
|
|
1357
|
+
},
|
|
1358
|
+
});
|
|
1359
|
+
|
|
1360
|
+
if (res.status !== 200) {
|
|
1361
|
+
return {
|
|
1362
|
+
ok: false,
|
|
1363
|
+
status: res.status,
|
|
1364
|
+
statusText: res.statusText,
|
|
1365
|
+
};
|
|
1366
|
+
}
|
|
1367
|
+
|
|
1368
|
+
const body = await res.text();
|
|
1369
|
+
const data = JSON.parse(body);
|
|
1370
|
+
|
|
1371
|
+
return curateResponse({
|
|
1372
|
+
status: data.status,
|
|
1373
|
+
statusText: data.statusText,
|
|
1374
|
+
headers: data.headers,
|
|
1375
|
+
}, data.data, options, {
|
|
1376
|
+
url,
|
|
1377
|
+
customOptions: options,
|
|
1378
|
+
control: data.control,
|
|
1379
|
+
});
|
|
1380
|
+
}
|
|
1381
|
+
|
|
1382
|
+
function useRemoteRequest(options, method) {
|
|
1383
|
+
if (options.remote.enabled) {
|
|
1384
|
+
if (options.useRemote) {
|
|
1385
|
+
return true;
|
|
1386
|
+
}
|
|
1387
|
+
|
|
1388
|
+
if (options.remote.methods.includes(method.toLowerCase())) {
|
|
1389
|
+
return true;
|
|
1390
|
+
}
|
|
1391
|
+
}
|
|
1392
|
+
|
|
1393
|
+
return false;
|
|
1394
|
+
}
|
|
1395
|
+
|
|
1333
1396
|
async function browserRequest(url, customOptions = {}) {
|
|
1334
1397
|
const options = merge.all([{
|
|
1335
1398
|
timeout: 10000,
|
|
@@ -1339,6 +1402,10 @@ async function browserRequest(url, customOptions = {}) {
|
|
|
1339
1402
|
url,
|
|
1340
1403
|
}, globalOptions, customOptions]);
|
|
1341
1404
|
|
|
1405
|
+
if (useRemoteRequest(options, 'browser')) {
|
|
1406
|
+
return remoteRequest(url, 'browser', options);
|
|
1407
|
+
}
|
|
1408
|
+
|
|
1342
1409
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1343
1410
|
const agent = getAgent(options, url);
|
|
1344
1411
|
|
|
@@ -1523,6 +1590,10 @@ async function request(url, body, customOptions = {}, method = 'GET', redirects
|
|
|
1523
1590
|
return browserRequest(url, options);
|
|
1524
1591
|
}
|
|
1525
1592
|
|
|
1593
|
+
if (useRemoteRequest(options, method)) {
|
|
1594
|
+
return remoteRequest(url, method, options);
|
|
1595
|
+
}
|
|
1596
|
+
|
|
1526
1597
|
const { limiter, interval, concurrency } = getLimiter(url, options);
|
|
1527
1598
|
|
|
1528
1599
|
const agent = getAgent(options, url);
|
|
@@ -1625,7 +1696,7 @@ function off(trigger, fn) {
|
|
|
1625
1696
|
events.off(trigger, fn);
|
|
1626
1697
|
}
|
|
1627
1698
|
|
|
1628
|
-
|
|
1699
|
+
const unprint = {
|
|
1629
1700
|
configure,
|
|
1630
1701
|
on,
|
|
1631
1702
|
off,
|
|
@@ -1655,3 +1726,9 @@ module.exports = {
|
|
|
1655
1726
|
options: configure,
|
|
1656
1727
|
query: initQueryFns(queryFns),
|
|
1657
1728
|
};
|
|
1729
|
+
|
|
1730
|
+
if (argv.server) {
|
|
1731
|
+
initServer(argv.server, unprint);
|
|
1732
|
+
}
|
|
1733
|
+
|
|
1734
|
+
module.exports = unprint;
|
package/src/server.js
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const crypto = require('crypto');
|
|
4
|
+
const express = require('express');
|
|
5
|
+
|
|
6
|
+
require('dotenv').config();
|
|
7
|
+
|
|
8
|
+
const pkg = require('../package.json');
|
|
9
|
+
|
|
10
|
+
class HttpError extends Error {
|
|
11
|
+
constructor(message, httpCode, friendlyMessage, data) {
|
|
12
|
+
super(message);
|
|
13
|
+
|
|
14
|
+
this.name = 'HttpError';
|
|
15
|
+
this.httpCode = httpCode;
|
|
16
|
+
|
|
17
|
+
if (friendlyMessage) {
|
|
18
|
+
this.friendlyMessage = friendlyMessage;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
if (data) {
|
|
22
|
+
this.data = data;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
function log(level, ...data) {
|
|
28
|
+
const now = new Date();
|
|
29
|
+
|
|
30
|
+
console.log(`${now.toISOString()} [${level.slice(0, 5).padStart(5, ' ')}] ${data.join(' ')}`);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
const logger = Object.fromEntries([
|
|
34
|
+
'info',
|
|
35
|
+
'debug',
|
|
36
|
+
'error',
|
|
37
|
+
'warn',
|
|
38
|
+
].map((level) => [level, (...data) => log(level, ...data)]));
|
|
39
|
+
|
|
40
|
+
function curateOptions(options) {
|
|
41
|
+
// make sure remote unprint doesn't get configured to make request to itself
|
|
42
|
+
return {
|
|
43
|
+
...options,
|
|
44
|
+
remote: {
|
|
45
|
+
enabled: false,
|
|
46
|
+
},
|
|
47
|
+
useRemote: false,
|
|
48
|
+
control: options.control
|
|
49
|
+
? async function control() {}.constructor('page', 'client', options.control) // eslint-disable-line no-eval,no-new-func,no-empty-function
|
|
50
|
+
: null,
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
async function handleRequest(req, res, unprint, method) {
|
|
55
|
+
if (!req.body?.url) {
|
|
56
|
+
throw new HttpError('No URL provided', 400);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
logger.info(`${(method || req.body.method || 'get').toLowerCase()} ${req.body.url}`);
|
|
60
|
+
|
|
61
|
+
const options = curateOptions(req.body.options);
|
|
62
|
+
|
|
63
|
+
const unprintRes = req.body.method === 'post'
|
|
64
|
+
? await unprint.post(req.body.url, req.body.data, options)
|
|
65
|
+
: await unprint[(method || req.body.method || 'get').toLowerCase()](req.body.url, options);
|
|
66
|
+
|
|
67
|
+
res.send({
|
|
68
|
+
ok: unprintRes.ok,
|
|
69
|
+
status: unprintRes.status,
|
|
70
|
+
statusText: unprintRes.statusText,
|
|
71
|
+
data: unprintRes.data || null,
|
|
72
|
+
body: unprintRes.body || null,
|
|
73
|
+
html: unprintRes.context?.html || null,
|
|
74
|
+
headers: unprintRes.headers,
|
|
75
|
+
cookies: unprintRes.cookies,
|
|
76
|
+
control: unprintRes.control,
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
async function initServer(address, unprint) {
|
|
81
|
+
const app = express();
|
|
82
|
+
const addressComponents = typeof address === 'boolean' ? [] : String(address).split(':');
|
|
83
|
+
|
|
84
|
+
const host = addressComponents[1] ? addressComponents[0] : '127.0.0.1';
|
|
85
|
+
const port = addressComponents[1] || addressComponents[0] || 3000;
|
|
86
|
+
|
|
87
|
+
app.use(express.json());
|
|
88
|
+
|
|
89
|
+
app.use(async (req, res, next) => {
|
|
90
|
+
if (process.env.UNPRINT_KEY) {
|
|
91
|
+
if (process.env.UNPRINT_KEY.length !== req.headers['unprint-key']?.length
|
|
92
|
+
|| !crypto.timingSafeEqual(Buffer.from(process.env.UNPRINT_KEY, 'utf16le'), Buffer.from(req.headers['unprint-key'], 'utf16le'))) {
|
|
93
|
+
logger.warn(`Invalid key used by ${req.ip}`);
|
|
94
|
+
throw new HttpError('Invalid key', 401);
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
next();
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
app.get('/', (_req, res) => {
|
|
102
|
+
res.send(`unprint ${pkg.version}`);
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
app.post('/request', async (req, res) => handleRequest(req, res, unprint));
|
|
106
|
+
app.post('/browser', async (req, res) => handleRequest(req, res, unprint, 'browser'));
|
|
107
|
+
|
|
108
|
+
app.post('/options', async (req, res) => {
|
|
109
|
+
if (!req.body) {
|
|
110
|
+
throw new HttpError('No options provided', 400);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
unprint.options(curateOptions(req.body));
|
|
114
|
+
|
|
115
|
+
res.status(204).send();
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
app.use((error, _req, res, _next) => {
|
|
119
|
+
logger.error(error);
|
|
120
|
+
|
|
121
|
+
res.status(error.httpCode || 500).send({
|
|
122
|
+
statusCode: error.httpCode || 500,
|
|
123
|
+
statusMessage: error.message,
|
|
124
|
+
});
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
app.listen(port, host, () => {
|
|
128
|
+
logger.info(`Started unprint server on ${host}:${port}`);
|
|
129
|
+
});
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
module.exports = initServer;
|
package/tests/remote.js
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const unprint = require('../src/app');
|
|
4
|
+
|
|
5
|
+
unprint.configure({
|
|
6
|
+
remote: {
|
|
7
|
+
enabled: true,
|
|
8
|
+
address: 'http://127.0.0.1:3333',
|
|
9
|
+
key: 'foobar',
|
|
10
|
+
},
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
async function init() {
|
|
14
|
+
const res = await unprint.browser('https://www.google.com', {
|
|
15
|
+
useRemote: true,
|
|
16
|
+
async control(page) {
|
|
17
|
+
const form = await page.locator('form');
|
|
18
|
+
|
|
19
|
+
return form.count();
|
|
20
|
+
},
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
if (!res.ok) {
|
|
24
|
+
console.log(res);
|
|
25
|
+
return;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
const form = res.context.query.element('form');
|
|
29
|
+
|
|
30
|
+
console.log('control', res.control);
|
|
31
|
+
console.log('form', form);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
init();
|