unprint 0.17.3 → 0.17.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -18,6 +18,10 @@ unprint.options({
18
18
  concurrency: 10,
19
19
  interval: 10, // ms
20
20
  },
21
+ browser: {
22
+ concurrency: 5,
23
+ interval: 20,
24
+ },
21
25
  [hostname]: {
22
26
  enable: true, // enabled by default
23
27
  concurrency: 1,
@@ -215,9 +219,9 @@ Use Playwright with Chromium (experimental)
215
219
  Additional options
216
220
  * `control`: Async function that receives the Playwright page as its argument, for interacting with the page.
217
221
  * `scope`: Browser instance to (re)use; set to `null` to force a new scope for every request. Default: `main`.
218
- * `browser`: Options object passed to Playwright's `launch`, requires new scope.
222
+ * `browser`: Options object passed to Playwright's `launch`.
219
223
  * `browser.headless`: Headless mode; set to `false` to launch a visible browser. Default: `true`.
220
- * `context`: Options object passed to Playwright's `newContext`, requires new scope.
224
+ * `context`: Options object passed to Playwright's `newContext`.
221
225
  * `page`: Options object passed to Playwright's `goto`.
222
226
 
223
227
  This requires you to install the Chromium executable:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.17.3",
3
+ "version": "0.17.5",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
@@ -28,6 +28,7 @@
28
28
  "eslint-config-airbnb-base": "^15.0.0",
29
29
  "jsdom": "^17.0.0",
30
30
  "moment-timezone": "^0.5.34",
31
+ "object-hash": "^3.0.0",
31
32
  "patchright": "^1.56.1",
32
33
  "srcset": "^4.0.0",
33
34
  "tunnel": "^0.0.6"
package/src/app.js CHANGED
@@ -10,6 +10,7 @@ const axios = require('axios').default;
10
10
  const Bottleneck = require('bottleneck');
11
11
  const moment = require('moment-timezone');
12
12
  const merge = require('deepmerge');
13
+ const hashObject = require('object-hash');
13
14
  const srcset = require('srcset');
14
15
 
15
16
  const settings = {
@@ -21,6 +22,10 @@ const settings = {
21
22
  interval: 10,
22
23
  concurrency: 10,
23
24
  },
25
+ browser: {
26
+ interval: 20,
27
+ concurrency: 5,
28
+ },
24
29
  },
25
30
  };
26
31
 
@@ -988,7 +993,7 @@ function getLimiterValue(prop, options, hostname) {
988
993
  return options.limits[hostname][prop];
989
994
  }
990
995
 
991
- return options.limits.default[prop];
996
+ return options.limits[options?.limiter || 'default'][prop];
992
997
  }
993
998
 
994
999
  function getLimiter(url, options) {
@@ -1020,7 +1025,6 @@ function setProxy(instance, options, url) {
1020
1025
 
1021
1026
  if (options.proxy
1022
1027
  && options.proxy.enable !== false
1023
- && options.proxy.use !== false // use is a local override for enable
1024
1028
  && (options.proxy.use
1025
1029
  || options.proxy.hostnames?.includes(hostname))
1026
1030
  ) {
@@ -1031,14 +1035,18 @@ function setProxy(instance, options, url) {
1031
1035
  },
1032
1036
  });
1033
1037
 
1034
- instance.defaults.httpAgent = proxyAgent;
1035
- instance.defaults.httpsAgent = proxyAgent;
1038
+ if (instance) {
1039
+ instance.defaults.httpAgent = proxyAgent;
1040
+ instance.defaults.httpsAgent = proxyAgent;
1041
+ }
1036
1042
 
1037
1043
  return true;
1038
1044
  }
1039
1045
 
1040
- instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
1041
- instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
1046
+ if (instance) {
1047
+ instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
1048
+ instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
1049
+ }
1042
1050
 
1043
1051
  return false;
1044
1052
  }
@@ -1046,9 +1054,11 @@ function setProxy(instance, options, url) {
1046
1054
  const clients = new Map();
1047
1055
 
1048
1056
  /* eslint-enable no-param-reassign */
1049
- async function getBrowserInstance(scope, options) {
1050
- if (clients.has(scope)) {
1051
- const client = clients.get(scope);
1057
+ async function getBrowserInstance(scope, options, useProxy = false) {
1058
+ const scopeKey = `${scope}_${useProxy ? 'proxy' : 'direct'}_${options.browser ? hashObject(options.browser) : 'default'}_${options.context ? hashObject(options.context) : 'default'}`;
1059
+
1060
+ if (clients.has(scopeKey)) {
1061
+ const client = clients.get(scopeKey);
1052
1062
 
1053
1063
  await client.launchers;
1054
1064
 
@@ -1064,13 +1074,18 @@ async function getBrowserInstance(scope, options) {
1064
1074
  const contextLauncher = browserLauncher.then((browser) => browser.newContext({
1065
1075
  userAgent: 'unprint',
1066
1076
  ...options.context,
1077
+ ...(useProxy && {
1078
+ proxy: {
1079
+ server: `${options.proxy.host}:${options.proxy.port}`,
1080
+ },
1081
+ }),
1067
1082
  }));
1068
1083
 
1069
1084
  const launchers = Promise.all([browserLauncher, contextLauncher]);
1070
1085
  const client = { launchers };
1071
1086
 
1072
1087
  if (scope) {
1073
- clients.set(scope, client);
1088
+ clients.set(scopeKey, client);
1074
1089
  }
1075
1090
 
1076
1091
  client.browser = await browserLauncher;
@@ -1125,22 +1140,27 @@ async function browserRequest(url, customOptions = {}) {
1125
1140
  timeout: 1000,
1126
1141
  extract: true,
1127
1142
  scope: 'main',
1143
+ limiter: 'browser',
1128
1144
  url,
1129
1145
  }, globalOptions, customOptions]);
1130
1146
 
1131
1147
  const { limiter, interval, concurrency } = getLimiter(url, options);
1148
+ const useProxy = setProxy(null, options, url);
1132
1149
 
1133
1150
  const feedbackBase = {
1134
1151
  url,
1135
1152
  method: 'get',
1136
1153
  interval,
1137
1154
  concurrency,
1138
- isProxied: false,
1155
+ isProxied: useProxy,
1156
+ isBrowser: true,
1139
1157
  options,
1140
1158
  };
1141
1159
 
1160
+ events.emit('requestInit', feedbackBase);
1161
+
1142
1162
  return limiter.schedule(async () => {
1143
- const { context, browser } = await getBrowserInstance(options.scope, options);
1163
+ const { context, browser } = await getBrowserInstance(options.scope, options, useProxy);
1144
1164
  const page = await context.newPage();
1145
1165
 
1146
1166
  const res = await page.goto(url, {
@@ -1191,6 +1211,12 @@ async function browserRequest(url, customOptions = {}) {
1191
1211
  await browser.close();
1192
1212
  }
1193
1213
 
1214
+ events.emit('requestSuccess', {
1215
+ ...feedbackBase,
1216
+ status,
1217
+ statusText,
1218
+ });
1219
+
1194
1220
  return curateResponse({
1195
1221
  data,
1196
1222
  status,
@@ -1231,6 +1257,7 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1231
1257
  interval,
1232
1258
  concurrency,
1233
1259
  isProxied,
1260
+ isBrowser: false,
1234
1261
  options,
1235
1262
  };
1236
1263
 
package/tests/browser.js CHANGED
@@ -2,6 +2,18 @@
2
2
 
3
3
  const unprint = require('../src/app');
4
4
 
5
+ unprint.options({ // or unprint.options();
6
+ proxy: {
7
+ enable: true,
8
+ use: false, // don't use for all requests by default
9
+ host: '192.168.1.25',
10
+ port: 8888,
11
+ hostnames: [
12
+ 'tools-httpstatus.pickup-services.com',
13
+ ],
14
+ },
15
+ });
16
+
5
17
  async function initTest() {
6
18
  // concurrency
7
19
  await Promise.all([
@@ -41,17 +53,18 @@ async function initTest() {
41
53
  headless: false,
42
54
  },
43
55
  async control(_page) {
44
- //
56
+ // return new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
45
57
  },
46
58
  });
47
59
  }),
48
60
  ]);
49
61
 
50
62
  const res = await unprint.browser('https://www.scrapingcourse.com/', {
51
- // await unprint.browser('https://www.scrapingcourse.com/', {
52
- headless: false,
63
+ browser: {
64
+ headless: false,
65
+ },
53
66
  async control(_page) {
54
- return 'test';
67
+ // await new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
55
68
  },
56
69
  });
57
70