unprint 0.17.4 → 0.17.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -219,9 +219,9 @@ Use Playwright with Chromium (experimental)
219
219
  Additional options
220
220
  * `control`: Async function to interface with Playwright page passed as argument
221
221
  * `scope`: Browser instance to (re)use, set to `null` to force new scope every request, default `main`.
222
- * `browser`: Options object passed to Playwright's `launch`, requires new scope.
222
+ * `browser`: Options object passed to Playwright's `launch`.
223
223
  * `browser.headless`: Headless mode, set to `false` to launch visible browser, default `true`.
224
- * `context`: Options object passed to Playwright's `newContext`, requires new scope.
224
+ * `context`: Options object passed to Playwright's `newContext`.
225
225
  * `page`: Options object passed to Playwright's `goto`.
226
226
 
227
227
  This requires you to install the Chromium executable:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.17.4",
3
+ "version": "0.17.5",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
@@ -28,6 +28,7 @@
28
28
  "eslint-config-airbnb-base": "^15.0.0",
29
29
  "jsdom": "^17.0.0",
30
30
  "moment-timezone": "^0.5.34",
31
+ "object-hash": "^3.0.0",
31
32
  "patchright": "^1.56.1",
32
33
  "srcset": "^4.0.0",
33
34
  "tunnel": "^0.0.6"
package/src/app.js CHANGED
@@ -10,6 +10,7 @@ const axios = require('axios').default;
10
10
  const Bottleneck = require('bottleneck');
11
11
  const moment = require('moment-timezone');
12
12
  const merge = require('deepmerge');
13
+ const hashObject = require('object-hash');
13
14
  const srcset = require('srcset');
14
15
 
15
16
  const settings = {
@@ -1024,7 +1025,6 @@ function setProxy(instance, options, url) {
1024
1025
 
1025
1026
  if (options.proxy
1026
1027
  && options.proxy.enable !== false
1027
- && options.proxy.use !== false // use is a local override for enable
1028
1028
  && (options.proxy.use
1029
1029
  || options.proxy.hostnames?.includes(hostname))
1030
1030
  ) {
@@ -1035,14 +1035,18 @@ function setProxy(instance, options, url) {
1035
1035
  },
1036
1036
  });
1037
1037
 
1038
- instance.defaults.httpAgent = proxyAgent;
1039
- instance.defaults.httpsAgent = proxyAgent;
1038
+ if (instance) {
1039
+ instance.defaults.httpAgent = proxyAgent;
1040
+ instance.defaults.httpsAgent = proxyAgent;
1041
+ }
1040
1042
 
1041
1043
  return true;
1042
1044
  }
1043
1045
 
1044
- instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
1045
- instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
1046
+ if (instance) {
1047
+ instance.defaults.httpAgent = options.httpsAgent || new http.Agent({ ...options.agent });
1048
+ instance.defaults.httpsAgent = options.httpsAgent || new https.Agent({ ...options.agent });
1049
+ }
1046
1050
 
1047
1051
  return false;
1048
1052
  }
@@ -1050,9 +1054,11 @@ function setProxy(instance, options, url) {
1050
1054
  const clients = new Map();
1051
1055
 
1052
1056
  /* eslint-enable no-param-reassign */
1053
- async function getBrowserInstance(scope, options) {
1054
- if (clients.has(scope)) {
1055
- const client = clients.get(scope);
1057
+ async function getBrowserInstance(scope, options, useProxy = false) {
1058
+ const scopeKey = `${scope}_${useProxy ? 'proxy' : 'direct'}_${options.browser ? hashObject(options.browser) : 'default'}_${options.context ? hashObject(options.context) : 'default'}`;
1059
+
1060
+ if (clients.has(scopeKey)) {
1061
+ const client = clients.get(scopeKey);
1056
1062
 
1057
1063
  await client.launchers;
1058
1064
 
@@ -1068,13 +1074,18 @@ async function getBrowserInstance(scope, options) {
1068
1074
  const contextLauncher = browserLauncher.then((browser) => browser.newContext({
1069
1075
  userAgent: 'unprint',
1070
1076
  ...options.context,
1077
+ ...(useProxy && {
1078
+ proxy: {
1079
+ server: `${options.proxy.host}:${options.proxy.port}`,
1080
+ },
1081
+ }),
1071
1082
  }));
1072
1083
 
1073
1084
  const launchers = Promise.all([browserLauncher, contextLauncher]);
1074
1085
  const client = { launchers };
1075
1086
 
1076
1087
  if (scope) {
1077
- clients.set(scope, client);
1088
+ clients.set(scopeKey, client);
1078
1089
  }
1079
1090
 
1080
1091
  client.browser = await browserLauncher;
@@ -1134,18 +1145,22 @@ async function browserRequest(url, customOptions = {}) {
1134
1145
  }, globalOptions, customOptions]);
1135
1146
 
1136
1147
  const { limiter, interval, concurrency } = getLimiter(url, options);
1148
+ const useProxy = setProxy(null, options, url);
1137
1149
 
1138
1150
  const feedbackBase = {
1139
1151
  url,
1140
1152
  method: 'get',
1141
1153
  interval,
1142
1154
  concurrency,
1143
- isProxied: false,
1155
+ isProxied: useProxy,
1156
+ isBrowser: true,
1144
1157
  options,
1145
1158
  };
1146
1159
 
1160
+ events.emit('requestInit', feedbackBase);
1161
+
1147
1162
  return limiter.schedule(async () => {
1148
- const { context, browser } = await getBrowserInstance(options.scope, options);
1163
+ const { context, browser } = await getBrowserInstance(options.scope, options, useProxy);
1149
1164
  const page = await context.newPage();
1150
1165
 
1151
1166
  const res = await page.goto(url, {
@@ -1196,6 +1211,12 @@ async function browserRequest(url, customOptions = {}) {
1196
1211
  await browser.close();
1197
1212
  }
1198
1213
 
1214
+ events.emit('requestSuccess', {
1215
+ ...feedbackBase,
1216
+ status,
1217
+ statusText,
1218
+ });
1219
+
1199
1220
  return curateResponse({
1200
1221
  data,
1201
1222
  status,
@@ -1236,6 +1257,7 @@ async function request(url, body, customOptions = {}, method = 'GET') {
1236
1257
  interval,
1237
1258
  concurrency,
1238
1259
  isProxied,
1260
+ isBrowser: false,
1239
1261
  options,
1240
1262
  };
1241
1263
 
package/tests/browser.js CHANGED
@@ -2,6 +2,18 @@
2
2
 
3
3
  const unprint = require('../src/app');
4
4
 
5
+ unprint.options({ // or unprint.options();
6
+ proxy: {
7
+ enable: true,
8
+ use: false, // don't use for all requests by default
9
+ host: '192.168.1.25',
10
+ port: 8888,
11
+ hostnames: [
12
+ 'tools-httpstatus.pickup-services.com',
13
+ ],
14
+ },
15
+ });
16
+
5
17
  async function initTest() {
6
18
  // concurrency
7
19
  await Promise.all([
@@ -41,17 +53,18 @@ async function initTest() {
41
53
  headless: false,
42
54
  },
43
55
  async control(_page) {
44
- //
56
+ // return new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
45
57
  },
46
58
  });
47
59
  }),
48
60
  ]);
49
61
 
50
62
  const res = await unprint.browser('https://www.scrapingcourse.com/', {
51
- // await unprint.browser('https://www.scrapingcourse.com/', {
52
- headless: false,
63
+ browser: {
64
+ headless: false,
65
+ },
53
66
  async control(_page) {
54
- return 'test';
67
+ // await new Promise((resolve) => { setTimeout(() => resolve(), 60000); });
55
68
  },
56
69
  });
57
70