unprint 0.17.6 → 0.17.8

This diff shows the changes between two publicly released versions of the package, as they appear in one of the supported public registries. It is provided for informational purposes only.
package/README.md CHANGED
@@ -218,7 +218,8 @@ Use Playwright with Chromium (experimental)
218
218
 
219
219
  Additional options
220
220
  * `control`: Async function to interface with Playwright page passed as argument
221
- * `scope`: Browser instance to (re)use, set to `null` to force new scope every request, default `main`.
221
+ * `client`: Key of the browser instance to (re)use, set to `null` to force a new single-use browser for every request, default `main`.
222
+ * `clientRetirement`: Number of requests until a browser gets restarted for resource clean-up, default `20`.
222
223
  * `browser`: Options object passed to Playwright's `launch`.
223
224
  * `browser.headless`: Headless mode, set to `false` to launch visible browser, default `true`.
224
225
  * `context`: Options object passed to Playwright's `newContext`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.17.6",
3
+ "version": "0.17.8",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {},
package/src/app.js CHANGED
@@ -191,12 +191,18 @@ function queryContents(context, selector, customOptions) {
191
191
  const options = {
192
192
  ...context.options,
193
193
  trim: true,
194
+ filter: true,
194
195
  ...customOptions,
195
196
  };
196
197
 
197
198
  const targets = queryElements(context, selector, options);
199
+ const extractedContents = targets.map((target) => extractContent(target, options));
200
+
201
+ if (options.filter) {
202
+ return extractedContents.filter(Boolean);
203
+ }
198
204
 
199
- return targets.map((target) => extractContent(target, options)).filter(Boolean);
205
+ return extractedContents;
200
206
  }
201
207
 
202
208
  function queryAttribute(context, selector, attribute, customOptions) {
@@ -277,15 +283,24 @@ function queryNumber(context, selector, customOptions) {
277
283
  }
278
284
 
279
285
  function queryNumbers(context, selector, customOptions) {
286
+ const options = {
287
+ filter: true,
288
+ ...customOptions,
289
+ };
290
+
280
291
  const numberStrings = queryContents(context, selector, customOptions);
281
292
 
282
293
  if (!numberStrings) {
283
294
  return null;
284
295
  }
285
296
 
286
- return numberStrings
287
- .map((numberString) => extractNumber(numberString, customOptions))
288
- .filter(Boolean);
297
+ const extractedNumbers = numberStrings.map((numberString) => extractNumber(numberString, customOptions));
298
+
299
+ if (options.filter) {
300
+ return extractedNumbers.filter(Boolean);
301
+ }
302
+
303
+ return extractedNumbers;
289
304
  }
290
305
 
291
306
  function queryHtml(context, selector, customOptions) {
@@ -1060,6 +1075,13 @@ async function getBrowserInstance(scope, options, useProxy = false) {
1060
1075
  if (clients.has(scopeKey)) {
1061
1076
  const client = clients.get(scopeKey);
1062
1077
 
1078
+ client.uses += 1;
1079
+
1080
+ if (client.uses >= (options.clientRetirement || 20)) {
1081
+ client.retired = true;
1082
+ clients.delete(scopeKey);
1083
+ }
1084
+
1063
1085
  await client.launchers;
1064
1086
 
1065
1087
  return client;
@@ -1082,7 +1104,14 @@ async function getBrowserInstance(scope, options, useProxy = false) {
1082
1104
  }));
1083
1105
 
1084
1106
  const launchers = Promise.all([browserLauncher, contextLauncher]);
1085
- const client = { launchers };
1107
+
1108
+ const client = {
1109
+ key: scopeKey,
1110
+ launchers,
1111
+ active: 0,
1112
+ uses: 1,
1113
+ retired: false,
1114
+ };
1086
1115
 
1087
1116
  if (scope) {
1088
1117
  clients.set(scopeKey, client);
@@ -1139,7 +1168,7 @@ async function browserRequest(url, customOptions = {}) {
1139
1168
  const options = merge.all([{
1140
1169
  timeout: 1000,
1141
1170
  extract: true,
1142
- scope: 'main',
1171
+ client: 'main',
1143
1172
  limiter: 'browser',
1144
1173
  url,
1145
1174
  }, globalOptions, customOptions]);
@@ -1160,8 +1189,11 @@ async function browserRequest(url, customOptions = {}) {
1160
1189
  events.emit('requestInit', feedbackBase);
1161
1190
 
1162
1191
  return limiter.schedule(async () => {
1163
- const { context, browser } = await getBrowserInstance(options.scope, options, useProxy);
1164
- const page = await context.newPage();
1192
+ const client = await getBrowserInstance(options.client, options, useProxy);
1193
+
1194
+ client.active += 1;
1195
+
1196
+ const page = await client.context.newPage();
1165
1197
 
1166
1198
  const res = await page.goto(url, {
1167
1199
  ...options.page,
@@ -1180,6 +1212,8 @@ async function browserRequest(url, customOptions = {}) {
1180
1212
  statusText,
1181
1213
  });
1182
1214
 
1215
+ client.active -= 1;
1216
+
1183
1217
  return {
1184
1218
  ok: false,
1185
1219
  status,
@@ -1198,8 +1232,10 @@ async function browserRequest(url, customOptions = {}) {
1198
1232
 
1199
1233
  if (customOptions.control) {
1200
1234
  try {
1201
- control = await customOptions.control(page, { context, browser });
1235
+ control = await customOptions.control(page, client);
1202
1236
  } catch (error) {
1237
+ client.active -= 1;
1238
+
1203
1239
  return {
1204
1240
  ok: false,
1205
1241
  controlError: error.message,
@@ -1218,17 +1254,20 @@ async function browserRequest(url, customOptions = {}) {
1218
1254
 
1219
1255
  await page.close();
1220
1256
 
1221
- if (options.scope === null) {
1222
- // this browser won't be reused
1223
- await browser.close();
1224
- }
1225
-
1226
1257
  events.emit('requestSuccess', {
1227
1258
  ...feedbackBase,
1228
1259
  status,
1229
1260
  statusText,
1230
1261
  });
1231
1262
 
1263
+ client.active -= 1;
1264
+
1265
+ if (options.client === null // this browser is single-use
1266
+ || (client.retired && client.active === 0)) { // this browser is retired to minimize garbage build-up
1267
+ // this browser won't be reused
1268
+ await client.browser.close();
1269
+ }
1270
+
1232
1271
  return curateResponse({
1233
1272
  data,
1234
1273
  status,
package/tests/browser.js CHANGED
@@ -16,23 +16,20 @@ unprint.options({ // or unprint.options();
16
16
 
17
17
  async function initTest() {
18
18
  // concurrency
19
- await Promise.all([
20
- unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=500', {
21
- browser: {
22
- headless: false,
23
- },
24
- }),
25
- unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=500', {
26
- browser: {
27
- headless: false,
28
- },
29
- }),
30
- unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=500', {
19
+ await Promise.all(Array.from({ length: 20 }).map(async () => {
20
+ // await unprint.browser(`https://tools-httpstatus.pickup-services.com/${Math.random() < 0.2 ? '404' : '200'}?sleep=${Math.round(Math.random() * 500)}`, {
21
+ await unprint.browser(`https://tools-httpstatus.pickup-services.com/200?sleep=${Math.round(Math.random() * 5000)}`, {
22
+ // client: null,
23
+ interval: 100,
31
24
  browser: {
32
- headless: false,
25
+ headless: true,
33
26
  },
34
- }),
35
- ]);
27
+ });
28
+ }));
29
+
30
+ // console.log('Requests done, waiting...');
31
+
32
+ // await new Promise((resolve) => { setTimeout(() => resolve(), 60 * 60 * 1000); });
36
33
 
37
34
  await Promise.all([
38
35
  unprint.browser('https://tools-httpstatus.pickup-services.com/200?sleep=5000', {