@browserless/goto 10.10.0 → 10.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "@browserless/goto",
3
3
  "description": "Navigate to web pages with built-in ad blocking, device emulation, and optimized loading for faster automation.",
4
4
  "homepage": "https://browserless.js.org/#/?id=gotopage-options",
5
- "version": "10.10.0",
5
+ "version": "10.10.2",
6
6
  "main": "src/index.js",
7
7
  "author": {
8
8
  "email": "hello@microlink.io",
@@ -66,5 +66,5 @@
66
66
  "timeout": "2m",
67
67
  "workerThreads": false
68
68
  },
69
- "gitHead": "9b80e677418f2defb804d39043680fe65e3e277b"
69
+ "gitHead": "ec6a614923a1a692bd717ecc8e6f1b09417801d9"
70
70
  }
package/src/adblock.js CHANGED
@@ -6,25 +6,41 @@ const fs = require('fs')
6
6
 
7
7
  const debug = require('debug-logfmt')('browserless:goto:adblock')
8
8
 
9
- const engine = PuppeteerBlocker.deserialize(
10
- new Uint8Array(fs.readFileSync(path.resolve(__dirname, './engine.bin')))
11
- )
9
+ let enginePromise
12
10
 
13
- engine.on('request-blocked', ({ url }) => debug('block', url))
14
- engine.on('request-redirected', ({ url }) => debug('redirect', url))
11
+ const getEngine = () => {
12
+ if (enginePromise) return enginePromise
13
+
14
+ enginePromise = fs.promises.readFile(path.resolve(__dirname, './engine.bin')).then(buffer => {
15
+ const engine = PuppeteerBlocker.deserialize(new Uint8Array(buffer))
16
+ engine.on('request-blocked', ({ url }) => debug('block', url))
17
+ engine.on('request-redirected', ({ url }) => debug('redirect', url))
18
+ return engine
19
+ })
20
+
21
+ return enginePromise
22
+ }
15
23
 
16
24
  /**
17
25
  * autoconsent.playwright.js is the only browser-injectable IIFE bundle in the package.
18
26
  * It is not in the package's "exports" map, so pin @duckduckgo/autoconsent with ~ to
19
27
  * avoid breakage from internal restructuring on minor/patch bumps.
20
28
  */
21
- const autoconsentPlaywrightScript = fs.readFileSync(
22
- path.resolve(
23
- path.dirname(require.resolve('@duckduckgo/autoconsent')),
24
- 'autoconsent.playwright.js'
25
- ),
26
- 'utf8'
27
- )
29
+ let autoconsentPlaywrightScriptPromise
30
+
31
+ const getAutoconsentPlaywrightScript = () => {
32
+ if (autoconsentPlaywrightScriptPromise) return autoconsentPlaywrightScriptPromise
33
+
34
+ autoconsentPlaywrightScriptPromise = fs.promises.readFile(
35
+ path.resolve(
36
+ path.dirname(require.resolve('@duckduckgo/autoconsent')),
37
+ 'autoconsent.playwright.js'
38
+ ),
39
+ 'utf8'
40
+ )
41
+
42
+ return autoconsentPlaywrightScriptPromise
43
+ }
28
44
 
29
45
  /* Configuration passed to autoconsent's `initResp` message.
30
46
  See https://github.com/duckduckgo/autoconsent/blob/main/api.md */
@@ -66,6 +82,7 @@ const sendMessage = (page, message) =>
66
82
 
67
83
  const setupAutoConsent = async page => {
68
84
  if (page._autoconsentSetup) return
85
+ const autoconsentPlaywrightScript = await getAutoconsentPlaywrightScript()
69
86
 
70
87
  await page.exposeFunction('autoconsentSendMessage', async message => {
71
88
  if (!message || typeof message !== 'object') return
@@ -83,12 +100,12 @@ const setupAutoConsent = async page => {
83
100
  page._autoconsentSetup = true
84
101
  }
85
102
 
86
- const runAutoConsent = page => page.evaluate(autoconsentPlaywrightScript)
103
+ const runAutoConsent = async page => page.evaluate(await getAutoconsentPlaywrightScript())
87
104
 
88
105
  const enableBlockingInPage = (page, run, actionTimeout) => {
89
106
  page.disableAdblock = () =>
90
- engine
91
- .disableBlockingInPage(page, { keepRequestInterception: true })
107
+ getEngine()
108
+ .then(engine => engine.disableBlockingInPage(page, { keepRequestInterception: true }))
92
109
  .then(() => debug('disabled'))
93
110
  .catch(() => {})
94
111
 
@@ -99,7 +116,7 @@ const enableBlockingInPage = (page, run, actionTimeout) => {
99
116
  debug: 'autoconsent:setup'
100
117
  }),
101
118
  run({
102
- fn: engine.enableBlockingInPage(page),
119
+ fn: getEngine().then(engine => engine.enableBlockingInPage(page)),
103
120
  timeout: actionTimeout,
104
121
  debug: 'adblock'
105
122
  })
package/src/engine.bin CHANGED
Binary file
package/src/index.js CHANGED
@@ -22,23 +22,21 @@ const isEmpty = val => val == null || !(Object.keys(val) || val).length
22
22
 
23
23
  const castArray = value => [].concat(value).filter(Boolean)
24
24
 
25
- const run = async ({ fn, timeout, debug: props }) => {
26
- const duration = debug.duration()
27
- const result = await pReflect(timeout ? pTimeout(fn, timeout) : fn)
28
- const errorProps = result.isRejected ? { error: result.reason.message || result.reason } : {}
29
- duration(props, errorProps)
30
- return result
25
+ const getDefaultPath = pathname => {
26
+ if (!pathname || pathname[0] !== '/') return '/'
27
+ if (pathname === '/') return '/'
28
+
29
+ const rightSlash = pathname.lastIndexOf('/')
30
+ return rightSlash === 0 ? '/' : pathname.slice(0, rightSlash)
31
31
  }
32
32
 
33
- const parseCookies = (url, str) =>
34
- str.split(';').reduce((acc, cookieStr) => {
35
- const jar = new toughCookie.CookieJar(undefined, { rejectPublicSuffixes: false })
36
- jar.setCookieSync(cookieStr.trim(), url)
37
- const parsedCookie = jar.serializeSync().cookies[0]
33
+ const parseCookiesWithJar = (url, str) => {
34
+ const jar = new toughCookie.CookieJar(undefined, { rejectPublicSuffixes: false })
38
35
 
39
- // Use this instead of the above when the following issue is fixed:
40
- // https://github.com/salesforce/tough-cookie/issues/149
41
- // const ret = toughCookie.parse(cookie).serializeSync();
36
+ return str.split(';').reduce((acc, cookieStr) => {
37
+ const cookie = jar.setCookieSync(cookieStr.trim(), url)
38
+ if (!cookie) return acc
39
+ const parsedCookie = cookie.toJSON()
42
40
 
43
41
  parsedCookie.name = parsedCookie.key
44
42
  delete parsedCookie.key
@@ -50,6 +48,82 @@ const parseCookies = (url, str) =>
50
48
  acc.push(parsedCookie)
51
49
  return acc
52
50
  }, [])
51
+ }
52
+
53
+ const run = async ({ fn, timeout, debug: props }) => {
54
+ const duration = debug.duration()
55
+ const result = await pReflect(timeout ? pTimeout(fn, timeout) : fn)
56
+ const errorProps = result.isRejected ? { error: result.reason.message || result.reason } : {}
57
+ duration(props, errorProps)
58
+ return result
59
+ }
60
+
61
+ const stopLoadingOnTimeout = (page, timeout) => {
62
+ let timeoutId
63
+
64
+ return {
65
+ promise: new Promise(resolve => {
66
+ timeoutId = globalThis.setTimeout(() => {
67
+ pReflect(page._client().send('Page.stopLoading')).then(resolve)
68
+ }, timeout)
69
+
70
+ if (typeof timeoutId.unref === 'function') timeoutId.unref()
71
+ }),
72
+ clear: () => {
73
+ if (timeoutId) clearTimeout(timeoutId)
74
+ }
75
+ }
76
+ }
77
+
78
+ const parseCookies = (url, str) => {
79
+ let parsedURL
80
+
81
+ try {
82
+ parsedURL = new URL(url)
83
+ } catch {
84
+ return parseCookiesWithJar(url, str)
85
+ }
86
+
87
+ const domain = parsedURL.hostname
88
+
89
+ if (!domain) {
90
+ return parseCookiesWithJar(url, str)
91
+ }
92
+
93
+ const path = getDefaultPath(parsedURL.pathname)
94
+ const chunks = str.split(';')
95
+ const cookies = new Array(chunks.length)
96
+ let index = 0
97
+
98
+ for (const chunk of chunks) {
99
+ const cookieStr = chunk.trim()
100
+
101
+ if (cookieStr.length === 0) {
102
+ return parseCookiesWithJar(url, str)
103
+ }
104
+
105
+ const separatorIndex = cookieStr.indexOf('=')
106
+
107
+ if (separatorIndex === -1) {
108
+ return parseCookiesWithJar(url, str)
109
+ }
110
+
111
+ const name = cookieStr.slice(0, separatorIndex).trim()
112
+
113
+ if (name.length === 0) {
114
+ return parseCookiesWithJar(url, str)
115
+ }
116
+
117
+ cookies[index++] = {
118
+ name,
119
+ value: cookieStr.slice(separatorIndex + 1).trim(),
120
+ domain,
121
+ path
122
+ }
123
+ }
124
+
125
+ return cookies
126
+ }
53
127
 
54
128
  const getMediaFeatures = ({ animations, colorScheme }) => {
55
129
  const prefers = []
@@ -226,37 +300,51 @@ module.exports = ({ defaultDevice = 'Macbook Pro 13', timeout: globalTimeout, ..
226
300
  )
227
301
  }
228
302
 
229
- const enableInterception =
230
- (onPageRequest || abortTypes.length > 0) &&
231
- run({
232
- fn: page.setRequestInterception(true),
233
- debug: 'enableInterception'
234
- })
303
+ const abortTypesSet = abortTypes.length > 0 ? new Set(abortTypes) : null
304
+
305
+ const requestHandlers = []
306
+ let abortTypesHandler
307
+ let disableInterceptionForAbortTypes = false
235
308
 
236
309
  if (onPageRequest) {
237
- Promise.resolve(enableInterception).then(() =>
238
- page.on('request', req => onPageRequest(req, page))
239
- )
310
+ const onPageRequestHandler = req => onPageRequest(req, page)
311
+ page.on('request', onPageRequestHandler)
312
+ requestHandlers.push(onPageRequestHandler)
240
313
  }
241
314
 
242
315
  if (abortTypes.length > 0) {
243
- Promise.resolve(enableInterception).then(() => {
244
- page.on('request', req => {
245
- if (req.isInterceptResolutionHandled()) return
246
- const resourceType = req.resourceType()
247
- const url = truncate(req.url())
248
-
249
- if (!abortTypes.includes(resourceType)) {
250
- debug.continue({ url, resourceType })
251
- return req.continue(
252
- req.continueRequestOverrides(),
253
- DEFAULT_INTERCEPT_RESOLUTION_PRIORITY
254
- )
316
+ abortTypesHandler = req => {
317
+ if (req.isInterceptResolutionHandled()) return
318
+ const resourceType = req.resourceType()
319
+ const url = truncate(req.url())
320
+
321
+ if (!abortTypesSet.has(resourceType)) {
322
+ debug.continue({ url, resourceType })
323
+ return req.continue(req.continueRequestOverrides(), DEFAULT_INTERCEPT_RESOLUTION_PRIORITY)
324
+ }
325
+ debug.abort({ url, resourceType })
326
+ return req.abort('blockedbyclient', DEFAULT_INTERCEPT_RESOLUTION_PRIORITY)
327
+ }
328
+
329
+ page.on('request', abortTypesHandler)
330
+ requestHandlers.push(abortTypesHandler)
331
+ }
332
+
333
+ if (requestHandlers.length > 0) {
334
+ prePromises.push(
335
+ run({
336
+ fn: page.setRequestInterception(true),
337
+ debug: 'enableInterception'
338
+ }).then(result => {
339
+ // If interception setup fails, remove handlers to avoid keeping dead listeners.
340
+ if (result.isRejected) {
341
+ requestHandlers.forEach(handler => page.off('request', handler))
342
+ } else if (abortTypesHandler && !withAdblock && !onPageRequest) {
343
+ disableInterceptionForAbortTypes = true
255
344
  }
256
- debug.abort({ url, resourceType })
257
- return req.abort('blockedbyclient', DEFAULT_INTERCEPT_RESOLUTION_PRIORITY)
345
+ return result
258
346
  })
259
- })
347
+ )
260
348
  }
261
349
 
262
350
  if (withAdblock) {
@@ -273,7 +361,11 @@ module.exports = ({ defaultDevice = 'Macbook Pro 13', timeout: globalTimeout, ..
273
361
  )
274
362
  }
275
363
 
276
- const device = getDevice({ headers, ...args, device: args.device ?? defaultDevice })
364
+ const device = getDevice({
365
+ headers,
366
+ device: args.device ?? defaultDevice,
367
+ viewport: args.viewport
368
+ })
277
369
 
278
370
  if (device.userAgent && !headers['user-agent']) {
279
371
  headers['user-agent'] = device.userAgent
@@ -292,10 +384,11 @@ module.exports = ({ defaultDevice = 'Macbook Pro 13', timeout: globalTimeout, ..
292
384
  const headersKeys = Object.keys(headers)
293
385
 
294
386
  if (headersKeys.length > 0) {
295
- const { cookie, ...headersWithoutCookie } = headers
387
+ const cookie = headers.cookie
388
+ const userAgent = headers['user-agent']
296
389
 
297
- if (headers.cookie) {
298
- const cookies = parseCookies(url, headers.cookie)
390
+ if (cookie) {
391
+ const cookies = parseCookies(url, cookie)
299
392
  prePromises.push(
300
393
  run({
301
394
  fn: page.setCookie(...cookies),
@@ -305,25 +398,41 @@ module.exports = ({ defaultDevice = 'Macbook Pro 13', timeout: globalTimeout, ..
305
398
  )
306
399
  }
307
400
 
308
- const extraHTTPHeaders = headers.cookie ? headersWithoutCookie : headers
309
- const extraHTTPHeadersKeys = Object.keys(extraHTTPHeaders)
310
-
311
- if (headers['user-agent']) {
401
+ if (userAgent) {
312
402
  prePromises.push(
313
403
  run({
314
- fn: page.setUserAgent(headers['user-agent']),
404
+ fn: page.setUserAgent(userAgent),
315
405
  timeout: actionTimeout,
316
- debug: { 'user-agent': headers['user-agent'] }
406
+ debug: { 'user-agent': userAgent }
317
407
  })
318
408
  )
319
409
  }
320
410
 
321
- if (extraHTTPHeadersKeys.length > 0) {
411
+ if (cookie) {
412
+ const extraHTTPHeaders = {}
413
+ const extraHTTPHeadersKeys = []
414
+
415
+ for (const key of headersKeys) {
416
+ if (key === 'cookie') continue
417
+ extraHTTPHeaders[key] = headers[key]
418
+ extraHTTPHeadersKeys.push(key)
419
+ }
420
+
421
+ if (extraHTTPHeadersKeys.length > 0) {
422
+ prePromises.push(
423
+ run({
424
+ fn: page.setExtraHTTPHeaders(extraHTTPHeaders),
425
+ timeout: actionTimeout,
426
+ debug: { headers: extraHTTPHeadersKeys }
427
+ })
428
+ )
429
+ }
430
+ } else if (!(userAgent && headersKeys.length === 1)) {
322
431
  prePromises.push(
323
432
  run({
324
- fn: page.setExtraHTTPHeaders(extraHTTPHeaders),
433
+ fn: page.setExtraHTTPHeaders(headers),
325
434
  timeout: actionTimeout,
326
- debug: { headers: extraHTTPHeadersKeys }
435
+ debug: { headers: headersKeys }
327
436
  })
328
437
  )
329
438
  }
@@ -351,72 +460,91 @@ module.exports = ({ defaultDevice = 'Macbook Pro 13', timeout: globalTimeout, ..
351
460
  )
352
461
  }
353
462
 
354
- await Promise.all(prePromises)
463
+ try {
464
+ await Promise.all(prePromises)
355
465
 
356
- const { value: response, reason: error } = await run({
357
- fn: html
466
+ let clearStopLoadingTimer = () => {}
467
+ const navigationPromise = html
358
468
  ? page.setContent(html, { waitUntil, ...args })
359
- : Promise.race([
360
- page.goto(url, { waitUntil, ...args }),
361
- setTimeout(gotoTimeout).then(() => page._client().send('Page.stopLoading'))
362
- ]),
363
- timeout: gotoTimeout,
364
- debug: { fn: html ? 'html' : 'url', waitUntil }
365
- })
366
-
367
- if (withAdblock) {
368
- await run({
369
- fn: adblock.runAutoConsent(page),
370
- timeout: actionTimeout,
371
- debug: 'autoconsent:run'
469
+ : (() => {
470
+ const { promise, clear } = stopLoadingOnTimeout(page, gotoTimeout)
471
+ clearStopLoadingTimer = clear
472
+ return Promise.race([page.goto(url, { waitUntil, ...args }), promise])
473
+ })()
474
+
475
+ const { value: response, reason: error } = await run({
476
+ fn: navigationPromise,
477
+ timeout: gotoTimeout,
478
+ debug: { fn: html ? 'html' : 'url', waitUntil }
372
479
  })
373
- }
480
+ clearStopLoadingTimer()
374
481
 
375
- for (const [key, value] of Object.entries({
376
- waitForSelector,
377
- waitForFunction
378
- })) {
379
- if (value) {
380
- await run({ fn: page[key](value), timeout: gotoTimeout, debug: { [key]: value } })
482
+ if (withAdblock) {
483
+ await run({
484
+ fn: adblock.runAutoConsent(page),
485
+ timeout: actionTimeout,
486
+ debug: 'autoconsent:run'
487
+ })
381
488
  }
382
- }
383
489
 
384
- if (waitForTimeout) {
385
- await setTimeout(waitForTimeout)
386
- }
387
-
388
- await inject(page, {
389
- timeout: actionTimeout,
390
- mediaType,
391
- animations,
392
- modules,
393
- scripts,
394
- styles
395
- })
490
+ if (waitForSelector) {
491
+ await run({
492
+ fn: page.waitForSelector(waitForSelector),
493
+ timeout: gotoTimeout,
494
+ debug: { waitForSelector }
495
+ })
496
+ }
396
497
 
397
- if (click) {
398
- for (const selector of castArray(click)) {
498
+ if (waitForFunction) {
399
499
  await run({
400
- fn: page.click(selector),
401
- timeout: actionTimeout,
402
- debug: { click: selector }
500
+ fn: page.waitForFunction(waitForFunction),
501
+ timeout: gotoTimeout,
502
+ debug: { waitForFunction }
403
503
  })
404
504
  }
405
- }
406
505
 
407
- if (scroll) {
408
- await run({
409
- fn: page.$eval(scroll, el => el.scrollIntoView()),
506
+ if (waitForTimeout) {
507
+ await setTimeout(waitForTimeout)
508
+ }
509
+
510
+ await inject(page, {
410
511
  timeout: actionTimeout,
411
- debug: { scroll }
512
+ mediaType,
513
+ animations,
514
+ modules,
515
+ scripts,
516
+ styles
412
517
  })
413
- }
414
518
 
415
- if (isWaitUntilAuto) {
416
- await waitUntilAuto(page, { response, timeout: actionTimeout * 2 })
417
- }
519
+ if (click) {
520
+ for (const selector of castArray(click)) {
521
+ await run({
522
+ fn: page.click(selector),
523
+ timeout: actionTimeout,
524
+ debug: { click: selector }
525
+ })
526
+ }
527
+ }
528
+
529
+ if (scroll) {
530
+ await run({
531
+ fn: page.$eval(scroll, el => el.scrollIntoView()),
532
+ timeout: actionTimeout,
533
+ debug: { scroll }
534
+ })
535
+ }
536
+
537
+ if (isWaitUntilAuto) {
538
+ await waitUntilAuto(page, { response, timeout: actionTimeout * 2 })
539
+ }
418
540
 
419
- return { response, device, error }
541
+ return { response, device, error }
542
+ } finally {
543
+ if (abortTypesHandler) page.off('request', abortTypesHandler)
544
+ if (disableInterceptionForAbortTypes) {
545
+ await pReflect(page.setRequestInterception(false))
546
+ }
547
+ }
420
548
  }
421
549
 
422
550
  goto.getDevice = getDevice