jumpy-lion 0.1.6-beta.30 → 0.1.6-beta.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/README.md +1 -795
  2. package/browser/fonts/macos-bundle/README.md +9 -61
  3. package/dist/browser-controller.d.ts +8 -3
  4. package/dist/browser-controller.d.ts.map +1 -1
  5. package/dist/browser-controller.js +38 -248
  6. package/dist/browser-controller.js.map +1 -1
  7. package/dist/browser-plugin.d.ts +68 -106
  8. package/dist/browser-plugin.d.ts.map +1 -1
  9. package/dist/browser-plugin.js +210 -705
  10. package/dist/browser-plugin.js.map +1 -1
  11. package/dist/browser-process/align-fingerprint-config.d.ts +40 -0
  12. package/dist/browser-process/align-fingerprint-config.d.ts.map +1 -0
  13. package/dist/browser-process/align-fingerprint-config.js +79 -0
  14. package/dist/browser-process/align-fingerprint-config.js.map +1 -0
  15. package/dist/browser-process/anti-detect-config.d.ts +5 -43
  16. package/dist/browser-process/anti-detect-config.d.ts.map +1 -1
  17. package/dist/browser-process/anti-detect-config.js +212 -554
  18. package/dist/browser-process/anti-detect-config.js.map +1 -1
  19. package/dist/browser-process/browser.d.ts +19 -137
  20. package/dist/browser-process/browser.d.ts.map +1 -1
  21. package/dist/browser-process/browser.js +41 -10
  22. package/dist/browser-process/browser.js.map +1 -1
  23. package/dist/browser-process/fingerprint-config.d.ts +103 -0
  24. package/dist/browser-process/fingerprint-config.d.ts.map +1 -0
  25. package/dist/browser-process/fingerprint-config.js +245 -0
  26. package/dist/browser-process/fingerprint-config.js.map +1 -0
  27. package/dist/browser-process/gpu-family-profiles.d.ts +53 -0
  28. package/dist/browser-process/gpu-family-profiles.d.ts.map +1 -0
  29. package/dist/browser-process/gpu-family-profiles.js +395 -0
  30. package/dist/browser-process/gpu-family-profiles.js.map +1 -0
  31. package/dist/browser-process/launch-options.d.ts +195 -0
  32. package/dist/browser-process/launch-options.d.ts.map +1 -0
  33. package/dist/browser-process/launch-options.js +2 -0
  34. package/dist/browser-process/launch-options.js.map +1 -0
  35. package/dist/browser-process/port-allocator.d.ts +46 -0
  36. package/dist/browser-process/port-allocator.d.ts.map +1 -0
  37. package/dist/browser-process/port-allocator.js +171 -0
  38. package/dist/browser-process/port-allocator.js.map +1 -0
  39. package/dist/browser-process/process.d.ts +35 -0
  40. package/dist/browser-process/process.d.ts.map +1 -1
  41. package/dist/browser-process/process.js +110 -0
  42. package/dist/browser-process/process.js.map +1 -1
  43. package/dist/browser-process/signal-cleanup.d.ts +37 -0
  44. package/dist/browser-process/signal-cleanup.d.ts.map +1 -0
  45. package/dist/browser-process/signal-cleanup.js +93 -0
  46. package/dist/browser-process/signal-cleanup.js.map +1 -0
  47. package/dist/connection/cdp-reconnection.d.ts +61 -0
  48. package/dist/connection/cdp-reconnection.d.ts.map +1 -0
  49. package/dist/connection/cdp-reconnection.js +98 -0
  50. package/dist/connection/cdp-reconnection.js.map +1 -0
  51. package/dist/connection/page-session.d.ts +109 -0
  52. package/dist/connection/page-session.d.ts.map +1 -0
  53. package/dist/connection/page-session.js +257 -0
  54. package/dist/connection/page-session.js.map +1 -0
  55. package/dist/crawler.d.ts +1 -1
  56. package/dist/crawler.d.ts.map +1 -1
  57. package/dist/crawler.js +1 -1
  58. package/dist/crawler.js.map +1 -1
  59. package/dist/fingerprinting/fingerprint-injector.d.ts +6 -116
  60. package/dist/fingerprinting/fingerprint-injector.d.ts.map +1 -1
  61. package/dist/fingerprinting/fingerprint-injector.js +50 -535
  62. package/dist/fingerprinting/fingerprint-injector.js.map +1 -1
  63. package/dist/fingerprinting/fingerprint-overrides/index.d.ts +6 -2
  64. package/dist/fingerprinting/fingerprint-overrides/index.d.ts.map +1 -1
  65. package/dist/fingerprinting/fingerprint-overrides/index.js +6 -2
  66. package/dist/fingerprinting/fingerprint-overrides/index.js.map +1 -1
  67. package/dist/fingerprinting/fingerprint-overrides/navigator-override.d.ts +18 -0
  68. package/dist/fingerprinting/fingerprint-overrides/navigator-override.d.ts.map +1 -0
  69. package/dist/fingerprinting/fingerprint-overrides/navigator-override.js +136 -0
  70. package/dist/fingerprinting/fingerprint-overrides/navigator-override.js.map +1 -0
  71. package/dist/fingerprinting/fingerprint-overrides/override.d.ts +137 -0
  72. package/dist/fingerprinting/fingerprint-overrides/override.d.ts.map +1 -0
  73. package/dist/fingerprinting/fingerprint-overrides/override.js +14 -0
  74. package/dist/fingerprinting/fingerprint-overrides/override.js.map +1 -0
  75. package/dist/fingerprinting/fingerprint-overrides/registry.d.ts +27 -0
  76. package/dist/fingerprinting/fingerprint-overrides/registry.d.ts.map +1 -0
  77. package/dist/fingerprinting/fingerprint-overrides/registry.js +285 -0
  78. package/dist/fingerprinting/fingerprint-overrides/registry.js.map +1 -0
  79. package/dist/fingerprinting/fingerprint-overrides/screen-override.d.ts +16 -0
  80. package/dist/fingerprinting/fingerprint-overrides/screen-override.d.ts.map +1 -0
  81. package/dist/fingerprinting/fingerprint-overrides/screen-override.js +175 -0
  82. package/dist/fingerprinting/fingerprint-overrides/screen-override.js.map +1 -0
  83. package/dist/fingerprinting/injection-planner.d.ts +178 -0
  84. package/dist/fingerprinting/injection-planner.d.ts.map +1 -0
  85. package/dist/fingerprinting/injection-planner.js +376 -0
  86. package/dist/fingerprinting/injection-planner.js.map +1 -0
  87. package/dist/fingerprinting/profile-quality.d.ts +24 -0
  88. package/dist/fingerprinting/profile-quality.d.ts.map +1 -0
  89. package/dist/fingerprinting/profile-quality.js +165 -0
  90. package/dist/fingerprinting/profile-quality.js.map +1 -0
  91. package/dist/fingerprinting/profile-selector.d.ts +101 -0
  92. package/dist/fingerprinting/profile-selector.d.ts.map +1 -0
  93. package/dist/fingerprinting/profile-selector.js +156 -0
  94. package/dist/fingerprinting/profile-selector.js.map +1 -0
  95. package/dist/fingerprinting/ua-alignment.d.ts +51 -0
  96. package/dist/fingerprinting/ua-alignment.d.ts.map +1 -0
  97. package/dist/fingerprinting/ua-alignment.js +146 -0
  98. package/dist/fingerprinting/ua-alignment.js.map +1 -0
  99. package/dist/input/dropdown-selector.d.ts +74 -0
  100. package/dist/input/dropdown-selector.d.ts.map +1 -0
  101. package/dist/input/dropdown-selector.js +306 -0
  102. package/dist/input/dropdown-selector.js.map +1 -0
  103. package/dist/input/element-target.d.ts +117 -0
  104. package/dist/input/element-target.d.ts.map +1 -0
  105. package/dist/input/element-target.js +383 -0
  106. package/dist/input/element-target.js.map +1 -0
  107. package/dist/input/input-emulator.d.ts +85 -0
  108. package/dist/input/input-emulator.d.ts.map +1 -0
  109. package/dist/input/input-emulator.js +319 -0
  110. package/dist/input/input-emulator.js.map +1 -0
  111. package/dist/input/input-transport.d.ts +60 -0
  112. package/dist/input/input-transport.d.ts.map +1 -0
  113. package/dist/input/input-transport.js +28 -0
  114. package/dist/input/input-transport.js.map +1 -0
  115. package/dist/input/recording-transport.d.ts +32 -0
  116. package/dist/input/recording-transport.d.ts.map +1 -0
  117. package/dist/input/recording-transport.js +43 -0
  118. package/dist/input/recording-transport.js.map +1 -0
  119. package/dist/navigation/page-navigation.d.ts +67 -0
  120. package/dist/navigation/page-navigation.d.ts.map +1 -0
  121. package/dist/navigation/page-navigation.js +107 -0
  122. package/dist/navigation/page-navigation.js.map +1 -0
  123. package/dist/network/network-watch.d.ts +72 -0
  124. package/dist/network/network-watch.d.ts.map +1 -0
  125. package/dist/network/network-watch.js +143 -0
  126. package/dist/network/network-watch.js.map +1 -0
  127. package/dist/page.d.ts +59 -117
  128. package/dist/page.d.ts.map +1 -1
  129. package/dist/page.js +169 -1304
  130. package/dist/page.js.map +1 -1
  131. package/dist/session-profile.d.ts +79 -0
  132. package/dist/session-profile.d.ts.map +1 -0
  133. package/dist/session-profile.js +124 -0
  134. package/dist/session-profile.js.map +1 -0
  135. package/dist/tsconfig.build.tsbuildinfo +1 -1
  136. package/package.json +6 -4
  137. package/dist/fingerprinting/custom-fingerprint-injector.d.ts +0 -87
  138. package/dist/fingerprinting/custom-fingerprint-injector.d.ts.map +0 -1
  139. package/dist/fingerprinting/custom-fingerprint-injector.js +0 -342
  140. package/dist/fingerprinting/custom-fingerprint-injector.js.map +0 -1
  141. package/dist/launcher-wrap.d.ts +0 -10
  142. package/dist/launcher-wrap.d.ts.map +0 -1
  143. package/dist/launcher-wrap.js +0 -11
  144. package/dist/launcher-wrap.js.map +0 -1
package/README.md CHANGED
@@ -1,795 +1 @@
1
- # Crawler Documentation
2
-
3
- ## Table of Contents
4
-
5
- - [Overview](#overview)
6
- - [NPM Package](#npm-package)
7
- - [Usage](#usage)
8
- - [Example Project](#example-project)
9
- - [Internal Guide](#internal-guide)
10
- - [Examples and Configuration](#examples-and-configuration)
11
- - [Advanced Fingerprints Usage](#advanced-fingerprints-usage)
12
- - [Syncing BrowserPool and launchOptions fingerprints](#syncing-browserpool-and-launchoptions-fingerprints)
13
- - [Stealth Consistency and Network Policies](#stealth-consistency-and-network-policies)
14
- - [Configurable Fingerprint Options](#configurable-fingerprint-options)
15
- - [Usage](#usage-1)
16
- - [Available Options](#available-options)
17
- - [Core Stealth Options](#core-stealth-options)
18
- - [Fingerprint Spoofing](#fingerprint-spoofing)
19
- - [Platform Configuration](#platform-configuration)
20
- - [Additional Features](#additional-features)
21
- - [Default Behavior](#default-behavior)
22
- - [Best Practices](#best-practices)
23
- - [Performance Considerations](#performance-considerations)
24
- - [Launch Options for Network and Persistence](#launch-options-for-network-and-persistence)
25
- - [Session Bundle (save & restore browser state)](#session-bundle-save--restore-browser-state)
26
- - [When to use it](#when-to-use-it)
27
- - [Producer: capturing a bundle](#producer-capturing-a-bundle)
28
- - [Consumer: rehydrating a bundle](#consumer-rehydrating-a-bundle)
29
- - [What's inside a bundle](#whats-inside-a-bundle)
30
- - [`saveSession()` options](#savesession-options)
31
- - [`restoreSession()`](#restoresession)
32
- - [Caveats](#caveats)
33
- - [Crawler Class Documentation](#crawler-class-documentation)
34
- - [Constructor](#constructor)
35
- - [CdpPage Class Documentation](#cdppage-class-documentation)
36
- - [Constructor](#constructor-1)
37
- - [Static Methods](#static-methods)
38
- - [Public Methods](#public-methods)
39
- - [Utility Functions](#utility-functions)
40
- - [createCDPRouter](#createcdprouter)
41
- - [saveSession](#savesession)
42
- - [restoreSession](#restoresession-1)
43
-
44
- ## Overview
45
-
46
- The `Crawler` class is a custom implementation of the `BrowserCrawler` from Crawlee, designed to utilize the Chrome DevTools Protocol (CDP) for advanced antiblocking capabilities.
47
-
48
- ## NPM Package
49
-
50
- `jumpy-lion` is the official CDP crawler package. See it [here](https://www.npmjs.com/package/jumpy-lion).
51
-
52
- ### Installation
53
-
54
- ```bash
55
- npm install jumpy-lion
56
- ```
57
-
58
- A postinstall hook pulls the matching anti-detect Chromium build from a public
59
- Apify key-value store — no GitHub token, no Apify token. Works the same on
60
- your workstation, CI, and the Apify platform. If the download fails for any
61
- reason (offline, proxy, etc.) the install still succeeds; the crawler falls
62
- back to the system Chrome on `PATH`. See `browser/BUILD.md` for env-var
63
- overrides and the maintainer release flow.
64
-
65
- ---
66
- ---
67
-
68
- ## Usage
69
-
70
- ### Example Project
71
-
72
- Refer to this [GitHub repository](https://github.com/apify-projects/cdp-crawler-example) for a complete example of using the `Crawler` class.
73
-
74
- ### Internal Guide
75
-
76
- Check out the [CDP Crawler internal guide](https://www.notion.so/apify/CDP-Crawler-internal-guide-183f39950a2280be81d7c86dc048a47a?pvs=4) for a tutorial.
77
-
78
- ### Examples and Configuration
79
-
80
- For detailed examples and configuration patterns, see the [Examples README](./examples/README.md). The examples include:
81
-
82
- - **Basic Configuration**: Simple fingerprint setup for common use cases
83
- - **Comprehensive Configuration**: Full feature setup with all spoofing options
84
- - **Platform-Specific Configurations**: macOS, Windows, and Linux targeting
85
- - **Performance-Focused Configuration**: Optimized settings for speed
86
- - **Minimal Configuration**: Using intelligent defaults
87
-
88
- The examples demonstrate real-world usage patterns and best practices for different scenarios.
89
-
90
- ### Advanced Fingerprints usage
91
-
92
- To use advanced fingerprints, you need to set the `useExperimentalFingerprints` option to `true` in the `launchContext.launchOptions` of the `Crawler` constructor.
93
-
94
- ```typescript
95
- const crawler = new Crawler({
96
- launchContext: {
97
- launchOptions: {
98
- useExperimentalFingerprints: true,
99
- }
100
- },
101
- });
102
- ```
103
-
104
- ---
105
-
106
- ### Syncing BrowserPool and launchOptions fingerprints
107
-
108
- **Always keep the operating system in sync between BrowserPool fingerprints and `launchOptions.fingerprintOptions`.** A mismatch can lead to inconsistent signals (for example `navigator.platform`, User-Agent, WebGL, fonts) and reduce antibot effectiveness.
109
-
110
- - **launchOptions side**: Set `launchContext.launchOptions.fingerprintOptions.platform` to the desired platform string.
111
- - **BrowserPool side**: When `browserPoolOptions.useFingerprints` is `true`, set `browserPoolOptions.fingerprintOptions.fingerprintGeneratorOptions.operatingSystems` to the corresponding OS.
112
-
113
- Mapping guidance:
114
- - `platform: 'Win32'` ↔ `operatingSystems: ['windows']`
115
- - `platform: 'MacIntel'` ↔ `operatingSystems: ['macos']`
116
- - `platform: 'Linux x86_64'` ↔ `operatingSystems: ['linux']`
117
-
118
- Example:
119
-
120
- ```typescript
121
- const crawler = new Crawler({
122
- launchContext: {
123
- launchOptions: {
124
- useExperimentalFingerprints: true,
125
- fingerprintOptions: {
126
- platform: 'Win32', // Keep this in sync with BrowserPool OS
127
- },
128
- },
129
- },
130
- browserPoolOptions: {
131
- useFingerprints: true,
132
- fingerprintOptions: {
133
- fingerprintGeneratorOptions: {
134
- browsers: ['chrome'],
135
- operatingSystems: ['windows'], // Matches platform: 'Win32'
136
- devices: ['desktop'],
137
- },
138
- },
139
- },
140
- });
141
- ```
142
-
143
- Note: This configuration surface will be unified later. We are currently testing our custom fingerprint injector so it works even with the BrowserPool built‑in fingerprints turned off. If you prefer, you can rely solely on the custom injector by setting `browserPoolOptions.useFingerprints: false` and keeping `launchOptions.useExperimentalFingerprints: true`.
144
-
145
- ---
146
-
147
- ### Stealth Consistency and Network Policies
148
-
149
- Recent stealth hardening adds explicit consistency and policy controls:
150
-
151
- - **UA/Binary version alignment**: the injector aligns advertised `Chrome/x.y.z.w` with the actual running Chrome binary version to reduce fingerprint drift.
152
- - **WebRTC policy control**: set `fingerprintOptions.webRtcPolicy` to:
153
- - `'spoof'` (default): redacts/normalizes WebRTC leak surfaces.
154
- - `'disable'`: removes WebRTC APIs from page context.
155
- - **DNS hardening controls**: configure DoH and secure DNS through launch options (`dnsOverHttpsServer`, `secureDnsMode`).
156
- - **WebRTC transport policy flag**: configure `webrtcIpHandlingPolicy` at browser launch level.
157
- - **Persistent profile mode**: set `userDataDir` (+ optional `keepUserDataDir`) to reuse browser state across runs.
158
-
159
- ## Configurable Fingerprint Options
160
-
161
- The CDP crawler supports configurable fingerprint options that can be passed through the crawler options. This allows you to customize the fingerprint spoofing behavior for different use cases.
162
-
163
- ### Usage
164
-
165
- You can configure fingerprint options by adding them to the `launchContext.launchOptions.fingerprintOptions` in your crawler configuration:
166
-
167
- ```typescript
168
- import { Crawler } from 'cdp-crawler';
169
-
170
- const crawler = new Crawler({
171
- launchContext: {
172
- launchOptions: {
173
- fingerprintOptions: {
174
- // Enable advanced stealth features
175
- enableAdvancedStealth: true,
176
-
177
- // Bypass Runtime.enable detection
178
- bypassRuntimeEnable: true,
179
-
180
- // Humanize mouse interactions
181
- humanizeInteractions: true,
182
-
183
- // Spoof WebGL fingerprinting
184
- spoofWebGL: true,
185
-
186
- // Spoof audio context fingerprinting
187
- spoofAudioContext: true,
188
-
189
- // Add variations to client rect measurements
190
- spoofClientRects: true,
191
-
192
- // Mask automation flags
193
- maskAutomationFlags: true,
194
-
195
- // Use hardcoded defaults (true) instead of fingerprint-generator values (false)
196
- useFingerprintDefaults: true,
197
-
198
- // Platform to spoof (defaults to Win32 for better evasion)
199
- platform: 'Win32', // 'Win32' | 'MacIntel' | 'Linux x86_64'
200
-
201
- // Spoof font measurements
202
- spoofFonts: true,
203
-
204
- // Spoof performance timing
205
- spoofPerformance: true,
206
-
207
- // Spoof locale settings
208
- spoofLocale: true,
209
-
210
- // Detect timezone from proxy (useful with residential proxies)
211
- detectTimezone: true,
212
-
213
- // WebRTC policy: 'spoof' (default) or 'disable'
214
- webRtcPolicy: 'spoof',
215
- }
216
- }
217
- },
218
- // ... other crawler options
219
- });
220
- ```
221
-
222
- ### Available Options
223
-
224
- #### Core Stealth Options
225
-
226
- - **`enableAdvancedStealth`** (boolean): Enables advanced stealth features including WebGPU spoofing and platform consistency
227
- - **`bypassRuntimeEnable`** (boolean): Prevents CDP detection through Runtime.enable bypass techniques
228
- - **`humanizeInteractions`** (boolean): Generates human-like mouse movements using bezier curves
229
-
230
- #### Fingerprint Spoofing
231
-
232
- - **`spoofWebGL`** (boolean): Spoofs WebGL fingerprinting by modifying GPU adapter information
233
- - **`spoofAudioContext`** (boolean): Adds noise to audio processing to prevent audio fingerprinting
234
- - **`spoofClientRects`** (boolean): Adds small variations to getBoundingClientRect results
235
- - **`spoofFonts`** (boolean): Hides platform-specific fonts and adds font measurement variations
236
- - **`spoofPerformance`** (boolean): Modifies timing characteristics to match the target platform
237
- - **`spoofLocale`** (boolean): Ensures consistent locale formatting across all browser properties
238
-
239
- #### Platform Configuration
240
-
241
- - **`platform`** (string): Target platform to spoof. Options: `'Win32'`, `'MacIntel'`, `'Linux x86_64'`
242
- - **`useFingerprintDefaults`** (boolean): Use hardcoded defaults instead of fingerprint-generator values. When `false`, uses generated fingerprint values; when `true` (default), uses hardcoded defaults
243
-
244
- #### Additional Features
245
-
246
- - **`maskAutomationFlags`** (boolean): Masks automation-related flags in the browser
247
- - **`detectTimezone`** (boolean): Automatically detect timezone from proxy IP (useful with residential proxies)
248
- - **`webRtcPolicy`** (`'spoof' | 'disable'`): Controls whether WebRTC is spoofed or fully removed from page APIs
249
-
250
- ### Default Behavior
251
-
252
- When no fingerprint options are provided, the crawler uses intelligent defaults:
253
-
254
- - **On Apify**: Uses Apify-recommended settings optimized for the Apify environment
255
- - **On other platforms**: Uses a comprehensive set of stealth features with Windows platform spoofing
256
- - **Humanization defaults**: mouse, keyboard, and scroll humanization are enabled with safe defaults
257
- - **UA consistency**: claimed UA Chrome version is automatically aligned to the running Chrome binary
258
-
259
- ### Best Practices
260
-
261
- 1. **Use `platform: 'Win32'`** for better evasion on Linux servers (like Apify)
262
- 2. **Enable `detectTimezone: true`** when using residential proxies
263
- 3. **Use `useFingerprintDefaults: false`** to leverage fingerprint-generator's realistic values
264
- 4. **Enable `bypassRuntimeEnable: true`** for sites that detect automation
265
- 5. **Use `enableAdvancedStealth: true`** for maximum protection against fingerprinting
266
- 6. **Keep OS settings in sync** between `launchOptions.fingerprintOptions.platform` and `browserPoolOptions.fingerprintOptions.fingerprintGeneratorOptions.operatingSystems`
267
- 7. **Use `webRtcPolicy: 'disable'`** for strictest leak prevention, or `'spoof'` for compatibility-sensitive targets
268
-
269
- ### Performance Considerations
270
-
271
- - More fingerprint options enabled = slightly higher CPU usage
272
- - WebGPU spoofing may add a small delay to page loads
273
- - Humanized interactions add realistic delays to mouse movements
274
-
275
- The fingerprint options are designed to provide maximum protection while maintaining good performance for web scraping tasks.
276
-
277
- For more configuration examples and patterns, see the [Examples README](./examples/README.md).
278
-
279
- ---
280
-
281
- ## Launch Options for Network and Persistence
282
-
283
- The following options are configured in `launchContext.launchOptions`:
284
-
285
- - **`dnsOverHttpsServer`** (string): DoH endpoint template, for example `https://cloudflare-dns.com/dns-query`
286
- - **`secureDnsMode`** (`'off' | 'automatic' | 'secure'`): Chromium secure DNS mode
287
- - **`webrtcIpHandlingPolicy`** (`'default' | 'default_public_interface_only' | 'default_public_and_private_interfaces' | 'disable_non_proxied_udp'`): Browser-level WebRTC IP handling policy
288
- - **`userDataDir`** (string): Reuse a specific Chrome profile directory across runs
289
- - **`keepUserDataDir`** (boolean): Whether to keep the profile directory on close instead of cleaning it up (by default, a custom `userDataDir` is kept and a temporary directory is cleaned up)
290
-
291
- Example:
292
-
293
- ```typescript
294
- const crawler = new Crawler({
295
- launchContext: {
296
- launchOptions: {
297
- dnsOverHttpsServer: 'https://cloudflare-dns.com/dns-query',
298
- secureDnsMode: 'secure',
299
- webrtcIpHandlingPolicy: 'disable_non_proxied_udp',
300
- userDataDir: './state/chrome-profile',
301
- keepUserDataDir: true,
302
- fingerprintOptions: {
303
- webRtcPolicy: 'disable',
304
- },
305
- },
306
- },
307
- });
308
- ```
309
-
310
- ---
311
-
312
- ## Session Bundle (save & restore browser state)
313
-
314
- The session bundle lets one crawler **capture** the full live browser state — cookies, per-origin localStorage and sessionStorage, the Chrome user-data-dir (which transitively carries IndexedDB, Service Workers, and Cache Storage), the C++ `fingerprintConfig`, the captured `fingerprint`, and the resolved locale triple — into a single JSON-serializable blob. A second crawler can then **rehydrate** from that blob and come up byte-for-byte identical, so a session that was authenticated upstream stays authenticated downstream (matching what `naver-session-test` does, generalized into the framework).
315
-
316
- This is mechanism, **not** persistence: nothing is written to disk for you. You stash the bundle wherever you like (Apify KV store, S3, a local file) and pass it back when you launch the next crawler.
317
-
318
- ### When to use it
319
-
320
- - An Apify actor logs into a site, then hands the session to a second actor that does the actual scraping.
321
- - A pool of long-running actors needs to checkpoint browser state between restarts.
322
- - You need two crawls to look like the *exact same browser* to a bot detector (Cloudflare, DataDome, Naver) — same UA, same UA-CH, same WebGL renderer, same screen, same canvas/audio noise seed, same cookies, same localStorage, same Service Worker state.
323
-
324
- ### Producer: capturing a bundle
325
-
326
- ```typescript
327
- import CDPCrawler, { saveSession, type SessionBundle } from 'cdp-crawler';
328
- import { Actor } from 'apify';
329
-
330
- let bundle: SessionBundle | undefined;
331
-
332
- const producer = new CDPCrawler({
333
- launchContext: {
334
- launchOptions: {
335
- useNativeStealth: true,
336
- fingerprintOptions: { platform: 'MacIntel' },
337
- },
338
- },
339
- requestHandler: async ({ page }) => {
340
- await page.goto('https://target.example.com/login');
341
- // … perform login, solve captcha, etc. …
342
-
343
- bundle = await saveSession(page);
344
- },
345
- });
346
-
347
- await producer.run(['https://target.example.com/login']);
348
- await Actor.setValue('session', bundle); // ship to KV store for the next actor
349
- ```
350
-
351
- ### Consumer: rehydrating a bundle
352
-
353
- Pass the bundle as `launchOptions.sessionBundle` on the new crawler. The plugin extracts the user-data-dir into a temp directory, feeds the captured `fingerprintConfig` straight to the C++ patches (regeneration is skipped), pins `useNonApifyFingerprints: false`, and replays cookies + per-origin storage on every new page **before** any navigation runs.
354
-
355
- ```typescript
356
- import CDPCrawler, { type SessionBundle } from 'cdp-crawler';
357
- import { Actor } from 'apify';
358
-
359
- const bundle = await Actor.getValue<SessionBundle>('session');
360
- if (!bundle) throw new Error('No session bundle available');
361
-
362
- const consumer = new CDPCrawler({
363
- launchContext: {
364
- launchOptions: {
365
- useNativeStealth: true,
366
- sessionBundle: bundle, // ← the only new option
367
- },
368
- },
369
- requestHandler: async ({ page }) => {
370
- // The first page already has the producer's cookies, localStorage,
371
- // sessionStorage, IndexedDB, Service Worker registrations, etc.
372
- await page.goto('https://target.example.com/account'); // already logged in
373
- },
374
- });
375
-
376
- await consumer.run(['https://target.example.com/account']);
377
- ```
378
-
379
- ### What's inside a bundle
380
-
381
- ```typescript
382
- interface SessionBundle {
383
- schemaVersion: 1;
384
- createdAt: string;
385
- createdBy?: { package: 'cdp-crawler'; version: string };
386
-
387
- cookies: SerializedCookie[]; // Network.Cookie-shaped
388
- localStorage: Record<origin, Record<key, value>>;
389
- sessionStorage: Record<origin, Record<key, value>>;
390
-
391
- userDataDir: {
392
- encoding: 'base64+gzip+tar';
393
- bytes: string; // the whole Chrome profile
394
- sizeBytes: number;
395
- capturedFiles: number;
396
- } | null;
397
-
398
- fingerprintConfig: FingerprintConfigJson; // C++ overrides JSON, byte-for-byte
399
- fingerprint: BrowserFingerprintWithHeaders; // Crawlee-shaped fp object
400
- fingerprintInput: { // The fingerprintOptions inputs
401
- locale?: string; languages?: string; timezone?: string;
402
- platform?: string; seedKey?: string;
403
- useNonApifyFingerprints: false; // pinned on restore
404
- [key: string]: unknown;
405
- };
406
- resolvedLocale: { locale: string; languages: string; timezone: string };
407
-
408
- browserProfile: { // diagnostic snapshot
409
- userAgent: string; platform: string; language: string;
410
- screenWidth: number; screenHeight: number; devicePixelRatio: number;
411
- webglRenderer: string; webglVendor: string;
412
- };
413
-
414
- proxyMeta?: { proxyUrl?: string; sessionId?: string; countryCode?: string };
415
- }
416
- ```
417
-
418
- Bundles are versioned via `schemaVersion`. Loading a bundle with an unrecognized version throws an explicit error rather than misbehaving silently. A helper `assertValidBundle(value)` is exported for callers that want to validate before passing the bundle around.
419
-
420
- ### `saveSession()` options
421
-
422
- ```typescript
423
- await saveSession(page, {
424
- includeUserDataDir: true, // default true; set false for a JSON-only bundle (~50 KB)
425
- flushCookies: true, // default true → calls Storage.flushBrowserCookies first
426
- userDataDirPath: undefined, // override; defaults to the path used at launch
427
- cookieUrls: undefined, // forwarded to Network.getCookies as a fallback
428
- proxyMeta: { // optional; stamped for inspection, NOT replayed
429
- proxyUrl, sessionId, countryCode,
430
- },
431
- });
432
- ```
433
-
434
- The proxy is intentionally not rebuilt on restore — pass your own `proxyUrl` to the consumer crawler so cookies stay tied to the same exit IP.
435
-
436
- ### `restoreSession()`
437
-
438
- `restoreSession(page, bundle)` is the manual escape hatch for advanced users who open additional pages (or targets) inside a single crawl and want them to share the bundle's cookies + storage. Most users do not need it — the `launchOptions.sessionBundle` option is the canonical path.
439
-
440
- ```typescript
441
- import { restoreSession } from 'cdp-crawler';
442
-
443
- await restoreSession(page, bundle); // sets cookies + registers per-origin storage replay
444
- ```
445
-
446
- Note: `restoreSession` cannot swap the user-data-dir or fingerprintConfig on a running browser — those are launch-time inputs and must travel through `launchOptions.sessionBundle`.
447
-
448
- ### Caveats
449
-
450
- - **User-data-dir size**: a real Chrome profile can be several MB. Inlined as base64-gzipped-tar inside the JSON, this can push the bundle past the Apify KV-store 9 MB record limit. Use `includeUserDataDir: false` if you only need cookies + storage and can live without IndexedDB / Service Workers / Cache Storage.
451
- - **Locks**: Singleton* sentinels, GPU/Shader/Code caches, and Crashpad metrics are deliberately stripped from the tar — they are process-bound or freely regenerable and would otherwise break restore on a new machine.
452
- - **Capture timing**: the data-dir tar is taken while Chrome is still running. To avoid half-written LevelDB files, prefer to call `saveSession` after the page has gone idle (`waitForLoadState`-style settling, or right before `crawler.teardown()`). `saveSession` automatically calls `Storage.flushBrowserCookies` first.
453
- - **Cross-version replay**: bundles are tagged with `createdBy.version`; loading a bundle produced by a meaningfully different `cdp-crawler` version may warn or fail depending on what changed in `fingerprintConfig`'s shape. The plan is to migrate forward, not to support arbitrarily old bundles.
454
- - **Multi-origin localStorage**: `saveSession` captures the current page's origin only. If you need storage from multiple origins, navigate to each one before calling `saveSession`, or call `saveSession` per page and merge the bundles client-side.
455
-
456
- ---
457
-
458
- ## `Crawler` Class Documentation
459
-
460
- ### Constructor
461
-
462
- #### `constructor(options: BrowserCrawlerOptions = {}, override readonly config = Configuration.getGlobalConfig())`
463
-
464
- Initializes the `Crawler` instance with default and provided options.
465
-
466
- - **Parameters**:
467
-
468
- - `options` (BrowserCrawlerOptions): Configuration options for the crawler.
469
- - `launchContext`: Specifies browser launch parameters.
470
- - Default: `{}`
471
- - `headless`: Runs the browser in headless mode.
472
- - Default: `false`
473
- - `browserPoolOptions`: Configuration for managing browser instances.
474
- - `config` (Configuration): Global Crawlee configuration.
475
- - Default: `Configuration.getGlobalConfig()`
476
-
477
- - **Default Behavior**:
478
- - Throws an error if `launchContext.proxyUrl` is provided. Use `proxyConfiguration` instead.
479
- - Throws an error if `browserPoolOptions.browserPlugins` is set. Use `launchContext.launcher` instead.
480
-
481
- ---
482
-
483
- ## `CdpPage` Class Documentation
484
-
485
- ### Constructor
486
-
487
- #### `constructor(client: CDP.Client)`
488
-
489
- Initializes the `CdpPage` instance with a CDP client.
490
-
491
- - **Parameters**:
492
-
493
- - `client` (CDP.Client): The Chrome DevTools Protocol client.
494
-
495
- - **Emitted Events**:
496
- - `PAGE_CREATED`: Triggered upon the creation of the page.
497
-
498
- ### Static Methods
499
-
500
- #### `static async create(client: CDP.Client): Promise<CdpPage>`
501
-
502
- Creates and initializes a new `CdpPage` instance.
503
-
504
- - **Parameters**:
505
-
506
- - `client` (CDP.Client): The CDP client.
507
-
508
- - **Returns**:
509
- - `Promise<CdpPage>`: A promise resolving to the new `CdpPage` instance.
510
-
511
- ---
512
-
513
- ### Public Methods
514
-
515
- #### `async url(): Promise<string>`
516
- Gets the current URL of the page.
517
-
518
- - **Returns**:
519
- - `Promise<string>`: The current URL.
520
-
521
- #### `async goto(url: string, options?: GotoOptions): Promise<void>`
522
- Navigates to a specified URL.
523
-
524
- - **Parameters**:
525
- - `url` (string): The URL to navigate to.
526
- - `options` (GotoOptions): Navigation options, including:
527
- - `waitUntil`: When to consider navigation finished (`domcontentloaded` or `load`).
528
- - `timeout`: Maximum time to wait for navigation in milliseconds.
529
-
530
- #### `async click(selector: string): Promise<void>`
531
- Simulates a click on an element identified by the selector.
532
-
533
- - **Parameters**:
534
- - `selector` (string): CSS selector of the element.
535
-
536
- #### `async type(selector: string, text: string, options?: { delay?: number }): Promise<void>`
537
- Types text into an input field.
538
-
539
- - **Parameters**:
540
- - `selector` (string): CSS selector of the element.
541
- - `text` (string): Text to type.
542
- - `options` (object): Options for typing:
543
- - `delay`: Time in milliseconds between key presses.
544
-
545
- #### `async screenshot(options?: { path?: string; fullPage?: boolean; format?: 'png' | 'jpeg' }): Promise<Buffer>`
546
- Takes a screenshot of the page, with support for PNG and JPEG formats.
547
-
548
- - **Parameters**:
549
- - `options` (object): Screenshot options:
550
- - `path`: File path to save the screenshot.
551
- - `fullPage`: Capture the entire page.
552
- - `format`: Image format, either `'png'` (default) or `'jpeg'`.
553
-
554
- - **Returns**:
555
- - `Promise<Buffer>`: The screenshot as a buffer.
556
-
557
- #### `async content(): Promise<string>`
558
- Gets the HTML content of the page.
559
-
560
- - **Returns**:
561
- - `Promise<string>`: The page's HTML.
562
-
563
- #### `async toCheerio(): Promise<cheerio.CheerioAPI>`
564
- Converts the current page content to a Cheerio instance for DOM manipulation.
565
-
566
- - **Returns**:
567
- - `Promise<cheerio.CheerioAPI>`: A Cheerio API instance.
568
-
569
- #### `async setViewport(viewport: Viewport): Promise<void>`
570
- Sets the page's viewport dimensions.
571
-
572
- - **Parameters**:
573
- - `viewport` (Viewport): Object with `width` and `height` properties.
574
-
575
- #### `async setUserAgent(userAgent: string): Promise<void>`
576
- Overrides the user-agent string.
577
-
578
- - **Parameters**:
579
- - `userAgent` (string): The new user-agent string.
580
-
581
- #### `async setExtraHTTPHeaders(headers: Record<string, string>): Promise<void>`
582
- Sets additional HTTP headers for requests.
583
-
584
- - **Parameters**:
585
- - `headers` (Record<string, string>): Key-value pairs of headers.
586
-
587
- #### `async waitForResponse(urlPart: string, statusCode?: number, timeout?: number): Promise<any>`
588
- Waits for a specific network response.
589
-
590
- - **Parameters**:
591
- - `urlPart` (string): Part of the URL to match.
592
- - `statusCode` (number): Expected HTTP status code.
593
- - `timeout` (number): Maximum wait time in milliseconds.
594
-
595
- - **Returns**:
596
- - `Promise<any>`: The response.
597
-
598
- #### `async setCookies(cookies: Cookie[]): Promise<void>`
599
- Sets cookies for the page.
600
-
601
- - **Parameters**:
602
- - `cookies` (Cookie[]): Array of cookies to set.
603
-
604
- #### `async getCookies(urls?: string[]): Promise<Cookie[]>`
605
- Retrieves cookies for the given URLs or all cookies if no URLs are specified.
606
-
607
- - **Parameters**:
608
- - `urls` (string[]): Optional array of URLs.
609
-
610
- - **Returns**:
611
- - `Promise<Cookie[]>`: Array of cookies.
612
-
613
- #### `async waitForSelector(selector: string, options?: { timeout?: number }): Promise<void>`
614
- Waits for an element matching the selector to appear.
615
-
616
- - **Parameters**:
617
- - `selector` (string): CSS selector of the element.
618
- - `options` (object): Options for waiting:
619
- - `timeout`: Maximum wait time in milliseconds.
620
-
621
- #### `async elementExists(selector: string): Promise<boolean>`
622
- Checks if an element exists.
623
-
624
- - **Parameters**:
625
- - `selector` (string): CSS selector of the element.
626
-
627
- - **Returns**:
628
- - `Promise<boolean>`: `true` if the element exists, `false` otherwise.
629
-
630
- #### `async getTextContent(selector: string): Promise<string>`
631
- Gets the text content of an element.
632
-
633
- - **Parameters**:
634
- - `selector` (string): CSS selector of the element.
635
-
636
- - **Returns**:
637
- - `Promise<string>`: The element's text content.
638
-
639
- #### `async getHref(selector: string): Promise<string>`
640
- Gets the `href` attribute of an anchor element.
641
-
642
- - **Parameters**:
643
- - `selector` (string): CSS selector of the anchor element.
644
-
645
- - **Returns**:
646
- - `Promise<string>`: The `href` value.
647
-
648
- #### `async reload(options?: GotoOptions): Promise<void>`
649
- Reloads the current page.
650
-
651
- - **Parameters**:
652
- - `options` (GotoOptions): Navigation options, including:
653
- - `waitUntil`: When to consider reload finished (`domcontentloaded` or `load`).
654
- - `timeout`: Maximum time to wait for reload in milliseconds.
655
-
656
- #### `async deleteInput(selector: string): Promise<void>`
657
- Clears the value of an input field specified by the selector.
658
-
659
- - **Parameters**:
660
- - `selector` (string): CSS selector of the input element.
661
-
662
- #### `async isVisible(selector: string): Promise<boolean>`
663
- Checks if the element specified by selector is visible (not `display: none` and not `visibility: hidden`).
664
- The selector should target the root element that can be hidden; otherwise, this method may return a false positive.
665
-
666
- - **Parameters**:
667
- - `selector` (string): CSS selector of the element.
668
- - **Returns**:
669
- - `Promise<boolean>`: `true` if the element is visible, `false` otherwise.
670
-
671
- #### `async selectOption(dropdownSelector: string, optionSelector: string | string[], options?: SelectOptionOptions): Promise<void>`
672
- Selects one or more options from a select element or dropdown with intelligent automatic handling.
673
-
674
- **Key Features**:
675
- - **Automatic Detection**: Distinguishes between HTML `<select>` elements and custom dropdowns
676
- - **Smart Trigger Discovery**: For custom dropdowns, automatically finds and clicks triggers using multiple strategies
677
- - **Virtualized List Support**: Handles large dropdown lists with intelligent scrolling
678
- - **No Manual Configuration**: No need to specify separate trigger and container selectors
679
-
680
- - **Parameters**:
681
- - `dropdownSelector` (string): CSS selector for the select element or dropdown container.
682
- - `optionSelector` (string | string[]): CSS selector(s) for the option(s) to select. Can be a single selector or array of selectors.
683
- - `options` (SelectOptionOptions): Optional configuration object with the following properties:
684
- - `timeout` (number): Maximum wait time in milliseconds. Default: 30000.
685
- - `force` (boolean): Bypass visibility and disabled checks. Default: false.
686
- - `waitForOptions` (boolean): Wait for dropdown options to load. Default: true.
687
- - `maxScrollAttempts` (number): Maximum scroll attempts for virtualized dropdowns. Default: 10.
688
-
689
- #### `async waitForElementPositionToStabilize(selector: string, timeout?: number, checkInterval?: number, stabilityThreshold?: number, tolerance?: number): Promise<void>`
690
- Waits for an element's position to stabilize by polling its bounding box. Useful before interactions after scrolling/animations.
691
-
692
- - **Parameters**:
693
- - `selector` (string): Target element selector
694
- - `timeout` (number): Max time to wait. Default: 2000
695
- - `checkInterval` (number): Polling interval. Default: 100
696
- - `stabilityThreshold` (number): Consecutive stable checks required. Default: 3
697
- - `tolerance` (number): Max pixel delta to consider stable. Default: 1
698
-
699
- - **Usage Examples** (`selectOption`):
700
- ```typescript
701
- // Regular HTML select element - works directly
702
- await page.selectOption('select#country', 'option[value="us"]');
703
-
704
- // Multiple selection in HTML select
705
- await page.selectOption('select#languages', ['option[value="en"]', 'option[value="es"]']);
706
-
707
- // Custom dropdown - automatically finds and clicks trigger
708
- await page.selectOption('#dropdown-menu', '[data-value="premium"]');
709
-
710
- // Virtualized dropdown - automatically scrolls to find option
711
- await page.selectOption('#large-dropdown', '[data-item="item-500"]');
712
-
713
- // With custom configuration
714
- await page.selectOption(
715
- '#complex-dropdown',
716
- '.option[data-category="business"]',
717
- {
718
- timeout: 10000,
719
- maxScrollAttempts: 15
720
- }
721
- );
722
-
723
- // Bootstrap/Material-UI dropdowns work automatically
724
- await page.selectOption('.MuiSelect-menu', '[data-value="option1"]');
725
- await page.selectOption('.dropdown-menu', '.dropdown-item[data-value="choice2"]');
726
- ```
727
-
728
- - **How Trigger Detection Works**:
729
- `selectOption` automatically detects dropdown triggers using multiple strategies:
730
- 1. **Accessibility patterns**: `[aria-haspopup]`, `[role="button"]`
731
- 2. **Common class names**: `.dropdown-trigger`, `.select-trigger`
732
- 3. **Sibling elements**: Previous sibling of the dropdown container
733
- 4. **ID pattern matching**: `#menu-id` → `#trigger-id`, `#dropdown-menu` → `#dropdown-trigger`
734
-
735
- - **Migration from Previous API**:
736
- ```typescript
737
- // OLD - Complex API with manual configuration
738
- await page.selectOption('#trigger', '[data-value="item"]', {
739
- dropdownSelector: '#menu',
740
- optionSelector: '.dropdown-item'
741
- });
742
-
743
- // NEW - Simplified API with automatic detection
744
- await page.selectOption('#menu', '[data-value="item"]');
745
- ```
746
-
747
- ---
748
-
749
- ## Utility Functions
750
-
751
- ### `createCDPRouter`
752
-
753
- #### `export function createCDPRouter<Context extends CDPCrawlingContext = CDPCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): Router<Context>`
754
-
755
- Creates a custom router for handling crawling routes using CDP.
756
-
757
- - **Parameters**:
758
- - `routes` (RouterRoutes<Context, UserData>): Optional routes for defining crawl logic.
759
-
760
- - **Returns**:
761
- - `Router<Context>`: A configured router instance.
762
-
763
- ---
764
-
765
- ### `saveSession`
766
-
767
- #### `export async function saveSession(page: CdpPage, options?: SaveSessionOptions): Promise<SessionBundle>`
768
-
769
- Captures the full browser state of a running page — cookies, per-origin web storage (only the current page's origin is captured), fingerprintConfig, fingerprint object, resolved locale, and the Chrome user-data-dir (inlined as base64-gzipped-tar) — into a single JSON-serializable bundle.
770
-
771
- - **Parameters**:
772
- - `page` (CdpPage): a page produced by a `CDPCrawler` instance.
773
- - `options` (SaveSessionOptions): optional knobs:
774
- - `includeUserDataDir` (boolean, default `true`): pack and inline the Chrome user-data-dir.
775
- - `flushCookies` (boolean, default `true`): call `Storage.flushBrowserCookies` before snapshotting cookies.
776
- - `userDataDirPath` (string): override the user-data-dir path; defaults to the one used at launch.
777
- - `cookieUrls` (string[]): forwarded to `Network.getCookies` when `Storage.getCookies` is unavailable.
778
- - `proxyMeta` ({ proxyUrl?, sessionId?, countryCode? }): stamped on the bundle for inspection only.
779
-
780
- - **Returns**:
781
- - `Promise<SessionBundle>`: a fully populated, JSON-serializable bundle.
782
-
783
- See [Session Bundle](#session-bundle-save--restore-browser-state) for usage patterns.
784
-
785
- ### `restoreSession`
786
-
787
- #### `export async function restoreSession(page: CdpPage, bundle: SessionBundle): Promise<void>`
788
-
789
- Manually applies a bundle's cookies and per-origin web storage to an arbitrary page. Use this only for advanced flows (e.g. opening extra targets mid-crawl). The canonical path is `launchOptions.sessionBundle`, which also restores the user-data-dir and fingerprintConfig — `restoreSession` cannot swap those on a running browser.
790
-
791
- - **Parameters**:
792
- - `page` (CdpPage): the target page.
793
- - `bundle` (SessionBundle): a bundle produced by `saveSession`.
794
-
795
- ---
1
+ # jumpy-lion