jumpy-lion 0.1.6-beta.3 → 0.1.6-beta.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/README.md +188 -0
  2. package/browser/fonts/macos-bundle/40-macos-aliases.conf +497 -0
  3. package/browser/fonts/macos-bundle/README.md +76 -0
  4. package/browser/fonts/macos-bundle/install.sh +83 -0
  5. package/dist/browser-controller.d.ts +13 -0
  6. package/dist/browser-controller.d.ts.map +1 -1
  7. package/dist/browser-controller.js +102 -48
  8. package/dist/browser-controller.js.map +1 -1
  9. package/dist/browser-plugin.d.ts +10 -1
  10. package/dist/browser-plugin.d.ts.map +1 -1
  11. package/dist/browser-plugin.js +465 -9
  12. package/dist/browser-plugin.js.map +1 -1
  13. package/dist/browser-process/anti-detect-config.d.ts +168 -1
  14. package/dist/browser-process/anti-detect-config.d.ts.map +1 -1
  15. package/dist/browser-process/anti-detect-config.js +617 -32
  16. package/dist/browser-process/anti-detect-config.js.map +1 -1
  17. package/dist/browser-process/browser.d.ts +70 -4
  18. package/dist/browser-process/browser.d.ts.map +1 -1
  19. package/dist/browser-process/browser.js +349 -76
  20. package/dist/browser-process/browser.js.map +1 -1
  21. package/dist/browser-process/get-chrome-executable.d.ts +6 -0
  22. package/dist/browser-process/get-chrome-executable.d.ts.map +1 -1
  23. package/dist/browser-process/get-chrome-executable.js +23 -0
  24. package/dist/browser-process/get-chrome-executable.js.map +1 -1
  25. package/dist/browser-process/index.d.ts +1 -1
  26. package/dist/browser-process/index.d.ts.map +1 -1
  27. package/dist/browser-process/index.js +1 -1
  28. package/dist/browser-process/index.js.map +1 -1
  29. package/dist/crawler.d.ts +29 -0
  30. package/dist/crawler.d.ts.map +1 -1
  31. package/dist/crawler.js +96 -6
  32. package/dist/crawler.js.map +1 -1
  33. package/dist/fingerprinting/fingerprint-injector.d.ts.map +1 -1
  34. package/dist/fingerprinting/fingerprint-injector.js +75 -39
  35. package/dist/fingerprinting/fingerprint-injector.js.map +1 -1
  36. package/dist/fingerprinting/fingerprint-overrides/platform-consistency.d.ts.map +1 -1
  37. package/dist/fingerprinting/fingerprint-overrides/platform-consistency.js +42 -11
  38. package/dist/fingerprinting/fingerprint-overrides/platform-consistency.js.map +1 -1
  39. package/dist/fingerprinting/fingerprint-overrides/stealth-script.d.ts.map +1 -1
  40. package/dist/fingerprinting/fingerprint-overrides/stealth-script.js +72 -6
  41. package/dist/fingerprinting/fingerprint-overrides/stealth-script.js.map +1 -1
  42. package/dist/fingerprinting/fingerprint-overrides/webgl-spoofing.d.ts.map +1 -1
  43. package/dist/fingerprinting/fingerprint-overrides/webgl-spoofing.js +23 -4
  44. package/dist/fingerprinting/fingerprint-overrides/webgl-spoofing.js.map +1 -1
  45. package/dist/fingerprinting/locale-resolver.d.ts +7 -0
  46. package/dist/fingerprinting/locale-resolver.d.ts.map +1 -1
  47. package/dist/fingerprinting/locale-resolver.js +24 -1
  48. package/dist/fingerprinting/locale-resolver.js.map +1 -1
  49. package/dist/fingerprinting/non-apify-fingerprint-generator.d.ts +23 -11
  50. package/dist/fingerprinting/non-apify-fingerprint-generator.d.ts.map +1 -1
  51. package/dist/fingerprinting/non-apify-fingerprint-generator.js +53 -15
  52. package/dist/fingerprinting/non-apify-fingerprint-generator.js.map +1 -1
  53. package/dist/fingerprinting/non-apify-profiles.json +52186 -0
  54. package/dist/fingerprinting/os-consistency.d.ts +31 -0
  55. package/dist/fingerprinting/os-consistency.d.ts.map +1 -0
  56. package/dist/fingerprinting/os-consistency.js +50 -0
  57. package/dist/fingerprinting/os-consistency.js.map +1 -0
  58. package/dist/index.d.ts +5 -0
  59. package/dist/index.d.ts.map +1 -1
  60. package/dist/index.js +6 -0
  61. package/dist/index.js.map +1 -1
  62. package/dist/page.d.ts +45 -0
  63. package/dist/page.d.ts.map +1 -1
  64. package/dist/page.js +230 -28
  65. package/dist/page.js.map +1 -1
  66. package/dist/session/inject-hook.d.ts +24 -0
  67. package/dist/session/inject-hook.d.ts.map +1 -0
  68. package/dist/session/inject-hook.js +80 -0
  69. package/dist/session/inject-hook.js.map +1 -0
  70. package/dist/session/save.d.ts +21 -0
  71. package/dist/session/save.d.ts.map +1 -0
  72. package/dist/session/save.js +163 -0
  73. package/dist/session/save.js.map +1 -0
  74. package/dist/session/tar-data-dir.d.ts +13 -0
  75. package/dist/session/tar-data-dir.d.ts.map +1 -0
  76. package/dist/session/tar-data-dir.js +107 -0
  77. package/dist/session/tar-data-dir.js.map +1 -0
  78. package/dist/session/types.d.ts +80 -0
  79. package/dist/session/types.d.ts.map +1 -0
  80. package/dist/session/types.js +29 -0
  81. package/dist/session/types.js.map +1 -0
  82. package/dist/tsconfig.build.tsbuildinfo +1 -1
  83. package/package.json +12 -7
  84. package/scripts/postinstall.cjs +58 -0
package/README.md CHANGED
@@ -22,6 +22,14 @@
22
22
  - [Best Practices](#best-practices)
23
23
  - [Performance Considerations](#performance-considerations)
24
24
  - [Launch Options for Network and Persistence](#launch-options-for-network-and-persistence)
25
+ - [Session Bundle (save & restore browser state)](#session-bundle-save--restore-browser-state)
26
+ - [When to use it](#when-to-use-it)
27
+ - [Producer: capturing a bundle](#producer-capturing-a-bundle)
28
+ - [Consumer: rehydrating a bundle](#consumer-rehydrating-a-bundle)
29
+ - [What's inside a bundle](#whats-inside-a-bundle)
30
+ - [`saveSession()` options](#savesession-options)
31
+ - [`restoreSession()`](#restoresession)
32
+ - [Caveats](#caveats)
25
33
  - [Crawler Class Documentation](#crawler-class-documentation)
26
34
  - [Constructor](#constructor)
27
35
  - [CdpPage Class Documentation](#cdppage-class-documentation)
@@ -30,6 +38,8 @@
30
38
  - [Public Methods](#public-methods)
31
39
  - [Utility Functions](#utility-functions)
32
40
  - [createCDPRouter](#createcdprouter)
41
+ - [saveSession](#savesession)
42
+ - [restoreSession](#restoresession-1)
33
43
 
34
44
  ## Overview
35
45
 
@@ -299,6 +309,152 @@ const crawler = new Crawler({
299
309
 
300
310
  ---
301
311
 
312
+ ## Session Bundle (save & restore browser state)
313
+
314
+ The session bundle lets one crawler **capture** the full live browser state — cookies, per-origin localStorage and sessionStorage, the Chrome user-data-dir (which transitively carries IndexedDB, Service Workers, and Cache Storage), the C++ `fingerprintConfig`, the captured `fingerprint`, and the resolved locale triple — into a single JSON-serializable blob. A second crawler can then **rehydrate** from that blob and come up byte-for-byte identical, so a session that was authenticated upstream stays authenticated downstream (matching what `naver-session-test` does, generalized into the framework).
315
+
316
+ This is a mechanism, **not** a persistence layer: nothing is written to disk for you. You stash the bundle wherever you like (Apify KV store, S3, a local file) and pass it back when you launch the next crawler.
317
+
318
+ ### When to use it
319
+
320
+ - An Apify actor logs into a site, then hands the session to a second actor that does the actual scraping.
321
+ - A pool of long-running actors needs to checkpoint browser state between restarts.
322
+ - You need two crawls to look like the *exact same browser* to a bot detector (Cloudflare, DataDome, Naver) — same UA, same UA-CH, same WebGL renderer, same screen, same canvas/audio noise seed, same cookies, same localStorage, same Service Worker state.
323
+
324
+ ### Producer: capturing a bundle
325
+
326
+ ```typescript
327
+ import CDPCrawler, { saveSession, type SessionBundle } from 'cdp-crawler';
328
+ import { Actor } from 'apify';
329
+
330
+ let bundle: SessionBundle | undefined;
331
+
332
+ const producer = new CDPCrawler({
333
+ launchContext: {
334
+ launchOptions: {
335
+ useNativeStealth: true,
336
+ fingerprintOptions: { platform: 'MacIntel' },
337
+ },
338
+ },
339
+ requestHandler: async ({ page }) => {
340
+ await page.goto('https://target.example.com/login');
341
+ // … perform login, solve captcha, etc. …
342
+
343
+ bundle = await saveSession(page);
344
+ },
345
+ });
346
+
347
+ await producer.run(['https://target.example.com/login']);
348
+ await Actor.setValue('session', bundle); // ship to KV store for the next actor
349
+ ```
350
+
351
+ ### Consumer: rehydrating a bundle
352
+
353
+ Pass the bundle as `launchOptions.sessionBundle` on the new crawler. The plugin extracts the user-data-dir into a temp directory, feeds the captured `fingerprintConfig` straight to the C++ patches (regeneration is skipped), pins `useNonApifyFingerprints: false`, and replays cookies + per-origin storage on every new page **before** any navigation runs.
354
+
355
+ ```typescript
356
+ import CDPCrawler, { type SessionBundle } from 'cdp-crawler';
357
+ import { Actor } from 'apify';
358
+
359
+ const bundle = await Actor.getValue<SessionBundle>('session');
360
+ if (!bundle) throw new Error('No session bundle available');
361
+
362
+ const consumer = new CDPCrawler({
363
+ launchContext: {
364
+ launchOptions: {
365
+ useNativeStealth: true,
366
+ sessionBundle: bundle, // ← the only new option
367
+ },
368
+ },
369
+ requestHandler: async ({ page }) => {
370
+ // The first page already has the producer's cookies, localStorage,
371
+ // sessionStorage, IndexedDB, Service Worker registrations, etc.
372
+ await page.goto('https://target.example.com/account'); // already logged in
373
+ },
374
+ });
375
+
376
+ await consumer.run(['https://target.example.com/account']);
377
+ ```
378
+
379
+ ### What's inside a bundle
380
+
381
+ ```typescript
382
+ interface SessionBundle {
383
+ schemaVersion: 1;
384
+ createdAt: string;
385
+ createdBy?: { package: 'cdp-crawler'; version: string };
386
+
387
+ cookies: SerializedCookie[]; // Network.Cookie-shaped
388
+ localStorage: Record<origin, Record<key, value>>;
389
+ sessionStorage: Record<origin, Record<key, value>>;
390
+
391
+ userDataDir: {
392
+ encoding: 'base64+gzip+tar';
393
+ bytes: string; // the whole Chrome profile
394
+ sizeBytes: number;
395
+ capturedFiles: number;
396
+ } | null;
397
+
398
+ fingerprintConfig: FingerprintConfigJson; // C++ overrides JSON, byte-for-byte
399
+ fingerprint: BrowserFingerprintWithHeaders; // Crawlee-shaped fp object
400
+ fingerprintInput: { // The fingerprintOptions inputs
401
+ locale?: string; languages?: string; timezone?: string;
402
+ platform?: string; seedKey?: string;
403
+ useNonApifyFingerprints: false; // pinned on restore
404
+ [key: string]: unknown;
405
+ };
406
+ resolvedLocale: { locale: string; languages: string; timezone: string };
407
+
408
+ browserProfile: { // diagnostic snapshot
409
+ userAgent: string; platform: string; language: string;
410
+ screenWidth: number; screenHeight: number; devicePixelRatio: number;
411
+ webglRenderer: string; webglVendor: string;
412
+ };
413
+
414
+ proxyMeta?: { proxyUrl?: string; sessionId?: string; countryCode?: string };
415
+ }
416
+ ```
417
+
418
+ Bundles are versioned via `schemaVersion`. Loading a bundle with an unrecognized version throws an explicit error rather than misbehaving silently. A helper `assertValidBundle(value)` is exported for callers that want to validate before passing the bundle around.
419
+
420
+ ### `saveSession()` options
421
+
422
+ ```typescript
423
+ await saveSession(page, {
424
+ includeUserDataDir: true, // default true; set false for a JSON-only bundle (~50 KB)
425
+ flushCookies: true, // default true → calls Storage.flushBrowserCookies first
426
+ userDataDirPath: undefined, // override; defaults to the path used at launch
427
+ cookieUrls: undefined, // forwarded to Network.getCookies as a fallback
428
+ proxyMeta: { // optional; stamped for inspection, NOT replayed
429
+ proxyUrl, sessionId, countryCode,
430
+ },
431
+ });
432
+ ```
433
+
434
+ The proxy is intentionally not rebuilt on restore — pass your own `proxyUrl` to the consumer crawler so cookies stay tied to the same exit IP.
435
+
436
+ ### `restoreSession()`
437
+
438
+ `restoreSession(page, bundle)` is the manual escape hatch for advanced users who open additional pages (or targets) inside a single crawl and want them to share the bundle's cookies + storage. Most users do not need it — the `launchOptions.sessionBundle` option is the canonical path.
439
+
440
+ ```typescript
441
+ import { restoreSession } from 'cdp-crawler';
442
+
443
+ await restoreSession(page, bundle); // sets cookies + registers per-origin storage replay
444
+ ```
445
+
446
+ Note: `restoreSession` cannot swap the user-data-dir or fingerprintConfig on a running browser — those are launch-time inputs and must travel through `launchOptions.sessionBundle`.
447
+
448
+ ### Caveats
449
+
450
+ - **User-data-dir size**: a real Chrome profile can be several MB. Inlined as base64-gzipped-tar inside the JSON, this can push the bundle past the Apify KV-store 9 MB record limit. Use `includeUserDataDir: false` if you only need cookies + storage and can live without IndexedDB / Service Workers / Cache Storage.
451
+ - **Locks**: Singleton* sentinels, GPU/Shader/Code caches, and Crashpad metrics are deliberately stripped from the tar — they are process-bound or freely regenerable and would otherwise break restore on a new machine.
452
+ - **Capture timing**: the data-dir tar is taken while Chrome is still running. To avoid half-written LevelDB files, prefer to call `saveSession` after the page has gone idle (`waitForLoadState`-style settling, or right before `crawler.teardown()`). `saveSession` automatically calls `Storage.flushBrowserCookies` first.
453
+ - **Cross-version replay**: bundles are tagged with `createdBy.version`; loading a bundle produced by a meaningfully different `cdp-crawler` version may warn or fail depending on what changed in `fingerprintConfig`'s shape. The plan is to migrate forward, not to support arbitrarily old bundles.
454
+ - **Multi-origin localStorage**: `saveSession` captures the current page's origin only. If you need storage from multiple origins, navigate to each one before calling `saveSession`, or call `saveSession` per page and merge the bundles client-side.
455
+
456
+ ---
457
+
302
458
  ## `Crawler` Class Documentation
303
459
 
304
460
  ### Constructor
@@ -605,3 +761,35 @@ Creates a custom router for handling crawling routes using CDP.
605
761
  - `Router<Context>`: A configured router instance.
606
762
 
607
763
  ---
764
+
765
+ ### `saveSession`
766
+
767
+ #### `export async function saveSession(page: CdpPage, options?: SaveSessionOptions): Promise<SessionBundle>`
768
+
769
+ Captures the full browser state of a running page — cookies, per-origin web storage, fingerprintConfig, fingerprint object, resolved locale, and the Chrome user-data-dir (inlined as base64-gzipped-tar) — into a single JSON-serializable bundle.
770
+
771
+ - **Parameters**:
772
+ - `page` (CdpPage): a page produced by a `CDPCrawler` instance.
773
+ - `options` (SaveSessionOptions): optional knobs:
774
+ - `includeUserDataDir` (boolean, default `true`): pack and inline the Chrome user-data-dir.
775
+ - `flushCookies` (boolean, default `true`): call `Storage.flushBrowserCookies` before snapshotting cookies.
776
+ - `userDataDirPath` (string): override the user-data-dir path; defaults to the one used at launch.
777
+ - `cookieUrls` (string[]): forwarded to `Network.getCookies` when `Storage.getCookies` is unavailable.
778
+ - `proxyMeta` ({ proxyUrl?, sessionId?, countryCode? }): stamped on the bundle for inspection only.
779
+
780
+ - **Returns**:
781
+ - `Promise<SessionBundle>`: a fully populated, JSON-serializable bundle.
782
+
783
+ See [Session Bundle](#session-bundle-save--restore-browser-state) for usage patterns.
784
+
785
+ ### `restoreSession`
786
+
787
+ #### `export async function restoreSession(page: CdpPage, bundle: SessionBundle): Promise<void>`
788
+
789
+ Manually applies a bundle's cookies and per-origin web storage to an arbitrary page. Use this only for advanced flows (e.g. opening extra targets mid-crawl). The canonical path is `launchOptions.sessionBundle`, which also restores the user-data-dir and fingerprintConfig — `restoreSession` cannot swap those on a running browser.
790
+
791
+ - **Parameters**:
792
+ - `page` (CdpPage): the target page.
793
+ - `bundle` (SessionBundle): a bundle produced by `saveSession`.
794
+
795
+ ---