@crawlee/playwright 4.0.0-beta.1 → 4.0.0-beta.11

This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -1,24 +1,15 @@
- import type { BrowserHook, LoadedContext, RouterHandler } from '@crawlee/browser';
- import type { BaseHttpResponseData, GetUserDataFromRequest, RestrictedCrawlingContext, RouterRoutes, StatisticsOptions, StatisticState } from '@crawlee/core';
+ import { BasicCrawler } from '@crawlee/basic';
+ import type { BasicCrawlerOptions, BrowserHook, LoadedRequest, Request } from '@crawlee/browser';
+ import type { BaseHttpResponseData, CrawlingContext, EnqueueLinksOptions, GetUserDataFromRequest, RouterRoutes, StatisticsOptions, StatisticState } from '@crawlee/core';
  import { Configuration, RequestHandlerResult, Statistics } from '@crawlee/core';
- import type { Awaitable, Dictionary } from '@crawlee/types';
+ import type { Dictionary } from '@crawlee/types';
  import { type CheerioRoot } from '@crawlee/utils';
  import { type Cheerio } from 'cheerio';
+ import type { AnyNode } from 'domhandler';
  // @ts-ignore optional peer dependency or compatibility with es2022
  import type { Page } from 'playwright';
- import type { Log } from '@apify/log';
- import type { PlaywrightCrawlerOptions, PlaywrightCrawlingContext, PlaywrightGotoOptions } from './playwright-crawler.js';
- import { PlaywrightCrawler } from './playwright-crawler.js';
+ import type { PlaywrightCrawlingContext, PlaywrightGotoOptions } from './playwright-crawler.js';
  import { RenderingTypePredictor } from './utils/rendering-type-prediction.js';
- type Result<TResult> = {
- result: TResult;
- ok: true;
- logs?: LogProxyCall[];
- } | {
- error: unknown;
- ok: false;
- logs?: LogProxyCall[];
- };
  interface AdaptivePlaywrightCrawlerStatisticState extends StatisticState {
  httpOnlyRequestHandlerRuns?: number;
  browserRequestHandlerRuns?: number;
@@ -33,7 +24,8 @@ declare class AdaptivePlaywrightCrawlerStatistics extends Statistics {
  trackBrowserRequestHandlerRun(): void;
  trackRenderingTypeMisprediction(): void;
  }
- export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary = Dictionary> extends RestrictedCrawlingContext<UserData> {
+ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
+ request: LoadedRequest<Request<UserData>>;
  /**
  * The HTTP response, either from the HTTP client or from the initial request from playwright's navigation.
  */
@@ -46,7 +38,7 @@ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary =
  * Wait for an element matching the selector to appear and return a Cheerio object of matched elements.
  * Timeout defaults to 5s.
  */
- querySelector<T = any>(selector: string, timeoutMs?: number): Promise<Cheerio<T>>;
+ querySelector(selector: string, timeoutMs?: number): Promise<Cheerio<AnyNode>>;
  /**
  * Wait for an element matching the selector to appear.
  * Timeout defaults to 5s.
@@ -74,24 +66,14 @@ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary =
  * ```
  */
  parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
+ enqueueLinks(options?: EnqueueLinksOptions): Promise<void>;
  }
- interface AdaptiveHook extends BrowserHook<Pick<AdaptivePlaywrightCrawlerContext, 'id' | 'request' | 'session' | 'proxyInfo' | 'log'> & {
+ interface AdaptiveHook extends BrowserHook<Pick<AdaptivePlaywrightCrawlerContext, 'id' | 'session' | 'proxyInfo' | 'log'> & {
  page?: Page;
+ request: Request;
  }, PlaywrightGotoOptions> {
  }
- export interface AdaptivePlaywrightCrawlerOptions extends Omit<PlaywrightCrawlerOptions, 'requestHandler' | 'preNavigationHooks' | 'postNavigationHooks'> {
- /**
- * Function that is called to process each request.
- *
- * The function receives the {@link AdaptivePlaywrightCrawlingContext} as an argument, and it must refrain from calling code with side effects,
- * other than the methods of the crawling context. Any other side effects may be invoked repeatedly by the crawler, which can lead to inconsistent results.
- *
- * The function must return a promise, which is then awaited by the crawler.
- *
- * If the function throws an exception, the crawler will try to re-crawl the
- * request later, up to `option.maxRequestRetries` times.
- */
- requestHandler?: (crawlingContext: LoadedContext<AdaptivePlaywrightCrawlerContext>) => Awaitable<void>;
+ export interface AdaptivePlaywrightCrawlerOptions<ExtendedContext extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext> extends Omit<BasicCrawlerOptions<AdaptivePlaywrightCrawlerContext, ExtendedContext>, 'preNavigationHooks' | 'postNavigationHooks'> {
  /**
  * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies.
  * The function accepts a subset of the crawling context. If you attempt to access the `page` property during HTTP-only crawling,
@@ -132,8 +114,6 @@ export interface AdaptivePlaywrightCrawlerOptions extends Omit<PlaywrightCrawler
  */
  preventDirectStorageAccess?: boolean;
  }
- declare const proxyLogMethods: readonly ["error", "exception", "softFail", "info", "debug", "perf", "warningOnce", "deprecated"];
- type LogProxyCall = [log: Log, method: (typeof proxyLogMethods)[number], ...args: unknown[]];
  /**
  * An extension of {@link PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
  *
@@ -163,31 +143,29 @@ type LogProxyCall = [log: Log, method: (typeof proxyLogMethods)[number], ...args
  *
  * @experimental
  */
- export declare class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
+ export declare class AdaptivePlaywrightCrawler<ExtendedContext extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext> extends BasicCrawler<AdaptivePlaywrightCrawlerContext, ExtendedContext> {
  readonly config: Configuration;
- private adaptiveRequestHandler;
  private renderingTypePredictor;
  private resultChecker;
  private resultComparator;
  private preventDirectStorageAccess;
+ private staticContextPipeline;
+ private browserContextPipeline;
+ private individualRequestHandlerTimeoutMillis;
  readonly stats: AdaptivePlaywrightCrawlerStatistics;
- /**
- * Default {@link Router} instance that will be used if we don't specify any {@link AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
- * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
- */
- // @ts-ignore optional peer dependency or compatibility with es2022
- readonly router: RouterHandler<AdaptivePlaywrightCrawlerContext>;
- constructor(options?: AdaptivePlaywrightCrawlerOptions, config?: Configuration);
- protected _runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void>;
+ private resultObjects;
+ private teardownHooks;
+ constructor(options?: AdaptivePlaywrightCrawlerOptions<ExtendedContext>, config?: Configuration);
+ private adaptCheerioContext;
+ private adaptPlaywrightContext;
+ private crawlOne;
+ protected runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void>;
  protected commitResult(crawlingContext: PlaywrightCrawlingContext, { calls, keyValueStoreChanges }: RequestHandlerResult): Promise<void>;
  protected allowStorageAccess<R, TArgs extends any[]>(func: (...args: TArgs) => Promise<R>): (...args: TArgs) => Promise<R>;
- protected runRequestHandlerInBrowser(crawlingContext: PlaywrightCrawlingContext): Promise<{
- result: Result<RequestHandlerResult>;
- initialStateCopy?: Record<string, unknown>;
- }>;
- protected runRequestHandlerWithPlainHTTP(crawlingContext: PlaywrightCrawlingContext, oldStateCopy?: Dictionary): Promise<Result<RequestHandlerResult>>;
  private createLogProxy;
+ teardown(): Promise<void>;
  }
- export declare function createAdaptivePlaywrightRouter<Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): RouterHandler<Context>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ export declare function createAdaptivePlaywrightRouter<Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/basic").RouterHandler<Context>;
  export {};
  //# sourceMappingURL=adaptive-playwright-crawler.d.ts.map
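For orientation, a minimal usage sketch against the beta.11 declarations above. The URL and selectors are illustrative, and `run()` comes from the inherited `BasicCrawler`; this sketch is not taken from the package's own documentation.

```ts
import { AdaptivePlaywrightCrawler } from '@crawlee/playwright';

const crawler = new AdaptivePlaywrightCrawler({
    // Roughly 10% of requests additionally get a rendering-type detection run.
    renderingTypeDetectionRatio: 0.1,
    async requestHandler({ request, querySelector, enqueueLinks, pushData }) {
        // querySelector() now resolves to Cheerio<AnyNode> instead of Cheerio<any>.
        const heading = await querySelector('h1');
        await pushData({ url: request.loadedUrl, heading: heading.text() });
        // enqueueLinks() is now declared directly on AdaptivePlaywrightCrawlerContext.
        await enqueueLinks({ selector: 'a' });
    },
});

await crawler.run(['https://example.com']);
```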
@@ -1 +1 @@
- {"version":3,"file":"adaptive-playwright-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/adaptive-playwright-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAA0B,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAE1G,OAAO,KAAK,EACR,oBAAoB,EACpB,sBAAsB,EACtB,yBAAyB,EACzB,YAAY,EAEZ,iBAAiB,EACjB,cAAc,EACjB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,aAAa,EAAE,oBAAoB,EAAU,UAAU,EAA4B,MAAM,eAAe,CAAC;AAClH,OAAO,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5D,OAAO,EAAE,KAAK,WAAW,EAA0B,MAAM,gBAAgB,CAAC;AAC1E,OAAO,EAAE,KAAK,OAAO,EAAQ,MAAM,SAAS,CAAC;AAE7C,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,YAAY,CAAC;AAGtC,OAAO,KAAK,EACR,wBAAwB,EACxB,yBAAyB,EACzB,qBAAqB,EACxB,MAAM,yBAAyB,CAAC;AACjC,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAsB,sBAAsB,EAAE,MAAM,sCAAsC,CAAC;AAElG,KAAK,MAAM,CAAC,OAAO,IACb;IAAE,MAAM,EAAE,OAAO,CAAC;IAAC,EAAE,EAAE,IAAI,CAAC;IAAC,IAAI,CAAC,EAAE,YAAY,EAAE,CAAA;CAAE,GACpD;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,EAAE,EAAE,KAAK,CAAC;IAAC,IAAI,CAAC,EAAE,YAAY,EAAE,CAAA;CAAE,CAAC;AAE3D,UAAU,uCAAwC,SAAQ,cAAc;IACpE,0BAA0B,CAAC,EAAE,MAAM,CAAC;IACpC,yBAAyB,CAAC,EAAE,MAAM,CAAC;IACnC,2BAA2B,CAAC,EAAE,MAAM,CAAC;CACxC;AAQD,cAAM,mCAAoC,SAAQ,UAAU;IAC/C,KAAK,EAAE,uCAAuC,CAAe;gBAE1D,OAAO,GAAE,iBAAsB;IAKlC,KAAK,IAAI,IAAI;cAOG,oBAAoB,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9D,8BAA8B,IAAI,IAAI;IAKtC,6BAA6B,IAAI,IAAI;IAKrC,+BAA+B,IAAI,IAAI;CAI1C;AAED,MAAM,WAAW,gCAAgC,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CACtF,SAAQ,yBAAyB,CAAC,QAAQ,CAAC;IAC3C;;OAEG;IACH,QAAQ,EAAE,oBAAoB,CAAC;IAE/B;;OAEG;IACH,IAAI,EAAE,IAAI,CAAC;IAEX;;;OAGG;IACH,aAAa,CAAC,CAAC,GAAG,GAAG,EAAE,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IAElF;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,UAAU,YACN,SAAQ,WAAW,CACf,IAAI,CAAC,gCAAgC,EAAE,IAAI,GAAG,SAAS,GAAG,SAAS,GAAG,WAAW,GAAG,KAAK,CAAC,GAAG;IAAE,IAAI,CAAC,EAAE,IAAI,CAAA;CAAE,EAC5G,qBAAqB,CACxB;CAAG;AAER,MAAM,WAAW,gCACb,SAAQ,IAAI,CAAC,wBAAwB,EAAE,gBAAgB,GAAG,oBAAoB,GAAG,qBAAqB,CAAC;IACvG;;;;;;;;;;OAUG;IACH,cAAc,CAAC,EAAE,CAAC,eAAe,EAAE,aAAa,CAAC,gCAAgC,CAAC,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;IAEvG;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,YAAY,EAAE,CAAC;IAEpC;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,YAAY,EAAE,CAAC;IAErC;;;OAGG;IACH,2BAA2B,CAAC,EAAE,MAAM,CAAC;IAErC;;;;OAIG;IACH,aAAa,CAAC,EAAE,CAAC,MAAM,EAAE,oBAAoB,KAAK,OAAO,CAAC;IAE1D;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,EAAE,OAAO,EAAE,oBAAoB,KAAK,OAAO,CAAC;IAE7F;;OAEG;IACH,sBAAsB,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,SAAS,GAAG,aAAa,CAAC,CAAC;IAEjF;;;OAGG;IACH,0BAA0B,CAAC,EAAE,OAAO,CAAC;CACxC;AAED,QAAA,MAAM,eAAe,mGASX,CAAC;AAEX,KAAK,YAAY,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,OAAO,eAAe,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;AAE7F;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,qBAAa,yBAA0B,SAAQ,iBAAiB;aAkBtC,MAAM;IAjB5B,OAAO,CAAC,sBAAsB,CAA0D;IACxF,OAAO,CAAC,sBAAsB,CAA0E;IACxG,OAAO,CAAC,aAAa,CAAiE;IACtF,OAAO,CAAC,gBAAgB,CAAoE;IAC5F,OAAO,CAAC,0BAA0B,CAAU;IAC5C,SAAiB,KAAK,EAAE,mCAAmC,CAAC;IAE5D;;;OAGG;IAEH,SAAkB,MAAM,EAAE,aAAa,CAAC,gCAAgC,CAAC,CACnB;gBAGlD,OAAO,GAAE,gCAAqC,EAC5B,MAAM,gBAAkC;cA4CrC,kBAAkB,CAAC,eAAe,EAAE,yBAAyB,GAAG,OAAO,CAAC,IAAI,CAAC;cAuEtF,YAAY,CACxB,eAAe,EAAE,yBAAyB,EAC1C,EAAE,KAAK,EAAE,oBAAoB,EAAE,EAAE,oBAAoB,GACtD,OAAO,CAAC,IAAI,CAAC;IAgBhB,SAAS,CAAC,kBAAkB,CAAC,CAAC,EAAE,KAAK,SAAS,GAAG,EAAE,EAC/C,IAAI,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAAC,CAAC,CAAC,GACrC,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,
OAAO,CAAC,CAAC,CAAC;cAQjB,0BAA0B,CACtC,eAAe,EAAE,yBAAyB,GAC3C,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC,oBAAoB,CAAC,CAAC;QAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;KAAE,CAAC;cA8FhF,8BAA8B,CAC1C,eAAe,EAAE,yBAAyB,EAC1C,YAAY,CAAC,EAAE,UAAU,GAC1B,OAAO,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAqGxC,OAAO,CAAC,cAAc;CAYzB;AAED,wBAAgB,8BAA8B,CAC1C,OAAO,SAAS,gCAAgC,GAAG,gCAAgC,EACnF,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,0BAEzC"}
+ {"version":3,"file":"adaptive-playwright-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/adaptive-playwright-crawler.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,KAAK,EAAE,mBAAmB,EAAE,WAAW,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAIjG,OAAO,KAAK,EACR,oBAAoB,EAEpB,eAAe,EACf,mBAAmB,EACnB,sBAAsB,EACtB,YAAY,EAEZ,iBAAiB,EACjB,cAAc,EACjB,MAAM,eAAe,CAAC;AACvB,OAAO,EACH,aAAa,EAEb,oBAAoB,EAEpB,UAAU,EAEb,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAA0B,MAAM,gBAAgB,CAAC;AAC1E,OAAO,EAAE,KAAK,OAAO,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAKvC,OAAO,KAAK,EAAE,yBAAyB,EAAE,qBAAqB,EAAE,MAAM,yBAAyB,CAAC;AAEhG,OAAO,EAAsB,sBAAsB,EAAE,MAAM,sCAAsC,CAAC;AAMlG,UAAU,uCAAwC,SAAQ,cAAc;IACpE,0BAA0B,CAAC,EAAE,MAAM,CAAC;IACpC,yBAAyB,CAAC,EAAE,MAAM,CAAC;IACnC,2BAA2B,CAAC,EAAE,MAAM,CAAC;CACxC;AAQD,cAAM,mCAAoC,SAAQ,UAAU;IAC/C,KAAK,EAAE,uCAAuC,CAAe;gBAE1D,OAAO,GAAE,iBAAsB;IAKlC,KAAK,IAAI,IAAI;cAOG,oBAAoB,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9D,8BAA8B,IAAI,IAAI;IAKtC,6BAA6B,IAAI,IAAI;IAKrC,+BAA+B,IAAI,IAAI;CAI1C;AAED,MAAM,WAAW,gCAAgC,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CACtF,SAAQ,eAAe,CAAC,QAAQ,CAAC;IACjC,OAAO,EAAE,aAAa,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC1C;;OAEG;IACH,QAAQ,EAAE,oBAAoB,CAAC;IAE/B;;OAEG;IACH,IAAI,EAAE,IAAI,CAAC;IAEX;;;OAGG;IACH,aAAa,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;IAE/E;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAE9E,YAAY,CAAC,OAAO,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAC9D;AAED,UAAU,YACN,SAAQ,WAAW,CACf,IAAI,CAAC,gCAAgC,EAAE,IAAI,GAAG,SAAS,GAAG,WAAW,GAAG,KAAK,CAAC,GAAG;IAC7E,IAAI,CAAC,EAAE,IAAI,CAAC;IACZ,OAAO,EAAE,OAAO,CAAC;CACpB,EACD,qBAAqB,CACxB;CAAG;AAER,MAAM,WAAW,gCAAgC,CAC7C,eAAe,SAAS,gCAAgC,GAAG,gCAAgC,CAC7F,SAAQ,IAAI,CACN,mBAAmB,CAAC,gCAAgC,EAAE,eAAe,CAAC,EACtE,oBAAoB,GAAG,qBAAqB,CAC/C;IACD;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,YAAY,EAAE,CAAC;IAEpC;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,YAAY,EAAE,CAAC;IAErC;;;OAGG;IACH,2BAA2B,CAAC,EAAE,MAAM,CAAC;IAErC;;;;OAIG;IACH,aAAa,CAAC,EAAE,CAAC,MAAM,EAAE,oBAAoB,KAAK,OAAO,CAAC;IAE1D;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,EAAE,OAAO,EAAE,oBAAoB,KAAK,OAAO,CAAC;IAE7F;;OAEG;IACH,sBAAsB,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,SAAS,GAAG,aAAa,CAAC,CAAC;IAEjF;;;OAGG;IACH,0BAA0B,CAAC,EAAE,OAAO,CAAC;CACxC;AAeD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,qBAAa,yBAAyB,CAClC,eAAe,SAAS,gCAAgC,GAAG,gCAAgC,CAC7F,SAAQ,YAAY,CAAC,gCAAgC,EAAE,eAAe,CAAC;aAe/C,MAAM;IAd5B,OAAO,CAAC,sBAAsB,CAA0E;IACxG,OAAO,CAAC,aAAa,CAAiE;IACtF,OAAO,CAAC,gBAAgB,CAAoE;IAC5F,OAAO,CAAC,0BAA0B,CAAU;IAC5C,OAAO,CAAC,qBAAqB,CAAoD;IACjF,OAAO,CAAC,sBAAsB,CAAoD;IAClF,OAAO,CAAC,qCAAqC,CAAS;IACtD,SAAiB,KAAK,EAAE,mCAAmC,CAAC;IAC5D,OAAO,CAAC,aAAa,CAAwD;IAE7E,OAAO,CAAC,aAAa,CAAkC;gBAGnD,OAAO,GAAE,gCAAgC,CAAC,eAAe,CAAM,EAC7C,MAAM,gBAAkC;YAuIhD,mBAAmB;YAwCnB,sBAAsB;YA6CtB,QAAQ;cA2DG,iBAAiB,CAAC,eAAe,EAAE,yBAAyB,GAAG,OAAO,CAAC,IAAI,CAAC;cAyGrF,YAAY,CACxB,eAAe,EAAE,yBAAyB,EAC1C,EAAE,KAAK,EAAE,oBAAoB,EAAE,EAAE,oBAAoB,GACtD,OAAO,CAAC,IAAI,CAAC;IAgBhB,SAAS,CAAC,kBAAkB,CAAC,CAAC,EAAE,KAAK,SAAS,GAAG,EAAE,EAC/C,IAAI,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAAC,CAAC,CAAC,GACrC,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAAC,CAAC,CAAC;IAQjC,OAAO,CAAC,cAAc;IAaP,QAAQ;CAM1B;AAED,wBAAgB,8BAA8B,CAC1C,OAAO,SAAS,gCAAgC,GAAG,gCAAgC,EACnF,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,
CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,mDAEzC"}
@@ -1,8 +1,9 @@
+ import { isDeepStrictEqual } from 'node:util';
+ import { BasicCrawler } from '@crawlee/basic';
  import { extractUrlsFromPage } from '@crawlee/browser';
- import { Configuration, RequestHandlerResult, Router, Statistics, withCheckedStorageAccess } from '@crawlee/core';
+ import { CheerioCrawler } from '@crawlee/cheerio';
+ import { Configuration, RequestHandlerError, RequestHandlerResult, Router, Statistics, withCheckedStorageAccess, } from '@crawlee/core';
  import { extractUrlsFromCheerio } from '@crawlee/utils';
- import { load } from 'cheerio';
- import isEqual from 'lodash.isequal';
  import { addTimeoutToPromise } from '@apify/timeout';
  import { PlaywrightCrawler } from './playwright-crawler.js';
  import { RenderingTypePredictor } from './utils/rendering-type-prediction.js';
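Note the dependency swap above: `lodash.isequal` and cheerio's `load` are gone, and deep equality now comes from Node's built-in `node:util`. A standalone sketch of the substitution used by the default result comparator in the next hunk (the sample items are made up):

```ts
import { isDeepStrictEqual } from 'node:util';

// Dataset items produced by the browser run and by the HTTP-only run are compared pairwise.
const fromBrowser = { title: 'Example Domain', links: 1 };
const fromHttp = { title: 'Example Domain', links: 1 };

// Used here in place of lodash.isequal's isEqual(a, b).
console.log(isDeepStrictEqual(fromBrowser, fromHttp)); // true
```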
@@ -80,24 +81,32 @@ const proxyLogMethods = [
  *
  * @experimental
  */
- export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
+ export class AdaptivePlaywrightCrawler extends BasicCrawler {
  config;
- adaptiveRequestHandler;
  renderingTypePredictor;
  resultChecker;
  resultComparator;
  preventDirectStorageAccess;
- /**
- * Default {@link Router} instance that will be used if we don't specify any {@link AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
- * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
- */
- // @ts-ignore
- router = Router.create();
+ staticContextPipeline;
+ browserContextPipeline;
+ individualRequestHandlerTimeoutMillis;
+ resultObjects = new WeakMap();
+ teardownHooks = [];
  constructor(options = {}, config = Configuration.getGlobalConfig()) {
- const { requestHandler, renderingTypeDetectionRatio = 0.1, renderingTypePredictor, resultChecker, resultComparator, statisticsOptions, preventDirectStorageAccess = true, ...rest } = options;
- super(rest, config);
+ const { requestHandler, renderingTypeDetectionRatio = 0.1, renderingTypePredictor, resultChecker, resultComparator, statisticsOptions, preventDirectStorageAccess = true, requestHandlerTimeoutSecs = 60, errorHandler, failedRequestHandler, preNavigationHooks, postNavigationHooks, extendContext, contextPipelineBuilder, ...rest } = options;
+ super({
+ ...rest,
+ // Pass error handlers to the "main" crawler - we only pluck them from `rest` so that they don't go to the sub crawlers
+ errorHandler,
+ failedRequestHandler,
+ // Same for request handler
+ requestHandler,
+ // The builder intentionally returns null so that it crashes the crawler when it tries to use this instead of one of two the specialized context pipelines
+ // (that would be a logical error in this class)
+ contextPipelineBuilder: () => null,
+ }, config);
  this.config = config;
- this.adaptiveRequestHandler = requestHandler ?? this.router;
+ this.individualRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
  this.renderingTypePredictor =
  renderingTypePredictor ?? new RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio });
  this.resultChecker = resultChecker ?? (() => true);
@@ -112,10 +121,67 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
  return (resultA.datasetItems.length === resultB.datasetItems.length &&
  resultA.datasetItems.every((itemA, i) => {
  const itemB = resultB.datasetItems[i];
- return isEqual(itemA, itemB);
+ return isDeepStrictEqual(itemA, itemB);
  }));
  };
  }
+ const staticCrawler = new CheerioCrawler({
+ ...rest,
+ useSessionPool: false,
+ statisticsOptions: {
+ persistenceOptions: { enable: false },
+ },
+ preNavigationHooks: [
+ async (context) => {
+ for (const hook of preNavigationHooks ?? []) {
+ await hook(context, undefined);
+ }
+ },
+ ],
+ postNavigationHooks: [
+ async (context) => {
+ for (const hook of postNavigationHooks ?? []) {
+ await hook(context, undefined);
+ }
+ },
+ ],
+ }, config);
+ const browserCrawler = new PlaywrightCrawler({
+ ...rest,
+ useSessionPool: false,
+ statisticsOptions: {
+ persistenceOptions: { enable: false },
+ },
+ preNavigationHooks: [
+ async (context, gotoOptions) => {
+ for (const hook of preNavigationHooks ?? []) {
+ await hook(context, gotoOptions);
+ }
+ },
+ ],
+ postNavigationHooks: [
+ async (context, gotoOptions) => {
+ for (const hook of postNavigationHooks ?? []) {
+ await hook(context, gotoOptions);
+ }
+ },
+ ],
+ }, config);
+ this.teardownHooks.push(browserCrawler.teardown.bind(browserCrawler));
+ this.staticContextPipeline = staticCrawler.contextPipeline
+ .compose({
+ action: this.adaptCheerioContext.bind(this),
+ })
+ .compose({
+ action: async (context) => extendContext ? await extendContext(context) : context,
+ });
+ this.browserContextPipeline = browserCrawler.contextPipeline
+ .compose({
+ action: this.adaptPlaywrightContext.bind(this),
+ })
+ .compose({
+ action: async (context) => extendContext ? await extendContext(context) : context,
+ });
  this.stats = new AdaptivePlaywrightCrawlerStatistics({
  logMessage: `${this.log.getOptions().prefix} request statistics:`,
  config,
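Both sub-crawlers above finish their context pipeline with the user-supplied `extendContext` callback, which is what the new `ExtendedContext` type parameter in the declarations refers to. A hedged sketch of how that option might be used; the `fetchedAt` helper is purely illustrative, and depending on how inference lands, the extended context type may need to be given explicitly as the class's generic parameter:

```ts
const crawler = new AdaptivePlaywrightCrawler({
    // Runs last in both the HTTP-only and the browser pipeline,
    // so the request handler sees the same extended shape either way.
    async extendContext(context) {
        return { ...context, fetchedAt: new Date() };
    },
    async requestHandler({ request, fetchedAt, pushData }) {
        await pushData({ url: request.loadedUrl, fetchedAt: fetchedAt.toISOString() });
    },
});
```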
@@ -123,7 +189,112 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
  });
  this.preventDirectStorageAccess = preventDirectStorageAccess;
  }
- async _runRequestHandler(crawlingContext) {
+ async adaptCheerioContext(cheerioContext) {
+ // Capture the original response to avoid infinite recursion when the getter is copied to the context
+ const originalResponse = cheerioContext.response;
+ const enqueueLinks = this.resultObjects.get(cheerioContext)?.enqueueLinks;
+ if (enqueueLinks === undefined) {
+ throw new Error('Logical error - `this.resultObjects` does not contain the result object');
+ }
+ return {
+ get page() {
+ throw new Error('Page object was used in HTTP-only request handler');
+ },
+ get response() {
+ return {
+ // TODO remove this once cheerioContext.response is just a Response
+ complete: true,
+ headers: originalResponse.headers,
+ trailers: {},
+ url: originalResponse.url,
+ statusCode: originalResponse.statusCode,
+ redirectUrls: originalResponse.redirectUrls ?? [],
+ };
+ },
+ async querySelector(selector) {
+ return cheerioContext.$(selector);
+ },
+ async enqueueLinks(options = {}) {
+ const urls = options.urls ??
+ extractUrlsFromCheerio(cheerioContext.$, options.selector, options.baseUrl ?? cheerioContext.request.loadedUrl);
+ await enqueueLinks({ ...options, urls });
+ },
+ };
+ }
+ async adaptPlaywrightContext(playwrightContext) {
+ // Capture the original response to avoid infinite recursion when the getter is copied to the context
+ const originalResponse = playwrightContext.response;
+ const enqueueLinks = this.resultObjects.get(playwrightContext)?.enqueueLinks;
+ if (enqueueLinks === undefined) {
+ throw new Error('Logical error - `this.resultObjects` does not contain the result object');
+ }
+ return {
+ get response() {
+ return {
+ url: originalResponse.url(),
+ statusCode: originalResponse.status(),
+ headers: originalResponse.headers(),
+ trailers: {},
+ complete: true,
+ redirectUrls: [],
+ };
+ },
+ async querySelector(selector, timeoutMs = 5000) {
+ const locator = playwrightContext.page.locator(selector).first();
+ await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
+ const $ = await playwrightContext.parseWithCheerio();
+ return $(selector);
+ },
+ async enqueueLinks(options = {}, timeoutMs = 5000) {
+ const selector = options.selector ?? 'a';
+ const locator = playwrightContext.page.locator(selector).first();
+ await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
+ // TODO consider using `context.parseWithCheerio` to make this universal and avoid code duplication
+ const urls = options.urls ??
+ (await extractUrlsFromPage(playwrightContext.page, selector, options.baseUrl ?? playwrightContext.request.loadedUrl));
+ await enqueueLinks({ ...options, urls });
+ },
+ };
+ }
+ async crawlOne(renderingType, context, useStateFunction) {
+ const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
+ const logs = [];
+ const deferredCleanup = [];
+ const resultBoundContextHelpers = {
+ addRequests: result.addRequests,
+ pushData: result.pushData,
+ useState: this.allowStorageAccess(useStateFunction),
+ getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
+ enqueueLinks: result.enqueueLinks,
+ log: this.createLogProxy(context.log, logs),
+ registerDeferredCleanup: (cleanup) => deferredCleanup.push(cleanup),
+ };
+ const subCrawlerContext = { ...context, ...resultBoundContextHelpers };
+ this.resultObjects.set(subCrawlerContext, result);
+ try {
+ const callAdaptiveRequestHandler = async () => {
+ if (renderingType === 'static') {
+ await this.staticContextPipeline.call(subCrawlerContext, async (finalContext) => await this.requestHandler(finalContext));
+ }
+ else if (renderingType === 'clientOnly') {
+ await this.browserContextPipeline.call(subCrawlerContext, async (finalContext) => await this.requestHandler(finalContext));
+ }
+ };
+ await addTimeoutToPromise(async () => withCheckedStorageAccess(() => {
+ if (this.preventDirectStorageAccess) {
+ throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
+ }
+ }, callAdaptiveRequestHandler), this.individualRequestHandlerTimeoutMillis, 'Request handler timed out');
+ return { result, ok: true, logs };
+ }
+ catch (error) {
+ return { error, ok: false, logs };
+ }
+ finally {
+ await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
+ }
+ }
+ async runRequestHandler(crawlingContext) {
  const renderingTypePrediction = this.renderingTypePredictor.predict(crawlingContext.request);
  const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation;
  if (!shouldDetectRenderingType) {
@@ -132,15 +303,19 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
  if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) {
  crawlingContext.log.debug(`Running HTTP-only request handler for ${crawlingContext.request.url}`);
  this.stats.trackHttpOnlyRequestHandlerRun();
- const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
+ const plainHTTPRun = await this.crawlOne('static', crawlingContext, crawlingContext.useState);
  if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) {
  crawlingContext.log.debug(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`);
  plainHTTPRun.logs?.forEach(([log, method, ...args]) => log[method](...args));
  await this.commitResult(crawlingContext, plainHTTPRun.result);
  return;
  }
+ // Execution will "fall through" and try running the request handler in a browser
  if (!plainHTTPRun.ok) {
- crawlingContext.log.exception(plainHTTPRun.error, `HTTP-only request handler failed for ${crawlingContext.request.url}`);
+ const actualError = plainHTTPRun.error instanceof RequestHandlerError
+ ? plainHTTPRun.error.cause
+ : plainHTTPRun.error;
+ crawlingContext.log.exception(actualError, `HTTP-only request handler failed for ${crawlingContext.request.url}`);
  }
  else {
  crawlingContext.log.warning(`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`);
@@ -153,14 +328,30 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
  // a rendering type detection if necessary. Without this measure, the HTTP request handler would run
  // under different conditions, which could change its behavior. Changes done to the crawler state by
  // the HTTP request handler will not be committed to the actual storage.
- const { result: browserRun, initialStateCopy } = await this.runRequestHandlerInBrowser(crawlingContext);
+ const stateTracker = {
+ stateCopy: null,
+ async getLiveState(defaultValue = {}) {
+ const state = await crawlingContext.useState(defaultValue);
+ if (this.stateCopy === null) {
+ this.stateCopy = JSON.parse(JSON.stringify(state));
+ }
+ return state;
+ },
+ async getStateCopy(defaultValue = {}) {
+ if (this.stateCopy === null) {
+ return defaultValue;
+ }
+ return this.stateCopy;
+ },
+ };
+ const browserRun = await this.crawlOne('clientOnly', crawlingContext, stateTracker.getLiveState.bind(stateTracker));
  if (!browserRun.ok) {
  throw browserRun.error;
  }
  await this.commitResult(crawlingContext, browserRun.result);
  if (shouldDetectRenderingType) {
  crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`);
- const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, initialStateCopy);
+ const plainHTTPRun = await this.crawlOne('static', crawlingContext, stateTracker.getStateCopy.bind(stateTracker));
  const detectionResult = (() => {
  if (!plainHTTPRun.ok) {
  return 'clientOnly';
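The `stateTracker` above snapshots the crawler state the first time the browser-side handler reads it, so the later HTTP-only detection run replays against the pre-browser state rather than the mutated one. A self-contained sketch of that snapshot-on-first-access pattern (the `visited` counter is illustrative):

```ts
const tracker = {
    stateCopy: null as Record<string, unknown> | null,

    // Hand out the live state, taking a deep copy the first time it is accessed.
    getLiveState(state: Record<string, unknown>) {
        this.stateCopy ??= JSON.parse(JSON.stringify(state));
        return state;
    },

    // The detection run gets the frozen copy instead of the mutated live state.
    getStateCopy(defaultValue: Record<string, unknown> = {}) {
        return this.stateCopy ?? defaultValue;
    },
};

const live = tracker.getLiveState({ visited: 0 });
live.visited = 42; // mutated by the browser-side handler
console.log(tracker.getStateCopy()); // { visited: 0 }
```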
@@ -188,152 +379,6 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
  allowStorageAccess(func) {
  return async (...args) => withCheckedStorageAccess(() => { }, async () => func(...args));
  }
- async runRequestHandlerInBrowser(crawlingContext) {
- const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
- let initialStateCopy;
- try {
- await super._runRequestHandler.call(new Proxy(this, {
- get: (target, propertyName, receiver) => {
- if (propertyName === 'userProvidedRequestHandler') {
- return async (playwrightContext) => withCheckedStorageAccess(() => {
- if (this.preventDirectStorageAccess) {
- throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
- }
- }, () => this.adaptiveRequestHandler({
- id: crawlingContext.id,
- session: crawlingContext.session,
- proxyInfo: crawlingContext.proxyInfo,
- request: crawlingContext.request,
- response: {
- url: crawlingContext.response.url(),
- statusCode: crawlingContext.response.status(),
- headers: crawlingContext.response.headers(),
- trailers: {},
- complete: true,
- redirectUrls: [],
- },
- log: crawlingContext.log,
- page: crawlingContext.page,
- querySelector: async (selector, timeoutMs = 5_000) => {
- const locator = playwrightContext.page.locator(selector).first();
- await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
- const $ = await playwrightContext.parseWithCheerio();
- return $(selector);
- },
- async waitForSelector(selector, timeoutMs = 5_000) {
- const locator = playwrightContext.page.locator(selector).first();
- await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
- },
- async parseWithCheerio(selector, timeoutMs = 5_000) {
- if (selector) {
- const locator = playwrightContext.page.locator(selector).first();
- await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
- }
- return playwrightContext.parseWithCheerio();
- },
- async enqueueLinks(options = {}, timeoutMs = 5_000) {
- const selector = options.selector ?? 'a';
- const locator = playwrightContext.page.locator(selector).first();
- await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
- const urls = await extractUrlsFromPage(playwrightContext.page, selector, options.baseUrl ??
- playwrightContext.request.loadedUrl ??
- playwrightContext.request.url);
- await result.enqueueLinks({ ...options, urls });
- },
- addRequests: result.addRequests,
- pushData: result.pushData,
- useState: this.allowStorageAccess(async (defaultValue) => {
- const state = await result.useState(defaultValue);
- if (initialStateCopy === undefined) {
- initialStateCopy = JSON.parse(JSON.stringify(state));
- }
- return state;
- }),
- getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
- }));
- }
- return Reflect.get(target, propertyName, receiver);
- },
- }), crawlingContext);
- return { result: { result, ok: true }, initialStateCopy };
- }
- catch (error) {
- return { result: { error, ok: false }, initialStateCopy };
- }
- }
- async runRequestHandlerWithPlainHTTP(crawlingContext, oldStateCopy) {
- const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
- const logs = [];
- const pageGotoOptions = { timeout: this.navigationTimeoutMillis }; // Irrelevant, but required by BrowserCrawler
- try {
- await withCheckedStorageAccess(() => {
- if (this.preventDirectStorageAccess) {
- throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
- }
- }, async () => addTimeoutToPromise(async () => {
- const hookContext = {
- id: crawlingContext.id,
- session: crawlingContext.session,
- proxyInfo: crawlingContext.proxyInfo,
- request: crawlingContext.request,
- log: this.createLogProxy(crawlingContext.log, logs),
- };
- await this._executeHooks(this.preNavigationHooks, {
- ...hookContext,
- get page() {
- throw new Error('Page object was used in HTTP-only pre-navigation hook');
- },
- }, // This is safe because `executeHooks` just passes the context to the hooks which accept the partial context
- pageGotoOptions);
- const response = await crawlingContext.sendRequest({});
- const loadedUrl = response.url;
- crawlingContext.request.loadedUrl = loadedUrl;
- const $ = load(response.body);
- await this.adaptiveRequestHandler({
- ...hookContext,
- request: crawlingContext.request,
- response,
- get page() {
- throw new Error('Page object was used in HTTP-only request handler');
- },
- async querySelector(selector, _timeoutMs) {
- return $(selector);
- },
- async waitForSelector(selector, _timeoutMs) {
- if ($(selector).get().length === 0) {
- throw new Error(`Selector '${selector}' not found.`);
- }
- },
- async parseWithCheerio(selector, _timeoutMs) {
- if (selector && $(selector).get().length === 0) {
- throw new Error(`Selector '${selector}' not found.`);
- }
- return $;
- },
- async enqueueLinks(options = {}) {
- const urls = extractUrlsFromCheerio($, options.selector, options.baseUrl ?? loadedUrl);
- await result.enqueueLinks({ ...options, urls });
- },
- addRequests: result.addRequests,
- pushData: result.pushData,
- useState: async (defaultValue) => {
- // return the old state before the browser handler was executed
- // when rerunning the handler via HTTP for detection
- if (oldStateCopy !== undefined) {
- return oldStateCopy ?? defaultValue; // fallback to the default for `null`
- }
- return this.allowStorageAccess(result.useState)(defaultValue);
- },
- getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
- });
- await this._executeHooks(this.postNavigationHooks, crawlingContext, pageGotoOptions);
- }, this.requestHandlerTimeoutInnerMillis, 'Request handler timed out'));
- return { result, logs, ok: true };
- }
- catch (error) {
- return { error, logs, ok: false };
- }
- }
  createLogProxy(log, logs) {
  return new Proxy(log, {
  get(target, propertyName, receiver) {
@@ -346,6 +391,12 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
  },
  });
  }
+ async teardown() {
+ await super.teardown();
+ for (const hook of this.teardownHooks) {
+ await hook();
+ }
+ }
  }
  export function createAdaptivePlaywrightRouter(routes) {
  return Router.create(routes);