@crawlee/playwright 4.0.0-beta.1 → 4.0.0-beta.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/internals/adaptive-playwright-crawler.d.ts +27 -49
- package/internals/adaptive-playwright-crawler.d.ts.map +1 -1
- package/internals/adaptive-playwright-crawler.js +217 -166
- package/internals/adaptive-playwright-crawler.js.map +1 -1
- package/internals/playwright-crawler.d.ts +14 -41
- package/internals/playwright-crawler.d.ts.map +1 -1
- package/internals/playwright-crawler.js +47 -9
- package/internals/playwright-crawler.js.map +1 -1
- package/internals/utils/playwright-utils.d.ts +1 -3
- package/internals/utils/playwright-utils.d.ts.map +1 -1
- package/internals/utils/playwright-utils.js +1 -35
- package/internals/utils/playwright-utils.js.map +1 -1
- package/package.json +8 -8
- package/tsconfig.build.tsbuildinfo +1 -1
package/internals/adaptive-playwright-crawler.d.ts

@@ -1,24 +1,15 @@
-import
-import type {
+import { BasicCrawler } from '@crawlee/basic';
+import type { BasicCrawlerOptions, BrowserHook, LoadedRequest, Request } from '@crawlee/browser';
+import type { BaseHttpResponseData, CrawlingContext, EnqueueLinksOptions, GetUserDataFromRequest, RouterRoutes, StatisticsOptions, StatisticState } from '@crawlee/core';
 import { Configuration, RequestHandlerResult, Statistics } from '@crawlee/core';
-import type {
+import type { Dictionary } from '@crawlee/types';
 import { type CheerioRoot } from '@crawlee/utils';
 import { type Cheerio } from 'cheerio';
+import type { AnyNode } from 'domhandler';
 // @ts-ignore optional peer dependency or compatibility with es2022
 import type { Page } from 'playwright';
-import type {
-import type { PlaywrightCrawlerOptions, PlaywrightCrawlingContext, PlaywrightGotoOptions } from './playwright-crawler.js';
-import { PlaywrightCrawler } from './playwright-crawler.js';
+import type { PlaywrightCrawlingContext, PlaywrightGotoOptions } from './playwright-crawler.js';
 import { RenderingTypePredictor } from './utils/rendering-type-prediction.js';
-type Result<TResult> = {
-    result: TResult;
-    ok: true;
-    logs?: LogProxyCall[];
-} | {
-    error: unknown;
-    ok: false;
-    logs?: LogProxyCall[];
-};
 interface AdaptivePlaywrightCrawlerStatisticState extends StatisticState {
     httpOnlyRequestHandlerRuns?: number;
     browserRequestHandlerRuns?: number;
@@ -33,7 +24,8 @@ declare class AdaptivePlaywrightCrawlerStatistics extends Statistics {
     trackBrowserRequestHandlerRun(): void;
     trackRenderingTypeMisprediction(): void;
 }
-export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary = Dictionary> extends
+export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
+    request: LoadedRequest<Request<UserData>>;
     /**
      * The HTTP response, either from the HTTP client or from the initial request from playwright's navigation.
      */
@@ -46,7 +38,7 @@ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary =
      * Wait for an element matching the selector to appear and return a Cheerio object of matched elements.
      * Timeout defaults to 5s.
      */
-    querySelector
+    querySelector(selector: string, timeoutMs?: number): Promise<Cheerio<AnyNode>>;
     /**
      * Wait for an element matching the selector to appear.
      * Timeout defaults to 5s.
@@ -74,24 +66,14 @@ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary =
      * ```
      */
    parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
+    enqueueLinks(options?: EnqueueLinksOptions): Promise<void>;
 }
-interface AdaptiveHook extends BrowserHook<Pick<AdaptivePlaywrightCrawlerContext, 'id' | '
+interface AdaptiveHook extends BrowserHook<Pick<AdaptivePlaywrightCrawlerContext, 'id' | 'session' | 'proxyInfo' | 'log'> & {
     page?: Page;
+    request: Request;
 }, PlaywrightGotoOptions> {
 }
-export interface AdaptivePlaywrightCrawlerOptions extends Omit<
-    /**
-     * Function that is called to process each request.
-     *
-     * The function receives the {@link AdaptivePlaywrightCrawlingContext} as an argument, and it must refrain from calling code with side effects,
-     * other than the methods of the crawling context. Any other side effects may be invoked repeatedly by the crawler, which can lead to inconsistent results.
-     *
-     * The function must return a promise, which is then awaited by the crawler.
-     *
-     * If the function throws an exception, the crawler will try to re-crawl the
-     * request later, up to `option.maxRequestRetries` times.
-     */
-    requestHandler?: (crawlingContext: LoadedContext<AdaptivePlaywrightCrawlerContext>) => Awaitable<void>;
+export interface AdaptivePlaywrightCrawlerOptions<ExtendedContext extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext> extends Omit<BasicCrawlerOptions<AdaptivePlaywrightCrawlerContext, ExtendedContext>, 'preNavigationHooks' | 'postNavigationHooks'> {
     /**
      * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies.
      * The function accepts a subset of the crawling context. If you attempt to access the `page` property during HTTP-only crawling,
@@ -132,8 +114,6 @@ export interface AdaptivePlaywrightCrawlerOptions extends Omit<PlaywrightCrawler
      */
     preventDirectStorageAccess?: boolean;
 }
-declare const proxyLogMethods: readonly ["error", "exception", "softFail", "info", "debug", "perf", "warningOnce", "deprecated"];
-type LogProxyCall = [log: Log, method: (typeof proxyLogMethods)[number], ...args: unknown[]];
 /**
  * An extension of {@link PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
  *
@@ -163,31 +143,29 @@ type LogProxyCall = [log: Log, method: (typeof proxyLogMethods)[number], ...args
  *
  * @experimental
  */
-export declare class AdaptivePlaywrightCrawler extends
+export declare class AdaptivePlaywrightCrawler<ExtendedContext extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext> extends BasicCrawler<AdaptivePlaywrightCrawlerContext, ExtendedContext> {
     readonly config: Configuration;
-    private adaptiveRequestHandler;
     private renderingTypePredictor;
     private resultChecker;
     private resultComparator;
     private preventDirectStorageAccess;
+    private staticContextPipeline;
+    private browserContextPipeline;
+    private individualRequestHandlerTimeoutMillis;
     readonly stats: AdaptivePlaywrightCrawlerStatistics;
-
-
-
-
-
-
-
-    protected _runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void>;
+    private resultObjects;
+    private teardownHooks;
+    constructor(options?: AdaptivePlaywrightCrawlerOptions<ExtendedContext>, config?: Configuration);
+    private adaptCheerioContext;
+    private adaptPlaywrightContext;
+    private crawlOne;
+    protected runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void>;
     protected commitResult(crawlingContext: PlaywrightCrawlingContext, { calls, keyValueStoreChanges }: RequestHandlerResult): Promise<void>;
     protected allowStorageAccess<R, TArgs extends any[]>(func: (...args: TArgs) => Promise<R>): (...args: TArgs) => Promise<R>;
-    protected runRequestHandlerInBrowser(crawlingContext: PlaywrightCrawlingContext): Promise<{
-        result: Result<RequestHandlerResult>;
-        initialStateCopy?: Record<string, unknown>;
-    }>;
-    protected runRequestHandlerWithPlainHTTP(crawlingContext: PlaywrightCrawlingContext, oldStateCopy?: Dictionary): Promise<Result<RequestHandlerResult>>;
     private createLogProxy;
+    teardown(): Promise<void>;
 }
-
+// @ts-ignore optional peer dependency or compatibility with es2022
+export declare function createAdaptivePlaywrightRouter<Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/basic").RouterHandler<Context>;
 export {};
 //# sourceMappingURL=adaptive-playwright-crawler.d.ts.map
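The reshaped declarations above are the user-facing change in this range: `AdaptivePlaywrightCrawler` now extends `BasicCrawler` (instead of `PlaywrightCrawler`), takes an `ExtendedContext` type parameter, and the package gains a `createAdaptivePlaywrightRouter` helper. A minimal usage sketch based only on these signatures — the root-level import path and the concrete option values are assumptions, not taken from the diff:

```ts
import { AdaptivePlaywrightCrawler, createAdaptivePlaywrightRouter } from '@crawlee/playwright';

const router = createAdaptivePlaywrightRouter();

router.addDefaultHandler(async ({ request, querySelector, enqueueLinks, pushData }) => {
    // querySelector() resolves to a Cheerio object whether the page was fetched
    // over plain HTTP or rendered in a browser.
    const heading = await querySelector('h1');
    await pushData({ url: request.loadedUrl, heading: heading.text() });
    await enqueueLinks({ selector: 'a' });
});

const crawler = new AdaptivePlaywrightCrawler({
    requestHandler: router,
    // Fraction of requests re-checked with both sub-crawlers (defaults to 0.1 in the implementation below).
    renderingTypeDetectionRatio: 0.1,
});

await crawler.run(['https://crawlee.dev']);
```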
package/internals/adaptive-playwright-crawler.d.ts.map

@@ -1 +1 @@
-{"version":3,"file":"adaptive-playwright-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/adaptive-playwright-crawler.ts"],"names":[],"mappings":"
{"version":3,"file":"adaptive-playwright-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/adaptive-playwright-crawler.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,KAAK,EAAE,mBAAmB,EAAE,WAAW,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAIjG,OAAO,KAAK,EACR,oBAAoB,EAEpB,eAAe,EACf,mBAAmB,EACnB,sBAAsB,EACtB,YAAY,EAEZ,iBAAiB,EACjB,cAAc,EACjB,MAAM,eAAe,CAAC;AACvB,OAAO,EACH,aAAa,EAEb,oBAAoB,EAEpB,UAAU,EAEb,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAA0B,MAAM,gBAAgB,CAAC;AAC1E,OAAO,EAAE,KAAK,OAAO,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAKvC,OAAO,KAAK,EAAE,yBAAyB,EAAE,qBAAqB,EAAE,MAAM,yBAAyB,CAAC;AAEhG,OAAO,EAAsB,sBAAsB,EAAE,MAAM,sCAAsC,CAAC;AAMlG,UAAU,uCAAwC,SAAQ,cAAc;IACpE,0BAA0B,CAAC,EAAE,MAAM,CAAC;IACpC,yBAAyB,CAAC,EAAE,MAAM,CAAC;IACnC,2BAA2B,CAAC,EAAE,MAAM,CAAC;CACxC;AAQD,cAAM,mCAAoC,SAAQ,UAAU;IAC/C,KAAK,EAAE,uCAAuC,CAAe;gBAE1D,OAAO,GAAE,iBAAsB;IAKlC,KAAK,IAAI,IAAI;cAOG,oBAAoB,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9D,8BAA8B,IAAI,IAAI;IAKtC,6BAA6B,IAAI,IAAI;IAKrC,+BAA+B,IAAI,IAAI;CAI1C;AAED,MAAM,WAAW,gCAAgC,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CACtF,SAAQ,eAAe,CAAC,QAAQ,CAAC;IACjC,OAAO,EAAE,aAAa,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC1C;;OAEG;IACH,QAAQ,EAAE,oBAAoB,CAAC;IAE/B;;OAEG;IACH,IAAI,EAAE,IAAI,CAAC;IAEX;;;OAGG;IACH,aAAa,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;IAE/E;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAE9E,YAAY,CAAC,OAAO,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;CAC9D;AAED,UAAU,YACN,SAAQ,WAAW,CACf,IAAI,CAAC,gCAAgC,EAAE,IAAI,GAAG,SAAS,GAAG,WAAW,GAAG,KAAK,CAAC,GAAG;IAC7E,IAAI,CAAC,EAAE,IAAI,CAAC;IACZ,OAAO,EAAE,OAAO,CAAC;CACpB,EACD,qBAAqB,CACxB;CAAG;AAER,MAAM,WAAW,gCAAgC,CAC7C,eAAe,SAAS,gCAAgC,GAAG,gCAAgC,CAC7F,SAAQ,IAAI,CACN,mBAAmB,CAAC,gCAAgC,EAAE,eAAe,CAAC,EACtE,oBAAoB,GAAG,qBAAqB,CAC/C;IACD;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,YAAY,EAAE,CAAC;IAEpC;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,YAAY,EAAE,CAAC;IAErC;;;OAGG;IACH,2BAA2B,CAAC,EAAE,MAAM,CAAC;IAErC;;;;OAIG;IACH,aAAa,CAAC,EAAE,CAAC,MAAM,EAAE,oBAAoB,KAAK,OAAO,CAAC;IAE1D;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,EAAE,OAAO,EAAE,oBAAoB,KAAK,OAAO,CAAC;IAE7F;;OAEG;IACH,sBAAsB,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,SAAS,GAAG,aAAa,CAAC,CAAC;IAEjF;;;OAGG;IACH,0BAA0B,CAAC,EAAE,OAAO,CAAC;CACxC;AAeD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,qBAAa,yBAAyB,CAClC,eAAe,SAAS,gCAAgC,GAAG,gCAAgC,CAC7F,SAAQ,YAAY,CAAC,gCAAgC,EAAE,eAAe,CAAC;aAe/C,MAAM;IAd5B,OAAO,CAAC,sBAAsB,CAA0E;IACxG,OAAO,CAAC,aAAa,CAAiE;IACtF,OAAO,CAAC,gBAAgB,CAAoE;IAC5F,OAAO,CAAC,0BAA0B,CAAU;IAC5C,OAAO,CAAC,qBAAqB,CAAoD;IACjF,OAAO,CAAC,sBAAsB,CAAoD;IAClF,OAAO,CAAC,qCAAqC,CAAS;IACtD,SAAiB,KAAK,EAAE,mCAAmC,CAAC;IAC5D,OAAO,CAAC,aAAa,CAAwD;IAE7E,OAAO,CAAC,aAAa,CAAkC;gBAGnD,OAAO,GAAE,gCAAgC,CAAC,eAAe,CAAM,EAC7C,MAAM,gBAAkC;YAuIhD,mBAAmB;YAwCnB,sBAAsB;YA6CtB,QAAQ;cA2DG,iBAAiB,CAAC,eAAe,EAAE,yBAAyB,GAAG,OAAO,CAAC,IAAI,CAAC;cAyGrF,YAAY,CACxB,eAAe,EAAE,yBAAyB,EAC1C,EAAE,KAAK,EAAE,oBAAoB,EAAE,EAAE,oBAAoB,GACtD,OAAO,CAAC,IAAI,CAAC;IAgBhB,SAAS,CAAC,kBAAkB,CAAC,CAAC,EAAE,KAAK,SAAS,GAAG,EAAE,EAC/C,IAAI,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAAC,CAAC,CAAC,GACrC,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAAC,CAAC,CAAC;IAQjC,OAAO,CAAC,cAAc;IAaP,QAAQ;CAM1B;AAED,wBAAgB,8BAA8B,CAC1C,OAAO,SAAS,gCAAgC,GAAG,gCAAgC,EACnF,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CA
AC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,mDAEzC"}
package/internals/adaptive-playwright-crawler.js

@@ -1,8 +1,9 @@
+import { isDeepStrictEqual } from 'node:util';
+import { BasicCrawler } from '@crawlee/basic';
 import { extractUrlsFromPage } from '@crawlee/browser';
-import {
+import { CheerioCrawler } from '@crawlee/cheerio';
+import { Configuration, RequestHandlerError, RequestHandlerResult, Router, Statistics, withCheckedStorageAccess, } from '@crawlee/core';
 import { extractUrlsFromCheerio } from '@crawlee/utils';
-import { load } from 'cheerio';
-import isEqual from 'lodash.isequal';
 import { addTimeoutToPromise } from '@apify/timeout';
 import { PlaywrightCrawler } from './playwright-crawler.js';
 import { RenderingTypePredictor } from './utils/rendering-type-prediction.js';
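The runtime now relies on Node's built-in `isDeepStrictEqual` from `node:util` instead of `lodash.isequal` for the default dataset-item comparison used by `resultComparator` (see the hunks below). For reference, the built-in behaves like this:

```ts
import { isDeepStrictEqual } from 'node:util';

isDeepStrictEqual({ a: [1, 2] }, { a: [1, 2] }); // true - deep structural equality
isDeepStrictEqual({ a: 1 }, { a: '1' });         // false - strict, no type coercion
```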
@@ -80,24 +81,32 @@ const proxyLogMethods = [
  *
  * @experimental
  */
-export class AdaptivePlaywrightCrawler extends
+export class AdaptivePlaywrightCrawler extends BasicCrawler {
     config;
-    adaptiveRequestHandler;
     renderingTypePredictor;
     resultChecker;
     resultComparator;
     preventDirectStorageAccess;
-
-
-
-
-
-    router = Router.create();
+    staticContextPipeline;
+    browserContextPipeline;
+    individualRequestHandlerTimeoutMillis;
+    resultObjects = new WeakMap();
+    teardownHooks = [];
     constructor(options = {}, config = Configuration.getGlobalConfig()) {
-        const { requestHandler, renderingTypeDetectionRatio = 0.1, renderingTypePredictor, resultChecker, resultComparator, statisticsOptions, preventDirectStorageAccess = true, ...rest } = options;
-        super(
+        const { requestHandler, renderingTypeDetectionRatio = 0.1, renderingTypePredictor, resultChecker, resultComparator, statisticsOptions, preventDirectStorageAccess = true, requestHandlerTimeoutSecs = 60, errorHandler, failedRequestHandler, preNavigationHooks, postNavigationHooks, extendContext, contextPipelineBuilder, ...rest } = options;
+        super({
+            ...rest,
+            // Pass error handlers to the "main" crawler - we only pluck them from `rest` so that they don't go to the sub crawlers
+            errorHandler,
+            failedRequestHandler,
+            // Same for request handler
+            requestHandler,
+            // The builder intentionally returns null so that it crashes the crawler when it tries to use this instead of one of two the specialized context pipelines
+            // (that would be a logical error in this class)
+            contextPipelineBuilder: () => null,
+        }, config);
         this.config = config;
-        this.
+        this.individualRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
         this.renderingTypePredictor =
             renderingTypePredictor ?? new RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio });
         this.resultChecker = resultChecker ?? (() => true);
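Because the constructor forwards the user-supplied `preNavigationHooks` and `postNavigationHooks` to both sub-crawlers (see the next hunk), and the `AdaptiveHook` type above declares `page?: Page`, a hook only receives a Playwright `Page` when the request is handled by the browser sub-crawler. A hedged sketch of a hook written with that in mind (import path and option values are illustrative):

```ts
import { AdaptivePlaywrightCrawler } from '@crawlee/playwright';

const crawler = new AdaptivePlaywrightCrawler({
    preNavigationHooks: [
        async ({ request, page, log }) => {
            log.debug(`About to navigate to ${request.url}`);
            // `page` is only defined for requests routed to the Playwright sub-crawler;
            // HTTP-only runs must not depend on it.
            if (page) {
                await page.setViewportSize({ width: 1280, height: 720 });
            }
        },
    ],
    requestHandler: async ({ request, pushData }) => {
        await pushData({ url: request.url });
    },
});
```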
@@ -112,10 +121,67 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
                 return (resultA.datasetItems.length === resultB.datasetItems.length &&
                     resultA.datasetItems.every((itemA, i) => {
                         const itemB = resultB.datasetItems[i];
-                        return
+                        return isDeepStrictEqual(itemA, itemB);
                     }));
            };
        }
+        const staticCrawler = new CheerioCrawler({
+            ...rest,
+            useSessionPool: false,
+            statisticsOptions: {
+                persistenceOptions: { enable: false },
+            },
+            preNavigationHooks: [
+                async (context) => {
+                    for (const hook of preNavigationHooks ?? []) {
+                        await hook(context, undefined);
+                    }
+                },
+            ],
+            postNavigationHooks: [
+                async (context) => {
+                    for (const hook of postNavigationHooks ?? []) {
+                        await hook(context, undefined);
+                    }
+                },
+            ],
+        }, config);
+        const browserCrawler = new PlaywrightCrawler({
+            ...rest,
+            useSessionPool: false,
+            statisticsOptions: {
+                persistenceOptions: { enable: false },
+            },
+            preNavigationHooks: [
+                async (context, gotoOptions) => {
+                    for (const hook of preNavigationHooks ?? []) {
+                        await hook(context, gotoOptions);
+                    }
+                },
+            ],
+            postNavigationHooks: [
+                async (context, gotoOptions) => {
+                    for (const hook of postNavigationHooks ?? []) {
+                        await hook(context, gotoOptions);
+                    }
+                },
+            ],
+        }, config);
+        this.teardownHooks.push(browserCrawler.teardown.bind(browserCrawler));
+        this.staticContextPipeline = staticCrawler.contextPipeline
+            .compose({
+            action: this.adaptCheerioContext.bind(this),
+        })
+            .compose({
+            action: async (context) => extendContext ? await extendContext(context) : context,
+        });
+        this.browserContextPipeline = browserCrawler.contextPipeline
+            .compose({
+            action: this.adaptPlaywrightContext.bind(this),
+        })
+            .compose({
+            action: async (context) => extendContext ? await extendContext(context) : context,
+        });
         this.stats = new AdaptivePlaywrightCrawlerStatistics({
             logMessage: `${this.log.getOptions().prefix} request statistics:`,
             config,
@@ -123,7 +189,112 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
         });
         this.preventDirectStorageAccess = preventDirectStorageAccess;
     }
-    async
+    async adaptCheerioContext(cheerioContext) {
+        // Capture the original response to avoid infinite recursion when the getter is copied to the context
+        const originalResponse = cheerioContext.response;
+        const enqueueLinks = this.resultObjects.get(cheerioContext)?.enqueueLinks;
+        if (enqueueLinks === undefined) {
+            throw new Error('Logical error - `this.resultObjects` does not contain the result object');
+        }
+        return {
+            get page() {
+                throw new Error('Page object was used in HTTP-only request handler');
+            },
+            get response() {
+                return {
+                    // TODO remove this once cheerioContext.response is just a Response
+                    complete: true,
+                    headers: originalResponse.headers,
+                    trailers: {},
+                    url: originalResponse.url,
+                    statusCode: originalResponse.statusCode,
+                    redirectUrls: originalResponse.redirectUrls ?? [],
+                };
+            },
+            async querySelector(selector) {
+                return cheerioContext.$(selector);
+            },
+            async enqueueLinks(options = {}) {
+                const urls = options.urls ??
+                    extractUrlsFromCheerio(cheerioContext.$, options.selector, options.baseUrl ?? cheerioContext.request.loadedUrl);
+                await enqueueLinks({ ...options, urls });
+            },
+        };
+    }
+    async adaptPlaywrightContext(playwrightContext) {
+        // Capture the original response to avoid infinite recursion when the getter is copied to the context
+        const originalResponse = playwrightContext.response;
+        const enqueueLinks = this.resultObjects.get(playwrightContext)?.enqueueLinks;
+        if (enqueueLinks === undefined) {
+            throw new Error('Logical error - `this.resultObjects` does not contain the result object');
+        }
+        return {
+            get response() {
+                return {
+                    url: originalResponse.url(),
+                    statusCode: originalResponse.status(),
+                    headers: originalResponse.headers(),
+                    trailers: {},
+                    complete: true,
+                    redirectUrls: [],
+                };
+            },
+            async querySelector(selector, timeoutMs = 5000) {
+                const locator = playwrightContext.page.locator(selector).first();
+                await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
+                const $ = await playwrightContext.parseWithCheerio();
+                return $(selector);
+            },
+            async enqueueLinks(options = {}, timeoutMs = 5000) {
+                const selector = options.selector ?? 'a';
+                const locator = playwrightContext.page.locator(selector).first();
+                await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
+                // TODO consider using `context.parseWithCheerio` to make this universal and avoid code duplication
+                const urls = options.urls ??
+                    (await extractUrlsFromPage(playwrightContext.page, selector, options.baseUrl ?? playwrightContext.request.loadedUrl));
+                await enqueueLinks({ ...options, urls });
+            },
+        };
+    }
+    async crawlOne(renderingType, context, useStateFunction) {
+        const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
+        const logs = [];
+        const deferredCleanup = [];
+        const resultBoundContextHelpers = {
+            addRequests: result.addRequests,
+            pushData: result.pushData,
+            useState: this.allowStorageAccess(useStateFunction),
+            getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
+            enqueueLinks: result.enqueueLinks,
+            log: this.createLogProxy(context.log, logs),
+            registerDeferredCleanup: (cleanup) => deferredCleanup.push(cleanup),
+        };
+        const subCrawlerContext = { ...context, ...resultBoundContextHelpers };
+        this.resultObjects.set(subCrawlerContext, result);
+        try {
+            const callAdaptiveRequestHandler = async () => {
+                if (renderingType === 'static') {
+                    await this.staticContextPipeline.call(subCrawlerContext, async (finalContext) => await this.requestHandler(finalContext));
+                }
+                else if (renderingType === 'clientOnly') {
+                    await this.browserContextPipeline.call(subCrawlerContext, async (finalContext) => await this.requestHandler(finalContext));
+                }
+            };
+            await addTimeoutToPromise(async () => withCheckedStorageAccess(() => {
+                if (this.preventDirectStorageAccess) {
+                    throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
+                }
+            }, callAdaptiveRequestHandler), this.individualRequestHandlerTimeoutMillis, 'Request handler timed out');
+            return { result, ok: true, logs };
+        }
+        catch (error) {
+            return { error, ok: false, logs };
+        }
+        finally {
+            await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
+        }
+    }
+    async runRequestHandler(crawlingContext) {
         const renderingTypePrediction = this.renderingTypePredictor.predict(crawlingContext.request);
         const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation;
         if (!shouldDetectRenderingType) {
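`crawlOne` above returns a discriminated `{ ok, result | error, logs }` value rather than throwing, and it routes every side-effecting context helper (`pushData`, `enqueueLinks`, `addRequests`, `useState`, `getKeyValueStore`) through a `RequestHandlerResult` buffer, so nothing reaches storage until `commitResult` is called. A conceptual sketch of that buffer-then-commit shape, with simplified names (this is an illustration, not the crawlee internals):

```ts
type RunOutcome<T> =
    | { ok: true; result: T; logs: unknown[] }
    | { ok: false; error: unknown; logs: unknown[] };

// Run a handler against a throwaway buffer; the caller decides whether to commit
// the buffered side effects or discard them (e.g. after a suspicious HTTP-only run).
async function runBuffered<T>(handler: () => Promise<T>): Promise<RunOutcome<T>> {
    const logs: unknown[] = [];
    try {
        return { ok: true, result: await handler(), logs };
    } catch (error) {
        return { ok: false, error, logs };
    }
}
```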
@@ -132,15 +303,19 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
         if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) {
             crawlingContext.log.debug(`Running HTTP-only request handler for ${crawlingContext.request.url}`);
             this.stats.trackHttpOnlyRequestHandlerRun();
-            const plainHTTPRun = await this.
+            const plainHTTPRun = await this.crawlOne('static', crawlingContext, crawlingContext.useState);
             if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) {
                 crawlingContext.log.debug(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`);
                 plainHTTPRun.logs?.forEach(([log, method, ...args]) => log[method](...args));
                 await this.commitResult(crawlingContext, plainHTTPRun.result);
                 return;
             }
+            // Execution will "fall through" and try running the request handler in a browser
             if (!plainHTTPRun.ok) {
-
+                const actualError = plainHTTPRun.error instanceof RequestHandlerError
+                    ? plainHTTPRun.error.cause
+                    : plainHTTPRun.error;
+                crawlingContext.log.exception(actualError, `HTTP-only request handler failed for ${crawlingContext.request.url}`);
             }
             else {
                 crawlingContext.log.warning(`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`);
@@ -153,14 +328,30 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
         // a rendering type detection if necessary. Without this measure, the HTTP request handler would run
         // under different conditions, which could change its behavior. Changes done to the crawler state by
         // the HTTP request handler will not be committed to the actual storage.
-        const
+        const stateTracker = {
+            stateCopy: null,
+            async getLiveState(defaultValue = {}) {
+                const state = await crawlingContext.useState(defaultValue);
+                if (this.stateCopy === null) {
+                    this.stateCopy = JSON.parse(JSON.stringify(state));
+                }
+                return state;
+            },
+            async getStateCopy(defaultValue = {}) {
+                if (this.stateCopy === null) {
+                    return defaultValue;
+                }
+                return this.stateCopy;
+            },
+        };
+        const browserRun = await this.crawlOne('clientOnly', crawlingContext, stateTracker.getLiveState.bind(stateTracker));
         if (!browserRun.ok) {
             throw browserRun.error;
         }
         await this.commitResult(crawlingContext, browserRun.result);
         if (shouldDetectRenderingType) {
             crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`);
-            const plainHTTPRun = await this.
+            const plainHTTPRun = await this.crawlOne('static', crawlingContext, stateTracker.getStateCopy.bind(stateTracker));
             const detectionResult = (() => {
                 if (!plainHTTPRun.ok) {
                     return 'clientOnly';
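Taken together, the two hunks above give the adaptive decision flow of `runRequestHandler`: predict the rendering type, prefer a cheap HTTP-only run when the prediction is `static`, fall back to (or verify against) a browser run, and occasionally replay the handler over HTTP against a copy of the pre-browser state to re-check the prediction. A simplified outline of that flow with the helpers passed in as stand-ins — an illustration of the control flow, not the actual crawlee implementation:

```ts
type Outcome<T> = { ok: true; result: T } | { ok: false; error: unknown };

async function adaptiveHandle<T>(
    prediction: { renderingType: 'static' | 'clientOnly'; detectionProbabilityRecommendation: number },
    crawlOne: (type: 'static' | 'clientOnly') => Promise<Outcome<T>>,
    resultChecker: (result: T) => boolean,
    resultComparator: (httpResult: T, browserResult: T) => boolean,
    commit: (result: T) => Promise<void>,
): Promise<'static' | 'clientOnly' | undefined> {
    const detect = Math.random() < prediction.detectionProbabilityRecommendation;

    // Predicted static and no detection scheduled: try the cheap HTTP-only run first.
    if (prediction.renderingType === 'static' && !detect) {
        const httpRun = await crawlOne('static');
        if (httpRun.ok && resultChecker(httpRun.result)) {
            await commit(httpRun.result);
            return undefined;
        }
        // Failed or suspicious result: fall through to the browser run below.
    }

    // The browser run is authoritative; its buffered result is what gets committed.
    const browserRun = await crawlOne('clientOnly');
    if (!browserRun.ok) throw browserRun.error;
    await commit(browserRun.result);

    // Detection: replay over HTTP against a state copy and compare the two results.
    if (detect) {
        const httpRun = await crawlOne('static');
        return httpRun.ok && resultComparator(httpRun.result, browserRun.result) ? 'static' : 'clientOnly';
    }
    return undefined;
}
```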
@@ -188,152 +379,6 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
     allowStorageAccess(func) {
         return async (...args) => withCheckedStorageAccess(() => { }, async () => func(...args));
     }
-    async runRequestHandlerInBrowser(crawlingContext) {
-        const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
-        let initialStateCopy;
-        try {
-            await super._runRequestHandler.call(new Proxy(this, {
-                get: (target, propertyName, receiver) => {
-                    if (propertyName === 'userProvidedRequestHandler') {
-                        return async (playwrightContext) => withCheckedStorageAccess(() => {
-                            if (this.preventDirectStorageAccess) {
-                                throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
-                            }
-                        }, () => this.adaptiveRequestHandler({
-                            id: crawlingContext.id,
-                            session: crawlingContext.session,
-                            proxyInfo: crawlingContext.proxyInfo,
-                            request: crawlingContext.request,
-                            response: {
-                                url: crawlingContext.response.url(),
-                                statusCode: crawlingContext.response.status(),
-                                headers: crawlingContext.response.headers(),
-                                trailers: {},
-                                complete: true,
-                                redirectUrls: [],
-                            },
-                            log: crawlingContext.log,
-                            page: crawlingContext.page,
-                            querySelector: async (selector, timeoutMs = 5_000) => {
-                                const locator = playwrightContext.page.locator(selector).first();
-                                await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
-                                const $ = await playwrightContext.parseWithCheerio();
-                                return $(selector);
-                            },
-                            async waitForSelector(selector, timeoutMs = 5_000) {
-                                const locator = playwrightContext.page.locator(selector).first();
-                                await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
-                            },
-                            async parseWithCheerio(selector, timeoutMs = 5_000) {
-                                if (selector) {
-                                    const locator = playwrightContext.page.locator(selector).first();
-                                    await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
-                                }
-                                return playwrightContext.parseWithCheerio();
-                            },
-                            async enqueueLinks(options = {}, timeoutMs = 5_000) {
-                                const selector = options.selector ?? 'a';
-                                const locator = playwrightContext.page.locator(selector).first();
-                                await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
-                                const urls = await extractUrlsFromPage(playwrightContext.page, selector, options.baseUrl ??
-                                    playwrightContext.request.loadedUrl ??
-                                    playwrightContext.request.url);
-                                await result.enqueueLinks({ ...options, urls });
-                            },
-                            addRequests: result.addRequests,
-                            pushData: result.pushData,
-                            useState: this.allowStorageAccess(async (defaultValue) => {
-                                const state = await result.useState(defaultValue);
-                                if (initialStateCopy === undefined) {
-                                    initialStateCopy = JSON.parse(JSON.stringify(state));
-                                }
-                                return state;
-                            }),
-                            getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
-                        }));
-                    }
-                    return Reflect.get(target, propertyName, receiver);
-                },
-            }), crawlingContext);
-            return { result: { result, ok: true }, initialStateCopy };
-        }
-        catch (error) {
-            return { result: { error, ok: false }, initialStateCopy };
-        }
-    }
-    async runRequestHandlerWithPlainHTTP(crawlingContext, oldStateCopy) {
-        const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
-        const logs = [];
-        const pageGotoOptions = { timeout: this.navigationTimeoutMillis }; // Irrelevant, but required by BrowserCrawler
-        try {
-            await withCheckedStorageAccess(() => {
-                if (this.preventDirectStorageAccess) {
-                    throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
-                }
-            }, async () => addTimeoutToPromise(async () => {
-                const hookContext = {
-                    id: crawlingContext.id,
-                    session: crawlingContext.session,
-                    proxyInfo: crawlingContext.proxyInfo,
-                    request: crawlingContext.request,
-                    log: this.createLogProxy(crawlingContext.log, logs),
-                };
-                await this._executeHooks(this.preNavigationHooks, {
-                    ...hookContext,
-                    get page() {
-                        throw new Error('Page object was used in HTTP-only pre-navigation hook');
-                    },
-                }, // This is safe because `executeHooks` just passes the context to the hooks which accept the partial context
-                pageGotoOptions);
-                const response = await crawlingContext.sendRequest({});
-                const loadedUrl = response.url;
-                crawlingContext.request.loadedUrl = loadedUrl;
-                const $ = load(response.body);
-                await this.adaptiveRequestHandler({
-                    ...hookContext,
-                    request: crawlingContext.request,
-                    response,
-                    get page() {
-                        throw new Error('Page object was used in HTTP-only request handler');
-                    },
-                    async querySelector(selector, _timeoutMs) {
-                        return $(selector);
-                    },
-                    async waitForSelector(selector, _timeoutMs) {
-                        if ($(selector).get().length === 0) {
-                            throw new Error(`Selector '${selector}' not found.`);
-                        }
-                    },
-                    async parseWithCheerio(selector, _timeoutMs) {
-                        if (selector && $(selector).get().length === 0) {
-                            throw new Error(`Selector '${selector}' not found.`);
-                        }
-                        return $;
-                    },
-                    async enqueueLinks(options = {}) {
-                        const urls = extractUrlsFromCheerio($, options.selector, options.baseUrl ?? loadedUrl);
-                        await result.enqueueLinks({ ...options, urls });
-                    },
-                    addRequests: result.addRequests,
-                    pushData: result.pushData,
-                    useState: async (defaultValue) => {
-                        // return the old state before the browser handler was executed
-                        // when rerunning the handler via HTTP for detection
-                        if (oldStateCopy !== undefined) {
-                            return oldStateCopy ?? defaultValue; // fallback to the default for `null`
-                        }
-                        return this.allowStorageAccess(result.useState)(defaultValue);
-                    },
-                    getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
-                });
-                await this._executeHooks(this.postNavigationHooks, crawlingContext, pageGotoOptions);
-            }, this.requestHandlerTimeoutInnerMillis, 'Request handler timed out'));
-            return { result, logs, ok: true };
-        }
-        catch (error) {
-            return { error, logs, ok: false };
-        }
-    }
     createLogProxy(log, logs) {
         return new Proxy(log, {
             get(target, propertyName, receiver) {
@@ -346,6 +391,12 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
            },
        });
    }
+    async teardown() {
+        await super.teardown();
+        for (const hook of this.teardownHooks) {
+            await hook();
+        }
+    }
 }
 export function createAdaptivePlaywrightRouter(routes) {
     return Router.create(routes);