@crawlee/browser 4.0.0-beta.4 → 4.0.0-beta.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/internals/browser-crawler.d.ts +78 -53
- package/internals/browser-crawler.d.ts.map +1 -1
- package/internals/browser-crawler.js +150 -189
- package/internals/browser-crawler.js.map +1 -1
- package/internals/browser-launcher.d.ts +7 -0
- package/internals/browser-launcher.d.ts.map +1 -1
- package/internals/browser-launcher.js +1 -0
- package/internals/browser-launcher.js.map +1 -1
- package/package.json +6 -6
- package/tsconfig.build.tsbuildinfo +0 -1
package/README.md
CHANGED
|
@@ -9,6 +9,10 @@
|
|
|
9
9
|
<small>A web scraping and browser automation library</small>
|
|
10
10
|
</h1>
|
|
11
11
|
|
|
12
|
+
<p align=center>
|
|
13
|
+
<a href="https://trendshift.io/repositories/5179" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5179" alt="apify%2Fcrawlee | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
|
14
|
+
</p>
|
|
15
|
+
|
|
12
16
|
<p align=center>
|
|
13
17
|
<a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/v/@crawlee/core.svg" alt="NPM latest version" data-canonical-src="https://img.shields.io/npm/v/@crawlee/core/next.svg" style="max-width: 100%;"></a>
|
|
14
18
|
<a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/dm/@crawlee/core.svg" alt="Downloads" data-canonical-src="https://img.shields.io/npm/dm/@crawlee/core.svg" style="max-width: 100%;"></a>
|
|
@@ -24,7 +28,7 @@ Crawlee is available as the [`crawlee`](https://www.npmjs.com/package/crawlee) N
|
|
|
24
28
|
|
|
25
29
|
> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev)** 👈
|
|
26
30
|
|
|
27
|
-
>
|
|
31
|
+
> Do you prefer 🐍 Python instead of JavaScript? [👉 Checkout Crawlee for Python 👈](https://github.com/apify/crawlee-python).
|
|
28
32
|
|
|
29
33
|
## Installation
|
|
30
34
|
|
|
@@ -85,7 +89,7 @@ By default, Crawlee stores data to `./storage` in the current working directory.
|
|
|
85
89
|
We provide automated beta builds for every merged code change in Crawlee. You can find them in the npm [list of releases](https://www.npmjs.com/package/crawlee?activeTab=versions). If you want to test new features or bug fixes before we release them, feel free to install a beta build like this:
|
|
86
90
|
|
|
87
91
|
```bash
|
|
88
|
-
npm install crawlee@
|
|
92
|
+
npm install crawlee@next
|
|
89
93
|
```
|
|
90
94
|
|
|
91
95
|
If you also use the [Apify SDK](https://github.com/apify/apify-sdk-js), you need to specify dependency overrides in your `package.json` file so that you don't end up with multiple versions of Crawlee installed:
|
|
@@ -94,9 +98,9 @@ If you also use the [Apify SDK](https://github.com/apify/apify-sdk-js), you need
|
|
|
94
98
|
{
|
|
95
99
|
"overrides": {
|
|
96
100
|
"apify": {
|
|
97
|
-
"@crawlee/core": "
|
|
98
|
-
"@crawlee/types": "
|
|
99
|
-
"@crawlee/utils": "
|
|
101
|
+
"@crawlee/core": "$crawlee",
|
|
102
|
+
"@crawlee/types": "$crawlee",
|
|
103
|
+
"@crawlee/utils": "$crawlee"
|
|
100
104
|
}
|
|
101
105
|
}
|
|
102
106
|
}
|
|
@@ -1,19 +1,38 @@
|
|
|
1
|
-
import type { Awaitable, BasicCrawlerOptions, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler,
|
|
2
|
-
import { BasicCrawler,
|
|
1
|
+
import type { Awaitable, BasicCrawlerOptions, BasicCrawlingContext, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler, LoadedRequest, Request, RequestHandler, RequestProvider, SkippedRequestCallback } from '@crawlee/basic';
|
|
2
|
+
import { BasicCrawler, ContextPipeline } from '@crawlee/basic';
|
|
3
3
|
import type { BrowserController, BrowserPlugin, BrowserPoolHooks, BrowserPoolOptions, CommonPage, InferBrowserPluginArray, LaunchContext } from '@crawlee/browser-pool';
|
|
4
4
|
import { BrowserPool } from '@crawlee/browser-pool';
|
|
5
|
+
import type { BatchAddRequestsResult } from '@crawlee/types';
|
|
5
6
|
import type { RobotsTxtFile } from '@crawlee/utils';
|
|
6
7
|
import type { ReadonlyDeep } from 'type-fest';
|
|
7
8
|
import type { BrowserLaunchContext } from './browser-launcher.js';
|
|
8
|
-
|
|
9
|
+
interface BaseResponse {
|
|
10
|
+
status(): number;
|
|
11
|
+
}
|
|
12
|
+
export interface BrowserCrawlingContext<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController = BrowserController, UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
|
|
13
|
+
/**
|
|
14
|
+
* An instance of the {@link BrowserController} that manages the browser instance and provides access to its API.
|
|
15
|
+
*/
|
|
9
16
|
browserController: ProvidedController;
|
|
17
|
+
/**
|
|
18
|
+
* The browser page object where the web page is loaded and rendered.
|
|
19
|
+
*/
|
|
10
20
|
page: Page;
|
|
11
|
-
|
|
21
|
+
/**
|
|
22
|
+
* The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
|
|
23
|
+
*/
|
|
24
|
+
request: LoadedRequest<Request<UserData>>;
|
|
25
|
+
/**
|
|
26
|
+
* The HTTP response object returned by the browser's navigation.
|
|
27
|
+
*/
|
|
28
|
+
response: Response;
|
|
29
|
+
/**
|
|
30
|
+
* Helper function for extracting URLs from the current page and adding them to the request queue.
|
|
31
|
+
*/
|
|
32
|
+
enqueueLinks: (options?: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>;
|
|
12
33
|
}
|
|
13
|
-
export type BrowserRequestHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = RequestHandler<Context>;
|
|
14
|
-
export type BrowserErrorHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = ErrorHandler<Context>;
|
|
15
34
|
export type BrowserHook<Context = BrowserCrawlingContext, GoToOptions extends Dictionary | undefined = Dictionary> = (crawlingContext: Context, gotoOptions: GoToOptions) => Awaitable<void>;
|
|
16
|
-
export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext = BrowserCrawlingContext, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
|
|
35
|
+
export interface BrowserCrawlerOptions<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions<Context, ExtendedContext>, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
|
|
17
36
|
launchContext?: BrowserLaunchContext<any, any>;
|
|
18
37
|
/**
|
|
19
38
|
* Function that is called to process each request.
|
|
@@ -42,7 +61,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
42
61
|
* The exceptions are logged to the request using the
|
|
43
62
|
* {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
|
|
44
63
|
*/
|
|
45
|
-
requestHandler?:
|
|
64
|
+
requestHandler?: RequestHandler<ExtendedContext>;
|
|
46
65
|
/**
|
|
47
66
|
* User-provided function that allows modifying the request object before it gets retried by the crawler.
|
|
48
67
|
* It's executed before each retry for the requests that failed less than {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
|
|
@@ -53,7 +72,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
53
72
|
* Second argument is the `Error` instance that
|
|
54
73
|
* represents the last error thrown during processing of the request.
|
|
55
74
|
*/
|
|
56
|
-
errorHandler?:
|
|
75
|
+
errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
57
76
|
/**
|
|
58
77
|
* A function to handle requests that failed more than `option.maxRequestRetries` times.
|
|
59
78
|
*
|
|
@@ -63,17 +82,12 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
63
82
|
* Second argument is the `Error` instance that
|
|
64
83
|
* represents the last error thrown during processing of the request.
|
|
65
84
|
*/
|
|
66
|
-
failedRequestHandler?:
|
|
85
|
+
failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
67
86
|
/**
|
|
68
87
|
* Custom options passed to the underlying {@link BrowserPool} constructor.
|
|
69
88
|
* We can tweak those to fine-tune browser management.
|
|
70
89
|
*/
|
|
71
90
|
browserPoolOptions?: Partial<BrowserPoolOptions> & Partial<BrowserPoolHooks<__BrowserControllerReturn, __LaunchContextReturn>>;
|
|
72
|
-
/**
|
|
73
|
-
* If set, the crawler will be configured for all connections to use
|
|
74
|
-
* the Proxy URLs provided and rotated according to the configuration.
|
|
75
|
-
*/
|
|
76
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
77
91
|
/**
|
|
78
92
|
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
79
93
|
* or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
|
|
@@ -119,8 +133,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
119
133
|
*/
|
|
120
134
|
navigationTimeoutSecs?: number;
|
|
121
135
|
/**
|
|
122
|
-
* Defines whether the cookies should be persisted for sessions.
|
|
123
|
-
* This can only be used when `useSessionPool` is set to `true`.
|
|
136
|
+
* Defines whether the cookies should be persisted for sessions. Enabled by default.
|
|
124
137
|
*/
|
|
125
138
|
persistCookiesPerSession?: boolean;
|
|
126
139
|
/**
|
|
@@ -178,21 +191,15 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
178
191
|
*
|
|
179
192
|
* @category Crawlers
|
|
180
193
|
*/
|
|
181
|
-
export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext = BrowserCrawlingContext, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context> {
|
|
182
|
-
readonly config: Configuration;
|
|
183
|
-
/**
|
|
184
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
185
|
-
* Only available if used by the crawler.
|
|
186
|
-
*/
|
|
187
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
194
|
+
export declare abstract class BrowserCrawler<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
|
|
188
195
|
/**
|
|
189
196
|
* A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
|
|
190
197
|
*/
|
|
191
198
|
browserPool: BrowserPool<InternalBrowserPoolOptions>;
|
|
192
199
|
launchContext: BrowserLaunchContext<LaunchOptions, unknown>;
|
|
193
|
-
protected
|
|
200
|
+
protected readonly ignoreShadowRoots: boolean;
|
|
201
|
+
protected readonly ignoreIframes: boolean;
|
|
194
202
|
protected navigationTimeoutMillis: number;
|
|
195
|
-
protected requestHandlerTimeoutInnerMillis: number;
|
|
196
203
|
protected preNavigationHooks: BrowserHook<Context>[];
|
|
197
204
|
protected postNavigationHooks: BrowserHook<Context>[];
|
|
198
205
|
protected persistCookiesPerSession: boolean;
|
|
@@ -213,14 +220,12 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
|
|
|
213
220
|
sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
214
221
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
215
222
|
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
216
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
217
|
-
useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
218
223
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
219
224
|
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
220
225
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
221
|
-
|
|
226
|
+
contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
222
227
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
223
|
-
|
|
228
|
+
extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
224
229
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
225
230
|
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
226
231
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -241,20 +246,36 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
|
|
|
241
246
|
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
242
247
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
243
248
|
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
249
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
250
|
+
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
244
251
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
245
252
|
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
246
253
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
247
254
|
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
248
255
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
249
256
|
statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
257
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
258
|
+
additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
259
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
260
|
+
ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
261
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
262
|
+
blockedStatusCodes: import("ow").ArrayPredicate<number>;
|
|
250
263
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
251
264
|
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
252
265
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
253
|
-
respectRobotsTxtFile: import("ow").
|
|
266
|
+
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
|
|
254
267
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
255
268
|
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
256
269
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
257
270
|
httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
271
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
272
|
+
configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
273
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
274
|
+
storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
275
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
276
|
+
eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
277
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
278
|
+
logger: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
258
279
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
259
280
|
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
260
281
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -263,42 +284,39 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
|
|
|
263
284
|
maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
264
285
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
265
286
|
keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
266
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
267
|
-
log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
268
287
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
269
288
|
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
270
289
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
271
290
|
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
291
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
292
|
+
id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
272
293
|
};
|
|
273
294
|
/**
|
|
274
295
|
* All `BrowserCrawler` parameters are passed via an options object.
|
|
275
296
|
*/
|
|
276
|
-
protected constructor(options
|
|
277
|
-
|
|
297
|
+
protected constructor(options: BrowserCrawlerOptions<Page, Response, ProvidedController, Context, ContextExtension, ExtendedContext> & {
|
|
298
|
+
contextPipelineBuilder: () => ContextPipeline<CrawlingContext, Context>;
|
|
299
|
+
});
|
|
300
|
+
protected buildContextPipeline(): ContextPipeline<CrawlingContext, BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>>;
|
|
278
301
|
private containsSelectors;
|
|
279
|
-
protected isRequestBlocked(crawlingContext:
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
protected
|
|
285
|
-
protected _handleNavigation(crawlingContext: Context): Promise<void>;
|
|
286
|
-
protected _applyCookies({ session, request, page, browserController }: Context, preHooksCookies: string, postHooksCookies: string): Promise<void>;
|
|
302
|
+
protected isRequestBlocked(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>): Promise<string | false>;
|
|
303
|
+
private preparePage;
|
|
304
|
+
private performNavigation;
|
|
305
|
+
private handleBlockedRequestByContent;
|
|
306
|
+
private restoreRequestState;
|
|
307
|
+
protected _applyCookies({ session, request, page, browserController }: BrowserCrawlingContext, preHooksCookies: string, postHooksCookies: string): Promise<void>;
|
|
287
308
|
/**
|
|
288
309
|
* Marks session bad in case of navigation timeout.
|
|
289
310
|
*/
|
|
290
|
-
protected _handleNavigationTimeout(crawlingContext:
|
|
311
|
+
protected _handleNavigationTimeout(crawlingContext: BrowserCrawlingContext, error: Error): Promise<void>;
|
|
291
312
|
/**
|
|
292
313
|
* Transforms proxy-related errors to `SessionError`.
|
|
293
314
|
*/
|
|
294
315
|
protected _throwIfProxyError(error: Error): void;
|
|
295
|
-
protected abstract _navigationHandler(crawlingContext:
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
protected _responseHandler(crawlingContext: Context): Promise<void>;
|
|
300
|
-
protected _extendLaunchContext(_pageId: string, launchContext: LaunchContext): Promise<void>;
|
|
301
|
-
protected _maybeAddSessionRetiredListener(_pageId: string, browserController: Context['browserController']): void;
|
|
316
|
+
protected abstract _navigationHandler(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>, gotoOptions: GoToOptions): Promise<Context['response'] | null | undefined>;
|
|
317
|
+
private processResponse;
|
|
318
|
+
private browserSessionIds;
|
|
319
|
+
private addSessionRetiredListener;
|
|
302
320
|
/**
|
|
303
321
|
* Function for cleaning up after all requests are processed.
|
|
304
322
|
* @ignore
|
|
@@ -316,8 +334,15 @@ interface EnqueueLinksInternalOptions {
|
|
|
316
334
|
finalRequestUrl?: string;
|
|
317
335
|
}
|
|
318
336
|
/** @internal */
|
|
319
|
-
|
|
320
|
-
|
|
337
|
+
interface BoundEnqueueLinksInternalOptions {
|
|
338
|
+
enqueueLinks: BasicCrawlingContext['enqueueLinks'];
|
|
339
|
+
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
|
|
340
|
+
originalRequestUrl: string;
|
|
341
|
+
finalRequestUrl?: string;
|
|
342
|
+
page: CommonPage;
|
|
343
|
+
}
|
|
344
|
+
/** @internal */
|
|
345
|
+
export declare function browserCrawlerEnqueueLinks(options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions): Promise<unknown>;
|
|
321
346
|
/**
|
|
322
347
|
* Extracts URLs from a given page.
|
|
323
348
|
* @ignore
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"browser-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/browser-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,SAAS,EACT,mBAAmB,EACnB,eAAe,EACf,UAAU,EACV,mBAAmB,EACnB,YAAY,EACZ,aAAa,EACb,
|
|
1
|
+
{"version":3,"file":"browser-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/browser-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,SAAS,EACT,mBAAmB,EACnB,oBAAoB,EACpB,eAAe,EACf,UAAU,EACV,mBAAmB,EACnB,YAAY,EACZ,aAAa,EACb,OAAO,EACP,cAAc,EACd,eAAe,EAEf,sBAAsB,EACzB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACH,YAAY,EACZ,eAAe,EAUlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,KAAK,EACR,iBAAiB,EACjB,aAAa,EACb,gBAAgB,EAChB,kBAAkB,EAClB,UAAU,EACV,uBAAuB,EACvB,aAAa,EAChB,MAAM,uBAAuB,CAAC;AAC/B,OAAO,EAA6B,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAC/E,OAAO,KAAK,EAAE,sBAAsB,EAA0B,MAAM,gBAAgB,CAAC;AACrF,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAGpD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAI9C,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAElE,UAAU,YAAY;IAClB,MAAM,IAAI,MAAM,CAAC;CACpB;AAID,MAAM,WAAW,sBAAsB,CACnC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,GAAG,iBAAiB,EACtC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAC1C,SAAQ,eAAe,CAAC,QAAQ,CAAC;IAC/B;;OAEG;IACH,iBAAiB,EAAE,kBAAkB,CAAC;IAEtC;;OAEG;IACH,IAAI,EAAE,IAAI,CAAC;IAEX;;OAEG;IACH,OAAO,EAAE,aAAa,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;IAE1C;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;OAEG;IACH,YAAY,EAAE,CAAC,OAAO,CAAC,EAAE,mBAAmB,KAAK,OAAO,CAAC,sBAAsB,CAAC,CAAC;CACpF;AAED,MAAM,MAAM,WAAW,CAAC,OAAO,GAAG,sBAAsB,EAAE,WAAW,SAAS,UAAU,GAAG,SAAS,GAAG,UAAU,IAAI,CACjH,eAAe,EAAE,OAAO,EACxB,WAAW,EAAE,WAAW,KACvB,SAAS,CAAC,IAAI,CAAC,CAAC;AAErB,MAAM,WAAW,qBAAqB,CAClC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,SAAS,iBAAiB,GAAG,iBAAiB,EAChE,OAAO,SAAS,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,GAAG,sBAAsB,CAC3G,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,UAAU,CACb,EACD,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,EAC5D,0BAA0B,SAAS,kBAAkB,GAAG,kBAAkB,EAC1E,gBAAgB,SAAS,aAAa,EAAE,GAAG,uBAAuB,CAAC,0BAA0B,CAAC,gBAAgB,CAAC,CAAC,EAChH,yBAAyB,SAAS,iBAAiB,GAAG,UAAU,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC,EAC9G,qBAAqB,SAAS,aAAa,GAAG,UAAU,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAC3G,SAAQ,IAAI,CACN,mBAAmB,CAAC,OAAO,EAAE,eAAe,CAAC,EAE7C,gBAAgB,GAAG,sBAAsB,GAAG,cAAc,CAC7D;IACD,aAAa,CAAC,EAAE,oBAAoB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAE/C;;;;;;;;;;;;;;;;;;;;;;;;;;OA0BG;IACH,cAAc,CAAC,EAAE,cAAc,CAAC,eAAe,CAAC,CAAC;IAEjD;;;;;;;;;OASG;IACH,YAAY,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAE9D;;;;;;;;OAQG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAEtE;;;OAGG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC,kBAAkB,CAAC,GAC5C,OAAO,CAAC,gBAAgB,CAAC,yBAAyB,EAAE,qBAAqB,CAAC,CAAC,CAAC;IAEhF;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,kBAAkB,CAAC,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IAE5C;;;;;;;;;;;;;;;;OAgBG;IACH,mBAAmB,CAAC,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IAE7C;;OAEG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IAEnC;;;OAGG;IACH,QAAQ,CAAC,EAAE,OAAO,GAAG,KAAK,GAAG,KAAK,CAAC;IAEnC;;;OAGG;IACH,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAE5B;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CAC3B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AACH,8BAAsB,cAAc,CAChC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,SAAS,iBAAiB,GAAG,iBAAiB,EAChE,0BAA0B,SAAS,kBAAkB,GAAG,kBAAkB,EAC1E,aAAa,SAAS,UAAU,GAAG,SAAS,GAAG,UAAU,EACzD,OAAO,SAAS,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,GAAG,sBAAsB,CAC3G,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,UAAU,CACb,EACD,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,EAC5D,WAAW,SAAS,UAAU,GAAG,UAAU,CAC7C,SAAQ,YAAY,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IAC9D;;OAEG;IACH,WAAW,EAAE,WAAW,CAAC,0BAA0B,CAAC,CAAC;IAErD,aAAa,EAAE,oBAAoB,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;IAE5D,SAAS,CAAC,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IAC9C,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAE1C,SAAS,CAAC,uBAAuB,EAAE,MAAM,CAAC;IAC1C,SAAS,CAAC,kBAAkB,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IACrD,SAAS,CAAC,mBAAmB,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IACtD,SAAS,CAAC,wBAAwB,EAAE,OAAO,CAAC;IAE5C,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAapC;IAEF;;OAEG;IACH,SAAS,aACL,OAAO,EAAE,qBAAqB,CAC1B,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,OAAO,EACP,gBAAgB,EAChB,eAAe,CAClB,GAAG;QACA,sBAAsB,EAAE,MAAM,eAAe,CAAC,eAAe,EAAE,OAAO,CAAC,CAAC;KAC3E;cAuDc,oBAAoB,IAAI,eAAe,CACtD,eAAe,EACf,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,CACzE;YAgBa,iBAAiB;cASf,gBAAgB,CAC5B,eAAe,EAAE,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,CAAC,GAC5E,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC;YAuBZ,WAAW;YAmDX,iBAAiB;YA0EjB,6BAA6B;YAW7B,mBAAmB;cAKjB,aAAa,CACzB,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,iBAAiB,EAAE,EAAE,sBAAsB,EACrE,eAAe,EAAE,MAAM,EACvB,gBAAgB,EAAE,MAAM;IAc5B;;OAEG;cACa,wBAAwB,CAAC,eAAe,EAAE,sBAAsB,EAAE,KAAK,EAAE,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9G;;OAEG;IACH,SAAS,CAAC,kBAAkB,CAAC,KAAK,EAAE,KAAK;IAMzC,SAAS,CAAC,QAAQ,CAAC,kBAAkB,CACjC,eAAe,EAAE,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,CAAC,EAC3E,WAAW,EAAE,WAAW,GACzB,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI,GAAG,SAAS,CAAC;YAEpC,eAAe;IA+B7B,OAAO,CAAC,iBAAiB,CAA4D;IAErF,OAAO,CAAC,yBAAyB;IA+BjC;;;OAGG;IACY,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC;CAI3C;AAED,gBAAgB;AAChB,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,YAAY,CAAC,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC,GAAG,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC;IAC9G,IAAI,EAAE,UAAU,CAAC;IACjB,YAAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAC1C,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,gBAAgB;AAChB,UAAU,gCAAgC;IACtC,YAAY,EAAE,oBAAoB,CAAC,cAAc,CAAC,CAAC;IACnD,OAAO,CAAC,EAAE,YAAY,CAAC,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC,GAAG,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC;IAC9G,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,IAAI,EAAE,UAAU,CAAC;CACpB;AASD,gBAAgB;AAChB,wBAAsB,0BAA0B,CAC5C,OAAO,EAAE,2BAA2B,GAAG,gCAAgC,oBAiC1E;AAED;;;GAGG;AACH,wBAAsB,mBAAmB,CAErC,IAAI,EAAE;IAAE,MAAM,EAAE,QAAQ,CAAA;CAAE,EAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GAChB,OAAO,CAAC,MAAM,EAAE,CAAC,CA0BnB"}
|