@crawlee/http 4.0.0-beta.4 → 4.0.0-beta.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/internals/file-download.d.ts +58 -32
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +116 -73
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +92 -175
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +169 -321
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +14 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +71 -0
- package/internals/utils.js.map +1 -0
- package/package.json +7 -7
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,25 +1,13 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
import
|
|
3
|
-
import type {
|
|
4
|
-
import { BasicCrawler, Configuration, CrawlerExtension } from '@crawlee/basic';
|
|
5
|
-
import type { HttpResponse } from '@crawlee/core';
|
|
1
|
+
import type { BasicCrawlerOptions, CrawlingContext, ErrorHandler, GetUserDataFromRequest, Request as CrawleeRequest, RequestHandler, RequireContextPipeline, RouterRoutes, Session } from '@crawlee/basic';
|
|
2
|
+
import { BasicCrawler, ContextPipeline } from '@crawlee/basic';
|
|
3
|
+
import type { LoadedRequest } from '@crawlee/core';
|
|
6
4
|
import type { Awaitable, Dictionary } from '@crawlee/types';
|
|
7
5
|
import { type CheerioRoot } from '@crawlee/utils';
|
|
8
6
|
import type { RequestLike, ResponseLike } from 'content-type';
|
|
9
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
10
|
-
import type { Method, OptionsInit } from 'got-scraping';
|
|
11
|
-
import { ObjectPredicate } from 'ow';
|
|
12
7
|
import type { JsonValue } from 'type-fest';
|
|
13
|
-
/**
|
|
14
|
-
* TODO exists for BC within HttpCrawler - replace completely with StreamingHttpResponse in 4.0
|
|
15
|
-
* @internal
|
|
16
|
-
*/
|
|
17
|
-
export type PlainResponse = Omit<HttpResponse, 'body'> & IncomingMessage & {
|
|
18
|
-
body?: unknown;
|
|
19
|
-
};
|
|
20
8
|
export type HttpErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
21
9
|
JSONData extends JsonValue = any> = ErrorHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
22
|
-
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext> extends BasicCrawlerOptions<Context> {
|
|
10
|
+
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> {
|
|
23
11
|
/**
|
|
24
12
|
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
|
|
25
13
|
*/
|
|
@@ -28,20 +16,14 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
28
16
|
* If set to true, SSL certificate errors will be ignored.
|
|
29
17
|
*/
|
|
30
18
|
ignoreSslErrors?: boolean;
|
|
31
|
-
/**
|
|
32
|
-
* If set, this crawler will be configured for all connections to use
|
|
33
|
-
* [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
|
|
34
|
-
* For more information, see the [documentation](https://docs.apify.com/proxy).
|
|
35
|
-
*/
|
|
36
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
37
19
|
/**
|
|
38
20
|
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
39
|
-
* or browser properties before navigation. The function accepts
|
|
40
|
-
* which
|
|
21
|
+
* or browser properties before navigation. The function accepts one parameter `crawlingContext`,
|
|
22
|
+
* which is passed to the `requestAsBrowser()` function the crawler calls to navigate.
|
|
41
23
|
* Example:
|
|
42
24
|
* ```
|
|
43
25
|
* preNavigationHooks: [
|
|
44
|
-
* async (crawlingContext
|
|
26
|
+
* async (crawlingContext) => {
|
|
45
27
|
* // ...
|
|
46
28
|
* },
|
|
47
29
|
* ]
|
|
@@ -50,7 +32,7 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
50
32
|
* Modyfing `pageOptions` is supported only in Playwright incognito.
|
|
51
33
|
* See {@link PrePageCreateHook}
|
|
52
34
|
*/
|
|
53
|
-
preNavigationHooks?: InternalHttpHook<
|
|
35
|
+
preNavigationHooks?: InternalHttpHook<CrawlingContext>[];
|
|
54
36
|
/**
|
|
55
37
|
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
|
|
56
38
|
* The function accepts `crawlingContext` as the only parameter.
|
|
@@ -63,10 +45,11 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
63
45
|
* ]
|
|
64
46
|
* ```
|
|
65
47
|
*/
|
|
66
|
-
postNavigationHooks?:
|
|
48
|
+
postNavigationHooks?: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
67
49
|
/**
|
|
68
50
|
* An array of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types)
|
|
69
|
-
* you want the crawler to load and process. By default, only `text/html
|
|
51
|
+
* you want the crawler to load and process. By default, only `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
|
|
52
|
+
* and `application/json` MIME types are supported.
|
|
70
53
|
*/
|
|
71
54
|
additionalMimeTypes?: string[];
|
|
72
55
|
/**
|
|
@@ -92,35 +75,34 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
92
75
|
*/
|
|
93
76
|
forceResponseEncoding?: string;
|
|
94
77
|
/**
|
|
95
|
-
* Automatically saves cookies to Session.
|
|
78
|
+
* Automatically saves cookies to Session. Enabled by default.
|
|
96
79
|
*
|
|
97
80
|
* It parses cookie from response "set-cookie" header saves or updates cookies for session and once the session is used for next request.
|
|
98
81
|
* It passes the "Cookie" header to the request with the session cookies.
|
|
99
82
|
*/
|
|
100
83
|
persistCookiesPerSession?: boolean;
|
|
101
|
-
/**
|
|
102
|
-
* An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
|
|
103
|
-
* By default, status codes >= 500 trigger errors.
|
|
104
|
-
*/
|
|
105
|
-
ignoreHttpErrorStatusCodes?: number[];
|
|
106
|
-
/**
|
|
107
|
-
* An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
|
|
108
|
-
* By default, status codes >= 500 trigger errors.
|
|
109
|
-
*/
|
|
110
|
-
additionalHttpErrorStatusCodes?: number[];
|
|
111
84
|
}
|
|
112
85
|
/**
|
|
113
86
|
* @internal
|
|
114
87
|
*/
|
|
115
|
-
export type InternalHttpHook<Context> = (crawlingContext: Context
|
|
88
|
+
export type InternalHttpHook<Context> = (crawlingContext: Context) => Awaitable<void>;
|
|
116
89
|
export type HttpHook<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
117
90
|
JSONData extends JsonValue = any> = InternalHttpHook<HttpCrawlingContext<UserData, JSONData>>;
|
|
91
|
+
interface CrawlingContextWithReponse<UserData extends Dictionary = any> extends CrawlingContext<UserData> {
|
|
92
|
+
/**
|
|
93
|
+
* The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
|
|
94
|
+
*/
|
|
95
|
+
request: LoadedRequest<CrawleeRequest<UserData>>;
|
|
96
|
+
/**
|
|
97
|
+
* The HTTP response object containing status code, headers, and other response metadata.
|
|
98
|
+
*/
|
|
99
|
+
response: Response;
|
|
100
|
+
}
|
|
118
101
|
/**
|
|
119
102
|
* @internal
|
|
120
103
|
*/
|
|
121
104
|
export interface InternalHttpCrawlingContext<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
122
|
-
JSONData extends JsonValue = any
|
|
123
|
-
Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
105
|
+
JSONData extends JsonValue = any> extends CrawlingContextWithReponse<UserData> {
|
|
124
106
|
/**
|
|
125
107
|
* The request body of the web page.
|
|
126
108
|
* The type depends on the `Content-Type` header of the web page:
|
|
@@ -139,7 +121,6 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
139
121
|
type: string;
|
|
140
122
|
encoding: BufferEncoding;
|
|
141
123
|
};
|
|
142
|
-
response: PlainResponse;
|
|
143
124
|
/**
|
|
144
125
|
* Wait for an element matching the selector to appear. Timeout is ignored.
|
|
145
126
|
*
|
|
@@ -167,7 +148,7 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
167
148
|
*/
|
|
168
149
|
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
|
|
169
150
|
}
|
|
170
|
-
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData
|
|
151
|
+
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData> {
|
|
171
152
|
}
|
|
172
153
|
export type HttpRequestHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
173
154
|
JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
@@ -192,18 +173,18 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
192
173
|
*
|
|
193
174
|
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
194
175
|
*
|
|
195
|
-
* We can use the `preNavigationHooks` to adjust
|
|
176
|
+
* We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
|
|
196
177
|
*
|
|
197
178
|
* ```javascript
|
|
198
179
|
* preNavigationHooks: [
|
|
199
|
-
* (crawlingContext
|
|
180
|
+
* (crawlingContext) => {
|
|
200
181
|
* // ...
|
|
201
182
|
* },
|
|
202
183
|
* ]
|
|
203
184
|
* ```
|
|
204
185
|
*
|
|
205
|
-
* By default, this crawler only processes web pages with the `text/html`
|
|
206
|
-
* and `application/
|
|
186
|
+
* By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
|
|
187
|
+
* and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
207
188
|
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
208
189
|
* use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
|
|
209
190
|
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
@@ -238,23 +219,14 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
238
219
|
* ```
|
|
239
220
|
* @category Crawlers
|
|
240
221
|
*/
|
|
241
|
-
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any,
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
245
|
-
* Only available if used by the crawler.
|
|
246
|
-
*/
|
|
247
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
248
|
-
protected userRequestHandlerTimeoutMillis: number;
|
|
249
|
-
protected preNavigationHooks: InternalHttpHook<Context>[];
|
|
250
|
-
protected postNavigationHooks: InternalHttpHook<Context>[];
|
|
222
|
+
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any> = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
|
|
223
|
+
protected preNavigationHooks: InternalHttpHook<CrawlingContext>[];
|
|
224
|
+
protected postNavigationHooks: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
251
225
|
protected persistCookiesPerSession: boolean;
|
|
252
226
|
protected navigationTimeoutMillis: number;
|
|
253
227
|
protected ignoreSslErrors: boolean;
|
|
254
228
|
protected suggestResponseEncoding?: string;
|
|
255
229
|
protected forceResponseEncoding?: string;
|
|
256
|
-
protected additionalHttpErrorStatusCodes: Set<number>;
|
|
257
|
-
protected ignoreHttpErrorStatusCodes: Set<number>;
|
|
258
230
|
protected readonly supportedMimeTypes: Set<string>;
|
|
259
231
|
protected static optionsShape: {
|
|
260
232
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -267,18 +239,16 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
267
239
|
suggestResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
268
240
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
269
241
|
forceResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
270
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
271
|
-
proxyConfiguration: ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
272
242
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
273
243
|
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
274
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
275
|
-
additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
276
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
277
|
-
ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
278
244
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
279
245
|
preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
280
246
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
281
247
|
postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
248
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
249
|
+
contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
250
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
251
|
+
extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
282
252
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
283
253
|
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
284
254
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -299,24 +269,40 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
299
269
|
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
300
270
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
301
271
|
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
272
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
273
|
+
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
302
274
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
303
275
|
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
304
276
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
305
277
|
sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
306
278
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
307
|
-
|
|
279
|
+
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
308
280
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
309
281
|
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
310
282
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
311
283
|
statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
284
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
285
|
+
additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
286
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
287
|
+
ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
288
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
289
|
+
blockedStatusCodes: import("ow").ArrayPredicate<number>;
|
|
312
290
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
313
291
|
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
314
292
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
315
|
-
respectRobotsTxtFile: import("ow").
|
|
293
|
+
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
|
|
316
294
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
317
295
|
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
318
296
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
319
297
|
httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
298
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
299
|
+
configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
300
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
301
|
+
storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
302
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
303
|
+
eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
304
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
305
|
+
logger: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
320
306
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
321
307
|
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
322
308
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -325,141 +311,72 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
325
311
|
maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
326
312
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
327
313
|
keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
328
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
329
|
-
log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
330
314
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
331
315
|
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
332
316
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
333
317
|
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
318
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
319
|
+
id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
334
320
|
};
|
|
335
321
|
/**
|
|
336
322
|
* All `HttpCrawlerOptions` parameters are passed via an options object.
|
|
337
323
|
*/
|
|
338
|
-
constructor(options?: HttpCrawlerOptions<Context
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
use(extension: CrawlerExtension): void;
|
|
345
|
-
/**
|
|
346
|
-
* Wrapper around requestHandler that opens and closes pages etc.
|
|
347
|
-
*/
|
|
348
|
-
protected _runRequestHandler(crawlingContext: Context): Promise<void>;
|
|
349
|
-
protected isRequestBlocked(crawlingContext: Context): Promise<string | false>;
|
|
350
|
-
protected _handleNavigation(crawlingContext: Context): Promise<void>;
|
|
324
|
+
constructor(options?: HttpCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<InternalHttpCrawlingContext, Context>);
|
|
325
|
+
protected buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext>;
|
|
326
|
+
private makeHttpRequest;
|
|
327
|
+
private processHttpResponse;
|
|
328
|
+
private handleBlockedRequestByContent;
|
|
329
|
+
protected isRequestBlocked(crawlingContext: InternalHttpCrawlingContext): Promise<string | false>;
|
|
351
330
|
/**
|
|
352
|
-
*
|
|
331
|
+
* Returns the `Cookie` header value based on the current context and
|
|
332
|
+
* any changes that occurred in the navigation hooks.
|
|
353
333
|
*/
|
|
354
|
-
protected _applyCookies({ session, request }: CrawlingContext,
|
|
334
|
+
protected _applyCookies({ session, request }: CrawlingContext, preHookCookies: string, postHookCookies: string): string;
|
|
355
335
|
/**
|
|
356
336
|
* Function to make the HTTP request. It performs optimizations
|
|
357
337
|
* on the request such as only downloading the request body if the
|
|
358
338
|
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
359
339
|
*/
|
|
360
|
-
protected _requestFunction({ request, session, proxyUrl,
|
|
340
|
+
protected _requestFunction({ request, session, proxyUrl, cookieString, }: RequestFunctionOptions): Promise<Response>;
|
|
361
341
|
/**
|
|
362
342
|
* Encodes and parses response according to the provided content type
|
|
363
343
|
*/
|
|
364
|
-
protected _parseResponse(request:
|
|
365
|
-
|
|
366
|
-
response: IncomingMessage;
|
|
344
|
+
protected _parseResponse(request: CrawleeRequest, response: Response): Promise<{
|
|
345
|
+
response: Response;
|
|
367
346
|
contentType: {
|
|
368
347
|
type: string;
|
|
369
348
|
encoding: BufferEncoding;
|
|
370
349
|
};
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
350
|
+
body: string;
|
|
351
|
+
} | {
|
|
352
|
+
body: Buffer<ArrayBuffer>;
|
|
353
|
+
response: Response;
|
|
374
354
|
contentType: {
|
|
375
355
|
type: string;
|
|
376
356
|
encoding: BufferEncoding;
|
|
377
357
|
};
|
|
378
|
-
enqueueLinks: () => Promise<{
|
|
379
|
-
processedRequests: never[];
|
|
380
|
-
unprocessedRequests: never[];
|
|
381
|
-
}>;
|
|
382
358
|
}>;
|
|
383
|
-
protected _parseHTML(response: IncomingMessage, _isXml: boolean, _crawlingContext: Context): Promise<Partial<Context>>;
|
|
384
359
|
/**
|
|
385
360
|
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
386
361
|
*/
|
|
387
|
-
protected _getRequestOptions(request:
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
402
|
-
timeout?: import("got-scraping").Delays | undefined;
|
|
403
|
-
prefixUrl?: string | URL | undefined;
|
|
404
|
-
form?: Record<string, any> | undefined;
|
|
405
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
406
|
-
cookieJar?: import("got-scraping").PromiseCookieJar | import("got-scraping").ToughCookieJar | undefined;
|
|
407
|
-
signal?: AbortSignal | undefined;
|
|
408
|
-
ignoreInvalidCookies?: boolean | undefined;
|
|
409
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
410
|
-
searchParams?: string | import("got-scraping").SearchParameters | URLSearchParams | undefined;
|
|
411
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
412
|
-
dnsLookup?: import("cacheable-lookup").default["lookup"] | undefined;
|
|
413
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
414
|
-
dnsCache?: import("cacheable-lookup").default | boolean | undefined;
|
|
415
|
-
context?: Record<string, unknown> | undefined;
|
|
416
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
417
|
-
followRedirect?: boolean | ((response: import("got-scraping").PlainResponse) => boolean) | undefined;
|
|
418
|
-
maxRedirects?: number | undefined;
|
|
419
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
420
|
-
cache?: string | import("cacheable-request").StorageAdapter | boolean | undefined;
|
|
421
|
-
throwHttpErrors?: boolean | undefined;
|
|
422
|
-
username?: string | undefined;
|
|
423
|
-
password?: string | undefined;
|
|
424
|
-
http2?: boolean | undefined;
|
|
425
|
-
allowGetBody?: boolean | undefined;
|
|
426
|
-
methodRewriting?: boolean | undefined;
|
|
427
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
428
|
-
dnsLookupIpVersion?: import("got-scraping").DnsLookupIpVersion;
|
|
429
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
430
|
-
parseJson?: import("got-scraping").ParseJsonFunction | undefined;
|
|
431
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
432
|
-
stringifyJson?: import("got-scraping").StringifyJsonFunction | undefined;
|
|
433
|
-
localAddress?: string | undefined;
|
|
434
|
-
method?: Method | undefined;
|
|
435
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
436
|
-
createConnection?: import("got-scraping").CreateConnectionFunction | undefined;
|
|
437
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
438
|
-
cacheOptions?: import("got-scraping").CacheOptions | undefined;
|
|
439
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
440
|
-
https?: import("got-scraping").HttpsOptions | undefined;
|
|
441
|
-
encoding?: BufferEncoding | undefined;
|
|
442
|
-
resolveBodyOnly?: boolean | undefined;
|
|
443
|
-
isStream?: boolean | undefined;
|
|
444
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
445
|
-
responseType?: import("got-scraping").ResponseType | undefined;
|
|
446
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
447
|
-
pagination?: import("got-scraping").PaginationOptions<unknown, unknown> | undefined;
|
|
448
|
-
setHost?: boolean | undefined;
|
|
449
|
-
maxHeaderSize?: number | undefined;
|
|
450
|
-
enableUnixSockets?: boolean | undefined;
|
|
451
|
-
} & {
|
|
452
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
453
|
-
hooks?: Partial<import("got-scraping").Hooks>;
|
|
454
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
455
|
-
retry?: Partial<import("got-scraping").RetryOptions>;
|
|
456
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
457
|
-
} & import("got-scraping").Context & Required<Pick<OptionsInit, "url">> & {
|
|
458
|
-
isStream: true;
|
|
362
|
+
protected _getRequestOptions(request: CrawleeRequest, session: Session, proxyUrl?: string): {
|
|
363
|
+
url: string;
|
|
364
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
365
|
+
method: import("@crawlee/types").AllowedHttpMethods;
|
|
366
|
+
proxyUrl: string | undefined;
|
|
367
|
+
timeout: number;
|
|
368
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
369
|
+
cookieJar: import("tough-cookie").CookieJar | undefined;
|
|
370
|
+
sessionToken: Session;
|
|
371
|
+
headers: Record<string, string> | undefined;
|
|
372
|
+
https: {
|
|
373
|
+
rejectUnauthorized: boolean;
|
|
374
|
+
};
|
|
375
|
+
body: string | undefined;
|
|
459
376
|
};
|
|
460
|
-
protected _encodeResponse(request:
|
|
377
|
+
protected _encodeResponse(request: CrawleeRequest, response: Response, encoding: BufferEncoding): {
|
|
461
378
|
encoding: BufferEncoding;
|
|
462
|
-
response:
|
|
379
|
+
response: Response;
|
|
463
380
|
};
|
|
464
381
|
/**
|
|
465
382
|
* Checks and extends supported mime types
|
|
@@ -468,7 +385,7 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
468
385
|
/**
|
|
469
386
|
* Handles timeout request
|
|
470
387
|
*/
|
|
471
|
-
protected _handleRequestTimeout(session
|
|
388
|
+
protected _handleRequestTimeout(session: Session): void;
|
|
472
389
|
private _abortDownloadOfBody;
|
|
473
390
|
/**
|
|
474
391
|
* @internal wraps public utility for mocking purposes
|
|
@@ -476,10 +393,10 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
476
393
|
private _requestAsBrowser;
|
|
477
394
|
}
|
|
478
395
|
interface RequestFunctionOptions {
|
|
479
|
-
request:
|
|
480
|
-
session
|
|
396
|
+
request: CrawleeRequest;
|
|
397
|
+
session: Session;
|
|
481
398
|
proxyUrl?: string;
|
|
482
|
-
|
|
399
|
+
cookieString?: string;
|
|
483
400
|
}
|
|
484
401
|
/**
|
|
485
402
|
* Creates new {@link Router} instance that works based on request labels.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAER,mBAAmB,EACnB,eAAe,EACf,YAAY,EACZ,sBAAsB,EACtB,OAAO,IAAI,cAAc,EACzB,cAAc,EACd,sBAAsB,EACtB,YAAY,EACZ,OAAO,EACV,MAAM,gBAAgB,CAAC;AACxB,OAAO,EAAE,YAAY,EAAE,eAAe,EAAoD,MAAM,gBAAgB,CAAC;AACjH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAEnD,OAAO,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5D,OAAO,EAAE,KAAK,WAAW,EAAuB,MAAM,gBAAgB,CAAC;AAEvE,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAI9D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAsB3C,MAAM,MAAM,gBAAgB,CACxB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,YAAY,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE1D,MAAM,WAAW,kBAAkB,CAC/B,OAAO,SAAS,2BAA2B,GAAG,2BAA2B,EACzE,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,mBAAmB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IACrE;;OAEG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B;;;;;;;;;;;;;;;OAeG;IACH,kBAAkB,CAAC,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAEzD;;;;;;;;;;;OAWG;IACH,mBAAmB,CAAC,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IAE3F;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE/B;;;;;;;;;;OAUG;IACH,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAEjC;;;;;;;;OAQG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;;;;OAKG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;CACtC;AAED;;GAEG;AACH,MAAM,MAAM,gBAAgB,CAAC,OAAO,IAAI,CAAC,eAAe,EAAE,OAAO,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAEtF,MAAM,MAAM,QAAQ,CAChB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,gBAAgB,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE9D,UAAU,0BAA0B,CAChC,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,eAAe,CAAC,QAAQ,CAAC;IAC/B;;OAEG;IACH,OAAO,EAAE,aAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC;IAEjD;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,2BAA2B,CACxC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,CAClC,SAAQ,0BAA0B,CAAC,QAAQ,CAAC;IAC1C;;;;;OAKG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC;IAEtB;;OAEG;IACH,IAAI,EAAE,QAAQ,CAAC;IAEf;;OAEG;IACH,WAAW,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,cAAc,CAAA;KAAE,CAAC;IAExD;;;;;;;;;;;OAWG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,WAAW,mBAAmB,CAAC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,QAAQ,SAAS,SAAS,GAAG,GAAG,CACpG,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,CAAC;CAAG;AAE9D,MAAM,MAAM,kBAAkB,CAC1B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,cAAc,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkEG;AACH,qBAAa,WAAW,CACpB,OAAO,SAAS,2BAA2B,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,2BAA2B,EACnF,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,YAAY,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IAC9D,SAAS,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAClE,SAAS,CAAC,mBAAmB,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IACpG,SAAS,CAAC,wBAAwB,EAAE,OAAO,CAAC;IAC5C,SAAS,CAAC,uBAAuB,EAAE,MAAM,CAAC;IAC1C,SAAS,CAAC,eAAe,EAAE,OAAO,CAAC;IACnC,SAAS,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAC3C,SAAS,CAAC,qBAAqB,CAAC,EAAE,MAAM,CAAC;IACzC,SAAS,CAAC,QAAQ,CAAC,kBAAkB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAEnD,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAYpC;IAEF;;OAEG;gBAEC,OAAO,GAAE,kBAAkB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC,GACnE,sBAAsB,CAAC,2BAA2B,EAAE,OAAO,CAAa;cAkD7D,oBAAoB,IAAI,eAAe,CAAC,eAAe,EAAE,2BAA2B,CAAC;YAS1F,eAAe;YAkDf,mBAAmB;YAqEnB,6BAA6B;cAQ3B,gBAAgB,CAAC,eAAe,EAAE,2BAA2B,GAAG,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC;IAkBvG;;;OAGG;IACH,SAAS,CAAC,aAAa,CACnB,EAAE,OAAO,EAAE,OAAO,EAAE,EAAE,eAAe,EACrC,cAAc,EAAE,MAAM,EACtB,eAAe,EAAE,MAAM,GACxB,MAAM;IAQT;;;;OAIG;cACa,gBAAgB,CAAC,EAC7B,OAAO,EACP,OAAO,EACP,QAAQ,EACR,YAAY,GACf,EAAE,sBAAsB,GAAG,OAAO,CAAC,QAAQ,CAAC;IAmB7C;;OAEG;cACa,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,QAAQ;;;;;;;;;;;;;;;IAwC1E;;OAEG;IACH,SAAS,CAAC,kBAAkB,CAAC,OAAO,EAAE,cAAc,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,MAAM;;;;;;;;;;;cAY9D,MAAM,GAAG,SAAS;;IAmB7C,SAAS,CAAC,eAAe,CACrB,OAAO,EAAE,cAAc,EACvB,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,cAAc,GACzB;QACC,QAAQ,EAAE,cAAc,CAAC;QACzB,QAAQ,EAAE,QAAQ,CAAC;KACtB;IAsCD;;OAEG;IACH,SAAS,CAAC,yBAAyB,CAAC,mBAAmB,EAAE,CAAC,MAAM,GAAG,WAAW,GAAG,YAAY,CAAC,EAAE;IAgBhG;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,OAAO,EAAE,OAAO;IAKhD,OAAO,CAAC,oBAAoB;IAe5B;;OAEG;IACH,OAAO,CAAC,iBAAiB,CA4BvB;CACL;AAED,UAAU,sBAAsB;IAC5B,OAAO,EAAE,cAAc,CAAC;IACxB,OAAO,EAAE,OAAO,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,gBAAgB,CAC5B,OAAO,SAAS,mBAAmB,GAAG,mBAAmB,EACzD,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,mDAEzC"}
|