@crawlee/http 4.0.0-beta.5 → 4.0.0-beta.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/internals/file-download.d.ts +58 -32
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +116 -73
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +89 -178
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +159 -325
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +14 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +71 -0
- package/internals/utils.js.map +1 -0
- package/package.json +9 -8
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,25 +1,13 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
import
|
|
3
|
-
import type {
|
|
4
|
-
import { BasicCrawler, Configuration, CrawlerExtension } from '@crawlee/basic';
|
|
5
|
-
import type { HttpResponse } from '@crawlee/core';
|
|
1
|
+
import type { BasicCrawlerOptions, CrawlingContext, ErrorHandler, GetUserDataFromRequest, Request as CrawleeRequest, RequestHandler, RequireContextPipeline, RouterRoutes, Session } from '@crawlee/basic';
|
|
2
|
+
import { BasicCrawler, ContextPipeline } from '@crawlee/basic';
|
|
3
|
+
import type { LoadedRequest } from '@crawlee/core';
|
|
6
4
|
import type { Awaitable, Dictionary } from '@crawlee/types';
|
|
7
5
|
import { type CheerioRoot } from '@crawlee/utils';
|
|
8
6
|
import type { RequestLike, ResponseLike } from 'content-type';
|
|
9
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
10
|
-
import type { Method, OptionsInit } from 'got-scraping';
|
|
11
|
-
import { ObjectPredicate } from 'ow';
|
|
12
7
|
import type { JsonValue } from 'type-fest';
|
|
13
|
-
/**
|
|
14
|
-
* TODO exists for BC within HttpCrawler - replace completely with StreamingHttpResponse in 4.0
|
|
15
|
-
* @internal
|
|
16
|
-
*/
|
|
17
|
-
export type PlainResponse = Omit<HttpResponse, 'body'> & IncomingMessage & {
|
|
18
|
-
body?: unknown;
|
|
19
|
-
};
|
|
20
8
|
export type HttpErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
21
9
|
JSONData extends JsonValue = any> = ErrorHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
22
|
-
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext> extends BasicCrawlerOptions<Context> {
|
|
10
|
+
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> {
|
|
23
11
|
/**
|
|
24
12
|
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
|
|
25
13
|
*/
|
|
@@ -28,20 +16,14 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
28
16
|
* If set to true, SSL certificate errors will be ignored.
|
|
29
17
|
*/
|
|
30
18
|
ignoreSslErrors?: boolean;
|
|
31
|
-
/**
|
|
32
|
-
* If set, this crawler will be configured for all connections to use
|
|
33
|
-
* [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
|
|
34
|
-
* For more information, see the [documentation](https://docs.apify.com/proxy).
|
|
35
|
-
*/
|
|
36
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
37
19
|
/**
|
|
38
20
|
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
39
|
-
* or browser properties before navigation. The function accepts
|
|
40
|
-
* which
|
|
21
|
+
* or browser properties before navigation. The function accepts one parameter `crawlingContext`,
|
|
22
|
+
* which is passed to the `requestAsBrowser()` function the crawler calls to navigate.
|
|
41
23
|
* Example:
|
|
42
24
|
* ```
|
|
43
25
|
* preNavigationHooks: [
|
|
44
|
-
* async (crawlingContext
|
|
26
|
+
* async (crawlingContext) => {
|
|
45
27
|
* // ...
|
|
46
28
|
* },
|
|
47
29
|
* ]
|
|
@@ -50,7 +32,7 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
50
32
|
* Modyfing `pageOptions` is supported only in Playwright incognito.
|
|
51
33
|
* See {@link PrePageCreateHook}
|
|
52
34
|
*/
|
|
53
|
-
preNavigationHooks?: InternalHttpHook<
|
|
35
|
+
preNavigationHooks?: InternalHttpHook<CrawlingContext>[];
|
|
54
36
|
/**
|
|
55
37
|
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
|
|
56
38
|
* The function accepts `crawlingContext` as the only parameter.
|
|
@@ -63,10 +45,11 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
63
45
|
* ]
|
|
64
46
|
* ```
|
|
65
47
|
*/
|
|
66
|
-
postNavigationHooks?:
|
|
48
|
+
postNavigationHooks?: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
67
49
|
/**
|
|
68
50
|
* An array of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types)
|
|
69
|
-
* you want the crawler to load and process. By default, only `text/html
|
|
51
|
+
* you want the crawler to load and process. By default, only `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
|
|
52
|
+
* and `application/json` MIME types are supported.
|
|
70
53
|
*/
|
|
71
54
|
additionalMimeTypes?: string[];
|
|
72
55
|
/**
|
|
@@ -92,35 +75,34 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
92
75
|
*/
|
|
93
76
|
forceResponseEncoding?: string;
|
|
94
77
|
/**
|
|
95
|
-
* Automatically saves cookies to Session.
|
|
78
|
+
* Automatically saves cookies to Session. Enabled by default.
|
|
96
79
|
*
|
|
97
80
|
* It parses cookie from response "set-cookie" header saves or updates cookies for session and once the session is used for next request.
|
|
98
81
|
* It passes the "Cookie" header to the request with the session cookies.
|
|
99
82
|
*/
|
|
100
83
|
persistCookiesPerSession?: boolean;
|
|
101
|
-
/**
|
|
102
|
-
* An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
|
|
103
|
-
* By default, status codes >= 500 trigger errors.
|
|
104
|
-
*/
|
|
105
|
-
ignoreHttpErrorStatusCodes?: number[];
|
|
106
|
-
/**
|
|
107
|
-
* An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
|
|
108
|
-
* By default, status codes >= 500 trigger errors.
|
|
109
|
-
*/
|
|
110
|
-
additionalHttpErrorStatusCodes?: number[];
|
|
111
84
|
}
|
|
112
85
|
/**
|
|
113
86
|
* @internal
|
|
114
87
|
*/
|
|
115
|
-
export type InternalHttpHook<Context> = (crawlingContext: Context
|
|
88
|
+
export type InternalHttpHook<Context> = (crawlingContext: Context) => Awaitable<void>;
|
|
116
89
|
export type HttpHook<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
117
90
|
JSONData extends JsonValue = any> = InternalHttpHook<HttpCrawlingContext<UserData, JSONData>>;
|
|
91
|
+
interface CrawlingContextWithReponse<UserData extends Dictionary = any> extends CrawlingContext<UserData> {
|
|
92
|
+
/**
|
|
93
|
+
* The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
|
|
94
|
+
*/
|
|
95
|
+
request: LoadedRequest<CrawleeRequest<UserData>>;
|
|
96
|
+
/**
|
|
97
|
+
* The HTTP response object containing status code, headers, and other response metadata.
|
|
98
|
+
*/
|
|
99
|
+
response: Response;
|
|
100
|
+
}
|
|
118
101
|
/**
|
|
119
102
|
* @internal
|
|
120
103
|
*/
|
|
121
104
|
export interface InternalHttpCrawlingContext<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
122
|
-
JSONData extends JsonValue = any
|
|
123
|
-
Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
105
|
+
JSONData extends JsonValue = any> extends CrawlingContextWithReponse<UserData> {
|
|
124
106
|
/**
|
|
125
107
|
* The request body of the web page.
|
|
126
108
|
* The type depends on the `Content-Type` header of the web page:
|
|
@@ -139,7 +121,6 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
139
121
|
type: string;
|
|
140
122
|
encoding: BufferEncoding;
|
|
141
123
|
};
|
|
142
|
-
response: PlainResponse;
|
|
143
124
|
/**
|
|
144
125
|
* Wait for an element matching the selector to appear. Timeout is ignored.
|
|
145
126
|
*
|
|
@@ -167,7 +148,7 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
167
148
|
*/
|
|
168
149
|
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
|
|
169
150
|
}
|
|
170
|
-
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData
|
|
151
|
+
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData> {
|
|
171
152
|
}
|
|
172
153
|
export type HttpRequestHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
173
154
|
JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
@@ -192,18 +173,18 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
192
173
|
*
|
|
193
174
|
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
194
175
|
*
|
|
195
|
-
* We can use the `preNavigationHooks` to adjust
|
|
176
|
+
* We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
|
|
196
177
|
*
|
|
197
178
|
* ```javascript
|
|
198
179
|
* preNavigationHooks: [
|
|
199
|
-
* (crawlingContext
|
|
180
|
+
* (crawlingContext) => {
|
|
200
181
|
* // ...
|
|
201
182
|
* },
|
|
202
183
|
* ]
|
|
203
184
|
* ```
|
|
204
185
|
*
|
|
205
|
-
* By default, this crawler only processes web pages with the `text/html`
|
|
206
|
-
* and `application/
|
|
186
|
+
* By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
|
|
187
|
+
* and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
207
188
|
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
208
189
|
* use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
|
|
209
190
|
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
@@ -238,23 +219,14 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
238
219
|
* ```
|
|
239
220
|
* @category Crawlers
|
|
240
221
|
*/
|
|
241
|
-
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any,
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
245
|
-
* Only available if used by the crawler.
|
|
246
|
-
*/
|
|
247
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
248
|
-
protected userRequestHandlerTimeoutMillis: number;
|
|
249
|
-
protected preNavigationHooks: InternalHttpHook<Context>[];
|
|
250
|
-
protected postNavigationHooks: InternalHttpHook<Context>[];
|
|
222
|
+
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any> = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
|
|
223
|
+
protected preNavigationHooks: InternalHttpHook<CrawlingContext>[];
|
|
224
|
+
protected postNavigationHooks: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
251
225
|
protected persistCookiesPerSession: boolean;
|
|
252
226
|
protected navigationTimeoutMillis: number;
|
|
253
227
|
protected ignoreSslErrors: boolean;
|
|
254
228
|
protected suggestResponseEncoding?: string;
|
|
255
229
|
protected forceResponseEncoding?: string;
|
|
256
|
-
protected additionalHttpErrorStatusCodes: Set<number>;
|
|
257
|
-
protected ignoreHttpErrorStatusCodes: Set<number>;
|
|
258
230
|
protected readonly supportedMimeTypes: Set<string>;
|
|
259
231
|
protected static optionsShape: {
|
|
260
232
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -267,18 +239,16 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
267
239
|
suggestResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
268
240
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
269
241
|
forceResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
270
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
271
|
-
proxyConfiguration: ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
272
242
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
273
243
|
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
274
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
275
|
-
additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
276
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
277
|
-
ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
278
244
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
279
245
|
preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
280
246
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
281
247
|
postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
248
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
249
|
+
contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
250
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
251
|
+
extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
282
252
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
283
253
|
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
284
254
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -299,24 +269,40 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
299
269
|
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
300
270
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
301
271
|
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
272
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
273
|
+
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
302
274
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
303
275
|
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
304
276
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
305
|
-
|
|
277
|
+
sessionPool: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
306
278
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
307
|
-
|
|
279
|
+
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
308
280
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
309
281
|
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
310
282
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
311
283
|
statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
284
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
285
|
+
additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
286
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
287
|
+
ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
288
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
289
|
+
blockedStatusCodes: import("ow").ArrayPredicate<number>;
|
|
312
290
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
313
291
|
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
314
292
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
315
|
-
respectRobotsTxtFile: import("ow").
|
|
293
|
+
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
|
|
316
294
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
317
295
|
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
318
296
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
319
297
|
httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
298
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
299
|
+
configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
300
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
301
|
+
storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
302
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
303
|
+
eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
304
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
305
|
+
logger: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
320
306
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
321
307
|
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
322
308
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -325,141 +311,67 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
325
311
|
maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
326
312
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
327
313
|
keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
328
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
329
|
-
log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
330
314
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
331
315
|
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
332
316
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
333
317
|
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
318
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
319
|
+
id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
334
320
|
};
|
|
335
321
|
/**
|
|
336
322
|
* All `HttpCrawlerOptions` parameters are passed via an options object.
|
|
337
323
|
*/
|
|
338
|
-
constructor(options?: HttpCrawlerOptions<Context
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
use(extension: CrawlerExtension): void;
|
|
345
|
-
/**
|
|
346
|
-
* Wrapper around requestHandler that opens and closes pages etc.
|
|
347
|
-
*/
|
|
348
|
-
protected _runRequestHandler(crawlingContext: Context): Promise<void>;
|
|
349
|
-
protected isRequestBlocked(crawlingContext: Context): Promise<string | false>;
|
|
350
|
-
protected _handleNavigation(crawlingContext: Context): Promise<void>;
|
|
351
|
-
/**
|
|
352
|
-
* Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
|
|
353
|
-
*/
|
|
354
|
-
protected _applyCookies({ session, request }: CrawlingContext, gotOptions: OptionsInit, preHookCookies: string, postHookCookies: string): void;
|
|
324
|
+
constructor(options?: HttpCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<InternalHttpCrawlingContext, Context>);
|
|
325
|
+
protected buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext>;
|
|
326
|
+
private makeHttpRequest;
|
|
327
|
+
private processHttpResponse;
|
|
328
|
+
private handleBlockedRequestByContent;
|
|
329
|
+
protected isRequestBlocked(crawlingContext: InternalHttpCrawlingContext): Promise<string | false>;
|
|
355
330
|
/**
|
|
356
331
|
* Function to make the HTTP request. It performs optimizations
|
|
357
332
|
* on the request such as only downloading the request body if the
|
|
358
333
|
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
359
334
|
*/
|
|
360
|
-
protected _requestFunction({ request, session, proxyUrl
|
|
335
|
+
protected _requestFunction({ request, session, proxyUrl }: RequestFunctionOptions): Promise<Response>;
|
|
361
336
|
/**
|
|
362
337
|
* Encodes and parses response according to the provided content type
|
|
363
338
|
*/
|
|
364
|
-
protected _parseResponse(request:
|
|
365
|
-
|
|
366
|
-
response: IncomingMessage;
|
|
339
|
+
protected _parseResponse(request: CrawleeRequest, response: Response): Promise<{
|
|
340
|
+
response: Response;
|
|
367
341
|
contentType: {
|
|
368
342
|
type: string;
|
|
369
343
|
encoding: BufferEncoding;
|
|
370
344
|
};
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
345
|
+
body: string;
|
|
346
|
+
} | {
|
|
347
|
+
body: Buffer<ArrayBuffer>;
|
|
348
|
+
response: Response;
|
|
374
349
|
contentType: {
|
|
375
350
|
type: string;
|
|
376
351
|
encoding: BufferEncoding;
|
|
377
352
|
};
|
|
378
|
-
enqueueLinks: () => Promise<{
|
|
379
|
-
processedRequests: never[];
|
|
380
|
-
unprocessedRequests: never[];
|
|
381
|
-
}>;
|
|
382
353
|
}>;
|
|
383
|
-
protected _parseHTML(response: IncomingMessage, _isXml: boolean, _crawlingContext: Context): Promise<Partial<Context>>;
|
|
384
354
|
/**
|
|
385
355
|
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
386
356
|
*/
|
|
387
|
-
protected _getRequestOptions(request:
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
402
|
-
timeout?: import("got-scraping").Delays | undefined;
|
|
403
|
-
prefixUrl?: string | URL | undefined;
|
|
404
|
-
form?: Record<string, any> | undefined;
|
|
405
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
406
|
-
cookieJar?: import("got-scraping").PromiseCookieJar | import("got-scraping").ToughCookieJar | undefined;
|
|
407
|
-
signal?: AbortSignal | undefined;
|
|
408
|
-
ignoreInvalidCookies?: boolean | undefined;
|
|
409
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
410
|
-
searchParams?: string | import("got-scraping").SearchParameters | URLSearchParams | undefined;
|
|
411
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
412
|
-
dnsLookup?: import("cacheable-lookup").default["lookup"] | undefined;
|
|
413
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
414
|
-
dnsCache?: import("cacheable-lookup").default | boolean | undefined;
|
|
415
|
-
context?: Record<string, unknown> | undefined;
|
|
416
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
417
|
-
followRedirect?: boolean | ((response: import("got-scraping").PlainResponse) => boolean) | undefined;
|
|
418
|
-
maxRedirects?: number | undefined;
|
|
419
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
420
|
-
cache?: string | import("cacheable-request").StorageAdapter | boolean | undefined;
|
|
421
|
-
throwHttpErrors?: boolean | undefined;
|
|
422
|
-
username?: string | undefined;
|
|
423
|
-
password?: string | undefined;
|
|
424
|
-
http2?: boolean | undefined;
|
|
425
|
-
allowGetBody?: boolean | undefined;
|
|
426
|
-
methodRewriting?: boolean | undefined;
|
|
427
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
428
|
-
dnsLookupIpVersion?: import("got-scraping").DnsLookupIpVersion;
|
|
429
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
430
|
-
parseJson?: import("got-scraping").ParseJsonFunction | undefined;
|
|
431
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
432
|
-
stringifyJson?: import("got-scraping").StringifyJsonFunction | undefined;
|
|
433
|
-
localAddress?: string | undefined;
|
|
434
|
-
method?: Method | undefined;
|
|
435
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
436
|
-
createConnection?: import("got-scraping").CreateConnectionFunction | undefined;
|
|
437
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
438
|
-
cacheOptions?: import("got-scraping").CacheOptions | undefined;
|
|
439
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
440
|
-
https?: import("got-scraping").HttpsOptions | undefined;
|
|
441
|
-
encoding?: BufferEncoding | undefined;
|
|
442
|
-
resolveBodyOnly?: boolean | undefined;
|
|
443
|
-
isStream?: boolean | undefined;
|
|
444
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
445
|
-
responseType?: import("got-scraping").ResponseType | undefined;
|
|
446
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
447
|
-
pagination?: import("got-scraping").PaginationOptions<unknown, unknown> | undefined;
|
|
448
|
-
setHost?: boolean | undefined;
|
|
449
|
-
maxHeaderSize?: number | undefined;
|
|
450
|
-
enableUnixSockets?: boolean | undefined;
|
|
451
|
-
} & {
|
|
452
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
453
|
-
hooks?: Partial<import("got-scraping").Hooks>;
|
|
454
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
455
|
-
retry?: Partial<import("got-scraping").RetryOptions>;
|
|
456
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
457
|
-
} & import("got-scraping").Context & Required<Pick<OptionsInit, "url">> & {
|
|
458
|
-
isStream: true;
|
|
357
|
+
protected _getRequestOptions(request: CrawleeRequest, session: Session, proxyUrl?: string): {
|
|
358
|
+
url: string;
|
|
359
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
360
|
+
method: import("@crawlee/types").AllowedHttpMethods;
|
|
361
|
+
proxyUrl: string | undefined;
|
|
362
|
+
timeout: number;
|
|
363
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
364
|
+
cookieJar: import("tough-cookie").CookieJar | undefined;
|
|
365
|
+
sessionToken: Session;
|
|
366
|
+
headers: Record<string, string> | undefined;
|
|
367
|
+
https: {
|
|
368
|
+
rejectUnauthorized: boolean;
|
|
369
|
+
};
|
|
370
|
+
body: string | undefined;
|
|
459
371
|
};
|
|
460
|
-
protected _encodeResponse(request:
|
|
372
|
+
protected _encodeResponse(request: CrawleeRequest, response: Response, encoding: BufferEncoding): {
|
|
461
373
|
encoding: BufferEncoding;
|
|
462
|
-
response:
|
|
374
|
+
response: Response;
|
|
463
375
|
};
|
|
464
376
|
/**
|
|
465
377
|
* Checks and extends supported mime types
|
|
@@ -468,7 +380,7 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
468
380
|
/**
|
|
469
381
|
* Handles timeout request
|
|
470
382
|
*/
|
|
471
|
-
protected _handleRequestTimeout(session
|
|
383
|
+
protected _handleRequestTimeout(session: Session): void;
|
|
472
384
|
private _abortDownloadOfBody;
|
|
473
385
|
/**
|
|
474
386
|
* @internal wraps public utility for mocking purposes
|
|
@@ -476,10 +388,9 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
476
388
|
private _requestAsBrowser;
|
|
477
389
|
}
|
|
478
390
|
interface RequestFunctionOptions {
|
|
479
|
-
request:
|
|
480
|
-
session
|
|
391
|
+
request: CrawleeRequest;
|
|
392
|
+
session: Session;
|
|
481
393
|
proxyUrl?: string;
|
|
482
|
-
gotOptions: OptionsInit;
|
|
483
394
|
}
|
|
484
395
|
/**
|
|
485
396
|
* Creates new {@link Router} instance that works based on request labels.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAER,mBAAmB,EACnB,eAAe,EACf,YAAY,EACZ,sBAAsB,EACtB,OAAO,IAAI,cAAc,EACzB,cAAc,EACd,sBAAsB,EACtB,YAAY,EACZ,OAAO,EACV,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACH,YAAY,EACZ,eAAe,EAKlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAEnD,OAAO,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5D,OAAO,EAAE,KAAK,WAAW,EAAuB,MAAM,gBAAgB,CAAC;AAEvE,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAI9D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAsB3C,MAAM,MAAM,gBAAgB,CACxB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,YAAY,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE1D,MAAM,WAAW,kBAAkB,CAC/B,OAAO,SAAS,2BAA2B,GAAG,2BAA2B,EACzE,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,mBAAmB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IACrE;;OAEG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B;;;;;;;;;;;;;;;OAeG;IACH,kBAAkB,CAAC,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAEzD;;;;;;;;;;;OAWG;IACH,mBAAmB,CAAC,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IAE3F;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE/B;;;;;;;;;;OAUG;IACH,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAEjC;;;;;;;;OAQG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;;;;OAKG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;CACtC;AAED;;GAEG;AACH,MAAM,MAAM,gBAAgB,CAAC,OAAO,IAAI,CAAC,eAAe,EAAE,OAAO,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAEtF,MAAM,MAAM,QAAQ,CAChB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,gBAAgB,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE9D,UAAU,0BAA0B,CAChC,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,eAAe,CAAC,QAAQ,CAAC;IAC/B;;OAEG;IACH,OAAO,EAAE,aAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC;IAEjD;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,2BAA2B,CACxC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,CAClC,SAAQ,0BAA0B,CAAC,QAAQ,CAAC;IAC1C;;;;;OAKG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC;IAEtB;;OAEG;IACH,IAAI,EAAE,QAAQ,CAAC;IAEf;;OAEG;IACH,WAAW,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,cAAc,CAAA;KAAE,CAAC;IAExD;;;;;;;;;;;OAWG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,WAAW,mBAAmB,CAChC,QAAQ,SAAS,UAAU,GAAG,GAAG,EACjC,QAAQ,SAAS,SAAS,GAAG,GAAG,CAClC,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,CAAC;CAAG;AAE5D,MAAM,MAAM,kBAAkB,CAC1B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,cAAc,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkEG;AACH,qBAAa,WAAW,CACpB,OAAO,SAAS,2BAA2B,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,2BAA2B,EACnF,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,YAAY,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IAC9D,SAAS,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAClE,SAAS,CAAC,mBAAmB,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IACpG,SAAS,CAAC,wBAAwB,EAAE,OAAO,CAAC;IAC5C,SAAS,CAAC,uBAAuB,EAAE,MAAM,CAAC;IAC1C,SAAS,CAAC,eAAe,EAAE,OAAO,CAAC;IACnC,SAAS,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAC3C,SAAS,CAAC,qBAAqB,CAAC,EAAE,MAAM,CAAC;IACzC,SAAS,CAAC,QAAQ,CAAC,kBAAkB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAEnD,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAYpC;IAEF;;OAEG;gBAEC,OAAO,GAAE,kBAAkB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC,GACnE,sBAAsB,CAAC,2BAA2B,EAAE,OAAO,CAAa;cAkD7D,oBAAoB,IAAI,eAAe,CAAC,eAAe,EAAE,2BAA2B,CAAC;YAS1F,eAAe;YA4Cf,mBAAmB;YA+EnB,6BAA6B;cAQ3B,gBAAgB,CAAC,eAAe,EAAE,2BAA2B,GAAG,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC;IAkBvG;;;;OAIG;cACa,gBAAgB,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,EAAE,sBAAsB,GAAG,OAAO,CAAC,QAAQ,CAAC;IAmB3G;;OAEG;cACa,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,QAAQ;;;;;;;;;;;;;;;IAwC1E;;OAEG;IACH,SAAS,CAAC,kBAAkB,CAAC,OAAO,EAAE,cAAc,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,MAAM;;;;;;;;;;;cAY9D,MAAM,GAAG,SAAS;;IAqB7C,SAAS,CAAC,eAAe,CACrB,OAAO,EAAE,cAAc,EACvB,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,cAAc,GACzB;QACC,QAAQ,EAAE,cAAc,CAAC;QACzB,QAAQ,EAAE,QAAQ,CAAC;KACtB;IAwCD;;OAEG;IACH,SAAS,CAAC,yBAAyB,CAAC,mBAAmB,EAAE,CAAC,MAAM,GAAG,WAAW,GAAG,YAAY,CAAC,EAAE;IAgBhG;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,OAAO,EAAE,OAAO;IAKhD,OAAO,CAAC,oBAAoB;IAe5B;;OAEG;IACH,OAAO,CAAC,iBAAiB,CAsBvB;CACL;AAED,UAAU,sBAAsB;IAC5B,OAAO,EAAE,cAAc,CAAC;IACxB,OAAO,EAAE,OAAO,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,gBAAgB,CAC5B,OAAO,SAAS,mBAAmB,GAAG,mBAAmB,EACzD,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,mDAEzC"}
|