@crawlee/http 4.0.0-beta.3 → 4.0.0-beta.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/internals/file-download.d.ts +46 -33
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +114 -74
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +78 -150
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +173 -297
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +14 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +71 -0
- package/internals/utils.js.map +1 -0
- package/package.json +7 -7
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,25 +1,13 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
import
|
|
3
|
-
import type {
|
|
4
|
-
import { BasicCrawler, Configuration, CrawlerExtension } from '@crawlee/basic';
|
|
5
|
-
import type { HttpResponse } from '@crawlee/core';
|
|
1
|
+
import type { BasicCrawlerOptions, CrawlingContext, ErrorHandler, GetUserDataFromRequest, Request as CrawleeRequest, RequestHandler, RequireContextPipeline, RouterRoutes, Session } from '@crawlee/basic';
|
|
2
|
+
import { BasicCrawler, ContextPipeline } from '@crawlee/basic';
|
|
3
|
+
import type { LoadedRequest } from '@crawlee/core';
|
|
6
4
|
import type { Awaitable, Dictionary } from '@crawlee/types';
|
|
7
5
|
import { type CheerioRoot } from '@crawlee/utils';
|
|
8
6
|
import type { RequestLike, ResponseLike } from 'content-type';
|
|
9
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
10
|
-
import type { Method, OptionsInit } from 'got-scraping';
|
|
11
|
-
import { ObjectPredicate } from 'ow';
|
|
12
7
|
import type { JsonValue } from 'type-fest';
|
|
13
|
-
/**
|
|
14
|
-
* TODO exists for BC within HttpCrawler - replace completely with StreamingHttpResponse in 4.0
|
|
15
|
-
* @internal
|
|
16
|
-
*/
|
|
17
|
-
export type PlainResponse = Omit<HttpResponse, 'body'> & IncomingMessage & {
|
|
18
|
-
body?: unknown;
|
|
19
|
-
};
|
|
20
8
|
export type HttpErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in an untyped crawler
|
|
21
9
|
JSONData extends JsonValue = any> = ErrorHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
22
|
-
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext> extends BasicCrawlerOptions<Context> {
|
|
10
|
+
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> {
|
|
23
11
|
/**
|
|
24
12
|
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
|
|
25
13
|
*/
|
|
@@ -28,20 +16,14 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
28
16
|
* If set to true, SSL certificate errors will be ignored.
|
|
29
17
|
*/
|
|
30
18
|
ignoreSslErrors?: boolean;
|
|
31
|
-
/**
|
|
32
|
-
* If set, this crawler will be configured for all connections to use
|
|
33
|
-
* [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
|
|
34
|
-
* For more information, see the [documentation](https://docs.apify.com/proxy).
|
|
35
|
-
*/
|
|
36
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
37
19
|
/**
|
|
38
20
|
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
39
|
-
* or browser properties before navigation. The function accepts
|
|
40
|
-
* which
|
|
21
|
+
* or browser properties before navigation. The function accepts one parameter `crawlingContext`,
|
|
22
|
+
* which is passed to the `requestAsBrowser()` function the crawler calls to navigate.
|
|
41
23
|
* Example:
|
|
42
24
|
* ```
|
|
43
25
|
* preNavigationHooks: [
|
|
44
|
-
* async (crawlingContext
|
|
26
|
+
* async (crawlingContext) => {
|
|
45
27
|
* // ...
|
|
46
28
|
* },
|
|
47
29
|
* ]
|
|
@@ -50,7 +32,7 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
50
32
|
* Modifying `pageOptions` is supported only in Playwright incognito.
|
|
51
33
|
* See {@link PrePageCreateHook}
|
|
52
34
|
*/
|
|
53
|
-
preNavigationHooks?: InternalHttpHook<
|
|
35
|
+
preNavigationHooks?: InternalHttpHook<CrawlingContext>[];
|
|
54
36
|
/**
|
|
55
37
|
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
|
|
56
38
|
* The function accepts `crawlingContext` as the only parameter.
|
|
@@ -63,7 +45,7 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
63
45
|
* ]
|
|
64
46
|
* ```
|
|
65
47
|
*/
|
|
66
|
-
postNavigationHooks?:
|
|
48
|
+
postNavigationHooks?: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
67
49
|
/**
|
|
68
50
|
* An array of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types)
|
|
69
51
|
* you want the crawler to load and process. By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
|
|
@@ -112,15 +94,24 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
112
94
|
/**
|
|
113
95
|
* @internal
|
|
114
96
|
*/
|
|
115
|
-
export type InternalHttpHook<Context> = (crawlingContext: Context
|
|
97
|
+
export type InternalHttpHook<Context> = (crawlingContext: Context) => Awaitable<void>;
|
|
116
98
|
export type HttpHook<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in an untyped crawler
|
|
117
99
|
JSONData extends JsonValue = any> = InternalHttpHook<HttpCrawlingContext<UserData, JSONData>>;
|
|
100
|
+
interface CrawlingContextWithReponse<UserData extends Dictionary = any> extends CrawlingContext<UserData> {
|
|
101
|
+
/**
|
|
102
|
+
* The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
|
|
103
|
+
*/
|
|
104
|
+
request: LoadedRequest<CrawleeRequest<UserData>>;
|
|
105
|
+
/**
|
|
106
|
+
* The HTTP response object containing status code, headers, and other response metadata.
|
|
107
|
+
*/
|
|
108
|
+
response: Response;
|
|
109
|
+
}
|
|
118
110
|
/**
|
|
119
111
|
* @internal
|
|
120
112
|
*/
|
|
121
113
|
export interface InternalHttpCrawlingContext<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in an untyped crawler
|
|
122
|
-
JSONData extends JsonValue = any
|
|
123
|
-
Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
114
|
+
JSONData extends JsonValue = any> extends CrawlingContextWithReponse<UserData> {
|
|
124
115
|
/**
|
|
125
116
|
* The request body of the web page.
|
|
126
117
|
* The type depends on the `Content-Type` header of the web page:
|
|
@@ -139,7 +130,6 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
139
130
|
type: string;
|
|
140
131
|
encoding: BufferEncoding;
|
|
141
132
|
};
|
|
142
|
-
response: PlainResponse;
|
|
143
133
|
/**
|
|
144
134
|
* Wait for an element matching the selector to appear. Timeout is ignored.
|
|
145
135
|
*
|
|
@@ -167,7 +157,7 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
167
157
|
*/
|
|
168
158
|
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
|
|
169
159
|
}
|
|
170
|
-
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData
|
|
160
|
+
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData> {
|
|
171
161
|
}
|
|
172
162
|
export type HttpRequestHandler<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in an untyped crawler
|
|
173
163
|
JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
@@ -192,11 +182,11 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
192
182
|
*
|
|
193
183
|
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
194
184
|
*
|
|
195
|
-
* We can use the `preNavigationHooks` to adjust
|
|
185
|
+
* We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
|
|
196
186
|
*
|
|
197
187
|
* ```javascript
|
|
198
188
|
* preNavigationHooks: [
|
|
199
|
-
* (crawlingContext
|
|
189
|
+
* (crawlingContext) => {
|
|
200
190
|
* // ...
|
|
201
191
|
* },
|
|
202
192
|
* ]
|
|
@@ -238,16 +228,9 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
238
228
|
* ```
|
|
239
229
|
* @category Crawlers
|
|
240
230
|
*/
|
|
241
|
-
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any,
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
245
|
-
* Only available if used by the crawler.
|
|
246
|
-
*/
|
|
247
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
248
|
-
protected userRequestHandlerTimeoutMillis: number;
|
|
249
|
-
protected preNavigationHooks: InternalHttpHook<Context>[];
|
|
250
|
-
protected postNavigationHooks: InternalHttpHook<Context>[];
|
|
231
|
+
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any> = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
|
|
232
|
+
protected preNavigationHooks: InternalHttpHook<CrawlingContext>[];
|
|
233
|
+
protected postNavigationHooks: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
251
234
|
protected persistCookiesPerSession: boolean;
|
|
252
235
|
protected navigationTimeoutMillis: number;
|
|
253
236
|
protected ignoreSslErrors: boolean;
|
|
@@ -267,8 +250,6 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
267
250
|
suggestResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
268
251
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
269
252
|
forceResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
270
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
271
|
-
proxyConfiguration: ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
272
253
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
273
254
|
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
274
255
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -279,6 +260,10 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
279
260
|
preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
280
261
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
281
262
|
postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
263
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
264
|
+
contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
265
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
266
|
+
extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
282
267
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
283
268
|
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
284
269
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -299,12 +284,16 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
299
284
|
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
300
285
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
301
286
|
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
287
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
288
|
+
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
302
289
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
303
290
|
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
304
291
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
305
292
|
sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
306
293
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
307
294
|
useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
295
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
296
|
+
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
308
297
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
309
298
|
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
310
299
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -312,11 +301,17 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
312
301
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
313
302
|
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
314
303
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
315
|
-
respectRobotsTxtFile: import("ow").
|
|
304
|
+
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
|
|
316
305
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
317
306
|
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
318
307
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
319
308
|
httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
309
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
310
|
+
configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
311
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
312
|
+
storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
313
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
314
|
+
eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
320
315
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
321
316
|
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
322
317
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -331,135 +326,68 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
331
326
|
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
332
327
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
333
328
|
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
329
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
330
|
+
id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
334
331
|
};
|
|
335
332
|
/**
|
|
336
333
|
* All `HttpCrawlerOptions` parameters are passed via an options object.
|
|
337
334
|
*/
|
|
338
|
-
constructor(options?: HttpCrawlerOptions<Context
|
|
335
|
+
constructor(options?: HttpCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<InternalHttpCrawlingContext, Context>);
|
|
336
|
+
protected buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext>;
|
|
337
|
+
private makeHttpRequest;
|
|
338
|
+
private processHttpResponse;
|
|
339
|
+
private handleBlockedRequestByContent;
|
|
340
|
+
protected isRequestBlocked(crawlingContext: InternalHttpCrawlingContext): Promise<string | false>;
|
|
339
341
|
/**
|
|
340
|
-
*
|
|
341
|
-
*
|
|
342
|
-
* @param extension Crawler extension that overrides the crawler configuration.
|
|
342
|
+
* Returns the `Cookie` header value based on the current context and
|
|
343
|
+
* any changes that occurred in the navigation hooks.
|
|
343
344
|
*/
|
|
344
|
-
|
|
345
|
-
/**
|
|
346
|
-
* Wrapper around requestHandler that opens and closes pages etc.
|
|
347
|
-
*/
|
|
348
|
-
protected _runRequestHandler(crawlingContext: Context): Promise<void>;
|
|
349
|
-
protected isRequestBlocked(crawlingContext: Context): Promise<string | false>;
|
|
350
|
-
protected _handleNavigation(crawlingContext: Context): Promise<void>;
|
|
351
|
-
/**
|
|
352
|
-
* Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
|
|
353
|
-
*/
|
|
354
|
-
protected _applyCookies({ session, request }: CrawlingContext, gotOptions: OptionsInit, preHookCookies: string, postHookCookies: string): void;
|
|
345
|
+
protected _applyCookies({ session, request }: CrawlingContext, preHookCookies: string, postHookCookies: string): string;
|
|
355
346
|
/**
|
|
356
347
|
* Function to make the HTTP request. It performs optimizations
|
|
357
348
|
* on the request such as only downloading the request body if the
|
|
358
349
|
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
359
350
|
*/
|
|
360
|
-
protected _requestFunction({ request, session, proxyUrl,
|
|
351
|
+
protected _requestFunction({ request, session, proxyUrl, cookieString, }: RequestFunctionOptions): Promise<Response>;
|
|
361
352
|
/**
|
|
362
353
|
* Encodes and parses response according to the provided content type
|
|
363
354
|
*/
|
|
364
|
-
protected _parseResponse(request:
|
|
365
|
-
|
|
366
|
-
response: IncomingMessage;
|
|
355
|
+
protected _parseResponse(request: CrawleeRequest, response: Response): Promise<{
|
|
356
|
+
response: Response;
|
|
367
357
|
contentType: {
|
|
368
358
|
type: string;
|
|
369
359
|
encoding: BufferEncoding;
|
|
370
360
|
};
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
361
|
+
body: string;
|
|
362
|
+
} | {
|
|
363
|
+
body: Buffer<ArrayBuffer>;
|
|
364
|
+
response: Response;
|
|
374
365
|
contentType: {
|
|
375
366
|
type: string;
|
|
376
367
|
encoding: BufferEncoding;
|
|
377
368
|
};
|
|
378
|
-
enqueueLinks: () => Promise<{
|
|
379
|
-
processedRequests: never[];
|
|
380
|
-
unprocessedRequests: never[];
|
|
381
|
-
}>;
|
|
382
369
|
}>;
|
|
383
|
-
protected _parseHTML(response: IncomingMessage, _isXml: boolean, _crawlingContext: Context): Promise<Partial<Context>>;
|
|
384
370
|
/**
|
|
385
371
|
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
386
372
|
*/
|
|
387
|
-
protected _getRequestOptions(request:
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
402
|
-
timeout?: import("got-scraping").Delays | undefined;
|
|
403
|
-
prefixUrl?: string | URL | undefined;
|
|
404
|
-
form?: Record<string, any> | undefined;
|
|
405
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
406
|
-
cookieJar?: import("got-scraping").PromiseCookieJar | import("got-scraping").ToughCookieJar | undefined;
|
|
407
|
-
signal?: AbortSignal | undefined;
|
|
408
|
-
ignoreInvalidCookies?: boolean | undefined;
|
|
409
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
410
|
-
searchParams?: string | import("got-scraping").SearchParameters | URLSearchParams | undefined;
|
|
411
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
412
|
-
dnsLookup?: import("cacheable-lookup").default["lookup"] | undefined;
|
|
413
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
414
|
-
dnsCache?: import("cacheable-lookup").default | boolean | undefined;
|
|
415
|
-
context?: Record<string, unknown> | undefined;
|
|
416
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
417
|
-
followRedirect?: boolean | ((response: import("got-scraping").PlainResponse) => boolean) | undefined;
|
|
418
|
-
maxRedirects?: number | undefined;
|
|
419
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
420
|
-
cache?: string | import("cacheable-request").StorageAdapter | boolean | undefined;
|
|
421
|
-
throwHttpErrors?: boolean | undefined;
|
|
422
|
-
username?: string | undefined;
|
|
423
|
-
password?: string | undefined;
|
|
424
|
-
http2?: boolean | undefined;
|
|
425
|
-
allowGetBody?: boolean | undefined;
|
|
426
|
-
methodRewriting?: boolean | undefined;
|
|
427
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
428
|
-
dnsLookupIpVersion?: import("got-scraping").DnsLookupIpVersion;
|
|
429
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
430
|
-
parseJson?: import("got-scraping").ParseJsonFunction | undefined;
|
|
431
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
432
|
-
stringifyJson?: import("got-scraping").StringifyJsonFunction | undefined;
|
|
433
|
-
localAddress?: string | undefined;
|
|
434
|
-
method?: Method | undefined;
|
|
435
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
436
|
-
createConnection?: import("got-scraping").CreateConnectionFunction | undefined;
|
|
437
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
438
|
-
cacheOptions?: import("got-scraping").CacheOptions | undefined;
|
|
439
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
440
|
-
https?: import("got-scraping").HttpsOptions | undefined;
|
|
441
|
-
encoding?: BufferEncoding | undefined;
|
|
442
|
-
resolveBodyOnly?: boolean | undefined;
|
|
443
|
-
isStream?: boolean | undefined;
|
|
444
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
445
|
-
responseType?: import("got-scraping").ResponseType | undefined;
|
|
446
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
447
|
-
pagination?: import("got-scraping").PaginationOptions<unknown, unknown> | undefined;
|
|
448
|
-
setHost?: boolean | undefined;
|
|
449
|
-
maxHeaderSize?: number | undefined;
|
|
450
|
-
enableUnixSockets?: boolean | undefined;
|
|
451
|
-
} & {
|
|
452
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
453
|
-
hooks?: Partial<import("got-scraping").Hooks>;
|
|
454
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
455
|
-
retry?: Partial<import("got-scraping").RetryOptions>;
|
|
456
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
457
|
-
} & import("got-scraping").Context & Required<Pick<OptionsInit, "url">> & {
|
|
458
|
-
isStream: true;
|
|
373
|
+
protected _getRequestOptions(request: CrawleeRequest, session?: Session, proxyUrl?: string): {
|
|
374
|
+
url: string;
|
|
375
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
376
|
+
method: import("@crawlee/types").AllowedHttpMethods;
|
|
377
|
+
proxyUrl: string | undefined;
|
|
378
|
+
timeout: number;
|
|
379
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
380
|
+
cookieJar: import("tough-cookie").CookieJar | undefined;
|
|
381
|
+
sessionToken: Session | undefined;
|
|
382
|
+
headers: Record<string, string> | undefined;
|
|
383
|
+
https: {
|
|
384
|
+
rejectUnauthorized: boolean;
|
|
385
|
+
};
|
|
386
|
+
body: string | undefined;
|
|
459
387
|
};
|
|
460
|
-
protected _encodeResponse(request:
|
|
388
|
+
protected _encodeResponse(request: CrawleeRequest, response: Response, encoding: BufferEncoding): {
|
|
461
389
|
encoding: BufferEncoding;
|
|
462
|
-
response:
|
|
390
|
+
response: Response;
|
|
463
391
|
};
|
|
464
392
|
/**
|
|
465
393
|
* Checks and extends supported mime types
|
|
@@ -476,10 +404,10 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
476
404
|
private _requestAsBrowser;
|
|
477
405
|
}
|
|
478
406
|
interface RequestFunctionOptions {
|
|
479
|
-
request:
|
|
407
|
+
request: CrawleeRequest;
|
|
480
408
|
session?: Session;
|
|
481
409
|
proxyUrl?: string;
|
|
482
|
-
|
|
410
|
+
cookieString?: string;
|
|
483
411
|
}
|
|
484
412
|
/**
|
|
485
413
|
* Creates new {@link Router} instance that works based on request labels.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAER,mBAAmB,EACnB,eAAe,EACf,YAAY,EACZ,sBAAsB,EACtB,OAAO,IAAI,cAAc,EACzB,cAAc,EACd,sBAAsB,EACtB,YAAY,EACZ,OAAO,EACV,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACH,YAAY,EAEZ,eAAe,EAKlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAEnD,OAAO,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5D,OAAO,EAAE,KAAK,WAAW,EAAuB,MAAM,gBAAgB,CAAC;AAEvE,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAI9D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAsB3C,MAAM,MAAM,gBAAgB,CACxB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,YAAY,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE1D,MAAM,WAAW,kBAAkB,CAC/B,OAAO,SAAS,2BAA2B,GAAG,2BAA2B,EACzE,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,mBAAmB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IACrE;;OAEG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B;;;;;;;;;;;;;;;OAeG;IACH,kBAAkB,CAAC,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAEzD;;;;;;;;;;;OAWG;IACH,mBAAmB,CAAC,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IAE3F;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE/B;;;;;;;;;;OAUG;IACH,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAEjC;;;;;;;;OAQG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;;;;OAKG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IAEnC;;;OAGG;IACH,0BAA0B,CAAC,EAAE,MAAM,EAAE,CAAC;IAEtC;;;OAGG;IACH,8BAA8B,CAAC,EAAE,MAAM,EAAE,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,MAAM,gBAAgB,CAAC,OAAO,IAAI,CAAC,eAAe,EAAE,OAAO,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAEtF,MAAM,MAAM,QAAQ,CAChB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,gBAAgB,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE9D,UAAU,0BAA0B,CAChC,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,eAAe,CAAC,QAAQ,CAAC;IAC/B;;OAEG;IACH,OAAO,EAAE,aAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC;IAEjD;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,2BAA2B,CACxC,QAA
Q,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,CAClC,SAAQ,0BAA0B,CAAC,QAAQ,CAAC;IAC1C;;;;;OAKG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC;IAEtB;;OAEG;IACH,IAAI,EAAE,QAAQ,CAAC;IAEf;;OAEG;IACH,WAAW,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,cAAc,CAAA;KAAE,CAAC;IAExD;;;;;;;;;;;OAWG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,WAAW,mBAAmB,CAAC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,QAAQ,SAAS,SAAS,GAAG,GAAG,CACpG,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,CAAC;CAAG;AAE9D,MAAM,MAAM,kBAAkB,CAC1B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,cAAc,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkEG;AACH,qBAAa,WAAW,CACpB,OAAO,SAAS,2BAA2B,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,2BAA2B,EACnF,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,YAAY,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IAC9D,SAAS,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAClE,SAAS,CAAC,mBAAmB,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IACpG,SAAS,CAAC,wBAAwB,EAAE,OAAO,CAAC;IAC5C,SAAS,CAAC,uBAAuB,EAAE,MAAM,CAAC;IAC1C,SAAS,CAAC,eAAe,EAAE,OAAO,CAAC;IACnC,SAAS,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAC3C,SAAS,CAAC,qBAAqB,CAAC,EAAE,MAAM,CAAC;IACzC,SAAS,CAAC,8BAA8B,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IACtD,SAAS,CAAC,0BAA0B,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAClD,SAAS,CAAC,QAAQ,CAAC,kBAAkB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAEnD,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAepC;IAEF;;OAEG;gBAEC,OAAO,GAAE,kBAAkB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC,GACnE,sBAAsB,CAAC,2BAA2B,EAAE,OAAO,CAAa;IA+DhF,SAAS,CAAC,oBAAoB,IAAI,eAAe,CAAC,eAAe,EAAE,2BAA2B,CAAC;YASjF,eAAe;YAkDf,mBAAmB;YAuEnB,6BAA6B;cAQ3B,gBAAgB,CAAC,eAAe,EAAE,2BAA2B,GAAG,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC;IAyBvG;;;OAGG;IACH,SAAS,CAAC,aAAa,CACnB,EAAE,OAAO,EAAE,OAAO,EA
AE,EAAE,eAAe,EACrC,cAAc,EAAE,MAAM,EACtB,eAAe,EAAE,MAAM,GACxB,MAAM;IAQT;;;;OAIG;cACa,gBAAgB,CAAC,EAC7B,OAAO,EACP,OAAO,EACP,QAAQ,EACR,YAAY,GACf,EAAE,sBAAsB,GAAG,OAAO,CAAC,QAAQ,CAAC;IAmB7C;;OAEG;cACa,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,QAAQ;;;;;;;;;;;;;;;IA2C1E;;OAEG;IACH,SAAS,CAAC,kBAAkB,CAAC,OAAO,EAAE,cAAc,EAAE,OAAO,CAAC,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,MAAM;;;;;;;;;;;cAY/D,MAAM,GAAG,SAAS;;IAmB7C,SAAS,CAAC,eAAe,CACrB,OAAO,EAAE,cAAc,EACvB,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,cAAc,GACzB;QACC,QAAQ,EAAE,cAAc,CAAC;QACzB,QAAQ,EAAE,QAAQ,CAAC;KACtB;IAsCD;;OAEG;IACH,SAAS,CAAC,yBAAyB,CAAC,mBAAmB,EAAE,CAAC,MAAM,GAAG,WAAW,GAAG,YAAY,CAAC,EAAE;IAgBhG;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,OAAO,CAAC,EAAE,OAAO;IAKjD,OAAO,CAAC,oBAAoB;IAkB5B;;OAEG;IACH,OAAO,CAAC,iBAAiB,CA4BvB;CACL;AAED,UAAU,sBAAsB;IAC5B,OAAO,EAAE,cAAc,CAAC;IACxB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,gBAAgB,CAC5B,OAAO,SAAS,mBAAmB,GAAG,mBAAmB,EACzD,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,mDAEzC"}
|