@crawlee/http 4.0.0-beta.2 → 4.0.0-beta.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/internals/file-download.d.ts +46 -33
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +85 -73
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +72 -149
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +178 -287
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +9 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +35 -0
- package/internals/utils.js.map +1 -0
- package/package.json +6 -6
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,25 +1,13 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
import
|
|
3
|
-
import type {
|
|
4
|
-
import { BasicCrawler, Configuration, CrawlerExtension } from '@crawlee/basic';
|
|
5
|
-
import type { HttpResponse } from '@crawlee/core';
|
|
1
|
+
import type { BasicCrawlerOptions, CrawlingContext, ErrorHandler, GetUserDataFromRequest, Request as CrawleeRequest, RequestHandler, RequireContextPipeline, RouterRoutes, Session } from '@crawlee/basic';
|
|
2
|
+
import { BasicCrawler, Configuration, ContextPipeline } from '@crawlee/basic';
|
|
3
|
+
import type { LoadedRequest } from '@crawlee/core';
|
|
6
4
|
import type { Awaitable, Dictionary } from '@crawlee/types';
|
|
7
5
|
import { type CheerioRoot } from '@crawlee/utils';
|
|
8
6
|
import type { RequestLike, ResponseLike } from 'content-type';
|
|
9
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
10
|
-
import type { Method, OptionsInit } from 'got-scraping';
|
|
11
|
-
import { ObjectPredicate } from 'ow';
|
|
12
7
|
import type { JsonValue } from 'type-fest';
|
|
13
|
-
/**
|
|
14
|
-
* TODO exists for BC within HttpCrawler - replace completely with StreamingHttpResponse in 4.0
|
|
15
|
-
* @internal
|
|
16
|
-
*/
|
|
17
|
-
export type PlainResponse = Omit<HttpResponse, 'body'> & IncomingMessage & {
|
|
18
|
-
body?: unknown;
|
|
19
|
-
};
|
|
20
8
|
export type HttpErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
21
9
|
JSONData extends JsonValue = any> = ErrorHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
22
|
-
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext> extends BasicCrawlerOptions<Context> {
|
|
10
|
+
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> {
|
|
23
11
|
/**
|
|
24
12
|
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
|
|
25
13
|
*/
|
|
@@ -28,20 +16,14 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
28
16
|
* If set to true, SSL certificate errors will be ignored.
|
|
29
17
|
*/
|
|
30
18
|
ignoreSslErrors?: boolean;
|
|
31
|
-
/**
|
|
32
|
-
* If set, this crawler will be configured for all connections to use
|
|
33
|
-
* [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
|
|
34
|
-
* For more information, see the [documentation](https://docs.apify.com/proxy).
|
|
35
|
-
*/
|
|
36
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
37
19
|
/**
|
|
38
20
|
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
39
|
-
* or browser properties before navigation. The function accepts
|
|
40
|
-
* which
|
|
21
|
+
* or browser properties before navigation. The function accepts one parameter `crawlingContext`,
|
|
22
|
+
* which is passed to the `requestAsBrowser()` function the crawler calls to navigate.
|
|
41
23
|
* Example:
|
|
42
24
|
* ```
|
|
43
25
|
* preNavigationHooks: [
|
|
44
|
-
* async (crawlingContext
|
|
26
|
+
* async (crawlingContext) => {
|
|
45
27
|
* // ...
|
|
46
28
|
* },
|
|
47
29
|
* ]
|
|
@@ -50,7 +32,7 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
50
32
|
* Modyfing `pageOptions` is supported only in Playwright incognito.
|
|
51
33
|
* See {@link PrePageCreateHook}
|
|
52
34
|
*/
|
|
53
|
-
preNavigationHooks?: InternalHttpHook<
|
|
35
|
+
preNavigationHooks?: InternalHttpHook<CrawlingContext>[];
|
|
54
36
|
/**
|
|
55
37
|
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
|
|
56
38
|
* The function accepts `crawlingContext` as the only parameter.
|
|
@@ -63,7 +45,7 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
63
45
|
* ]
|
|
64
46
|
* ```
|
|
65
47
|
*/
|
|
66
|
-
postNavigationHooks?:
|
|
48
|
+
postNavigationHooks?: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
67
49
|
/**
|
|
68
50
|
* An array of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types)
|
|
69
51
|
* you want the crawler to load and process. By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
|
|
@@ -112,15 +94,24 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
|
|
|
112
94
|
/**
|
|
113
95
|
* @internal
|
|
114
96
|
*/
|
|
115
|
-
export type InternalHttpHook<Context> = (crawlingContext: Context
|
|
97
|
+
export type InternalHttpHook<Context> = (crawlingContext: Context) => Awaitable<void>;
|
|
116
98
|
export type HttpHook<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
117
99
|
JSONData extends JsonValue = any> = InternalHttpHook<HttpCrawlingContext<UserData, JSONData>>;
|
|
100
|
+
interface CrawlingContextWithReponse<UserData extends Dictionary = any> extends CrawlingContext<UserData> {
|
|
101
|
+
/**
|
|
102
|
+
* The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
|
|
103
|
+
*/
|
|
104
|
+
request: LoadedRequest<CrawleeRequest<UserData>>;
|
|
105
|
+
/**
|
|
106
|
+
* The HTTP response object containing status code, headers, and other response metadata.
|
|
107
|
+
*/
|
|
108
|
+
response: Response;
|
|
109
|
+
}
|
|
118
110
|
/**
|
|
119
111
|
* @internal
|
|
120
112
|
*/
|
|
121
113
|
export interface InternalHttpCrawlingContext<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
122
|
-
JSONData extends JsonValue = any
|
|
123
|
-
Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
114
|
+
JSONData extends JsonValue = any> extends CrawlingContextWithReponse<UserData> {
|
|
124
115
|
/**
|
|
125
116
|
* The request body of the web page.
|
|
126
117
|
* The type depends on the `Content-Type` header of the web page:
|
|
@@ -139,7 +130,6 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
139
130
|
type: string;
|
|
140
131
|
encoding: BufferEncoding;
|
|
141
132
|
};
|
|
142
|
-
response: PlainResponse;
|
|
143
133
|
/**
|
|
144
134
|
* Wait for an element matching the selector to appear. Timeout is ignored.
|
|
145
135
|
*
|
|
@@ -167,7 +157,7 @@ Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
|
|
|
167
157
|
*/
|
|
168
158
|
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
|
|
169
159
|
}
|
|
170
|
-
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData
|
|
160
|
+
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData> {
|
|
171
161
|
}
|
|
172
162
|
export type HttpRequestHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
173
163
|
JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData, JSONData>>;
|
|
@@ -192,11 +182,11 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
192
182
|
*
|
|
193
183
|
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
194
184
|
*
|
|
195
|
-
* We can use the `preNavigationHooks` to adjust
|
|
185
|
+
* We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
|
|
196
186
|
*
|
|
197
187
|
* ```javascript
|
|
198
188
|
* preNavigationHooks: [
|
|
199
|
-
* (crawlingContext
|
|
189
|
+
* (crawlingContext) => {
|
|
200
190
|
* // ...
|
|
201
191
|
* },
|
|
202
192
|
* ]
|
|
@@ -238,16 +228,10 @@ JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData,
|
|
|
238
228
|
* ```
|
|
239
229
|
* @category Crawlers
|
|
240
230
|
*/
|
|
241
|
-
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any,
|
|
231
|
+
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any> = InternalHttpCrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
|
|
242
232
|
readonly config: Configuration;
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
* Only available if used by the crawler.
|
|
246
|
-
*/
|
|
247
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
248
|
-
protected userRequestHandlerTimeoutMillis: number;
|
|
249
|
-
protected preNavigationHooks: InternalHttpHook<Context>[];
|
|
250
|
-
protected postNavigationHooks: InternalHttpHook<Context>[];
|
|
233
|
+
protected preNavigationHooks: InternalHttpHook<CrawlingContext>[];
|
|
234
|
+
protected postNavigationHooks: ((crawlingContext: CrawlingContextWithReponse) => Awaitable<void>)[];
|
|
251
235
|
protected persistCookiesPerSession: boolean;
|
|
252
236
|
protected navigationTimeoutMillis: number;
|
|
253
237
|
protected ignoreSslErrors: boolean;
|
|
@@ -267,8 +251,6 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
267
251
|
suggestResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
268
252
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
269
253
|
forceResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
270
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
271
|
-
proxyConfiguration: ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
272
254
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
273
255
|
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
274
256
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -279,6 +261,10 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
279
261
|
preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
280
262
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
281
263
|
postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
264
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
265
|
+
contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
266
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
267
|
+
extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
282
268
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
283
269
|
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
284
270
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -299,12 +285,16 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
299
285
|
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
300
286
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
301
287
|
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
288
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
289
|
+
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
302
290
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
303
291
|
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
304
292
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
305
293
|
sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
306
294
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
307
295
|
useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
296
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
297
|
+
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
308
298
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
309
299
|
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
310
300
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -312,7 +302,7 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
312
302
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
313
303
|
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
314
304
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
315
|
-
respectRobotsTxtFile: import("ow").
|
|
305
|
+
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
|
|
316
306
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
317
307
|
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
318
308
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -331,135 +321,68 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
331
321
|
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
332
322
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
333
323
|
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
324
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
325
|
+
id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
334
326
|
};
|
|
335
327
|
/**
|
|
336
328
|
* All `HttpCrawlerOptions` parameters are passed via an options object.
|
|
337
329
|
*/
|
|
338
|
-
constructor(options?: HttpCrawlerOptions<Context>, config?: Configuration);
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
use(extension: CrawlerExtension): void;
|
|
345
|
-
/**
|
|
346
|
-
* Wrapper around requestHandler that opens and closes pages etc.
|
|
347
|
-
*/
|
|
348
|
-
protected _runRequestHandler(crawlingContext: Context): Promise<void>;
|
|
349
|
-
protected isRequestBlocked(crawlingContext: Context): Promise<string | false>;
|
|
350
|
-
protected _handleNavigation(crawlingContext: Context): Promise<void>;
|
|
330
|
+
constructor(options?: HttpCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<InternalHttpCrawlingContext, Context>, config?: Configuration);
|
|
331
|
+
protected buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext>;
|
|
332
|
+
private makeHttpRequest;
|
|
333
|
+
private processHttpResponse;
|
|
334
|
+
private handleBlockedRequestByContent;
|
|
335
|
+
protected isRequestBlocked(crawlingContext: InternalHttpCrawlingContext): Promise<string | false>;
|
|
351
336
|
/**
|
|
352
|
-
*
|
|
337
|
+
* Returns the `Cookie` header value based on the current context and
|
|
338
|
+
* any changes that occurred in the navigation hooks.
|
|
353
339
|
*/
|
|
354
|
-
protected _applyCookies({ session, request }: CrawlingContext,
|
|
340
|
+
protected _applyCookies({ session, request }: CrawlingContext, preHookCookies: string, postHookCookies: string): string;
|
|
355
341
|
/**
|
|
356
342
|
* Function to make the HTTP request. It performs optimizations
|
|
357
343
|
* on the request such as only downloading the request body if the
|
|
358
344
|
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
359
345
|
*/
|
|
360
|
-
protected _requestFunction({ request, session, proxyUrl,
|
|
346
|
+
protected _requestFunction({ request, session, proxyUrl, cookieString, }: RequestFunctionOptions): Promise<Response>;
|
|
361
347
|
/**
|
|
362
348
|
* Encodes and parses response according to the provided content type
|
|
363
349
|
*/
|
|
364
|
-
protected _parseResponse(request:
|
|
365
|
-
|
|
366
|
-
response: IncomingMessage;
|
|
350
|
+
protected _parseResponse(request: CrawleeRequest, response: Response): Promise<{
|
|
351
|
+
response: Response;
|
|
367
352
|
contentType: {
|
|
368
353
|
type: string;
|
|
369
354
|
encoding: BufferEncoding;
|
|
370
355
|
};
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
356
|
+
body: string;
|
|
357
|
+
} | {
|
|
358
|
+
body: Buffer<ArrayBuffer>;
|
|
359
|
+
response: Response;
|
|
374
360
|
contentType: {
|
|
375
361
|
type: string;
|
|
376
362
|
encoding: BufferEncoding;
|
|
377
363
|
};
|
|
378
|
-
enqueueLinks: () => Promise<{
|
|
379
|
-
processedRequests: never[];
|
|
380
|
-
unprocessedRequests: never[];
|
|
381
|
-
}>;
|
|
382
364
|
}>;
|
|
383
|
-
protected _parseHTML(response: IncomingMessage, _isXml: boolean, _crawlingContext: Context): Promise<Partial<Context>>;
|
|
384
365
|
/**
|
|
385
366
|
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
386
367
|
*/
|
|
387
|
-
protected _getRequestOptions(request:
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
402
|
-
timeout?: import("got-scraping").Delays | undefined;
|
|
403
|
-
prefixUrl?: string | URL | undefined;
|
|
404
|
-
form?: Record<string, any> | undefined;
|
|
405
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
406
|
-
cookieJar?: import("got-scraping").PromiseCookieJar | import("got-scraping").ToughCookieJar | undefined;
|
|
407
|
-
signal?: AbortSignal | undefined;
|
|
408
|
-
ignoreInvalidCookies?: boolean | undefined;
|
|
409
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
410
|
-
searchParams?: string | import("got-scraping").SearchParameters | URLSearchParams | undefined;
|
|
411
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
412
|
-
dnsLookup?: import("cacheable-lookup").default["lookup"] | undefined;
|
|
413
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
414
|
-
dnsCache?: import("cacheable-lookup").default | boolean | undefined;
|
|
415
|
-
context?: Record<string, unknown> | undefined;
|
|
416
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
417
|
-
followRedirect?: boolean | ((response: import("got-scraping").PlainResponse) => boolean) | undefined;
|
|
418
|
-
maxRedirects?: number | undefined;
|
|
419
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
420
|
-
cache?: string | import("cacheable-request").StorageAdapter | boolean | undefined;
|
|
421
|
-
throwHttpErrors?: boolean | undefined;
|
|
422
|
-
username?: string | undefined;
|
|
423
|
-
password?: string | undefined;
|
|
424
|
-
http2?: boolean | undefined;
|
|
425
|
-
allowGetBody?: boolean | undefined;
|
|
426
|
-
methodRewriting?: boolean | undefined;
|
|
427
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
428
|
-
dnsLookupIpVersion?: import("got-scraping").DnsLookupIpVersion;
|
|
429
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
430
|
-
parseJson?: import("got-scraping").ParseJsonFunction | undefined;
|
|
431
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
432
|
-
stringifyJson?: import("got-scraping").StringifyJsonFunction | undefined;
|
|
433
|
-
localAddress?: string | undefined;
|
|
434
|
-
method?: Method | undefined;
|
|
435
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
436
|
-
createConnection?: import("got-scraping").CreateConnectionFunction | undefined;
|
|
437
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
438
|
-
cacheOptions?: import("got-scraping").CacheOptions | undefined;
|
|
439
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
440
|
-
https?: import("got-scraping").HttpsOptions | undefined;
|
|
441
|
-
encoding?: BufferEncoding | undefined;
|
|
442
|
-
resolveBodyOnly?: boolean | undefined;
|
|
443
|
-
isStream?: boolean | undefined;
|
|
444
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
445
|
-
responseType?: import("got-scraping").ResponseType | undefined;
|
|
446
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
447
|
-
pagination?: import("got-scraping").PaginationOptions<unknown, unknown> | undefined;
|
|
448
|
-
setHost?: boolean | undefined;
|
|
449
|
-
maxHeaderSize?: number | undefined;
|
|
450
|
-
enableUnixSockets?: boolean | undefined;
|
|
451
|
-
} & {
|
|
452
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
453
|
-
hooks?: Partial<import("got-scraping").Hooks>;
|
|
454
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
455
|
-
retry?: Partial<import("got-scraping").RetryOptions>;
|
|
456
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
457
|
-
} & import("got-scraping").Context & Required<Pick<OptionsInit, "url">> & {
|
|
458
|
-
isStream: true;
|
|
368
|
+
protected _getRequestOptions(request: CrawleeRequest, session?: Session, proxyUrl?: string): {
|
|
369
|
+
url: string;
|
|
370
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
371
|
+
method: import("@crawlee/types").AllowedHttpMethods;
|
|
372
|
+
proxyUrl: string | undefined;
|
|
373
|
+
timeout: number;
|
|
374
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
375
|
+
cookieJar: import("tough-cookie").CookieJar | undefined;
|
|
376
|
+
sessionToken: Session | undefined;
|
|
377
|
+
headers: Record<string, string> | undefined;
|
|
378
|
+
https: {
|
|
379
|
+
rejectUnauthorized: boolean;
|
|
380
|
+
};
|
|
381
|
+
body: string | undefined;
|
|
459
382
|
};
|
|
460
|
-
protected _encodeResponse(request:
|
|
383
|
+
protected _encodeResponse(request: CrawleeRequest, response: Response, encoding: BufferEncoding): {
|
|
461
384
|
encoding: BufferEncoding;
|
|
462
|
-
response:
|
|
385
|
+
response: Response;
|
|
463
386
|
};
|
|
464
387
|
/**
|
|
465
388
|
* Checks and extends supported mime types
|
|
@@ -476,10 +399,10 @@ export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any
|
|
|
476
399
|
private _requestAsBrowser;
|
|
477
400
|
}
|
|
478
401
|
interface RequestFunctionOptions {
|
|
479
|
-
request:
|
|
402
|
+
request: CrawleeRequest;
|
|
480
403
|
session?: Session;
|
|
481
404
|
proxyUrl?: string;
|
|
482
|
-
|
|
405
|
+
cookieString?: string;
|
|
483
406
|
}
|
|
484
407
|
/**
|
|
485
408
|
* Creates new {@link Router} instance that works based on request labels.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"http-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/http-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAER,mBAAmB,EACnB,eAAe,EACf,YAAY,EACZ,sBAAsB,EACtB,OAAO,IAAI,cAAc,EACzB,cAAc,EACd,sBAAsB,EACtB,YAAY,EACZ,OAAO,EACV,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACH,YAAY,EAEZ,aAAa,EACb,eAAe,EAOlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AACnD,OAAO,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5D,OAAO,EAAE,KAAK,WAAW,EAAuB,MAAM,gBAAgB,CAAC;AAEvE,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAK9D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAwB3C,MAAM,MAAM,gBAAgB,CACxB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,YAAY,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE1D,MAAM,WAAW,kBAAkB,CAC/B,OAAO,SAAS,2BAA2B,GAAG,2BAA2B,EACzE,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,mBAAmB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IACrE;;OAEG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B;;;;;;;;;;;;;;;OAeG;IACH,kBAAkB,CAAC,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAEzD;;;;;;;;;;;OAWG;IACH,mBAAmB,CAAC,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IAE3F;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE/B;;;;;;;;;;OAUG;IACH,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAEjC;;;;;;;;OAQG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;;;;OAKG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IAEnC;;;OAGG;IACH,0BAA0B,CAAC,EAAE,MAAM,EAAE,CAAC;IAEtC;;;OAGG;IACH,8BAA8B,CAAC,EAAE,MAAM,EAAE,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,MAAM,gBAAgB,CAAC,OAAO,IAAI,CAAC,eAAe,EAAE,OAAO,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAEtF,MAAM,MAAM,QAAQ,CAChB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,gBAAgB,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE9D,UAAU,0BAA0B,CAChC,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,eAAe,CAAC,QAAQ,CAAC;IAC/B;;OAEG;IACH,OAAO,EAAE,aAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC;IAEjD;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,2BAA2B,CACxC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,CAClC,SAAQ,0BAA0B,CAAC,QAAQ,CAAC;IAC1C;;;;;OAKG;IACH,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC;IAEtB;;OAEG;IACH,IAAI,EAAE,QAAQ,CAAC;IAEf;;OAEG;IACH,WAAW,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,cAAc,CAAA;KAAE,CAAC;IAExD;;;;;;;;;;;OAWG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,WAAW,mBAAmB,CAAC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,QAAQ,SAAS,SAAS,GAAG,GAAG,CACpG,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,CAAC;CAAG;AAE9D,MAAM,MAAM,kBAAkB,CAC1B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,SAAS,GAAG,GAAG,IAChC,cAAc,CAAC,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkEG;AACH,qBAAa,WAAW,CACpB,OAAO,SAAS,2BAA2B,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,2BAA2B,EACnF,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,CAC9D,SAAQ,YAAY,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;aAmCxC,MAAM;IAlC5B,SAAS,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;IAClE,SAAS,CAAC,mBAAmB,EAAE,CAAC,CAAC,eAAe,EAAE,0BAA0B,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;IACpG,SAAS,CAAC,wBAAwB,EAAE,OAAO,CAAC;IAC5C,SAAS,CAAC,uBAAuB,EAAE,MAAM,CAAC;IAC1C,SAAS,CAAC,eAAe,EAAE,OAAO,CAAC;IACnC,SAAS,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;IAC3C,SAAS,CAAC,qBAAqB,CAAC,EAAE,MAAM,CAAC;IACzC,SAAS,CAAC,8BAA8B,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IACtD,SAAS,CAAC,0BAA0B,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAClD,SAAS,CAAC,QAAQ,CAAC,kBAAkB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAEnD,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAepC;IAEF;;OAEG;gBAEC,OAAO,GAAE,kBAAkB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC,GACnE,sBAAsB,CAAC,2BAA2B,EAAE,OAAO,CAAa,EAC1D,MAAM,gBAAkC;IAkE9D,SAAS,CAAC,oBAAoB,IAAI,eAAe,CAAC,eAAe,EAAE,2BAA2B,CAAC;YASjF,eAAe;YAkDf,mBAAmB;YAuEnB,6BAA6B;cAQ3B,gBAAgB,CAAC,eAAe,EAAE,2BAA2B,GAAG,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC;IAyBvG;;;OAGG;IACH,SAAS,CAAC,aAAa,CACnB,EAAE,OAAO,EAAE,OAAO,EAAE,EAAE,eAAe,EACrC,cAAc,EAAE,MAAM,EACtB,eAAe,EAAE,MAAM,GACxB,MAAM;IAQT;;;;OAIG;cACa,gBAAgB,CAAC,EAC7B,OAAO,EACP,OAAO,EACP,QAAQ,EACR,YAAY,GACf,EAAE,sBAAsB,GAAG,OAAO,CAAC,QAAQ,CAAC;IAwB7C;;OAEG;cACa,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,QAAQ;;;;;;;;;;;;;;;IA2C1E;;OAEG;IACH,SAAS,CAAC,kBAAkB,CAAC,OAAO,EAAE,cAAc,EAAE,OAAO,CAAC,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,MAAM;;;;;;;;;;;cAY/D,MAAM,GAAG,SAAS;;IAmB7C,SAAS,CAAC,eAAe,CACrB,OAAO,EAAE,cAAc,EACvB,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,cAAc,GACzB;QACC,QAAQ,EAAE,cAAc,CAAC;QACzB,QAAQ,EAAE,QAAQ,CAAC;KACtB;IAsCD;;OAEG;IACH,SAAS,CAAC,yBAAyB,CAAC,mBAAmB,EAAE,CAAC,MAAM,GAAG,WAAW,GAAG,YAAY,CAAC,EAAE;IAgBhG;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,OAAO,CAAC,EAAE,OAAO;IAKjD,OAAO,CAAC,oBAAoB;IAkB5B;;OAEG;IACH,OAAO,CAAC,iBAAiB,CAsCvB;CACL;AAED,UAAU,sBAAsB;IAC5B,OAAO,EAAE,cAAc,CAAC;IACxB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,gBAAgB,CAC5B,OAAO,SAAS,mBAAmB,GAAG,mBAAmB,EACzD,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,mDAEzC"}
|