@crawlee/core 3.13.3-beta.11 → 3.13.3-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/autoscaling/autoscaled_pool.d.ts +16 -16
- package/autoscaling/autoscaled_pool.js +13 -13
- package/autoscaling/snapshotter.d.ts +1 -1
- package/autoscaling/snapshotter.js +1 -1
- package/autoscaling/system_status.d.ts +12 -12
- package/autoscaling/system_status.js +11 -11
- package/configuration.d.ts +10 -10
- package/configuration.js +4 -4
- package/crawlers/crawler_commons.d.ts +12 -12
- package/crawlers/crawler_commons.js +4 -4
- package/crawlers/statistics.d.ts +2 -2
- package/crawlers/statistics.js +1 -1
- package/enqueue_links/enqueue_links.d.ts +14 -14
- package/enqueue_links/enqueue_links.js +5 -5
- package/enqueue_links/shared.d.ts +2 -2
- package/http_clients/base-http-client.d.ts +7 -7
- package/http_clients/base-http-client.js +1 -1
- package/package.json +5 -5
- package/proxy_configuration.d.ts +11 -11
- package/proxy_configuration.js +8 -8
- package/request.d.ts +3 -3
- package/request.js +2 -2
- package/session_pool/session.d.ts +1 -1
- package/session_pool/session_pool.d.ts +12 -12
- package/session_pool/session_pool.js +10 -10
- package/storages/dataset.d.ts +15 -15
- package/storages/dataset.js +9 -9
- package/storages/key_value_store.d.ts +32 -32
- package/storages/key_value_store.js +22 -22
- package/storages/request_list.d.ts +35 -35
- package/storages/request_list.js +19 -19
- package/storages/request_provider.d.ts +19 -19
- package/storages/request_provider.js +12 -12
- package/storages/request_queue.d.ts +16 -16
- package/storages/request_queue.js +16 -16
- package/storages/request_queue_v2.d.ts +7 -7
- package/storages/request_queue_v2.js +7 -7
- package/storages/utils.d.ts +2 -2
|
@@ -16,9 +16,9 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
|
|
|
16
16
|
requestQueue?: RequestProvider;
|
|
17
17
|
/** A CSS selector matching links to be enqueued. */
|
|
18
18
|
selector?: string;
|
|
19
|
-
/** Sets {@
|
|
19
|
+
/** Sets {@link Request.userData} for newly enqueued requests. */
|
|
20
20
|
userData?: Dictionary;
|
|
21
|
-
/** Sets {@
|
|
21
|
+
/** Sets {@link Request.label} for newly enqueued requests. */
|
|
22
22
|
label?: string;
|
|
23
23
|
/**
|
|
24
24
|
* If set to `true`, tells the crawler to skip navigation and process the request directly.
|
|
@@ -35,7 +35,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
|
|
|
35
35
|
* containing glob pattern strings matching the URLs to be enqueued.
|
|
36
36
|
*
|
|
37
37
|
* The plain objects must include at least the `glob` property, which holds the glob pattern string.
|
|
38
|
-
* All remaining keys will be used as request options for the corresponding enqueued {@
|
|
38
|
+
* All remaining keys will be used as request options for the corresponding enqueued {@link Request} objects.
|
|
39
39
|
*
|
|
40
40
|
* The matching is always case-insensitive.
|
|
41
41
|
* If you need case-sensitive matching, use `regexps` property directly.
|
|
@@ -49,7 +49,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
|
|
|
49
49
|
* containing patterns matching URLs that will **never** be enqueued.
|
|
50
50
|
*
|
|
51
51
|
* The plain objects must include either the `glob` property or the `regexp` property.
|
|
52
|
-
* All remaining keys will be used as request options for the corresponding enqueued {@
|
|
52
|
+
* All remaining keys will be used as request options for the corresponding enqueued {@link Request} objects.
|
|
53
53
|
*
|
|
54
54
|
* Glob matching is always case-insensitive.
|
|
55
55
|
* If you need case-sensitive matching, provide a regexp.
|
|
@@ -60,7 +60,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
|
|
|
60
60
|
* containing regular expressions matching the URLs to be enqueued.
|
|
61
61
|
*
|
|
62
62
|
* The plain objects must include at least the `regexp` property, which holds the regular expression.
|
|
63
|
-
* All remaining keys will be used as request options for the corresponding enqueued {@
|
|
63
|
+
* All remaining keys will be used as request options for the corresponding enqueued {@link Request} objects.
|
|
64
64
|
*
|
|
65
65
|
* If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the function
|
|
66
66
|
* enqueues the links with the same subdomain.
|
|
@@ -70,11 +70,11 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
|
|
|
70
70
|
* *NOTE:* In future versions of SDK the options will be removed.
|
|
71
71
|
* Please use `globs` or `regexps` instead.
|
|
72
72
|
*
|
|
73
|
-
* An array of {@
|
|
74
|
-
* containing {@
|
|
73
|
+
* An array of {@link PseudoUrl} strings or plain objects
|
|
74
|
+
* containing {@link PseudoUrl} strings matching the URLs to be enqueued.
|
|
75
75
|
*
|
|
76
76
|
* The plain objects must include at least the `purl` property, which holds the pseudo-URL string.
|
|
77
|
-
* All remaining keys will be used as request options for the corresponding enqueued {@
|
|
77
|
+
* All remaining keys will be used as request options for the corresponding enqueued {@link Request} objects.
|
|
78
78
|
*
|
|
79
79
|
* With a pseudo-URL string, the matching is always case-insensitive.
|
|
80
80
|
* If you need case-sensitive matching, use `regexps` property directly.
|
|
@@ -86,7 +86,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
|
|
|
86
86
|
*/
|
|
87
87
|
pseudoUrls?: readonly PseudoUrlInput[];
|
|
88
88
|
/**
|
|
89
|
-
* Just before a new {@
|
|
89
|
+
* Just before a new {@link Request} is constructed and enqueued to the {@link RequestQueue}, this function can be used
|
|
90
90
|
* to remove it or modify its contents such as `userData`, `payload` or, most importantly `uniqueKey`. This is useful
|
|
91
91
|
* when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads,
|
|
92
92
|
* or to dynamically update or create `userData`.
|
|
@@ -197,11 +197,11 @@ export declare enum EnqueueStrategy {
|
|
|
197
197
|
SameOrigin = "same-origin"
|
|
198
198
|
}
|
|
199
199
|
/**
|
|
200
|
-
* This function enqueues the urls provided to the {@
|
|
200
|
+
* This function enqueues the urls provided to the {@link RequestQueue} provided. If you want to automatically find and enqueue links,
|
|
201
201
|
* you should use the context-aware `enqueueLinks` function provided on the crawler contexts.
|
|
202
202
|
*
|
|
203
203
|
* Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
|
|
204
|
-
* and override settings of the enqueued {@
|
|
204
|
+
* and override settings of the enqueued {@link Request} objects.
|
|
205
205
|
*
|
|
206
206
|
* **Example usage**
|
|
207
207
|
*
|
|
@@ -218,14 +218,14 @@ export declare enum EnqueueStrategy {
|
|
|
218
218
|
* ```
|
|
219
219
|
*
|
|
220
220
|
* @param options All `enqueueLinks()` parameters are passed via an options object.
|
|
221
|
-
* @returns Promise that resolves to {@
|
|
221
|
+
* @returns Promise that resolves to {@link BatchAddRequestsResult} object.
|
|
222
222
|
*/
|
|
223
223
|
export declare function enqueueLinks(options: SetRequired<EnqueueLinksOptions, 'requestQueue' | 'urls'>): Promise<BatchAddRequestsResult>;
|
|
224
224
|
/**
|
|
225
225
|
* @internal
|
|
226
|
-
* This method helps resolve the baseUrl that will be used for filtering in {@
|
|
226
|
+
* This method helps resolve the baseUrl that will be used for filtering in {@link enqueueLinks}.
|
|
227
227
|
* - If a user provides a base url, we always return it
|
|
228
|
-
* - If a user specifies {@
|
|
228
|
+
* - If a user specifies {@link EnqueueStrategy.All} strategy, they do not care if the newly found urls are on the original
|
|
229
229
|
* request domain, or a redirected one
|
|
230
230
|
* - In all other cases, we return the domain of the original request as that's the one we need to use for filtering
|
|
231
231
|
*/
|
|
@@ -61,11 +61,11 @@ var EnqueueStrategy;
|
|
|
61
61
|
EnqueueStrategy["SameOrigin"] = "same-origin";
|
|
62
62
|
})(EnqueueStrategy || (exports.EnqueueStrategy = EnqueueStrategy = {}));
|
|
63
63
|
/**
|
|
64
|
-
* This function enqueues the urls provided to the {@
|
|
64
|
+
* This function enqueues the urls provided to the {@link RequestQueue} provided. If you want to automatically find and enqueue links,
|
|
65
65
|
* you should use the context-aware `enqueueLinks` function provided on the crawler contexts.
|
|
66
66
|
*
|
|
67
67
|
* Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
|
|
68
|
-
* and override settings of the enqueued {@
|
|
68
|
+
* and override settings of the enqueued {@link Request} objects.
|
|
69
69
|
*
|
|
70
70
|
* **Example usage**
|
|
71
71
|
*
|
|
@@ -82,7 +82,7 @@ var EnqueueStrategy;
|
|
|
82
82
|
* ```
|
|
83
83
|
*
|
|
84
84
|
* @param options All `enqueueLinks()` parameters are passed via an options object.
|
|
85
|
-
* @returns Promise that resolves to {@
|
|
85
|
+
* @returns Promise that resolves to {@link BatchAddRequestsResult} object.
|
|
86
86
|
*/
|
|
87
87
|
async function enqueueLinks(options) {
|
|
88
88
|
if (!options || Object.keys(options).length === 0) {
|
|
@@ -215,9 +215,9 @@ async function enqueueLinks(options) {
|
|
|
215
215
|
}
|
|
216
216
|
/**
|
|
217
217
|
* @internal
|
|
218
|
-
* This method helps resolve the baseUrl that will be used for filtering in {@
|
|
218
|
+
* This method helps resolve the baseUrl that will be used for filtering in {@link enqueueLinks}.
|
|
219
219
|
* - If a user provides a base url, we always return it
|
|
220
|
-
* - If a user specifies {@
|
|
220
|
+
* - If a user specifies {@link EnqueueStrategy.All} strategy, they do not care if the newly found urls are on the original
|
|
221
221
|
* request domain, or a redirected one
|
|
222
222
|
* - In all other cases, we return the domain of the original request as that's the one we need to use for filtering
|
|
223
223
|
*/
|
|
@@ -54,8 +54,8 @@ export declare function filterRequestsByPatterns(requests: Request[], patterns?:
|
|
|
54
54
|
*/
|
|
55
55
|
export declare function createRequestOptions(sources: (string | Record<string, unknown>)[], options?: Pick<EnqueueLinksOptions, 'label' | 'userData' | 'baseUrl' | 'skipNavigation' | 'strategy'>): RequestOptions[];
|
|
56
56
|
/**
|
|
57
|
-
* Takes an Apify {@
|
|
58
|
-
* {@
|
|
57
|
+
* Takes an Apify {@link RequestOptions} object and changes its attributes in a desired way. This user-function is used
|
|
58
|
+
* {@link enqueueLinks} to modify requests before enqueuing them.
|
|
59
59
|
*/
|
|
60
60
|
export interface RequestTransform {
|
|
61
61
|
/**
|
|
@@ -13,7 +13,7 @@ type Timeout = {
|
|
|
13
13
|
};
|
|
14
14
|
type Method = 'GET' | 'POST' | 'PUT' | 'PATCH' | 'HEAD' | 'DELETE' | 'OPTIONS' | 'TRACE' | 'get' | 'post' | 'put' | 'patch' | 'head' | 'delete' | 'options' | 'trace';
|
|
15
15
|
/**
|
|
16
|
-
* Maps permitted values of the `responseType` option on {@
|
|
16
|
+
* Maps permitted values of the `responseType` option on {@link HttpRequest} to the types that they produce.
|
|
17
17
|
*/
|
|
18
18
|
export interface ResponseTypes {
|
|
19
19
|
'json': unknown;
|
|
@@ -35,7 +35,7 @@ interface PromiseCookieJar {
|
|
|
35
35
|
}
|
|
36
36
|
type SimpleHeaders = Record<string, string | string[] | undefined>;
|
|
37
37
|
/**
|
|
38
|
-
* HTTP Request as accepted by {@
|
|
38
|
+
* HTTP Request as accepted by {@link BaseHttpClient} methods.
|
|
39
39
|
*/
|
|
40
40
|
export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'> {
|
|
41
41
|
[k: string]: unknown;
|
|
@@ -61,7 +61,7 @@ export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'>
|
|
|
61
61
|
sessionToken?: object;
|
|
62
62
|
}
|
|
63
63
|
/**
|
|
64
|
-
* Additional options for HTTP requests that need to be handled separately before passing to {@
|
|
64
|
+
* Additional options for HTTP requests that need to be handled separately before passing to {@link BaseHttpClient}.
|
|
65
65
|
*/
|
|
66
66
|
export interface HttpRequestOptions<TResponseType extends keyof ResponseTypes = 'text'> extends HttpRequest<TResponseType> {
|
|
67
67
|
/** Search (query string) parameters to be appended to the request URL */
|
|
@@ -76,7 +76,7 @@ export interface HttpRequestOptions<TResponseType extends keyof ResponseTypes =
|
|
|
76
76
|
password?: string;
|
|
77
77
|
}
|
|
78
78
|
/**
|
|
79
|
-
* HTTP response data, without a body, as returned by {@
|
|
79
|
+
* HTTP response data, without a body, as returned by {@link BaseHttpClient} methods.
|
|
80
80
|
*/
|
|
81
81
|
export interface BaseHttpResponseData {
|
|
82
82
|
redirectUrls: URL[];
|
|
@@ -92,14 +92,14 @@ interface HttpResponseWithoutBody<TResponseType extends keyof ResponseTypes = ke
|
|
|
92
92
|
request: HttpRequest<TResponseType>;
|
|
93
93
|
}
|
|
94
94
|
/**
|
|
95
|
-
* HTTP response data as returned by the {@
|
|
95
|
+
* HTTP response data as returned by the {@link BaseHttpClient.sendRequest} method.
|
|
96
96
|
*/
|
|
97
97
|
export interface HttpResponse<TResponseType extends keyof ResponseTypes = keyof ResponseTypes> extends HttpResponseWithoutBody<TResponseType> {
|
|
98
98
|
[k: string]: any;
|
|
99
99
|
body: ResponseTypes[TResponseType];
|
|
100
100
|
}
|
|
101
101
|
/**
|
|
102
|
-
* HTTP response data as returned by the {@
|
|
102
|
+
* HTTP response data as returned by the {@link BaseHttpClient.stream} method.
|
|
103
103
|
*/
|
|
104
104
|
export interface StreamingHttpResponse extends HttpResponseWithoutBody {
|
|
105
105
|
stream: Readable;
|
|
@@ -127,7 +127,7 @@ export interface BaseHttpClient {
|
|
|
127
127
|
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse>;
|
|
128
128
|
}
|
|
129
129
|
/**
|
|
130
|
-
* Converts {@
|
|
130
|
+
* Converts {@link HttpRequestOptions} to a {@link HttpRequest}.
|
|
131
131
|
*/
|
|
132
132
|
export declare function processHttpRequestOptions<TResponseType extends keyof ResponseTypes = 'text'>({ searchParams, form, json, username, password, ...request }: HttpRequestOptions<TResponseType>): HttpRequest<TResponseType>;
|
|
133
133
|
export {};
|
|
@@ -3,7 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
3
3
|
exports.processHttpRequestOptions = processHttpRequestOptions;
|
|
4
4
|
const utils_1 = require("@crawlee/utils");
|
|
5
5
|
/**
|
|
6
|
-
* Converts {@
|
|
6
|
+
* Converts {@link HttpRequestOptions} to a {@link HttpRequest}.
|
|
7
7
|
*/
|
|
8
8
|
function processHttpRequestOptions({ searchParams, form, json, username, password, ...request }) {
|
|
9
9
|
const url = new URL(request.url);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/core",
|
|
3
|
-
"version": "3.13.3-beta.
|
|
3
|
+
"version": "3.13.3-beta.13",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=16.0.0"
|
|
@@ -59,9 +59,9 @@
|
|
|
59
59
|
"@apify/pseudo_url": "^2.0.30",
|
|
60
60
|
"@apify/timeout": "^0.3.0",
|
|
61
61
|
"@apify/utilities": "^2.7.10",
|
|
62
|
-
"@crawlee/memory-storage": "3.13.3-beta.
|
|
63
|
-
"@crawlee/types": "3.13.3-beta.
|
|
64
|
-
"@crawlee/utils": "3.13.3-beta.
|
|
62
|
+
"@crawlee/memory-storage": "3.13.3-beta.13",
|
|
63
|
+
"@crawlee/types": "3.13.3-beta.13",
|
|
64
|
+
"@crawlee/utils": "3.13.3-beta.13",
|
|
65
65
|
"@sapphire/async-queue": "^1.5.1",
|
|
66
66
|
"@vladfrangu/async_event_emitter": "^2.2.2",
|
|
67
67
|
"csv-stringify": "^6.2.0",
|
|
@@ -83,5 +83,5 @@
|
|
|
83
83
|
}
|
|
84
84
|
}
|
|
85
85
|
},
|
|
86
|
-
"gitHead": "
|
|
86
|
+
"gitHead": "dbeb9038f0ef619689f9067563cddcb375207ab6"
|
|
87
87
|
}
|
package/proxy_configuration.d.ts
CHANGED
|
@@ -15,7 +15,7 @@ export interface ProxyConfigurationOptions {
|
|
|
15
15
|
* Custom function that allows you to generate the new proxy URL dynamically. It gets the `sessionId` as a parameter and an optional parameter with the `Request` object when applicable.
|
|
16
16
|
* Can return either stringified proxy URL or `null` if the proxy should not be used. Can be asynchronous.
|
|
17
17
|
*
|
|
18
|
-
* This function is used to generate the URL when {@
|
|
18
|
+
* This function is used to generate the URL when {@link ProxyConfiguration.newUrl} or {@link ProxyConfiguration.newProxyInfo} is called.
|
|
19
19
|
*/
|
|
20
20
|
newUrlFunction?: ProxyConfigurationFunction;
|
|
21
21
|
/**
|
|
@@ -38,7 +38,7 @@ export interface TieredProxy {
|
|
|
38
38
|
/**
|
|
39
39
|
* The main purpose of the ProxyInfo object is to provide information
|
|
40
40
|
* about the current proxy connection used by the crawler for the request.
|
|
41
|
-
* Outside of crawlers, you can get this object by calling {@
|
|
41
|
+
* Outside of crawlers, you can get this object by calling {@link ProxyConfiguration.newProxyInfo}.
|
|
42
42
|
*
|
|
43
43
|
* **Example usage:**
|
|
44
44
|
*
|
|
@@ -67,7 +67,7 @@ export interface TieredProxy {
|
|
|
67
67
|
*/
|
|
68
68
|
export interface ProxyInfo {
|
|
69
69
|
/**
|
|
70
|
-
* The identifier of used {@
|
|
70
|
+
* The identifier of used {@link Session}, if used.
|
|
71
71
|
*/
|
|
72
72
|
sessionId?: string;
|
|
73
73
|
/**
|
|
@@ -129,9 +129,9 @@ declare class ProxyTierTracker {
|
|
|
129
129
|
* Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking
|
|
130
130
|
* your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
131
131
|
* them to use the selected proxies for all connections. You can get information about the currently used proxy by inspecting
|
|
132
|
-
* the {@
|
|
132
|
+
* the {@link ProxyInfo} property in your crawler's page function. There, you can inspect the proxy's URL and other attributes.
|
|
133
133
|
*
|
|
134
|
-
* If you want to use your own proxies, use the {@
|
|
134
|
+
* If you want to use your own proxies, use the {@link ProxyConfigurationOptions.proxyUrls} option. Your list of proxy URLs will
|
|
135
135
|
* be rotated by the configuration if this option is provided.
|
|
136
136
|
*
|
|
137
137
|
* **Example usage:**
|
|
@@ -164,7 +164,7 @@ export declare class ProxyConfiguration {
|
|
|
164
164
|
protected log: import("@apify/log").Log;
|
|
165
165
|
protected domainTiers: Map<string, ProxyTierTracker>;
|
|
166
166
|
/**
|
|
167
|
-
* Creates a {@
|
|
167
|
+
* Creates a {@link ProxyConfiguration} instance based on the provided options. Proxy servers are used to prevent target websites from
|
|
168
168
|
* blocking your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
169
169
|
* them to use the selected proxies for all connections.
|
|
170
170
|
*
|
|
@@ -185,16 +185,16 @@ export declare class ProxyConfiguration {
|
|
|
185
185
|
*/
|
|
186
186
|
constructor(options?: ProxyConfigurationOptions);
|
|
187
187
|
/**
|
|
188
|
-
* This function creates a new {@
|
|
188
|
+
* This function creates a new {@link ProxyInfo} info object.
|
|
189
189
|
* It is used by CheerioCrawler and PuppeteerCrawler to generate proxy URLs and also to allow the user to inspect
|
|
190
190
|
* the currently used proxy via the requestHandler parameter `proxyInfo`.
|
|
191
191
|
* Use it if you want to work with a rich representation of a proxy URL.
|
|
192
|
-
* If you need the URL string only, use {@
|
|
192
|
+
* If you need the URL string only, use {@link ProxyConfiguration.newUrl}.
|
|
193
193
|
* @param [sessionId]
|
|
194
|
-
* Represents the identifier of user {@
|
|
194
|
+
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
195
195
|
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
196
196
|
* When the provided sessionId is a number, it's converted to a string. Property sessionId of
|
|
197
|
-
* {@
|
|
197
|
+
* {@link ProxyInfo} is always returned as a type string.
|
|
198
198
|
*
|
|
199
199
|
* All the HTTP requests going through the proxy with the same session identifier
|
|
200
200
|
* will use the same target proxy server (i.e. the same IP address).
|
|
@@ -218,7 +218,7 @@ export declare class ProxyConfiguration {
|
|
|
218
218
|
/**
|
|
219
219
|
* Returns a new proxy URL based on provided configuration options and the `sessionId` parameter.
|
|
220
220
|
* @param [sessionId]
|
|
221
|
-
* Represents the identifier of user {@
|
|
221
|
+
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
222
222
|
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
223
223
|
* When the provided sessionId is a number, it's converted to a string.
|
|
224
224
|
*
|
package/proxy_configuration.js
CHANGED
|
@@ -68,9 +68,9 @@ class ProxyTierTracker {
|
|
|
68
68
|
* Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking
|
|
69
69
|
* your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
70
70
|
* them to use the selected proxies for all connections. You can get information about the currently used proxy by inspecting
|
|
71
|
-
* the {@
|
|
71
|
+
* the {@link ProxyInfo} property in your crawler's page function. There, you can inspect the proxy's URL and other attributes.
|
|
72
72
|
*
|
|
73
|
-
* If you want to use your own proxies, use the {@
|
|
73
|
+
* If you want to use your own proxies, use the {@link ProxyConfigurationOptions.proxyUrls} option. Your list of proxy URLs will
|
|
74
74
|
* be rotated by the configuration if this option is provided.
|
|
75
75
|
*
|
|
76
76
|
* **Example usage:**
|
|
@@ -94,7 +94,7 @@ class ProxyTierTracker {
|
|
|
94
94
|
*/
|
|
95
95
|
class ProxyConfiguration {
|
|
96
96
|
/**
|
|
97
|
-
* Creates a {@
|
|
97
|
+
* Creates a {@link ProxyConfiguration} instance based on the provided options. Proxy servers are used to prevent target websites from
|
|
98
98
|
* blocking your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
99
99
|
* them to use the selected proxies for all connections.
|
|
100
100
|
*
|
|
@@ -178,16 +178,16 @@ class ProxyConfiguration {
|
|
|
178
178
|
this.tieredProxyUrls = tieredProxyUrls;
|
|
179
179
|
}
|
|
180
180
|
/**
|
|
181
|
-
* This function creates a new {@
|
|
181
|
+
* This function creates a new {@link ProxyInfo} info object.
|
|
182
182
|
* It is used by CheerioCrawler and PuppeteerCrawler to generate proxy URLs and also to allow the user to inspect
|
|
183
183
|
* the currently used proxy via the requestHandler parameter `proxyInfo`.
|
|
184
184
|
* Use it if you want to work with a rich representation of a proxy URL.
|
|
185
|
-
* If you need the URL string only, use {@
|
|
185
|
+
* If you need the URL string only, use {@link ProxyConfiguration.newUrl}.
|
|
186
186
|
* @param [sessionId]
|
|
187
|
-
* Represents the identifier of user {@
|
|
187
|
+
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
188
188
|
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
189
189
|
* When the provided sessionId is a number, it's converted to a string. Property sessionId of
|
|
190
|
-
* {@
|
|
190
|
+
* {@link ProxyInfo} is always returned as a type string.
|
|
191
191
|
*
|
|
192
192
|
* All the HTTP requests going through the proxy with the same session identifier
|
|
193
193
|
* will use the same target proxy server (i.e. the same IP address).
|
|
@@ -275,7 +275,7 @@ class ProxyConfiguration {
|
|
|
275
275
|
/**
|
|
276
276
|
* Returns a new proxy URL based on provided configuration options and the `sessionId` parameter.
|
|
277
277
|
* @param [sessionId]
|
|
278
|
-
* Represents the identifier of user {@
|
|
278
|
+
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
279
279
|
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
280
280
|
* When the provided sessionId is a number, it's converted to a string.
|
|
281
281
|
*
|
package/request.d.ts
CHANGED
|
@@ -19,7 +19,7 @@ export declare enum RequestState {
|
|
|
19
19
|
* Each `Request` instance has the `uniqueKey` property, which can be either specified
|
|
20
20
|
* manually in the constructor or generated automatically from the URL. Two requests with the same `uniqueKey`
|
|
21
21
|
* are considered as pointing to the same web resource. This behavior applies to all Crawlee classes,
|
|
22
|
-
* such as {@
|
|
22
|
+
* such as {@link RequestList}, {@link RequestQueue}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
|
|
23
23
|
*
|
|
24
24
|
* > To access and examine the actual request sent over http, with all autofilled headers you can access
|
|
25
25
|
* `response.request` object from the request handler
|
|
@@ -52,7 +52,7 @@ export declare class Request<UserData extends Dictionary = Dictionary> {
|
|
|
52
52
|
* An actually loaded URL after redirects, if present. HTTP redirects are guaranteed
|
|
53
53
|
* to be included.
|
|
54
54
|
*
|
|
55
|
-
* When using {@
|
|
55
|
+
* When using {@link PuppeteerCrawler} or {@link PlaywrightCrawler}, meta tag and JavaScript redirects may,
|
|
56
56
|
* or may not be included, depending on their nature. This generally means that redirects,
|
|
57
57
|
* which happen immediately will most likely be included, but delayed redirects will not.
|
|
58
58
|
*/
|
|
@@ -131,7 +131,7 @@ export declare class Request<UserData extends Dictionary = Dictionary> {
|
|
|
131
131
|
static hashPayload(payload: BinaryLike): string;
|
|
132
132
|
}
|
|
133
133
|
/**
|
|
134
|
-
* Specifies required and optional fields for constructing a {@
|
|
134
|
+
* Specifies required and optional fields for constructing a {@link Request}.
|
|
135
135
|
*/
|
|
136
136
|
export interface RequestOptions<UserData extends Dictionary = Dictionary> {
|
|
137
137
|
/** URL of the web page to crawl. It must be a non-empty string. */
|
package/request.js
CHANGED
|
@@ -48,7 +48,7 @@ var RequestState;
|
|
|
48
48
|
* Each `Request` instance has the `uniqueKey` property, which can be either specified
|
|
49
49
|
* manually in the constructor or generated automatically from the URL. Two requests with the same `uniqueKey`
|
|
50
50
|
* are considered as pointing to the same web resource. This behavior applies to all Crawlee classes,
|
|
51
|
-
* such as {@
|
|
51
|
+
* such as {@link RequestList}, {@link RequestQueue}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
|
|
52
52
|
*
|
|
53
53
|
* > To access and examine the actual request sent over http, with all autofilled headers you can access
|
|
54
54
|
* `response.request` object from the request handler
|
|
@@ -95,7 +95,7 @@ class Request {
|
|
|
95
95
|
* An actually loaded URL after redirects, if present. HTTP redirects are guaranteed
|
|
96
96
|
* to be included.
|
|
97
97
|
*
|
|
98
|
-
* When using {@
|
|
98
|
+
* When using {@link PuppeteerCrawler} or {@link PlaywrightCrawler}, meta tag and JavaScript redirects may,
|
|
99
99
|
* or may not be included, depending on their nature. This generally means that redirects,
|
|
100
100
|
* which happen immediately will most likely be included, but delayed redirects will not.
|
|
101
101
|
*/
|
|
@@ -4,7 +4,7 @@ import { CookieJar } from 'tough-cookie';
|
|
|
4
4
|
import type { Log } from '@apify/log';
|
|
5
5
|
import type { ResponseLike } from '../cookie_utils';
|
|
6
6
|
/**
|
|
7
|
-
* Persistable {@
|
|
7
|
+
* Persistable {@link Session} state.
|
|
8
8
|
*/
|
|
9
9
|
export interface SessionState {
|
|
10
10
|
id: string;
|
|
@@ -7,7 +7,7 @@ import { KeyValueStore } from '../storages/key_value_store';
|
|
|
7
7
|
import type { SessionOptions } from './session';
|
|
8
8
|
import { Session } from './session';
|
|
9
9
|
/**
|
|
10
|
-
* Factory user-function which creates customized {@
|
|
10
|
+
* Factory user-function which creates customized {@link Session} instances.
|
|
11
11
|
*/
|
|
12
12
|
export interface CreateSession {
|
|
13
13
|
/**
|
|
@@ -24,7 +24,7 @@ export interface SessionPoolOptions {
|
|
|
24
24
|
* @default 1000
|
|
25
25
|
*/
|
|
26
26
|
maxPoolSize?: number;
|
|
27
|
-
/** The configuration options for {@
|
|
27
|
+
/** The configuration options for {@link Session} instances. */
|
|
28
28
|
sessionOptions?: SessionOptions;
|
|
29
29
|
/** Name or Id of `KeyValueStore` where is the `SessionPool` state stored. */
|
|
30
30
|
persistStateKeyValueStoreId?: string;
|
|
@@ -54,11 +54,11 @@ export interface SessionPoolOptions {
|
|
|
54
54
|
}
|
|
55
55
|
/**
|
|
56
56
|
* Handles the rotation, creation and persistence of user-like sessions.
|
|
57
|
-
* Creates a pool of {@
|
|
57
|
+
* Creates a pool of {@link Session} instances, that are randomly rotated.
|
|
58
58
|
* When some session is marked as blocked, it is removed and new one is created instead (the pool never returns an unusable session).
|
|
59
59
|
* Learn more in the {@doclink guides/session-management | Session management guide}.
|
|
60
60
|
*
|
|
61
|
-
* You can create one by calling the {@
|
|
61
|
+
* You can create one by calling the {@link SessionPool.open} function.
|
|
62
62
|
*
|
|
63
63
|
* Session pool is already integrated into crawlers, and it can significantly improve your scraper
|
|
64
64
|
* performance with just 2 lines of code.
|
|
@@ -73,10 +73,10 @@ export interface SessionPoolOptions {
|
|
|
73
73
|
* })
|
|
74
74
|
* ```
|
|
75
75
|
*
|
|
76
|
-
* You can configure the pool with many options. See the {@
|
|
77
|
-
* Session pool is by default persisted in default {@
|
|
76
|
+
* You can configure the pool with many options. See the {@link SessionPoolOptions}.
|
|
77
|
+
* Session pool is by default persisted in default {@link KeyValueStore}.
|
|
78
78
|
* If you want to have one pool for all runs you have to specify
|
|
79
|
-
* {@
|
|
79
|
+
* {@link SessionPoolOptions.persistStateKeyValueStoreId}.
|
|
80
80
|
*
|
|
81
81
|
* **Advanced usage:**
|
|
82
82
|
*
|
|
@@ -147,8 +147,8 @@ export declare class SessionPool extends EventEmitter {
|
|
|
147
147
|
*/
|
|
148
148
|
get retiredSessionsCount(): number;
|
|
149
149
|
/**
|
|
150
|
-
* Starts periodic state persistence and potentially loads SessionPool state from {@
|
|
151
|
-
* It is called automatically by the {@
|
|
150
|
+
* Starts periodic state persistence and potentially loads SessionPool state from {@link KeyValueStore}.
|
|
151
|
+
* It is called automatically by the {@link SessionPool.open} function.
|
|
152
152
|
*/
|
|
153
153
|
initialize(): Promise<void>;
|
|
154
154
|
/**
|
|
@@ -184,7 +184,7 @@ export declare class SessionPool extends EventEmitter {
|
|
|
184
184
|
sessions: import("./session").SessionState[];
|
|
185
185
|
};
|
|
186
186
|
/**
|
|
187
|
-
* Persists the current state of the `SessionPool` into the default {@
|
|
187
|
+
* Persists the current state of the `SessionPool` into the default {@link KeyValueStore}.
|
|
188
188
|
* The state is persisted automatically in regular intervals.
|
|
189
189
|
* @param options - Override the persistence options provided in the constructor
|
|
190
190
|
*/
|
|
@@ -242,9 +242,9 @@ export declare class SessionPool extends EventEmitter {
|
|
|
242
242
|
protected _maybeLoadSessionPool(): Promise<void>;
|
|
243
243
|
/**
|
|
244
244
|
* Opens a SessionPool and returns a promise resolving to an instance
|
|
245
|
-
* of the {@
|
|
245
|
+
* of the {@link SessionPool} class that is already initialized.
|
|
246
246
|
*
|
|
247
|
-
* For more details and code examples, see the {@
|
|
247
|
+
* For more details and code examples, see the {@link SessionPool} class.
|
|
248
248
|
*/
|
|
249
249
|
static open(options?: SessionPoolOptions, config?: Configuration): Promise<SessionPool>;
|
|
250
250
|
}
|
|
@@ -12,11 +12,11 @@ const consts_1 = require("./consts");
|
|
|
12
12
|
const session_1 = require("./session");
|
|
13
13
|
/**
|
|
14
14
|
* Handles the rotation, creation and persistence of user-like sessions.
|
|
15
|
-
* Creates a pool of {@
|
|
15
|
+
* Creates a pool of {@link Session} instances, that are randomly rotated.
|
|
16
16
|
* When some session is marked as blocked, it is removed and new one is created instead (the pool never returns an unusable session).
|
|
17
17
|
* Learn more in the {@doclink guides/session-management | Session management guide}.
|
|
18
18
|
*
|
|
19
|
-
* You can create one by calling the {@
|
|
19
|
+
* You can create one by calling the {@link SessionPool.open} function.
|
|
20
20
|
*
|
|
21
21
|
* Session pool is already integrated into crawlers, and it can significantly improve your scraper
|
|
22
22
|
* performance with just 2 lines of code.
|
|
@@ -31,10 +31,10 @@ const session_1 = require("./session");
|
|
|
31
31
|
* })
|
|
32
32
|
* ```
|
|
33
33
|
*
|
|
34
|
-
* You can configure the pool with many options. See the {@
|
|
35
|
-
* Session pool is by default persisted in default {@
|
|
34
|
+
* You can configure the pool with many options. See the {@link SessionPoolOptions}.
|
|
35
|
+
* Session pool is by default persisted in default {@link KeyValueStore}.
|
|
36
36
|
* If you want to have one pool for all runs you have to specify
|
|
37
|
-
* {@
|
|
37
|
+
* {@link SessionPoolOptions.persistStateKeyValueStoreId}.
|
|
38
38
|
*
|
|
39
39
|
* **Advanced usage:**
|
|
40
40
|
*
|
|
@@ -222,8 +222,8 @@ class SessionPool extends node_events_1.EventEmitter {
|
|
|
222
222
|
return this.sessions.filter((session) => !session.isUsable()).length;
|
|
223
223
|
}
|
|
224
224
|
/**
|
|
225
|
-
* Starts periodic state persistence and potentially loads SessionPool state from {@
|
|
226
|
-
* It is called automatically by the {@
|
|
225
|
+
* Starts periodic state persistence and potentially loads SessionPool state from {@link KeyValueStore}.
|
|
226
|
+
* It is called automatically by the {@link SessionPool.open} function.
|
|
227
227
|
*/
|
|
228
228
|
async initialize() {
|
|
229
229
|
if (this.isInitialized) {
|
|
@@ -317,7 +317,7 @@ class SessionPool extends node_events_1.EventEmitter {
|
|
|
317
317
|
};
|
|
318
318
|
}
|
|
319
319
|
/**
|
|
320
|
-
* Persists the current state of the `SessionPool` into the default {@
|
|
320
|
+
* Persists the current state of the `SessionPool` into the default {@link KeyValueStore}.
|
|
321
321
|
* The state is persisted automatically in regular intervals.
|
|
322
322
|
* @param options - Override the persistence options provided in the constructor
|
|
323
323
|
*/
|
|
@@ -437,9 +437,9 @@ class SessionPool extends node_events_1.EventEmitter {
|
|
|
437
437
|
}
|
|
438
438
|
/**
|
|
439
439
|
* Opens a SessionPool and returns a promise resolving to an instance
|
|
440
|
-
* of the {@
|
|
440
|
+
* of the {@link SessionPool} class that is already initialized.
|
|
441
441
|
*
|
|
442
|
-
* For more details and code examples, see the {@
|
|
442
|
+
* For more details and code examples, see the {@link SessionPool} class.
|
|
443
443
|
*/
|
|
444
444
|
static async open(options, config) {
|
|
445
445
|
const sessionPool = new SessionPool(options, config);
|