@crawlee/core 4.0.0-beta.3 → 4.0.0-beta.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/autoscaling/autoscaled_pool.d.ts +3 -5
- package/autoscaling/autoscaled_pool.d.ts.map +1 -1
- package/autoscaling/autoscaled_pool.js +3 -9
- package/autoscaling/autoscaled_pool.js.map +1 -1
- package/autoscaling/snapshotter.d.ts +3 -13
- package/autoscaling/snapshotter.d.ts.map +1 -1
- package/autoscaling/snapshotter.js +15 -29
- package/autoscaling/snapshotter.js.map +1 -1
- package/autoscaling/system_status.d.ts +0 -3
- package/autoscaling/system_status.d.ts.map +1 -1
- package/autoscaling/system_status.js +2 -3
- package/autoscaling/system_status.js.map +1 -1
- package/configuration.d.ts +5 -78
- package/configuration.d.ts.map +1 -1
- package/configuration.js +6 -102
- package/configuration.js.map +1 -1
- package/cookie_utils.d.ts +1 -1
- package/cookie_utils.d.ts.map +1 -1
- package/cookie_utils.js +8 -8
- package/cookie_utils.js.map +1 -1
- package/crawlers/context_pipeline.d.ts +61 -0
- package/crawlers/context_pipeline.d.ts.map +1 -0
- package/crawlers/context_pipeline.js +99 -0
- package/crawlers/context_pipeline.js.map +1 -0
- package/crawlers/crawler_commons.d.ts +14 -22
- package/crawlers/crawler_commons.d.ts.map +1 -1
- package/crawlers/crawler_commons.js +0 -8
- package/crawlers/crawler_commons.js.map +1 -1
- package/crawlers/error_snapshotter.d.ts +3 -2
- package/crawlers/error_snapshotter.d.ts.map +1 -1
- package/crawlers/error_snapshotter.js +2 -2
- package/crawlers/error_snapshotter.js.map +1 -1
- package/crawlers/error_tracker.d.ts +2 -1
- package/crawlers/error_tracker.d.ts.map +1 -1
- package/crawlers/error_tracker.js.map +1 -1
- package/crawlers/index.d.ts +1 -1
- package/crawlers/index.d.ts.map +1 -1
- package/crawlers/index.js +1 -1
- package/crawlers/index.js.map +1 -1
- package/crawlers/internals/types.d.ts +8 -0
- package/crawlers/internals/types.d.ts.map +1 -0
- package/crawlers/internals/types.js +2 -0
- package/crawlers/internals/types.js.map +1 -0
- package/crawlers/statistics.d.ts +15 -15
- package/crawlers/statistics.d.ts.map +1 -1
- package/crawlers/statistics.js +21 -18
- package/crawlers/statistics.js.map +1 -1
- package/enqueue_links/enqueue_links.d.ts +30 -18
- package/enqueue_links/enqueue_links.d.ts.map +1 -1
- package/enqueue_links/enqueue_links.js +41 -23
- package/enqueue_links/enqueue_links.js.map +1 -1
- package/enqueue_links/shared.d.ts +24 -7
- package/enqueue_links/shared.d.ts.map +1 -1
- package/enqueue_links/shared.js +66 -37
- package/enqueue_links/shared.js.map +1 -1
- package/errors.d.ts +18 -0
- package/errors.d.ts.map +1 -1
- package/errors.js +29 -0
- package/errors.js.map +1 -1
- package/events/event_manager.d.ts +8 -5
- package/events/event_manager.d.ts.map +1 -1
- package/events/event_manager.js +7 -9
- package/events/event_manager.js.map +1 -1
- package/events/local_event_manager.d.ts +14 -4
- package/events/local_event_manager.d.ts.map +1 -1
- package/events/local_event_manager.js +27 -39
- package/events/local_event_manager.js.map +1 -1
- package/index.d.ts +2 -1
- package/index.d.ts.map +1 -1
- package/index.js +2 -1
- package/index.js.map +1 -1
- package/log.d.ts +155 -2
- package/log.d.ts.map +1 -1
- package/log.js +111 -0
- package/log.js.map +1 -1
- package/package.json +6 -7
- package/proxy_configuration.d.ts +17 -94
- package/proxy_configuration.d.ts.map +1 -1
- package/proxy_configuration.js +18 -54
- package/proxy_configuration.js.map +1 -1
- package/recoverable_state.d.ts +121 -0
- package/recoverable_state.d.ts.map +1 -0
- package/recoverable_state.js +137 -0
- package/recoverable_state.js.map +1 -0
- package/request.d.ts +39 -5
- package/request.d.ts.map +1 -1
- package/request.js +56 -15
- package/request.js.map +1 -1
- package/service_locator.d.ts +130 -0
- package/service_locator.d.ts.map +1 -0
- package/service_locator.js +290 -0
- package/service_locator.js.map +1 -0
- package/session_pool/session.d.ts +9 -22
- package/session_pool/session.d.ts.map +1 -1
- package/session_pool/session.js +17 -5
- package/session_pool/session.js.map +1 -1
- package/session_pool/session_pool.d.ts +15 -10
- package/session_pool/session_pool.d.ts.map +1 -1
- package/session_pool/session_pool.js +23 -13
- package/session_pool/session_pool.js.map +1 -1
- package/storages/dataset.d.ts +9 -2
- package/storages/dataset.d.ts.map +1 -1
- package/storages/dataset.js +16 -6
- package/storages/dataset.js.map +1 -1
- package/storages/index.d.ts +2 -0
- package/storages/index.d.ts.map +1 -1
- package/storages/index.js +2 -0
- package/storages/index.js.map +1 -1
- package/storages/key_value_store.d.ts +13 -1
- package/storages/key_value_store.d.ts.map +1 -1
- package/storages/key_value_store.js +17 -12
- package/storages/key_value_store.js.map +1 -1
- package/storages/request_list.d.ts +9 -9
- package/storages/request_list.d.ts.map +1 -1
- package/storages/request_list.js +11 -8
- package/storages/request_list.js.map +1 -1
- package/storages/request_list_adapter.d.ts +58 -0
- package/storages/request_list_adapter.d.ts.map +1 -0
- package/storages/request_list_adapter.js +81 -0
- package/storages/request_list_adapter.js.map +1 -0
- package/storages/request_manager_tandem.d.ts +68 -0
- package/storages/request_manager_tandem.d.ts.map +1 -0
- package/storages/request_manager_tandem.js +124 -0
- package/storages/request_manager_tandem.js.map +1 -0
- package/storages/request_provider.d.ts +76 -9
- package/storages/request_provider.d.ts.map +1 -1
- package/storages/request_provider.js +92 -54
- package/storages/request_provider.js.map +1 -1
- package/storages/request_queue.d.ts +1 -1
- package/storages/request_queue.d.ts.map +1 -1
- package/storages/request_queue.js +2 -2
- package/storages/request_queue.js.map +1 -1
- package/storages/request_queue_v2.d.ts +3 -3
- package/storages/request_queue_v2.d.ts.map +1 -1
- package/storages/request_queue_v2.js +4 -5
- package/storages/request_queue_v2.js.map +1 -1
- package/storages/sitemap_request_list.d.ts +5 -5
- package/storages/sitemap_request_list.d.ts.map +1 -1
- package/storages/sitemap_request_list.js +8 -7
- package/storages/sitemap_request_list.js.map +1 -1
- package/storages/storage_manager.d.ts +10 -8
- package/storages/storage_manager.d.ts.map +1 -1
- package/storages/storage_manager.js +12 -22
- package/storages/storage_manager.js.map +1 -1
- package/storages/utils.d.ts.map +1 -1
- package/storages/utils.js +4 -3
- package/storages/utils.js.map +1 -1
- package/typedefs.d.ts +1 -1
- package/typedefs.d.ts.map +1 -1
- package/crawlers/crawler_extension.d.ts +0 -12
- package/crawlers/crawler_extension.d.ts.map +0 -1
- package/crawlers/crawler_extension.js +0 -14
- package/crawlers/crawler_extension.js.map +0 -1
- package/http_clients/base-http-client.d.ts +0 -134
- package/http_clients/base-http-client.d.ts.map +0 -1
- package/http_clients/base-http-client.js +0 -33
- package/http_clients/base-http-client.js.map +0 -1
- package/http_clients/form-data-like.d.ts +0 -67
- package/http_clients/form-data-like.d.ts.map +0 -1
- package/http_clients/form-data-like.js +0 -5
- package/http_clients/form-data-like.js.map +0 -1
- package/http_clients/got-scraping-http-client.d.ts +0 -15
- package/http_clients/got-scraping-http-client.d.ts.map +0 -1
- package/http_clients/got-scraping-http-client.js +0 -69
- package/http_clients/got-scraping-http-client.js.map +0 -1
- package/http_clients/index.d.ts +0 -3
- package/http_clients/index.d.ts.map +0 -1
- package/http_clients/index.js +0 -3
- package/http_clients/index.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
package/configuration.js
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
2
1
|
import { EventEmitter } from 'node:events';
|
|
3
2
|
import { readFileSync } from 'node:fs';
|
|
4
3
|
import { join } from 'node:path';
|
|
5
|
-
import { MemoryStorage } from '@crawlee/memory-storage';
|
|
6
4
|
import { pathExistsSync } from 'fs-extra/esm';
|
|
7
|
-
import log,
|
|
8
|
-
import {
|
|
5
|
+
import { log, LogLevel } from './log.js';
|
|
6
|
+
import { serviceLocator } from './service_locator.js';
|
|
9
7
|
import { entries } from './typedefs.js';
|
|
10
8
|
/**
|
|
11
9
|
* `Configuration` is a value object holding Crawlee configuration. By default, there is a
|
|
@@ -35,7 +33,7 @@ import { entries } from './typedefs.js';
|
|
|
35
33
|
* // Create a new configuration
|
|
36
34
|
* const config = new Configuration({ persistStateIntervalMillis: 30_000 });
|
|
37
35
|
* // Pass the configuration to the crawler
|
|
38
|
-
* const crawler = new BasicCrawler({ ... }
|
|
36
|
+
* const crawler = new BasicCrawler({ configuration: config, ... });
|
|
39
37
|
* ```
|
|
40
38
|
*
|
|
41
39
|
* The configuration provided via environment variables always takes precedence. We can also
|
|
@@ -70,7 +68,6 @@ import { entries } from './typedefs.js';
|
|
|
70
68
|
* `defaultBrowserPath` | `CRAWLEE_DEFAULT_BROWSER_PATH` | -
|
|
71
69
|
* `disableBrowserSandbox` | `CRAWLEE_DISABLE_BROWSER_SANDBOX` | -
|
|
72
70
|
* `availableMemoryRatio` | `CRAWLEE_AVAILABLE_MEMORY_RATIO` | `0.25`
|
|
73
|
-
* `systemInfoV2` | `CRAWLEE_SYSTEM_INFO_V2` | false
|
|
74
71
|
* `containerized | `CRAWLEE_CONTAINERIZED | -
|
|
75
72
|
*/
|
|
76
73
|
export class Configuration {
|
|
@@ -93,7 +90,6 @@ export class Configuration {
|
|
|
93
90
|
CRAWLEE_DISABLE_BROWSER_SANDBOX: 'disableBrowserSandbox',
|
|
94
91
|
CRAWLEE_LOG_LEVEL: 'logLevel',
|
|
95
92
|
CRAWLEE_PERSIST_STORAGE: 'persistStorage',
|
|
96
|
-
CRAWLEE_SYSTEM_INFO_V2: 'systemInfoV2',
|
|
97
93
|
CRAWLEE_CONTAINERIZED: 'containerized',
|
|
98
94
|
};
|
|
99
95
|
static BOOLEAN_VARS = [
|
|
@@ -102,7 +98,6 @@ export class Configuration {
|
|
|
102
98
|
'xvfb',
|
|
103
99
|
'disableBrowserSandbox',
|
|
104
100
|
'persistStorage',
|
|
105
|
-
'systemInfoV2',
|
|
106
101
|
'containerized',
|
|
107
102
|
];
|
|
108
103
|
static INTEGER_VARS = ['memoryMbytes', 'persistStateIntervalMillis', 'systemInfoIntervalMillis'];
|
|
@@ -120,18 +115,8 @@ export class Configuration {
|
|
|
120
115
|
persistStateIntervalMillis: 60_000,
|
|
121
116
|
systemInfoIntervalMillis: 1_000,
|
|
122
117
|
persistStorage: true,
|
|
123
|
-
systemInfoV2: false,
|
|
124
118
|
};
|
|
125
|
-
/**
|
|
126
|
-
* Provides access to the current-instance-scoped Configuration without passing it around in parameters.
|
|
127
|
-
* @internal
|
|
128
|
-
*/
|
|
129
|
-
static storage = new AsyncLocalStorage();
|
|
130
119
|
options;
|
|
131
|
-
services = new Map();
|
|
132
|
-
/** @internal */
|
|
133
|
-
static globalConfig;
|
|
134
|
-
storageManagers = new Map();
|
|
135
120
|
/**
|
|
136
121
|
* Creates new `Configuration` instance with provided options. Env vars will have precedence over those.
|
|
137
122
|
*/
|
|
@@ -198,94 +183,13 @@ export class Configuration {
|
|
|
198
183
|
set(key, value) {
|
|
199
184
|
this.options.set(key, value);
|
|
200
185
|
}
|
|
201
|
-
/**
|
|
202
|
-
* Sets value for given option. Only affects the global `Configuration` instance, the value will not be propagated down to the env var.
|
|
203
|
-
* To reset a value, we can omit the `value` argument or pass `undefined` there.
|
|
204
|
-
*/
|
|
205
|
-
static set(key, value) {
|
|
206
|
-
this.getGlobalConfig().set(key, value);
|
|
207
|
-
}
|
|
208
|
-
/**
|
|
209
|
-
* Returns cached instance of {@link StorageClient} using options as defined in the environment variables or in
|
|
210
|
-
* this {@link Configuration} instance. Only first call of this method will create the client, following calls will
|
|
211
|
-
* return the same client instance.
|
|
212
|
-
*
|
|
213
|
-
* Caching works based on the `storageClientOptions`, so calling this method with different options will return
|
|
214
|
-
* multiple instances, one for each variant of the options.
|
|
215
|
-
* @internal
|
|
216
|
-
*/
|
|
217
|
-
getStorageClient() {
|
|
218
|
-
if (this.options.has('storageClient')) {
|
|
219
|
-
return this.options.get('storageClient');
|
|
220
|
-
}
|
|
221
|
-
const options = this.options.get('storageClientOptions');
|
|
222
|
-
return this.createMemoryStorage(options);
|
|
223
|
-
}
|
|
224
|
-
getEventManager() {
|
|
225
|
-
if (this.options.has('eventManager')) {
|
|
226
|
-
return this.options.get('eventManager');
|
|
227
|
-
}
|
|
228
|
-
if (this.services.has('eventManager')) {
|
|
229
|
-
return this.services.get('eventManager');
|
|
230
|
-
}
|
|
231
|
-
const eventManager = new LocalEventManager(this);
|
|
232
|
-
this.services.set('eventManager', eventManager);
|
|
233
|
-
return eventManager;
|
|
234
|
-
}
|
|
235
|
-
/**
|
|
236
|
-
* Creates an instance of MemoryStorage using options as defined in the environment variables or in this `Configuration` instance.
|
|
237
|
-
* @internal
|
|
238
|
-
*/
|
|
239
|
-
createMemoryStorage(options = {}) {
|
|
240
|
-
const cacheKey = `MemoryStorage-${JSON.stringify(options)}`;
|
|
241
|
-
if (this.services.has(cacheKey)) {
|
|
242
|
-
return this.services.get(cacheKey);
|
|
243
|
-
}
|
|
244
|
-
const storage = new MemoryStorage({
|
|
245
|
-
persistStorage: this.get('persistStorage'),
|
|
246
|
-
// Override persistStorage if user provides it via storageClientOptions
|
|
247
|
-
...options,
|
|
248
|
-
});
|
|
249
|
-
this.services.set(cacheKey, storage);
|
|
250
|
-
return storage;
|
|
251
|
-
}
|
|
252
|
-
useStorageClient(client) {
|
|
253
|
-
this.options.set('storageClient', client);
|
|
254
|
-
}
|
|
255
|
-
static useStorageClient(client) {
|
|
256
|
-
this.getGlobalConfig().useStorageClient(client);
|
|
257
|
-
}
|
|
258
|
-
useEventManager(events) {
|
|
259
|
-
this.options.set('eventManager', events);
|
|
260
|
-
}
|
|
261
186
|
/**
|
|
262
187
|
* Returns the global configuration instance. It will respect the environment variables.
|
|
188
|
+
*
|
|
189
|
+
* Delegates to the global ServiceLocator, making it the single source of truth for service management.
|
|
263
190
|
*/
|
|
264
191
|
static getGlobalConfig() {
|
|
265
|
-
|
|
266
|
-
return Configuration.storage.getStore();
|
|
267
|
-
}
|
|
268
|
-
Configuration.globalConfig ??= new Configuration();
|
|
269
|
-
return Configuration.globalConfig;
|
|
270
|
-
}
|
|
271
|
-
/**
|
|
272
|
-
* Gets default {@link StorageClient} instance.
|
|
273
|
-
*/
|
|
274
|
-
static getStorageClient() {
|
|
275
|
-
return this.getGlobalConfig().getStorageClient();
|
|
276
|
-
}
|
|
277
|
-
/**
|
|
278
|
-
* Gets default {@link EventManager} instance.
|
|
279
|
-
*/
|
|
280
|
-
static getEventManager() {
|
|
281
|
-
return this.getGlobalConfig().getEventManager();
|
|
282
|
-
}
|
|
283
|
-
/**
|
|
284
|
-
* Resets global configuration instance. The default instance holds configuration based on env vars,
|
|
285
|
-
* if we want to change them, we need to first reset the global state. Used mainly for testing purposes.
|
|
286
|
-
*/
|
|
287
|
-
static resetGlobalState() {
|
|
288
|
-
delete this.globalConfig;
|
|
192
|
+
return serviceLocator.getConfiguration();
|
|
289
193
|
}
|
|
290
194
|
buildOptions(options) {
|
|
291
195
|
// try to load configuration from crawlee.json as the baseline
|
package/configuration.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"configuration.js","sourceRoot":"","sources":["../src/configuration.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"configuration.js","sourceRoot":"","sources":["../src/configuration.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAGjC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,OAAO,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AA8IxC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgEG;AACH,MAAM,OAAO,aAAa;IACtB;;OAEG;IACO,MAAM,CAAC,OAAO,GAAe;QACnC,8BAA8B,EAAE,sBAAsB;QACtD,sBAAsB,EAAE,cAAc;QACtC,qBAAqB,EAAE,cAAc;QACrC,0BAA0B,EAAE,kBAAkB;QAC9C,kCAAkC,EAAE,wBAAwB;QAC5D,gCAAgC,EAAE,uBAAuB;QACzD,iBAAiB,EAAE,UAAU;QAC7B,qCAAqC,EAAE,4BAA4B;QACnE,gBAAgB,EAAE,UAAU;QAC5B,YAAY,EAAE,MAAM;QACpB,8BAA8B,EAAE,sBAAsB;QACtD,4BAA4B,EAAE,oBAAoB;QAClD,+BAA+B,EAAE,uBAAuB;QACxD,iBAAiB,EAAE,UAAU;QAC7B,uBAAuB,EAAE,gBAAgB;QACzC,qBAAqB,EAAE,eAAe;KACzC,CAAC;IAEQ,MAAM,CAAC,YAAY,GAAG;QAC5B,cAAc;QACd,UAAU;QACV,MAAM;QACN,uBAAuB;QACvB,gBAAgB;QAChB,eAAe;KAClB,CAAC;IAEQ,MAAM,CAAC,YAAY,GAAG,CAAC,cAAc,EAAE,4BAA4B,EAAE,0BAA0B,CAAC,CAAC;IAEjG,MAAM,CAAC,yBAAyB,GAAa,EAAE,CAAC;IAEhD,MAAM,CAAC,QAAQ,GAAe;QACpC,sBAAsB,EAAE,SAAS;QACjC,gBAAgB,EAAE,SAAS;QAC3B,qBAAqB,EAAE,SAAS;QAChC,QAAQ,EAAE,OAAO;QACjB,eAAe,EAAE,IAAI;QACrB,oBAAoB,EAAE,IAAI;QAC1B,oBAAoB,EAAE,EAAE;QACxB,YAAY,EAAE,IAAI;QAClB,QAAQ,EAAE,IAAI;QACd,0BAA0B,EAAE,MAAM;QAClC,wBAAwB,EAAE,KAAK;QAC/B,cAAc,EAAE,IAAI;KACvB,CAAC;IAEQ,OAAO,CAAqF;IAEtG;;OAEG;IACH,YAAY,UAAgC,EAAE;QAC1C,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;QAE3B,oEAAoE;QACpE,YAAY,CAAC,mBAAmB,GAAG,EAAE,CAAC;QAEtC,6DAA6D;QAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAEtC,IAAI,QAAQ,EAAE,CAAC;YACX,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,QAAQ,CAAC;gBACpC,CAAC,CAAC,CAAC,QAAQ;gBACX,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAyB,CAAC,CAAC;YACtE,GAAG,CAAC,QAAQ,CAAC,KAAiB,CAAC,CAAC;QACpC,CAAC;IACL,CAAC;IAED;;;;OAIG;IACH,GAAG,CAA0E,GAAM,EAAE,YAAgB;QACjG,iHAAiH;QACjH,IAAI,QAA4B,CAAC;QAEjC,KAAK,MA
AM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,OAAO,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,CAAC;YAClD,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;gBACZ,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,CAAW,CAAC,CAAC;gBAEpC,IAAI,QAAQ,EAAE,CAAC;oBACX,MAAM;gBACV,CAAC;YACL,CAAC;QACL,CAAC;QAED,IAAI,QAAQ,IAAI,IAAI,EAAE,CAAC;YACnB,OAAO,IAAI,CAAC,aAAa,CAAC,GAAG,EAAE,QAAQ,CAAM,CAAC;QAClD,CAAC;QAED,+BAA+B;QAC/B,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAM,CAAC;QACtC,CAAC;QAED,uBAAuB;QACvB,OAAO,CAAC,YAAY,IAAI,aAAa,CAAC,QAAQ,CAAC,GAA0C,CAAC,IAAI,QAAQ,CAAM,CAAC;IACjH,CAAC;IAES,aAAa,CAAC,GAA+B,EAAE,KAAgC;QACrF,IAAI,aAAa,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC3C,OAAO,CAAC,KAAK,CAAC;QAClB,CAAC;QAED,IAAI,aAAa,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC3C,wDAAwD;YACxD,OAAO,CAAC,CAAC,GAAG,EAAE,OAAO,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;QACrE,CAAC;QAED,IAAI,aAAa,CAAC,yBAAyB,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YACxD,IAAI,CAAC,KAAK;gBAAE,OAAO,EAAE,CAAC;YACtB,OAAO,MAAM,CAAC,KAAK,CAAC;iBACf,KAAK,CAAC,GAAG,CAAC;iBACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC9B,CAAC;QAED,OAAO,KAAK,CAAC;IACjB,CAAC;IAED;;;OAGG;IACH,GAAG,CAAC,GAA+B,EAAE,KAAW;QAC5C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IACjC,CAAC;IAED;;;;OAIG;IACH,MAAM,CAAC,eAAe;QAClB,OAAO,cAAc,CAAC,gBAAgB,EAAE,CAAC;IAC7C,CAAC;IAES,YAAY,CAAC,OAA6B;QAChD,8DAA8D;QAC9D,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,cAAc,CAAC,CAAC;QAEjD,IAAI,cAAc,CAAC,IAAI,CAAC,EAAE,CAAC;YACvB,IAAI,CAAC;gBACD,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;gBAChC,MAAM,qBAAqB,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAC1D,MAAM,CAAC,MAAM,CAAC,OAAO,EAAE,qBAAqB,CAAC,CAAC;YAClD,CAAC;YAAC,MAAM,CAAC;gBACL,SAAS;YACb,CAAC;QACL,CAAC;QAED,IAAI,CAAC,OAAO,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;IAC7C,CAAC"}
|
package/cookie_utils.d.ts
CHANGED
|
@@ -7,7 +7,7 @@ export interface ResponseLike {
|
|
|
7
7
|
/**
|
|
8
8
|
* @internal
|
|
9
9
|
*/
|
|
10
|
-
export declare function getCookiesFromResponse(response:
|
|
10
|
+
export declare function getCookiesFromResponse(response: Response): Cookie[];
|
|
11
11
|
/**
|
|
12
12
|
* Calculate cookie expiration date
|
|
13
13
|
* @param maxAgeSecs
|
package/cookie_utils.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cookie_utils.d.ts","sourceRoot":"","sources":["../src/cookie_utils.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC7D,OAAO,EAAE,MAAM,EAAa,MAAM,cAAc,CAAC;AAKjD,MAAM,WAAW,YAAY;IACzB,GAAG,CAAC,EAAE,MAAM,GAAG,CAAC,MAAM,MAAM,CAAC,CAAC;IAC9B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,SAAS,CAAC,CAAC,CAAC;CACnH;AAED;;GAEG;AACH,wBAAgB,sBAAsB,CAAC,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"cookie_utils.d.ts","sourceRoot":"","sources":["../src/cookie_utils.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC7D,OAAO,EAAE,MAAM,EAAa,MAAM,cAAc,CAAC;AAKjD,MAAM,WAAW,YAAY;IACzB,GAAG,CAAC,EAAE,MAAM,GAAG,CAAC,MAAM,MAAM,CAAC,CAAC;IAC9B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,SAAS,CAAC,CAAC,CAAC;CACnH;AAED;;GAEG;AACH,wBAAgB,sBAAsB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,EAAE,CASnE;AAED;;;;;GAKG;AACH,wBAAgB,8BAA8B,CAAC,UAAU,EAAE,MAAM,QAEhE;AAED;;;;;GAKG;AACH,wBAAgB,8BAA8B,CAAC,WAAW,EAAE,MAAM,GAAG,YAAY,CAehF;AAED;;;;GAIG;AACH,wBAAgB,8BAA8B,CAAC,YAAY,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,UAiB5F;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,CAAC,YAAY,EAAE,MAAM,uBAQ7D;AAED;;;;GAIG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,aAAa,EAAE,MAAM,EAAE,GAAG,MAAM,CAgCzE"}
|
package/cookie_utils.js
CHANGED
|
@@ -1,19 +1,17 @@
|
|
|
1
1
|
import { Cookie, CookieJar } from 'tough-cookie';
|
|
2
|
-
import {
|
|
2
|
+
import { serviceLocator } from './service_locator.js';
|
|
3
3
|
import { CookieParseError } from './session_pool/errors.js';
|
|
4
4
|
/**
|
|
5
5
|
* @internal
|
|
6
6
|
*/
|
|
7
7
|
export function getCookiesFromResponse(response) {
|
|
8
|
-
const headers =
|
|
9
|
-
const
|
|
8
|
+
const headers = response.headers;
|
|
9
|
+
const cookieHeaders = headers.getSetCookie();
|
|
10
10
|
try {
|
|
11
|
-
return
|
|
12
|
-
? cookieHeader.map((cookie) => Cookie.parse(cookie))
|
|
13
|
-
: [Cookie.parse(cookieHeader)];
|
|
11
|
+
return cookieHeaders.map((cookie) => Cookie.parse(cookie));
|
|
14
12
|
}
|
|
15
13
|
catch (e) {
|
|
16
|
-
throw new CookieParseError(
|
|
14
|
+
throw new CookieParseError(cookieHeaders);
|
|
17
15
|
}
|
|
18
16
|
}
|
|
19
17
|
/**
|
|
@@ -103,7 +101,9 @@ export function mergeCookies(url, sourceCookies) {
|
|
|
103
101
|
return cookie.key !== c.key && cookie.key.toLowerCase() === c.key.toLowerCase();
|
|
104
102
|
});
|
|
105
103
|
if (similarKeyCookie) {
|
|
106
|
-
|
|
104
|
+
serviceLocator
|
|
105
|
+
.getLogger()
|
|
106
|
+
.warningOnce(`Found cookies with similar name during cookie merging: '${cookie.key}' and '${similarKeyCookie.key}'`);
|
|
107
107
|
}
|
|
108
108
|
jar.setCookieSync(cookie, url);
|
|
109
109
|
}
|
package/cookie_utils.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cookie_utils.js","sourceRoot":"","sources":["../src/cookie_utils.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"cookie_utils.js","sourceRoot":"","sources":["../src/cookie_utils.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,0BAA0B,CAAC;AAO5D;;GAEG;AACH,MAAM,UAAU,sBAAsB,CAAC,QAAkB;IACrD,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC;IACjC,MAAM,aAAa,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAE7C,IAAI,CAAC;QACD,OAAO,aAAa,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAE,CAAC,CAAC;IAChE,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACT,MAAM,IAAI,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAC9C,CAAC;AACL,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,8BAA8B,CAAC,UAAkB;IAC7D,OAAO,IAAI,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,UAAU,GAAG,IAAI,CAAC,CAAC;AACpD,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,8BAA8B,CAAC,WAAmB;IAC9D,OAAO;QACH,IAAI,EAAE,WAAW,CAAC,GAAG;QACrB,KAAK,EAAE,WAAW,CAAC,KAAK;QACxB,iFAAiF;QACjF,6FAA6F;QAC7F,OAAO,EACH,WAAW,CAAC,OAAO,IAAI,IAAI,IAAI,WAAW,CAAC,OAAO,KAAK,UAAU;YAC7D,CAAC,CAAC,SAAS;YACX,CAAC,CAAC,WAAW,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,IAAI;QAC9C,MAAM,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,SAAS;QAClG,IAAI,EAAE,WAAW,CAAC,IAAI,IAAI,SAAS;QACnC,MAAM,EAAE,WAAW,CAAC,MAAM;QAC1B,QAAQ,EAAE,WAAW,CAAC,QAAQ;KACjC,CAAC;AACN,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,8BAA8B,CAAC,YAA0B,EAAE,UAAkB;IACzF,MAAM,cAAc,GAAG,YAAY,CAAC,OAAO,IAAI,OAAO,YAAY,CAAC,OAAO,KAAK,QAAQ,IAAI,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC;IACpH,MAAM,OAAO,GAAG,cAAc;QAC1B,CAAC,CAAC,IAAI,IAAI,CAAC,YAAY,CAAC,OAAQ,GAAG,IAAI,CAAC;QACxC,CAAC,CAAC,8BAA8B,CAAC,UAAU,CAAC,CAAC;IACjD,MAAM,mBAAmB,GAAG,YAAY,CAAC,MAAM,EAAE,UAAU,EAAE,CAAC,GAAG,CAAC,CAAC;IACnE,MAAM,MAAM,GAAG,mBAAmB,CAAC,CAAC,CAAC,YAAY,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC;IAC3F,OAAO,IAAI,MAAM,CAAC;QACd,GAAG,EAAE,YAAY,CAAC,IAAI;QACtB,KAAK,EAAE,YAAY,CAAC,KAAK;QACzB,OAAO;QACP,MAAM;QACN,IAAI,EAAE,YAAY,CAAC,IAAI;QACvB,MAAM,EAAE,YAAY,CAAC,MAAM;QAC3B,QAAQ,EAAE,YAAY,CAAC,QAAQ;QAC/B,QAAQ,EAAE,
CAAC,mBAAmB;KACjC,CAAC,CAAC;AACP,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,yBAAyB,CAAC,YAAoB;IAC1D,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;IAE1C,IAAI,MAAM,EAAE,CAAC;QACT,OAAO,8BAA8B,CAAC,MAAM,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,IAAI,CAAC;AAChB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,YAAY,CAAC,GAAW,EAAE,aAAuB;IAC7D,MAAM,GAAG,GAAG,IAAI,SAAS,EAAE,CAAC;IAE5B,uBAAuB;IACvB,KAAK,MAAM,kBAAkB,IAAI,aAAa,EAAE,CAAC;QAC7C,uBAAuB;QACvB,IAAI,CAAC,kBAAkB;YAAE,SAAS;QAElC,MAAM,OAAO,GAAG,kBAAkB,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAElD,KAAK,MAAM,YAAY,IAAI,OAAO,EAAE,CAAC;YACjC,sBAAsB;YACtB,IAAI,CAAC,YAAY;gBAAE,SAAS;YAE5B,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,YAAY,CAAE,CAAC;YAC3C,MAAM,gBAAgB,GAAG,GAAG,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE;gBACxD,OAAO,MAAM,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI,MAAM,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC;YACpF,CAAC,CAAC,CAAC;YAEH,IAAI,gBAAgB,EAAE,CAAC;gBACnB,cAAc;qBACT,SAAS,EAAE;qBACX,WAAW,CACR,2DAA2D,MAAM,CAAC,GAAG,UAAU,gBAAgB,CAAC,GAAG,GAAG,CACzG,CAAC;YACV,CAAC;YAED,GAAG,CAAC,aAAa,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QACnC,CAAC;IACL,CAAC;IAED,OAAO,GAAG,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC;AACxC,CAAC"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import type { Awaitable } from '@crawlee/types';
|
|
2
|
+
/**
|
|
3
|
+
* Represents a middleware step in the context pipeline.
|
|
4
|
+
*
|
|
5
|
+
* @template TCrawlingContext - The input context type for this middleware
|
|
6
|
+
* @template TCrawlingContextExtension - The enhanced output context type
|
|
7
|
+
*/
|
|
8
|
+
export interface ContextMiddleware<TCrawlingContext, TCrawlingContextExtension> {
|
|
9
|
+
/** The main middleware function that enhances the context */
|
|
10
|
+
action: (context: TCrawlingContext) => Awaitable<TCrawlingContextExtension>;
|
|
11
|
+
/** Optional cleanup function called after the consumer finishes or fails */
|
|
12
|
+
cleanup?: (context: TCrawlingContext & TCrawlingContextExtension, error?: unknown) => Awaitable<void>;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.
|
|
16
|
+
*
|
|
17
|
+
* The enhancement is done by a chain of middlewares that are added to the pipeline after its creation.
|
|
18
|
+
* This class provides a type-safe way to build a pipeline of context transformations where each step
|
|
19
|
+
* can enhance the context with additional properties or utilities.
|
|
20
|
+
*
|
|
21
|
+
* @template TContextBase - The base context type that serves as the starting point
|
|
22
|
+
* @template TCrawlingContext - The final context type after all middleware transformations
|
|
23
|
+
*/
|
|
24
|
+
export declare abstract class ContextPipeline<TContextBase, TCrawlingContext extends TContextBase> {
|
|
25
|
+
/**
|
|
26
|
+
* Creates a new empty context pipeline.
|
|
27
|
+
*
|
|
28
|
+
* @template TContextBase - The base context type for the pipeline
|
|
29
|
+
* @returns A new ContextPipeline instance with no transformations
|
|
30
|
+
*/
|
|
31
|
+
static create<TContextBase>(): ContextPipeline<TContextBase, TContextBase>;
|
|
32
|
+
/**
|
|
33
|
+
* Adds a middleware to the pipeline, creating a new pipeline instance.
|
|
34
|
+
*
|
|
35
|
+
* This method provides a fluent interface for building context transformation pipelines.
|
|
36
|
+
* Each middleware can enhance the context with additional properties or utilities.
|
|
37
|
+
*
|
|
38
|
+
* @template TCrawlingContextExtension - The enhanced context type produced by this middleware
|
|
39
|
+
* @param middleware - The middleware to add to the pipeline
|
|
40
|
+
* @returns A new ContextPipeline instance with the added middleware
|
|
41
|
+
*/
|
|
42
|
+
abstract compose<TCrawlingContextExtension>(middleware: ContextMiddleware<TCrawlingContext, TCrawlingContextExtension>): ContextPipeline<TContextBase, TCrawlingContext & TCrawlingContextExtension>;
|
|
43
|
+
/**
|
|
44
|
+
* Executes the middleware pipeline and passes the final context to a consumer function.
|
|
45
|
+
*
|
|
46
|
+
* This method runs the crawling context through the entire middleware chain, enhancing it
|
|
47
|
+
* at each step, and then passes the final enhanced context to the provided consumer function.
|
|
48
|
+
* Proper cleanup is performed even if exceptions occur during processing.
|
|
49
|
+
*
|
|
50
|
+
* @param crawlingContext - The initial context to process through the pipeline
|
|
51
|
+
* @param finalContextConsumer - The function that will receive the final enhanced context
|
|
52
|
+
*
|
|
53
|
+
* @throws {ContextPipelineInitializationError} When a middleware fails during initialization
|
|
54
|
+
* @throws {ContextPipelineInterruptedError} When the pipeline is intentionally interrupted during initialization
|
|
55
|
+
* @throws {RequestHandlerError} When the final context consumer throws an exception
|
|
56
|
+
* @throws {ContextPipelineCleanupError} When cleanup operations fail
|
|
57
|
+
* @throws {SessionError} Session errors are re-thrown as-is for special handling
|
|
58
|
+
*/
|
|
59
|
+
abstract call(crawlingContext: TContextBase, finalContextConsumer: (finalContext: TCrawlingContext) => Awaitable<unknown>): Promise<void>;
|
|
60
|
+
}
|
|
61
|
+
//# sourceMappingURL=context_pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context_pipeline.d.ts","sourceRoot":"","sources":["../../src/crawlers/context_pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAUhD;;;;;GAKG;AACH,MAAM,WAAW,iBAAiB,CAAC,gBAAgB,EAAE,yBAAyB;IAC1E,6DAA6D;IAC7D,MAAM,EAAE,CAAC,OAAO,EAAE,gBAAgB,KAAK,SAAS,CAAC,yBAAyB,CAAC,CAAC;IAC5E,4EAA4E;IAC5E,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,gBAAgB,GAAG,yBAAyB,EAAE,KAAK,CAAC,EAAE,OAAO,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;CACzG;AAED;;;;;;;;;GASG;AACH,8BAAsB,eAAe,CAAC,YAAY,EAAE,gBAAgB,SAAS,YAAY;IACrF;;;;;OAKG;IACH,MAAM,CAAC,MAAM,CAAC,YAAY,KAAK,eAAe,CAAC,YAAY,EAAE,YAAY,CAAC;IAI1E;;;;;;;;;OASG;IACH,QAAQ,CAAC,OAAO,CAAC,yBAAyB,EACtC,UAAU,EAAE,iBAAiB,CAAC,gBAAgB,EAAE,yBAAyB,CAAC,GAC3E,eAAe,CAAC,YAAY,EAAE,gBAAgB,GAAG,yBAAyB,CAAC;IAE9E;;;;;;;;;;;;;;;OAeG;IACH,QAAQ,CAAC,IAAI,CACT,eAAe,EAAE,YAAY,EAC7B,oBAAoB,EAAE,CAAC,YAAY,EAAE,gBAAgB,KAAK,SAAS,CAAC,OAAO,CAAC,GAC7E,OAAO,CAAC,IAAI,CAAC;CACnB"}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, RequestHandlerError, SessionError, } from '../errors.js';
|
|
2
|
+
/**
 * Base class and public entry point for a pipeline that gradually enriches a crawling context.
 *
 * A pipeline is built as a chain of middlewares added after creation; each step may contribute
 * additional properties or utilities to the context, which keeps the transformations type-safe.
 *
 * @template TContextBase - The base context type that serves as the starting point
 * @template TCrawlingContext - The final context type after all middleware transformations
 */
export class ContextPipeline {
    /**
     * Builds the starting pipeline, which contains a single pass-through step.
     *
     * That step's `action` resolves to the very context it receives and it declares no `cleanup`.
     *
     * @template TContextBase - The base context type for the pipeline
     * @returns A new ContextPipeline instance with no transformations
     */
    static create() {
        const passThroughMiddleware = { action: async (context) => context };
        return new ContextPipelineImpl(passThroughMiddleware);
    }
}
|
|
23
|
+
/**
 * Implementation of the `ContextPipeline` logic. This hides implementation details such as the `middleware` and `parent`
 * properties from the `ContextPipeline` interface, making type checking more reliable.
 *
 * Every instance stores exactly one middleware plus a link to the pipeline it was composed from,
 * so a whole pipeline is a singly linked list that is walked from the newest step back to the oldest.
 */
class ContextPipelineImpl extends ContextPipeline {
    middleware;
    parent;
    /**
     * @param middleware The step held by this pipeline node (an object with an `action` and an optional `cleanup`).
     * @param parent The pipeline this node extends; `undefined` for the first node of a chain.
     */
    constructor(middleware, parent) {
        super();
        this.middleware = middleware;
        this.parent = parent;
    }
    /**
     * Returns a new pipeline node that holds `middleware` and points back at this pipeline.
     * The current pipeline is not modified.
     *
     * @inheritdoc
     */
    compose(middleware) {
        return new ContextPipelineImpl(middleware, this);
    }
    /**
     * Yields the middlewares of the chain starting with this node and following `parent` links,
     * i.e. from the most recently composed step to the oldest one.
     */
    *middlewareChain() {
        let step = this;
        while (step !== undefined) {
            yield step.middleware;
            step = step.parent;
        }
    }
    /**
     * Runs every middleware `action` from the oldest step to the newest, merging the own property
     * descriptors of each returned extension into `crawlingContext`, then awaits `finalContextConsumer`
     * with the enriched context. Finally, the `cleanup` hooks of the steps whose `action` completed
     * are awaited in reverse order and receive the context and the consumer's exception, if any.
     *
     * Error mapping:
     * - `SessionError` (from an action or the consumer) and `ContextPipelineInterruptedError`
     *   (from an action) are re-thrown unchanged;
     * - any other action failure is wrapped in `ContextPipelineInitializationError`;
     * - any other consumer failure is wrapped in `RequestHandlerError`;
     * - a failing cleanup hook results in `ContextPipelineCleanupError`, which replaces any error
     *   that was already propagating because it is thrown from the `finally` block.
     *
     * Every registered cleanup hook is attempted even when an earlier one throws; the first
     * cleanup failure is the one that gets reported.
     *
     * @inheritdoc
     */
    async call(crawlingContext, finalContextConsumer) {
        // The generator yields newest-first; reverse so that the oldest step runs first.
        const middlewares = Array.from(this.middlewareChain()).reverse();
        const cleanupStack = [];
        let consumerException;
        try {
            for (const { action, cleanup } of middlewares) {
                try {
                    const contextExtension = await action(crawlingContext);
                    // Copy descriptors (not values) so getters/setters on the extension survive the merge.
                    Object.defineProperties(crawlingContext, Object.getOwnPropertyDescriptors(contextExtension));
                    if (cleanup) {
                        cleanupStack.push(cleanup);
                    }
                }
                catch (exception) {
                    if (exception instanceof SessionError) {
                        throw exception; // Session errors are re-thrown as-is
                    }
                    if (exception instanceof ContextPipelineInterruptedError) {
                        throw exception;
                    }
                    throw new ContextPipelineInitializationError(exception);
                }
            }
            try {
                await finalContextConsumer(crawlingContext);
            }
            catch (exception) {
                // Remember the raw failure so cleanup hooks can inspect it.
                consumerException = exception;
                if (exception instanceof SessionError) {
                    throw exception; // Session errors are re-thrown as-is
                }
                throw new RequestHandlerError(exception);
            }
        }
        finally {
            let cleanupFailed = false;
            let firstCleanupException;
            for (const cleanup of cleanupStack.reverse()) {
                try {
                    await cleanup(crawlingContext, consumerException);
                }
                catch (exception) {
                    // Keep going: the remaining hooks belong to earlier steps and must still
                    // get the chance to release what their actions acquired.
                    if (!cleanupFailed) {
                        cleanupFailed = true;
                        firstCleanupException = exception;
                    }
                }
            }
            if (cleanupFailed) {
                // eslint-disable-next-line no-unsafe-finally
                throw new ContextPipelineCleanupError(firstCleanupException);
            }
        }
    }
}
|
|
99
|
+
//# sourceMappingURL=context_pipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context_pipeline.js","sourceRoot":"","sources":["../../src/crawlers/context_pipeline.ts"],"names":[],"mappings":"AAEA,OAAO,EACH,2BAA2B,EAC3B,kCAAkC,EAClC,+BAA+B,EAC/B,mBAAmB,EACnB,YAAY,GACf,MAAM,cAAc,CAAC;AAetB;;;;;;;;;GASG;AACH,MAAM,OAAgB,eAAe;IACjC;;;;;OAKG;IACH,MAAM,CAAC,MAAM;QACT,OAAO,IAAI,mBAAmB,CAA6B,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,EAAE,CAAC,CAAC;IACvG,CAAC;CAoCJ;AAED;;;GAGG;AACH,MAAM,mBAAyE,SAAQ,eAGtF;IAEe;IACA;IAFZ,YACY,UAA6D,EAC7D,MAAwD;QAEhE,KAAK,EAAE,CAAC;QAHA,eAAU,GAAV,UAAU,CAAmD;QAC7D,WAAM,GAAN,MAAM,CAAkD;IAGpE,CAAC;IAED;;OAEG;IACH,OAAO,CACH,UAA0E;QAE1E,OAAO,IAAI,mBAAmB,CAC1B,UAAiB,EACjB,IAAW,CACd,CAAC;IACN,CAAC;IAEO,CAAC,eAAe;QACpB,IAAI,IAAI,GAAgE,IAAW,CAAC;QAEpF,OAAO,IAAI,KAAK,SAAS,EAAE,CAAC;YACxB,MAAM,IAAI,CAAC,UAAU,CAAC;YACtB,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC;QACvB,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI,CACN,eAA6B,EAC7B,oBAA0E;QAE1E,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC;QACjE,MAAM,YAAY,GAAG,EAAE,CAAC;QACxB,IAAI,iBAAsC,CAAC;QAE3C,IAAI,CAAC;YACD,KAAK,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,WAAW,EAAE,CAAC;gBAC5C,IAAI,CAAC;oBACD,MAAM,gBAAgB,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;oBACvD,MAAM,CAAC,gBAAgB,CAAC,eAAe,EAAE,MAAM,CAAC,yBAAyB,CAAC,gBAAgB,CAAC,CAAC,CAAC;oBAE7F,IAAI,OAAO,EAAE,CAAC;wBACV,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBAC/B,CAAC;gBACL,CAAC;gBAAC,OAAO,SAAkB,EAAE,CAAC;oBAC1B,IAAI,SAAS,YAAY,YAAY,EAAE,CAAC;wBACpC,MAAM,SAAS,CAAC,CAAC,qCAAqC;oBAC1D,CAAC;oBACD,IAAI,SAAS,YAAY,+BAA+B,EAAE,CAAC;wBACvD,MAAM,SAAS,CAAC;oBACpB,CAAC;oBAED,MAAM,IAAI,kCAAkC,CAAC,SAAS,CAAC,CAAC;gBAC5D,CAAC;YACL,CAAC;YAED,IAAI,CAAC;gBACD,MAAM,oBAAoB,CAAC,eAAmC,CAAC,CAAC;YACpE,CAAC;YAAC,OAAO,SAAkB,EAAE,CAAC;gBAC1B,IAAI,SAAS,YAAY,YAAY,EAAE,CAAC;oBACpC,iBAAiB,GAAG,SAAS,CAAC;oBAC9B,MAAM,SAAS,CAAC,CAAC,qCAAqC;gBAC1D,CAAC;gBACD,iBAAiB,GAAG,SAAS,CAAC;gBAC9B,MAAM,IAAI,mBAAmB,CAAC,SAAS,CAAC,CAAC;YAC7C,CAAC;QACL,CAAC;gBAAS,CAAC;YACP,IAAI,CAAC;gBACD,KAAK,MAAM,OAAO,IAAI,YAAY,CAAC,OAAO,EAAE,EAAE,CAAC;oBAC3C,MAAM,OAAO,
CAAC,eAAe,EAAE,iBAAiB,CAAC,CAAC;gBACtD,CAAC;YACL,CAAC;YAAC,OAAO,SAAkB,EAAE,CAAC;gBAC1B,6CAA6C;gBAC7C,MAAM,IAAI,2BAA2B,CAAC,SAAS,CAAC,CAAC;YACrD,CAAC;QACL,CAAC;IACL,CAAC;CACJ"}
|
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
|
|
3
|
-
import type { OptionsInit, Response as GotResponse } from 'got-scraping';
|
|
4
|
-
import type { ReadonlyDeep } from 'type-fest';
|
|
1
|
+
import type { Dictionary, HttpRequestOptions, ProxyInfo, SendRequestOptions } from '@crawlee/types';
|
|
2
|
+
import type { ReadonlyDeep, SetRequired } from 'type-fest';
|
|
5
3
|
import type { Configuration } from '../configuration.js';
|
|
6
4
|
import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links.js';
|
|
7
|
-
import type {
|
|
8
|
-
import type { ProxyInfo } from '../proxy_configuration.js';
|
|
5
|
+
import type { CrawleeLogger } from '../log.js';
|
|
9
6
|
import type { Request, Source } from '../request.js';
|
|
10
7
|
import type { Session } from '../session_pool/session.js';
|
|
11
8
|
import type { Dataset } from '../storages/dataset.js';
|
|
@@ -22,7 +19,7 @@ export type LoadedRequest<R extends Request> = WithRequired<R, 'id' | 'loadedUrl
|
|
|
22
19
|
export type LoadedContext<Context extends RestrictedCrawlingContext> = IsAny<Context> extends true ? Context : {
|
|
23
20
|
request: LoadedRequest<Context['request']>;
|
|
24
21
|
} & Omit<Context, 'request'>;
|
|
25
|
-
export interface RestrictedCrawlingContext<UserData extends Dictionary = Dictionary>
|
|
22
|
+
export interface RestrictedCrawlingContext<UserData extends Dictionary = Dictionary> {
|
|
26
23
|
id: string;
|
|
27
24
|
session?: Session;
|
|
28
25
|
/**
|
|
@@ -66,7 +63,7 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction
|
|
|
66
63
|
*
|
|
67
64
|
* @param [options] All `enqueueLinks()` parameters are passed via an options object.
|
|
68
65
|
*/
|
|
69
|
-
enqueueLinks: (options
|
|
66
|
+
enqueueLinks: (options: ReadonlyDeep<Omit<SetRequired<EnqueueLinksOptions, 'urls'>, 'requestQueue' | 'robotsTxtFile'>>) => Promise<unknown>;
|
|
70
67
|
/**
|
|
71
68
|
* Add requests directly to the request queue.
|
|
72
69
|
*
|
|
@@ -85,10 +82,9 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction
|
|
|
85
82
|
/**
|
|
86
83
|
* A preconfigured logger for the request handler.
|
|
87
84
|
*/
|
|
88
|
-
log:
|
|
85
|
+
log: CrawleeLogger;
|
|
89
86
|
}
|
|
90
|
-
export interface CrawlingContext<
|
|
91
|
-
crawler: Crawler;
|
|
87
|
+
export interface CrawlingContext<UserData extends Dictionary = Dictionary> extends RestrictedCrawlingContext<UserData> {
|
|
92
88
|
/**
|
|
93
89
|
* This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
|
|
94
90
|
* currently used by the crawler.
|
|
@@ -114,14 +110,9 @@ export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary
|
|
|
114
110
|
* @param [options] All `enqueueLinks()` parameters are passed via an options object.
|
|
115
111
|
* @returns Promise that resolves to {@link BatchAddRequestsResult} object.
|
|
116
112
|
*/
|
|
117
|
-
enqueueLinks(options
|
|
113
|
+
enqueueLinks(options: ReadonlyDeep<Omit<SetRequired<EnqueueLinksOptions, 'urls'>, 'requestQueue' | 'robotsTxtFile'>> & Pick<EnqueueLinksOptions, 'requestQueue' | 'robotsTxtFile'>): Promise<unknown>;
|
|
118
114
|
/**
|
|
119
|
-
*
|
|
120
|
-
*/
|
|
121
|
-
getKeyValueStore: (idOrName?: string) => Promise<KeyValueStore>;
|
|
122
|
-
/**
|
|
123
|
-
* Fires HTTP request via [`got-scraping`](https://crawlee.dev/js/docs/guides/got-scraping), allowing to override the request
|
|
124
|
-
* options on the fly.
|
|
115
|
+
* Fires an HTTP request via the internal HTTP client, allowing you to override the request options on the fly.
|
|
125
116
|
*
|
|
126
117
|
* This is handy when you work with a browser crawler but want to execute some requests outside it (e.g. API requests).
|
|
127
118
|
* Check the [Skipping navigations for certain requests](https://crawlee.dev/js/docs/examples/skip-navigation) example for
|
|
@@ -136,7 +127,11 @@ export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary
|
|
|
136
127
|
* },
|
|
137
128
|
* ```
|
|
138
129
|
*/
|
|
139
|
-
sendRequest
|
|
130
|
+
sendRequest: (requestOverrides?: Partial<HttpRequestOptions>, optionsOverrides?: SendRequestOptions) => Promise<Response>;
|
|
131
|
+
/**
|
|
132
|
+
* Register a function to be called at the very end of the request handling process. This is useful for resources that should be accessible to error handlers, for instance.
|
|
133
|
+
*/
|
|
134
|
+
registerDeferredCleanup(cleanup: () => Promise<unknown>): void;
|
|
140
135
|
}
|
|
141
136
|
/**
|
|
142
137
|
* A partial implementation of {@link RestrictedCrawlingContext} that stores parameters of calls to context methods for later inspection.
|
|
@@ -149,7 +144,6 @@ export declare class RequestHandlerResult {
|
|
|
149
144
|
private _keyValueStoreChanges;
|
|
150
145
|
private pushDataCalls;
|
|
151
146
|
private addRequestsCalls;
|
|
152
|
-
private enqueueLinksCalls;
|
|
153
147
|
constructor(config: Configuration, crawleeStateKey: string);
|
|
154
148
|
/**
|
|
155
149
|
* A record of calls to {@link RestrictedCrawlingContext.pushData}, {@link RestrictedCrawlingContext.addRequests}, {@link RestrictedCrawlingContext.enqueueLinks} made by a request handler.
|
|
@@ -157,7 +151,6 @@ export declare class RequestHandlerResult {
|
|
|
157
151
|
get calls(): ReadonlyDeep<{
|
|
158
152
|
pushData: Parameters<RestrictedCrawlingContext['pushData']>[];
|
|
159
153
|
addRequests: Parameters<RestrictedCrawlingContext['addRequests']>[];
|
|
160
|
-
enqueueLinks: Parameters<RestrictedCrawlingContext['enqueueLinks']>[];
|
|
161
154
|
}>;
|
|
162
155
|
/**
|
|
163
156
|
* A record of changes made to key-value stores by a request handler.
|
|
@@ -188,7 +181,6 @@ export declare class RequestHandlerResult {
|
|
|
188
181
|
label?: string;
|
|
189
182
|
}[]>;
|
|
190
183
|
pushData: RestrictedCrawlingContext['pushData'];
|
|
191
|
-
enqueueLinks: RestrictedCrawlingContext['enqueueLinks'];
|
|
192
184
|
addRequests: RestrictedCrawlingContext['addRequests'];
|
|
193
185
|
useState: RestrictedCrawlingContext['useState'];
|
|
194
186
|
getKeyValueStore: RestrictedCrawlingContext['getKeyValueStore'];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"crawler_commons.d.ts","sourceRoot":"","sources":["../../src/crawlers/crawler_commons.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,
|
|
1
|
+
{"version":3,"file":"crawler_commons.d.ts","sourceRoot":"","sources":["../../src/crawlers/crawler_commons.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,kBAAkB,EAAE,SAAS,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAC;AACpG,OAAO,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAE3D,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACzD,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,KAAK,aAAa,EAAE,MAAM,gCAAgC,CAAC;AACnF,OAAO,KAAK,EAAE,4BAA4B,EAAE,MAAM,iCAAiC,CAAC;AAEpF,gBAAgB;AAChB,MAAM,MAAM,KAAK,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,IAAI,GAAG,KAAK,CAAC;AAEtD,gBAAgB;AAChB,MAAM,MAAM,YAAY,CAAC,CAAC,EAAE,CAAC,SAAS,MAAM,CAAC,IAAI,CAAC,GAAG;KAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;CAAE,CAAC;AAE1E,MAAM,MAAM,aAAa,CAAC,CAAC,SAAS,OAAO,IAAI,YAAY,CAAC,CAAC,EAAE,IAAI,GAAG,WAAW,CAAC,CAAC;AAEnF,gBAAgB;AAChB,MAAM,MAAM,aAAa,CAAC,OAAO,SAAS,yBAAyB,IAAI,KAAK,CAAC,OAAO,CAAC,SAAS,IAAI,GAC5F,OAAO,GACP;IACI,OAAO,EAAE,aAAa,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC;CAC9C,GAAG,IAAI,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;AAEnC,MAAM,WAAW,yBAAyB,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU;IAC/E,EAAE,EAAE,MAAM,CAAC;IACX,OAAO,CAAC,EAAE,OAAO,CAAC;IAElB;;;OAGG;IACH,SAAS,CAAC,EAAE,SAAS,CAAC;IAEtB;;OAEG;IACH,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IAE3B;;;;;;OAMG;IACH,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,UAAU,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,eAAe,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAE1G;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACH,YAAY,EAAE,CACV,OAAO,EAAE,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,mBAAmB,EAAE,MAAM,CAAC,EAAE,cAAc,GAAG,eAAe,CAAC,CAAC,KACtG,OAAO,CAAC,OAAO,CAAC,CAAC;IAEtB;;;;;OAKG;IACH,WAAW,EAAE,CACT,YAAY,EAAE,YAAY,CAAC,CAAC,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,EAC/C,OAAO,CAAC,EAAE,YAAY,CAAC,4BAA4B,CAAC,KACnD,OAAO,CAAC,IAAI,CAAC,CAAC;IAEnB;;OAEG;IACH,QAAQ,EAAE,CAAC,KAAK,SAAS,UAAU,GAAG,UAAU,EAAE,YAAY,CAAC,
EAAE,KAAK,KAAK,OAAO,CAAC,KAAK,CAAC,CAAC;IAE1F;;OAEG;IACH,gBAAgB,EAAE,CACd,QAAQ,CAAC,EAAE,MAAM,KAChB,OAAO,CAAC,IAAI,CAAC,aAAa,EAAE,IAAI,GAAG,MAAM,GAAG,UAAU,GAAG,mBAAmB,GAAG,UAAU,GAAG,cAAc,CAAC,CAAC,CAAC;IAElH;;OAEG;IACH,GAAG,EAAE,aAAa,CAAC;CACtB;AAED,MAAM,WAAW,eAAe,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAAE,SAAQ,yBAAyB,CAAC,QAAQ,CAAC;IAClH;;;;;;;;;;;;;;;;;;;;;;;;OAwBG;IACH,YAAY,CACR,OAAO,EAAE,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,mBAAmB,EAAE,MAAM,CAAC,EAAE,cAAc,GAAG,eAAe,CAAC,CAAC,GACnG,IAAI,CAAC,mBAAmB,EAAE,cAAc,GAAG,eAAe,CAAC,GAChE,OAAO,CAAC,OAAO,CAAC,CAAC;IAEpB;;;;;;;;;;;;;;;OAeG;IACH,WAAW,EAAE,CACT,gBAAgB,CAAC,EAAE,OAAO,CAAC,kBAAkB,CAAC,EAC9C,gBAAgB,CAAC,EAAE,kBAAkB,KACpC,OAAO,CAAC,QAAQ,CAAC,CAAC;IAEvB;;OAEG;IACH,uBAAuB,CAAC,OAAO,EAAE,MAAM,OAAO,CAAC,OAAO,CAAC,GAAG,IAAI,CAAC;CAClE;AAED;;;;GAIG;AACH,qBAAa,oBAAoB;IASzB,OAAO,CAAC,MAAM;IACd,OAAO,CAAC,eAAe;IAT3B,OAAO,CAAC,qBAAqB,CACtB;IAEP,OAAO,CAAC,aAAa,CAA2D;IAEhF,OAAO,CAAC,gBAAgB,CAA8D;gBAG1E,MAAM,EAAE,aAAa,EACrB,eAAe,EAAE,MAAM;IAGnC;;OAEG;IACH,IAAI,KAAK,IAAI,YAAY,CAAC;QACtB,QAAQ,EAAE,UAAU,CAAC,yBAAyB,CAAC,UAAU,CAAC,CAAC,EAAE,CAAC;QAC9D,WAAW,EAAE,UAAU,CAAC,yBAAyB,CAAC,aAAa,CAAC,CAAC,EAAE,CAAC;KACvE,CAAC,CAKD;IAED;;OAEG;IACH,IAAI,oBAAoB,IAAI,YAAY,CACpC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE;QAAE,YAAY,EAAE,OAAO,CAAC;QAAC,OAAO,CAAC,EAAE,aAAa,CAAA;KAAE,CAAC,CAAC,CACrF,CAEA;IAED;;OAEG;IACH,IAAI,YAAY,IAAI,YAAY,CAAC;QAAE,IAAI,EAAE,UAAU,CAAC;QAAC,eAAe,CAAC,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC,CAIjF;IAED;;OAEG;IACH,IAAI,YAAY,IAAI,YAAY,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC,CAkBlE;IAED;;OAEG;IACH,IAAI,gBAAgB,IAAI,YAAY,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC,CAgB1E;IAED,QAAQ,EAAE,yBAAyB,CAAC,UAAU,CAAC,CAE7C;IAEF,WAAW,EAAE,yBAAyB,CAAC,aAAa,CAAC,CAEnD;IAEF,QAAQ,EAAE,yBAAyB,CAAC,UAAU,CAAC,CAG7C;IAEF,gBAAgB,EAAE,yBAAyB,CAAC,kBAAkB,CAAC,CAa7D;IAEF,OAAO,CAAC,WAAW,CAAwF;IAE3G,OAAO,CAAC,4BAA4B,CAIlC;IAEF,OAAO,CAAC,4BAA4B,CASlC;CACL"}
|
|
@@ -10,7 +10,6 @@ export class RequestHandlerResult {
|
|
|
10
10
|
_keyValueStoreChanges = {};
|
|
11
11
|
pushDataCalls = [];
|
|
12
12
|
addRequestsCalls = [];
|
|
13
|
-
enqueueLinksCalls = [];
|
|
14
13
|
constructor(config, crawleeStateKey) {
|
|
15
14
|
this.config = config;
|
|
16
15
|
this.crawleeStateKey = crawleeStateKey;
|
|
@@ -22,7 +21,6 @@ export class RequestHandlerResult {
|
|
|
22
21
|
return {
|
|
23
22
|
pushData: this.pushDataCalls,
|
|
24
23
|
addRequests: this.addRequestsCalls,
|
|
25
|
-
enqueueLinks: this.enqueueLinksCalls,
|
|
26
24
|
};
|
|
27
25
|
}
|
|
28
26
|
/**
|
|
@@ -42,9 +40,6 @@ export class RequestHandlerResult {
|
|
|
42
40
|
*/
|
|
43
41
|
get enqueuedUrls() {
|
|
44
42
|
const result = [];
|
|
45
|
-
for (const [options] of this.enqueueLinksCalls) {
|
|
46
|
-
result.push(...(options?.urls?.map((url) => ({ url, label: options?.label })) ?? []));
|
|
47
|
-
}
|
|
48
43
|
for (const [requests] of this.addRequestsCalls) {
|
|
49
44
|
for (const request of requests) {
|
|
50
45
|
if (typeof request === 'object' &&
|
|
@@ -78,9 +73,6 @@ export class RequestHandlerResult {
|
|
|
78
73
|
pushData = async (data, datasetIdOrName) => {
|
|
79
74
|
this.pushDataCalls.push([data, datasetIdOrName]);
|
|
80
75
|
};
|
|
81
|
-
enqueueLinks = async (options) => {
|
|
82
|
-
this.enqueueLinksCalls.push([options]);
|
|
83
|
-
};
|
|
84
76
|
addRequests = async (requests, options = {}) => {
|
|
85
77
|
this.addRequestsCalls.push([requests, options]);
|
|
86
78
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"crawler_commons.js","sourceRoot":"","sources":["../../src/crawlers/crawler_commons.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"crawler_commons.js","sourceRoot":"","sources":["../../src/crawlers/crawler_commons.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,aAAa,EAAsB,MAAM,gCAAgC,CAAC;AA6JnF;;;;GAIG;AACH,MAAM,OAAO,oBAAoB;IASjB;IACA;IATJ,qBAAqB,GACzB,EAAE,CAAC;IAEC,aAAa,GAAwD,EAAE,CAAC;IAExE,gBAAgB,GAA2D,EAAE,CAAC;IAEtF,YACY,MAAqB,EACrB,eAAuB;QADvB,WAAM,GAAN,MAAM,CAAe;QACrB,oBAAe,GAAf,eAAe,CAAQ;IAChC,CAAC;IAEJ;;OAEG;IACH,IAAI,KAAK;QAIL,OAAO;YACH,QAAQ,EAAE,IAAI,CAAC,aAAa;YAC5B,WAAW,EAAE,IAAI,CAAC,gBAAgB;SACrC,CAAC;IACN,CAAC;IAED;;OAEG;IACH,IAAI,oBAAoB;QAGpB,OAAO,IAAI,CAAC,qBAAqB,CAAC;IACtC,CAAC;IAED;;OAEG;IACH,IAAI,YAAY;QACZ,OAAO,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,eAAe,CAAC,EAAE,EAAE,CAC1D,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,eAAe,EAAE,CAAC,CAAC,CACnF,CAAC;IACN,CAAC;IAED;;OAEG;IACH,IAAI,YAAY;QACZ,MAAM,MAAM,GAAsC,EAAE,CAAC;QAErD,KAAK,MAAM,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;YAC7C,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;gBAC7B,IACI,OAAO,OAAO,KAAK,QAAQ;oBAC3B,CAAC,CAAC,CAAC,iBAAiB,IAAI,OAAO,CAAC,IAAI,OAAO,CAAC,eAAe,KAAK,SAAS,CAAC;oBAC1E,OAAO,CAAC,GAAG,KAAK,SAAS,EAC3B,CAAC;oBACC,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC5D,CAAC;qBAAM,IAAI,OAAO,OAAO,KAAK,QAAQ,EAAE,CAAC;oBACrC,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;gBAClC,CAAC;YACL,CAAC;QACL,CAAC;QAED,OAAO,MAAM,CAAC;IAClB,CAAC;IAED;;OAEG;IACH,IAAI,gBAAgB;QAChB,MAAM,MAAM,GAA0C,EAAE,CAAC;QAEzD,KAAK,MAAM,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;YAC7C,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;gBAC7B,IACI,OAAO,OAAO,KAAK,QAAQ;oBAC3B,iBAAiB,IAAI,OAAO;oBAC5B,OAAO,CAAC,eAAe,KAAK,SAAS,EACvC,CAAC;oBACC,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC,eAAe,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC5E,CAAC;YACL,CAAC;QACL,CAAC;QAED,OAAO,MAAM,CAAC;IAClB,CAAC;IAED,QAAQ,GAA0C,KAAK,EAAE,IAAI,EAAE,eAAe,EAAE,EAAE;QAC9E,IAAI,CAAC,aAAa,CAAC,IAAI,CAA
C,CAAC,IAAI,EAAE,eAAe,CAAC,CAAC,CAAC;IACrD,CAAC,CAAC;IAEF,WAAW,GAA6C,KAAK,EAAE,QAAQ,EAAE,OAAO,GAAG,EAAE,EAAE,EAAE;QACrF,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;IACpD,CAAC,CAAC;IAEF,QAAQ,GAA0C,KAAK,EAAE,YAAY,EAAE,EAAE;QACrE,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;QACrD,OAAO,MAAM,KAAK,CAAC,iBAAiB,CAAC,IAAI,CAAC,eAAe,EAAE,YAAY,CAAC,CAAC;IAC7E,CAAC,CAAC;IAEF,gBAAgB,GAAkD,KAAK,EAAE,QAAQ,EAAE,EAAE;QACjF,MAAM,KAAK,GAAG,MAAM,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;QAE1E,OAAO;YACH,EAAE,EAAE,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC;YAC9B,IAAI,EAAE,QAAQ;YACd,QAAQ,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,4BAA4B,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YACxG,QAAQ,EAAE,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;gBACpC,IAAI,CAAC,4BAA4B,CAAC,QAAQ,EAAE,GAAG,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;YACrE,CAAC;YACD,iBAAiB,EAAE,KAAK,CAAC,iBAAiB,CAAC,IAAI,CAAC,KAAK,CAAC;YACtD,YAAY,EAAE,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC;SAC/C,CAAC;IACN,CAAC,CAAC;IAEM,WAAW,GAAG,CAAC,QAAiB,EAAU,EAAE,CAAC,QAAQ,IAAI,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;IAEnG,4BAA4B,GAAG,CAAC,QAA4B,EAAE,GAAW,EAAE,EAAE;QACjF,MAAM,EAAE,GAAG,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;QACtC,IAAI,CAAC,qBAAqB,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC;QACtC,OAAO,IAAI,CAAC,oBAAoB,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,YAAY,IAAI,IAAI,CAAC;IACpE,CAAC,CAAC;IAEM,4BAA4B,GAAG,CACnC,QAA4B,EAC5B,GAAW,EACX,YAAqB,EACrB,OAAuB,EACzB,EAAE;QACA,MAAM,EAAE,GAAG,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;QACtC,IAAI,CAAC,qBAAqB,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC;QACtC,IAAI,CAAC,qBAAqB,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,YAAY,EAAE,OAAO,EAAE,CAAC;IACpE,CAAC,CAAC;CACL"}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import type { CrawlingContext } from '../crawlers/crawler_commons.js';
|
|
2
2
|
import type { KeyValueStore } from '../storages/key_value_store.js';
|
|
3
3
|
import type { ErrnoException } from './error_tracker.js';
|
|
4
|
+
import type { SnapshottableProperties } from './internals/types.js';
|
|
4
5
|
interface BrowserCrawlingContext {
|
|
5
6
|
saveSnapshot: (options: {
|
|
6
7
|
key: string;
|
|
@@ -39,7 +40,7 @@ export declare class ErrorSnapshotter {
|
|
|
39
40
|
/**
|
|
40
41
|
* Capture a snapshot of the error context.
|
|
41
42
|
*/
|
|
42
|
-
captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<ErrorSnapshot>;
|
|
43
|
+
captureSnapshot(error: ErrnoException, context: CrawlingContext & SnapshottableProperties): Promise<ErrorSnapshot>;
|
|
43
44
|
/**
|
|
44
45
|
* Captures a snapshot of the current page using the context.saveSnapshot function.
|
|
45
46
|
* This function is applicable for browser contexts only.
|
|
@@ -49,7 +50,7 @@ export declare class ErrorSnapshotter {
|
|
|
49
50
|
/**
|
|
50
51
|
* Save the HTML snapshot of the page, and return the fileName with the extension.
|
|
51
52
|
*/
|
|
52
|
-
saveHTMLSnapshot(html: string, keyValueStore: KeyValueStore, fileName: string): Promise<string | undefined>;
|
|
53
|
+
saveHTMLSnapshot(html: string, keyValueStore: Pick<KeyValueStore, 'setValue'>, fileName: string): Promise<string | undefined>;
|
|
53
54
|
/**
|
|
54
55
|
* Generate a unique fileName for each error snapshot.
|
|
55
56
|
*/
|