@crawlee/core 4.0.0-beta.6 → 4.0.0-beta.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/autoscaling/autoscaled_pool.d.ts +3 -5
- package/autoscaling/autoscaled_pool.d.ts.map +1 -1
- package/autoscaling/autoscaled_pool.js +3 -9
- package/autoscaling/autoscaled_pool.js.map +1 -1
- package/autoscaling/snapshotter.d.ts +3 -13
- package/autoscaling/snapshotter.d.ts.map +1 -1
- package/autoscaling/snapshotter.js +18 -29
- package/autoscaling/snapshotter.js.map +1 -1
- package/autoscaling/system_status.d.ts +0 -3
- package/autoscaling/system_status.d.ts.map +1 -1
- package/autoscaling/system_status.js +2 -3
- package/autoscaling/system_status.js.map +1 -1
- package/configuration.d.ts +85 -227
- package/configuration.d.ts.map +1 -1
- package/configuration.js +159 -223
- package/configuration.js.map +1 -1
- package/cookie_utils.d.ts +4 -2
- package/cookie_utils.d.ts.map +1 -1
- package/cookie_utils.js +18 -12
- package/cookie_utils.js.map +1 -1
- package/crawlers/context_pipeline.d.ts +71 -0
- package/crawlers/context_pipeline.d.ts.map +1 -0
- package/crawlers/context_pipeline.js +123 -0
- package/crawlers/context_pipeline.js.map +1 -0
- package/crawlers/crawler_commons.d.ts +19 -28
- package/crawlers/crawler_commons.d.ts.map +1 -1
- package/crawlers/crawler_commons.js +12 -20
- package/crawlers/crawler_commons.js.map +1 -1
- package/crawlers/crawler_utils.d.ts +2 -2
- package/crawlers/crawler_utils.d.ts.map +1 -1
- package/crawlers/crawler_utils.js +1 -1
- package/crawlers/crawler_utils.js.map +1 -1
- package/crawlers/error_snapshotter.d.ts +3 -2
- package/crawlers/error_snapshotter.d.ts.map +1 -1
- package/crawlers/error_snapshotter.js +2 -2
- package/crawlers/error_snapshotter.js.map +1 -1
- package/crawlers/error_tracker.d.ts +2 -1
- package/crawlers/error_tracker.d.ts.map +1 -1
- package/crawlers/error_tracker.js.map +1 -1
- package/crawlers/index.d.ts +1 -1
- package/crawlers/index.d.ts.map +1 -1
- package/crawlers/index.js +1 -1
- package/crawlers/index.js.map +1 -1
- package/crawlers/internals/types.d.ts +8 -0
- package/crawlers/internals/types.d.ts.map +1 -0
- package/crawlers/internals/types.js +2 -0
- package/crawlers/internals/types.js.map +1 -0
- package/crawlers/statistics.d.ts +15 -15
- package/crawlers/statistics.d.ts.map +1 -1
- package/crawlers/statistics.js +21 -24
- package/crawlers/statistics.js.map +1 -1
- package/enqueue_links/enqueue_links.d.ts +32 -18
- package/enqueue_links/enqueue_links.d.ts.map +1 -1
- package/enqueue_links/enqueue_links.js +45 -24
- package/enqueue_links/enqueue_links.js.map +1 -1
- package/enqueue_links/shared.d.ts +25 -8
- package/enqueue_links/shared.d.ts.map +1 -1
- package/enqueue_links/shared.js +69 -37
- package/enqueue_links/shared.js.map +1 -1
- package/errors.d.ts +33 -3
- package/errors.d.ts.map +1 -1
- package/errors.js +48 -4
- package/errors.js.map +1 -1
- package/events/event_manager.d.ts +8 -5
- package/events/event_manager.d.ts.map +1 -1
- package/events/event_manager.js +7 -9
- package/events/event_manager.js.map +1 -1
- package/events/local_event_manager.d.ts +14 -4
- package/events/local_event_manager.d.ts.map +1 -1
- package/events/local_event_manager.js +33 -39
- package/events/local_event_manager.js.map +1 -1
- package/index.d.ts +3 -2
- package/index.d.ts.map +1 -1
- package/index.js +2 -1
- package/index.js.map +1 -1
- package/log.d.ts +82 -2
- package/log.d.ts.map +1 -1
- package/log.js +102 -0
- package/log.js.map +1 -1
- package/package.json +9 -10
- package/proxy_configuration.d.ts +14 -148
- package/proxy_configuration.d.ts.map +1 -1
- package/proxy_configuration.js +19 -167
- package/proxy_configuration.js.map +1 -1
- package/recoverable_state.d.ts +121 -0
- package/recoverable_state.d.ts.map +1 -0
- package/recoverable_state.js +142 -0
- package/recoverable_state.js.map +1 -0
- package/request.d.ts +74 -10
- package/request.d.ts.map +1 -1
- package/request.js +85 -23
- package/request.js.map +1 -1
- package/router.d.ts.map +1 -1
- package/router.js.map +1 -1
- package/serialization.js +1 -1
- package/serialization.js.map +1 -1
- package/service_locator.d.ts +157 -0
- package/service_locator.d.ts.map +1 -0
- package/service_locator.js +234 -0
- package/service_locator.js.map +1 -0
- package/session_pool/index.d.ts +0 -1
- package/session_pool/index.d.ts.map +1 -1
- package/session_pool/index.js +0 -1
- package/session_pool/index.js.map +1 -1
- package/session_pool/session.d.ts +26 -72
- package/session_pool/session.d.ts.map +1 -1
- package/session_pool/session.js +36 -98
- package/session_pool/session.js.map +1 -1
- package/session_pool/session_pool.d.ts +65 -71
- package/session_pool/session_pool.d.ts.map +1 -1
- package/session_pool/session_pool.js +101 -100
- package/session_pool/session_pool.js.map +1 -1
- package/storages/dataset.d.ts +90 -46
- package/storages/dataset.d.ts.map +1 -1
- package/storages/dataset.js +149 -121
- package/storages/dataset.js.map +1 -1
- package/storages/index.d.ts +3 -1
- package/storages/index.d.ts.map +1 -1
- package/storages/index.js +3 -1
- package/storages/index.js.map +1 -1
- package/storages/key_value_store.d.ts +104 -22
- package/storages/key_value_store.d.ts.map +1 -1
- package/storages/key_value_store.js +166 -51
- package/storages/key_value_store.js.map +1 -1
- package/storages/request_list.d.ts +9 -9
- package/storages/request_list.d.ts.map +1 -1
- package/storages/request_list.js +13 -8
- package/storages/request_list.js.map +1 -1
- package/storages/request_list_adapter.d.ts +58 -0
- package/storages/request_list_adapter.d.ts.map +1 -0
- package/storages/request_list_adapter.js +81 -0
- package/storages/request_list_adapter.js.map +1 -0
- package/storages/request_manager_tandem.d.ts +68 -0
- package/storages/request_manager_tandem.d.ts.map +1 -0
- package/storages/request_manager_tandem.js +124 -0
- package/storages/request_manager_tandem.js.map +1 -0
- package/storages/request_provider.d.ts +87 -22
- package/storages/request_provider.d.ts.map +1 -1
- package/storages/request_provider.js +127 -77
- package/storages/request_provider.js.map +1 -1
- package/storages/request_queue.d.ts +1 -3
- package/storages/request_queue.d.ts.map +1 -1
- package/storages/request_queue.js +2 -4
- package/storages/request_queue.js.map +1 -1
- package/storages/request_queue_v2.d.ts +3 -3
- package/storages/request_queue_v2.d.ts.map +1 -1
- package/storages/request_queue_v2.js +4 -5
- package/storages/request_queue_v2.js.map +1 -1
- package/storages/sitemap_request_list.d.ts +5 -5
- package/storages/sitemap_request_list.d.ts.map +1 -1
- package/storages/sitemap_request_list.js +10 -7
- package/storages/sitemap_request_list.js.map +1 -1
- package/storages/storage_instance_manager.d.ts +91 -0
- package/storages/storage_instance_manager.d.ts.map +1 -0
- package/storages/storage_instance_manager.js +236 -0
- package/storages/storage_instance_manager.js.map +1 -0
- package/storages/utils.d.ts +47 -1
- package/storages/utils.d.ts.map +1 -1
- package/storages/utils.js +57 -5
- package/storages/utils.js.map +1 -1
- package/typedefs.d.ts +1 -1
- package/typedefs.d.ts.map +1 -1
- package/validators.d.ts +4 -0
- package/validators.d.ts.map +1 -1
- package/validators.js +4 -0
- package/validators.js.map +1 -1
- package/crawlers/crawler_extension.d.ts +0 -12
- package/crawlers/crawler_extension.d.ts.map +0 -1
- package/crawlers/crawler_extension.js +0 -14
- package/crawlers/crawler_extension.js.map +0 -1
- package/http_clients/base-http-client.d.ts +0 -134
- package/http_clients/base-http-client.d.ts.map +0 -1
- package/http_clients/base-http-client.js +0 -33
- package/http_clients/base-http-client.js.map +0 -1
- package/http_clients/form-data-like.d.ts +0 -67
- package/http_clients/form-data-like.d.ts.map +0 -1
- package/http_clients/form-data-like.js +0 -5
- package/http_clients/form-data-like.js.map +0 -1
- package/http_clients/got-scraping-http-client.d.ts +0 -15
- package/http_clients/got-scraping-http-client.d.ts.map +0 -1
- package/http_clients/got-scraping-http-client.js +0 -69
- package/http_clients/got-scraping-http-client.js.map +0 -1
- package/http_clients/index.d.ts +0 -3
- package/http_clients/index.d.ts.map +0 -1
- package/http_clients/index.js +0 -3
- package/http_clients/index.js.map +0 -1
- package/session_pool/events.d.ts +0 -3
- package/session_pool/events.d.ts.map +0 -1
- package/session_pool/events.js +0 -3
- package/session_pool/events.js.map +0 -1
- package/storages/storage_manager.d.ts +0 -58
- package/storages/storage_manager.d.ts.map +0 -1
- package/storages/storage_manager.js +0 -105
- package/storages/storage_manager.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
package/storages/dataset.d.ts
CHANGED
|
@@ -1,27 +1,20 @@
|
|
|
1
|
-
import type { DatasetClient, DatasetInfo, Dictionary
|
|
1
|
+
import type { DatasetClient, DatasetInfo, Dictionary } from '@crawlee/types';
|
|
2
2
|
import { Configuration } from '../configuration.js';
|
|
3
|
-
import {
|
|
3
|
+
import type { CrawleeLogger } from '../log.js';
|
|
4
4
|
import type { Awaitable } from '../typedefs.js';
|
|
5
|
-
import type {
|
|
5
|
+
import type { StorageIdentifier } from './storage_instance_manager.js';
|
|
6
|
+
import type { StorageOpenOptions } from './utils.js';
|
|
6
7
|
/** @internal */
|
|
7
8
|
export declare const DATASET_ITERATORS_DEFAULT_LIMIT = 10000;
|
|
8
9
|
/**
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
* in an array to provide better error messages. Returns serialized object.
|
|
12
|
-
* @ignore
|
|
13
|
-
*/
|
|
14
|
-
export declare function checkAndSerialize<T>(item: T, limitBytes: number, index?: number): string;
|
|
15
|
-
/**
|
|
16
|
-
* Takes an array of JSONs (payloads) as input and produces an array of JSON strings
|
|
17
|
-
* where each string is a JSON array of payloads with a maximum size of limitBytes per one
|
|
18
|
-
* JSON array. Fits as many payloads as possible into a single JSON array and then moves
|
|
19
|
-
* on to the next, preserving item order.
|
|
10
|
+
* Validates that the given value is a plain JSON-serializable object
|
|
11
|
+
* (not an array, not a primitive, not circular).
|
|
20
12
|
*
|
|
21
|
-
*
|
|
13
|
+
* @param item The value to validate.
|
|
14
|
+
* @param index Optional index for error messages when validating inside an array.
|
|
22
15
|
* @ignore
|
|
23
16
|
*/
|
|
24
|
-
export declare function
|
|
17
|
+
export declare function assertJsonSerializable<T>(item: T, index?: number): void;
|
|
25
18
|
export interface DatasetDataOptions {
|
|
26
19
|
/**
|
|
27
20
|
* Number of array elements that should be skipped at the start.
|
|
@@ -67,6 +60,11 @@ export interface DatasetDataOptions {
|
|
|
67
60
|
skipEmpty?: boolean;
|
|
68
61
|
}
|
|
69
62
|
export interface DatasetExportOptions extends Omit<DatasetDataOptions, 'offset' | 'limit'> {
|
|
63
|
+
/**
|
|
64
|
+
* If true, includes all unique keys from all dataset items in the CSV export header.
|
|
65
|
+
* If omitted or false, only keys from the first item are used.
|
|
66
|
+
*/
|
|
67
|
+
collectAllKeys?: boolean;
|
|
70
68
|
}
|
|
71
69
|
export interface DatasetIteratorOptions extends Omit<DatasetDataOptions, 'offset' | 'limit' | 'clean' | 'skipHidden' | 'skipEmpty'> {
|
|
72
70
|
/** @internal */
|
|
@@ -86,8 +84,8 @@ export interface DatasetIteratorOptions extends Omit<DatasetDataOptions, 'offset
|
|
|
86
84
|
format?: string;
|
|
87
85
|
}
|
|
88
86
|
export interface DatasetExportToOptions extends DatasetExportOptions {
|
|
89
|
-
fromDataset?: string;
|
|
90
|
-
toKVS?: string;
|
|
87
|
+
fromDataset?: string | StorageIdentifier;
|
|
88
|
+
toKVS?: string | StorageIdentifier;
|
|
91
89
|
}
|
|
92
90
|
/**
|
|
93
91
|
* The `Dataset` class represents a store for structured data where each object stored has the same attributes,
|
|
@@ -145,7 +143,7 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
|
|
|
145
143
|
id: string;
|
|
146
144
|
name?: string;
|
|
147
145
|
client: DatasetClient<Data>;
|
|
148
|
-
log:
|
|
146
|
+
log: CrawleeLogger;
|
|
149
147
|
/**
|
|
150
148
|
* @internal
|
|
151
149
|
*/
|
|
@@ -158,21 +156,8 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
|
|
|
158
156
|
* **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
|
|
159
157
|
* otherwise the crawler process might finish before the data is stored!
|
|
160
158
|
*
|
|
161
|
-
* The size of the data is limited by the receiving API and therefore `pushData()` will only
|
|
162
|
-
* allow objects whose JSON representation is smaller than 9MB. When an array is passed,
|
|
163
|
-
* none of the included objects
|
|
164
|
-
* may be larger than 9MB, but the array itself may be of any size.
|
|
165
|
-
*
|
|
166
|
-
* The function internally
|
|
167
|
-
* chunks the array into separate items and pushes them sequentially.
|
|
168
|
-
* The chunking process is stable (keeps order of data), but it does not provide a transaction
|
|
169
|
-
* safety mechanism. Therefore, in the event of an uploading error (after several automatic retries),
|
|
170
|
-
* the function's Promise will reject and the dataset will be left in a state where some of
|
|
171
|
-
* the items have already been saved to the dataset while other items from the source array were not.
|
|
172
|
-
* To overcome this limitation, the developer may, for example, read the last item saved in the dataset
|
|
173
|
-
* and re-attempt the save of the data from this item onwards to prevent duplicates.
|
|
174
159
|
* @param data Object or array of objects containing data to be stored in the default dataset.
|
|
175
|
-
* The objects must be serializable to JSON
|
|
160
|
+
* The objects must be serializable to JSON.
|
|
176
161
|
*/
|
|
177
162
|
pushData(data: Data | Data[]): Promise<void>;
|
|
178
163
|
/**
|
|
@@ -223,26 +208,21 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
|
|
|
223
208
|
/**
|
|
224
209
|
* Returns an object containing general information about the dataset.
|
|
225
210
|
*
|
|
226
|
-
* The function returns the same object as the Apify API Client's
|
|
227
|
-
* [getDataset](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-datasets-getDataset)
|
|
228
|
-
* function, which in turn calls the
|
|
229
|
-
* [Get dataset](https://apify.com/docs/api/v2#/reference/datasets/dataset/get-dataset)
|
|
230
|
-
* API endpoint.
|
|
231
|
-
*
|
|
232
211
|
* **Example:**
|
|
233
212
|
* ```
|
|
234
213
|
* {
|
|
235
214
|
* id: "WkzbQMuFYuamGv3YF",
|
|
236
215
|
* name: "my-dataset",
|
|
237
|
-
* userId: "wRsJZtadYvn4mBZmm",
|
|
238
216
|
* createdAt: new Date("2015-12-12T07:34:14.202Z"),
|
|
239
217
|
* modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
|
|
240
218
|
* accessedAt: new Date("2015-12-14T08:36:13.202Z"),
|
|
241
219
|
* itemCount: 14,
|
|
242
220
|
* }
|
|
243
221
|
* ```
|
|
222
|
+
*
|
|
223
|
+
* @throws If the underlying storage no longer exists (e.g. it was deleted externally).
|
|
244
224
|
*/
|
|
245
|
-
getInfo(): Promise<DatasetInfo
|
|
225
|
+
getInfo(): Promise<DatasetInfo>;
|
|
246
226
|
/**
|
|
247
227
|
* Iterates over dataset items, yielding each in turn to an `iteratee` function.
|
|
248
228
|
* Each invocation of `iteratee` is called with two arguments: `(item, index)`.
|
|
@@ -326,6 +306,69 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
|
|
|
326
306
|
* @param [options] An object containing extra options for `reduce()`
|
|
327
307
|
*/
|
|
328
308
|
reduce<T>(iteratee: DatasetReducer<T, Data>, memo: T, options?: DatasetIteratorOptions): Promise<T>;
|
|
309
|
+
private fetchEntryPages;
|
|
310
|
+
private fetchPages;
|
|
311
|
+
/**
|
|
312
|
+
* Returns dataset items.
|
|
313
|
+
*
|
|
314
|
+
* When awaited (`await dataset.values()`), returns all items as a flat `Data[]` array.
|
|
315
|
+
* When used as an async iterable (`for await...of`), iterates over all items across pages
|
|
316
|
+
* without loading everything into memory at once.
|
|
317
|
+
*
|
|
318
|
+
* **Example usage:**
|
|
319
|
+
* ```javascript
|
|
320
|
+
* const dataset = await Dataset.open('my-results');
|
|
321
|
+
*
|
|
322
|
+
* // Iterate over all items (memory-efficient for large datasets)
|
|
323
|
+
* for await (const item of dataset.values()) {
|
|
324
|
+
* console.log(item);
|
|
325
|
+
* }
|
|
326
|
+
*
|
|
327
|
+
* // Or fetch all items at once
|
|
328
|
+
* const items = await dataset.values();
|
|
329
|
+
* console.log(items);
|
|
330
|
+
* ```
|
|
331
|
+
*
|
|
332
|
+
* @param options Options for the iteration.
|
|
333
|
+
*/
|
|
334
|
+
values(options?: DatasetIteratorOptions): AsyncIterable<Data> & Promise<Data[]>;
|
|
335
|
+
/**
|
|
336
|
+
* Returns dataset entries (index-value pairs).
|
|
337
|
+
*
|
|
338
|
+
* When awaited (`await dataset.entries()`), returns all entries as a flat `[index, item][]` array.
|
|
339
|
+
* When used as an async iterable (`for await...of`), iterates over all entries across pages
|
|
340
|
+
* without loading everything into memory at once.
|
|
341
|
+
*
|
|
342
|
+
* **Example usage:**
|
|
343
|
+
* ```javascript
|
|
344
|
+
* const dataset = await Dataset.open('my-results');
|
|
345
|
+
*
|
|
346
|
+
* // Iterate over all entries
|
|
347
|
+
* for await (const [index, item] of dataset.entries()) {
|
|
348
|
+
* console.log(`Item at ${index}: ${JSON.stringify(item)}`);
|
|
349
|
+
* }
|
|
350
|
+
*
|
|
351
|
+
* // Or fetch all at once
|
|
352
|
+
* const entries = await dataset.entries();
|
|
353
|
+
* console.log(entries);
|
|
354
|
+
* ```
|
|
355
|
+
*
|
|
356
|
+
* @param options Options for the iteration.
|
|
357
|
+
*/
|
|
358
|
+
entries(options?: DatasetIteratorOptions): AsyncIterable<[number, Data]> & Promise<[number, Data][]>;
|
|
359
|
+
/**
|
|
360
|
+
* Default async iterator for the dataset, iterating over items.
|
|
361
|
+
* Allows using the dataset directly in a `for await...of` loop.
|
|
362
|
+
*
|
|
363
|
+
* **Example usage:**
|
|
364
|
+
* ```javascript
|
|
365
|
+
* const dataset = await Dataset.open('my-results');
|
|
366
|
+
* for await (const item of dataset) {
|
|
367
|
+
* console.log(item);
|
|
368
|
+
* }
|
|
369
|
+
* ```
|
|
370
|
+
*/
|
|
371
|
+
[Symbol.asyncIterator](): AsyncGenerator<Data, void, undefined>;
|
|
329
372
|
/**
|
|
330
373
|
* Removes the dataset either from the Apify cloud storage or from the local directory,
|
|
331
374
|
* depending on the mode of operation.
|
|
@@ -340,12 +383,13 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
|
|
|
340
383
|
*
|
|
341
384
|
* For more details and code examples, see the {@link Dataset} class.
|
|
342
385
|
*
|
|
343
|
-
* @param [
|
|
344
|
-
* ID or name of the dataset to be opened. If
|
|
345
|
-
*
|
|
386
|
+
* @param [identifier]
|
|
387
|
+
* ID or name of the dataset to be opened. If a string is provided, it will first be
|
|
388
|
+
* looked up as an ID; if no such storage exists, it will be treated as a name.
|
|
389
|
+
* If `null` or `undefined`, the function returns the default dataset associated with the crawler run.
|
|
346
390
|
* @param [options] Storage manager options.
|
|
347
391
|
*/
|
|
348
|
-
static open<Data extends Dictionary = Dictionary>(
|
|
392
|
+
static open<Data extends Dictionary = Dictionary>(identifier?: string | StorageIdentifier | null, options?: StorageOpenOptions): Promise<Dataset<Data>>;
|
|
349
393
|
/**
|
|
350
394
|
* Stores an object or an array of objects to the default {@link Dataset} of the current crawler run.
|
|
351
395
|
*
|
|
@@ -411,7 +455,7 @@ export interface DatasetReducer<T, Data> {
|
|
|
411
455
|
export interface DatasetOptions {
|
|
412
456
|
id: string;
|
|
413
457
|
name?: string;
|
|
414
|
-
client:
|
|
458
|
+
client: DatasetClient;
|
|
415
459
|
}
|
|
416
460
|
export interface DatasetContent<Data> {
|
|
417
461
|
/** Total count of entries in the dataset. */
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../../src/storages/dataset.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,WAAW,EAAE,UAAU,
|
|
1
|
+
{"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../../src/storages/dataset.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,WAAW,EAAE,UAAU,EAAiB,MAAM,gBAAgB,CAAC;AAI5F,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAE/C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAGhD,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AACvE,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAIrD,gBAAgB;AAChB,eAAO,MAAM,+BAA+B,QAAQ,CAAC;AAErD;;;;;;;GAOG;AACH,wBAAgB,sBAAsB,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,IAAI,CAcvE;AAED,MAAM,WAAW,kBAAkB;IAC/B;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,IAAI,CAAC,EAAE,OAAO,CAAC;IAEf;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAElB;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;IAErB;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,oBAAqB,SAAQ,IAAI,CAAC,kBAAkB,EAAE,QAAQ,GAAG,OAAO,CAAC;IACtF;;;OAGG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED,MAAM,WAAW,sBAAuB,SAAQ,IAAI,CAChD,kBAAkB,EAClB,QAAQ,GAAG,OAAO,GAAG,OAAO,GAAG,YAAY,GAAG,WAAW,CAC5D;IACG,gBAAgB;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf,gBAAgB;IAChB,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB,gBAAgB;IAChB,UAAU,CAAC,EAAE,OAAO,CAAC;IAErB,gBAAgB;IAChB,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,gBAAgB;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,sBAAuB,SAAQ,oBAAoB;IAChE,WAAW,CAAC,EAAE,MAAM,GAAG,iBAAiB,CAAC;IACzC,KAAK,CAAC,EAAE,MAAM,GAAG,iBAAiB,CAAC;CACtC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkDG;AACH,qBAAa,OAAO,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU;IAWjD,QAAQ,CAAC,MAAM;IAVnB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,CAAC,IAAI,CAAC,CAAC;IAC5B,GAAG,EAAE,aAAa,CAAC;IAEnB;;OAEG;gBAEC,OAAO,EAAE,cAAc,EACd,MAAM,gBAAkC;IAQrD;;;;;;;;;;OAUG;IACG,QAAQ,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAclD;;OAEG;IACG,OAAO,CAAC,OAAO,GAAE,kBAAuB,GAAG,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAgB9E;;;OAGG;IACG,MAAM,CAAC,OAAO,GAAE,oBAAyB,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAYjE;;;;;;OAMG;IACG,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB,EAAE,WAAW,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAiCpG;;;;;OAKG;IACG,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,aAAa,CAAC;IAIrF;;;;;OAKG;IACG,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,aAAa,CAAC;IAIpF;;;;;OAKG;WACU,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB;IAOvE;;;;;OAKG;WACU,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB;IAOtE;;;;;;;;;;;;;;;;OAgBG;IACG,OAAO,IAAI,OAAO,CAAC,WAAW,CAAC;IAMrC;;;;;;;;;;;;;;;;;;;OAmBG;IACG,OAAO,CAAC,QAAQ,EAAE,eAAe,CAAC,IAAI,CAAC,EAAE,OAAO,GAAE,sBAA2B,EAAE,KAAK,SAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAqB9G;;;;;;;;OAQG;IACG,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,GAAE,sBAA2B,GAAG,OAAO,CAAC,CAAC,EAAE,CAAC;IAalG;;;;;;;;;;;;;;;;OAgBG;IACG,MAAM,CAAC,QAAQ,EAAE,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,GAAG,SAAS,CAAC;IAE7E;;;;;;;;;;;;;;;;;;OAkBG;IACG,MAAM,CACR,QAAQ,EAAE,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,EACpC,IAAI,EAAE,SAAS,EACf,OAAO,EAAE,sBAAsB,GAChC,OAAO,CAAC,IAAI,GAAG,SAAS,CAAC;IAE5B;;;;;;;;;;;;OAYG;IACG,MAAM,CAAC,CAAC,EAAE,QAAQ,EAAE,cAAc,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,OAAO,CAAC,EAAE,sBAAsB,GAAG,OAAO,CAAC,CAAC,CAAC;YAyB1F,eAAe;YAUf,UAAU;IAqBzB;;;;;;;;;;;;;;;;;;;;;;OAsBG;IACH,MAAM,CAAC,OAAO,GAAE,sBAA2B,GAAG,aAAa,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IASnF;;;;;;;;;;;;;;;;;;;;;;OAsBG;IACH,OAAO,CAAC,OAAO,GAAE,sBAA2B,GAAG,aAAa,CAAC,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,MAAM,EAAE,IAAI,CAAC,EAAE,CAAC;IASxG;;;;;;;;;;;OAWG;IACI,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,SAAS,CAAC;IAItE;;;OAGG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAO3B;;;;;;;;;;;;;;OAcG;WACU,IAAI,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EAClD,UAAU,CAAC,EAAE,MAAM,GAAG,iBAAiB,GAAG,IAAI,EAC9C,OAAO,GAAE,kBAAuB,GACjC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IA0BzB;;;;;;;;;;;;;;;;;;;;;;;OAuBG;WACU,QAAQ,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EAAE,IAAI,EAAE,IAAI,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAK/F;;OAEG;WACU,OAAO,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EACrD,OAAO,GAAE,kBAAuB,GACjC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;CAInC;AAED;;GAEG;AACH,MAAM,WAAW,eAAe,CAAC,IAAI;IACjC;;;OAGG;IACH,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;CAChD;AAED;;GAEG;AACH,MAAM,WAAW,aAAa,CAAC,IAAI,EAAE,CAAC;IAClC;;;;OAIG;IACH,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,WAAW,cAAc,CAAC,CAAC,EAAE,IAAI;IACnC;;;;OAIG;IACH,CAAC,IAAI,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;CACtD;AAED,MAAM,WAAW,cAAc;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,CAAC;CACzB;AAED,MAAM,WAAW,cAAc,CAAC,IAAI;IAChC,6CAA6C;IAC7C,KAAK,EAAE,MAAM,CAAC;IACd,qDAAqD;IACrD,KAAK,EAAE,MAAM,CAAC;IACd,2DAA2D;IAC3D,MAAM,EAAE,MAAM,CAAC;IACf,mDAAmD;IACnD,KAAK,EAAE,MAAM,CAAC;IACd,wDAAwD;IACxD,KAAK,EAAE,IAAI,EAAE,CAAC;IACd,iDAAiD;IACjD,IAAI,CAAC,EAAE,OAAO,CAAC;CAClB"}
|
package/storages/dataset.js
CHANGED
|
@@ -1,80 +1,34 @@
|
|
|
1
1
|
import { stringify } from 'csv-stringify/sync';
|
|
2
2
|
import ow from 'ow';
|
|
3
|
-
import { MAX_PAYLOAD_SIZE_BYTES } from '@apify/consts';
|
|
4
3
|
import { Configuration } from '../configuration.js';
|
|
5
|
-
import {
|
|
4
|
+
import { serviceLocator } from '../service_locator.js';
|
|
6
5
|
import { checkStorageAccess } from './access_checking.js';
|
|
7
6
|
import { KeyValueStore } from './key_value_store.js';
|
|
8
|
-
import {
|
|
9
|
-
import { purgeDefaultStorages } from './utils.js';
|
|
7
|
+
import { resolveStorageIdentifier } from './storage_instance_manager.js';
|
|
8
|
+
import { createDualIterable, purgeDefaultStorages } from './utils.js';
|
|
10
9
|
/** @internal */
|
|
11
10
|
export const DATASET_ITERATORS_DEFAULT_LIMIT = 10000;
|
|
12
|
-
const SAFETY_BUFFER_PERCENT = 0.01 / 100; // 0.01%
|
|
13
11
|
/**
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
12
|
+
* Validates that the given value is a plain JSON-serializable object
|
|
13
|
+
* (not an array, not a primitive, not circular).
|
|
14
|
+
*
|
|
15
|
+
* @param item The value to validate.
|
|
16
|
+
* @param index Optional index for error messages when validating inside an array.
|
|
17
17
|
* @ignore
|
|
18
18
|
*/
|
|
19
|
-
export function
|
|
19
|
+
export function assertJsonSerializable(item, index) {
|
|
20
20
|
const s = typeof index === 'number' ? ` at index ${index} ` : ' ';
|
|
21
21
|
const isItemObject = item && typeof item === 'object' && !Array.isArray(item);
|
|
22
22
|
if (!isItemObject) {
|
|
23
23
|
throw new Error(`Data item${s}is not an object. You can push only objects into a dataset.`);
|
|
24
24
|
}
|
|
25
|
-
let payload;
|
|
26
25
|
try {
|
|
27
|
-
|
|
26
|
+
JSON.stringify(item);
|
|
28
27
|
}
|
|
29
28
|
catch (e) {
|
|
30
29
|
const err = e;
|
|
31
30
|
throw new Error(`Data item${s}is not serializable to JSON.\nCause: ${err.message}`);
|
|
32
31
|
}
|
|
33
|
-
const bytes = Buffer.byteLength(payload);
|
|
34
|
-
if (bytes > limitBytes) {
|
|
35
|
-
throw new Error(`Data item${s}is too large (size: ${bytes} bytes, limit: ${limitBytes} bytes)`);
|
|
36
|
-
}
|
|
37
|
-
return payload;
|
|
38
|
-
}
|
|
39
|
-
/**
|
|
40
|
-
* Takes an array of JSONs (payloads) as input and produces an array of JSON strings
|
|
41
|
-
* where each string is a JSON array of payloads with a maximum size of limitBytes per one
|
|
42
|
-
* JSON array. Fits as many payloads as possible into a single JSON array and then moves
|
|
43
|
-
* on to the next, preserving item order.
|
|
44
|
-
*
|
|
45
|
-
* The function assumes that none of the items is larger than limitBytes and does not validate.
|
|
46
|
-
* @ignore
|
|
47
|
-
*/
|
|
48
|
-
export function chunkBySize(items, limitBytes) {
|
|
49
|
-
if (!items.length)
|
|
50
|
-
return [];
|
|
51
|
-
if (items.length === 1)
|
|
52
|
-
return items;
|
|
53
|
-
// Split payloads into buckets of valid size.
|
|
54
|
-
let lastChunkBytes = 2; // Add 2 bytes for [] wrapper.
|
|
55
|
-
const chunks = [];
|
|
56
|
-
for (const payload of items) {
|
|
57
|
-
const bytes = Buffer.byteLength(payload);
|
|
58
|
-
if (bytes <= limitBytes && bytes + 2 > limitBytes) {
|
|
59
|
-
// Handle cases where wrapping with [] would fail, but solo object is fine.
|
|
60
|
-
chunks.push(payload);
|
|
61
|
-
lastChunkBytes = bytes;
|
|
62
|
-
}
|
|
63
|
-
else if (lastChunkBytes + bytes <= limitBytes) {
|
|
64
|
-
// ensure array
|
|
65
|
-
if (!Array.isArray(chunks[chunks.length - 1])) {
|
|
66
|
-
chunks.push([]);
|
|
67
|
-
}
|
|
68
|
-
chunks[chunks.length - 1].push(payload);
|
|
69
|
-
lastChunkBytes += bytes + 1; // Add 1 byte for ',' separator.
|
|
70
|
-
}
|
|
71
|
-
else {
|
|
72
|
-
chunks.push([payload]);
|
|
73
|
-
lastChunkBytes = bytes + 2; // Add 2 bytes for [] wrapper.
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
// Stringify array chunks.
|
|
77
|
-
return chunks.map((chunk) => (typeof chunk === 'string' ? chunk : `[${chunk.join(',')}]`));
|
|
78
32
|
}
|
|
79
33
|
/**
|
|
80
34
|
* The `Dataset` class represents a store for structured data where each object stored has the same attributes,
|
|
@@ -132,7 +86,7 @@ export class Dataset {
|
|
|
132
86
|
id;
|
|
133
87
|
name;
|
|
134
88
|
client;
|
|
135
|
-
log
|
|
89
|
+
log;
|
|
136
90
|
/**
|
|
137
91
|
* @internal
|
|
138
92
|
*/
|
|
@@ -140,7 +94,8 @@ export class Dataset {
|
|
|
140
94
|
this.config = config;
|
|
141
95
|
this.id = options.id;
|
|
142
96
|
this.name = options.name;
|
|
143
|
-
this.client = options.client
|
|
97
|
+
this.client = options.client;
|
|
98
|
+
this.log = serviceLocator.getLogger().child({ prefix: 'Dataset' });
|
|
144
99
|
}
|
|
145
100
|
/**
|
|
146
101
|
* Stores an object or an array of objects to the dataset.
|
|
@@ -150,40 +105,18 @@ export class Dataset {
|
|
|
150
105
|
* **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
|
|
151
106
|
* otherwise the crawler process might finish before the data is stored!
|
|
152
107
|
*
|
|
153
|
-
* The size of the data is limited by the receiving API and therefore `pushData()` will only
|
|
154
|
-
* allow objects whose JSON representation is smaller than 9MB. When an array is passed,
|
|
155
|
-
* none of the included objects
|
|
156
|
-
* may be larger than 9MB, but the array itself may be of any size.
|
|
157
|
-
*
|
|
158
|
-
* The function internally
|
|
159
|
-
* chunks the array into separate items and pushes them sequentially.
|
|
160
|
-
* The chunking process is stable (keeps order of data), but it does not provide a transaction
|
|
161
|
-
* safety mechanism. Therefore, in the event of an uploading error (after several automatic retries),
|
|
162
|
-
* the function's Promise will reject and the dataset will be left in a state where some of
|
|
163
|
-
* the items have already been saved to the dataset while other items from the source array were not.
|
|
164
|
-
* To overcome this limitation, the developer may, for example, read the last item saved in the dataset
|
|
165
|
-
* and re-attempt the save of the data from this item onwards to prevent duplicates.
|
|
166
108
|
* @param data Object or array of objects containing data to be stored in the default dataset.
|
|
167
|
-
* The objects must be serializable to JSON
|
|
109
|
+
* The objects must be serializable to JSON.
|
|
168
110
|
*/
|
|
169
111
|
async pushData(data) {
|
|
170
112
|
checkStorageAccess();
|
|
171
113
|
ow(data, 'data', ow.object);
|
|
172
|
-
|
|
173
|
-
const
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
const payload = checkAndSerialize(data, limit);
|
|
177
|
-
await dispatch(payload);
|
|
178
|
-
return;
|
|
179
|
-
}
|
|
180
|
-
// Handle Arrays
|
|
181
|
-
const payloads = data.map((item, index) => checkAndSerialize(item, limit, index));
|
|
182
|
-
const chunks = chunkBySize(payloads, limit);
|
|
183
|
-
// Invoke client in series to preserve order of data
|
|
184
|
-
for (const chunk of chunks) {
|
|
185
|
-
await dispatch(chunk);
|
|
114
|
+
// Normalize to array and validate each item
|
|
115
|
+
const items = Array.isArray(data) ? data : [data];
|
|
116
|
+
for (let i = 0; i < items.length; i++) {
|
|
117
|
+
assertJsonSerializable(items[i], i);
|
|
186
118
|
}
|
|
119
|
+
await this.client.pushData(items);
|
|
187
120
|
}
|
|
188
121
|
/**
|
|
189
122
|
* Returns {@link DatasetContent} object holding the items in the dataset based on the provided parameters.
|
|
@@ -191,7 +124,7 @@ export class Dataset {
|
|
|
191
124
|
async getData(options = {}) {
|
|
192
125
|
checkStorageAccess();
|
|
193
126
|
try {
|
|
194
|
-
return await this.client.
|
|
127
|
+
return await this.client.getData(options);
|
|
195
128
|
}
|
|
196
129
|
catch (e) {
|
|
197
130
|
const error = e;
|
|
@@ -208,18 +141,9 @@ export class Dataset {
|
|
|
208
141
|
async export(options = {}) {
|
|
209
142
|
checkStorageAccess();
|
|
210
143
|
const items = [];
|
|
211
|
-
const
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
if (value.count === 0) {
|
|
215
|
-
return;
|
|
216
|
-
}
|
|
217
|
-
items.push(...value.items);
|
|
218
|
-
if (value.total > offset + value.count) {
|
|
219
|
-
await fetchNextChunk(offset + value.count);
|
|
220
|
-
}
|
|
221
|
-
};
|
|
222
|
-
await fetchNextChunk();
|
|
144
|
+
for await (const page of this.fetchPages(options)) {
|
|
145
|
+
items.push(...page.items);
|
|
146
|
+
}
|
|
223
147
|
return items;
|
|
224
148
|
}
|
|
225
149
|
/**
|
|
@@ -233,7 +157,14 @@ export class Dataset {
|
|
|
233
157
|
const kvStore = await KeyValueStore.open(options?.toKVS ?? null, { config: this.config });
|
|
234
158
|
const items = await this.export(options);
|
|
235
159
|
if (contentType === 'text/csv') {
|
|
236
|
-
|
|
160
|
+
// To handle empty dataset exports gracefully.
|
|
161
|
+
if (items.length === 0) {
|
|
162
|
+
await kvStore.setValue(key, '', { contentType });
|
|
163
|
+
return items;
|
|
164
|
+
}
|
|
165
|
+
const keys = options?.collectAllKeys
|
|
166
|
+
? Array.from(new Set(items.flatMap(Object.keys)))
|
|
167
|
+
: Object.keys(items[0]);
|
|
237
168
|
const value = stringify([
|
|
238
169
|
keys,
|
|
239
170
|
...items.map((item) => {
|
|
@@ -248,7 +179,6 @@ export class Dataset {
|
|
|
248
179
|
return items;
|
|
249
180
|
}
|
|
250
181
|
throw new Error(`Unsupported content type: ${contentType}`);
|
|
251
|
-
return items;
|
|
252
182
|
}
|
|
253
183
|
/**
|
|
254
184
|
* Save entire default dataset's contents into one JSON file within a key-value store.
|
|
@@ -293,28 +223,23 @@ export class Dataset {
|
|
|
293
223
|
/**
|
|
294
224
|
* Returns an object containing general information about the dataset.
|
|
295
225
|
*
|
|
296
|
-
* The function returns the same object as the Apify API Client's
|
|
297
|
-
* [getDataset](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-datasets-getDataset)
|
|
298
|
-
* function, which in turn calls the
|
|
299
|
-
* [Get dataset](https://apify.com/docs/api/v2#/reference/datasets/dataset/get-dataset)
|
|
300
|
-
* API endpoint.
|
|
301
|
-
*
|
|
302
226
|
* **Example:**
|
|
303
227
|
* ```
|
|
304
228
|
* {
|
|
305
229
|
* id: "WkzbQMuFYuamGv3YF",
|
|
306
230
|
* name: "my-dataset",
|
|
307
|
-
* userId: "wRsJZtadYvn4mBZmm",
|
|
308
231
|
* createdAt: new Date("2015-12-12T07:34:14.202Z"),
|
|
309
232
|
* modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
|
|
310
233
|
* accessedAt: new Date("2015-12-14T08:36:13.202Z"),
|
|
311
234
|
* itemCount: 14,
|
|
312
235
|
* }
|
|
313
236
|
* ```
|
|
237
|
+
*
|
|
238
|
+
* @throws If the underlying storage no longer exists (e.g. it was deleted externally).
|
|
314
239
|
*/
|
|
315
240
|
async getInfo() {
|
|
316
241
|
checkStorageAccess();
|
|
317
|
-
return this.client.
|
|
242
|
+
return this.client.getMetadata();
|
|
318
243
|
}
|
|
319
244
|
/**
|
|
320
245
|
* Iterates over dataset items, yielding each in turn to an `iteratee` function.
|
|
@@ -388,15 +313,114 @@ export class Dataset {
|
|
|
388
313
|
await this.forEach(wrappedFunc, options);
|
|
389
314
|
return currentMemo;
|
|
390
315
|
}
|
|
316
|
+
async *fetchEntryPages(options) {
|
|
317
|
+
let index = options.offset ?? 0;
|
|
318
|
+
for await (const page of this.fetchPages(options)) {
|
|
319
|
+
yield {
|
|
320
|
+
...page,
|
|
321
|
+
items: page.items.map((item) => [index++, item]),
|
|
322
|
+
};
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
async *fetchPages(options, pageSize = DATASET_ITERATORS_DEFAULT_LIMIT) {
|
|
326
|
+
let offset = options.offset ?? 0;
|
|
327
|
+
const totalLimit = options.limit;
|
|
328
|
+
let yielded = 0;
|
|
329
|
+
while (true) {
|
|
330
|
+
const fetchLimit = totalLimit !== undefined ? Math.min(pageSize, totalLimit - yielded) : pageSize;
|
|
331
|
+
if (fetchLimit <= 0)
|
|
332
|
+
break;
|
|
333
|
+
const page = await this.client.getData({ ...options, offset, limit: fetchLimit });
|
|
334
|
+
yield page;
|
|
335
|
+
yielded += page.items.length;
|
|
336
|
+
if (page.items.length < fetchLimit || offset + page.items.length >= page.total)
|
|
337
|
+
break;
|
|
338
|
+
offset += page.items.length;
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
/**
|
|
342
|
+
* Returns dataset items.
|
|
343
|
+
*
|
|
344
|
+
* When awaited (`await dataset.values()`), returns all items as a flat `Data[]` array.
|
|
345
|
+
* When used as an async iterable (`for await...of`), iterates over all items across pages
|
|
346
|
+
* without loading everything into memory at once.
|
|
347
|
+
*
|
|
348
|
+
* **Example usage:**
|
|
349
|
+
* ```javascript
|
|
350
|
+
* const dataset = await Dataset.open('my-results');
|
|
351
|
+
*
|
|
352
|
+
* // Iterate over all items (memory-efficient for large datasets)
|
|
353
|
+
* for await (const item of dataset.values()) {
|
|
354
|
+
* console.log(item);
|
|
355
|
+
* }
|
|
356
|
+
*
|
|
357
|
+
* // Or fetch all items at once
|
|
358
|
+
* const items = await dataset.values();
|
|
359
|
+
* console.log(items);
|
|
360
|
+
* ```
|
|
361
|
+
*
|
|
362
|
+
* @param options Options for the iteration.
|
|
363
|
+
*/
|
|
364
|
+
values(options = {}) {
|
|
365
|
+
checkStorageAccess();
|
|
366
|
+
return createDualIterable({
|
|
367
|
+
createPages: () => this.fetchPages(options),
|
|
368
|
+
extractItems: (page) => page.items,
|
|
369
|
+
});
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* Returns dataset entries (index-value pairs).
|
|
373
|
+
*
|
|
374
|
+
* When awaited (`await dataset.entries()`), returns all entries as a flat `[index, item][]` array.
|
|
375
|
+
* When used as an async iterable (`for await...of`), iterates over all entries across pages
|
|
376
|
+
* without loading everything into memory at once.
|
|
377
|
+
*
|
|
378
|
+
* **Example usage:**
|
|
379
|
+
* ```javascript
|
|
380
|
+
* const dataset = await Dataset.open('my-results');
|
|
381
|
+
*
|
|
382
|
+
* // Iterate over all entries
|
|
383
|
+
* for await (const [index, item] of dataset.entries()) {
|
|
384
|
+
* console.log(`Item at ${index}: ${JSON.stringify(item)}`);
|
|
385
|
+
* }
|
|
386
|
+
*
|
|
387
|
+
* // Or fetch all at once
|
|
388
|
+
* const entries = await dataset.entries();
|
|
389
|
+
* console.log(entries);
|
|
390
|
+
* ```
|
|
391
|
+
*
|
|
392
|
+
* @param options Options for the iteration.
|
|
393
|
+
*/
|
|
394
|
+
entries(options = {}) {
|
|
395
|
+
checkStorageAccess();
|
|
396
|
+
return createDualIterable({
|
|
397
|
+
createPages: () => this.fetchEntryPages(options),
|
|
398
|
+
extractItems: (page) => page.items,
|
|
399
|
+
});
|
|
400
|
+
}
|
|
401
|
+
/**
|
|
402
|
+
* Default async iterator for the dataset, iterating over items.
|
|
403
|
+
* Allows using the dataset directly in a `for await...of` loop.
|
|
404
|
+
*
|
|
405
|
+
* **Example usage:**
|
|
406
|
+
* ```javascript
|
|
407
|
+
* const dataset = await Dataset.open('my-results');
|
|
408
|
+
* for await (const item of dataset) {
|
|
409
|
+
* console.log(item);
|
|
410
|
+
* }
|
|
411
|
+
* ```
|
|
412
|
+
*/
|
|
413
|
+
async *[Symbol.asyncIterator]() {
|
|
414
|
+
yield* this.values();
|
|
415
|
+
}
|
|
391
416
|
/**
|
|
392
417
|
* Removes the dataset either from the Apify cloud storage or from the local directory,
|
|
393
418
|
* depending on the mode of operation.
|
|
394
419
|
*/
|
|
395
420
|
async drop() {
|
|
396
421
|
checkStorageAccess();
|
|
397
|
-
await this.client.
|
|
398
|
-
|
|
399
|
-
manager.closeStorage(this);
|
|
422
|
+
await this.client.drop();
|
|
423
|
+
serviceLocator.getStorageInstanceManager().removeFromCache(this);
|
|
400
424
|
}
|
|
401
425
|
/**
|
|
402
426
|
* Opens a dataset and returns a promise resolving to an instance of the {@link Dataset} class.
|
|
@@ -407,23 +431,27 @@ export class Dataset {
|
|
|
407
431
|
*
|
|
408
432
|
* For more details and code examples, see the {@link Dataset} class.
|
|
409
433
|
*
|
|
410
|
-
* @param [
|
|
411
|
-
* ID or name of the dataset to be opened. If
|
|
412
|
-
*
|
|
434
|
+
* @param [identifier]
|
|
435
|
+
* ID or name of the dataset to be opened. If a string is provided, it will first be
|
|
436
|
+
* looked up as an ID; if no such storage exists, it will be treated as a name.
|
|
437
|
+
* If `null` or `undefined`, the function returns the default dataset associated with the crawler run.
|
|
413
438
|
* @param [options] Storage manager options.
|
|
414
439
|
*/
|
|
415
|
-
static async open(
|
|
440
|
+
static async open(identifier, options = {}) {
|
|
416
441
|
checkStorageAccess();
|
|
417
|
-
ow(datasetIdOrName, ow.optional.string);
|
|
418
442
|
ow(options, ow.object.exactShape({
|
|
419
443
|
config: ow.optional.object.instanceOf(Configuration),
|
|
420
444
|
storageClient: ow.optional.object,
|
|
421
445
|
}));
|
|
422
446
|
options.config ??= Configuration.getGlobalConfig();
|
|
423
|
-
options.storageClient
|
|
424
|
-
await purgeDefaultStorages({ onlyPurgeOnce: true, client
|
|
425
|
-
const
|
|
426
|
-
return
|
|
447
|
+
const client = options.storageClient ?? serviceLocator.getStorageClient();
|
|
448
|
+
await purgeDefaultStorages({ onlyPurgeOnce: true, client, config: options.config });
|
|
449
|
+
const resolved = await resolveStorageIdentifier(identifier, client, 'Dataset');
|
|
450
|
+
return serviceLocator.getStorageInstanceManager().openStorage(this, {
|
|
451
|
+
...resolved,
|
|
452
|
+
clientOpener: () => client.createDatasetClient(resolved),
|
|
453
|
+
clientCacheKey: client.getStorageClientCacheKey?.() ?? client.constructor.name,
|
|
454
|
+
});
|
|
427
455
|
}
|
|
428
456
|
/**
|
|
429
457
|
* Stores an object or an array of objects to the default {@link Dataset} of the current crawler run.
|