@crawlee/core 4.0.0-beta.6 → 4.0.0-beta.61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. package/README.md +9 -5
  2. package/autoscaling/autoscaled_pool.d.ts +3 -5
  3. package/autoscaling/autoscaled_pool.d.ts.map +1 -1
  4. package/autoscaling/autoscaled_pool.js +3 -9
  5. package/autoscaling/autoscaled_pool.js.map +1 -1
  6. package/autoscaling/snapshotter.d.ts +3 -13
  7. package/autoscaling/snapshotter.d.ts.map +1 -1
  8. package/autoscaling/snapshotter.js +18 -29
  9. package/autoscaling/snapshotter.js.map +1 -1
  10. package/autoscaling/system_status.d.ts +0 -3
  11. package/autoscaling/system_status.d.ts.map +1 -1
  12. package/autoscaling/system_status.js +2 -3
  13. package/autoscaling/system_status.js.map +1 -1
  14. package/configuration.d.ts +85 -227
  15. package/configuration.d.ts.map +1 -1
  16. package/configuration.js +159 -223
  17. package/configuration.js.map +1 -1
  18. package/cookie_utils.d.ts +4 -2
  19. package/cookie_utils.d.ts.map +1 -1
  20. package/cookie_utils.js +18 -12
  21. package/cookie_utils.js.map +1 -1
  22. package/crawlers/context_pipeline.d.ts +71 -0
  23. package/crawlers/context_pipeline.d.ts.map +1 -0
  24. package/crawlers/context_pipeline.js +123 -0
  25. package/crawlers/context_pipeline.js.map +1 -0
  26. package/crawlers/crawler_commons.d.ts +19 -28
  27. package/crawlers/crawler_commons.d.ts.map +1 -1
  28. package/crawlers/crawler_commons.js +12 -20
  29. package/crawlers/crawler_commons.js.map +1 -1
  30. package/crawlers/crawler_utils.d.ts +2 -2
  31. package/crawlers/crawler_utils.d.ts.map +1 -1
  32. package/crawlers/crawler_utils.js +1 -1
  33. package/crawlers/crawler_utils.js.map +1 -1
  34. package/crawlers/error_snapshotter.d.ts +3 -2
  35. package/crawlers/error_snapshotter.d.ts.map +1 -1
  36. package/crawlers/error_snapshotter.js +2 -2
  37. package/crawlers/error_snapshotter.js.map +1 -1
  38. package/crawlers/error_tracker.d.ts +2 -1
  39. package/crawlers/error_tracker.d.ts.map +1 -1
  40. package/crawlers/error_tracker.js.map +1 -1
  41. package/crawlers/index.d.ts +1 -1
  42. package/crawlers/index.d.ts.map +1 -1
  43. package/crawlers/index.js +1 -1
  44. package/crawlers/index.js.map +1 -1
  45. package/crawlers/internals/types.d.ts +8 -0
  46. package/crawlers/internals/types.d.ts.map +1 -0
  47. package/crawlers/internals/types.js +2 -0
  48. package/crawlers/internals/types.js.map +1 -0
  49. package/crawlers/statistics.d.ts +15 -15
  50. package/crawlers/statistics.d.ts.map +1 -1
  51. package/crawlers/statistics.js +21 -24
  52. package/crawlers/statistics.js.map +1 -1
  53. package/enqueue_links/enqueue_links.d.ts +32 -18
  54. package/enqueue_links/enqueue_links.d.ts.map +1 -1
  55. package/enqueue_links/enqueue_links.js +45 -24
  56. package/enqueue_links/enqueue_links.js.map +1 -1
  57. package/enqueue_links/shared.d.ts +25 -8
  58. package/enqueue_links/shared.d.ts.map +1 -1
  59. package/enqueue_links/shared.js +69 -37
  60. package/enqueue_links/shared.js.map +1 -1
  61. package/errors.d.ts +33 -3
  62. package/errors.d.ts.map +1 -1
  63. package/errors.js +48 -4
  64. package/errors.js.map +1 -1
  65. package/events/event_manager.d.ts +8 -5
  66. package/events/event_manager.d.ts.map +1 -1
  67. package/events/event_manager.js +7 -9
  68. package/events/event_manager.js.map +1 -1
  69. package/events/local_event_manager.d.ts +14 -4
  70. package/events/local_event_manager.d.ts.map +1 -1
  71. package/events/local_event_manager.js +33 -39
  72. package/events/local_event_manager.js.map +1 -1
  73. package/index.d.ts +3 -2
  74. package/index.d.ts.map +1 -1
  75. package/index.js +2 -1
  76. package/index.js.map +1 -1
  77. package/log.d.ts +82 -2
  78. package/log.d.ts.map +1 -1
  79. package/log.js +102 -0
  80. package/log.js.map +1 -1
  81. package/package.json +9 -10
  82. package/proxy_configuration.d.ts +14 -148
  83. package/proxy_configuration.d.ts.map +1 -1
  84. package/proxy_configuration.js +19 -167
  85. package/proxy_configuration.js.map +1 -1
  86. package/recoverable_state.d.ts +121 -0
  87. package/recoverable_state.d.ts.map +1 -0
  88. package/recoverable_state.js +142 -0
  89. package/recoverable_state.js.map +1 -0
  90. package/request.d.ts +74 -10
  91. package/request.d.ts.map +1 -1
  92. package/request.js +85 -23
  93. package/request.js.map +1 -1
  94. package/router.d.ts.map +1 -1
  95. package/router.js.map +1 -1
  96. package/serialization.js +1 -1
  97. package/serialization.js.map +1 -1
  98. package/service_locator.d.ts +157 -0
  99. package/service_locator.d.ts.map +1 -0
  100. package/service_locator.js +234 -0
  101. package/service_locator.js.map +1 -0
  102. package/session_pool/index.d.ts +0 -1
  103. package/session_pool/index.d.ts.map +1 -1
  104. package/session_pool/index.js +0 -1
  105. package/session_pool/index.js.map +1 -1
  106. package/session_pool/session.d.ts +26 -72
  107. package/session_pool/session.d.ts.map +1 -1
  108. package/session_pool/session.js +36 -98
  109. package/session_pool/session.js.map +1 -1
  110. package/session_pool/session_pool.d.ts +65 -71
  111. package/session_pool/session_pool.d.ts.map +1 -1
  112. package/session_pool/session_pool.js +101 -100
  113. package/session_pool/session_pool.js.map +1 -1
  114. package/storages/dataset.d.ts +90 -46
  115. package/storages/dataset.d.ts.map +1 -1
  116. package/storages/dataset.js +149 -121
  117. package/storages/dataset.js.map +1 -1
  118. package/storages/index.d.ts +3 -1
  119. package/storages/index.d.ts.map +1 -1
  120. package/storages/index.js +3 -1
  121. package/storages/index.js.map +1 -1
  122. package/storages/key_value_store.d.ts +104 -22
  123. package/storages/key_value_store.d.ts.map +1 -1
  124. package/storages/key_value_store.js +166 -51
  125. package/storages/key_value_store.js.map +1 -1
  126. package/storages/request_list.d.ts +9 -9
  127. package/storages/request_list.d.ts.map +1 -1
  128. package/storages/request_list.js +13 -8
  129. package/storages/request_list.js.map +1 -1
  130. package/storages/request_list_adapter.d.ts +58 -0
  131. package/storages/request_list_adapter.d.ts.map +1 -0
  132. package/storages/request_list_adapter.js +81 -0
  133. package/storages/request_list_adapter.js.map +1 -0
  134. package/storages/request_manager_tandem.d.ts +68 -0
  135. package/storages/request_manager_tandem.d.ts.map +1 -0
  136. package/storages/request_manager_tandem.js +124 -0
  137. package/storages/request_manager_tandem.js.map +1 -0
  138. package/storages/request_provider.d.ts +87 -22
  139. package/storages/request_provider.d.ts.map +1 -1
  140. package/storages/request_provider.js +127 -77
  141. package/storages/request_provider.js.map +1 -1
  142. package/storages/request_queue.d.ts +1 -3
  143. package/storages/request_queue.d.ts.map +1 -1
  144. package/storages/request_queue.js +2 -4
  145. package/storages/request_queue.js.map +1 -1
  146. package/storages/request_queue_v2.d.ts +3 -3
  147. package/storages/request_queue_v2.d.ts.map +1 -1
  148. package/storages/request_queue_v2.js +4 -5
  149. package/storages/request_queue_v2.js.map +1 -1
  150. package/storages/sitemap_request_list.d.ts +5 -5
  151. package/storages/sitemap_request_list.d.ts.map +1 -1
  152. package/storages/sitemap_request_list.js +10 -7
  153. package/storages/sitemap_request_list.js.map +1 -1
  154. package/storages/storage_instance_manager.d.ts +91 -0
  155. package/storages/storage_instance_manager.d.ts.map +1 -0
  156. package/storages/storage_instance_manager.js +236 -0
  157. package/storages/storage_instance_manager.js.map +1 -0
  158. package/storages/utils.d.ts +47 -1
  159. package/storages/utils.d.ts.map +1 -1
  160. package/storages/utils.js +57 -5
  161. package/storages/utils.js.map +1 -1
  162. package/typedefs.d.ts +1 -1
  163. package/typedefs.d.ts.map +1 -1
  164. package/validators.d.ts +4 -0
  165. package/validators.d.ts.map +1 -1
  166. package/validators.js +4 -0
  167. package/validators.js.map +1 -1
  168. package/crawlers/crawler_extension.d.ts +0 -12
  169. package/crawlers/crawler_extension.d.ts.map +0 -1
  170. package/crawlers/crawler_extension.js +0 -14
  171. package/crawlers/crawler_extension.js.map +0 -1
  172. package/http_clients/base-http-client.d.ts +0 -134
  173. package/http_clients/base-http-client.d.ts.map +0 -1
  174. package/http_clients/base-http-client.js +0 -33
  175. package/http_clients/base-http-client.js.map +0 -1
  176. package/http_clients/form-data-like.d.ts +0 -67
  177. package/http_clients/form-data-like.d.ts.map +0 -1
  178. package/http_clients/form-data-like.js +0 -5
  179. package/http_clients/form-data-like.js.map +0 -1
  180. package/http_clients/got-scraping-http-client.d.ts +0 -15
  181. package/http_clients/got-scraping-http-client.d.ts.map +0 -1
  182. package/http_clients/got-scraping-http-client.js +0 -69
  183. package/http_clients/got-scraping-http-client.js.map +0 -1
  184. package/http_clients/index.d.ts +0 -3
  185. package/http_clients/index.d.ts.map +0 -1
  186. package/http_clients/index.js +0 -3
  187. package/http_clients/index.js.map +0 -1
  188. package/session_pool/events.d.ts +0 -3
  189. package/session_pool/events.d.ts.map +0 -1
  190. package/session_pool/events.js +0 -3
  191. package/session_pool/events.js.map +0 -1
  192. package/storages/storage_manager.d.ts +0 -58
  193. package/storages/storage_manager.d.ts.map +0 -1
  194. package/storages/storage_manager.js +0 -105
  195. package/storages/storage_manager.js.map +0 -1
  196. package/tsconfig.build.tsbuildinfo +0 -1
@@ -1,27 +1,20 @@
1
- import type { DatasetClient, DatasetInfo, Dictionary, StorageClient } from '@crawlee/types';
1
+ import type { DatasetClient, DatasetInfo, Dictionary } from '@crawlee/types';
2
2
  import { Configuration } from '../configuration.js';
3
- import { type Log } from '../log.js';
3
+ import type { CrawleeLogger } from '../log.js';
4
4
  import type { Awaitable } from '../typedefs.js';
5
- import type { StorageManagerOptions } from './storage_manager.js';
5
+ import type { StorageIdentifier } from './storage_instance_manager.js';
6
+ import type { StorageOpenOptions } from './utils.js';
6
7
  /** @internal */
7
8
  export declare const DATASET_ITERATORS_DEFAULT_LIMIT = 10000;
8
9
  /**
9
- * Accepts a JSON serializable object as an input, validates its serializability,
10
- * and validates its serialized size against limitBytes. Optionally accepts its index
11
- * in an array to provide better error messages. Returns serialized object.
12
- * @ignore
13
- */
14
- export declare function checkAndSerialize<T>(item: T, limitBytes: number, index?: number): string;
15
- /**
16
- * Takes an array of JSONs (payloads) as input and produces an array of JSON strings
17
- * where each string is a JSON array of payloads with a maximum size of limitBytes per one
18
- * JSON array. Fits as many payloads as possible into a single JSON array and then moves
19
- * on to the next, preserving item order.
10
+ * Validates that the given value is a plain JSON-serializable object
11
+ * (not an array, not a primitive, not circular).
20
12
  *
21
- * The function assumes that none of the items is larger than limitBytes and does not validate.
13
+ * @param item The value to validate.
14
+ * @param index Optional index for error messages when validating inside an array.
22
15
  * @ignore
23
16
  */
24
- export declare function chunkBySize(items: string[], limitBytes: number): string[];
17
+ export declare function assertJsonSerializable<T>(item: T, index?: number): void;
25
18
  export interface DatasetDataOptions {
26
19
  /**
27
20
  * Number of array elements that should be skipped at the start.
@@ -67,6 +60,11 @@ export interface DatasetDataOptions {
67
60
  skipEmpty?: boolean;
68
61
  }
69
62
  export interface DatasetExportOptions extends Omit<DatasetDataOptions, 'offset' | 'limit'> {
63
+ /**
64
+ * If true, includes all unique keys from all dataset items in the CSV export header.
65
+ * If omitted or false, only keys from the first item are used.
66
+ */
67
+ collectAllKeys?: boolean;
70
68
  }
71
69
  export interface DatasetIteratorOptions extends Omit<DatasetDataOptions, 'offset' | 'limit' | 'clean' | 'skipHidden' | 'skipEmpty'> {
72
70
  /** @internal */
@@ -86,8 +84,8 @@ export interface DatasetIteratorOptions extends Omit<DatasetDataOptions, 'offset
86
84
  format?: string;
87
85
  }
88
86
  export interface DatasetExportToOptions extends DatasetExportOptions {
89
- fromDataset?: string;
90
- toKVS?: string;
87
+ fromDataset?: string | StorageIdentifier;
88
+ toKVS?: string | StorageIdentifier;
91
89
  }
92
90
  /**
93
91
  * The `Dataset` class represents a store for structured data where each object stored has the same attributes,
@@ -145,7 +143,7 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
145
143
  id: string;
146
144
  name?: string;
147
145
  client: DatasetClient<Data>;
148
- log: Log;
146
+ log: CrawleeLogger;
149
147
  /**
150
148
  * @internal
151
149
  */
@@ -158,21 +156,8 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
158
156
  * **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
159
157
  * otherwise the crawler process might finish before the data is stored!
160
158
  *
161
- * The size of the data is limited by the receiving API and therefore `pushData()` will only
162
- * allow objects whose JSON representation is smaller than 9MB. When an array is passed,
163
- * none of the included objects
164
- * may be larger than 9MB, but the array itself may be of any size.
165
- *
166
- * The function internally
167
- * chunks the array into separate items and pushes them sequentially.
168
- * The chunking process is stable (keeps order of data), but it does not provide a transaction
169
- * safety mechanism. Therefore, in the event of an uploading error (after several automatic retries),
170
- * the function's Promise will reject and the dataset will be left in a state where some of
171
- * the items have already been saved to the dataset while other items from the source array were not.
172
- * To overcome this limitation, the developer may, for example, read the last item saved in the dataset
173
- * and re-attempt the save of the data from this item onwards to prevent duplicates.
174
159
  * @param data Object or array of objects containing data to be stored in the default dataset.
175
- * The objects must be serializable to JSON and the JSON representation of each object must be smaller than 9MB.
160
+ * The objects must be serializable to JSON.
176
161
  */
177
162
  pushData(data: Data | Data[]): Promise<void>;
178
163
  /**
@@ -223,26 +208,21 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
223
208
  /**
224
209
  * Returns an object containing general information about the dataset.
225
210
  *
226
- * The function returns the same object as the Apify API Client's
227
- * [getDataset](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-datasets-getDataset)
228
- * function, which in turn calls the
229
- * [Get dataset](https://apify.com/docs/api/v2#/reference/datasets/dataset/get-dataset)
230
- * API endpoint.
231
- *
232
211
  * **Example:**
233
212
  * ```
234
213
  * {
235
214
  * id: "WkzbQMuFYuamGv3YF",
236
215
  * name: "my-dataset",
237
- * userId: "wRsJZtadYvn4mBZmm",
238
216
  * createdAt: new Date("2015-12-12T07:34:14.202Z"),
239
217
  * modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
240
218
  * accessedAt: new Date("2015-12-14T08:36:13.202Z"),
241
219
  * itemCount: 14,
242
220
  * }
243
221
  * ```
222
+ *
223
+ * @throws If the underlying storage no longer exists (e.g. it was deleted externally).
244
224
  */
245
- getInfo(): Promise<DatasetInfo | undefined>;
225
+ getInfo(): Promise<DatasetInfo>;
246
226
  /**
247
227
  * Iterates over dataset items, yielding each in turn to an `iteratee` function.
248
228
  * Each invocation of `iteratee` is called with two arguments: `(item, index)`.
@@ -326,6 +306,69 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
326
306
  * @param [options] An object containing extra options for `reduce()`
327
307
  */
328
308
  reduce<T>(iteratee: DatasetReducer<T, Data>, memo: T, options?: DatasetIteratorOptions): Promise<T>;
309
+ private fetchEntryPages;
310
+ private fetchPages;
311
+ /**
312
+ * Returns dataset items.
313
+ *
314
+ * When awaited (`await dataset.values()`), returns all items as a flat `Data[]` array.
315
+ * When used as an async iterable (`for await...of`), iterates over all items across pages
316
+ * without loading everything into memory at once.
317
+ *
318
+ * **Example usage:**
319
+ * ```javascript
320
+ * const dataset = await Dataset.open('my-results');
321
+ *
322
+ * // Iterate over all items (memory-efficient for large datasets)
323
+ * for await (const item of dataset.values()) {
324
+ * console.log(item);
325
+ * }
326
+ *
327
+ * // Or fetch all items at once
328
+ * const items = await dataset.values();
329
+ * console.log(items);
330
+ * ```
331
+ *
332
+ * @param options Options for the iteration.
333
+ */
334
+ values(options?: DatasetIteratorOptions): AsyncIterable<Data> & Promise<Data[]>;
335
+ /**
336
+ * Returns dataset entries (index-value pairs).
337
+ *
338
+ * When awaited (`await dataset.entries()`), returns all entries as a flat `[index, item][]` array.
339
+ * When used as an async iterable (`for await...of`), iterates over all entries across pages
340
+ * without loading everything into memory at once.
341
+ *
342
+ * **Example usage:**
343
+ * ```javascript
344
+ * const dataset = await Dataset.open('my-results');
345
+ *
346
+ * // Iterate over all entries
347
+ * for await (const [index, item] of dataset.entries()) {
348
+ * console.log(`Item at ${index}: ${JSON.stringify(item)}`);
349
+ * }
350
+ *
351
+ * // Or fetch all at once
352
+ * const entries = await dataset.entries();
353
+ * console.log(entries);
354
+ * ```
355
+ *
356
+ * @param options Options for the iteration.
357
+ */
358
+ entries(options?: DatasetIteratorOptions): AsyncIterable<[number, Data]> & Promise<[number, Data][]>;
359
+ /**
360
+ * Default async iterator for the dataset, iterating over items.
361
+ * Allows using the dataset directly in a `for await...of` loop.
362
+ *
363
+ * **Example usage:**
364
+ * ```javascript
365
+ * const dataset = await Dataset.open('my-results');
366
+ * for await (const item of dataset) {
367
+ * console.log(item);
368
+ * }
369
+ * ```
370
+ */
371
+ [Symbol.asyncIterator](): AsyncGenerator<Data, void, undefined>;
329
372
  /**
330
373
  * Removes the dataset either from the Apify cloud storage or from the local directory,
331
374
  * depending on the mode of operation.
@@ -340,12 +383,13 @@ export declare class Dataset<Data extends Dictionary = Dictionary> {
340
383
  *
341
384
  * For more details and code examples, see the {@link Dataset} class.
342
385
  *
343
- * @param [datasetIdOrName]
344
- * ID or name of the dataset to be opened. If `null` or `undefined`,
345
- * the function returns the default dataset associated with the crawler run.
386
+ * @param [identifier]
387
+ * ID or name of the dataset to be opened. If a string is provided, it will first be
388
+ * looked up as an ID; if no such storage exists, it will be treated as a name.
389
+ * If `null` or `undefined`, the function returns the default dataset associated with the crawler run.
346
390
  * @param [options] Storage manager options.
347
391
  */
348
- static open<Data extends Dictionary = Dictionary>(datasetIdOrName?: string | null, options?: StorageManagerOptions): Promise<Dataset<Data>>;
392
+ static open<Data extends Dictionary = Dictionary>(identifier?: string | StorageIdentifier | null, options?: StorageOpenOptions): Promise<Dataset<Data>>;
349
393
  /**
350
394
  * Stores an object or an array of objects to the default {@link Dataset} of the current crawler run.
351
395
  *
@@ -411,7 +455,7 @@ export interface DatasetReducer<T, Data> {
411
455
  export interface DatasetOptions {
412
456
  id: string;
413
457
  name?: string;
414
- client: StorageClient;
458
+ client: DatasetClient;
415
459
  }
416
460
  export interface DatasetContent<Data> {
417
461
  /** Total count of entries in the dataset. */
@@ -1 +1 @@
1
- {"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../../src/storages/dataset.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,WAAW,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAM5F,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,KAAK,GAAG,EAAO,MAAM,WAAW,CAAC;AAC1C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAGhD,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,sBAAsB,CAAC;AAIlE,gBAAgB;AAChB,eAAO,MAAM,+BAA+B,QAAQ,CAAC;AAIrD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,UAAU,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,CAsBxF;AAED;;;;;;;;GAQG;AACH,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,UAAU,EAAE,MAAM,GAAG,MAAM,EAAE,CA8BzE;AAED,MAAM,WAAW,kBAAkB;IAC/B;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,IAAI,CAAC,EAAE,OAAO,CAAC;IAEf;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAElB;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;IAErB;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,oBAAqB,SAAQ,IAAI,CAAC,kBAAkB,EAAE,QAAQ,GAAG,OAAO,CAAC;CAAG;AAE7F,MAAM,WAAW,sBACb,SAAQ,IAAI,CAAC,kBAAkB,EAAE,QAAQ,GAAG,OAAO,GAAG,OAAO,GAAG,YAAY,GAAG,WAAW,CAAC;IAC3F,gBAAgB;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf,gBAAgB;IAChB,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB,gBAAgB;IAChB,UAAU,CAAC,EAAE,OAAO,CAAC;IAErB,gBAAgB;IAChB,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,gBAAgB;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,sBAAuB,SAAQ,oBAAoB;IAChE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkDG;AACH,qBAAa,OAAO,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU;IAWjD,QAAQ,CAAC,MAAM;IAVnB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,CAAC,IAAI,CAAC,CAAC;IAC5B,GAAG,EAAE,GAAG,CAAoC;IAE5C;;OAEG;gBAEC,OAAO,EAAE,cAAc,EACd,MAAM,gBAAkC;IAOrD;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACG,QAAQ,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAwBlD;;OAEG;IACG,O
AAO,CAAC,OAAO,GAAE,kBAAuB,GAAG,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAgB9E;;;OAGG;IACG,MAAM,CAAC,OAAO,GAAE,oBAAyB,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAyBjE;;;;;;OAMG;IACG,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB,EAAE,WAAW,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IA0BpG;;;;;OAKG;IACG,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,aAAa,CAAC;IAIrF;;;;;OAKG;IACG,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,aAAa,CAAC;IAIpF;;;;;OAKG;WACU,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB;IAOvE;;;;;OAKG;WACU,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB;IAOtE;;;;;;;;;;;;;;;;;;;;;OAqBG;IACG,OAAO,IAAI,OAAO,CAAC,WAAW,GAAG,SAAS,CAAC;IAMjD;;;;;;;;;;;;;;;;;;;OAmBG;IACG,OAAO,CAAC,QAAQ,EAAE,eAAe,CAAC,IAAI,CAAC,EAAE,OAAO,GAAE,sBAA2B,EAAE,KAAK,SAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAqB9G;;;;;;;;OAQG;IACG,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,GAAE,sBAA2B,GAAG,OAAO,CAAC,CAAC,EAAE,CAAC;IAalG;;;;;;;;;;;;;;;;OAgBG;IACG,MAAM,CAAC,QAAQ,EAAE,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,GAAG,SAAS,CAAC;IAE7E;;;;;;;;;;;;;;;;;;OAkBG;IACG,MAAM,CACR,QAAQ,EAAE,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,EACpC,IAAI,EAAE,SAAS,EACf,OAAO,EAAE,sBAAsB,GAChC,OAAO,CAAC,IAAI,GAAG,SAAS,CAAC;IAE5B;;;;;;;;;;;;OAYG;IACG,MAAM,CAAC,CAAC,EAAE,QAAQ,EAAE,cAAc,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,OAAO,CAAC,EAAE,sBAAsB,GAAG,OAAO,CAAC,CAAC,CAAC;IAyBzG;;;OAGG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAQ3B;;;;;;;;;;;;;OAaG;WACU,IAAI,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EAClD,eAAe,CAAC,EAAE,MAAM,GAAG,IAAI,EAC/B,OAAO,GAAE,qBAA0B,GACpC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAsBzB;;;;;;;;;;;;;;;;;;;;;;;OAuBG;WACU,QAAQ,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EAAE,IAAI,EAAE,IAAI,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAK/F;;OAEG;WACU,OAAO,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EACrD,OAAO,GAAE,kBAAuB,GACjC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;CAInC;AAED;;GAEG;AACH,MAAM,WAAW,eAAe,CAAC,IAAI;IACjC;;;OAGG;IACH,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;CAC
hD;AAED;;GAEG;AACH,MAAM,WAAW,aAAa,CAAC,IAAI,EAAE,CAAC;IAClC;;;;OAIG;IACH,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,WAAW,cAAc,CAAC,CAAC,EAAE,IAAI;IACnC;;;;OAIG;IACH,CAAC,IAAI,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;CACtD;AAED,MAAM,WAAW,cAAc;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,CAAC;CACzB;AAED,MAAM,WAAW,cAAc,CAAC,IAAI;IAChC,6CAA6C;IAC7C,KAAK,EAAE,MAAM,CAAC;IACd,qDAAqD;IACrD,KAAK,EAAE,MAAM,CAAC;IACd,2DAA2D;IAC3D,MAAM,EAAE,MAAM,CAAC;IACf,mDAAmD;IACnD,KAAK,EAAE,MAAM,CAAC;IACd,wDAAwD;IACxD,KAAK,EAAE,IAAI,EAAE,CAAC;IACd,iDAAiD;IACjD,IAAI,CAAC,EAAE,OAAO,CAAC;CAClB"}
1
+ {"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../../src/storages/dataset.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,WAAW,EAAE,UAAU,EAAiB,MAAM,gBAAgB,CAAC;AAI5F,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAE/C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAGhD,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AACvE,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAIrD,gBAAgB;AAChB,eAAO,MAAM,+BAA+B,QAAQ,CAAC;AAErD;;;;;;;GAOG;AACH,wBAAgB,sBAAsB,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,IAAI,CAcvE;AAED,MAAM,WAAW,kBAAkB;IAC/B;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,IAAI,CAAC,EAAE,OAAO,CAAC;IAEf;;OAEG;IACH,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAElB;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;IAErB;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,oBAAqB,SAAQ,IAAI,CAAC,kBAAkB,EAAE,QAAQ,GAAG,OAAO,CAAC;IACtF;;;OAGG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED,MAAM,WAAW,sBAAuB,SAAQ,IAAI,CAChD,kBAAkB,EAClB,QAAQ,GAAG,OAAO,GAAG,OAAO,GAAG,YAAY,GAAG,WAAW,CAC5D;IACG,gBAAgB;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf,gBAAgB;IAChB,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB,gBAAgB;IAChB,UAAU,CAAC,EAAE,OAAO,CAAC;IAErB,gBAAgB;IAChB,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,gBAAgB;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,sBAAuB,SAAQ,oBAAoB;IAChE,WAAW,CAAC,EAAE,MAAM,GAAG,iBAAiB,CAAC;IACzC,KAAK,CAAC,EAAE,MAAM,GAAG,iBAAiB,CAAC;CACtC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkDG;AACH,qBAAa,OAAO,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU;IAWjD,QAAQ,CAAC,MAAM;IAVnB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,CAAC,IAAI,CAAC,CAAC;IAC5B,GAAG,EAAE,aAAa,CAAC;IAEnB;;OAEG;gBAEC,OAAO,EAAE,cAAc,EACd,MAAM,gBAAkC;IAQrD;;;;;;;;;;OAUG;IACG,QAAQ,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAclD;;OAEG;IACG,OAAO,CAAC,OAAO,GAAE,kBAAuB,G
AAG,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAgB9E;;;OAGG;IACG,MAAM,CAAC,OAAO,GAAE,oBAAyB,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAYjE;;;;;;OAMG;IACG,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB,EAAE,WAAW,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAiCpG;;;;;OAKG;IACG,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,aAAa,CAAC;IAIrF;;;;;OAKG;IACG,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,sBAAsB,EAAE,aAAa,CAAC;IAIpF;;;;;OAKG;WACU,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB;IAOvE;;;;;OAKG;WACU,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,sBAAsB;IAOtE;;;;;;;;;;;;;;;;OAgBG;IACG,OAAO,IAAI,OAAO,CAAC,WAAW,CAAC;IAMrC;;;;;;;;;;;;;;;;;;;OAmBG;IACG,OAAO,CAAC,QAAQ,EAAE,eAAe,CAAC,IAAI,CAAC,EAAE,OAAO,GAAE,sBAA2B,EAAE,KAAK,SAAI,GAAG,OAAO,CAAC,IAAI,CAAC;IAqB9G;;;;;;;;OAQG;IACG,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,GAAE,sBAA2B,GAAG,OAAO,CAAC,CAAC,EAAE,CAAC;IAalG;;;;;;;;;;;;;;;;OAgBG;IACG,MAAM,CAAC,QAAQ,EAAE,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,GAAG,SAAS,CAAC;IAE7E;;;;;;;;;;;;;;;;;;OAkBG;IACG,MAAM,CACR,QAAQ,EAAE,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,EACpC,IAAI,EAAE,SAAS,EACf,OAAO,EAAE,sBAAsB,GAChC,OAAO,CAAC,IAAI,GAAG,SAAS,CAAC;IAE5B;;;;;;;;;;;;OAYG;IACG,MAAM,CAAC,CAAC,EAAE,QAAQ,EAAE,cAAc,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,OAAO,CAAC,EAAE,sBAAsB,GAAG,OAAO,CAAC,CAAC,CAAC;YAyB1F,eAAe;YAUf,UAAU;IAqBzB;;;;;;;;;;;;;;;;;;;;;;OAsBG;IACH,MAAM,CAAC,OAAO,GAAE,sBAA2B,GAAG,aAAa,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IASnF;;;;;;;;;;;;;;;;;;;;;;OAsBG;IACH,OAAO,CAAC,OAAO,GAAE,sBAA2B,GAAG,aAAa,CAAC,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,MAAM,EAAE,IAAI,CAAC,EAAE,CAAC;IASxG;;;;;;;;;;;OAWG;IACI,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,SAAS,CAAC;IAItE;;;OAGG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAO3B;;;;;;;;;;;;;;OAcG;WACU,IAAI,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EAClD,UAAU,CAAC,EAAE,MAAM,GAAG,iBAAiB,GAAG,IAAI,EAC9C,OAAO,GAAE,kBAAuB,GACjC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IA0BzB;;;;;;;;;;;;;;;
;;;;;;;;OAuBG;WACU,QAAQ,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EAAE,IAAI,EAAE,IAAI,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAK/F;;OAEG;WACU,OAAO,CAAC,IAAI,SAAS,UAAU,GAAG,UAAU,EACrD,OAAO,GAAE,kBAAuB,GACjC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;CAInC;AAED;;GAEG;AACH,MAAM,WAAW,eAAe,CAAC,IAAI;IACjC;;;OAGG;IACH,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;CAChD;AAED;;GAEG;AACH,MAAM,WAAW,aAAa,CAAC,IAAI,EAAE,CAAC;IAClC;;;;OAIG;IACH,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,WAAW,cAAc,CAAC,CAAC,EAAE,IAAI;IACnC;;;;OAIG;IACH,CAAC,IAAI,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;CACtD;AAED,MAAM,WAAW,cAAc;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,CAAC;CACzB;AAED,MAAM,WAAW,cAAc,CAAC,IAAI;IAChC,6CAA6C;IAC7C,KAAK,EAAE,MAAM,CAAC;IACd,qDAAqD;IACrD,KAAK,EAAE,MAAM,CAAC;IACd,2DAA2D;IAC3D,MAAM,EAAE,MAAM,CAAC;IACf,mDAAmD;IACnD,KAAK,EAAE,MAAM,CAAC;IACd,wDAAwD;IACxD,KAAK,EAAE,IAAI,EAAE,CAAC;IACd,iDAAiD;IACjD,IAAI,CAAC,EAAE,OAAO,CAAC;CAClB"}
@@ -1,80 +1,34 @@
1
1
  import { stringify } from 'csv-stringify/sync';
2
2
  import ow from 'ow';
3
- import { MAX_PAYLOAD_SIZE_BYTES } from '@apify/consts';
4
3
  import { Configuration } from '../configuration.js';
5
- import { log } from '../log.js';
4
+ import { serviceLocator } from '../service_locator.js';
6
5
  import { checkStorageAccess } from './access_checking.js';
7
6
  import { KeyValueStore } from './key_value_store.js';
8
- import { StorageManager } from './storage_manager.js';
9
- import { purgeDefaultStorages } from './utils.js';
7
+ import { resolveStorageIdentifier } from './storage_instance_manager.js';
8
+ import { createDualIterable, purgeDefaultStorages } from './utils.js';
10
9
  /** @internal */
11
10
  export const DATASET_ITERATORS_DEFAULT_LIMIT = 10000;
12
- const SAFETY_BUFFER_PERCENT = 0.01 / 100; // 0.01%
13
11
  /**
14
- * Accepts a JSON serializable object as an input, validates its serializability,
15
- * and validates its serialized size against limitBytes. Optionally accepts its index
16
- * in an array to provide better error messages. Returns serialized object.
12
+ * Validates that the given value is a plain JSON-serializable object
13
+ * (not an array, not a primitive, not circular).
14
+ *
15
+ * @param item The value to validate.
16
+ * @param index Optional index for error messages when validating inside an array.
17
17
  * @ignore
18
18
  */
19
- export function checkAndSerialize(item, limitBytes, index) {
19
+ export function assertJsonSerializable(item, index) {
20
20
  const s = typeof index === 'number' ? ` at index ${index} ` : ' ';
21
21
  const isItemObject = item && typeof item === 'object' && !Array.isArray(item);
22
22
  if (!isItemObject) {
23
23
  throw new Error(`Data item${s}is not an object. You can push only objects into a dataset.`);
24
24
  }
25
- let payload;
26
25
  try {
27
- payload = JSON.stringify(item);
26
+ JSON.stringify(item);
28
27
  }
29
28
  catch (e) {
30
29
  const err = e;
31
30
  throw new Error(`Data item${s}is not serializable to JSON.\nCause: ${err.message}`);
32
31
  }
33
- const bytes = Buffer.byteLength(payload);
34
- if (bytes > limitBytes) {
35
- throw new Error(`Data item${s}is too large (size: ${bytes} bytes, limit: ${limitBytes} bytes)`);
36
- }
37
- return payload;
38
- }
39
- /**
40
- * Takes an array of JSONs (payloads) as input and produces an array of JSON strings
41
- * where each string is a JSON array of payloads with a maximum size of limitBytes per one
42
- * JSON array. Fits as many payloads as possible into a single JSON array and then moves
43
- * on to the next, preserving item order.
44
- *
45
- * The function assumes that none of the items is larger than limitBytes and does not validate.
46
- * @ignore
47
- */
48
- export function chunkBySize(items, limitBytes) {
49
- if (!items.length)
50
- return [];
51
- if (items.length === 1)
52
- return items;
53
- // Split payloads into buckets of valid size.
54
- let lastChunkBytes = 2; // Add 2 bytes for [] wrapper.
55
- const chunks = [];
56
- for (const payload of items) {
57
- const bytes = Buffer.byteLength(payload);
58
- if (bytes <= limitBytes && bytes + 2 > limitBytes) {
59
- // Handle cases where wrapping with [] would fail, but solo object is fine.
60
- chunks.push(payload);
61
- lastChunkBytes = bytes;
62
- }
63
- else if (lastChunkBytes + bytes <= limitBytes) {
64
- // ensure array
65
- if (!Array.isArray(chunks[chunks.length - 1])) {
66
- chunks.push([]);
67
- }
68
- chunks[chunks.length - 1].push(payload);
69
- lastChunkBytes += bytes + 1; // Add 1 byte for ',' separator.
70
- }
71
- else {
72
- chunks.push([payload]);
73
- lastChunkBytes = bytes + 2; // Add 2 bytes for [] wrapper.
74
- }
75
- }
76
- // Stringify array chunks.
77
- return chunks.map((chunk) => (typeof chunk === 'string' ? chunk : `[${chunk.join(',')}]`));
78
32
  }
79
33
  /**
80
34
  * The `Dataset` class represents a store for structured data where each object stored has the same attributes,
@@ -132,7 +86,7 @@ export class Dataset {
132
86
  id;
133
87
  name;
134
88
  client;
135
- log = log.child({ prefix: 'Dataset' });
89
+ log;
136
90
  /**
137
91
  * @internal
138
92
  */
@@ -140,7 +94,8 @@ export class Dataset {
140
94
  this.config = config;
141
95
  this.id = options.id;
142
96
  this.name = options.name;
143
- this.client = options.client.dataset(this.id);
97
+ this.client = options.client;
98
+ this.log = serviceLocator.getLogger().child({ prefix: 'Dataset' });
144
99
  }
145
100
  /**
146
101
  * Stores an object or an array of objects to the dataset.
@@ -150,40 +105,18 @@ export class Dataset {
150
105
  * **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
151
106
  * otherwise the crawler process might finish before the data is stored!
152
107
  *
153
- * The size of the data is limited by the receiving API and therefore `pushData()` will only
154
- * allow objects whose JSON representation is smaller than 9MB. When an array is passed,
155
- * none of the included objects
156
- * may be larger than 9MB, but the array itself may be of any size.
157
- *
158
- * The function internally
159
- * chunks the array into separate items and pushes them sequentially.
160
- * The chunking process is stable (keeps order of data), but it does not provide a transaction
161
- * safety mechanism. Therefore, in the event of an uploading error (after several automatic retries),
162
- * the function's Promise will reject and the dataset will be left in a state where some of
163
- * the items have already been saved to the dataset while other items from the source array were not.
164
- * To overcome this limitation, the developer may, for example, read the last item saved in the dataset
165
- * and re-attempt the save of the data from this item onwards to prevent duplicates.
166
108
  * @param data Object or array of objects containing data to be stored in the default dataset.
167
- * The objects must be serializable to JSON and the JSON representation of each object must be smaller than 9MB.
109
+ * The objects must be serializable to JSON.
168
110
  */
169
111
  async pushData(data) {
170
112
  checkStorageAccess();
171
113
  ow(data, 'data', ow.object);
172
- const dispatch = async (payload) => this.client.pushItems(payload);
173
- const limit = MAX_PAYLOAD_SIZE_BYTES - Math.ceil(MAX_PAYLOAD_SIZE_BYTES * SAFETY_BUFFER_PERCENT);
174
- // Handle singular Objects
175
- if (!Array.isArray(data)) {
176
- const payload = checkAndSerialize(data, limit);
177
- await dispatch(payload);
178
- return;
179
- }
180
- // Handle Arrays
181
- const payloads = data.map((item, index) => checkAndSerialize(item, limit, index));
182
- const chunks = chunkBySize(payloads, limit);
183
- // Invoke client in series to preserve order of data
184
- for (const chunk of chunks) {
185
- await dispatch(chunk);
114
+ // Normalize to array and validate each item
115
+ const items = Array.isArray(data) ? data : [data];
116
+ for (let i = 0; i < items.length; i++) {
117
+ assertJsonSerializable(items[i], i);
186
118
  }
119
+ await this.client.pushData(items);
187
120
  }
188
121
  /**
189
122
  * Returns {@link DatasetContent} object holding the items in the dataset based on the provided parameters.
@@ -191,7 +124,7 @@ export class Dataset {
191
124
  async getData(options = {}) {
192
125
  checkStorageAccess();
193
126
  try {
194
- return await this.client.listItems(options);
127
+ return await this.client.getData(options);
195
128
  }
196
129
  catch (e) {
197
130
  const error = e;
@@ -208,18 +141,9 @@ export class Dataset {
208
141
  async export(options = {}) {
209
142
  checkStorageAccess();
210
143
  const items = [];
211
- const fetchNextChunk = async (offset = 0) => {
212
- const limit = 1000;
213
- const value = await this.client.listItems({ offset, limit, ...options });
214
- if (value.count === 0) {
215
- return;
216
- }
217
- items.push(...value.items);
218
- if (value.total > offset + value.count) {
219
- await fetchNextChunk(offset + value.count);
220
- }
221
- };
222
- await fetchNextChunk();
144
+ for await (const page of this.fetchPages(options)) {
145
+ items.push(...page.items);
146
+ }
223
147
  return items;
224
148
  }
225
149
  /**
@@ -233,7 +157,14 @@ export class Dataset {
233
157
  const kvStore = await KeyValueStore.open(options?.toKVS ?? null, { config: this.config });
234
158
  const items = await this.export(options);
235
159
  if (contentType === 'text/csv') {
236
- const keys = Object.keys(items[0]);
160
+ // To handle empty dataset exports gracefully.
161
+ if (items.length === 0) {
162
+ await kvStore.setValue(key, '', { contentType });
163
+ return items;
164
+ }
165
+ const keys = options?.collectAllKeys
166
+ ? Array.from(new Set(items.flatMap(Object.keys)))
167
+ : Object.keys(items[0]);
237
168
  const value = stringify([
238
169
  keys,
239
170
  ...items.map((item) => {
@@ -248,7 +179,6 @@ export class Dataset {
248
179
  return items;
249
180
  }
250
181
  throw new Error(`Unsupported content type: ${contentType}`);
251
- return items;
252
182
  }
253
183
  /**
254
184
  * Save entire default dataset's contents into one JSON file within a key-value store.
@@ -293,28 +223,23 @@ export class Dataset {
293
223
  /**
294
224
  * Returns an object containing general information about the dataset.
295
225
  *
296
- * The function returns the same object as the Apify API Client's
297
- * [getDataset](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-datasets-getDataset)
298
- * function, which in turn calls the
299
- * [Get dataset](https://apify.com/docs/api/v2#/reference/datasets/dataset/get-dataset)
300
- * API endpoint.
301
- *
302
226
  * **Example:**
303
227
  * ```
304
228
  * {
305
229
  * id: "WkzbQMuFYuamGv3YF",
306
230
  * name: "my-dataset",
307
- * userId: "wRsJZtadYvn4mBZmm",
308
231
  * createdAt: new Date("2015-12-12T07:34:14.202Z"),
309
232
  * modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
310
233
  * accessedAt: new Date("2015-12-14T08:36:13.202Z"),
311
234
  * itemCount: 14,
312
235
  * }
313
236
  * ```
237
+ *
238
+ * @throws If the underlying storage no longer exists (e.g. it was deleted externally).
314
239
  */
315
240
  async getInfo() {
316
241
  checkStorageAccess();
317
- return this.client.get();
242
+ return this.client.getMetadata();
318
243
  }
319
244
  /**
320
245
  * Iterates over dataset items, yielding each in turn to an `iteratee` function.
@@ -388,15 +313,114 @@ export class Dataset {
388
313
  await this.forEach(wrappedFunc, options);
389
314
  return currentMemo;
390
315
  }
316
+ async *fetchEntryPages(options) {
317
+ let index = options.offset ?? 0;
318
+ for await (const page of this.fetchPages(options)) {
319
+ yield {
320
+ ...page,
321
+ items: page.items.map((item) => [index++, item]),
322
+ };
323
+ }
324
+ }
325
+ async *fetchPages(options, pageSize = DATASET_ITERATORS_DEFAULT_LIMIT) {
326
+ let offset = options.offset ?? 0;
327
+ const totalLimit = options.limit;
328
+ let yielded = 0;
329
+ while (true) {
330
+ const fetchLimit = totalLimit !== undefined ? Math.min(pageSize, totalLimit - yielded) : pageSize;
331
+ if (fetchLimit <= 0)
332
+ break;
333
+ const page = await this.client.getData({ ...options, offset, limit: fetchLimit });
334
+ yield page;
335
+ yielded += page.items.length;
336
+ if (page.items.length < fetchLimit || offset + page.items.length >= page.total)
337
+ break;
338
+ offset += page.items.length;
339
+ }
340
+ }
341
+ /**
342
+ * Returns dataset items.
343
+ *
344
+ * When awaited (`await dataset.values()`), returns all items as a flat `Data[]` array.
345
+ * When used as an async iterable (`for await...of`), iterates over all items across pages
346
+ * without loading everything into memory at once.
347
+ *
348
+ * **Example usage:**
349
+ * ```javascript
350
+ * const dataset = await Dataset.open('my-results');
351
+ *
352
+ * // Iterate over all items (memory-efficient for large datasets)
353
+ * for await (const item of dataset.values()) {
354
+ * console.log(item);
355
+ * }
356
+ *
357
+ * // Or fetch all items at once
358
+ * const items = await dataset.values();
359
+ * console.log(items);
360
+ * ```
361
+ *
362
+ * @param options Options for the iteration.
363
+ */
364
+ values(options = {}) {
365
+ checkStorageAccess();
366
+ return createDualIterable({
367
+ createPages: () => this.fetchPages(options),
368
+ extractItems: (page) => page.items,
369
+ });
370
+ }
371
+ /**
372
+ * Returns dataset entries (index-value pairs).
373
+ *
374
+ * When awaited (`await dataset.entries()`), returns all entries as a flat `[index, item][]` array.
375
+ * When used as an async iterable (`for await...of`), iterates over all entries across pages
376
+ * without loading everything into memory at once.
377
+ *
378
+ * **Example usage:**
379
+ * ```javascript
380
+ * const dataset = await Dataset.open('my-results');
381
+ *
382
+ * // Iterate over all entries
383
+ * for await (const [index, item] of dataset.entries()) {
384
+ * console.log(`Item at ${index}: ${JSON.stringify(item)}`);
385
+ * }
386
+ *
387
+ * // Or fetch all at once
388
+ * const entries = await dataset.entries();
389
+ * console.log(entries);
390
+ * ```
391
+ *
392
+ * @param options Options for the iteration.
393
+ */
394
+ entries(options = {}) {
395
+ checkStorageAccess();
396
+ return createDualIterable({
397
+ createPages: () => this.fetchEntryPages(options),
398
+ extractItems: (page) => page.items,
399
+ });
400
+ }
401
+ /**
402
+ * Default async iterator for the dataset, iterating over items.
403
+ * Allows using the dataset directly in a `for await...of` loop.
404
+ *
405
+ * **Example usage:**
406
+ * ```javascript
407
+ * const dataset = await Dataset.open('my-results');
408
+ * for await (const item of dataset) {
409
+ * console.log(item);
410
+ * }
411
+ * ```
412
+ */
413
+ async *[Symbol.asyncIterator]() {
414
+ yield* this.values();
415
+ }
391
416
  /**
392
417
  * Removes the dataset either from the Apify cloud storage or from the local directory,
393
418
  * depending on the mode of operation.
394
419
  */
395
420
  async drop() {
396
421
  checkStorageAccess();
397
- await this.client.delete();
398
- const manager = StorageManager.getManager(Dataset, this.config);
399
- manager.closeStorage(this);
422
+ await this.client.drop();
423
+ serviceLocator.getStorageInstanceManager().removeFromCache(this);
400
424
  }
401
425
  /**
402
426
  * Opens a dataset and returns a promise resolving to an instance of the {@link Dataset} class.
@@ -407,23 +431,27 @@ export class Dataset {
407
431
  *
408
432
  * For more details and code examples, see the {@link Dataset} class.
409
433
  *
410
- * @param [datasetIdOrName]
411
- * ID or name of the dataset to be opened. If `null` or `undefined`,
412
- * the function returns the default dataset associated with the crawler run.
434
+ * @param [identifier]
435
+ * ID or name of the dataset to be opened. If a string is provided, it will first be
436
+ * looked up as an ID; if no such storage exists, it will be treated as a name.
437
+ * If `null` or `undefined`, the function returns the default dataset associated with the crawler run.
413
438
  * @param [options] Storage manager options.
414
439
  */
415
- static async open(datasetIdOrName, options = {}) {
440
+ static async open(identifier, options = {}) {
416
441
  checkStorageAccess();
417
- ow(datasetIdOrName, ow.optional.string);
418
442
  ow(options, ow.object.exactShape({
419
443
  config: ow.optional.object.instanceOf(Configuration),
420
444
  storageClient: ow.optional.object,
421
445
  }));
422
446
  options.config ??= Configuration.getGlobalConfig();
423
- options.storageClient ??= options.config.getStorageClient();
424
- await purgeDefaultStorages({ onlyPurgeOnce: true, client: options.storageClient, config: options.config });
425
- const manager = StorageManager.getManager(this, options.config);
426
- return manager.openStorage(datasetIdOrName, options.storageClient);
447
+ const client = options.storageClient ?? serviceLocator.getStorageClient();
448
+ await purgeDefaultStorages({ onlyPurgeOnce: true, client, config: options.config });
449
+ const resolved = await resolveStorageIdentifier(identifier, client, 'Dataset');
450
+ return serviceLocator.getStorageInstanceManager().openStorage(this, {
451
+ ...resolved,
452
+ clientOpener: () => client.createDatasetClient(resolved),
453
+ clientCacheKey: client.getStorageClientCacheKey?.() ?? client.constructor.name,
454
+ });
427
455
  }
428
456
  /**
429
457
  * Stores an object or an array of objects to the default {@link Dataset} of the current crawler run.