crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
package/dist/cjs/lib/integrations/types.d.ts
@@ -0,0 +1,274 @@
+ import type { ExitOptions } from 'apify';
+ import type { Request as CrawleeRequest, CrawlingContext, DatasetDataOptions, Log, ProxyConfiguration, RequestOptions } from 'crawlee';
+ import type { Page } from 'playwright';
+ import type { MaybeArray, MaybePromise, PickRequired } from '../../utils/types';
+ export type UnwrapCrawleeOneIO<T extends CrawleeOneIO<any, any, any>> = {
+ env: T extends CrawleeOneIO<infer U, any, any> ? U : never;
+ report: T extends CrawleeOneIO<any, infer U, any> ? U : never;
+ metadata: T extends CrawleeOneIO<any, any, infer U> ? U : never;
+ };
+ /**
+ * Interface for storing and retrieving:
+ * - Scraped data
+ * - Requests (URLs) to scrape
+ * - Cache data
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneIO<TEnv extends object = object, TReport extends object = object, TMetadata extends object = object> {
+ /**
+ * Opens a dataset and returns a promise resolving to an instance of the {@link CrawleeOneDataset}.
+ *
+ * Datasets are used to store structured data where each object stored has the same attributes,
+ * such as online store products or real estate offers. The actual data is stored either on
+ * the local filesystem or in the cloud.
+ */
+ openDataset: (id?: string | null) => MaybePromise<CrawleeOneDataset>;
+ /**
+ * Opens a request queue and returns a promise resolving to an instance of the {@link CrawleeOneRequestQueue}.
+ *
+ * RequestQueue represents a queue of URLs to crawl, which is stored either on local filesystem
+ * or in the cloud. The queue is used for deep crawling of websites, where you start with several
+ * URLs and then recursively follow links to other pages. The data structure supports both
+ * breadth-first and depth-first crawling orders.
+ */
+ openRequestQueue: (id?: string | null) => MaybePromise<CrawleeOneRequestQueue>;
+ /**
+ * Opens a key-value store and returns a promise resolving to an instance of the {@link CrawleeOneKeyValueStore}.
+ *
+ * Key-value stores are used to store records or files, along with their MIME content type.
+ * The records are stored and retrieved using a unique key. The actual data is stored
+ * either on a local filesystem or in the cloud.
+ */
+ openKeyValueStore: (id?: string | null) => MaybePromise<CrawleeOneKeyValueStore>;
+ /**
+ * Returns an object which contains information parsed from relevant environment variables.
+ */
+ getEnv: () => MaybePromise<TEnv>;
+ /**
+ * Returns a promise of an object with the crawler input. E.g. In Apify, retrieves the actor input value from
+ * the default {@link KeyValueStore} associated with the current actor run.
+ */
+ getInput: <Input extends object>() => Promise<Input | null>;
+ /**
+ * Equivalent of {@link Actor.metamorph}.
+ *
+ * This function should:
+ * 1. Start a crawler/actor by its ID,
+ * 2. Pass the given input into downsteam crawler.
+ * 3. Make the same storage available to the downstream crawler. AKA, the downstream crawler
+ * should use the same "default" storage as is the current "default" storage.
+ *
+ * Read more about {@link Actor.metamorph}:
+ *
+ * `Actor.metamorph` transforms this actor run to an actor run of a given actor. The system
+ * stops the current container and starts the new container instead. All the default storages
+ * are preserved and the new input is stored under the INPUT-METAMORPH-1 key in the same
+ * default key-value store.
+ */
+ triggerDownstreamCrawler: <TInput extends object>(
+ /** ID of the crawler/actor to which should be triggered. */
+ targetActorId: string,
+ /** Input for the crawler/actor. Must be JSON-serializable (it will be stringified to JSON). */
+ input?: TInput, options?: {
+ /**
+ * Tag or number of the target build to metamorph into (e.g. `beta` or `1.2.345`).
+ * If not provided, the run uses build tag or number from the default actor run configuration (typically `latest`).
+ */
+ build?: string;
+ }) => Promise<void>;
+ /**
+ * Equivalent of {@link Actor.main}.
+ *
+ * Runs the main user function that performs the job of the actor
+ * and terminates the process when the user function finishes.
+ *
+ * **The `Actor.main()` function is optional** and is provided merely for your convenience.
+ * It is mainly useful when you're running your code as an actor on the [Apify platform](https://apify.com/actors).
+ * However, if you want to use Apify SDK tools directly inside your existing projects, e.g.
+ * running in an [Express](https://expressjs.com/) server, on
+ * [Google Cloud functions](https://cloud.google.com/functions)
+ * or [AWS Lambda](https://aws.amazon.com/lambda/), it's better to avoid
+ * it since the function terminates the main process when it finishes!
+ *
+ * The `Actor.main()` function performs the following actions:
+ *
+ * - When running on the Apify platform (i.e. `APIFY_IS_AT_HOME` environment variable is set),
+ * it sets up a connection to listen for platform events.
+ * For example, to get a notification about an imminent migration to another server.
+ * See {@apilink Actor.events} for details.
+ * - It checks that either `APIFY_TOKEN` or `APIFY_LOCAL_STORAGE_DIR` environment variable
+ * is defined. If not, the functions sets `APIFY_LOCAL_STORAGE_DIR` to `./apify_storage`
+ * inside the current working directory. This is to simplify running code examples.
+ * - It invokes the user function passed as the `userFunc` parameter.
+ * - If the user function returned a promise, waits for it to resolve.
+ * - If the user function throws an exception or some other error is encountered,
+ * prints error details to console so that they are stored to the log.
+ * - Exits the Node.js process, with zero exit code on success and non-zero on errors.
+ */
+ runInContext: (userFunc: () => MaybePromise<unknown>, options?: ExitOptions) => Promise<void>;
+ /**
+ * Creates a proxy configuration and returns a promise resolving to an instance of
+ * {@link ProxyConfiguration} that is already initialized.
+ *
+ * Configures connection to a proxy server with the provided options. Proxy servers are used
+ * to prevent target websites from blocking your crawlers based on IP address rate limits or
+ * blacklists. Setting proxy configuration in your crawlers automatically configures them to
+ * use the selected proxies for all connections.
+ *
+ * For more details and code examples, see {@link ProxyConfiguration}.
+ */
+ createDefaultProxyConfiguration: <T extends object>(input: T | Readonly<T> | undefined) => MaybePromise<ProxyConfiguration | undefined>;
+ isTelemetryEnabled: () => MaybePromise<boolean>;
+ /** Generate object with info on current context, which will be send to the error Dataset */
+ generateErrorReport: (input: CrawleeOneErrorHandlerInput, options: PickRequired<CrawleeOneErrorHandlerOptions<TEnv, TReport>, 'io'>) => MaybePromise<TReport>;
+ /** Generate object with info on current context, which will be appended to the scraped entry */
+ generateEntryMetadata: <Ctx extends CrawlingContext>(ctx: Ctx) => MaybePromise<TMetadata>;
+ }
+ /**
+ * Interface for storing and retrieving data in/from Dataset
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneDataset<T extends object = object> {
+ /**
+ * Stores an object or an array of objects to the dataset. The function returns a promise
+ * that resolves when the operation finishes. It has no result, but throws on invalid args
+ * or other errors.
+ */
+ pushData: (
+ /**
+ * Object or array of objects containing data to be stored in the default dataset.
+ * The objects must be serializable to JSON and the JSON representation of each object
+ * must be smaller than 9MB.
+ */
+ data: MaybeArray<T>) => MaybePromise<void>;
+ /** Returns the items in the dataset based on the provided parameters. */
+ getItems: (options?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc' | 'fields'>) => MaybePromise<T[]>;
+ /** Returns the count of items in the dataset. */
+ getItemCount: () => MaybePromise<number | null>;
+ }
+ /**
+ * Interface for storing and retrieving data in/from KeyValueStore.
+ *
+ * KeyValueStore is a cache / map structure, where entries are retrieved and saved
+ * under keys.
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneKeyValueStore {
+ /**
+ * Saves or deletes a record in the key-value store. The function returns a promise that
+ * resolves once the record has been saved or deleted.
+ *
+ * If value is null, the record is deleted instead. Note that the setValue() function
+ * succeeds regardless whether the record existed or not.
+ *
+ * Beware that the key can be at most 256 characters long and only contain the following
+ * characters: a-zA-Z0-9!-_.'()
+ *
+ * To retrieve a value from the key-value store, use the {@link CrawleeOneKeyValueStore.getValue}
+ * function.
+ */
+ setValue: (key: string, value: any, options?: {
+ /** Specifies a custom MIME content type of the record. */
+ contentType?: string;
+ }) => MaybePromise<void>;
+ /**
+ * Removes the key-value store either from the cloud storage or from the local directory,
+ * depending on the mode of operation.
+ */
+ drop: () => MaybePromise<void>;
+ }
+ /**
+ * Interface for storing and retrieving Requests (URLs) to scrape
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneRequestQueue {
+ /**
+ * Adds requests to the queue.
+ *
+ * If a request that is passed in is already present due to its uniqueKey property
+ * being the same, it will not be updated.
+ */
+ addRequests: (
+ /** Objects with request data. */
+ requestsLike: (CrawleeRequest | RequestOptions)[], options?: {
+ /**
+ * If set to true, the request will be added to the foremost position in the queue,
+ * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.
+ *
+ * By default, it's put to the end of the queue.
+ */
+ forefront?: boolean;
+ }) => MaybePromise<unknown>;
+ /**
+ * Marks a request that was previously returned by the
+ * {@link CrawleeOneRequestQueue.fetchNextRequest} function as handled after successful
+ * processing. Handled requests will never again be returned by the fetchNextRequest function.
+ */
+ markRequestHandled: (req: CrawleeRequest) => MaybePromise<unknown>;
+ /**
+ * Returns a next request in the queue to be processed, or null if there are no more
+ * pending requests.
+ *
+ * Once you successfully finish processing of the request, you need to call
+ * {@link CrawleeOneRequestQueue.markRequestHandled} to mark the request as handled
+ * in the queue. If there was some error in processing the request, call
+ * {@link CrawleeOneRequestQueue.reclaimRequest} instead, so that the queue will
+ * give the request to some other consumer in another call to the fetchNextRequest function.
+ *
+ * Note that the null return value doesn't mean the queue processing finished,
+ * it means there are currently no pending requests. To check whether all requests in queue
+ * were finished, use {@link CrawleeOneRequestQueue.isFinished} instead.
+ *
+ * @returns — Returns the request object or null if there are no more pending requests.
+ */
+ fetchNextRequest: () => MaybePromise<CrawleeRequest | null>;
+ /**
+ * Reclaims a failed request back to the queue, so that it can be returned
+ * for processing later again by another call to {@link CrawleeOneRequestQueue.fetchNextRequest}.
+ */
+ reclaimRequest: (req: CrawleeRequest, options?: {
+ /**
+ * If set to true, the request will be placed to the beginning of the queue,
+ * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.
+ *
+ * By default, it's put to the end of the queue.
+ */
+ forefront?: boolean;
+ }) => MaybePromise<unknown>;
+ /**
+ * Resolves to true if all requests were already handled and there are no more left. Due to the nature
+ * of distributed storage used by the queue, the function might occasionally return a false negative.
+ */
+ isFinished: () => MaybePromise<boolean>;
+ /** Removes the queue from the storage. */
+ drop: () => MaybePromise<void>;
+ /** Returns the number of handled requests. */
+ handledCount: () => MaybePromise<number | null>;
+ }
+ /** Input passed to the error handler */
+ export interface CrawleeOneErrorHandlerInput {
+ error: Error;
+ /** Page instance if we used PlaywrightCrawler */
+ page: Page | null;
+ /** URL where the error happened. If not given URL is taken from the Page object */
+ url: string | null;
+ log: Log | null;
+ }
+ /** User-configurable options passed to the error handler */
+ export interface CrawleeOneErrorHandlerOptions<TEnv extends object = object, TReport extends object = object> {
+ io?: CrawleeOneIO<TEnv, TReport>;
+ allowScreenshot?: boolean;
+ reportingDatasetId?: string;
+ onErrorCapture?: (input: {
+ error: Error;
+ report: TReport;
+ }) => MaybePromise<void>;
+ }
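The point of defining these interfaces separately from Crawlee/Apify is that any storage backend can be plugged in. As a rough illustration of the `CrawleeOneDataset` contract above, here is a minimal in-memory sketch; the `createInMemoryDataset` name and the import from the package root are assumptions, not part of the package:

```ts
import type { CrawleeOneDataset } from 'crawlee-one'; // assumed re-export path

/** Hypothetical in-memory stand-in for a Dataset, e.g. for unit tests. */
const createInMemoryDataset = <T extends object>(): CrawleeOneDataset<T> => {
  const items: T[] = [];
  return {
    // Accepts a single entry or an array, mirroring MaybeArray<T>
    pushData: (data) => {
      items.push(...(Array.isArray(data) ? data : [data]));
    },
    // Supports only the offset/limit/desc subset of DatasetDataOptions here;
    // the `fields` option is ignored in this sketch
    getItems: (options) => {
      const { offset = 0, limit = items.length, desc } = options ?? {};
      const ordered = desc ? [...items].reverse() : items;
      return ordered.slice(offset, offset + limit);
    },
    getItemCount: () => items.length,
  };
};
```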
package/dist/cjs/lib/integrations/types.js
@@ -0,0 +1,3 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ //# sourceMappingURL=types.js.map
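The `CrawleeOneRequestQueue` contract declared in types.d.ts above implies a fetch → process → mark-handled-or-reclaim loop. A minimal consumer sketch, under the assumption that the type is re-exported from the package root (the `consumeQueue` helper is hypothetical):

```ts
import type { CrawleeOneRequestQueue } from 'crawlee-one'; // assumed re-export path

// Drain a queue using only the interface above: fetch a request, process it,
// then either mark it handled or reclaim it so another consumer can retry it.
const consumeQueue = async (
  queue: CrawleeOneRequestQueue,
  processOne: (url: string) => Promise<void>,
) => {
  while (!(await queue.isFinished())) {
    const request = await queue.fetchNextRequest();
    // null only means "nothing pending right now", not "queue is finished"
    if (!request) continue;
    try {
      await processOne(request.url);
      await queue.markRequestHandled(request);
    } catch (err) {
      // Failed requests go back into the queue for a later attempt
      await queue.reclaimRequest(request);
    }
  }
};
```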
package/dist/cjs/lib/integrations/types.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/integrations/types.ts"],"names":[],"mappings":"","sourcesContent":["import type { Actor, ExitOptions } from 'apify';\nimport type {\n Request as CrawleeRequest,\n CrawlingContext,\n DatasetDataOptions,\n Log,\n ProxyConfiguration,\n RequestOptions,\n} from 'crawlee';\nimport type { Page } from 'playwright';\n\nimport type { MaybeArray, MaybePromise, PickRequired } from '../../utils/types';\n\nexport type UnwrapCrawleeOneIO<T extends CrawleeOneIO<any, any, any>> = {\n env: T extends CrawleeOneIO<infer U, any, any> ? U : never;\n report: T extends CrawleeOneIO<any, infer U, any> ? U : never;\n metadata: T extends CrawleeOneIO<any, any, infer U> ? U : never;\n};\n\n/**\n * Interface for storing and retrieving:\n * - Scraped data\n * - Requests (URLs) to scrape\n * - Cache data\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneIO<\n TEnv extends object = object,\n TReport extends object = object,\n TMetadata extends object = object\n> {\n /**\n * Opens a dataset and returns a promise resolving to an instance of the {@link CrawleeOneDataset}.\n *\n * Datasets are used to store structured data where each object stored has the same attributes,\n * such as online store products or real estate offers. The actual data is stored either on\n * the local filesystem or in the cloud.\n */\n openDataset: (id?: string | null) => MaybePromise<CrawleeOneDataset>;\n /**\n * Opens a request queue and returns a promise resolving to an instance of the {@link CrawleeOneRequestQueue}.\n *\n * RequestQueue represents a queue of URLs to crawl, which is stored either on local filesystem\n * or in the cloud. The queue is used for deep crawling of websites, where you start with several\n * URLs and then recursively follow links to other pages. The data structure supports both\n * breadth-first and depth-first crawling orders.\n */\n openRequestQueue: (id?: string | null) => MaybePromise<CrawleeOneRequestQueue>;\n /**\n * Opens a key-value store and returns a promise resolving to an instance of the {@link CrawleeOneKeyValueStore}.\n *\n * Key-value stores are used to store records or files, along with their MIME content type.\n * The records are stored and retrieved using a unique key. The actual data is stored\n * either on a local filesystem or in the cloud.\n */\n openKeyValueStore: (id?: string | null) => MaybePromise<CrawleeOneKeyValueStore>;\n /**\n * Returns an object which contains information parsed from relevant environment variables.\n */\n getEnv: () => MaybePromise<TEnv>;\n /**\n * Returns a promise of an object with the crawler input. E.g. In Apify, retrieves the actor input value from\n * the default {@link KeyValueStore} associated with the current actor run.\n */\n getInput: <Input extends object>() => Promise<Input | null>;\n /**\n * Equivalent of {@link Actor.metamorph}.\n *\n * This function should:\n * 1. Start a crawler/actor by its ID,\n * 2. Pass the given input into downsteam crawler.\n * 3. Make the same storage available to the downstream crawler. AKA, the downstream crawler\n * should use the same \"default\" storage as is the current \"default\" storage.\n *\n * Read more about {@link Actor.metamorph}:\n *\n * `Actor.metamorph` transforms this actor run to an actor run of a given actor. The system\n * stops the current container and starts the new container instead. 
All the default storages\n * are preserved and the new input is stored under the INPUT-METAMORPH-1 key in the same\n * default key-value store.\n */\n triggerDownstreamCrawler: <TInput extends object>(\n /** ID of the crawler/actor to which should be triggered. */\n targetActorId: string,\n /** Input for the crawler/actor. Must be JSON-serializable (it will be stringified to JSON). */\n input?: TInput,\n options?: {\n /**\n * Tag or number of the target build to metamorph into (e.g. `beta` or `1.2.345`).\n * If not provided, the run uses build tag or number from the default actor run configuration (typically `latest`).\n */\n build?: string;\n }\n ) => Promise<void>;\n /**\n * Equivalent of {@link Actor.main}.\n *\n * Runs the main user function that performs the job of the actor\n * and terminates the process when the user function finishes.\n *\n * **The `Actor.main()` function is optional** and is provided merely for your convenience.\n * It is mainly useful when you're running your code as an actor on the [Apify platform](https://apify.com/actors).\n * However, if you want to use Apify SDK tools directly inside your existing projects, e.g.\n * running in an [Express](https://expressjs.com/) server, on\n * [Google Cloud functions](https://cloud.google.com/functions)\n * or [AWS Lambda](https://aws.amazon.com/lambda/), it's better to avoid\n * it since the function terminates the main process when it finishes!\n *\n * The `Actor.main()` function performs the following actions:\n *\n * - When running on the Apify platform (i.e. `APIFY_IS_AT_HOME` environment variable is set),\n * it sets up a connection to listen for platform events.\n * For example, to get a notification about an imminent migration to another server.\n * See {@apilink Actor.events} for details.\n * - It checks that either `APIFY_TOKEN` or `APIFY_LOCAL_STORAGE_DIR` environment variable\n * is defined. If not, the functions sets `APIFY_LOCAL_STORAGE_DIR` to `./apify_storage`\n * inside the current working directory. This is to simplify running code examples.\n * - It invokes the user function passed as the `userFunc` parameter.\n * - If the user function returned a promise, waits for it to resolve.\n * - If the user function throws an exception or some other error is encountered,\n * prints error details to console so that they are stored to the log.\n * - Exits the Node.js process, with zero exit code on success and non-zero on errors.\n */\n runInContext: (userFunc: () => MaybePromise<unknown>, options?: ExitOptions) => Promise<void>;\n /**\n * Creates a proxy configuration and returns a promise resolving to an instance of\n * {@link ProxyConfiguration} that is already initialized.\n *\n * Configures connection to a proxy server with the provided options. Proxy servers are used\n * to prevent target websites from blocking your crawlers based on IP address rate limits or\n * blacklists. 
Setting proxy configuration in your crawlers automatically configures them to\n * use the selected proxies for all connections.\n *\n * For more details and code examples, see {@link ProxyConfiguration}.\n */\n createDefaultProxyConfiguration: <T extends object>(\n input: T | Readonly<T> | undefined\n ) => MaybePromise<ProxyConfiguration | undefined>;\n isTelemetryEnabled: () => MaybePromise<boolean>;\n /** Generate object with info on current context, which will be send to the error Dataset */\n generateErrorReport: (\n input: CrawleeOneErrorHandlerInput,\n options: PickRequired<CrawleeOneErrorHandlerOptions<TEnv, TReport>, 'io'>\n ) => MaybePromise<TReport>;\n /** Generate object with info on current context, which will be appended to the scraped entry */\n generateEntryMetadata: <Ctx extends CrawlingContext>(ctx: Ctx) => MaybePromise<TMetadata>;\n}\n\n/**\n * Interface for storing and retrieving data in/from Dataset\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneDataset<T extends object = object> {\n /**\n * Stores an object or an array of objects to the dataset. The function returns a promise\n * that resolves when the operation finishes. It has no result, but throws on invalid args\n * or other errors.\n */\n pushData: (\n /**\n * Object or array of objects containing data to be stored in the default dataset.\n * The objects must be serializable to JSON and the JSON representation of each object\n * must be smaller than 9MB.\n */\n data: MaybeArray<T>\n ) => MaybePromise<void>;\n /** Returns the items in the dataset based on the provided parameters. */\n getItems: (\n options?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc' | 'fields'>\n ) => MaybePromise<T[]>;\n /** Returns the count of items in the dataset. */\n getItemCount: () => MaybePromise<number | null>;\n}\n\n/**\n * Interface for storing and retrieving data in/from KeyValueStore.\n *\n * KeyValueStore is a cache / map structure, where entries are retrieved and saved\n * under keys.\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneKeyValueStore {\n /**\n * Saves or deletes a record in the key-value store. The function returns a promise that\n * resolves once the record has been saved or deleted.\n *\n * If value is null, the record is deleted instead. Note that the setValue() function\n * succeeds regardless whether the record existed or not.\n *\n * Beware that the key can be at most 256 characters long and only contain the following\n * characters: a-zA-Z0-9!-_.'()\n *\n * To retrieve a value from the key-value store, use the {@link CrawleeOneKeyValueStore.getValue}\n * function.\n */\n setValue: (\n key: string,\n value: any,\n options?: {\n /** Specifies a custom MIME content type of the record. 
*/\n contentType?: string;\n }\n ) => MaybePromise<void>;\n /**\n * Removes the key-value store either from the cloud storage or from the local directory,\n * depending on the mode of operation.\n */\n drop: () => MaybePromise<void>;\n}\n\n/**\n * Interface for storing and retrieving Requests (URLs) to scrape\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneRequestQueue {\n /**\n * Adds requests to the queue.\n *\n * If a request that is passed in is already present due to its uniqueKey property\n * being the same, it will not be updated.\n */\n addRequests: (\n /** Objects with request data. */\n requestsLike: (CrawleeRequest | RequestOptions)[],\n options?: {\n /**\n * If set to true, the request will be added to the foremost position in the queue,\n * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.\n *\n * By default, it's put to the end of the queue.\n */\n forefront?: boolean;\n }\n ) => MaybePromise<unknown>;\n /**\n * Marks a request that was previously returned by the\n * {@link CrawleeOneRequestQueue.fetchNextRequest} function as handled after successful\n * processing. Handled requests will never again be returned by the fetchNextRequest function.\n */\n markRequestHandled: (req: CrawleeRequest) => MaybePromise<unknown>;\n /**\n * Returns a next request in the queue to be processed, or null if there are no more\n * pending requests.\n *\n * Once you successfully finish processing of the request, you need to call\n * {@link CrawleeOneRequestQueue.markRequestHandled} to mark the request as handled\n * in the queue. If there was some error in processing the request, call\n * {@link CrawleeOneRequestQueue.reclaimRequest} instead, so that the queue will\n * give the request to some other consumer in another call to the fetchNextRequest function.\n *\n * Note that the null return value doesn't mean the queue processing finished,\n * it means there are currently no pending requests. To check whether all requests in queue\n * were finished, use {@link CrawleeOneRequestQueue.isFinished} instead.\n *\n * @returns — Returns the request object or null if there are no more pending requests.\n */\n fetchNextRequest: () => MaybePromise<CrawleeRequest | null>;\n /**\n * Reclaims a failed request back to the queue, so that it can be returned\n * for processing later again by another call to {@link CrawleeOneRequestQueue.fetchNextRequest}.\n */\n reclaimRequest: (\n req: CrawleeRequest,\n options?: {\n /**\n * If set to true, the request will be placed to the beginning of the queue,\n * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.\n *\n * By default, it's put to the end of the queue.\n */\n forefront?: boolean;\n }\n ) => MaybePromise<unknown>;\n /**\n * Resolves to true if all requests were already handled and there are no more left. Due to the nature\n * of distributed storage used by the queue, the function might occasionally return a false negative.\n */\n isFinished: () => MaybePromise<boolean>;\n /** Removes the queue from the storage. */\n drop: () => MaybePromise<void>;\n /** Returns the number of handled requests. */\n handledCount: () => MaybePromise<number | null>;\n}\n\n/** Input passed to the error handler */\nexport interface CrawleeOneErrorHandlerInput {\n error: Error;\n /** Page instance if we used PlaywrightCrawler */\n page: Page | null;\n /** URL where the error happened. 
If not given URL is taken from the Page object */\n url: string | null;\n log: Log | null;\n}\n\n/** User-configurable options passed to the error handler */\nexport interface CrawleeOneErrorHandlerOptions<\n TEnv extends object = object,\n TReport extends object = object\n> {\n io?: CrawleeOneIO<TEnv, TReport>;\n allowScreenshot?: boolean;\n reportingDatasetId?: string;\n onErrorCapture?: (input: { error: Error; report: TReport }) => MaybePromise<void>;\n}\n"]}
package/dist/cjs/lib/io/dataset.d.ts
@@ -0,0 +1,67 @@
+ import type { DatasetDataOptions, Log } from 'apify';
+ import { ValueMonitorOptions } from '../../utils/valueMonitor';
+ import type { CrawleeOneIO } from '../integrations/types';
+ /**
+ * Given a Dataset ID, get the number of entries already in the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ export declare const getDatasetCount: (datasetNameOrId?: string, options?: {
+ io?: CrawleeOneIO;
+ log?: Log;
+ }) => Promise<number | null>;
+ /**
+ * Given a Dataset ID and a name of a field, get the columnar data.
+ *
+ * By default uses Apify Dataset.
+ *
+ * Example:
+ * ```js
+ * // Given dataset
+ * // [
+ * // { id: 1, field: 'abc' },
+ * // { id: 2, field: 'def' }
+ * // ]
+ * const results = await getColumnFromDataset('datasetId123', 'field');
+ * console.log(results)
+ * // ['abc', 'def']
+ * ```
+ */
+ export declare const getColumnFromDataset: <T>(datasetId: string, field: string, options?: {
+ io?: CrawleeOneIO;
+ dataOptions?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc'>;
+ }) => Promise<T[]>;
+ export interface DatasetSizeMonitorOptions extends ValueMonitorOptions {
+ /**
+ * ID or name of the Dataset that's monitored for size.
+ *
+ * If omitted, the default Dataset is used.
+ */
+ datasetId?: string;
+ /**
+ * ID of the RequestQueue that holds remaining requests. This queue will be
+ * emptied when Dataset reaches `maxSize`.
+ *
+ * If omitted, the default RequestQueue is used.
+ */
+ requestQueueId?: string;
+ io?: CrawleeOneIO;
+ }
+ /**
+ * Semi-automatic monitoring of Dataset size. This is used in limiting the total of entries
+ * scraped per run / Dataset:
+ * - When Dataset reaches `maxSize`, then all remaining Requests
+ * in the RequestQueue are removed.
+ * - Pass an array of items to `shortenToSize` to shorten the array to the size
+ * that still fits the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ export declare const datasetSizeMonitor: (maxSize: number, options?: DatasetSizeMonitorOptions) => {
+ shortenToSize: <T>(arr: T[]) => Promise<T[]>;
+ isFull: () => Promise<boolean>;
+ value: () => number | Promise<number> | null;
+ isStale: () => boolean;
+ refresh: () => Promise<number>;
+ onValue: (callback: import("../../utils/valueMonitor").ValueCallback<number>) => () => void;
+ };
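Based on the declaration above, a possible usage sketch of `datasetSizeMonitor`: cap a run at a fixed number of entries and trim each batch before pushing it. The import path, entry shape, and numbers are illustrative assumptions; per the docs above, once the Dataset reaches `maxSize` the remaining requests in the RequestQueue are removed.

```ts
import { datasetSizeMonitor } from 'crawlee-one'; // assumed re-export path

// Hypothetical run that should never store more than 1000 entries total.
const sizeMonitor = datasetSizeMonitor(1000);

const handleBatch = async (entries: { url: string }[]) => {
  // Trim the batch so the Dataset never exceeds maxSize
  const fitted = await sizeMonitor.shortenToSize(entries);
  // ...push `fitted` to the Dataset here (sink omitted in this sketch)...

  if (await sizeMonitor.isFull()) {
    console.log('Dataset reached maxSize; remaining requests are dropped.');
  }
  return fitted;
};
```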
package/dist/cjs/lib/io/dataset.js
@@ -0,0 +1,86 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.datasetSizeMonitor = exports.getColumnFromDataset = exports.getDatasetCount = void 0;
+ const valueMonitor_1 = require("../../utils/valueMonitor");
+ const apify_1 = require("../integrations/apify");
+ /**
+ * Given a Dataset ID, get the number of entries already in the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ const getDatasetCount = (datasetNameOrId, options) => __awaiter(void 0, void 0, void 0, function* () {
+ const { io = apify_1.apifyIO, log } = options !== null && options !== void 0 ? options : {};
+ log === null || log === void 0 ? void 0 : log.debug('Opening dataset');
+ const dataset = yield io.openDataset(datasetNameOrId);
+ // const dataset = await io.openDataset(datasetNameOrId);
+ log === null || log === void 0 ? void 0 : log.debug('Obtaining dataset entries count');
+ const count = yield dataset.getItemCount();
+ if (typeof count !== 'number') {
+ log === null || log === void 0 ? void 0 : log.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. More entries might be scraped than was set.'); // prettier-ignore
+ }
+ else {
+ log === null || log === void 0 ? void 0 : log.debug(`Done obtaining dataset entries count (${count})`);
+ }
+ return count;
+ });
+ exports.getDatasetCount = getDatasetCount;
+ /**
+ * Given a Dataset ID and a name of a field, get the columnar data.
+ *
+ * By default uses Apify Dataset.
+ *
+ * Example:
+ * ```js
+ * // Given dataset
+ * // [
+ * // { id: 1, field: 'abc' },
+ * // { id: 2, field: 'def' }
+ * // ]
+ * const results = await getColumnFromDataset('datasetId123', 'field');
+ * console.log(results)
+ * // ['abc', 'def']
+ * ```
+ */
+ const getColumnFromDataset = (datasetId, field, options) => __awaiter(void 0, void 0, void 0, function* () {
+ const { io = apify_1.apifyIO, dataOptions } = options !== null && options !== void 0 ? options : {};
+ const dataset = yield io.openDataset(datasetId);
+ const items = yield dataset.getItems(Object.assign(Object.assign({}, dataOptions), { fields: [field] }));
+ const data = items.map((d) => d[field]);
+ return data;
+ });
+ exports.getColumnFromDataset = getColumnFromDataset;
+ /**
+ * Semi-automatic monitoring of Dataset size. This is used in limiting the total of entries
+ * scraped per run / Dataset:
+ * - When Dataset reaches `maxSize`, then all remaining Requests
+ * in the RequestQueue are removed.
+ * - Pass an array of items to `shortenToSize` to shorten the array to the size
+ * that still fits the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ const datasetSizeMonitor = (maxSize, options) => {
+ const { io = apify_1.apifyIO } = options !== null && options !== void 0 ? options : {};
+ const getSize = () => __awaiter(void 0, void 0, void 0, function* () {
+ const dataset = yield io.openDataset(options === null || options === void 0 ? void 0 : options.datasetId);
+ const size = yield dataset.getItemCount();
+ return size !== null && size !== void 0 ? size : 0;
+ });
+ // When we've reached the Dataset's max size, then remove all remaining Requests
+ const onMaxSizeReached = () => __awaiter(void 0, void 0, void 0, function* () {
+ const reqQueue = yield io.openRequestQueue(options === null || options === void 0 ? void 0 : options.requestQueueId);
+ yield reqQueue.drop();
+ });
+ return (0, valueMonitor_1.createSizeMonitor)(maxSize, getSize, onMaxSizeReached, options);
+ };
+ exports.datasetSizeMonitor = datasetSizeMonitor;
+ //# sourceMappingURL=dataset.js.map
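Because every function in this module resolves its storage through the optional `io` argument (defaulting to `apifyIO`), the Apify backend can be swapped out, e.g. in tests. A minimal sketch, assuming the function is re-exported from the package root; the stub is intentionally partial and cast, since only `openDataset()` and `getItemCount()` are exercised here:

```ts
import { getDatasetCount } from 'crawlee-one'; // assumed re-export path
import type { CrawleeOneIO } from 'crawlee-one';

// Hypothetical partial IO stub: only the members used by getDatasetCount are provided.
const stubIO = {
  openDataset: () => ({
    pushData: () => {},
    getItems: () => [],
    getItemCount: () => 42,
  }),
} as unknown as CrawleeOneIO;

const main = async () => {
  // With `io` overridden, no Apify storage is touched at all.
  const count = await getDatasetCount('my-dataset', { io: stubIO });
  console.log(count); // 42
};
main();
```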
package/dist/cjs/lib/io/dataset.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"dataset.js","sourceRoot":"","sources":["../../../../src/lib/io/dataset.ts"],"names":[],"mappings":";;;;;;;;;;;;AAEA,2DAAkF;AAElF,iDAAgD;AAEhD;;;;GAIG;AACI,MAAM,eAAe,GAAG,CAC7B,eAAwB,EACxB,OAA0C,EAC1C,EAAE;IACF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAE5C,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iBAAiB,CAAC,CAAC;IAC9B,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,eAAe,CAAC,CAAC;IACtD,yDAAyD;IACzD,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iCAAiC,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,YAAY,EAAE,CAAC;IAC3C,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE;QAC7B,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,8IAA8I,CAAC,CAAC,CAAC,kBAAkB;KACjL;SAAM;QACL,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,yCAAyC,KAAK,GAAG,CAAC,CAAC;KAC/D;IACD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAjBW,QAAA,eAAe,mBAiB1B;AAEF;;;;;;;;;;;;;;;;GAgBG;AACI,MAAM,oBAAoB,GAAG,CAClC,SAAiB,EACjB,KAAa,EACb,OAGC,EACD,EAAE;IACF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,WAAW,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEpD,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,QAAQ,iCAC/B,WAAW,KACd,MAAM,EAAE,CAAC,KAAK,CAAC,IACf,CAAC;IACH,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAM,CAAC,CAAC;IAC7C,OAAO,IAAI,CAAC;AACd,CAAC,CAAA,CAAC;AAjBW,QAAA,oBAAoB,wBAiB/B;AAmBF;;;;;;;;;GASG;AACI,MAAM,kBAAkB,GAAG,CAAC,OAAe,EAAE,OAAmC,EAAE,EAAE;IACzF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEvC,MAAM,OAAO,GAAG,GAAS,EAAE;QACzB,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,SAAS,CAAC,CAAC;QACzD,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,YAAY,EAAE,CAAC;QAC1C,OAAO,IAAI,aAAJ,IAAI,cAAJ,IAAI,GAAI,CAAC,CAAC;IACnB,CAAC,CAAA,CAAC;IAEF,gFAAgF;IAChF,MAAM,gBAAgB,GAAG,GAAS,EAAE;QAClC,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,cAAc,CAAC,CAAC;QACpE,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACxB,CAAC,CAAA,CAAC;IAEF,OAAO,IAAA,gCAAiB,EAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,EAAE,OAAO,CAAC,CAAC;AACxE,CAAC,CAAC;AAhBW,QAAA,kBAAkB,sBAgB7B","sourcesContent":["import type { DatasetDataOptions, Log } from 'apify';\n\nimport { ValueMonitorOptions, createSizeMonitor } from '../../utils/valueMonitor';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\n\n/**\n * Given a Dataset ID, get the number of entries already in the Dataset.\n *\n * By default uses Apify Dataset.\n */\nexport const getDatasetCount = async (\n datasetNameOrId?: string,\n options?: { io?: CrawleeOneIO; log?: Log }\n) => {\n const { io = apifyIO, log } = options ?? {};\n\n log?.debug('Opening dataset');\n const dataset = await io.openDataset(datasetNameOrId);\n // const dataset = await io.openDataset(datasetNameOrId);\n log?.debug('Obtaining dataset entries count');\n const count = await dataset.getItemCount();\n if (typeof count !== 'number') {\n log?.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. 
More entries might be scraped than was set.'); // prettier-ignore\n } else {\n log?.debug(`Done obtaining dataset entries count (${count})`);\n }\n return count;\n};\n\n/**\n * Given a Dataset ID and a name of a field, get the columnar data.\n *\n * By default uses Apify Dataset.\n *\n * Example:\n * ```js\n * // Given dataset\n * // [\n * // { id: 1, field: 'abc' },\n * // { id: 2, field: 'def' }\n * // ]\n * const results = await getColumnFromDataset('datasetId123', 'field');\n * console.log(results)\n * // ['abc', 'def']\n * ```\n */\nexport const getColumnFromDataset = async <T>(\n datasetId: string,\n field: string,\n options?: {\n io?: CrawleeOneIO;\n dataOptions?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc'>;\n }\n) => {\n const { io = apifyIO, dataOptions } = options ?? {};\n\n const dataset = await io.openDataset(datasetId);\n const items = await dataset.getItems({\n ...dataOptions,\n fields: [field],\n });\n const data = items.map((d) => d[field] as T);\n return data;\n};\n\nexport interface DatasetSizeMonitorOptions extends ValueMonitorOptions {\n /**\n * ID or name of the Dataset that's monitored for size.\n *\n * If omitted, the default Dataset is used.\n */\n datasetId?: string;\n /**\n * ID of the RequestQueue that holds remaining requests. This queue will be\n * emptied when Dataset reaches `maxSize`.\n *\n * If omitted, the default RequestQueue is used.\n */\n requestQueueId?: string;\n io?: CrawleeOneIO;\n}\n\n/**\n * Semi-automatic monitoring of Dataset size. This is used in limiting the total of entries\n * scraped per run / Dataset:\n * - When Dataset reaches `maxSize`, then all remaining Requests\n * in the RequestQueue are removed.\n * - Pass an array of items to `shortenToSize` to shorten the array to the size\n * that still fits the Dataset.\n *\n * By default uses Apify Dataset.\n */\nexport const datasetSizeMonitor = (maxSize: number, options?: DatasetSizeMonitorOptions) => {\n const { io = apifyIO } = options ?? {};\n\n const getSize = async () => {\n const dataset = await io.openDataset(options?.datasetId);\n const size = await dataset.getItemCount();\n return size ?? 0;\n };\n\n // When we've reached the Dataset's max size, then remove all remaining Requests\n const onMaxSizeReached = async () => {\n const reqQueue = await io.openRequestQueue(options?.requestQueueId);\n await reqQueue.drop();\n };\n\n return createSizeMonitor(maxSize, getSize, onMaxSizeReached, options);\n};\n"]}
package/dist/cjs/lib/io/maxCount.d.ts
@@ -0,0 +1,30 @@
+ import { Log } from 'apify';
+ /**
+ * Given a batch of entries, use several strategies to check
+ * if we've reached the limit on the max number of entries
+ * we're allowed to extract this run.
+ */
+ export declare const checkEntriesCount: ({ maxCount, currBatchCount, datasetNameOrId, customItemCount, }: {
+ /** Number of entries in the current batch */
+ currBatchCount: number;
+ /** Max number of entries allowed to extract. */
+ maxCount?: number | null | undefined;
+ /**
+ * If given, maxCount will be ALSO compared against
+ * the amount of entries already in the dataset.
+ */
+ datasetNameOrId?: string | null | undefined;
+ /**
+ * If given, maxCount will be ALSO compared against
+ * this amount.
+ */
+ customItemCount?: number | null | undefined;
+ }, { log }?: {
+ log?: Log | undefined;
+ }) => Promise<{
+ limitReached: boolean;
+ overflow: number;
+ }>;
+ export declare const getDatasetCount: (datasetNameOrId?: string, { log }?: {
+ log?: Log | undefined;
+ }) => Promise<number | null>;
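A possible usage sketch of `checkEntriesCount`, based only on the declaration above: decide whether a freshly scraped batch still fits under the run's limit, and use `overflow` to trim the excess. The import path and the concrete numbers are assumptions for illustration.

```ts
import { checkEntriesCount } from 'crawlee-one'; // assumed re-export path

const limitBatch = async (batch: object[]) => {
  const { limitReached, overflow } = await checkEntriesCount({
    currBatchCount: batch.length, // entries scraped in this batch
    maxCount: 100, // hard limit for the whole run
    datasetNameOrId: 'default', // also count what is already stored
  });

  // `overflow` is the number of entries beyond maxCount, so drop that many
  // from the end of the batch before pushing it.
  return limitReached ? batch.slice(0, batch.length - overflow) : batch;
};
```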
package/dist/cjs/lib/io/maxCount.js
@@ -0,0 +1,55 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.getDatasetCount = exports.checkEntriesCount = void 0;
+ const apify_1 = require("apify");
+ /**
+ * Given a batch of entries, use several strategies to check
+ * if we've reached the limit on the max number of entries
+ * we're allowed to extract this run.
+ */
+ const checkEntriesCount = ({ maxCount, currBatchCount, datasetNameOrId, customItemCount, }, { log } = {}) => __awaiter(void 0, void 0, void 0, function* () {
+ const datasetItemCount = datasetNameOrId ? yield (0, exports.getDatasetCount)(datasetNameOrId, { log }) : null;
+ if ((datasetItemCount == null && customItemCount == null) || maxCount == null) {
+ return { limitReached: false, overflow: 0 };
+ }
+ // Check if we've reached the limit for max entries
+ if (currBatchCount >= maxCount) {
+ return { limitReached: true, overflow: currBatchCount - maxCount };
+ }
+ // Use count of items already in dataset to check if limit reached
+ if (datasetItemCount != null && datasetItemCount + currBatchCount >= maxCount) {
+ return { limitReached: true, overflow: datasetItemCount + currBatchCount - maxCount };
+ }
+ // Use page offset to check if limit reached (20 entries per page)
+ if (customItemCount != null && customItemCount >= maxCount) {
+ return { limitReached: true, overflow: customItemCount - maxCount };
+ }
+ return { limitReached: false, overflow: 0 };
+ });
+ exports.checkEntriesCount = checkEntriesCount;
+ const getDatasetCount = (datasetNameOrId, { log } = {}) => __awaiter(void 0, void 0, void 0, function* () {
+ var _a;
+ log === null || log === void 0 ? void 0 : log.debug('Opening dataset');
+ const dataset = yield apify_1.Actor.openDataset(datasetNameOrId);
+ log === null || log === void 0 ? void 0 : log.debug('Obtaining dataset entries count');
+ const datasetInfo = yield dataset.getInfo();
+ const count = (_a = datasetInfo === null || datasetInfo === void 0 ? void 0 : datasetInfo.itemCount) !== null && _a !== void 0 ? _a : null;
+ if (typeof count !== 'number') {
+ log === null || log === void 0 ? void 0 : log.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. More entries might be scraped than was set.'); // prettier-ignore
+ }
+ else {
+ log === null || log === void 0 ? void 0 : log.debug(`Done obtaining dataset entries count (${count})`);
+ }
+ return count;
+ });
+ exports.getDatasetCount = getDatasetCount;
+ //# sourceMappingURL=maxCount.js.map
package/dist/cjs/lib/io/maxCount.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"maxCount.js","sourceRoot":"","sources":["../../../../src/lib/io/maxCount.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,iCAAmC;AAEnC;;;;GAIG;AACI,MAAM,iBAAiB,GAAG,CAC/B,EACE,QAAQ,EACR,cAAc,EACd,eAAe,EACf,eAAe,GAgBhB,EACD,EAAE,GAAG,KAAoB,EAAE,EAC3B,EAAE;IACF,MAAM,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,MAAM,IAAA,uBAAe,EAAC,eAAe,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAElG,IAAI,CAAC,gBAAgB,IAAI,IAAI,IAAI,eAAe,IAAI,IAAI,CAAC,IAAI,QAAQ,IAAI,IAAI,EAAE;QAC7E,OAAO,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC;KAC7C;IAED,mDAAmD;IACnD,IAAI,cAAc,IAAI,QAAQ,EAAE;QAC9B,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,cAAc,GAAG,QAAQ,EAAE,CAAC;KACpE;IAED,kEAAkE;IAClE,IAAI,gBAAgB,IAAI,IAAI,IAAI,gBAAgB,GAAG,cAAc,IAAI,QAAQ,EAAE;QAC7E,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,gBAAgB,GAAG,cAAc,GAAG,QAAQ,EAAE,CAAC;KACvF;IAED,kEAAkE;IAClE,IAAI,eAAe,IAAI,IAAI,IAAI,eAAe,IAAI,QAAQ,EAAE;QAC1D,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,eAAe,GAAG,QAAQ,EAAE,CAAC;KACrE;IAED,OAAO,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC;AAC9C,CAAC,CAAA,CAAC;AA9CW,QAAA,iBAAiB,qBA8C5B;AAEK,MAAM,eAAe,GAAG,CAAO,eAAwB,EAAE,EAAE,GAAG,KAAoB,EAAE,EAAE,EAAE;;IAC7F,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iBAAiB,CAAC,CAAC;IAC9B,MAAM,OAAO,GAAG,MAAM,aAAK,CAAC,WAAW,CAAC,eAAe,CAAC,CAAC;IACzD,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iCAAiC,CAAC,CAAC;IAC9C,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IAC5C,MAAM,KAAK,GAAG,MAAA,WAAW,aAAX,WAAW,uBAAX,WAAW,CAAE,SAAS,mCAAI,IAAI,CAAC;IAC7C,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE;QAC7B,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,8IAA8I,CAAC,CAAC,CAAC,kBAAkB;KACjL;SAAM;QACL,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,yCAAyC,KAAK,GAAG,CAAC,CAAC;KAC/D;IACD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAZW,QAAA,eAAe,mBAY1B","sourcesContent":["import { Actor, Log } from 'apify';\n\n/**\n * Given a batch of entries, use several strategies to check\n * if we've reached the limit on the max number of entries\n * we're allowed to extract this run.\n */\nexport const checkEntriesCount = async (\n {\n maxCount,\n currBatchCount,\n datasetNameOrId,\n customItemCount,\n }: {\n /** Number of entries in the current batch */\n currBatchCount: number;\n /** Max number of entries allowed to extract. */\n maxCount?: number | null;\n /**\n * If given, maxCount will be ALSO compared against\n * the amount of entries already in the dataset.\n */\n datasetNameOrId?: string | null;\n /**\n * If given, maxCount will be ALSO compared against\n * this amount.\n */\n customItemCount?: number | null;\n },\n { log }: { log?: Log } = {}\n) => {\n const datasetItemCount = datasetNameOrId ? 
await getDatasetCount(datasetNameOrId, { log }) : null;\n\n if ((datasetItemCount == null && customItemCount == null) || maxCount == null) {\n return { limitReached: false, overflow: 0 };\n }\n\n // Check if we've reached the limit for max entries\n if (currBatchCount >= maxCount) {\n return { limitReached: true, overflow: currBatchCount - maxCount };\n }\n\n // Use count of items already in dataset to check if limit reached\n if (datasetItemCount != null && datasetItemCount + currBatchCount >= maxCount) {\n return { limitReached: true, overflow: datasetItemCount + currBatchCount - maxCount };\n }\n\n // Use page offset to check if limit reached (20 entries per page)\n if (customItemCount != null && customItemCount >= maxCount) {\n return { limitReached: true, overflow: customItemCount - maxCount };\n }\n\n return { limitReached: false, overflow: 0 };\n};\n\nexport const getDatasetCount = async (datasetNameOrId?: string, { log }: { log?: Log } = {}) => {\n log?.debug('Opening dataset');\n const dataset = await Actor.openDataset(datasetNameOrId);\n log?.debug('Obtaining dataset entries count');\n const datasetInfo = await dataset.getInfo();\n const count = datasetInfo?.itemCount ?? null;\n if (typeof count !== 'number') {\n log?.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. More entries might be scraped than was set.'); // prettier-ignore\n } else {\n log?.debug(`Done obtaining dataset entries count (${count})`);\n }\n return count;\n};\n"]}