crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
package/dist/cjs/lib/integrations/types.d.ts
@@ -0,0 +1,274 @@
+ import type { ExitOptions } from 'apify';
+ import type { Request as CrawleeRequest, CrawlingContext, DatasetDataOptions, Log, ProxyConfiguration, RequestOptions } from 'crawlee';
+ import type { Page } from 'playwright';
+ import type { MaybeArray, MaybePromise, PickRequired } from '../../utils/types';
+ export type UnwrapCrawleeOneIO<T extends CrawleeOneIO<any, any, any>> = {
+ env: T extends CrawleeOneIO<infer U, any, any> ? U : never;
+ report: T extends CrawleeOneIO<any, infer U, any> ? U : never;
+ metadata: T extends CrawleeOneIO<any, any, infer U> ? U : never;
+ };
+ /**
+ * Interface for storing and retrieving:
+ * - Scraped data
+ * - Requests (URLs) to scrape
+ * - Cache data
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneIO<TEnv extends object = object, TReport extends object = object, TMetadata extends object = object> {
+ /**
+ * Opens a dataset and returns a promise resolving to an instance of the {@link CrawleeOneDataset}.
+ *
+ * Datasets are used to store structured data where each object stored has the same attributes,
+ * such as online store products or real estate offers. The actual data is stored either on
+ * the local filesystem or in the cloud.
+ */
+ openDataset: (id?: string | null) => MaybePromise<CrawleeOneDataset>;
+ /**
+ * Opens a request queue and returns a promise resolving to an instance of the {@link CrawleeOneRequestQueue}.
+ *
+ * RequestQueue represents a queue of URLs to crawl, which is stored either on local filesystem
+ * or in the cloud. The queue is used for deep crawling of websites, where you start with several
+ * URLs and then recursively follow links to other pages. The data structure supports both
+ * breadth-first and depth-first crawling orders.
+ */
+ openRequestQueue: (id?: string | null) => MaybePromise<CrawleeOneRequestQueue>;
+ /**
+ * Opens a key-value store and returns a promise resolving to an instance of the {@link CrawleeOneKeyValueStore}.
+ *
+ * Key-value stores are used to store records or files, along with their MIME content type.
+ * The records are stored and retrieved using a unique key. The actual data is stored
+ * either on a local filesystem or in the cloud.
+ */
+ openKeyValueStore: (id?: string | null) => MaybePromise<CrawleeOneKeyValueStore>;
+ /**
+ * Returns an object which contains information parsed from relevant environment variables.
+ */
+ getEnv: () => MaybePromise<TEnv>;
+ /**
+ * Returns a promise of an object with the crawler input. E.g. In Apify, retrieves the actor input value from
+ * the default {@link KeyValueStore} associated with the current actor run.
+ */
+ getInput: <Input extends object>() => Promise<Input | null>;
+ /**
+ * Equivalent of {@link Actor.metamorph}.
+ *
+ * This function should:
+ * 1. Start a crawler/actor by its ID,
+ * 2. Pass the given input into downsteam crawler.
+ * 3. Make the same storage available to the downstream crawler. AKA, the downstream crawler
+ * should use the same "default" storage as is the current "default" storage.
+ *
+ * Read more about {@link Actor.metamorph}:
+ *
+ * `Actor.metamorph` transforms this actor run to an actor run of a given actor. The system
+ * stops the current container and starts the new container instead. All the default storages
+ * are preserved and the new input is stored under the INPUT-METAMORPH-1 key in the same
+ * default key-value store.
+ */
+ triggerDownstreamCrawler: <TInput extends object>(
+ /** ID of the crawler/actor to which should be triggered. */
+ targetActorId: string,
+ /** Input for the crawler/actor. Must be JSON-serializable (it will be stringified to JSON). */
+ input?: TInput, options?: {
+ /**
+ * Tag or number of the target build to metamorph into (e.g. `beta` or `1.2.345`).
+ * If not provided, the run uses build tag or number from the default actor run configuration (typically `latest`).
+ */
+ build?: string;
+ }) => Promise<void>;
+ /**
+ * Equivalent of {@link Actor.main}.
+ *
+ * Runs the main user function that performs the job of the actor
+ * and terminates the process when the user function finishes.
+ *
+ * **The `Actor.main()` function is optional** and is provided merely for your convenience.
+ * It is mainly useful when you're running your code as an actor on the [Apify platform](https://apify.com/actors).
+ * However, if you want to use Apify SDK tools directly inside your existing projects, e.g.
+ * running in an [Express](https://expressjs.com/) server, on
+ * [Google Cloud functions](https://cloud.google.com/functions)
+ * or [AWS Lambda](https://aws.amazon.com/lambda/), it's better to avoid
+ * it since the function terminates the main process when it finishes!
+ *
+ * The `Actor.main()` function performs the following actions:
+ *
+ * - When running on the Apify platform (i.e. `APIFY_IS_AT_HOME` environment variable is set),
+ * it sets up a connection to listen for platform events.
+ * For example, to get a notification about an imminent migration to another server.
+ * See {@apilink Actor.events} for details.
+ * - It checks that either `APIFY_TOKEN` or `APIFY_LOCAL_STORAGE_DIR` environment variable
+ * is defined. If not, the functions sets `APIFY_LOCAL_STORAGE_DIR` to `./apify_storage`
+ * inside the current working directory. This is to simplify running code examples.
+ * - It invokes the user function passed as the `userFunc` parameter.
+ * - If the user function returned a promise, waits for it to resolve.
+ * - If the user function throws an exception or some other error is encountered,
+ * prints error details to console so that they are stored to the log.
+ * - Exits the Node.js process, with zero exit code on success and non-zero on errors.
+ */
+ runInContext: (userFunc: () => MaybePromise<unknown>, options?: ExitOptions) => Promise<void>;
+ /**
+ * Creates a proxy configuration and returns a promise resolving to an instance of
+ * {@link ProxyConfiguration} that is already initialized.
+ *
+ * Configures connection to a proxy server with the provided options. Proxy servers are used
+ * to prevent target websites from blocking your crawlers based on IP address rate limits or
+ * blacklists. Setting proxy configuration in your crawlers automatically configures them to
+ * use the selected proxies for all connections.
+ *
+ * For more details and code examples, see {@link ProxyConfiguration}.
+ */
+ createDefaultProxyConfiguration: <T extends object>(input: T | Readonly<T> | undefined) => MaybePromise<ProxyConfiguration | undefined>;
+ isTelemetryEnabled: () => MaybePromise<boolean>;
+ /** Generate object with info on current context, which will be send to the error Dataset */
+ generateErrorReport: (input: CrawleeOneErrorHandlerInput, options: PickRequired<CrawleeOneErrorHandlerOptions<TEnv, TReport>, 'io'>) => MaybePromise<TReport>;
+ /** Generate object with info on current context, which will be appended to the scraped entry */
+ generateEntryMetadata: <Ctx extends CrawlingContext>(ctx: Ctx) => MaybePromise<TMetadata>;
+ }
+ /**
+ * Interface for storing and retrieving data in/from Dataset
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneDataset<T extends object = object> {
+ /**
+ * Stores an object or an array of objects to the dataset. The function returns a promise
+ * that resolves when the operation finishes. It has no result, but throws on invalid args
+ * or other errors.
+ */
+ pushData: (
+ /**
+ * Object or array of objects containing data to be stored in the default dataset.
+ * The objects must be serializable to JSON and the JSON representation of each object
+ * must be smaller than 9MB.
+ */
+ data: MaybeArray<T>) => MaybePromise<void>;
+ /** Returns the items in the dataset based on the provided parameters. */
+ getItems: (options?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc' | 'fields'>) => MaybePromise<T[]>;
+ /** Returns the count of items in the dataset. */
+ getItemCount: () => MaybePromise<number | null>;
+ }
+ /**
+ * Interface for storing and retrieving data in/from KeyValueStore.
+ *
+ * KeyValueStore is a cache / map structure, where entries are retrieved and saved
+ * under keys.
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneKeyValueStore {
+ /**
+ * Saves or deletes a record in the key-value store. The function returns a promise that
+ * resolves once the record has been saved or deleted.
+ *
+ * If value is null, the record is deleted instead. Note that the setValue() function
+ * succeeds regardless whether the record existed or not.
+ *
+ * Beware that the key can be at most 256 characters long and only contain the following
+ * characters: a-zA-Z0-9!-_.'()
+ *
+ * To retrieve a value from the key-value store, use the {@link CrawleeOneKeyValueStore.getValue}
+ * function.
+ */
+ setValue: (key: string, value: any, options?: {
+ /** Specifies a custom MIME content type of the record. */
+ contentType?: string;
+ }) => MaybePromise<void>;
+ /**
+ * Removes the key-value store either from the cloud storage or from the local directory,
+ * depending on the mode of operation.
+ */
+ drop: () => MaybePromise<void>;
+ }
+ /**
+ * Interface for storing and retrieving Requests (URLs) to scrape
+ *
+ * This interface is based on Crawlee/Apify, but defined separately to allow
+ * drop-in replacement with other integrations.
+ */
+ export interface CrawleeOneRequestQueue {
+ /**
+ * Adds requests to the queue.
+ *
+ * If a request that is passed in is already present due to its uniqueKey property
+ * being the same, it will not be updated.
+ */
+ addRequests: (
+ /** Objects with request data. */
+ requestsLike: (CrawleeRequest | RequestOptions)[], options?: {
+ /**
+ * If set to true, the request will be added to the foremost position in the queue,
+ * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.
+ *
+ * By default, it's put to the end of the queue.
+ */
+ forefront?: boolean;
+ }) => MaybePromise<unknown>;
+ /**
+ * Marks a request that was previously returned by the
+ * {@link CrawleeOneRequestQueue.fetchNextRequest} function as handled after successful
+ * processing. Handled requests will never again be returned by the fetchNextRequest function.
+ */
+ markRequestHandled: (req: CrawleeRequest) => MaybePromise<unknown>;
+ /**
+ * Returns a next request in the queue to be processed, or null if there are no more
+ * pending requests.
+ *
+ * Once you successfully finish processing of the request, you need to call
+ * {@link CrawleeOneRequestQueue.markRequestHandled} to mark the request as handled
+ * in the queue. If there was some error in processing the request, call
+ * {@link CrawleeOneRequestQueue.reclaimRequest} instead, so that the queue will
+ * give the request to some other consumer in another call to the fetchNextRequest function.
+ *
+ * Note that the null return value doesn't mean the queue processing finished,
+ * it means there are currently no pending requests. To check whether all requests in queue
+ * were finished, use {@link CrawleeOneRequestQueue.isFinished} instead.
+ *
+ * @returns — Returns the request object or null if there are no more pending requests.
+ */
+ fetchNextRequest: () => MaybePromise<CrawleeRequest | null>;
+ /**
+ * Reclaims a failed request back to the queue, so that it can be returned
+ * for processing later again by another call to {@link CrawleeOneRequestQueue.fetchNextRequest}.
+ */
+ reclaimRequest: (req: CrawleeRequest, options?: {
+ /**
+ * If set to true, the request will be placed to the beginning of the queue,
+ * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.
+ *
+ * By default, it's put to the end of the queue.
+ */
+ forefront?: boolean;
+ }) => MaybePromise<unknown>;
+ /**
+ * Resolves to true if all requests were already handled and there are no more left. Due to the nature
+ * of distributed storage used by the queue, the function might occasionally return a false negative.
+ */
+ isFinished: () => MaybePromise<boolean>;
+ /** Removes the queue from the storage. */
+ drop: () => MaybePromise<void>;
+ /** Returns the number of handled requests. */
+ handledCount: () => MaybePromise<number | null>;
+ }
+ /** Input passed to the error handler */
+ export interface CrawleeOneErrorHandlerInput {
+ error: Error;
+ /** Page instance if we used PlaywrightCrawler */
+ page: Page | null;
+ /** URL where the error happened. If not given URL is taken from the Page object */
+ url: string | null;
+ log: Log | null;
+ }
+ /** User-configurable options passed to the error handler */
+ export interface CrawleeOneErrorHandlerOptions<TEnv extends object = object, TReport extends object = object> {
+ io?: CrawleeOneIO<TEnv, TReport>;
+ allowScreenshot?: boolean;
+ reportingDatasetId?: string;
+ onErrorCapture?: (input: {
+ error: Error;
+ report: TReport;
+ }) => MaybePromise<void>;
+ }
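The point of defining these interfaces separately from Crawlee/Apify is that any storage backend can be plugged in. As a rough illustration of the `CrawleeOneDataset` contract above, here is a minimal in-memory sketch; the `createInMemoryDataset` name and the import from the package root are assumptions, not part of the package:

```ts
import type { CrawleeOneDataset } from 'crawlee-one'; // assumed re-export path

/** Hypothetical in-memory stand-in for a Dataset, e.g. for unit tests. */
const createInMemoryDataset = <T extends object>(): CrawleeOneDataset<T> => {
  const items: T[] = [];
  return {
    // Accepts a single entry or an array, mirroring MaybeArray<T>
    pushData: (data) => {
      items.push(...(Array.isArray(data) ? data : [data]));
    },
    // Supports only the offset/limit/desc subset of DatasetDataOptions here;
    // the `fields` option is ignored in this sketch
    getItems: (options) => {
      const { offset = 0, limit = items.length, desc } = options ?? {};
      const ordered = desc ? [...items].reverse() : items;
      return ordered.slice(offset, offset + limit);
    },
    getItemCount: () => items.length,
  };
};
```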
package/dist/cjs/lib/integrations/types.js
@@ -0,0 +1,3 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ //# sourceMappingURL=types.js.map
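The `CrawleeOneRequestQueue` contract declared in types.d.ts above implies a fetch → process → mark-handled-or-reclaim loop. A minimal consumer sketch, under the assumption that the type is re-exported from the package root (the `consumeQueue` helper is hypothetical):

```ts
import type { CrawleeOneRequestQueue } from 'crawlee-one'; // assumed re-export path

// Drain a queue using only the interface above: fetch a request, process it,
// then either mark it handled or reclaim it so another consumer can retry it.
const consumeQueue = async (
  queue: CrawleeOneRequestQueue,
  processOne: (url: string) => Promise<void>,
) => {
  while (!(await queue.isFinished())) {
    const request = await queue.fetchNextRequest();
    // null only means "nothing pending right now", not "queue is finished"
    if (!request) continue;
    try {
      await processOne(request.url);
      await queue.markRequestHandled(request);
    } catch (err) {
      // Failed requests go back into the queue for a later attempt
      await queue.reclaimRequest(request);
    }
  }
};
```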
package/dist/cjs/lib/integrations/types.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/integrations/types.ts"],"names":[],"mappings":"","sourcesContent":["import type { Actor, ExitOptions } from 'apify';\nimport type {\n Request as CrawleeRequest,\n CrawlingContext,\n DatasetDataOptions,\n Log,\n ProxyConfiguration,\n RequestOptions,\n} from 'crawlee';\nimport type { Page } from 'playwright';\n\nimport type { MaybeArray, MaybePromise, PickRequired } from '../../utils/types';\n\nexport type UnwrapCrawleeOneIO<T extends CrawleeOneIO<any, any, any>> = {\n env: T extends CrawleeOneIO<infer U, any, any> ? U : never;\n report: T extends CrawleeOneIO<any, infer U, any> ? U : never;\n metadata: T extends CrawleeOneIO<any, any, infer U> ? U : never;\n};\n\n/**\n * Interface for storing and retrieving:\n * - Scraped data\n * - Requests (URLs) to scrape\n * - Cache data\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneIO<\n TEnv extends object = object,\n TReport extends object = object,\n TMetadata extends object = object\n> {\n /**\n * Opens a dataset and returns a promise resolving to an instance of the {@link CrawleeOneDataset}.\n *\n * Datasets are used to store structured data where each object stored has the same attributes,\n * such as online store products or real estate offers. The actual data is stored either on\n * the local filesystem or in the cloud.\n */\n openDataset: (id?: string | null) => MaybePromise<CrawleeOneDataset>;\n /**\n * Opens a request queue and returns a promise resolving to an instance of the {@link CrawleeOneRequestQueue}.\n *\n * RequestQueue represents a queue of URLs to crawl, which is stored either on local filesystem\n * or in the cloud. The queue is used for deep crawling of websites, where you start with several\n * URLs and then recursively follow links to other pages. The data structure supports both\n * breadth-first and depth-first crawling orders.\n */\n openRequestQueue: (id?: string | null) => MaybePromise<CrawleeOneRequestQueue>;\n /**\n * Opens a key-value store and returns a promise resolving to an instance of the {@link CrawleeOneKeyValueStore}.\n *\n * Key-value stores are used to store records or files, along with their MIME content type.\n * The records are stored and retrieved using a unique key. The actual data is stored\n * either on a local filesystem or in the cloud.\n */\n openKeyValueStore: (id?: string | null) => MaybePromise<CrawleeOneKeyValueStore>;\n /**\n * Returns an object which contains information parsed from relevant environment variables.\n */\n getEnv: () => MaybePromise<TEnv>;\n /**\n * Returns a promise of an object with the crawler input. E.g. In Apify, retrieves the actor input value from\n * the default {@link KeyValueStore} associated with the current actor run.\n */\n getInput: <Input extends object>() => Promise<Input | null>;\n /**\n * Equivalent of {@link Actor.metamorph}.\n *\n * This function should:\n * 1. Start a crawler/actor by its ID,\n * 2. Pass the given input into downsteam crawler.\n * 3. Make the same storage available to the downstream crawler. AKA, the downstream crawler\n * should use the same \"default\" storage as is the current \"default\" storage.\n *\n * Read more about {@link Actor.metamorph}:\n *\n * `Actor.metamorph` transforms this actor run to an actor run of a given actor. The system\n * stops the current container and starts the new container instead. 
All the default storages\n * are preserved and the new input is stored under the INPUT-METAMORPH-1 key in the same\n * default key-value store.\n */\n triggerDownstreamCrawler: <TInput extends object>(\n /** ID of the crawler/actor to which should be triggered. */\n targetActorId: string,\n /** Input for the crawler/actor. Must be JSON-serializable (it will be stringified to JSON). */\n input?: TInput,\n options?: {\n /**\n * Tag or number of the target build to metamorph into (e.g. `beta` or `1.2.345`).\n * If not provided, the run uses build tag or number from the default actor run configuration (typically `latest`).\n */\n build?: string;\n }\n ) => Promise<void>;\n /**\n * Equivalent of {@link Actor.main}.\n *\n * Runs the main user function that performs the job of the actor\n * and terminates the process when the user function finishes.\n *\n * **The `Actor.main()` function is optional** and is provided merely for your convenience.\n * It is mainly useful when you're running your code as an actor on the [Apify platform](https://apify.com/actors).\n * However, if you want to use Apify SDK tools directly inside your existing projects, e.g.\n * running in an [Express](https://expressjs.com/) server, on\n * [Google Cloud functions](https://cloud.google.com/functions)\n * or [AWS Lambda](https://aws.amazon.com/lambda/), it's better to avoid\n * it since the function terminates the main process when it finishes!\n *\n * The `Actor.main()` function performs the following actions:\n *\n * - When running on the Apify platform (i.e. `APIFY_IS_AT_HOME` environment variable is set),\n * it sets up a connection to listen for platform events.\n * For example, to get a notification about an imminent migration to another server.\n * See {@apilink Actor.events} for details.\n * - It checks that either `APIFY_TOKEN` or `APIFY_LOCAL_STORAGE_DIR` environment variable\n * is defined. If not, the functions sets `APIFY_LOCAL_STORAGE_DIR` to `./apify_storage`\n * inside the current working directory. This is to simplify running code examples.\n * - It invokes the user function passed as the `userFunc` parameter.\n * - If the user function returned a promise, waits for it to resolve.\n * - If the user function throws an exception or some other error is encountered,\n * prints error details to console so that they are stored to the log.\n * - Exits the Node.js process, with zero exit code on success and non-zero on errors.\n */\n runInContext: (userFunc: () => MaybePromise<unknown>, options?: ExitOptions) => Promise<void>;\n /**\n * Creates a proxy configuration and returns a promise resolving to an instance of\n * {@link ProxyConfiguration} that is already initialized.\n *\n * Configures connection to a proxy server with the provided options. Proxy servers are used\n * to prevent target websites from blocking your crawlers based on IP address rate limits or\n * blacklists. 
Setting proxy configuration in your crawlers automatically configures them to\n * use the selected proxies for all connections.\n *\n * For more details and code examples, see {@link ProxyConfiguration}.\n */\n createDefaultProxyConfiguration: <T extends object>(\n input: T | Readonly<T> | undefined\n ) => MaybePromise<ProxyConfiguration | undefined>;\n isTelemetryEnabled: () => MaybePromise<boolean>;\n /** Generate object with info on current context, which will be send to the error Dataset */\n generateErrorReport: (\n input: CrawleeOneErrorHandlerInput,\n options: PickRequired<CrawleeOneErrorHandlerOptions<TEnv, TReport>, 'io'>\n ) => MaybePromise<TReport>;\n /** Generate object with info on current context, which will be appended to the scraped entry */\n generateEntryMetadata: <Ctx extends CrawlingContext>(ctx: Ctx) => MaybePromise<TMetadata>;\n}\n\n/**\n * Interface for storing and retrieving data in/from Dataset\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneDataset<T extends object = object> {\n /**\n * Stores an object or an array of objects to the dataset. The function returns a promise\n * that resolves when the operation finishes. It has no result, but throws on invalid args\n * or other errors.\n */\n pushData: (\n /**\n * Object or array of objects containing data to be stored in the default dataset.\n * The objects must be serializable to JSON and the JSON representation of each object\n * must be smaller than 9MB.\n */\n data: MaybeArray<T>\n ) => MaybePromise<void>;\n /** Returns the items in the dataset based on the provided parameters. */\n getItems: (\n options?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc' | 'fields'>\n ) => MaybePromise<T[]>;\n /** Returns the count of items in the dataset. */\n getItemCount: () => MaybePromise<number | null>;\n}\n\n/**\n * Interface for storing and retrieving data in/from KeyValueStore.\n *\n * KeyValueStore is a cache / map structure, where entries are retrieved and saved\n * under keys.\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneKeyValueStore {\n /**\n * Saves or deletes a record in the key-value store. The function returns a promise that\n * resolves once the record has been saved or deleted.\n *\n * If value is null, the record is deleted instead. Note that the setValue() function\n * succeeds regardless whether the record existed or not.\n *\n * Beware that the key can be at most 256 characters long and only contain the following\n * characters: a-zA-Z0-9!-_.'()\n *\n * To retrieve a value from the key-value store, use the {@link CrawleeOneKeyValueStore.getValue}\n * function.\n */\n setValue: (\n key: string,\n value: any,\n options?: {\n /** Specifies a custom MIME content type of the record. 
*/\n contentType?: string;\n }\n ) => MaybePromise<void>;\n /**\n * Removes the key-value store either from the cloud storage or from the local directory,\n * depending on the mode of operation.\n */\n drop: () => MaybePromise<void>;\n}\n\n/**\n * Interface for storing and retrieving Requests (URLs) to scrape\n *\n * This interface is based on Crawlee/Apify, but defined separately to allow\n * drop-in replacement with other integrations.\n */\nexport interface CrawleeOneRequestQueue {\n /**\n * Adds requests to the queue.\n *\n * If a request that is passed in is already present due to its uniqueKey property\n * being the same, it will not be updated.\n */\n addRequests: (\n /** Objects with request data. */\n requestsLike: (CrawleeRequest | RequestOptions)[],\n options?: {\n /**\n * If set to true, the request will be added to the foremost position in the queue,\n * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.\n *\n * By default, it's put to the end of the queue.\n */\n forefront?: boolean;\n }\n ) => MaybePromise<unknown>;\n /**\n * Marks a request that was previously returned by the\n * {@link CrawleeOneRequestQueue.fetchNextRequest} function as handled after successful\n * processing. Handled requests will never again be returned by the fetchNextRequest function.\n */\n markRequestHandled: (req: CrawleeRequest) => MaybePromise<unknown>;\n /**\n * Returns a next request in the queue to be processed, or null if there are no more\n * pending requests.\n *\n * Once you successfully finish processing of the request, you need to call\n * {@link CrawleeOneRequestQueue.markRequestHandled} to mark the request as handled\n * in the queue. If there was some error in processing the request, call\n * {@link CrawleeOneRequestQueue.reclaimRequest} instead, so that the queue will\n * give the request to some other consumer in another call to the fetchNextRequest function.\n *\n * Note that the null return value doesn't mean the queue processing finished,\n * it means there are currently no pending requests. To check whether all requests in queue\n * were finished, use {@link CrawleeOneRequestQueue.isFinished} instead.\n *\n * @returns — Returns the request object or null if there are no more pending requests.\n */\n fetchNextRequest: () => MaybePromise<CrawleeRequest | null>;\n /**\n * Reclaims a failed request back to the queue, so that it can be returned\n * for processing later again by another call to {@link CrawleeOneRequestQueue.fetchNextRequest}.\n */\n reclaimRequest: (\n req: CrawleeRequest,\n options?: {\n /**\n * If set to true, the request will be placed to the beginning of the queue,\n * so that it's returned in the next call to {@link CrawleeOneRequestQueue.fetchNextRequest}.\n *\n * By default, it's put to the end of the queue.\n */\n forefront?: boolean;\n }\n ) => MaybePromise<unknown>;\n /**\n * Resolves to true if all requests were already handled and there are no more left. Due to the nature\n * of distributed storage used by the queue, the function might occasionally return a false negative.\n */\n isFinished: () => MaybePromise<boolean>;\n /** Removes the queue from the storage. */\n drop: () => MaybePromise<void>;\n /** Returns the number of handled requests. */\n handledCount: () => MaybePromise<number | null>;\n}\n\n/** Input passed to the error handler */\nexport interface CrawleeOneErrorHandlerInput {\n error: Error;\n /** Page instance if we used PlaywrightCrawler */\n page: Page | null;\n /** URL where the error happened. 
If not given URL is taken from the Page object */\n url: string | null;\n log: Log | null;\n}\n\n/** User-configurable options passed to the error handler */\nexport interface CrawleeOneErrorHandlerOptions<\n TEnv extends object = object,\n TReport extends object = object\n> {\n io?: CrawleeOneIO<TEnv, TReport>;\n allowScreenshot?: boolean;\n reportingDatasetId?: string;\n onErrorCapture?: (input: { error: Error; report: TReport }) => MaybePromise<void>;\n}\n"]}
package/dist/cjs/lib/io/dataset.d.ts
@@ -0,0 +1,67 @@
+ import type { DatasetDataOptions, Log } from 'apify';
+ import { ValueMonitorOptions } from '../../utils/valueMonitor';
+ import type { CrawleeOneIO } from '../integrations/types';
+ /**
+ * Given a Dataset ID, get the number of entries already in the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ export declare const getDatasetCount: (datasetNameOrId?: string, options?: {
+ io?: CrawleeOneIO;
+ log?: Log;
+ }) => Promise<number | null>;
+ /**
+ * Given a Dataset ID and a name of a field, get the columnar data.
+ *
+ * By default uses Apify Dataset.
+ *
+ * Example:
+ * ```js
+ * // Given dataset
+ * // [
+ * // { id: 1, field: 'abc' },
+ * // { id: 2, field: 'def' }
+ * // ]
+ * const results = await getColumnFromDataset('datasetId123', 'field');
+ * console.log(results)
+ * // ['abc', 'def']
+ * ```
+ */
+ export declare const getColumnFromDataset: <T>(datasetId: string, field: string, options?: {
+ io?: CrawleeOneIO;
+ dataOptions?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc'>;
+ }) => Promise<T[]>;
+ export interface DatasetSizeMonitorOptions extends ValueMonitorOptions {
+ /**
+ * ID or name of the Dataset that's monitored for size.
+ *
+ * If omitted, the default Dataset is used.
+ */
+ datasetId?: string;
+ /**
+ * ID of the RequestQueue that holds remaining requests. This queue will be
+ * emptied when Dataset reaches `maxSize`.
+ *
+ * If omitted, the default RequestQueue is used.
+ */
+ requestQueueId?: string;
+ io?: CrawleeOneIO;
+ }
+ /**
+ * Semi-automatic monitoring of Dataset size. This is used in limiting the total of entries
+ * scraped per run / Dataset:
+ * - When Dataset reaches `maxSize`, then all remaining Requests
+ * in the RequestQueue are removed.
+ * - Pass an array of items to `shortenToSize` to shorten the array to the size
+ * that still fits the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ export declare const datasetSizeMonitor: (maxSize: number, options?: DatasetSizeMonitorOptions) => {
+ shortenToSize: <T>(arr: T[]) => Promise<T[]>;
+ isFull: () => Promise<boolean>;
+ value: () => number | Promise<number> | null;
+ isStale: () => boolean;
+ refresh: () => Promise<number>;
+ onValue: (callback: import("../../utils/valueMonitor").ValueCallback<number>) => () => void;
+ };
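Based on the declaration above, a possible usage sketch of `datasetSizeMonitor`: cap a run at a fixed number of entries and trim each batch before pushing it. The import path, entry shape, and numbers are illustrative assumptions; per the docs above, once the Dataset reaches `maxSize` the remaining requests in the RequestQueue are removed.

```ts
import { datasetSizeMonitor } from 'crawlee-one'; // assumed re-export path

// Hypothetical run that should never store more than 1000 entries total.
const sizeMonitor = datasetSizeMonitor(1000);

const handleBatch = async (entries: { url: string }[]) => {
  // Trim the batch so the Dataset never exceeds maxSize
  const fitted = await sizeMonitor.shortenToSize(entries);
  // ...push `fitted` to the Dataset here (sink omitted in this sketch)...

  if (await sizeMonitor.isFull()) {
    console.log('Dataset reached maxSize; remaining requests are dropped.');
  }
  return fitted;
};
```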
package/dist/cjs/lib/io/dataset.js
@@ -0,0 +1,86 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.datasetSizeMonitor = exports.getColumnFromDataset = exports.getDatasetCount = void 0;
+ const valueMonitor_1 = require("../../utils/valueMonitor");
+ const apify_1 = require("../integrations/apify");
+ /**
+ * Given a Dataset ID, get the number of entries already in the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ const getDatasetCount = (datasetNameOrId, options) => __awaiter(void 0, void 0, void 0, function* () {
+ const { io = apify_1.apifyIO, log } = options !== null && options !== void 0 ? options : {};
+ log === null || log === void 0 ? void 0 : log.debug('Opening dataset');
+ const dataset = yield io.openDataset(datasetNameOrId);
+ // const dataset = await io.openDataset(datasetNameOrId);
+ log === null || log === void 0 ? void 0 : log.debug('Obtaining dataset entries count');
+ const count = yield dataset.getItemCount();
+ if (typeof count !== 'number') {
+ log === null || log === void 0 ? void 0 : log.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. More entries might be scraped than was set.'); // prettier-ignore
+ }
+ else {
+ log === null || log === void 0 ? void 0 : log.debug(`Done obtaining dataset entries count (${count})`);
+ }
+ return count;
+ });
+ exports.getDatasetCount = getDatasetCount;
+ /**
+ * Given a Dataset ID and a name of a field, get the columnar data.
+ *
+ * By default uses Apify Dataset.
+ *
+ * Example:
+ * ```js
+ * // Given dataset
+ * // [
+ * // { id: 1, field: 'abc' },
+ * // { id: 2, field: 'def' }
+ * // ]
+ * const results = await getColumnFromDataset('datasetId123', 'field');
+ * console.log(results)
+ * // ['abc', 'def']
+ * ```
+ */
+ const getColumnFromDataset = (datasetId, field, options) => __awaiter(void 0, void 0, void 0, function* () {
+ const { io = apify_1.apifyIO, dataOptions } = options !== null && options !== void 0 ? options : {};
+ const dataset = yield io.openDataset(datasetId);
+ const items = yield dataset.getItems(Object.assign(Object.assign({}, dataOptions), { fields: [field] }));
+ const data = items.map((d) => d[field]);
+ return data;
+ });
+ exports.getColumnFromDataset = getColumnFromDataset;
+ /**
+ * Semi-automatic monitoring of Dataset size. This is used in limiting the total of entries
+ * scraped per run / Dataset:
+ * - When Dataset reaches `maxSize`, then all remaining Requests
+ * in the RequestQueue are removed.
+ * - Pass an array of items to `shortenToSize` to shorten the array to the size
+ * that still fits the Dataset.
+ *
+ * By default uses Apify Dataset.
+ */
+ const datasetSizeMonitor = (maxSize, options) => {
+ const { io = apify_1.apifyIO } = options !== null && options !== void 0 ? options : {};
+ const getSize = () => __awaiter(void 0, void 0, void 0, function* () {
+ const dataset = yield io.openDataset(options === null || options === void 0 ? void 0 : options.datasetId);
+ const size = yield dataset.getItemCount();
+ return size !== null && size !== void 0 ? size : 0;
+ });
+ // When we've reached the Dataset's max size, then remove all remaining Requests
+ const onMaxSizeReached = () => __awaiter(void 0, void 0, void 0, function* () {
+ const reqQueue = yield io.openRequestQueue(options === null || options === void 0 ? void 0 : options.requestQueueId);
+ yield reqQueue.drop();
+ });
+ return (0, valueMonitor_1.createSizeMonitor)(maxSize, getSize, onMaxSizeReached, options);
+ };
+ exports.datasetSizeMonitor = datasetSizeMonitor;
+ //# sourceMappingURL=dataset.js.map
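Because every function in this module resolves its storage through the optional `io` argument (defaulting to `apifyIO`), the Apify backend can be swapped out, e.g. in tests. A minimal sketch, assuming the function is re-exported from the package root; the stub is intentionally partial and cast, since only `openDataset()` and `getItemCount()` are exercised here:

```ts
import { getDatasetCount } from 'crawlee-one'; // assumed re-export path
import type { CrawleeOneIO } from 'crawlee-one';

// Hypothetical partial IO stub: only the members used by getDatasetCount are provided.
const stubIO = {
  openDataset: () => ({
    pushData: () => {},
    getItems: () => [],
    getItemCount: () => 42,
  }),
} as unknown as CrawleeOneIO;

const main = async () => {
  // With `io` overridden, no Apify storage is touched at all.
  const count = await getDatasetCount('my-dataset', { io: stubIO });
  console.log(count); // 42
};
main();
```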
package/dist/cjs/lib/io/dataset.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"dataset.js","sourceRoot":"","sources":["../../../../src/lib/io/dataset.ts"],"names":[],"mappings":";;;;;;;;;;;;AAEA,2DAAkF;AAElF,iDAAgD;AAEhD;;;;GAIG;AACI,MAAM,eAAe,GAAG,CAC7B,eAAwB,EACxB,OAA0C,EAC1C,EAAE;IACF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAE5C,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iBAAiB,CAAC,CAAC;IAC9B,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,eAAe,CAAC,CAAC;IACtD,yDAAyD;IACzD,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iCAAiC,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,YAAY,EAAE,CAAC;IAC3C,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE;QAC7B,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,8IAA8I,CAAC,CAAC,CAAC,kBAAkB;KACjL;SAAM;QACL,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,yCAAyC,KAAK,GAAG,CAAC,CAAC;KAC/D;IACD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAjBW,QAAA,eAAe,mBAiB1B;AAEF;;;;;;;;;;;;;;;;GAgBG;AACI,MAAM,oBAAoB,GAAG,CAClC,SAAiB,EACjB,KAAa,EACb,OAGC,EACD,EAAE;IACF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,WAAW,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEpD,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,QAAQ,iCAC/B,WAAW,KACd,MAAM,EAAE,CAAC,KAAK,CAAC,IACf,CAAC;IACH,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAM,CAAC,CAAC;IAC7C,OAAO,IAAI,CAAC;AACd,CAAC,CAAA,CAAC;AAjBW,QAAA,oBAAoB,wBAiB/B;AAmBF;;;;;;;;;GASG;AACI,MAAM,kBAAkB,GAAG,CAAC,OAAe,EAAE,OAAmC,EAAE,EAAE;IACzF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEvC,MAAM,OAAO,GAAG,GAAS,EAAE;QACzB,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,SAAS,CAAC,CAAC;QACzD,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,YAAY,EAAE,CAAC;QAC1C,OAAO,IAAI,aAAJ,IAAI,cAAJ,IAAI,GAAI,CAAC,CAAC;IACnB,CAAC,CAAA,CAAC;IAEF,gFAAgF;IAChF,MAAM,gBAAgB,GAAG,GAAS,EAAE;QAClC,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,cAAc,CAAC,CAAC;QACpE,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACxB,CAAC,CAAA,CAAC;IAEF,OAAO,IAAA,gCAAiB,EAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,EAAE,OAAO,CAAC,CAAC;AACxE,CAAC,CAAC;AAhBW,QAAA,kBAAkB,sBAgB7B","sourcesContent":["import type { DatasetDataOptions, Log } from 'apify';\n\nimport { ValueMonitorOptions, createSizeMonitor } from '../../utils/valueMonitor';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\n\n/**\n * Given a Dataset ID, get the number of entries already in the Dataset.\n *\n * By default uses Apify Dataset.\n */\nexport const getDatasetCount = async (\n datasetNameOrId?: string,\n options?: { io?: CrawleeOneIO; log?: Log }\n) => {\n const { io = apifyIO, log } = options ?? {};\n\n log?.debug('Opening dataset');\n const dataset = await io.openDataset(datasetNameOrId);\n // const dataset = await io.openDataset(datasetNameOrId);\n log?.debug('Obtaining dataset entries count');\n const count = await dataset.getItemCount();\n if (typeof count !== 'number') {\n log?.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. 
More entries might be scraped than was set.'); // prettier-ignore\n } else {\n log?.debug(`Done obtaining dataset entries count (${count})`);\n }\n return count;\n};\n\n/**\n * Given a Dataset ID and a name of a field, get the columnar data.\n *\n * By default uses Apify Dataset.\n *\n * Example:\n * ```js\n * // Given dataset\n * // [\n * // { id: 1, field: 'abc' },\n * // { id: 2, field: 'def' }\n * // ]\n * const results = await getColumnFromDataset('datasetId123', 'field');\n * console.log(results)\n * // ['abc', 'def']\n * ```\n */\nexport const getColumnFromDataset = async <T>(\n datasetId: string,\n field: string,\n options?: {\n io?: CrawleeOneIO;\n dataOptions?: Pick<DatasetDataOptions, 'offset' | 'limit' | 'desc'>;\n }\n) => {\n const { io = apifyIO, dataOptions } = options ?? {};\n\n const dataset = await io.openDataset(datasetId);\n const items = await dataset.getItems({\n ...dataOptions,\n fields: [field],\n });\n const data = items.map((d) => d[field] as T);\n return data;\n};\n\nexport interface DatasetSizeMonitorOptions extends ValueMonitorOptions {\n /**\n * ID or name of the Dataset that's monitored for size.\n *\n * If omitted, the default Dataset is used.\n */\n datasetId?: string;\n /**\n * ID of the RequestQueue that holds remaining requests. This queue will be\n * emptied when Dataset reaches `maxSize`.\n *\n * If omitted, the default RequestQueue is used.\n */\n requestQueueId?: string;\n io?: CrawleeOneIO;\n}\n\n/**\n * Semi-automatic monitoring of Dataset size. This is used in limiting the total of entries\n * scraped per run / Dataset:\n * - When Dataset reaches `maxSize`, then all remaining Requests\n * in the RequestQueue are removed.\n * - Pass an array of items to `shortenToSize` to shorten the array to the size\n * that still fits the Dataset.\n *\n * By default uses Apify Dataset.\n */\nexport const datasetSizeMonitor = (maxSize: number, options?: DatasetSizeMonitorOptions) => {\n const { io = apifyIO } = options ?? {};\n\n const getSize = async () => {\n const dataset = await io.openDataset(options?.datasetId);\n const size = await dataset.getItemCount();\n return size ?? 0;\n };\n\n // When we've reached the Dataset's max size, then remove all remaining Requests\n const onMaxSizeReached = async () => {\n const reqQueue = await io.openRequestQueue(options?.requestQueueId);\n await reqQueue.drop();\n };\n\n return createSizeMonitor(maxSize, getSize, onMaxSizeReached, options);\n};\n"]}
package/dist/cjs/lib/io/maxCount.d.ts
@@ -0,0 +1,30 @@
+ import { Log } from 'apify';
+ /**
+ * Given a batch of entries, use several strategies to check
+ * if we've reached the limit on the max number of entries
+ * we're allowed to extract this run.
+ */
+ export declare const checkEntriesCount: ({ maxCount, currBatchCount, datasetNameOrId, customItemCount, }: {
+ /** Number of entries in the current batch */
+ currBatchCount: number;
+ /** Max number of entries allowed to extract. */
+ maxCount?: number | null | undefined;
+ /**
+ * If given, maxCount will be ALSO compared against
+ * the amount of entries already in the dataset.
+ */
+ datasetNameOrId?: string | null | undefined;
+ /**
+ * If given, maxCount will be ALSO compared against
+ * this amount.
+ */
+ customItemCount?: number | null | undefined;
+ }, { log }?: {
+ log?: Log | undefined;
+ }) => Promise<{
+ limitReached: boolean;
+ overflow: number;
+ }>;
+ export declare const getDatasetCount: (datasetNameOrId?: string, { log }?: {
+ log?: Log | undefined;
+ }) => Promise<number | null>;
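A possible usage sketch of `checkEntriesCount`, based only on the declaration above: decide whether a freshly scraped batch still fits under the run's limit, and use `overflow` to trim the excess. The import path and the concrete numbers are assumptions for illustration.

```ts
import { checkEntriesCount } from 'crawlee-one'; // assumed re-export path

const limitBatch = async (batch: object[]) => {
  const { limitReached, overflow } = await checkEntriesCount({
    currBatchCount: batch.length, // entries scraped in this batch
    maxCount: 100, // hard limit for the whole run
    datasetNameOrId: 'default', // also count what is already stored
  });

  // `overflow` is the number of entries beyond maxCount, so drop that many
  // from the end of the batch before pushing it.
  return limitReached ? batch.slice(0, batch.length - overflow) : batch;
};
```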
package/dist/cjs/lib/io/maxCount.js
@@ -0,0 +1,55 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.getDatasetCount = exports.checkEntriesCount = void 0;
+ const apify_1 = require("apify");
+ /**
+ * Given a batch of entries, use several strategies to check
+ * if we've reached the limit on the max number of entries
+ * we're allowed to extract this run.
+ */
+ const checkEntriesCount = ({ maxCount, currBatchCount, datasetNameOrId, customItemCount, }, { log } = {}) => __awaiter(void 0, void 0, void 0, function* () {
+ const datasetItemCount = datasetNameOrId ? yield (0, exports.getDatasetCount)(datasetNameOrId, { log }) : null;
+ if ((datasetItemCount == null && customItemCount == null) || maxCount == null) {
+ return { limitReached: false, overflow: 0 };
+ }
+ // Check if we've reached the limit for max entries
+ if (currBatchCount >= maxCount) {
+ return { limitReached: true, overflow: currBatchCount - maxCount };
+ }
+ // Use count of items already in dataset to check if limit reached
+ if (datasetItemCount != null && datasetItemCount + currBatchCount >= maxCount) {
+ return { limitReached: true, overflow: datasetItemCount + currBatchCount - maxCount };
+ }
+ // Use page offset to check if limit reached (20 entries per page)
+ if (customItemCount != null && customItemCount >= maxCount) {
+ return { limitReached: true, overflow: customItemCount - maxCount };
+ }
+ return { limitReached: false, overflow: 0 };
+ });
+ exports.checkEntriesCount = checkEntriesCount;
+ const getDatasetCount = (datasetNameOrId, { log } = {}) => __awaiter(void 0, void 0, void 0, function* () {
+ var _a;
+ log === null || log === void 0 ? void 0 : log.debug('Opening dataset');
+ const dataset = yield apify_1.Actor.openDataset(datasetNameOrId);
+ log === null || log === void 0 ? void 0 : log.debug('Obtaining dataset entries count');
+ const datasetInfo = yield dataset.getInfo();
+ const count = (_a = datasetInfo === null || datasetInfo === void 0 ? void 0 : datasetInfo.itemCount) !== null && _a !== void 0 ? _a : null;
+ if (typeof count !== 'number') {
+ log === null || log === void 0 ? void 0 : log.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. More entries might be scraped than was set.'); // prettier-ignore
+ }
+ else {
+ log === null || log === void 0 ? void 0 : log.debug(`Done obtaining dataset entries count (${count})`);
+ }
+ return count;
+ });
+ exports.getDatasetCount = getDatasetCount;
+ //# sourceMappingURL=maxCount.js.map
package/dist/cjs/lib/io/maxCount.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"maxCount.js","sourceRoot":"","sources":["../../../../src/lib/io/maxCount.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,iCAAmC;AAEnC;;;;GAIG;AACI,MAAM,iBAAiB,GAAG,CAC/B,EACE,QAAQ,EACR,cAAc,EACd,eAAe,EACf,eAAe,GAgBhB,EACD,EAAE,GAAG,KAAoB,EAAE,EAC3B,EAAE;IACF,MAAM,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,MAAM,IAAA,uBAAe,EAAC,eAAe,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAElG,IAAI,CAAC,gBAAgB,IAAI,IAAI,IAAI,eAAe,IAAI,IAAI,CAAC,IAAI,QAAQ,IAAI,IAAI,EAAE;QAC7E,OAAO,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC;KAC7C;IAED,mDAAmD;IACnD,IAAI,cAAc,IAAI,QAAQ,EAAE;QAC9B,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,cAAc,GAAG,QAAQ,EAAE,CAAC;KACpE;IAED,kEAAkE;IAClE,IAAI,gBAAgB,IAAI,IAAI,IAAI,gBAAgB,GAAG,cAAc,IAAI,QAAQ,EAAE;QAC7E,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,gBAAgB,GAAG,cAAc,GAAG,QAAQ,EAAE,CAAC;KACvF;IAED,kEAAkE;IAClE,IAAI,eAAe,IAAI,IAAI,IAAI,eAAe,IAAI,QAAQ,EAAE;QAC1D,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,eAAe,GAAG,QAAQ,EAAE,CAAC;KACrE;IAED,OAAO,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC;AAC9C,CAAC,CAAA,CAAC;AA9CW,QAAA,iBAAiB,qBA8C5B;AAEK,MAAM,eAAe,GAAG,CAAO,eAAwB,EAAE,EAAE,GAAG,KAAoB,EAAE,EAAE,EAAE;;IAC7F,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iBAAiB,CAAC,CAAC;IAC9B,MAAM,OAAO,GAAG,MAAM,aAAK,CAAC,WAAW,CAAC,eAAe,CAAC,CAAC;IACzD,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,iCAAiC,CAAC,CAAC;IAC9C,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IAC5C,MAAM,KAAK,GAAG,MAAA,WAAW,aAAX,WAAW,uBAAX,WAAW,CAAE,SAAS,mCAAI,IAAI,CAAC;IAC7C,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE;QAC7B,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,8IAA8I,CAAC,CAAC,CAAC,kBAAkB;KACjL;SAAM;QACL,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,yCAAyC,KAAK,GAAG,CAAC,CAAC;KAC/D;IACD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAZW,QAAA,eAAe,mBAY1B","sourcesContent":["import { Actor, Log } from 'apify';\n\n/**\n * Given a batch of entries, use several strategies to check\n * if we've reached the limit on the max number of entries\n * we're allowed to extract this run.\n */\nexport const checkEntriesCount = async (\n {\n maxCount,\n currBatchCount,\n datasetNameOrId,\n customItemCount,\n }: {\n /** Number of entries in the current batch */\n currBatchCount: number;\n /** Max number of entries allowed to extract. */\n maxCount?: number | null;\n /**\n * If given, maxCount will be ALSO compared against\n * the amount of entries already in the dataset.\n */\n datasetNameOrId?: string | null;\n /**\n * If given, maxCount will be ALSO compared against\n * this amount.\n */\n customItemCount?: number | null;\n },\n { log }: { log?: Log } = {}\n) => {\n const datasetItemCount = datasetNameOrId ? 
await getDatasetCount(datasetNameOrId, { log }) : null;\n\n if ((datasetItemCount == null && customItemCount == null) || maxCount == null) {\n return { limitReached: false, overflow: 0 };\n }\n\n // Check if we've reached the limit for max entries\n if (currBatchCount >= maxCount) {\n return { limitReached: true, overflow: currBatchCount - maxCount };\n }\n\n // Use count of items already in dataset to check if limit reached\n if (datasetItemCount != null && datasetItemCount + currBatchCount >= maxCount) {\n return { limitReached: true, overflow: datasetItemCount + currBatchCount - maxCount };\n }\n\n // Use page offset to check if limit reached (20 entries per page)\n if (customItemCount != null && customItemCount >= maxCount) {\n return { limitReached: true, overflow: customItemCount - maxCount };\n }\n\n return { limitReached: false, overflow: 0 };\n};\n\nexport const getDatasetCount = async (datasetNameOrId?: string, { log }: { log?: Log } = {}) => {\n log?.debug('Opening dataset');\n const dataset = await Actor.openDataset(datasetNameOrId);\n log?.debug('Obtaining dataset entries count');\n const datasetInfo = await dataset.getInfo();\n const count = datasetInfo?.itemCount ?? null;\n if (typeof count !== 'number') {\n log?.warning('Failed to get count of entries in dataset. We use this info to know how many items were scraped. More entries might be scraped than was set.'); // prettier-ignore\n } else {\n log?.debug(`Done obtaining dataset entries count (${count})`);\n }\n return count;\n};\n"]}