@adobe/spacecat-shared-scrape-client 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +1 -1
- package/src/clients/scrape-job-supervisor.js +46 -12
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-scrape-client-v1.0.4](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.3...@adobe/spacecat-shared-scrape-client-v1.0.4) (2025-07-24)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* (scrape-job-supervisor): add offset for URL numbering in batches ([#868](https://github.com/adobe/spacecat-shared/issues/868)) ([12789c0](https://github.com/adobe/spacecat-shared/commit/12789c0cabe33ad5e526793d645bfef421a851af))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-scrape-client-v1.0.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.2...@adobe/spacecat-shared-scrape-client-v1.0.3) (2025-07-21)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* ScrapeClient handle large url lists ([#854](https://github.com/adobe/spacecat-shared/issues/854)) ([d0768db](https://github.com/adobe/spacecat-shared/commit/d0768db101d65bc604c64473648cba0344612025))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-scrape-client-v1.0.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.1...@adobe/spacecat-shared-scrape-client-v1.0.2) (2025-07-19)
|
|
2
16
|
|
|
3
17
|
|
package/src/clients/scrape-job-supervisor.js
CHANGED
|
@@ -35,6 +35,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
35
35
|
const {
|
|
36
36
|
queues = [], // Array of scrape queues
|
|
37
37
|
scrapeWorkerQueue, // URL of the scrape worker queue
|
|
38
|
+
maxUrlsPerMessage,
|
|
38
39
|
} = config;
|
|
39
40
|
|
|
40
41
|
/**
|
|
@@ -129,9 +130,24 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
129
130
|
}
|
|
130
131
|
|
|
131
132
|
/**
|
|
132
|
-
*
|
|
133
|
-
*
|
|
134
|
-
*
|
|
133
|
+
* Split an array of URLs into batches of a specified size.
|
|
134
|
+
* @param urls
|
|
135
|
+
* @param batchSize
|
|
136
|
+
* @returns {*[]}
|
|
137
|
+
*/
|
|
138
|
+
function splitUrlsIntoBatches(urls, batchSize = 1000) {
|
|
139
|
+
const batches = [];
|
|
140
|
+
for (let i = 0; i < urls.length; i += batchSize) {
|
|
141
|
+
batches.push(urls.slice(i, i + batchSize));
|
|
142
|
+
}
|
|
143
|
+
log.info(`Split ${urls.length} URLs into ${batches.length} batches of size ${batchSize}.`);
|
|
144
|
+
return batches;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* Queue all URLs for processing by another function. Splits URL-Arrays > 1000 into multiple
|
|
149
|
+
* messages. This will enable the controller to respond with a new job ID ASAP, while the
|
|
150
|
+
* individual URLs are queued up asynchronously.
|
|
135
151
|
* @param {Array<string>} urls - Array of URL records to queue.
|
|
136
152
|
* @param {object} scrapeJob - The scrape job record.
|
|
137
153
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
@@ -143,17 +159,35 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
143
159
|
|
|
144
160
|
const options = scrapeJob.getOptions();
|
|
145
161
|
const processingType = scrapeJob.getProcessingType();
|
|
162
|
+
const totalUrlCount = urls.length;
|
|
163
|
+
const baseUrl = scrapeJob.getBaseURL();
|
|
164
|
+
let urlBatches = [];
|
|
146
165
|
|
|
147
|
-
//
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
166
|
+
// If there are more than 1000 URLs, split them into multiple messages
|
|
167
|
+
if (totalUrlCount > maxUrlsPerMessage) {
|
|
168
|
+
urlBatches = splitUrlsIntoBatches(urls, maxUrlsPerMessage);
|
|
169
|
+
log.info(`Queuing ${totalUrlCount} URLs for scrape in ${urlBatches.length} messages.`);
|
|
170
|
+
} else {
|
|
171
|
+
// If there are 1000 or fewer URLs, we can send them all in a single message
|
|
172
|
+
log.info(`Queuing ${totalUrlCount} URLs for scrape in a single message.`);
|
|
173
|
+
urlBatches = [urls]; // Wrap in an array to maintain consistent structure
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
for (const [index, batch] of urlBatches.entries()) {
|
|
177
|
+
// Calculate the offset for numbering the URLs in the batch
|
|
178
|
+
const offset = index * maxUrlsPerMessage;
|
|
179
|
+
const message = {
|
|
180
|
+
processingType,
|
|
181
|
+
jobId: scrapeJob.getId(),
|
|
182
|
+
batch,
|
|
183
|
+
batchOffset: offset,
|
|
184
|
+
customHeaders,
|
|
185
|
+
options,
|
|
186
|
+
};
|
|
155
187
|
|
|
156
|
-
|
|
188
|
+
// eslint-disable-next-line no-await-in-loop
|
|
189
|
+
await sqs.sendMessage(scrapeWorkerQueue, message, baseUrl);
|
|
190
|
+
}
|
|
157
191
|
}
|
|
158
192
|
|
|
159
193
|
/**
|