npm - @adobe/spacecat-shared-scrape-client - Versions diffs - 1.0.7 → 2.1.0 - Mend

@adobe/spacecat-shared-scrape-client 1.0.7 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +19 -0
package/README.md +1 -0
package/package.json +1 -1
package/src/clients/scrape-client.js +29 -3
package/src/clients/scrape-job-supervisor.js +13 -43

package/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,22 @@
+# [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
+### Features
+* add scrape-client destination ([#913](https://github.com/adobe/spacecat-shared/issues/913)) ([e208a87](https://github.com/adobe/spacecat-shared/commit/e208a87214874a2708ac2d7614fcfd4c0770fe17))
+# [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
+### Features
+* re-scraping of URLs ([b889a19](https://github.com/adobe/spacecat-shared/commit/b889a19b1cec20b1f1dc32a89b34ab5125fa90e6))
+### BREAKING CHANGES
+* ScrapeClient does not choose a scrape queue anymore. This is done in Scrape Job Manager.
 # [@adobe/spacecat-shared-scrape-client-v1.0.7](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.6...@adobe/spacecat-shared-scrape-client-v1.0.7) (2025-08-09)

package/README.md CHANGED Viewed

@@ -1,5 +1,6 @@
 # Spacecat Shared - Scrape Client
 A JavaScript client for managing web scraping jobs, part of the SpaceCat Shared library. The ScrapeClient provides a comprehensive interface for creating, monitoring, and retrieving results from web scraping operations without needing to access the SpaceCat API service directly.
 ## Installation

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adobe/spacecat-shared-scrape-client",
-  "version": "1.0.7",
+  "version": "2.1.0",
   "description": "Shared modules of the Spacecat Services - Scrape Client",
   "type": "module",
   "engines": {

package/src/clients/scrape-client.js CHANGED Viewed

@@ -11,8 +11,7 @@
  */
 import {
-  isIsoDate, isObject, isValidUrl, isNonEmptyArray, hasText,
-  isValidUUID,
+  hasText, isIsoDate, isNonEmptyArray, isObject, isValidUrl, isValidUUID,
 } from '@adobe/spacecat-shared-utils';
 import { ScrapeJob as ScrapeJobModel } from '@adobe/spacecat-shared-data-access';
 import { ScrapeJobDto } from './scrapeJobDto.js';
@@ -118,6 +117,8 @@ export default class ScrapeClient {
   /**
    * Create and start a new scrape job.
    * @param {object} data - json data for scrape job
+   * @param {number} data.maxScrapeAge - (optional) max age of scrapes in hours
+   * default is 24, 0 to force rescrape
    * @returns {Promise<Response>} newly created job object
    */
   async createScrapeJob(data) {
@@ -125,7 +126,12 @@ export default class ScrapeClient {
       this.validateRequestData(data);
       const {
-        urls, options, customHeaders, processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
+        urls,
+        options,
+        customHeaders,
+        processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
+        maxScrapeAge = 24,
+        auditData = {},
       } = data;
       this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
@@ -142,6 +148,8 @@ export default class ScrapeClient {
         processingType,
         mergedOptions,
         customHeaders,
+        maxScrapeAge,
+        auditData,
       );
       return ScrapeJobDto.toJSON(job);
     } catch (error) {
@@ -221,6 +229,24 @@ export default class ScrapeClient {
     }
   }
+  async getScrapeResultPaths(jobId) {
+    try {
+      const job = await this.scrapeSupervisor.getScrapeJob(jobId);
+      if (!job) {
+        return null;
+      }
+      const { ScrapeUrl } = this.config.dataAccess;
+      const scrapeUrls = await ScrapeUrl.allByScrapeJobId(job.getId());
+      return scrapeUrls
+        .filter((url) => url.getStatus() === ScrapeJobModel.ScrapeUrlStatus.COMPLETE)
+        .reduce((map, url) => map.set(url.getUrl(), url.getPath()), new Map());
+    } catch (error) {
+      const msgError = `Failed to fetch the scrape job result: ${error.message}`;
+      this.config.log.error(msgError);
+      throw new Error(msgError);
+    }
+  }
   /**
    * Get all scrape jobs by baseURL and processing type
    * @param {string} baseURL - The baseURL of the jobs to fetch.

package/src/clients/scrape-job-supervisor.js CHANGED Viewed

@@ -33,37 +33,10 @@ function ScrapeJobSupervisor(services, config) {
   const { ScrapeJob } = dataAccess;
   const {
-    queues = [], // Array of scrape queues
     scrapeWorkerQueue, // URL of the scrape worker queue
     maxUrlsPerMessage,
   } = config;
-  /**
-   * Get the queue with the least number of messages.
-   */
-  async function getAvailableScrapeQueue() {
-    const countMessages = async (queue) => {
-      const count = await sqs.getQueueMessageCount(queue);
-      return { queue, count };
-    };
-    const arrProm = queues.map(
-      (queue) => countMessages(queue),
-    );
-    const queueMessageCounts = await Promise.all(arrProm);
-    if (queueMessageCounts.length === 0) {
-      return null;
-    }
-    // get the queue with the lowest number of messages
-    const queueWithLeastMessages = queueMessageCounts.reduce(
-      (min, current) => (min.count < current.count ? min : current),
-    );
-    log.info(`Queue with least messages: ${queueWithLeastMessages.queue}`);
-    return queueWithLeastMessages.queue;
-  }
   function determineBaseURL(urls) {
     // Initially, we will just use the domain of the first URL
     const url = new URL(urls[0]);
@@ -74,7 +47,6 @@ function ScrapeJobSupervisor(services, config) {
    * Create a new scrape job by claiming one of the free scrape queues, persisting the scrape job
    * metadata, and setting the job status to 'RUNNING'.
    * @param {Array<string>} urls - The list of URLs to scrape.
-   * @param {string} scrapeQueueId - Name of the queue to use for this scrape job.
    * @param {string} processingType - The scrape handler to be used for the scrape job.
    * @param {object} options - Client provided options for the scrape job.
    * @param {object} customHeaders - Custom headers to be sent with each request.
@@ -82,14 +54,12 @@ function ScrapeJobSupervisor(services, config) {
    */
   async function createNewScrapeJob(
     urls,
-    scrapeQueueId,
     processingType,
     options,
     customHeaders = null,
   ) {
     const jobData = {
       baseURL: determineBaseURL(urls),
-      scrapeQueueId,
       processingType,
       options,
       urlCount: urls.length,
@@ -151,10 +121,13 @@ function ScrapeJobSupervisor(services, config) {
    * @param {Array<string>} urls - Array of URL records to queue.
    * @param {object} scrapeJob - The scrape job record.
    * @param {object} customHeaders - Optional custom headers to be sent with each request.
+   * @param {number} maxScrapeAge - The maximum age of scrapes in hours (0 forces a rescrape)
+   * @param {object} auditData - Audit data forwarded with each queued URL batch
    */
-  async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders) {
+  // eslint-disable-next-line max-len
+  async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
     log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
-      + ` URLs. This new job has claimed: ${scrapeJob.getScrapeQueueId()} `
+      + ' URLs.'
       + `(jobId: ${scrapeJob.getId()})`);
     const options = scrapeJob.getOptions();
@@ -183,6 +156,8 @@ function ScrapeJobSupervisor(services, config) {
         batchOffset: offset,
         customHeaders,
         options,
+        maxScrapeAge,
+        auditData,
       };
       // eslint-disable-next-line no-await-in-loop
@@ -193,8 +168,11 @@ function ScrapeJobSupervisor(services, config) {
   /**
    * Starts a new scrape job.
    * @param {Array<string>} urls - The URLs to scrape.
+   * @param {string} processingType - The type of processing to perform.
    * @param {object} options - Optional configuration params for the scrape job.
    * @param {object} customHeaders - Optional custom headers to be sent with each request.
+   * @param {number} maxScrapeAge - The maximum age of scrapes in hours
+   * @param {object} auditContext - Audit data passed on to queueUrlsForScrapeWorker
    * @returns {Promise<ScrapeJob>} newly created job object
    */
   async function startNewJob(
@@ -202,18 +180,11 @@ function ScrapeJobSupervisor(services, config) {
     processingType,
     options,
     customHeaders,
+    maxScrapeAge,
+    auditContext,
   ) {
-    // Determine if there is a free scrape queue
-    const scrapeQueueId = await getAvailableScrapeQueue();
-    if (scrapeQueueId === null) {
-      throw new Error('Service Unavailable: No scrape queue available');
-    }
-    // If a queue is available, create the scrape-job record in dataAccess:
     const newScrapeJob = await createNewScrapeJob(
       urls,
-      scrapeQueueId,
       processingType,
       options,
       customHeaders,
@@ -224,14 +195,13 @@ function ScrapeJobSupervisor(services, config) {
       + `- baseUrl: ${newScrapeJob.getBaseURL()}\n`
       + `- urlCount: ${urls.length}\n`
       + `- jobId: ${newScrapeJob.getId()}\n`
-      + `- scrapeQueueId: ${scrapeQueueId}\n`
       + `- customHeaders: ${JSON.stringify(customHeaders)}\n`
       + `- options: ${JSON.stringify(options)}`,
     );
     // Queue all URLs for scrape as a single message. This enables the controller to respond with
     // a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
-    await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders);
+    await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge, auditContext);
     return newScrapeJob;
   }