@adobe/spacecat-shared-scrape-client 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ # [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
2
+
3
+
4
+ ### Features
5
+
6
+ * add scrape-client destination ([#913](https://github.com/adobe/spacecat-shared/issues/913)) ([e208a87](https://github.com/adobe/spacecat-shared/commit/e208a87214874a2708ac2d7614fcfd4c0770fe17))
7
+
1
8
  # [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
2
9
 
3
10
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-scrape-client",
3
- "version": "2.0.0",
3
+ "version": "2.1.0",
4
4
  "description": "Shared modules of the Spacecat Services - Scrape Client",
5
5
  "type": "module",
6
6
  "engines": {
@@ -11,8 +11,7 @@
11
11
  */
12
12
 
13
13
  import {
14
- isIsoDate, isObject, isValidUrl, isNonEmptyArray, hasText,
15
- isValidUUID,
14
+ hasText, isIsoDate, isNonEmptyArray, isObject, isValidUrl, isValidUUID,
16
15
  } from '@adobe/spacecat-shared-utils';
17
16
  import { ScrapeJob as ScrapeJobModel } from '@adobe/spacecat-shared-data-access';
18
17
  import { ScrapeJobDto } from './scrapeJobDto.js';
@@ -132,6 +131,7 @@ export default class ScrapeClient {
132
131
  customHeaders,
133
132
  processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
134
133
  maxScrapeAge = 24,
134
+ auditData = {},
135
135
  } = data;
136
136
 
137
137
  this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
@@ -149,6 +149,7 @@ export default class ScrapeClient {
149
149
  mergedOptions,
150
150
  customHeaders,
151
151
  maxScrapeAge,
152
+ auditData,
152
153
  );
153
154
  return ScrapeJobDto.toJSON(job);
154
155
  } catch (error) {
@@ -228,6 +229,24 @@ export default class ScrapeClient {
228
229
  }
229
230
  }
230
231
 
232
+ async getScrapeResultPaths(jobId) {
233
+ try {
234
+ const job = await this.scrapeSupervisor.getScrapeJob(jobId);
235
+ if (!job) {
236
+ return null;
237
+ }
238
+ const { ScrapeUrl } = this.config.dataAccess;
239
+ const scrapeUrls = await ScrapeUrl.allByScrapeJobId(job.getId());
240
+ return scrapeUrls
241
+ .filter((url) => url.getStatus() === ScrapeJobModel.ScrapeUrlStatus.COMPLETE)
242
+ .reduce((map, url) => map.set(url.getUrl(), url.getPath()), new Map());
243
+ } catch (error) {
244
+ const msgError = `Failed to fetch the scrape job result: ${error.message}`;
245
+ this.config.log.error(msgError);
246
+ throw new Error(msgError);
247
+ }
248
+ }
249
+
231
250
  /**
232
251
  * Get all scrape jobs by baseURL and processing type
233
252
  * @param {string} baseURL - The baseURL of the jobs to fetch.
@@ -122,10 +122,12 @@ function ScrapeJobSupervisor(services, config) {
122
122
  * @param {object} scrapeJob - The scrape job record.
123
123
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
124
124
  * @param {number} maxScrapeAge - The maximum age of the scrape job
125
+ * @param {object} auditData - Optional audit data forwarded along with the queued URLs.
125
126
  */
126
- async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge) {
127
+ // eslint-disable-next-line max-len
128
+ async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
127
129
  log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
128
- + ` URLs. This new job has claimed: ${scrapeJob.getScrapeQueueId()} `
130
+ + ' URLs.'
129
131
  + `(jobId: ${scrapeJob.getId()})`);
130
132
 
131
133
  const options = scrapeJob.getOptions();
@@ -155,6 +157,7 @@ function ScrapeJobSupervisor(services, config) {
155
157
  customHeaders,
156
158
  options,
157
159
  maxScrapeAge,
160
+ auditData,
158
161
  };
159
162
 
160
163
  // eslint-disable-next-line no-await-in-loop
@@ -168,7 +171,8 @@ function ScrapeJobSupervisor(services, config) {
168
171
  * @param {string} processingType - The type of processing to perform.
169
172
  * @param {object} options - Optional configuration params for the scrape job.
170
173
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
171
- * @param {string} maxScrapeAge - The maximum age of the scrape job
174
+ * @param {number} maxScrapeAge - The maximum age of the scrape job
175
+ * @param {object} auditContext - Optional audit data passed through to queueUrlsForScrapeWorker.
172
176
  * @returns {Promise<ScrapeJob>} newly created job object
173
177
  */
174
178
  async function startNewJob(
@@ -177,6 +181,7 @@ function ScrapeJobSupervisor(services, config) {
177
181
  options,
178
182
  customHeaders,
179
183
  maxScrapeAge,
184
+ auditContext,
180
185
  ) {
181
186
  const newScrapeJob = await createNewScrapeJob(
182
187
  urls,
@@ -196,7 +201,7 @@ function ScrapeJobSupervisor(services, config) {
196
201
 
197
202
  // Queue all URLs for scrape as a single message. This enables the controller to respond with
198
203
  // a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
199
- await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge);
204
+ await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge, auditContext);
200
205
 
201
206
  return newScrapeJob;
202
207
  }