@adobe/spacecat-shared-scrape-client 2.0.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ # [@adobe/spacecat-shared-scrape-client-v2.1.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.1.0...@adobe/spacecat-shared-scrape-client-v2.1.1) (2025-08-28)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * enhance validation for scrape job configuration ([#940](https://github.com/adobe/spacecat-shared/issues/940)) ([54d0a6a](https://github.com/adobe/spacecat-shared/commit/54d0a6aa322547e13da25f2f97e1542fd5688849))
7
+
8
+ # [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
9
+
10
+
11
+ ### Features
12
+
13
+ * add scrape-client destination ([#913](https://github.com/adobe/spacecat-shared/issues/913)) ([e208a87](https://github.com/adobe/spacecat-shared/commit/e208a87214874a2708ac2d7614fcfd4c0770fe17))
14
+
1
15
  # [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
2
16
 
3
17
 
package/README.md CHANGED
@@ -67,7 +67,9 @@ const jobData = {
67
67
  'Authorization': 'Bearer token',
68
68
  'X-Custom-Header': 'value'
69
69
  },
70
- processingType: 'default' // Optional, defaults to 'DEFAULT'
70
+ processingType: 'default', // Optional, defaults to 'DEFAULT'
71
+ maxScrapeAge: 6, // Optional, used to avoid re-scraping recently scraped URLs (in hours); 0 means always scrape
72
+ auditData: {} // Optional, this is used for step audits
71
73
  };
72
74
 
73
75
  try {
@@ -122,6 +124,27 @@ try {
122
124
  }
123
125
  ```
124
126
 
127
+ ### Getting Successful Scrape Paths
128
+
129
+ ```js
130
+ const jobId = 'your-job-id';
131
+ try {
132
+ const paths = await client.getScrapeResultPaths(jobId);
133
+ if (paths === null) {
134
+ console.log('Job not found');
135
+ } else if (paths.size === 0) {
136
+ console.log('No successful paths found for this job');
137
+ } else {
138
+ console.log(`Found ${paths.size} successful paths for job ${jobId}`);
139
+ for (const [url, path] of paths) {
140
+ console.log(`URL: ${url} -> Path: ${path}`);
141
+ }
142
+ }
143
+ } catch (error) {
144
+ console.error('Failed to get successful paths:', error.message);
145
+ }
146
+ ```
147
+
125
148
  ### Finding Jobs by Date Range
126
149
 
127
150
  ```js
@@ -192,6 +215,17 @@ When you retrieve job results, each URL result has this structure:
192
215
  }
193
216
  ```
194
217
 
218
+ ## Path Results Format
219
+
220
+ When you retrieve successful scrape paths using `getScrapeResultPaths()`, the response is a JavaScript Map object that maps URLs to their corresponding result file paths. Only URLs with `COMPLETE` status are included:
221
+
222
+ ```js
223
+ Map(2) {
224
+ 'https://example.com/page1' => 'path/to/result1',
225
+ 'https://example.com/page2' => 'path/to/result2'
226
+ }
227
+ ```
228
+
195
229
  ## Configuration
196
230
 
197
231
  The client uses the `SCRAPE_JOB_CONFIGURATION` environment variable for default settings:
@@ -248,3 +282,37 @@ npm run clean
248
282
  - **Repository**: [GitHub](https://github.com/adobe/spacecat-shared.git)
249
283
  - **Issue Tracking**: [GitHub Issues](https://github.com/adobe/spacecat-shared/issues)
250
284
  - **License**: Apache-2.0
285
+
286
+ ### ScrapeClient Workflow Overview
287
+
288
+ <img width="889" height="508" alt="Screenshot 2025-08-27 at 08 56 16" src="https://github.com/user-attachments/assets/9ccc1388-ed6b-4bf0-a059-d40e6e90aff8" />
289
+
290
+ When a new scrape job is created, the client performs the following steps:
291
+ 1. Creates a new job entry in the database with status `PENDING`.
292
+ 2. Splits the provided URLs into batches based on the `maxUrlsPerMessage` configuration (this is limited due to SQS message size constraints).
293
+ 3. For each batch, it creates a message in the SQS queue to the scrape-job-manager.
294
+
295
+ In the scrape-job-manager the following steps are performed:
296
+ 1. All existing ScrapeURLs are fetched for the base URL to avoid re-scraping recently scraped URLs (based on the `maxScrapeAge` parameter).
297
+ 2. For all URLs a new ScrapeURL entry is created with status `PENDING`.
298
+ 3. Each URL in the batch is checked against existing ScrapeURLs.
299
+ - Already scraped URLs (with status 'COMPLETE' or 'PENDING') are marked to be skipped with the ID of the existing ScrapeURL and the isOriginal flag set to false.
300
+ - URLs that need to be scraped are marked with the isOriginal flag set to true. (The isOriginal flag is used to avoid the sliding window problem when re-scraping URLs.)
301
+ - All URLs are numbered based on their position in the original list to be able to track the job progress.
302
+ 4. For each URL, a message is created in the SQS queue to the content-scraper.
303
+
304
+ In the content-scraper the following steps are performed:
305
+ 1. The content-scraper checks if an incoming URL message is marked to be skipped. If so, it just sends a message to the content-processor.
306
+ 2. If the URL is not marked to be skipped, the content-scraper scrapes the URL.
307
+ 3. The content-scraper creates a message in the SQS queue to the content-processor with the result of the scraping operation.
308
+
309
+ In the content-processor the following steps are performed:
310
+ 1. The content-processor processes the incoming message from the content-scraper.
311
+ 2. If the URL was skipped, it fetches the existing ScrapeURL entry and updates the new ScrapeURL entry with the same path and status.
312
+ 3. If the URL was scraped, it updates the ScrapeURL entry with the result of the scraping operation (status, path, reason).
313
+ 4. The content-processor updates the ScrapeJob entry with the new counts (success, failed, redirect).
314
+ 5. If all URLs of a job are processed (based on their number and the totalUrlCount of the job), it:
315
+ - performs a cleanup step to set all PENDING URLs to FAILED that were not processed (e.g. due to timeouts).
316
+ - updates the counts of the job again.
317
+ - sets the job status to COMPLETE and sets the endedAt timestamp.
318
+ - Optionally, it can send a SQS message (e.g. to trigger the next audit step).
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-scrape-client",
3
- "version": "2.0.0",
3
+ "version": "2.1.1",
4
4
  "description": "Shared modules of the Spacecat Services - Scrape Client",
5
5
  "type": "module",
6
6
  "engines": {
@@ -11,8 +11,7 @@
11
11
  */
12
12
 
13
13
  import {
14
- isIsoDate, isObject, isValidUrl, isNonEmptyArray, hasText,
15
- isValidUUID,
14
+ hasText, isIsoDate, isNonEmptyArray, isObject, isValidUrl, isValidUUID,
16
15
  } from '@adobe/spacecat-shared-utils';
17
16
  import { ScrapeJob as ScrapeJobModel } from '@adobe/spacecat-shared-data-access';
18
17
  import { ScrapeJobDto } from './scrapeJobDto.js';
@@ -35,6 +34,59 @@ export default class ScrapeClient {
35
34
  }
36
35
  }
37
36
 
37
+ static validateScrapeConfiguration(scrapeJobConfiguration) {
38
+ if (!isObject(scrapeJobConfiguration)) {
39
+ throw new Error('Invalid scrape configuration: configuration must be an object');
40
+ }
41
+
42
+ // Validate scrapeWorkerQueue
43
+ if (!hasText(scrapeJobConfiguration.scrapeWorkerQueue)) {
44
+ throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a non-empty string');
45
+ }
46
+
47
+ if (!isValidUrl(scrapeJobConfiguration.scrapeWorkerQueue)) {
48
+ throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a valid URL');
49
+ }
50
+
51
+ // Validate s3Bucket
52
+ if (!hasText(scrapeJobConfiguration.s3Bucket)) {
53
+ throw new Error('Invalid scrape configuration: s3Bucket must be a non-empty string');
54
+ }
55
+
56
+ // Validate options
57
+ if (scrapeJobConfiguration.options !== undefined) {
58
+ if (!isObject(scrapeJobConfiguration.options)) {
59
+ throw new Error('Invalid scrape configuration: options must be an object');
60
+ }
61
+
62
+ const { options } = scrapeJobConfiguration;
63
+
64
+ if (options.enableJavascript !== undefined && typeof options.enableJavascript !== 'boolean') {
65
+ throw new Error('Invalid scrape configuration: options.enableJavascript must be a boolean');
66
+ }
67
+
68
+ if (options.hideConsentBanners !== undefined && typeof options.hideConsentBanners !== 'boolean') {
69
+ throw new Error('Invalid scrape configuration: options.hideConsentBanners must be a boolean');
70
+ }
71
+ }
72
+
73
+ // Validate maxUrlsPerJob
74
+ if (scrapeJobConfiguration.maxUrlsPerJob !== undefined) {
75
+ if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerJob)
76
+ || scrapeJobConfiguration.maxUrlsPerJob <= 0) {
77
+ throw new Error('Invalid scrape configuration: maxUrlsPerJob must be a positive integer');
78
+ }
79
+ }
80
+
81
+ // Validate maxUrlsPerMessage
82
+ if (scrapeJobConfiguration.maxUrlsPerMessage !== undefined) {
83
+ if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerMessage)
84
+ || scrapeJobConfiguration.maxUrlsPerMessage <= 0) {
85
+ throw new Error('Invalid scrape configuration: maxUrlsPerMessage must be a positive integer');
86
+ }
87
+ }
88
+ }
89
+
38
90
  validateRequestData(data) {
39
91
  if (!isObject(data)) {
40
92
  throw new Error('Invalid request: missing application/json request data');
@@ -104,8 +156,10 @@ export default class ScrapeClient {
104
156
  let scrapeConfiguration = {};
105
157
  try {
106
158
  scrapeConfiguration = JSON.parse(this.config.env.SCRAPE_JOB_CONFIGURATION);
159
+ ScrapeClient.validateScrapeConfiguration(scrapeConfiguration);
107
160
  } catch (error) {
108
- this.config.log.error(`Failed to parse scrape job configuration: ${error.message}`);
161
+ this.config.log.error(`Failed to parse or validate scrape job configuration: ${error.message}`);
162
+ throw new Error(`Invalid scrape job configuration: ${error.message}`);
109
163
  }
110
164
  this.scrapeConfiguration = scrapeConfiguration;
111
165
 
@@ -132,6 +186,7 @@ export default class ScrapeClient {
132
186
  customHeaders,
133
187
  processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
134
188
  maxScrapeAge = 24,
189
+ auditData = {},
135
190
  } = data;
136
191
 
137
192
  this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
@@ -149,6 +204,7 @@ export default class ScrapeClient {
149
204
  mergedOptions,
150
205
  customHeaders,
151
206
  maxScrapeAge,
207
+ auditData,
152
208
  );
153
209
  return ScrapeJobDto.toJSON(job);
154
210
  } catch (error) {
@@ -228,6 +284,29 @@ export default class ScrapeClient {
228
284
  }
229
285
  }
230
286
 
287
+ /**
288
+ * Get the result paths of a scrape job
289
+ * @param {string} jobId - The ID of the job to fetch.
290
+ * @return {Promise<Map<string, string>>} A map of URLs to their corresponding result paths.
291
+ */
292
+ async getScrapeResultPaths(jobId) {
293
+ try {
294
+ const job = await this.scrapeSupervisor.getScrapeJob(jobId);
295
+ if (!job) {
296
+ return null;
297
+ }
298
+ const { ScrapeUrl } = this.config.dataAccess;
299
+ const scrapeUrls = await ScrapeUrl.allByScrapeJobId(job.getId());
300
+ return scrapeUrls
301
+ .filter((url) => url.getStatus() === ScrapeJobModel.ScrapeUrlStatus.COMPLETE)
302
+ .reduce((map, url) => map.set(url.getUrl(), url.getPath()), new Map());
303
+ } catch (error) {
304
+ const msgError = `Failed to fetch the scrape job result: ${error.message}`;
305
+ this.config.log.error(msgError);
306
+ throw new Error(msgError);
307
+ }
308
+ }
309
+
231
310
  /**
232
311
  * Get all scrape jobs by baseURL and processing type
233
312
  * @param {string} baseURL - The baseURL of the jobs to fetch.
@@ -122,10 +122,12 @@ function ScrapeJobSupervisor(services, config) {
122
122
  * @param {object} scrapeJob - The scrape job record.
123
123
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
124
124
  * @param {string} maxScrapeAge - The maximum age of the scrape job
125
+ * @param {object} auditData - Step-Audit specific data
125
126
  */
126
- async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge) {
127
+ // eslint-disable-next-line max-len
128
+ async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
127
129
  log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
128
- + ` URLs. This new job has claimed: ${scrapeJob.getScrapeQueueId()} `
130
+ + ' URLs.'
129
131
  + `(jobId: ${scrapeJob.getId()})`);
130
132
 
131
133
  const options = scrapeJob.getOptions();
@@ -155,6 +157,7 @@ function ScrapeJobSupervisor(services, config) {
155
157
  customHeaders,
156
158
  options,
157
159
  maxScrapeAge,
160
+ auditData,
158
161
  };
159
162
 
160
163
  // eslint-disable-next-line no-await-in-loop
@@ -168,7 +171,8 @@ function ScrapeJobSupervisor(services, config) {
168
171
  * @param {string} processingType - The type of processing to perform.
169
172
  * @param {object} options - Optional configuration params for the scrape job.
170
173
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
171
- * @param {string} maxScrapeAge - The maximum age of the scrape job
174
+ * @param {number} maxScrapeAge - The maximum age of the scrape job
175
+ * @param auditContext
172
176
  * @returns {Promise<ScrapeJob>} newly created job object
173
177
  */
174
178
  async function startNewJob(
@@ -177,6 +181,7 @@ function ScrapeJobSupervisor(services, config) {
177
181
  options,
178
182
  customHeaders,
179
183
  maxScrapeAge,
184
+ auditContext,
180
185
  ) {
181
186
  const newScrapeJob = await createNewScrapeJob(
182
187
  urls,
@@ -196,7 +201,7 @@ function ScrapeJobSupervisor(services, config) {
196
201
 
197
202
  // Queue all URLs for scrape as a single message. This enables the controller to respond with
198
203
  // a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
199
- await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge);
204
+ await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge, auditContext);
200
205
 
201
206
  return newScrapeJob;
202
207
  }