@adobe/spacecat-shared-scrape-client 2.1.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ # [@adobe/spacecat-shared-scrape-client-v2.1.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.1.0...@adobe/spacecat-shared-scrape-client-v2.1.1) (2025-08-28)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * enhance validation for scrape job configuration ([#940](https://github.com/adobe/spacecat-shared/issues/940)) ([54d0a6a](https://github.com/adobe/spacecat-shared/commit/54d0a6aa322547e13da25f2f97e1542fd5688849))
7
+
1
8
  # [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
2
9
 
3
10
 
package/README.md CHANGED
@@ -67,7 +67,9 @@ const jobData = {
67
67
  'Authorization': 'Bearer token',
68
68
  'X-Custom-Header': 'value'
69
69
  },
70
- processingType: 'default' // Optional, defaults to 'DEFAULT'
70
+ processingType: 'default', // Optional, defaults to 'DEFAULT'
71
+ maxScrapeAge: 6, // Optional, used to avoid re-scraping recently scraped URLs (in hours); 0 means always scrape
72
+ auditData: {} // Optional, this is used for step audits
71
73
  };
72
74
 
73
75
  try {
@@ -122,6 +124,27 @@ try {
122
124
  }
123
125
  ```
124
126
 
127
+ ### Getting Successful Scrape Paths
128
+
129
+ ```js
130
+ const jobId = 'your-job-id';
131
+ try {
132
+ const paths = await client.getScrapeResultPaths(jobId);
133
+ if (paths === null) {
134
+ console.log('Job not found');
135
+ } else if (paths.size === 0) {
136
+ console.log('No successful paths found for this job');
137
+ } else {
138
+ console.log(`Found ${paths.size} successful paths for job ${jobId}`);
139
+ for (const [url, path] of paths) {
140
+ console.log(`URL: ${url} -> Path: ${path}`);
141
+ }
142
+ }
143
+ } catch (error) {
144
+ console.error('Failed to get successful paths:', error.message);
145
+ }
146
+ ```
147
+
125
148
  ### Finding Jobs by Date Range
126
149
 
127
150
  ```js
@@ -192,6 +215,17 @@ When you retrieve job results, each URL result has this structure:
192
215
  }
193
216
  ```
194
217
 
218
+ ## Path Results Format
219
+
220
+ When you retrieve successful scrape paths using `getScrapeResultPaths()`, the response is a JavaScript Map object that maps URLs to their corresponding result file paths. Only URLs with `COMPLETE` status are included:
221
+
222
+ ```js
223
+ Map(2) {
224
+ 'https://example.com/page1' => 'path/to/result1',
225
+ 'https://example.com/page2' => 'path/to/result2'
226
+ }
227
+ ```
228
+
195
229
  ## Configuration
196
230
 
197
231
  The client uses the `SCRAPE_JOB_CONFIGURATION` environment variable for default settings:
@@ -248,3 +282,37 @@ npm run clean
248
282
  - **Repository**: [GitHub](https://github.com/adobe/spacecat-shared.git)
249
283
  - **Issue Tracking**: [GitHub Issues](https://github.com/adobe/spacecat-shared/issues)
250
284
  - **License**: Apache-2.0
285
+
286
+ ### ScrapeClient Workflow Overview
287
+
288
+ <img width="889" height="508" alt="Screenshot 2025-08-27 at 08 56 16" src="https://github.com/user-attachments/assets/9ccc1388-ed6b-4bf0-a059-d40e6e90aff8" />
289
+
290
+ When a new scrape job is created, the client performs the following steps:
291
+ 1. Creates a new job entry in the database with status `PENDING`.
292
+ 2. Splits the provided URLs into batches based on the `maxUrlsPerMessage` configuration (this is limited due to SQS message size constraints).
293
+ 3. For each batch, it creates a message in the SQS queue to the scrape-job-manager.
294
+
295
+ In the scrape-job-manager the following steps are performed:
296
+ 1. All existing ScrapeURLs are fetched for the base URL to avoid re-scraping recently scraped URLs (based on the `maxScrapeAge` parameter).
297
+ 2. For all URLs a new ScrapeURL entry is created with status `PENDING`.
298
+ 3. Each URL in the batch is checked against existing ScrapeURLs.
299
+ - Already scraped URLs (with status 'COMPLETE' or 'PENDING') are marked to be skipped with the ID of the existing ScrapeURL and the isOriginal flag set to false.
300
+ - URLs that need to be scraped are marked with the isOriginal flag set to true. (The isOriginal flag is used to avoid the sliding window problem when re-scraping URLs.)
301
+ - All URLs are numbered with based on their position in the original list to be able to track the job progress.
302
+ 4. For each URL, a message is created in the SQS queue to the content-scraper.
303
+
304
+ In the content-scraper the following steps are performed:
305
+ 1. The content-scraper checks if an incoming URL message is marked to be skipped. If so, it just sends a message to the content-processor.
306
+ 2. If the URL is not marked to be skipped, the content-scraper scrapes the URL.
307
+ 3. The content-scraper creates a message in the SQS queue to the content-processor with the result of the scraping operation.
308
+
309
+ In the content-processor the following steps are performed:
310
+ 1. The content-processor processes the incoming message from the content-scraper.
311
+ 2. If the URL was skipped, it fetches the existing ScrapeURL entry and updates the new ScrapeURL entry with the same path and status.
312
+ 3. If the URL was scraped, it updates the ScrapeURL entry with the result of the scraping operation (status, path, reason).
313
+ 4. The content-processor updates the ScrapeJob entry with the new counts (success, failed, redirect).
314
+ 5. If all URLs of a job are processed (based on their number and the totalUrlCount of the job), it:
315
+ - performs a cleanup step to set all PENDING URLs to FAILED that were not processed (e.g. due to timeouts).
316
+ - updates the counts of the job again.
317
+ - sets the job status to COMPLETE and sets the endedAt timestamp.
318
+ - Optionally, it can send a SQS message (e.g. to trigger the next audit step).
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-scrape-client",
3
- "version": "2.1.0",
3
+ "version": "2.1.1",
4
4
  "description": "Shared modules of the Spacecat Services - Scrape Client",
5
5
  "type": "module",
6
6
  "engines": {
@@ -34,6 +34,59 @@ export default class ScrapeClient {
34
34
  }
35
35
  }
36
36
 
37
+ static validateScrapeConfiguration(scrapeJobConfiguration) {
38
+ if (!isObject(scrapeJobConfiguration)) {
39
+ throw new Error('Invalid scrape configuration: configuration must be an object');
40
+ }
41
+
42
+ // Validate scrapeWorkerQueue
43
+ if (!hasText(scrapeJobConfiguration.scrapeWorkerQueue)) {
44
+ throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a non-empty string');
45
+ }
46
+
47
+ if (!isValidUrl(scrapeJobConfiguration.scrapeWorkerQueue)) {
48
+ throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a valid URL');
49
+ }
50
+
51
+ // Validate s3Bucket
52
+ if (!hasText(scrapeJobConfiguration.s3Bucket)) {
53
+ throw new Error('Invalid scrape configuration: s3Bucket must be a non-empty string');
54
+ }
55
+
56
+ // Validate options
57
+ if (scrapeJobConfiguration.options !== undefined) {
58
+ if (!isObject(scrapeJobConfiguration.options)) {
59
+ throw new Error('Invalid scrape configuration: options must be an object');
60
+ }
61
+
62
+ const { options } = scrapeJobConfiguration;
63
+
64
+ if (options.enableJavascript !== undefined && typeof options.enableJavascript !== 'boolean') {
65
+ throw new Error('Invalid scrape configuration: options.enableJavascript must be a boolean');
66
+ }
67
+
68
+ if (options.hideConsentBanners !== undefined && typeof options.hideConsentBanners !== 'boolean') {
69
+ throw new Error('Invalid scrape configuration: options.hideConsentBanners must be a boolean');
70
+ }
71
+ }
72
+
73
+ // Validate maxUrlsPerJob
74
+ if (scrapeJobConfiguration.maxUrlsPerJob !== undefined) {
75
+ if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerJob)
76
+ || scrapeJobConfiguration.maxUrlsPerJob <= 0) {
77
+ throw new Error('Invalid scrape configuration: maxUrlsPerJob must be a positive integer');
78
+ }
79
+ }
80
+
81
+ // Validate maxUrlsPerMessage
82
+ if (scrapeJobConfiguration.maxUrlsPerMessage !== undefined) {
83
+ if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerMessage)
84
+ || scrapeJobConfiguration.maxUrlsPerMessage <= 0) {
85
+ throw new Error('Invalid scrape configuration: maxUrlsPerMessage must be a positive integer');
86
+ }
87
+ }
88
+ }
89
+
37
90
  validateRequestData(data) {
38
91
  if (!isObject(data)) {
39
92
  throw new Error('Invalid request: missing application/json request data');
@@ -103,8 +156,10 @@ export default class ScrapeClient {
103
156
  let scrapeConfiguration = {};
104
157
  try {
105
158
  scrapeConfiguration = JSON.parse(this.config.env.SCRAPE_JOB_CONFIGURATION);
159
+ ScrapeClient.validateScrapeConfiguration(scrapeConfiguration);
106
160
  } catch (error) {
107
- this.config.log.error(`Failed to parse scrape job configuration: ${error.message}`);
161
+ this.config.log.error(`Failed to parse or validate scrape job configuration: ${error.message}`);
162
+ throw new Error(`Invalid scrape job configuration: ${error.message}`);
108
163
  }
109
164
  this.scrapeConfiguration = scrapeConfiguration;
110
165
 
@@ -229,6 +284,11 @@ export default class ScrapeClient {
229
284
  }
230
285
  }
231
286
 
287
+ /**
288
+ * Get the result paths of a scrape job
289
+ * @param {string} jobId - The ID of the job to fetch.
290
+ * @return {Promise<Map<string, string>>} A map of URLs to their corresponding result paths.
291
+ */
232
292
  async getScrapeResultPaths(jobId) {
233
293
  try {
234
294
  const job = await this.scrapeSupervisor.getScrapeJob(jobId);
@@ -122,7 +122,7 @@ function ScrapeJobSupervisor(services, config) {
122
122
  * @param {object} scrapeJob - The scrape job record.
123
123
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
124
124
  * @param {string} maxScrapeAge - The maximum age of the scrape job
125
- * @param auditContext
125
+ * @param {object} auditData - Step-Audit specific data
126
126
  */
127
127
  // eslint-disable-next-line max-len
128
128
  async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {