@adobe/spacecat-shared-scrape-client 2.0.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +69 -1
- package/package.json +1 -1
- package/src/clients/scrape-client.js +82 -3
- package/src/clients/scrape-job-supervisor.js +9 -4
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-scrape-client-v2.1.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.1.0...@adobe/spacecat-shared-scrape-client-v2.1.1) (2025-08-28)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* enhance validation for scrape job configuration ([#940](https://github.com/adobe/spacecat-shared/issues/940)) ([54d0a6a](https://github.com/adobe/spacecat-shared/commit/54d0a6aa322547e13da25f2f97e1542fd5688849))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* add scrape-client destination ([#913](https://github.com/adobe/spacecat-shared/issues/913)) ([e208a87](https://github.com/adobe/spacecat-shared/commit/e208a87214874a2708ac2d7614fcfd4c0770fe17))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
|
|
2
16
|
|
|
3
17
|
|
package/README.md
CHANGED
|
@@ -67,7 +67,9 @@ const jobData = {
|
|
|
67
67
|
'Authorization': 'Bearer token',
|
|
68
68
|
'X-Custom-Header': 'value'
|
|
69
69
|
},
|
|
70
|
-
processingType: 'default' // Optional, defaults to 'DEFAULT'
|
|
70
|
+
processingType: 'default', // Optional, defaults to 'DEFAULT'
|
|
71
|
+
maxScrapeAge: 6, // Optional, max age in hours used to avoid re-scraping recently scraped URLs; 0 means always scrape
|
|
72
|
+
auditData: {} // Optional, this is used for step audits
|
|
71
73
|
};
|
|
72
74
|
|
|
73
75
|
try {
|
|
@@ -122,6 +124,27 @@ try {
|
|
|
122
124
|
}
|
|
123
125
|
```
|
|
124
126
|
|
|
127
|
+
### Getting Successful Scrape Paths
|
|
128
|
+
|
|
129
|
+
```js
|
|
130
|
+
const jobId = 'your-job-id';
|
|
131
|
+
try {
|
|
132
|
+
const paths = await client.getScrapeResultPaths(jobId);
|
|
133
|
+
if (paths === null) {
|
|
134
|
+
console.log('Job not found');
|
|
135
|
+
} else if (paths.size === 0) {
|
|
136
|
+
console.log('No successful paths found for this job');
|
|
137
|
+
} else {
|
|
138
|
+
console.log(`Found ${paths.size} successful paths for job ${jobId}`);
|
|
139
|
+
for (const [url, path] of paths) {
|
|
140
|
+
console.log(`URL: ${url} -> Path: ${path}`);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
} catch (error) {
|
|
144
|
+
console.error('Failed to get successful paths:', error.message);
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
125
148
|
### Finding Jobs by Date Range
|
|
126
149
|
|
|
127
150
|
```js
|
|
@@ -192,6 +215,17 @@ When you retrieve job results, each URL result has this structure:
|
|
|
192
215
|
}
|
|
193
216
|
```
|
|
194
217
|
|
|
218
|
+
## Path Results Format
|
|
219
|
+
|
|
220
|
+
When you retrieve successful scrape paths using `getScrapeResultPaths()`, the response is a JavaScript Map object that maps URLs to their corresponding result file paths. Only URLs with `COMPLETE` status are included:
|
|
221
|
+
|
|
222
|
+
```js
|
|
223
|
+
Map(2) {
|
|
224
|
+
'https://example.com/page1' => 'path/to/result1',
|
|
225
|
+
'https://example.com/page2' => 'path/to/result2'
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
195
229
|
## Configuration
|
|
196
230
|
|
|
197
231
|
The client uses the `SCRAPE_JOB_CONFIGURATION` environment variable for default settings:
|
|
@@ -248,3 +282,37 @@ npm run clean
|
|
|
248
282
|
- **Repository**: [GitHub](https://github.com/adobe/spacecat-shared.git)
|
|
249
283
|
- **Issue Tracking**: [GitHub Issues](https://github.com/adobe/spacecat-shared/issues)
|
|
250
284
|
- **License**: Apache-2.0
|
|
285
|
+
|
|
286
|
+
### ScrapeClient Workflow Overview
|
|
287
|
+
|
|
288
|
+
<img width="889" height="508" alt="Screenshot 2025-08-27 at 08 56 16" src="https://github.com/user-attachments/assets/9ccc1388-ed6b-4bf0-a059-d40e6e90aff8" />
|
|
289
|
+
|
|
290
|
+
When a new scrape job is created, the client performs the following steps:
|
|
291
|
+
1. Creates a new job entry in the database with status `PENDING`.
|
|
292
|
+
2. Splits the provided URLs into batches based on the `maxUrlsPerMessage` configuration (this is limited due to SQS message size constraints).
|
|
293
|
+
3. For each batch, it creates a message in the SQS queue to the scrape-job-manager.
|
|
294
|
+
|
|
295
|
+
In the scrape-job-manager the following steps are performed:
|
|
296
|
+
1. All existing ScrapeURLs are fetched for the base URL to avoid re-scraping recently scraped URLs (based on the `maxScrapeAge` parameter).
|
|
297
|
+
2. For all URLs a new ScrapeURL entry is created with status `PENDING`.
|
|
298
|
+
3. Each URL in the batch is checked against existing ScrapeURLs.
|
|
299
|
+
- Already scraped URLs (with status 'COMPLETE' or 'PENDING') are marked to be skipped with the ID of the existing ScrapeURL and the isOriginal flag set to false.
|
|
300
|
+
- URLs that need to be scraped are marked with the isOriginal flag set to true. (The isOriginal flag is used to avoid the sliding window problem when re-scraping URLs.)
|
|
301
|
+
- All URLs are numbered based on their position in the original list to be able to track the job progress.
|
|
302
|
+
4. For each URL, a message is created in the SQS queue to the content-scraper.
|
|
303
|
+
|
|
304
|
+
In the content-scraper the following steps are performed:
|
|
305
|
+
1. The content-scraper checks if an incoming URL message is marked to be skipped. If so, it just sends a message to the content-processor.
|
|
306
|
+
2. If the URL is not marked to be skipped, the content-scraper scrapes the URL.
|
|
307
|
+
3. The content-scraper creates a message in the SQS queue to the content-processor with the result of the scraping operation.
|
|
308
|
+
|
|
309
|
+
In the content-processor the following steps are performed:
|
|
310
|
+
1. The content-processor processes the incoming message from the content-scraper.
|
|
311
|
+
2. If the URL was skipped, it fetches the existing ScrapeURL entry and updates the new ScrapeURL entry with the same path and status.
|
|
312
|
+
3. If the URL was scraped, it updates the ScrapeURL entry with the result of the scraping operation (status, path, reason).
|
|
313
|
+
4. The content-processor updates the ScrapeJob entry with the new counts (success, failed, redirect).
|
|
314
|
+
5. If all URLs of a job are processed (based on their number and the totalUrlCount of the job), it:
|
|
315
|
+
- performs a cleanup step to set all PENDING URLs to FAILED that were not processed (e.g. due to timeouts).
|
|
316
|
+
- updates the counts of the job again.
|
|
317
|
+
- sets the job status to COMPLETE and sets the endedAt timestamp.
|
|
318
|
+
- Optionally, it can send an SQS message (e.g. to trigger the next audit step).
|
package/package.json
CHANGED
|
@@ -11,8 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
import {
|
|
14
|
-
isIsoDate, isObject, isValidUrl,
|
|
15
|
-
isValidUUID,
|
|
14
|
+
hasText, isIsoDate, isNonEmptyArray, isObject, isValidUrl, isValidUUID,
|
|
16
15
|
} from '@adobe/spacecat-shared-utils';
|
|
17
16
|
import { ScrapeJob as ScrapeJobModel } from '@adobe/spacecat-shared-data-access';
|
|
18
17
|
import { ScrapeJobDto } from './scrapeJobDto.js';
|
|
@@ -35,6 +34,59 @@ export default class ScrapeClient {
|
|
|
35
34
|
}
|
|
36
35
|
}
|
|
37
36
|
|
|
37
|
+
static validateScrapeConfiguration(scrapeJobConfiguration) {
|
|
38
|
+
if (!isObject(scrapeJobConfiguration)) {
|
|
39
|
+
throw new Error('Invalid scrape configuration: configuration must be an object');
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// Validate scrapeWorkerQueue
|
|
43
|
+
if (!hasText(scrapeJobConfiguration.scrapeWorkerQueue)) {
|
|
44
|
+
throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a non-empty string');
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if (!isValidUrl(scrapeJobConfiguration.scrapeWorkerQueue)) {
|
|
48
|
+
throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a valid URL');
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// Validate s3Bucket
|
|
52
|
+
if (!hasText(scrapeJobConfiguration.s3Bucket)) {
|
|
53
|
+
throw new Error('Invalid scrape configuration: s3Bucket must be a non-empty string');
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// Validate options
|
|
57
|
+
if (scrapeJobConfiguration.options !== undefined) {
|
|
58
|
+
if (!isObject(scrapeJobConfiguration.options)) {
|
|
59
|
+
throw new Error('Invalid scrape configuration: options must be an object');
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const { options } = scrapeJobConfiguration;
|
|
63
|
+
|
|
64
|
+
if (options.enableJavascript !== undefined && typeof options.enableJavascript !== 'boolean') {
|
|
65
|
+
throw new Error('Invalid scrape configuration: options.enableJavascript must be a boolean');
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
if (options.hideConsentBanners !== undefined && typeof options.hideConsentBanners !== 'boolean') {
|
|
69
|
+
throw new Error('Invalid scrape configuration: options.hideConsentBanners must be a boolean');
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// Validate maxUrlsPerJob
|
|
74
|
+
if (scrapeJobConfiguration.maxUrlsPerJob !== undefined) {
|
|
75
|
+
if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerJob)
|
|
76
|
+
|| scrapeJobConfiguration.maxUrlsPerJob <= 0) {
|
|
77
|
+
throw new Error('Invalid scrape configuration: maxUrlsPerJob must be a positive integer');
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// Validate maxUrlsPerMessage
|
|
82
|
+
if (scrapeJobConfiguration.maxUrlsPerMessage !== undefined) {
|
|
83
|
+
if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerMessage)
|
|
84
|
+
|| scrapeJobConfiguration.maxUrlsPerMessage <= 0) {
|
|
85
|
+
throw new Error('Invalid scrape configuration: maxUrlsPerMessage must be a positive integer');
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
38
90
|
validateRequestData(data) {
|
|
39
91
|
if (!isObject(data)) {
|
|
40
92
|
throw new Error('Invalid request: missing application/json request data');
|
|
@@ -104,8 +156,10 @@ export default class ScrapeClient {
|
|
|
104
156
|
let scrapeConfiguration = {};
|
|
105
157
|
try {
|
|
106
158
|
scrapeConfiguration = JSON.parse(this.config.env.SCRAPE_JOB_CONFIGURATION);
|
|
159
|
+
ScrapeClient.validateScrapeConfiguration(scrapeConfiguration);
|
|
107
160
|
} catch (error) {
|
|
108
|
-
this.config.log.error(`Failed to parse scrape job configuration: ${error.message}`);
|
|
161
|
+
this.config.log.error(`Failed to parse or validate scrape job configuration: ${error.message}`);
|
|
162
|
+
throw new Error(`Invalid scrape job configuration: ${error.message}`);
|
|
109
163
|
}
|
|
110
164
|
this.scrapeConfiguration = scrapeConfiguration;
|
|
111
165
|
|
|
@@ -132,6 +186,7 @@ export default class ScrapeClient {
|
|
|
132
186
|
customHeaders,
|
|
133
187
|
processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
|
|
134
188
|
maxScrapeAge = 24,
|
|
189
|
+
auditData = {},
|
|
135
190
|
} = data;
|
|
136
191
|
|
|
137
192
|
this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
|
|
@@ -149,6 +204,7 @@ export default class ScrapeClient {
|
|
|
149
204
|
mergedOptions,
|
|
150
205
|
customHeaders,
|
|
151
206
|
maxScrapeAge,
|
|
207
|
+
auditData,
|
|
152
208
|
);
|
|
153
209
|
return ScrapeJobDto.toJSON(job);
|
|
154
210
|
} catch (error) {
|
|
@@ -228,6 +284,29 @@ export default class ScrapeClient {
|
|
|
228
284
|
}
|
|
229
285
|
}
|
|
230
286
|
|
|
287
|
+
/**
|
|
288
|
+
* Get the result paths of a scrape job
|
|
289
|
+
* @param {string} jobId - The ID of the job to fetch.
|
|
290
|
+
* @return {Promise<Map<string, string>>} A map of URLs to their corresponding result paths.
|
|
291
|
+
*/
|
|
292
|
+
async getScrapeResultPaths(jobId) {
|
|
293
|
+
try {
|
|
294
|
+
const job = await this.scrapeSupervisor.getScrapeJob(jobId);
|
|
295
|
+
if (!job) {
|
|
296
|
+
return null;
|
|
297
|
+
}
|
|
298
|
+
const { ScrapeUrl } = this.config.dataAccess;
|
|
299
|
+
const scrapeUrls = await ScrapeUrl.allByScrapeJobId(job.getId());
|
|
300
|
+
return scrapeUrls
|
|
301
|
+
.filter((url) => url.getStatus() === ScrapeJobModel.ScrapeUrlStatus.COMPLETE)
|
|
302
|
+
.reduce((map, url) => map.set(url.getUrl(), url.getPath()), new Map());
|
|
303
|
+
} catch (error) {
|
|
304
|
+
const msgError = `Failed to fetch the scrape job result: ${error.message}`;
|
|
305
|
+
this.config.log.error(msgError);
|
|
306
|
+
throw new Error(msgError);
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
231
310
|
/**
|
|
232
311
|
* Get all scrape jobs by baseURL and processing type
|
|
233
312
|
* @param {string} baseURL - The baseURL of the jobs to fetch.
|
|
@@ -122,10 +122,12 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
122
122
|
* @param {object} scrapeJob - The scrape job record.
|
|
123
123
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
124
124
|
* @param {string} maxScrapeAge - The maximum age of the scrape job
|
|
125
|
+
* @param {object} auditData - Step-Audit specific data
|
|
125
126
|
*/
|
|
126
|
-
|
|
127
|
+
// eslint-disable-next-line max-len
|
|
128
|
+
async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
|
|
127
129
|
log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
|
|
128
|
-
+
|
|
130
|
+
+ ' URLs.'
|
|
129
131
|
+ `(jobId: ${scrapeJob.getId()})`);
|
|
130
132
|
|
|
131
133
|
const options = scrapeJob.getOptions();
|
|
@@ -155,6 +157,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
155
157
|
customHeaders,
|
|
156
158
|
options,
|
|
157
159
|
maxScrapeAge,
|
|
160
|
+
auditData,
|
|
158
161
|
};
|
|
159
162
|
|
|
160
163
|
// eslint-disable-next-line no-await-in-loop
|
|
@@ -168,7 +171,8 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
168
171
|
* @param {string} processingType - The type of processing to perform.
|
|
169
172
|
* @param {object} options - Optional configuration params for the scrape job.
|
|
170
173
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
171
|
-
* @param {
|
|
174
|
+
* @param {number} maxScrapeAge - The maximum age of the scrape job
|
|
175
|
+
* @param auditContext
|
|
172
176
|
* @returns {Promise<ScrapeJob>} newly created job object
|
|
173
177
|
*/
|
|
174
178
|
async function startNewJob(
|
|
@@ -177,6 +181,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
177
181
|
options,
|
|
178
182
|
customHeaders,
|
|
179
183
|
maxScrapeAge,
|
|
184
|
+
auditContext,
|
|
180
185
|
) {
|
|
181
186
|
const newScrapeJob = await createNewScrapeJob(
|
|
182
187
|
urls,
|
|
@@ -196,7 +201,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
196
201
|
|
|
197
202
|
// Queue all URLs for scrape as a single message. This enables the controller to respond with
|
|
198
203
|
// a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
|
|
199
|
-
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge);
|
|
204
|
+
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge, auditContext);
|
|
200
205
|
|
|
201
206
|
return newScrapeJob;
|
|
202
207
|
}
|