@adobe/spacecat-shared-scrape-client 2.1.0 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +69 -1
- package/package.json +5 -5
- package/src/clients/scrape-client.js +61 -1
- package/src/clients/scrape-job-supervisor.js +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-scrape-client-v2.1.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.1.1...@adobe/spacecat-shared-scrape-client-v2.1.2) (2025-09-06)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* **deps:** update external fixes ([#920](https://github.com/adobe/spacecat-shared/issues/920)) ([1a6b1e1](https://github.com/adobe/spacecat-shared/commit/1a6b1e1ac9531a41c86406ada4bd4ab903307fdc))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-scrape-client-v2.1.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.1.0...@adobe/spacecat-shared-scrape-client-v2.1.1) (2025-08-28)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* enhance validation for scrape job configuration ([#940](https://github.com/adobe/spacecat-shared/issues/940)) ([54d0a6a](https://github.com/adobe/spacecat-shared/commit/54d0a6aa322547e13da25f2f97e1542fd5688849))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
|
|
2
16
|
|
|
3
17
|
|
package/README.md
CHANGED
|
@@ -67,7 +67,9 @@ const jobData = {
|
|
|
67
67
|
'Authorization': 'Bearer token',
|
|
68
68
|
'X-Custom-Header': 'value'
|
|
69
69
|
},
|
|
70
|
-
processingType: 'default' // Optional, defaults to 'DEFAULT'
|
|
70
|
+
processingType: 'default', // Optional, defaults to 'DEFAULT'
|
|
71
|
+
maxScrapeAge: 6, // Optional, used to avoid re-scraping recently scraped URLs (hours); 0 means always scrape
|
|
72
|
+
auditData: {} // Optional, this is used for step audits
|
|
71
73
|
};
|
|
72
74
|
|
|
73
75
|
try {
|
|
@@ -122,6 +124,27 @@ try {
|
|
|
122
124
|
}
|
|
123
125
|
```
|
|
124
126
|
|
|
127
|
+
### Getting Successful Scrape Paths
|
|
128
|
+
|
|
129
|
+
```js
|
|
130
|
+
const jobId = 'your-job-id';
|
|
131
|
+
try {
|
|
132
|
+
const paths = await client.getScrapeResultPaths(jobId);
|
|
133
|
+
if (paths === null) {
|
|
134
|
+
console.log('Job not found');
|
|
135
|
+
} else if (paths.size === 0) {
|
|
136
|
+
console.log('No successful paths found for this job');
|
|
137
|
+
} else {
|
|
138
|
+
console.log(`Found ${paths.size} successful paths for job ${jobId}`);
|
|
139
|
+
for (const [url, path] of paths) {
|
|
140
|
+
console.log(`URL: ${url} -> Path: ${path}`);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
} catch (error) {
|
|
144
|
+
console.error('Failed to get successful paths:', error.message);
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
125
148
|
### Finding Jobs by Date Range
|
|
126
149
|
|
|
127
150
|
```js
|
|
@@ -192,6 +215,17 @@ When you retrieve job results, each URL result has this structure:
|
|
|
192
215
|
}
|
|
193
216
|
```
|
|
194
217
|
|
|
218
|
+
## Path Results Format
|
|
219
|
+
|
|
220
|
+
When you retrieve successful scrape paths using `getScrapeResultPaths()`, the response is a JavaScript Map object that maps URLs to their corresponding result file paths. Only URLs with `COMPLETE` status are included:
|
|
221
|
+
|
|
222
|
+
```js
|
|
223
|
+
Map(2) {
|
|
224
|
+
'https://example.com/page1' => 'path/to/result1',
|
|
225
|
+
'https://example.com/page2' => 'path/to/result2'
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
195
229
|
## Configuration
|
|
196
230
|
|
|
197
231
|
The client uses the `SCRAPE_JOB_CONFIGURATION` environment variable for default settings:
|
|
@@ -248,3 +282,37 @@ npm run clean
|
|
|
248
282
|
- **Repository**: [GitHub](https://github.com/adobe/spacecat-shared.git)
|
|
249
283
|
- **Issue Tracking**: [GitHub Issues](https://github.com/adobe/spacecat-shared/issues)
|
|
250
284
|
- **License**: Apache-2.0
|
|
285
|
+
|
|
286
|
+
### ScrapeClient Workflow Overview
|
|
287
|
+
|
|
288
|
+
<img width="889" height="508" alt="Screenshot 2025-08-27 at 08 56 16" src="https://github.com/user-attachments/assets/9ccc1388-ed6b-4bf0-a059-d40e6e90aff8" />
|
|
289
|
+
|
|
290
|
+
When a new scrape job is created, the client performs the following steps:
|
|
291
|
+
1. Creates a new job entry in the database with status `PENDING`.
|
|
292
|
+
2. Splits the provided URLs into batches based on the `maxUrlsPerMessage` configuration (this is limited due to SQS message size constraints).
|
|
293
|
+
3. For each batch, it creates a message in the SQS queue to the scrape-job-manager.
|
|
294
|
+
|
|
295
|
+
In the scrape-job-manager the following steps are performed:
|
|
296
|
+
1. All existing ScrapeURLs are fetched for the base URL to avoid re-scraping recently scraped URLs (based on the `maxScrapeAge` parameter).
|
|
297
|
+
2. For all URLs a new ScrapeURL entry is created with status `PENDING`.
|
|
298
|
+
3. Each URL in the batch is checked against existing ScrapeURLs.
|
|
299
|
+
- Already scraped URLs (with status 'COMPLETE' or 'PENDING') are marked to be skipped with the ID of the existing ScrapeURL and the isOriginal flag set to false.
|
|
300
|
+
- URLs that need to be scraped are marked with the isOriginal flag set to true. (The isOriginal flag is used to avoid the sliding window problem when re-scraping URLs.)
|
|
301
|
+
- All URLs are numbered based on their position in the original list to be able to track the job progress.
|
|
302
|
+
4. For each URL, a message is created in the SQS queue to the content-scraper.
|
|
303
|
+
|
|
304
|
+
In the content-scraper the following steps are performed:
|
|
305
|
+
1. The content-scraper checks if an incoming URL message is marked to be skipped. If so, it just sends a message to the content-processor.
|
|
306
|
+
2. If the URL is not marked to be skipped, the content-scraper scrapes the URL.
|
|
307
|
+
3. The content-scraper creates a message in the SQS queue to the content-processor with the result of the scraping operation.
|
|
308
|
+
|
|
309
|
+
In the content-processor the following steps are performed:
|
|
310
|
+
1. The content-processor processes the incoming message from the content-scraper.
|
|
311
|
+
2. If the URL was skipped, it fetches the existing ScrapeURL entry and updates the new ScrapeURL entry with the same path and status.
|
|
312
|
+
3. If the URL was scraped, it updates the ScrapeURL entry with the result of the scraping operation (status, path, reason).
|
|
313
|
+
4. The content-processor updates the ScrapeJob entry with the new counts (success, failed, redirect).
|
|
314
|
+
5. If all URLs of a job are processed (based on their number and the totalUrlCount of the job), it:
|
|
315
|
+
- performs a cleanup step to set all PENDING URLs to FAILED that were not processed (e.g. due to timeouts).
|
|
316
|
+
- updates the counts of the job again.
|
|
317
|
+
- sets the job status to COMPLETE and sets the endedAt timestamp.
|
|
318
|
+
- Optionally, it can send a SQS message (e.g. to trigger the next audit step).
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@adobe/spacecat-shared-scrape-client",
|
|
3
|
-
"version": "2.1.0",
|
|
3
|
+
"version": "2.1.2",
|
|
4
4
|
"description": "Shared modules of the Spacecat Services - Scrape Client",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -40,11 +40,11 @@
|
|
|
40
40
|
"@adobe/spacecat-shared-utils": "1.31.0"
|
|
41
41
|
},
|
|
42
42
|
"devDependencies": {
|
|
43
|
-
"chai": "5.
|
|
44
|
-
"chai-as-promised": "8.0.
|
|
45
|
-
"nock": "14.0.
|
|
43
|
+
"chai": "5.3.3",
|
|
44
|
+
"chai-as-promised": "8.0.2",
|
|
45
|
+
"nock": "14.0.10",
|
|
46
46
|
"sinon": "20.0.0",
|
|
47
|
-
"sinon-chai": "4.0.
|
|
47
|
+
"sinon-chai": "4.0.1",
|
|
48
48
|
"typescript": "5.9.2"
|
|
49
49
|
}
|
|
50
50
|
}
|
|
@@ -34,6 +34,59 @@ export default class ScrapeClient {
|
|
|
34
34
|
}
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
+
static validateScrapeConfiguration(scrapeJobConfiguration) {
|
|
38
|
+
if (!isObject(scrapeJobConfiguration)) {
|
|
39
|
+
throw new Error('Invalid scrape configuration: configuration must be an object');
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// Validate scrapeWorkerQueue
|
|
43
|
+
if (!hasText(scrapeJobConfiguration.scrapeWorkerQueue)) {
|
|
44
|
+
throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a non-empty string');
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if (!isValidUrl(scrapeJobConfiguration.scrapeWorkerQueue)) {
|
|
48
|
+
throw new Error('Invalid scrape configuration: scrapeWorkerQueue must be a valid URL');
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// Validate s3Bucket
|
|
52
|
+
if (!hasText(scrapeJobConfiguration.s3Bucket)) {
|
|
53
|
+
throw new Error('Invalid scrape configuration: s3Bucket must be a non-empty string');
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// Validate options
|
|
57
|
+
if (scrapeJobConfiguration.options !== undefined) {
|
|
58
|
+
if (!isObject(scrapeJobConfiguration.options)) {
|
|
59
|
+
throw new Error('Invalid scrape configuration: options must be an object');
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const { options } = scrapeJobConfiguration;
|
|
63
|
+
|
|
64
|
+
if (options.enableJavascript !== undefined && typeof options.enableJavascript !== 'boolean') {
|
|
65
|
+
throw new Error('Invalid scrape configuration: options.enableJavascript must be a boolean');
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
if (options.hideConsentBanners !== undefined && typeof options.hideConsentBanners !== 'boolean') {
|
|
69
|
+
throw new Error('Invalid scrape configuration: options.hideConsentBanners must be a boolean');
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// Validate maxUrlsPerJob
|
|
74
|
+
if (scrapeJobConfiguration.maxUrlsPerJob !== undefined) {
|
|
75
|
+
if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerJob)
|
|
76
|
+
|| scrapeJobConfiguration.maxUrlsPerJob <= 0) {
|
|
77
|
+
throw new Error('Invalid scrape configuration: maxUrlsPerJob must be a positive integer');
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// Validate maxUrlsPerMessage
|
|
82
|
+
if (scrapeJobConfiguration.maxUrlsPerMessage !== undefined) {
|
|
83
|
+
if (!Number.isInteger(scrapeJobConfiguration.maxUrlsPerMessage)
|
|
84
|
+
|| scrapeJobConfiguration.maxUrlsPerMessage <= 0) {
|
|
85
|
+
throw new Error('Invalid scrape configuration: maxUrlsPerMessage must be a positive integer');
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
37
90
|
validateRequestData(data) {
|
|
38
91
|
if (!isObject(data)) {
|
|
39
92
|
throw new Error('Invalid request: missing application/json request data');
|
|
@@ -103,8 +156,10 @@ export default class ScrapeClient {
|
|
|
103
156
|
let scrapeConfiguration = {};
|
|
104
157
|
try {
|
|
105
158
|
scrapeConfiguration = JSON.parse(this.config.env.SCRAPE_JOB_CONFIGURATION);
|
|
159
|
+
ScrapeClient.validateScrapeConfiguration(scrapeConfiguration);
|
|
106
160
|
} catch (error) {
|
|
107
|
-
this.config.log.error(`Failed to parse scrape job configuration: ${error.message}`);
|
|
161
|
+
this.config.log.error(`Failed to parse or validate scrape job configuration: ${error.message}`);
|
|
162
|
+
throw new Error(`Invalid scrape job configuration: ${error.message}`);
|
|
108
163
|
}
|
|
109
164
|
this.scrapeConfiguration = scrapeConfiguration;
|
|
110
165
|
|
|
@@ -229,6 +284,11 @@ export default class ScrapeClient {
|
|
|
229
284
|
}
|
|
230
285
|
}
|
|
231
286
|
|
|
287
|
+
/**
|
|
288
|
+
* Get the result paths of a scrape job
|
|
289
|
+
* @param {string} jobId - The ID of the job to fetch.
|
|
290
|
+
* @return {Promise<Map<string, string>>} A map of URLs to their corresponding result paths.
|
|
291
|
+
*/
|
|
232
292
|
async getScrapeResultPaths(jobId) {
|
|
233
293
|
try {
|
|
234
294
|
const job = await this.scrapeSupervisor.getScrapeJob(jobId);
|
|
@@ -122,7 +122,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
122
122
|
* @param {object} scrapeJob - The scrape job record.
|
|
123
123
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
124
124
|
* @param {string} maxScrapeAge - The maximum age of the scrape job
|
|
125
|
-
* @param
|
|
125
|
+
* @param {object} auditData - Step-Audit specific data
|
|
126
126
|
*/
|
|
127
127
|
// eslint-disable-next-line max-len
|
|
128
128
|
async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
|