@adobe/spacecat-shared-scrape-client 1.0.7 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/README.md +1 -0
- package/package.json +1 -1
- package/src/clients/scrape-client.js +29 -3
- package/src/clients/scrape-job-supervisor.js +13 -43
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,22 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* add scrape-client destination ([#913](https://github.com/adobe/spacecat-shared/issues/913)) ([e208a87](https://github.com/adobe/spacecat-shared/commit/e208a87214874a2708ac2d7614fcfd4c0770fe17))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* re-scraping of URLs ([b889a19](https://github.com/adobe/spacecat-shared/commit/b889a19b1cec20b1f1dc32a89b34ab5125fa90e6))
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
### BREAKING CHANGES
|
|
17
|
+
|
|
18
|
+
* ScrapeClient does not choose a scrape queue anymore. This is done in Scrape Job Manager.
|
|
19
|
+
|
|
1
20
|
# [@adobe/spacecat-shared-scrape-client-v1.0.7](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.6...@adobe/spacecat-shared-scrape-client-v1.0.7) (2025-08-09)
|
|
2
21
|
|
|
3
22
|
|
package/README.md
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Spacecat Shared - Scrape Client
|
|
2
2
|
|
|
3
|
+
|
|
3
4
|
A JavaScript client for managing web scraping jobs, part of the SpaceCat Shared library. The ScrapeClient provides a comprehensive interface for creating, monitoring, and retrieving results from web scraping operations without needing to access the SpaceCat API service directly.
|
|
4
5
|
|
|
5
6
|
## Installation
|
package/package.json
CHANGED
package/src/clients/scrape-client.js
CHANGED
|
@@ -11,8 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
import {
|
|
14
|
-
isIsoDate, isObject, isValidUrl,
|
|
15
|
-
isValidUUID,
|
|
14
|
+
hasText, isIsoDate, isNonEmptyArray, isObject, isValidUrl, isValidUUID,
|
|
16
15
|
} from '@adobe/spacecat-shared-utils';
|
|
17
16
|
import { ScrapeJob as ScrapeJobModel } from '@adobe/spacecat-shared-data-access';
|
|
18
17
|
import { ScrapeJobDto } from './scrapeJobDto.js';
|
|
@@ -118,6 +117,8 @@ export default class ScrapeClient {
|
|
|
118
117
|
/**
|
|
119
118
|
* Create and start a new scrape job.
|
|
120
119
|
* @param {object} data - json data for scrape job
|
|
120
|
+
* @param {number} data.maxScrapeAge - (optional) max age of scrapes in hours
|
|
121
|
+
* default is 24, 0 to force rescrape
|
|
121
122
|
* @returns {Promise<Response>} newly created job object
|
|
122
123
|
*/
|
|
123
124
|
async createScrapeJob(data) {
|
|
@@ -125,7 +126,12 @@ export default class ScrapeClient {
|
|
|
125
126
|
this.validateRequestData(data);
|
|
126
127
|
|
|
127
128
|
const {
|
|
128
|
-
urls,
|
|
129
|
+
urls,
|
|
130
|
+
options,
|
|
131
|
+
customHeaders,
|
|
132
|
+
processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
|
|
133
|
+
maxScrapeAge = 24,
|
|
134
|
+
auditData = {},
|
|
129
135
|
} = data;
|
|
130
136
|
|
|
131
137
|
this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
|
|
@@ -142,6 +148,8 @@ export default class ScrapeClient {
|
|
|
142
148
|
processingType,
|
|
143
149
|
mergedOptions,
|
|
144
150
|
customHeaders,
|
|
151
|
+
maxScrapeAge,
|
|
152
|
+
auditData,
|
|
145
153
|
);
|
|
146
154
|
return ScrapeJobDto.toJSON(job);
|
|
147
155
|
} catch (error) {
|
|
@@ -221,6 +229,24 @@ export default class ScrapeClient {
|
|
|
221
229
|
}
|
|
222
230
|
}
|
|
223
231
|
|
|
232
|
+
async getScrapeResultPaths(jobId) {
|
|
233
|
+
try {
|
|
234
|
+
const job = await this.scrapeSupervisor.getScrapeJob(jobId);
|
|
235
|
+
if (!job) {
|
|
236
|
+
return null;
|
|
237
|
+
}
|
|
238
|
+
const { ScrapeUrl } = this.config.dataAccess;
|
|
239
|
+
const scrapeUrls = await ScrapeUrl.allByScrapeJobId(job.getId());
|
|
240
|
+
return scrapeUrls
|
|
241
|
+
.filter((url) => url.getStatus() === ScrapeJobModel.ScrapeUrlStatus.COMPLETE)
|
|
242
|
+
.reduce((map, url) => map.set(url.getUrl(), url.getPath()), new Map());
|
|
243
|
+
} catch (error) {
|
|
244
|
+
const msgError = `Failed to fetch the scrape job result: ${error.message}`;
|
|
245
|
+
this.config.log.error(msgError);
|
|
246
|
+
throw new Error(msgError);
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
|
|
224
250
|
/**
|
|
225
251
|
* Get all scrape jobs by baseURL and processing type
|
|
226
252
|
* @param {string} baseURL - The baseURL of the jobs to fetch.
|
package/src/clients/scrape-job-supervisor.js
CHANGED
|
@@ -33,37 +33,10 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
33
33
|
const { ScrapeJob } = dataAccess;
|
|
34
34
|
|
|
35
35
|
const {
|
|
36
|
-
queues = [], // Array of scrape queues
|
|
37
36
|
scrapeWorkerQueue, // URL of the scrape worker queue
|
|
38
37
|
maxUrlsPerMessage,
|
|
39
38
|
} = config;
|
|
40
39
|
|
|
41
|
-
/**
|
|
42
|
-
* Get the queue with the least number of messages.
|
|
43
|
-
*/
|
|
44
|
-
async function getAvailableScrapeQueue() {
|
|
45
|
-
const countMessages = async (queue) => {
|
|
46
|
-
const count = await sqs.getQueueMessageCount(queue);
|
|
47
|
-
return { queue, count };
|
|
48
|
-
};
|
|
49
|
-
|
|
50
|
-
const arrProm = queues.map(
|
|
51
|
-
(queue) => countMessages(queue),
|
|
52
|
-
);
|
|
53
|
-
const queueMessageCounts = await Promise.all(arrProm);
|
|
54
|
-
|
|
55
|
-
if (queueMessageCounts.length === 0) {
|
|
56
|
-
return null;
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
// get the queue with the lowest number of messages
|
|
60
|
-
const queueWithLeastMessages = queueMessageCounts.reduce(
|
|
61
|
-
(min, current) => (min.count < current.count ? min : current),
|
|
62
|
-
);
|
|
63
|
-
log.info(`Queue with least messages: ${queueWithLeastMessages.queue}`);
|
|
64
|
-
return queueWithLeastMessages.queue;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
40
|
function determineBaseURL(urls) {
|
|
68
41
|
// Initially, we will just use the domain of the first URL
|
|
69
42
|
const url = new URL(urls[0]);
|
|
@@ -74,7 +47,6 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
74
47
|
* Create a new scrape job by claiming one of the free scrape queues, persisting the scrape job
|
|
75
48
|
* metadata, and setting the job status to 'RUNNING'.
|
|
76
49
|
* @param {Array<string>} urls - The list of URLs to scrape.
|
|
77
|
-
* @param {string} scrapeQueueId - Name of the queue to use for this scrape job.
|
|
78
50
|
* @param {string} processingType - The scrape handler to be used for the scrape job.
|
|
79
51
|
* @param {object} options - Client provided options for the scrape job.
|
|
80
52
|
* @param {object} customHeaders - Custom headers to be sent with each request.
|
|
@@ -82,14 +54,12 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
82
54
|
*/
|
|
83
55
|
async function createNewScrapeJob(
|
|
84
56
|
urls,
|
|
85
|
-
scrapeQueueId,
|
|
86
57
|
processingType,
|
|
87
58
|
options,
|
|
88
59
|
customHeaders = null,
|
|
89
60
|
) {
|
|
90
61
|
const jobData = {
|
|
91
62
|
baseURL: determineBaseURL(urls),
|
|
92
|
-
scrapeQueueId,
|
|
93
63
|
processingType,
|
|
94
64
|
options,
|
|
95
65
|
urlCount: urls.length,
|
|
@@ -151,10 +121,13 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
151
121
|
* @param {Array<string>} urls - Array of URL records to queue.
|
|
152
122
|
* @param {object} scrapeJob - The scrape job record.
|
|
153
123
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
124
|
+
* @param {string} maxScrapeAge - The maximum age of the scrape job
|
|
125
|
+
* @param auditContext
|
|
154
126
|
*/
|
|
155
|
-
|
|
127
|
+
// eslint-disable-next-line max-len
|
|
128
|
+
async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
|
|
156
129
|
log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
|
|
157
|
-
+
|
|
130
|
+
+ ' URLs.'
|
|
158
131
|
+ `(jobId: ${scrapeJob.getId()})`);
|
|
159
132
|
|
|
160
133
|
const options = scrapeJob.getOptions();
|
|
@@ -183,6 +156,8 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
183
156
|
batchOffset: offset,
|
|
184
157
|
customHeaders,
|
|
185
158
|
options,
|
|
159
|
+
maxScrapeAge,
|
|
160
|
+
auditData,
|
|
186
161
|
};
|
|
187
162
|
|
|
188
163
|
// eslint-disable-next-line no-await-in-loop
|
|
@@ -193,8 +168,11 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
193
168
|
/**
|
|
194
169
|
* Starts a new scrape job.
|
|
195
170
|
* @param {Array<string>} urls - The URLs to scrape.
|
|
171
|
+
* @param {string} processingType - The type of processing to perform.
|
|
196
172
|
* @param {object} options - Optional configuration params for the scrape job.
|
|
197
173
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
174
|
+
* @param {number} maxScrapeAge - The maximum age of the scrape job
|
|
175
|
+
* @param auditContext
|
|
198
176
|
* @returns {Promise<ScrapeJob>} newly created job object
|
|
199
177
|
*/
|
|
200
178
|
async function startNewJob(
|
|
@@ -202,18 +180,11 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
202
180
|
processingType,
|
|
203
181
|
options,
|
|
204
182
|
customHeaders,
|
|
183
|
+
maxScrapeAge,
|
|
184
|
+
auditContext,
|
|
205
185
|
) {
|
|
206
|
-
// Determine if there is a free scrape queue
|
|
207
|
-
const scrapeQueueId = await getAvailableScrapeQueue();
|
|
208
|
-
|
|
209
|
-
if (scrapeQueueId === null) {
|
|
210
|
-
throw new Error('Service Unavailable: No scrape queue available');
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
// If a queue is available, create the scrape-job record in dataAccess:
|
|
214
186
|
const newScrapeJob = await createNewScrapeJob(
|
|
215
187
|
urls,
|
|
216
|
-
scrapeQueueId,
|
|
217
188
|
processingType,
|
|
218
189
|
options,
|
|
219
190
|
customHeaders,
|
|
@@ -224,14 +195,13 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
224
195
|
+ `- baseUrl: ${newScrapeJob.getBaseURL()}\n`
|
|
225
196
|
+ `- urlCount: ${urls.length}\n`
|
|
226
197
|
+ `- jobId: ${newScrapeJob.getId()}\n`
|
|
227
|
-
+ `- scrapeQueueId: ${scrapeQueueId}\n`
|
|
228
198
|
+ `- customHeaders: ${JSON.stringify(customHeaders)}\n`
|
|
229
199
|
+ `- options: ${JSON.stringify(options)}`,
|
|
230
200
|
);
|
|
231
201
|
|
|
232
202
|
// Queue all URLs for scrape as a single message. This enables the controller to respond with
|
|
233
203
|
// a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
|
|
234
|
-
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders);
|
|
204
|
+
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge, auditContext);
|
|
235
205
|
|
|
236
206
|
return newScrapeJob;
|
|
237
207
|
}
|