@adobe/spacecat-shared-scrape-client 1.0.7 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +1 -0
- package/package.json +1 -1
- package/src/clients/scrape-client.js +8 -1
- package/src/clients/scrape-job-supervisor.js +7 -42
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* re-scraping of URLs ([b889a19](https://github.com/adobe/spacecat-shared/commit/b889a19b1cec20b1f1dc32a89b34ab5125fa90e6))
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
### BREAKING CHANGES
|
|
10
|
+
|
|
11
|
+
* ScrapeClient does not choose a scrape queue anymore. This is done in Scrape Job Manager.
|
|
12
|
+
|
|
1
13
|
# [@adobe/spacecat-shared-scrape-client-v1.0.7](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.6...@adobe/spacecat-shared-scrape-client-v1.0.7) (2025-08-09)
|
|
2
14
|
|
|
3
15
|
|
package/README.md
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Spacecat Shared - Scrape Client
|
|
2
2
|
|
|
3
|
+
|
|
3
4
|
A JavaScript client for managing web scraping jobs, part of the SpaceCat Shared library. The ScrapeClient provides a comprehensive interface for creating, monitoring, and retrieving results from web scraping operations without needing to access the SpaceCat API service directly.
|
|
4
5
|
|
|
5
6
|
## Installation
|
package/package.json
CHANGED
|
package/src/clients/scrape-client.js
CHANGED
|
@@ -118,6 +118,8 @@ export default class ScrapeClient {
|
|
|
118
118
|
/**
|
|
119
119
|
* Create and start a new scrape job.
|
|
120
120
|
* @param {object} data - json data for scrape job
|
|
121
|
+
* @param {number} data.maxScrapeAge - (optional) max age of scrapes in hours
|
|
122
|
+
* default is 24, 0 to force rescrape
|
|
121
123
|
* @returns {Promise<Response>} newly created job object
|
|
122
124
|
*/
|
|
123
125
|
async createScrapeJob(data) {
|
|
@@ -125,7 +127,11 @@ export default class ScrapeClient {
|
|
|
125
127
|
this.validateRequestData(data);
|
|
126
128
|
|
|
127
129
|
const {
|
|
128
|
-
urls,
|
|
130
|
+
urls,
|
|
131
|
+
options,
|
|
132
|
+
customHeaders,
|
|
133
|
+
processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
|
|
134
|
+
maxScrapeAge = 24,
|
|
129
135
|
} = data;
|
|
130
136
|
|
|
131
137
|
this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
|
|
@@ -142,6 +148,7 @@ export default class ScrapeClient {
|
|
|
142
148
|
processingType,
|
|
143
149
|
mergedOptions,
|
|
144
150
|
customHeaders,
|
|
151
|
+
maxScrapeAge,
|
|
145
152
|
);
|
|
146
153
|
return ScrapeJobDto.toJSON(job);
|
|
147
154
|
} catch (error) {
|
|
package/src/clients/scrape-job-supervisor.js
CHANGED
|
@@ -33,37 +33,10 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
33
33
|
const { ScrapeJob } = dataAccess;
|
|
34
34
|
|
|
35
35
|
const {
|
|
36
|
-
queues = [], // Array of scrape queues
|
|
37
36
|
scrapeWorkerQueue, // URL of the scrape worker queue
|
|
38
37
|
maxUrlsPerMessage,
|
|
39
38
|
} = config;
|
|
40
39
|
|
|
41
|
-
/**
|
|
42
|
-
* Get the queue with the least number of messages.
|
|
43
|
-
*/
|
|
44
|
-
async function getAvailableScrapeQueue() {
|
|
45
|
-
const countMessages = async (queue) => {
|
|
46
|
-
const count = await sqs.getQueueMessageCount(queue);
|
|
47
|
-
return { queue, count };
|
|
48
|
-
};
|
|
49
|
-
|
|
50
|
-
const arrProm = queues.map(
|
|
51
|
-
(queue) => countMessages(queue),
|
|
52
|
-
);
|
|
53
|
-
const queueMessageCounts = await Promise.all(arrProm);
|
|
54
|
-
|
|
55
|
-
if (queueMessageCounts.length === 0) {
|
|
56
|
-
return null;
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
// get the queue with the lowest number of messages
|
|
60
|
-
const queueWithLeastMessages = queueMessageCounts.reduce(
|
|
61
|
-
(min, current) => (min.count < current.count ? min : current),
|
|
62
|
-
);
|
|
63
|
-
log.info(`Queue with least messages: ${queueWithLeastMessages.queue}`);
|
|
64
|
-
return queueWithLeastMessages.queue;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
40
|
function determineBaseURL(urls) {
|
|
68
41
|
// Initially, we will just use the domain of the first URL
|
|
69
42
|
const url = new URL(urls[0]);
|
|
@@ -74,7 +47,6 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
74
47
|
* Create a new scrape job by persisting the scrape job
|
|
75
48
|
* metadata, and setting the job status to 'RUNNING'.
|
|
76
49
|
* @param {Array<string>} urls - The list of URLs to scrape.
|
|
77
|
-
* @param {string} scrapeQueueId - Name of the queue to use for this scrape job.
|
|
78
50
|
* @param {string} processingType - The scrape handler to be used for the scrape job.
|
|
79
51
|
* @param {object} options - Client provided options for the scrape job.
|
|
80
52
|
* @param {object} customHeaders - Custom headers to be sent with each request.
|
|
@@ -82,14 +54,12 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
82
54
|
*/
|
|
83
55
|
async function createNewScrapeJob(
|
|
84
56
|
urls,
|
|
85
|
-
scrapeQueueId,
|
|
86
57
|
processingType,
|
|
87
58
|
options,
|
|
88
59
|
customHeaders = null,
|
|
89
60
|
) {
|
|
90
61
|
const jobData = {
|
|
91
62
|
baseURL: determineBaseURL(urls),
|
|
92
|
-
scrapeQueueId,
|
|
93
63
|
processingType,
|
|
94
64
|
options,
|
|
95
65
|
urlCount: urls.length,
|
|
@@ -151,8 +121,9 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
151
121
|
* @param {Array<string>} urls - Array of URL records to queue.
|
|
152
122
|
* @param {object} scrapeJob - The scrape job record.
|
|
153
123
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
124
|
+
* @param {number} maxScrapeAge - The maximum age of existing scrapes in hours (0 forces a re-scrape)
|
|
154
125
|
*/
|
|
155
|
-
async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders) {
|
|
126
|
+
async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge) {
|
|
156
127
|
log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
|
|
157
128
|
+ ` URLs. This new job has claimed: ${scrapeJob.getScrapeQueueId()} `
|
|
158
129
|
+ `(jobId: ${scrapeJob.getId()})`);
|
|
@@ -183,6 +154,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
183
154
|
batchOffset: offset,
|
|
184
155
|
customHeaders,
|
|
185
156
|
options,
|
|
157
|
+
maxScrapeAge,
|
|
186
158
|
};
|
|
187
159
|
|
|
188
160
|
// eslint-disable-next-line no-await-in-loop
|
|
@@ -193,8 +165,10 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
193
165
|
/**
|
|
194
166
|
* Starts a new scrape job.
|
|
195
167
|
* @param {Array<string>} urls - The URLs to scrape.
|
|
168
|
+
* @param {string} processingType - The type of processing to perform.
|
|
196
169
|
* @param {object} options - Optional configuration params for the scrape job.
|
|
197
170
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
171
|
+
* @param {number} maxScrapeAge - The maximum age of existing scrapes in hours (0 forces a re-scrape)
|
|
198
172
|
* @returns {Promise<ScrapeJob>} newly created job object
|
|
199
173
|
*/
|
|
200
174
|
async function startNewJob(
|
|
@@ -202,18 +176,10 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
202
176
|
processingType,
|
|
203
177
|
options,
|
|
204
178
|
customHeaders,
|
|
179
|
+
maxScrapeAge,
|
|
205
180
|
) {
|
|
206
|
-
// Determine if there is a free scrape queue
|
|
207
|
-
const scrapeQueueId = await getAvailableScrapeQueue();
|
|
208
|
-
|
|
209
|
-
if (scrapeQueueId === null) {
|
|
210
|
-
throw new Error('Service Unavailable: No scrape queue available');
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
// If a queue is available, create the scrape-job record in dataAccess:
|
|
214
181
|
const newScrapeJob = await createNewScrapeJob(
|
|
215
182
|
urls,
|
|
216
|
-
scrapeQueueId,
|
|
217
183
|
processingType,
|
|
218
184
|
options,
|
|
219
185
|
customHeaders,
|
|
@@ -224,14 +190,13 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
224
190
|
+ `- baseUrl: ${newScrapeJob.getBaseURL()}\n`
|
|
225
191
|
+ `- urlCount: ${urls.length}\n`
|
|
226
192
|
+ `- jobId: ${newScrapeJob.getId()}\n`
|
|
227
|
-
+ `- scrapeQueueId: ${scrapeQueueId}\n`
|
|
228
193
|
+ `- customHeaders: ${JSON.stringify(customHeaders)}\n`
|
|
229
194
|
+ `- options: ${JSON.stringify(options)}`,
|
|
230
195
|
);
|
|
231
196
|
|
|
232
197
|
// Queue all URLs for scrape as a single message. This enables the controller to respond with
|
|
233
198
|
// a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
|
|
234
|
-
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders);
|
|
199
|
+
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge);
|
|
235
200
|
|
|
236
201
|
return newScrapeJob;
|
|
237
202
|
}
|