@adobe/spacecat-shared-scrape-client 1.0.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ # [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
2
+
3
+
4
+ ### Features
5
+
6
+ * re-scraping of URLs ([b889a19](https://github.com/adobe/spacecat-shared/commit/b889a19b1cec20b1f1dc32a89b34ab5125fa90e6))
7
+
8
+
9
+ ### BREAKING CHANGES
10
+
11
+ * ScrapeClient does not choose a scrape queue anymore. This is done in Scrape Job Manager.
12
+
1
13
  # [@adobe/spacecat-shared-scrape-client-v1.0.7](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.6...@adobe/spacecat-shared-scrape-client-v1.0.7) (2025-08-09)
2
14
 
3
15
 
package/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Spacecat Shared - Scrape Client
2
2
 
3
+
3
4
  A JavaScript client for managing web scraping jobs, part of the SpaceCat Shared library. The ScrapeClient provides a comprehensive interface for creating, monitoring, and retrieving results from web scraping operations without needing to access the SpaceCat API service directly.
4
5
 
5
6
  ## Installation
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-scrape-client",
3
- "version": "1.0.7",
3
+ "version": "2.0.0",
4
4
  "description": "Shared modules of the Spacecat Services - Scrape Client",
5
5
  "type": "module",
6
6
  "engines": {
@@ -118,6 +118,8 @@ export default class ScrapeClient {
118
118
  /**
119
119
  * Create and start a new scrape job.
120
120
  * @param {object} data - json data for scrape job
121
+ * @param {number} data.maxScrapeAge - (optional) max age of scrapes in hours
122
+ * (default is 24; use 0 to force a re-scrape)
121
123
  * @returns {Promise<Response>} newly created job object
122
124
  */
123
125
  async createScrapeJob(data) {
@@ -125,7 +127,11 @@ export default class ScrapeClient {
125
127
  this.validateRequestData(data);
126
128
 
127
129
  const {
128
- urls, options, customHeaders, processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
130
+ urls,
131
+ options,
132
+ customHeaders,
133
+ processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
134
+ maxScrapeAge = 24,
129
135
  } = data;
130
136
 
131
137
  this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
@@ -142,6 +148,7 @@ export default class ScrapeClient {
142
148
  processingType,
143
149
  mergedOptions,
144
150
  customHeaders,
151
+ maxScrapeAge,
145
152
  );
146
153
  return ScrapeJobDto.toJSON(job);
147
154
  } catch (error) {
@@ -33,37 +33,10 @@ function ScrapeJobSupervisor(services, config) {
33
33
  const { ScrapeJob } = dataAccess;
34
34
 
35
35
  const {
36
- queues = [], // Array of scrape queues
37
36
  scrapeWorkerQueue, // URL of the scrape worker queue
38
37
  maxUrlsPerMessage,
39
38
  } = config;
40
39
 
41
- /**
42
- * Get the queue with the least number of messages.
43
- */
44
- async function getAvailableScrapeQueue() {
45
- const countMessages = async (queue) => {
46
- const count = await sqs.getQueueMessageCount(queue);
47
- return { queue, count };
48
- };
49
-
50
- const arrProm = queues.map(
51
- (queue) => countMessages(queue),
52
- );
53
- const queueMessageCounts = await Promise.all(arrProm);
54
-
55
- if (queueMessageCounts.length === 0) {
56
- return null;
57
- }
58
-
59
- // get the queue with the lowest number of messages
60
- const queueWithLeastMessages = queueMessageCounts.reduce(
61
- (min, current) => (min.count < current.count ? min : current),
62
- );
63
- log.info(`Queue with least messages: ${queueWithLeastMessages.queue}`);
64
- return queueWithLeastMessages.queue;
65
- }
66
-
67
40
  function determineBaseURL(urls) {
68
41
  // Initially, we will just use the domain of the first URL
69
42
  const url = new URL(urls[0]);
@@ -74,7 +47,6 @@ function ScrapeJobSupervisor(services, config) {
74
47
  * Create a new scrape job by claiming one of the free scrape queues, persisting the scrape job
75
48
  * metadata, and setting the job status to 'RUNNING'.
76
49
  * @param {Array<string>} urls - The list of URLs to scrape.
77
- * @param {string} scrapeQueueId - Name of the queue to use for this scrape job.
78
50
  * @param {string} processingType - The scrape handler to be used for the scrape job.
79
51
  * @param {object} options - Client provided options for the scrape job.
80
52
  * @param {object} customHeaders - Custom headers to be sent with each request.
@@ -82,14 +54,12 @@ function ScrapeJobSupervisor(services, config) {
82
54
  */
83
55
  async function createNewScrapeJob(
84
56
  urls,
85
- scrapeQueueId,
86
57
  processingType,
87
58
  options,
88
59
  customHeaders = null,
89
60
  ) {
90
61
  const jobData = {
91
62
  baseURL: determineBaseURL(urls),
92
- scrapeQueueId,
93
63
  processingType,
94
64
  options,
95
65
  urlCount: urls.length,
@@ -151,8 +121,9 @@ function ScrapeJobSupervisor(services, config) {
151
121
  * @param {Array<string>} urls - Array of URL records to queue.
152
122
  * @param {object} scrapeJob - The scrape job record.
153
123
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
124
+ * @param {number} maxScrapeAge - Max age of existing scrapes in hours (0 forces a re-scrape)
154
125
  */
155
- async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders) {
126
+ async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge) {
156
127
  log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
157
128
  + ` URLs. This new job has claimed: ${scrapeJob.getScrapeQueueId()} `
158
129
  + `(jobId: ${scrapeJob.getId()})`);
@@ -183,6 +154,7 @@ function ScrapeJobSupervisor(services, config) {
183
154
  batchOffset: offset,
184
155
  customHeaders,
185
156
  options,
157
+ maxScrapeAge,
186
158
  };
187
159
 
188
160
  // eslint-disable-next-line no-await-in-loop
@@ -193,8 +165,10 @@ function ScrapeJobSupervisor(services, config) {
193
165
  /**
194
166
  * Starts a new scrape job.
195
167
  * @param {Array<string>} urls - The URLs to scrape.
168
+ * @param {string} processingType - The type of processing to perform.
196
169
  * @param {object} options - Optional configuration params for the scrape job.
197
170
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
171
+ * @param {number} maxScrapeAge - Max age of existing scrapes in hours (0 forces a re-scrape)
198
172
  * @returns {Promise<ScrapeJob>} newly created job object
199
173
  */
200
174
  async function startNewJob(
@@ -202,18 +176,10 @@ function ScrapeJobSupervisor(services, config) {
202
176
  processingType,
203
177
  options,
204
178
  customHeaders,
179
+ maxScrapeAge,
205
180
  ) {
206
- // Determine if there is a free scrape queue
207
- const scrapeQueueId = await getAvailableScrapeQueue();
208
-
209
- if (scrapeQueueId === null) {
210
- throw new Error('Service Unavailable: No scrape queue available');
211
- }
212
-
213
- // If a queue is available, create the scrape-job record in dataAccess:
214
181
  const newScrapeJob = await createNewScrapeJob(
215
182
  urls,
216
- scrapeQueueId,
217
183
  processingType,
218
184
  options,
219
185
  customHeaders,
@@ -224,14 +190,13 @@ function ScrapeJobSupervisor(services, config) {
224
190
  + `- baseUrl: ${newScrapeJob.getBaseURL()}\n`
225
191
  + `- urlCount: ${urls.length}\n`
226
192
  + `- jobId: ${newScrapeJob.getId()}\n`
227
- + `- scrapeQueueId: ${scrapeQueueId}\n`
228
193
  + `- customHeaders: ${JSON.stringify(customHeaders)}\n`
229
194
  + `- options: ${JSON.stringify(options)}`,
230
195
  );
231
196
 
232
197
  // Queue all URLs for scrape as a single message. This enables the controller to respond with
233
198
  // a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
234
- await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders);
199
+ await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge);
235
200
 
236
201
  return newScrapeJob;
237
202
  }