@adobe/spacecat-shared-scrape-client 1.0.7 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,22 @@
1
+ # [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
2
+
3
+
4
+ ### Features
5
+
6
+ * add scrape-client destination ([#913](https://github.com/adobe/spacecat-shared/issues/913)) ([e208a87](https://github.com/adobe/spacecat-shared/commit/e208a87214874a2708ac2d7614fcfd4c0770fe17))
7
+
8
+ # [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
9
+
10
+
11
+ ### Features
12
+
13
+ * re-scraping of URLs ([b889a19](https://github.com/adobe/spacecat-shared/commit/b889a19b1cec20b1f1dc32a89b34ab5125fa90e6))
14
+
15
+
16
+ ### BREAKING CHANGES
17
+
18
+ * ScrapeClient does not choose a scrape queue anymore. This is done in Scrape Job Manager.
19
+
1
20
  # [@adobe/spacecat-shared-scrape-client-v1.0.7](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.6...@adobe/spacecat-shared-scrape-client-v1.0.7) (2025-08-09)
2
21
 
3
22
 
package/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Spacecat Shared - Scrape Client
2
2
 
3
+
3
4
  A JavaScript client for managing web scraping jobs, part of the SpaceCat Shared library. The ScrapeClient provides a comprehensive interface for creating, monitoring, and retrieving results from web scraping operations without needing to access the SpaceCat API service directly.
4
5
 
5
6
  ## Installation
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-scrape-client",
3
- "version": "1.0.7",
3
+ "version": "2.1.0",
4
4
  "description": "Shared modules of the Spacecat Services - Scrape Client",
5
5
  "type": "module",
6
6
  "engines": {
@@ -11,8 +11,7 @@
11
11
  */
12
12
 
13
13
  import {
14
- isIsoDate, isObject, isValidUrl, isNonEmptyArray, hasText,
15
- isValidUUID,
14
+ hasText, isIsoDate, isNonEmptyArray, isObject, isValidUrl, isValidUUID,
16
15
  } from '@adobe/spacecat-shared-utils';
17
16
  import { ScrapeJob as ScrapeJobModel } from '@adobe/spacecat-shared-data-access';
18
17
  import { ScrapeJobDto } from './scrapeJobDto.js';
@@ -118,6 +117,8 @@ export default class ScrapeClient {
118
117
  /**
119
118
  * Create and start a new scrape job.
120
119
  * @param {object} data - json data for scrape job
120
+ * @param {number} data.maxScrapeAge - (optional) max age of scrapes in hours
121
+ * defaults to 24; pass 0 to force a re-scrape
121
122
  * @returns {Promise<Response>} newly created job object
122
123
  */
123
124
  async createScrapeJob(data) {
@@ -125,7 +126,12 @@ export default class ScrapeClient {
125
126
  this.validateRequestData(data);
126
127
 
127
128
  const {
128
- urls, options, customHeaders, processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
129
+ urls,
130
+ options,
131
+ customHeaders,
132
+ processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
133
+ maxScrapeAge = 24,
134
+ auditData = {},
129
135
  } = data;
130
136
 
131
137
  this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
@@ -142,6 +148,8 @@ export default class ScrapeClient {
142
148
  processingType,
143
149
  mergedOptions,
144
150
  customHeaders,
151
+ maxScrapeAge,
152
+ auditData,
145
153
  );
146
154
  return ScrapeJobDto.toJSON(job);
147
155
  } catch (error) {
@@ -221,6 +229,24 @@ export default class ScrapeClient {
221
229
  }
222
230
  }
223
231
 
232
+ async getScrapeResultPaths(jobId) {
233
+ try {
234
+ const job = await this.scrapeSupervisor.getScrapeJob(jobId);
235
+ if (!job) {
236
+ return null;
237
+ }
238
+ const { ScrapeUrl } = this.config.dataAccess;
239
+ const scrapeUrls = await ScrapeUrl.allByScrapeJobId(job.getId());
240
+ return scrapeUrls
241
+ .filter((url) => url.getStatus() === ScrapeJobModel.ScrapeUrlStatus.COMPLETE)
242
+ .reduce((map, url) => map.set(url.getUrl(), url.getPath()), new Map());
243
+ } catch (error) {
244
+ const msgError = `Failed to fetch the scrape job result: ${error.message}`;
245
+ this.config.log.error(msgError);
246
+ throw new Error(msgError);
247
+ }
248
+ }
249
+
224
250
  /**
225
251
  * Get all scrape jobs by baseURL and processing type
226
252
  * @param {string} baseURL - The baseURL of the jobs to fetch.
@@ -33,37 +33,10 @@ function ScrapeJobSupervisor(services, config) {
33
33
  const { ScrapeJob } = dataAccess;
34
34
 
35
35
  const {
36
- queues = [], // Array of scrape queues
37
36
  scrapeWorkerQueue, // URL of the scrape worker queue
38
37
  maxUrlsPerMessage,
39
38
  } = config;
40
39
 
41
- /**
42
- * Get the queue with the least number of messages.
43
- */
44
- async function getAvailableScrapeQueue() {
45
- const countMessages = async (queue) => {
46
- const count = await sqs.getQueueMessageCount(queue);
47
- return { queue, count };
48
- };
49
-
50
- const arrProm = queues.map(
51
- (queue) => countMessages(queue),
52
- );
53
- const queueMessageCounts = await Promise.all(arrProm);
54
-
55
- if (queueMessageCounts.length === 0) {
56
- return null;
57
- }
58
-
59
- // get the queue with the lowest number of messages
60
- const queueWithLeastMessages = queueMessageCounts.reduce(
61
- (min, current) => (min.count < current.count ? min : current),
62
- );
63
- log.info(`Queue with least messages: ${queueWithLeastMessages.queue}`);
64
- return queueWithLeastMessages.queue;
65
- }
66
-
67
40
  function determineBaseURL(urls) {
68
41
  // Initially, we will just use the domain of the first URL
69
42
  const url = new URL(urls[0]);
@@ -74,7 +47,6 @@ function ScrapeJobSupervisor(services, config) {
74
47
  * Create a new scrape job by claiming one of the free scrape queues, persisting the scrape job
75
48
  * metadata, and setting the job status to 'RUNNING'.
76
49
  * @param {Array<string>} urls - The list of URLs to scrape.
77
- * @param {string} scrapeQueueId - Name of the queue to use for this scrape job.
78
50
  * @param {string} processingType - The scrape handler to be used for the scrape job.
79
51
  * @param {object} options - Client provided options for the scrape job.
80
52
  * @param {object} customHeaders - Custom headers to be sent with each request.
@@ -82,14 +54,12 @@ function ScrapeJobSupervisor(services, config) {
82
54
  */
83
55
  async function createNewScrapeJob(
84
56
  urls,
85
- scrapeQueueId,
86
57
  processingType,
87
58
  options,
88
59
  customHeaders = null,
89
60
  ) {
90
61
  const jobData = {
91
62
  baseURL: determineBaseURL(urls),
92
- scrapeQueueId,
93
63
  processingType,
94
64
  options,
95
65
  urlCount: urls.length,
@@ -151,10 +121,13 @@ function ScrapeJobSupervisor(services, config) {
151
121
  * @param {Array<string>} urls - Array of URL records to queue.
152
122
  * @param {object} scrapeJob - The scrape job record.
153
123
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
124
+ * @param {number} maxScrapeAge - The maximum age of existing scrapes, in hours
125
+ * @param {object} auditData - Audit metadata forwarded with each scrape-worker message
154
126
  */
155
- async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders) {
127
+ // eslint-disable-next-line max-len
128
+ async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
156
129
  log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
157
- + ` URLs. This new job has claimed: ${scrapeJob.getScrapeQueueId()} `
130
+ + ' URLs.'
158
131
  + `(jobId: ${scrapeJob.getId()})`);
159
132
 
160
133
  const options = scrapeJob.getOptions();
@@ -183,6 +156,8 @@ function ScrapeJobSupervisor(services, config) {
183
156
  batchOffset: offset,
184
157
  customHeaders,
185
158
  options,
159
+ maxScrapeAge,
160
+ auditData,
186
161
  };
187
162
 
188
163
  // eslint-disable-next-line no-await-in-loop
@@ -193,8 +168,11 @@ function ScrapeJobSupervisor(services, config) {
193
168
  /**
194
169
  * Starts a new scrape job.
195
170
  * @param {Array<string>} urls - The URLs to scrape.
171
+ * @param {string} processingType - The type of processing to perform.
196
172
  * @param {object} options - Optional configuration params for the scrape job.
197
173
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
174
+ * @param {number} maxScrapeAge - The maximum age of existing scrapes, in hours
175
+ * @param {object} auditContext - Audit metadata forwarded to the scrape worker
198
176
  * @returns {Promise<ScrapeJob>} newly created job object
199
177
  */
200
178
  async function startNewJob(
@@ -202,18 +180,11 @@ function ScrapeJobSupervisor(services, config) {
202
180
  processingType,
203
181
  options,
204
182
  customHeaders,
183
+ maxScrapeAge,
184
+ auditContext,
205
185
  ) {
206
- // Determine if there is a free scrape queue
207
- const scrapeQueueId = await getAvailableScrapeQueue();
208
-
209
- if (scrapeQueueId === null) {
210
- throw new Error('Service Unavailable: No scrape queue available');
211
- }
212
-
213
- // If a queue is available, create the scrape-job record in dataAccess:
214
186
  const newScrapeJob = await createNewScrapeJob(
215
187
  urls,
216
- scrapeQueueId,
217
188
  processingType,
218
189
  options,
219
190
  customHeaders,
@@ -224,14 +195,13 @@ function ScrapeJobSupervisor(services, config) {
224
195
  + `- baseUrl: ${newScrapeJob.getBaseURL()}\n`
225
196
  + `- urlCount: ${urls.length}\n`
226
197
  + `- jobId: ${newScrapeJob.getId()}\n`
227
- + `- scrapeQueueId: ${scrapeQueueId}\n`
228
198
  + `- customHeaders: ${JSON.stringify(customHeaders)}\n`
229
199
  + `- options: ${JSON.stringify(options)}`,
230
200
  );
231
201
 
232
202
  // Queue all URLs for scrape as a single message. This enables the controller to respond with
233
203
  // a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
234
- await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders);
204
+ await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge, auditContext);
235
205
 
236
206
  return newScrapeJob;
237
207
  }