@adobe/spacecat-shared-scrape-client 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ # [@adobe/spacecat-shared-scrape-client-v1.0.4](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.3...@adobe/spacecat-shared-scrape-client-v1.0.4) (2025-07-24)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * (scrape-job-supervisor): add offset for URL numbering in batches ([#868](https://github.com/adobe/spacecat-shared/issues/868)) ([12789c0](https://github.com/adobe/spacecat-shared/commit/12789c0cabe33ad5e526793d645bfef421a851af))
7
+
8
+ # [@adobe/spacecat-shared-scrape-client-v1.0.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.2...@adobe/spacecat-shared-scrape-client-v1.0.3) (2025-07-21)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * ScrapeClient handle large url lists ([#854](https://github.com/adobe/spacecat-shared/issues/854)) ([d0768db](https://github.com/adobe/spacecat-shared/commit/d0768db101d65bc604c64473648cba0344612025))
14
+
1
15
  # [@adobe/spacecat-shared-scrape-client-v1.0.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.1...@adobe/spacecat-shared-scrape-client-v1.0.2) (2025-07-19)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-scrape-client",
3
- "version": "1.0.2",
3
+ "version": "1.0.4",
4
4
  "description": "Shared modules of the Spacecat Services - Scrape Client",
5
5
  "type": "module",
6
6
  "engines": {
@@ -35,6 +35,7 @@ function ScrapeJobSupervisor(services, config) {
35
35
  const {
36
36
  queues = [], // Array of scrape queues
37
37
  scrapeWorkerQueue, // URL of the scrape worker queue
38
+ maxUrlsPerMessage,
38
39
  } = config;
39
40
 
40
41
  /**
@@ -129,9 +130,24 @@ function ScrapeJobSupervisor(services, config) {
129
130
  }
130
131
 
131
132
  /**
132
- * Queue all URLs as a single message for processing by another function. This will enable
133
- * the controller to respond with a new job ID ASAP, while the individual URLs are queued up
134
- * asynchronously.
133
+ * Split an array of URLs into batches of a specified size.
134
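+ * @param {Array<string>} urls - URLs to split.
135
+ * @param {number} [batchSize=1000] - Maximum number of URLs per batch.
136
+ * @returns {Array<Array<string>>} The batches, in the original URL order.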
137
+ */
138
+ function splitUrlsIntoBatches(urls, batchSize = 1000) {
139
+ const batches = [];
140
+ for (let i = 0; i < urls.length; i += batchSize) {
141
+ batches.push(urls.slice(i, i + batchSize));
142
+ }
143
+ log.info(`Split ${urls.length} URLs into ${batches.length} batches of size ${batchSize}.`);
144
+ return batches;
145
+ }
146
+
147
+ /**
148
+ * Queue all URLs for processing by another function. URL arrays longer than
149
+ * maxUrlsPerMessage are split into multiple messages. This lets the controller respond
150
+ * with a new job ID ASAP, while the individual URLs are queued up asynchronously.
135
151
  * @param {Array<string>} urls - Array of URL records to queue.
136
152
  * @param {object} scrapeJob - The scrape job record.
137
153
  * @param {object} customHeaders - Optional custom headers to be sent with each request.
@@ -143,17 +159,35 @@ function ScrapeJobSupervisor(services, config) {
143
159
 
144
160
  const options = scrapeJob.getOptions();
145
161
  const processingType = scrapeJob.getProcessingType();
162
+ const totalUrlCount = urls.length;
163
+ const baseUrl = scrapeJob.getBaseURL();
164
+ let urlBatches = [];
146
165
 
147
- // Send a single message containing all URLs and the new job ID
148
- const message = {
149
- processingType,
150
- jobId: scrapeJob.getId(),
151
- urls,
152
- customHeaders,
153
- options,
154
- };
166
+ // If there are more than maxUrlsPerMessage URLs, split them into multiple messages
167
+ if (totalUrlCount > maxUrlsPerMessage) {
168
+ urlBatches = splitUrlsIntoBatches(urls, maxUrlsPerMessage);
169
+ log.info(`Queuing ${totalUrlCount} URLs for scrape in ${urlBatches.length} messages.`);
170
+ } else {
171
+ // Otherwise the URLs fit within the per-message limit and are sent in a single message
172
+ log.info(`Queuing ${totalUrlCount} URLs for scrape in a single message.`);
173
+ urlBatches = [urls]; // Wrap in an array to maintain consistent structure
174
+ }
175
+
176
+ for (const [index, batch] of urlBatches.entries()) {
177
+ // Calculate the offset for numbering the URLs in the batch
178
+ const offset = index * maxUrlsPerMessage;
179
+ const message = {
180
+ processingType,
181
+ jobId: scrapeJob.getId(),
182
+ batch,
183
+ batchOffset: offset,
184
+ customHeaders,
185
+ options,
186
+ };
155
187
 
156
- await sqs.sendMessage(scrapeWorkerQueue, message);
188
+ // eslint-disable-next-line no-await-in-loop
189
+ await sqs.sendMessage(scrapeWorkerQueue, message, baseUrl);
190
+ }
157
191
  }
158
192
 
159
193
  /**