@adobe/spacecat-shared-scrape-client 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +1 -1
- package/src/clients/scrape-job-supervisor.js +46 -12
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-scrape-client-v1.0.4](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.3...@adobe/spacecat-shared-scrape-client-v1.0.4) (2025-07-24)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* (scrape-job-supervisor): add offset for URL numbering in batches ([#868](https://github.com/adobe/spacecat-shared/issues/868)) ([12789c0](https://github.com/adobe/spacecat-shared/commit/12789c0cabe33ad5e526793d645bfef421a851af))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-scrape-client-v1.0.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.2...@adobe/spacecat-shared-scrape-client-v1.0.3) (2025-07-21)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* ScrapeClient handle large url lists ([#854](https://github.com/adobe/spacecat-shared/issues/854)) ([d0768db](https://github.com/adobe/spacecat-shared/commit/d0768db101d65bc604c64473648cba0344612025))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-scrape-client-v1.0.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.1...@adobe/spacecat-shared-scrape-client-v1.0.2) (2025-07-19)
|
|
2
16
|
|
|
3
17
|
|
package/src/clients/scrape-job-supervisor.js
CHANGED
|
@@ -35,6 +35,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
35
35
|
const {
|
|
36
36
|
queues = [], // Array of scrape queues
|
|
37
37
|
scrapeWorkerQueue, // URL of the scrape worker queue
|
|
38
|
+
maxUrlsPerMessage,
|
|
38
39
|
} = config;
|
|
39
40
|
|
|
40
41
|
/**
|
|
@@ -129,9 +130,24 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
129
130
|
}
|
|
130
131
|
|
|
131
132
|
/**
|
|
132
|
-
*
|
|
133
|
-
*
|
|
134
|
-
*
|
|
133
|
+
* Split an array of URLs into batches of a specified size.
|
|
134
|
+
* @param urls
|
|
135
|
+
* @param batchSize
|
|
136
|
+
* @returns {*[]}
|
|
137
|
+
*/
|
|
138
|
+
function splitUrlsIntoBatches(urls, batchSize = 1000) {
|
|
139
|
+
const batches = [];
|
|
140
|
+
for (let i = 0; i < urls.length; i += batchSize) {
|
|
141
|
+
batches.push(urls.slice(i, i + batchSize));
|
|
142
|
+
}
|
|
143
|
+
log.info(`Split ${urls.length} URLs into ${batches.length} batches of size ${batchSize}.`);
|
|
144
|
+
return batches;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* Queue all URLs for processing by another function. Splits URL-Arrays > 1000 into multiple
|
|
149
|
+
* messages. This will enable the controller to respond with a new job ID ASAP, while the
|
|
150
|
+
* individual URLs are queued up asynchronously.
|
|
135
151
|
* @param {Array<string>} urls - Array of URL records to queue.
|
|
136
152
|
* @param {object} scrapeJob - The scrape job record.
|
|
137
153
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
@@ -143,17 +159,35 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
143
159
|
|
|
144
160
|
const options = scrapeJob.getOptions();
|
|
145
161
|
const processingType = scrapeJob.getProcessingType();
|
|
162
|
+
const totalUrlCount = urls.length;
|
|
163
|
+
const baseUrl = scrapeJob.getBaseURL();
|
|
164
|
+
let urlBatches = [];
|
|
146
165
|
|
|
147
|
-
//
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
166
|
+
// If there are more than 1000 URLs, split them into multiple messages
|
|
167
|
+
if (totalUrlCount > maxUrlsPerMessage) {
|
|
168
|
+
urlBatches = splitUrlsIntoBatches(urls, maxUrlsPerMessage);
|
|
169
|
+
log.info(`Queuing ${totalUrlCount} URLs for scrape in ${urlBatches.length} messages.`);
|
|
170
|
+
} else {
|
|
171
|
+
// If there are 1000 or fewer URLs, we can send them all in a single message
|
|
172
|
+
log.info(`Queuing ${totalUrlCount} URLs for scrape in a single message.`);
|
|
173
|
+
urlBatches = [urls]; // Wrap in an array to maintain consistent structure
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
for (const [index, batch] of urlBatches.entries()) {
|
|
177
|
+
// Calculate the offset for numbering the URLs in the batch
|
|
178
|
+
const offset = index * maxUrlsPerMessage;
|
|
179
|
+
const message = {
|
|
180
|
+
processingType,
|
|
181
|
+
jobId: scrapeJob.getId(),
|
|
182
|
+
batch,
|
|
183
|
+
batchOffset: offset,
|
|
184
|
+
customHeaders,
|
|
185
|
+
options,
|
|
186
|
+
};
|
|
155
187
|
|
|
156
|
-
|
|
188
|
+
// eslint-disable-next-line no-await-in-loop
|
|
189
|
+
await sqs.sendMessage(scrapeWorkerQueue, message, baseUrl);
|
|
190
|
+
}
|
|
157
191
|
}
|
|
158
192
|
|
|
159
193
|
/**
|