@adobe/spacecat-shared-scrape-client 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +2 -2
- package/src/clients/scrape-job-supervisor.js +43 -12
package/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
+# [@adobe/spacecat-shared-scrape-client-v1.0.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.2...@adobe/spacecat-shared-scrape-client-v1.0.3) (2025-07-21)
+
+
+### Bug Fixes
+
+* ScrapeClient handle large url lists ([#854](https://github.com/adobe/spacecat-shared/issues/854)) ([d0768db](https://github.com/adobe/spacecat-shared/commit/d0768db101d65bc604c64473648cba0344612025))
+
+# [@adobe/spacecat-shared-scrape-client-v1.0.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.1...@adobe/spacecat-shared-scrape-client-v1.0.2) (2025-07-19)
+
+
+### Bug Fixes
+
+* **deps:** update external fixes ([#859](https://github.com/adobe/spacecat-shared/issues/859)) ([7ca9099](https://github.com/adobe/spacecat-shared/commit/7ca90994d61d07f71e580301365447b94ad07a52))
+
 # [@adobe/spacecat-shared-scrape-client-v1.0.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.0...@adobe/spacecat-shared-scrape-client-v1.0.1) (2025-07-12)


package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@adobe/spacecat-shared-scrape-client",
-  "version": "1.0.1",
+  "version": "1.0.3",
   "description": "Shared modules of the Spacecat Services - Scrape Client",
   "type": "module",
   "engines": {
@@ -42,7 +42,7 @@
   "devDependencies": {
     "chai": "5.2.1",
     "chai-as-promised": "8.0.1",
-    "nock": "14.0.
+    "nock": "14.0.6",
     "sinon": "20.0.0",
     "sinon-chai": "4.0.0",
     "typescript": "5.8.3"

package/src/clients/scrape-job-supervisor.js
CHANGED

@@ -35,6 +35,7 @@ function ScrapeJobSupervisor(services, config) {
   const {
     queues = [], // Array of scrape queues
     scrapeWorkerQueue, // URL of the scrape worker queue
+    maxUrlsPerMessage,
   } = config;

   /**
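For reference, the new maxUrlsPerMessage setting is read from the same config object that already carries the queue settings. A minimal sketch of how a caller might supply it; the queue name, queue URL, and the services argument are illustrative assumptions, not values from the package:

// Hypothetical wiring: only the keys queues, scrapeWorkerQueue and
// maxUrlsPerMessage come from the diff; all values here are made up.
const config = {
  queues: ['scrape-queue-default'], // Array of scrape queues
  scrapeWorkerQueue: 'https://sqs.us-east-1.amazonaws.com/000000000000/scrape-worker', // URL of the scrape worker queue
  maxUrlsPerMessage: 1000, // per-message URL cap introduced in v1.0.3
};

const supervisor = ScrapeJobSupervisor(services, config); // services is assumed to provide sqs and log
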
@@ -129,9 +130,24 @@ function ScrapeJobSupervisor(services, config) {
   }

   /**
-   *
-   *
-   *
+   * Split an array of URLs into batches of a specified size.
+   * @param urls
+   * @param batchSize
+   * @returns {*[]}
+   */
+  function splitUrlsIntoBatches(urls, batchSize = 1000) {
+    const batches = [];
+    for (let i = 0; i < urls.length; i += batchSize) {
+      batches.push(urls.slice(i, i + batchSize));
+    }
+    log.info(`Split ${urls.length} URLs into ${batches.length} batches of size ${batchSize}.`);
+    return batches;
+  }
+
+  /**
+   * Queue all URLs for processing by another function. Splits URL-Arrays > 1000 into multiple
+   * messages. This will enable the controller to respond with a new job ID ASAP, while the
+   * individual URLs are queued up asynchronously.
    * @param {Array<string>} urls - Array of URL records to queue.
    * @param {object} scrapeJob - The scrape job record.
    * @param {object} customHeaders - Optional custom headers to be sent with each request.
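The helper is plain slice arithmetic: every batch holds batchSize URLs except possibly the last, which carries the remainder. A standalone usage sketch, with the helper copied from the diff minus the log call so it runs on its own:

// Standalone copy of the helper above, for illustration only.
function splitUrlsIntoBatches(urls, batchSize = 1000) {
  const batches = [];
  for (let i = 0; i < urls.length; i += batchSize) {
    batches.push(urls.slice(i, i + batchSize));
  }
  return batches;
}

const urls = Array.from({ length: 2500 }, (_, i) => `https://example.com/page-${i}`);
console.log(splitUrlsIntoBatches(urls, 1000).map((b) => b.length)); // [ 1000, 1000, 500 ]
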
@@ -143,17 +159,32 @@ function ScrapeJobSupervisor(services, config) {

     const options = scrapeJob.getOptions();
     const processingType = scrapeJob.getProcessingType();
+    const totalUrlCount = urls.length;
+    const baseUrl = scrapeJob.getBaseURL();
+    let urlBatches = [];

-    //
-
-
-
-
-
-
-
+    // If there are more than 1000 URLs, split them into multiple messages
+    if (totalUrlCount > maxUrlsPerMessage) {
+      urlBatches = splitUrlsIntoBatches(urls, maxUrlsPerMessage);
+      log.info(`Queuing ${totalUrlCount} URLs for scrape in ${urlBatches.length} messages.`);
+    } else {
+      // If there are 1000 or fewer URLs, we can send them all in a single message
+      log.info(`Queuing ${totalUrlCount} URLs for scrape in a single message.`);
+      urlBatches = [urls]; // Wrap in an array to maintain consistent structure
+    }
+
+    for (const batch of urlBatches) {
+      const message = {
+        processingType,
+        jobId: scrapeJob.getId(),
+        batch,
+        customHeaders,
+        options,
+      };

-
+      // eslint-disable-next-line no-await-in-loop
+      await sqs.sendMessage(scrapeWorkerQueue, message, baseUrl);
+    }
   }

   /**
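Each batch therefore leaves the supervisor as its own queue message, all sharing the same jobId, and the sends happen one at a time (hence the no-await-in-loop disable), so the supervisor never has more than one send in flight. A sketch of the consuming side; the message fields match the diff, but the handler, the record shape, and scrapeUrl are assumptions rather than package code:

// Hypothetical worker handler for one SQS record.
async function handleScrapeMessage(record) {
  const {
    processingType, jobId, batch, customHeaders, options,
  } = JSON.parse(record.Body); // fields as assembled in the diff above

  // All messages of a job carry the same jobId, so results from the
  // individual batches can be re-associated with the job afterwards.
  for (const url of batch) {
    // eslint-disable-next-line no-await-in-loop
    await scrapeUrl(url, { processingType, customHeaders, options }); // scrapeUrl is illustrative
  }
  console.log(`Job ${jobId}: processed ${batch.length} URLs.`);
}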