@adobe/spacecat-shared-scrape-client 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-scrape-client-v2.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v2.0.0...@adobe/spacecat-shared-scrape-client-v2.1.0) (2025-08-20)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* add scrape-client destination ([#913](https://github.com/adobe/spacecat-shared/issues/913)) ([e208a87](https://github.com/adobe/spacecat-shared/commit/e208a87214874a2708ac2d7614fcfd4c0770fe17))
|
|
7
|
+
|
|
1
8
|
# [@adobe/spacecat-shared-scrape-client-v2.0.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-scrape-client-v1.0.7...@adobe/spacecat-shared-scrape-client-v2.0.0) (2025-08-13)
|
|
2
9
|
|
|
3
10
|
|
package/package.json
CHANGED
|
@@ -11,8 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
import {
|
|
14
|
-
isIsoDate, isObject, isValidUrl,
|
|
15
|
-
isValidUUID,
|
|
14
|
+
hasText, isIsoDate, isNonEmptyArray, isObject, isValidUrl, isValidUUID,
|
|
16
15
|
} from '@adobe/spacecat-shared-utils';
|
|
17
16
|
import { ScrapeJob as ScrapeJobModel } from '@adobe/spacecat-shared-data-access';
|
|
18
17
|
import { ScrapeJobDto } from './scrapeJobDto.js';
|
|
@@ -132,6 +131,7 @@ export default class ScrapeClient {
|
|
|
132
131
|
customHeaders,
|
|
133
132
|
processingType = ScrapeJobModel.ScrapeProcessingType.DEFAULT,
|
|
134
133
|
maxScrapeAge = 24,
|
|
134
|
+
auditData = {},
|
|
135
135
|
} = data;
|
|
136
136
|
|
|
137
137
|
this.config.log.info(`Creating a new scrape job with ${urls.length} URLs.`);
|
|
@@ -149,6 +149,7 @@ export default class ScrapeClient {
|
|
|
149
149
|
mergedOptions,
|
|
150
150
|
customHeaders,
|
|
151
151
|
maxScrapeAge,
|
|
152
|
+
auditData,
|
|
152
153
|
);
|
|
153
154
|
return ScrapeJobDto.toJSON(job);
|
|
154
155
|
} catch (error) {
|
|
@@ -228,6 +229,24 @@ export default class ScrapeClient {
|
|
|
228
229
|
}
|
|
229
230
|
}
|
|
230
231
|
|
|
232
|
+
async getScrapeResultPaths(jobId) {
|
|
233
|
+
try {
|
|
234
|
+
const job = await this.scrapeSupervisor.getScrapeJob(jobId);
|
|
235
|
+
if (!job) {
|
|
236
|
+
return null;
|
|
237
|
+
}
|
|
238
|
+
const { ScrapeUrl } = this.config.dataAccess;
|
|
239
|
+
const scrapeUrls = await ScrapeUrl.allByScrapeJobId(job.getId());
|
|
240
|
+
return scrapeUrls
|
|
241
|
+
.filter((url) => url.getStatus() === ScrapeJobModel.ScrapeUrlStatus.COMPLETE)
|
|
242
|
+
.reduce((map, url) => map.set(url.getUrl(), url.getPath()), new Map());
|
|
243
|
+
} catch (error) {
|
|
244
|
+
const msgError = `Failed to fetch the scrape job result: ${error.message}`;
|
|
245
|
+
this.config.log.error(msgError);
|
|
246
|
+
throw new Error(msgError);
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
|
|
231
250
|
/**
|
|
232
251
|
* Get all scrape jobs by baseURL and processing type
|
|
233
252
|
* @param {string} baseURL - The baseURL of the jobs to fetch.
|
|
@@ -122,10 +122,12 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
122
122
|
* @param {object} scrapeJob - The scrape job record.
|
|
123
123
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
124
124
|
* @param {number} maxScrapeAge - The maximum age of the scrape job
|
|
125
|
+
* @param {object} auditData - Optional audit data passed along with the queued URLs.
|
|
125
126
|
*/
|
|
126
|
-
|
|
127
|
+
// eslint-disable-next-line max-len
|
|
128
|
+
async function queueUrlsForScrapeWorker(urls, scrapeJob, customHeaders, maxScrapeAge, auditData) {
|
|
127
129
|
log.info(`Starting a new scrape job of baseUrl: ${scrapeJob.getBaseURL()} with ${urls.length}`
|
|
128
|
-
+
|
|
130
|
+
+ ' URLs.'
|
|
129
131
|
+ `(jobId: ${scrapeJob.getId()})`);
|
|
130
132
|
|
|
131
133
|
const options = scrapeJob.getOptions();
|
|
@@ -155,6 +157,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
155
157
|
customHeaders,
|
|
156
158
|
options,
|
|
157
159
|
maxScrapeAge,
|
|
160
|
+
auditData,
|
|
158
161
|
};
|
|
159
162
|
|
|
160
163
|
// eslint-disable-next-line no-await-in-loop
|
|
@@ -168,7 +171,8 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
168
171
|
* @param {string} processingType - The type of processing to perform.
|
|
169
172
|
* @param {object} options - Optional configuration params for the scrape job.
|
|
170
173
|
* @param {object} customHeaders - Optional custom headers to be sent with each request.
|
|
171
|
-
* @param {
|
|
174
|
+
* @param {number} maxScrapeAge - The maximum age of the scrape job
|
|
175
|
+
* @param {object} auditContext - Optional audit data handed on to queueUrlsForScrapeWorker.
|
|
172
176
|
* @returns {Promise<ScrapeJob>} newly created job object
|
|
173
177
|
*/
|
|
174
178
|
async function startNewJob(
|
|
@@ -177,6 +181,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
177
181
|
options,
|
|
178
182
|
customHeaders,
|
|
179
183
|
maxScrapeAge,
|
|
184
|
+
auditContext,
|
|
180
185
|
) {
|
|
181
186
|
const newScrapeJob = await createNewScrapeJob(
|
|
182
187
|
urls,
|
|
@@ -196,7 +201,7 @@ function ScrapeJobSupervisor(services, config) {
|
|
|
196
201
|
|
|
197
202
|
// Queue all URLs for scrape as a single message. This enables the controller to respond with
|
|
198
203
|
// a job ID ASAP, while the individual URLs are queued up asynchronously by another function.
|
|
199
|
-
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge);
|
|
204
|
+
await queueUrlsForScrapeWorker(urls, newScrapeJob, customHeaders, maxScrapeAge, auditContext);
|
|
200
205
|
|
|
201
206
|
return newScrapeJob;
|
|
202
207
|
}
|