@adobe/spacecat-shared-data-access 2.24.0 → 2.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +4 -4
- package/src/models/base/entity.registry.js +6 -0
- package/src/models/index.d.ts +2 -0
- package/src/models/index.js +2 -0
- package/src/models/scrape-job/index.d.ts +68 -0
- package/src/models/scrape-job/index.js +19 -0
- package/src/models/scrape-job/scrape-job.collection.js +45 -0
- package/src/models/scrape-job/scrape-job.model.js +77 -0
- package/src/models/scrape-job/scrape-job.schema.js +129 -0
- package/src/models/scrape-url/index.d.ts +36 -0
- package/src/models/scrape-url/index.js +19 -0
- package/src/models/scrape-url/scrape-url.collection.js +26 -0
- package/src/models/scrape-url/scrape-url.model.js +28 -0
- package/src/models/scrape-url/scrape-url.schema.js +50 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-data-access-v2.25.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-data-access-v2.24.1...@adobe/spacecat-shared-data-access-v2.25.0) (2025-06-18)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* introduce ScrapeJob and ScrapeUrl entities ([#803](https://github.com/adobe/spacecat-shared/issues/803)) ([d295f65](https://github.com/adobe/spacecat-shared/commit/d295f65a89a986f08d5d3b28fe60b45d4c65ee36))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-data-access-v2.24.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-data-access-v2.24.0...@adobe/spacecat-shared-data-access-v2.24.1) (2025-06-14)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* **deps:** update external fixes ([#802](https://github.com/adobe/spacecat-shared/issues/802)) ([fc2cb47](https://github.com/adobe/spacecat-shared/commit/fc2cb47183948833f5b0a411ae78d1649e747a17))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-data-access-v2.24.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-data-access-v2.23.3...@adobe/spacecat-shared-data-access-v2.24.0) (2025-06-13)
|
|
2
16
|
|
|
3
17
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@adobe/spacecat-shared-data-access",
|
|
3
|
-
"version": "2.24.0",
|
|
3
|
+
"version": "2.25.0",
|
|
4
4
|
"description": "Shared modules of the Spacecat Services - Data Access",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -36,11 +36,11 @@
|
|
|
36
36
|
},
|
|
37
37
|
"dependencies": {
|
|
38
38
|
"@adobe/spacecat-shared-utils": "1.38.4",
|
|
39
|
-
"@aws-sdk/client-dynamodb": "3.
|
|
40
|
-
"@aws-sdk/lib-dynamodb": "3.
|
|
39
|
+
"@aws-sdk/client-dynamodb": "3.828.0",
|
|
40
|
+
"@aws-sdk/lib-dynamodb": "3.828.0",
|
|
41
41
|
"@types/joi": "17.2.3",
|
|
42
42
|
"aws-xray-sdk": "3.10.3",
|
|
43
|
-
"electrodb": "3.4.
|
|
43
|
+
"electrodb": "3.4.3",
|
|
44
44
|
"joi": "17.13.3",
|
|
45
45
|
"pluralize": "8.0.0",
|
|
46
46
|
"uuid": "11.1.0"
|
|
@@ -25,6 +25,8 @@ import KeyEventCollection from '../key-event/key-event.collection.js';
|
|
|
25
25
|
import LatestAuditCollection from '../latest-audit/latest-audit.collection.js';
|
|
26
26
|
import OpportunityCollection from '../opportunity/opportunity.collection.js';
|
|
27
27
|
import OrganizationCollection from '../organization/organization.collection.js';
|
|
28
|
+
import ScrapeJobCollection from '../scrape-job/scrape-job.collection.js';
|
|
29
|
+
import ScrapeUrlCollection from '../scrape-url/scrape-url.collection.js';
|
|
28
30
|
import SiteCandidateCollection from '../site-candidate/site-candidate.collection.js';
|
|
29
31
|
import SiteCollection from '../site/site.collection.js';
|
|
30
32
|
import SiteTopPageCollection from '../site-top-page/site-top-page.collection.js';
|
|
@@ -42,6 +44,8 @@ import KeyEventSchema from '../key-event/key-event.schema.js';
|
|
|
42
44
|
import LatestAuditSchema from '../latest-audit/latest-audit.schema.js';
|
|
43
45
|
import OpportunitySchema from '../opportunity/opportunity.schema.js';
|
|
44
46
|
import OrganizationSchema from '../organization/organization.schema.js';
|
|
47
|
+
import ScrapeJobSchema from '../scrape-job/scrape-job.schema.js';
|
|
48
|
+
import ScrapeUrlSchema from '../scrape-url/scrape-url.schema.js';
|
|
45
49
|
import SiteSchema from '../site/site.schema.js';
|
|
46
50
|
import SiteCandidateSchema from '../site-candidate/site-candidate.schema.js';
|
|
47
51
|
import SiteTopPageSchema from '../site-top-page/site-top-page.schema.js';
|
|
@@ -140,6 +144,8 @@ EntityRegistry.registerEntity(KeyEventSchema, KeyEventCollection);
|
|
|
140
144
|
EntityRegistry.registerEntity(LatestAuditSchema, LatestAuditCollection);
|
|
141
145
|
EntityRegistry.registerEntity(OpportunitySchema, OpportunityCollection);
|
|
142
146
|
EntityRegistry.registerEntity(OrganizationSchema, OrganizationCollection);
|
|
147
|
+
EntityRegistry.registerEntity(ScrapeJobSchema, ScrapeJobCollection);
|
|
148
|
+
EntityRegistry.registerEntity(ScrapeUrlSchema, ScrapeUrlCollection);
|
|
143
149
|
EntityRegistry.registerEntity(SiteSchema, SiteCollection);
|
|
144
150
|
EntityRegistry.registerEntity(SiteCandidateSchema, SiteCandidateCollection);
|
|
145
151
|
EntityRegistry.registerEntity(SiteTopPageSchema, SiteTopPageCollection);
|
package/src/models/index.d.ts
CHANGED
|
@@ -22,6 +22,8 @@ export type * from './key-event';
|
|
|
22
22
|
export type * from './latest-audit';
|
|
23
23
|
export type * from './opportunity';
|
|
24
24
|
export type * from './organization';
|
|
25
|
+
export type * from './scrape-job';
|
|
26
|
+
export type * from './scrape-url';
|
|
25
27
|
export type * from './site';
|
|
26
28
|
export type * from './site-candidate';
|
|
27
29
|
export type * from './site-top-page';
|
package/src/models/index.js
CHANGED
|
@@ -23,6 +23,8 @@ export * from './key-event/index.js';
|
|
|
23
23
|
export * from './latest-audit/index.js';
|
|
24
24
|
export * from './opportunity/index.js';
|
|
25
25
|
export * from './organization/index.js';
|
|
26
|
+
export * from './scrape-job/index.js';
|
|
27
|
+
export * from './scrape-url/index.js';
|
|
26
28
|
export * from './site-candidate/index.js';
|
|
27
29
|
export * from './site-top-page/index.js';
|
|
28
30
|
export * from './site/index.js';
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import type { BaseCollection, BaseModel } from '../base';
|
|
14
|
+
import type { ScrapeUrl } from '../scrape-url';
|
|
15
|
+
|
|
16
|
+
export interface ScrapeJob extends BaseModel {
|
|
17
|
+
getBaseURL(): string,
|
|
18
|
+
getCustomHeaders(): IOptions,
|
|
19
|
+
getDuration(): number,
|
|
20
|
+
getEndedAt(): string,
|
|
21
|
+
getFailedCount(): number,
|
|
22
|
+
// The schema stores `options` as an object (validated with isObject), not a string.
getOptions(): object,
|
|
23
|
+
getProcessingType(): string,
|
|
24
|
+
getRedirectCount(): number,
|
|
25
|
+
getResults(): string,
|
|
26
|
+
getScrapeQueueId(): string,
|
|
27
|
+
getScrapeUrls(): Promise<ScrapeUrl[]>,
|
|
28
|
+
getScrapeUrlsByStatus(status: string): Promise<ScrapeUrl[]>,
|
|
29
|
+
getStartedAt(): string,
|
|
30
|
+
getStatus(): string,
|
|
31
|
+
getSuccessCount(): number,
|
|
32
|
+
getUrlCount(): number,
|
|
33
|
+
setBaseURL(baseURL: string): void,
|
|
34
|
+
setCustomHeaders(customHeaders: IOptions): void,
|
|
35
|
+
setDuration(duration: number): void,
|
|
36
|
+
setEndedAt(endTime: string): void,
|
|
37
|
+
setFailedCount(failedCount: number): void,
|
|
38
|
+
// The schema validates `options` with isObject, so a string value would be rejected.
setOptions(options: object): void,
|
|
39
|
+
setProcessingType(processingType: string): void,
|
|
40
|
+
setRedirectCount(redirectCount: number): void,
|
|
41
|
+
setResults(results: string): void,
|
|
42
|
+
setScrapeQueueId(ScrapeQueueId: string): void,
|
|
43
|
+
setStatus(status: string): void,
|
|
44
|
+
setSuccessCount(successCount: number): void,
|
|
45
|
+
setUrlCount(urlCount: number): void,
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
export interface ScrapeJobCollection extends BaseCollection<ScrapeJob> {
|
|
49
|
+
allByBaseURLAndProcessingType(baseURL: string, processingType: string): Promise<ScrapeJob[]>;
|
|
50
|
+
allByBaseURLAndProcessingTypeAndOptEnableJavascriptAndOptHideConsentBanner(
|
|
51
|
+
baseURL: string,
|
|
52
|
+
processingType: string,
|
|
53
|
+
optEnableJavascript: string,
|
|
54
|
+
optHideConsentBanner: string): Promise<ScrapeJob[]>;
|
|
55
|
+
allByDateRange(startDate: string, endDate: string): Promise<ScrapeJob[]>;
|
|
56
|
+
allByStartedAt(startDate: string): Promise<ScrapeJob[]>;
|
|
57
|
+
allByStatus(status: string): Promise<ScrapeJob[]>;
|
|
58
|
+
allByStatusAndUpdatedAt(status: string, updatedAt: string): Promise<ScrapeJob[]>;
|
|
59
|
+
// findBy* accessors resolve to a single entity (or null), matching the other findBy* signatures in this interface.
findByBaseURLAndProcessingType(baseURL: string, processingType: string): Promise<ScrapeJob | null>;
|
|
60
|
+
// findBy* accessors resolve to a single entity (or null), matching the other findBy* signatures in this interface.
findByBaseURLAndProcessingTypeAndOptEnableJavascriptAndOptHideConsentBanner(
|
|
61
|
+
baseURL: string,
|
|
62
|
+
processingType: string,
|
|
63
|
+
optEnableJavascript: string,
|
|
64
|
+
optHideConsentBanner: string): Promise<ScrapeJob | null>;
|
|
65
|
+
findByStartedAt(startDate: string): Promise<ScrapeJob | null>;
|
|
66
|
+
findByStatus(status: string): Promise<ScrapeJob | null>;
|
|
67
|
+
findByStatusAndUpdatedAt(status: string, updatedAt: string): Promise<ScrapeJob | null>;
|
|
68
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import ScrapeJob from './scrape-job.model.js';
|
|
14
|
+
import ScrapeJobCollection from './scrape-job.collection.js';
|
|
15
|
+
|
|
16
|
+
export {
|
|
17
|
+
ScrapeJob,
|
|
18
|
+
ScrapeJobCollection,
|
|
19
|
+
};
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { isIsoDate } from '@adobe/spacecat-shared-utils';
|
|
14
|
+
|
|
15
|
+
import { ValidationError } from '../../errors/index.js';
|
|
16
|
+
import BaseCollection from '../base/base.collection.js';
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* ScrapeJobCollection - A collection class responsible for managing ScrapeJob entities.
|
|
20
|
+
* Extends the BaseCollection to provide specific methods for interacting with ScrapeJob records.
|
|
21
|
+
*
|
|
22
|
+
* @class ScrapeJobCollection
|
|
23
|
+
* @extends BaseCollection
|
|
24
|
+
*/
|
|
25
|
+
class ScrapeJobCollection extends BaseCollection {
|
|
26
|
+
async allByDateRange(startDate, endDate) {
|
|
27
|
+
if (!isIsoDate(startDate)) {
|
|
28
|
+
throw new ValidationError(`Invalid start date: ${startDate}`);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
if (!isIsoDate(endDate)) {
|
|
32
|
+
throw new ValidationError(`Invalid end date: ${endDate}`);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return this.all({}, {
|
|
36
|
+
between: {
|
|
37
|
+
attribute: 'startedAt',
|
|
38
|
+
start: startDate,
|
|
39
|
+
end: endDate,
|
|
40
|
+
},
|
|
41
|
+
});
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
export default ScrapeJobCollection;
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import BaseModel from '../base/base.model.js';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* ScrapeJob - A class representing a ScrapeJob entity.
|
|
17
|
+
* Provides methods to access and manipulate ScrapeJob-specific data.
|
|
18
|
+
*
|
|
19
|
+
* @class ScrapeJob
|
|
20
|
+
* @extends BaseModel
|
|
21
|
+
*/
|
|
22
|
+
class ScrapeJob extends BaseModel {
|
|
23
|
+
static SCRAPE_JOB_EXPIRES_IN_DAYS = 14;
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Scrape Job Status types.
|
|
27
|
+
* Any changes to this object need to be reflected in the index.d.ts file as well.
|
|
28
|
+
*/
|
|
29
|
+
static ScrapeJobStatus = {
|
|
30
|
+
RUNNING: 'RUNNING',
|
|
31
|
+
COMPLETE: 'COMPLETE',
|
|
32
|
+
FAILED: 'FAILED',
|
|
33
|
+
STOPPED: 'STOPPED',
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* ScrapeURL Status types.
|
|
38
|
+
* Any changes to this object need to be reflected in the index.d.ts file as well.
|
|
39
|
+
*/
|
|
40
|
+
static ScrapeUrlStatus = {
|
|
41
|
+
PENDING: 'PENDING',
|
|
42
|
+
REDIRECT: 'REDIRECT',
|
|
43
|
+
...ScrapeJob.ScrapeJobStatus,
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Supported Scrape Options.
|
|
48
|
+
*/
|
|
49
|
+
static ScrapeOptions = {
|
|
50
|
+
ENABLE_JAVASCRIPT: 'enableJavascript',
|
|
51
|
+
HIDE_CONSENT_BANNER: 'hideConsentBanners',
|
|
52
|
+
PAGE_LOAD_TIMEOUT: 'pageLoadTimeout',
|
|
53
|
+
WAIT_FOR_SELECTOR: 'waitForSelector',
|
|
54
|
+
SECTION_LOAD_WAIT_TIME: 'sectionLoadWaitTime',
|
|
55
|
+
SCREENSHOT_TYPES: 'screenshotTypes',
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
static ScrapeProcessingType = {
|
|
59
|
+
DEFAULT: 'default',
|
|
60
|
+
ACCESSIBILITY: 'accessibility',
|
|
61
|
+
FORM_ACCESSIBILITY: 'form-accessibility',
|
|
62
|
+
FORM: 'form',
|
|
63
|
+
TEXT_CONTENT: 'text-content',
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
static ScrapeScreenshotType = {
|
|
67
|
+
FULL_PAGE: 'fullPage',
|
|
68
|
+
THUMBNAIL: 'thumbnail',
|
|
69
|
+
SECTION: 'section',
|
|
70
|
+
BLOCK: 'block',
|
|
71
|
+
SCROLL: 'scroll',
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
// add your custom methods or overrides here
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
export default ScrapeJob;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/* c8 ignore start */
|
|
14
|
+
|
|
15
|
+
import {
|
|
16
|
+
isInteger,
|
|
17
|
+
isIsoDate,
|
|
18
|
+
isNumber,
|
|
19
|
+
isObject,
|
|
20
|
+
isValidUrl,
|
|
21
|
+
isString,
|
|
22
|
+
} from '@adobe/spacecat-shared-utils';
|
|
23
|
+
|
|
24
|
+
import SchemaBuilder from '../base/schema.builder.js';
|
|
25
|
+
import ScrapeJob from './scrape-job.model.js';
|
|
26
|
+
import ScrapeJobCollection from './scrape-job.collection.js';
|
|
27
|
+
|
|
28
|
+
/*
|
|
29
|
+
Schema Doc: https://electrodb.dev/en/modeling/schema/
|
|
30
|
+
Attribute Doc: https://electrodb.dev/en/modeling/attributes/
|
|
31
|
+
Indexes Doc: https://electrodb.dev/en/modeling/indexes/
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
const schema = new SchemaBuilder(ScrapeJob, ScrapeJobCollection)
|
|
35
|
+
.withRecordExpiry(ScrapeJob.SCRAPE_JOB_EXPIRES_IN_DAYS)
|
|
36
|
+
.addReference('has_many', 'ScrapeUrls')
|
|
37
|
+
.addAttribute('baseURL', {
|
|
38
|
+
type: 'string',
|
|
39
|
+
required: true,
|
|
40
|
+
validate: (value) => isValidUrl(value),
|
|
41
|
+
})
|
|
42
|
+
.addAttribute('processingType', {
|
|
43
|
+
type: 'string',
|
|
44
|
+
required: true,
|
|
45
|
+
validate: (value) => isString(value),
|
|
46
|
+
})
|
|
47
|
+
.addAttribute('duration', {
|
|
48
|
+
type: 'number',
|
|
49
|
+
default: 0,
|
|
50
|
+
validate: (value) => !value || isNumber(value),
|
|
51
|
+
})
|
|
52
|
+
.addAttribute('endedAt', {
|
|
53
|
+
type: 'string',
|
|
54
|
+
validate: (value) => !value || isIsoDate(value),
|
|
55
|
+
})
|
|
56
|
+
.addAttribute('failedCount', {
|
|
57
|
+
type: 'number',
|
|
58
|
+
default: 0,
|
|
59
|
+
validate: (value) => !value || isInteger(value),
|
|
60
|
+
})
|
|
61
|
+
.addAttribute('scrapeQueueId', {
|
|
62
|
+
type: 'string',
|
|
63
|
+
})
|
|
64
|
+
.addAttribute('options', {
|
|
65
|
+
type: 'any',
|
|
66
|
+
validate: (value) => !value || isObject(value),
|
|
67
|
+
})
|
|
68
|
+
.addAttribute('customHeaders', {
|
|
69
|
+
type: 'any',
|
|
70
|
+
})
|
|
71
|
+
.addAttribute('redirectCount', {
|
|
72
|
+
type: 'number',
|
|
73
|
+
default: 0,
|
|
74
|
+
validate: (value) => !value || isInteger(value),
|
|
75
|
+
})
|
|
76
|
+
.addAttribute('status', {
|
|
77
|
+
type: Object.values(ScrapeJob.ScrapeJobStatus),
|
|
78
|
+
required: true,
|
|
79
|
+
})
|
|
80
|
+
.addAttribute('startedAt', {
|
|
81
|
+
type: 'string',
|
|
82
|
+
required: true,
|
|
83
|
+
readOnly: true,
|
|
84
|
+
default: () => new Date().toISOString(),
|
|
85
|
+
validate: (value) => isIsoDate(value),
|
|
86
|
+
})
|
|
87
|
+
.addAttribute('successCount', {
|
|
88
|
+
type: 'number',
|
|
89
|
+
default: 0,
|
|
90
|
+
validate: (value) => !value || isInteger(value),
|
|
91
|
+
})
|
|
92
|
+
.addAttribute('urlCount', {
|
|
93
|
+
type: 'number',
|
|
94
|
+
default: 0,
|
|
95
|
+
validate: (value) => !value || isInteger(value),
|
|
96
|
+
})
|
|
97
|
+
.addAttribute('results', {
|
|
98
|
+
type: 'any',
|
|
99
|
+
})
|
|
100
|
+
.addAttribute('optEnableJavascript', {
|
|
101
|
+
type: 'string',
|
|
102
|
+
hidden: true,
|
|
103
|
+
readOnly: true,
|
|
104
|
+
watch: ['options'],
|
|
105
|
+
set: (_, { options }) => (options[ScrapeJob.ScrapeOptions.ENABLE_JAVASCRIPT] ? 'T' : 'F'),
|
|
106
|
+
})
|
|
107
|
+
.addAttribute('optHideConsentBanner', {
|
|
108
|
+
type: 'string',
|
|
109
|
+
hidden: true,
|
|
110
|
+
readOnly: true,
|
|
111
|
+
watch: ['options'],
|
|
112
|
+
set: (_, { options }) => (options[ScrapeJob.ScrapeOptions.HIDE_CONSENT_BANNER] ? 'T' : 'F'),
|
|
113
|
+
})
|
|
114
|
+
// access pattern: get all jobs sorted by startedAt
|
|
115
|
+
.addAllIndex(['startedAt'])
|
|
116
|
+
// access pattern: get all jobs for a given baseURL and processingType,
|
|
117
|
+
// can be filtered by optEnableJavascript and optHideConsentBanner
|
|
118
|
+
// results are sorted by startedAt
|
|
119
|
+
.addIndex(
|
|
120
|
+
{ composite: ['baseURL', 'processingType'] },
|
|
121
|
+
{ composite: ['optEnableJavascript', 'optHideConsentBanner', 'startedAt'] },
|
|
122
|
+
)
|
|
123
|
+
// access pattern: get all jobs for a given status, sorted by updatedAt
|
|
124
|
+
.addIndex(
|
|
125
|
+
{ composite: ['status'] },
|
|
126
|
+
{ composite: ['updatedAt'] },
|
|
127
|
+
);
|
|
128
|
+
|
|
129
|
+
export default schema.build();
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import type { BaseCollection, BaseModel, ScrapeJob } from '../index';
|
|
14
|
+
|
|
15
|
+
export interface ScrapeUrl extends BaseModel {
|
|
16
|
+
getFile(): string,
|
|
17
|
+
getScrapeJob(): Promise<ScrapeJob>,
|
|
18
|
+
getScrapeJobId(): string,
|
|
19
|
+
getPath(): string,
|
|
20
|
+
getReason(): string,
|
|
21
|
+
getStatus(): string,
|
|
22
|
+
getUrl(): string,
|
|
23
|
+
setFile(file: string): void,
|
|
24
|
+
setScrapeJobId(ScrapeJobId: string): void,
|
|
25
|
+
setPath(path: string): void,
|
|
26
|
+
setReason(reason: string): void,
|
|
27
|
+
setStatus(status: string): void,
|
|
28
|
+
setUrl(url: string): void,
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
export interface ScrapeUrlCollection extends BaseCollection<ScrapeUrl> {
|
|
32
|
+
allByScrapeJobId(ScrapeJobId: string): Promise<ScrapeUrl[]>;
|
|
33
|
+
allByScrapeUrlsByJobIdAndStatus(ScrapeJobId: string, status: string): Promise<ScrapeUrl[]>;
|
|
34
|
+
findByScrapeJobId(ScrapeJobId: string): Promise<ScrapeUrl | null>;
|
|
35
|
+
findByScrapeJobIdAndUrl(ScrapeJobId: string, url: string): Promise<ScrapeUrl | null>;
|
|
36
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import ScrapeUrl from './scrape-url.model.js';
|
|
14
|
+
import ScrapeUrlCollection from './scrape-url.collection.js';
|
|
15
|
+
|
|
16
|
+
export {
|
|
17
|
+
ScrapeUrl,
|
|
18
|
+
ScrapeUrlCollection,
|
|
19
|
+
};
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import BaseCollection from '../base/base.collection.js';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* ScrapeUrlCollection - A collection class responsible for managing ScrapeUrl entities.
|
|
17
|
+
* Extends the BaseCollection to provide specific methods for interacting with ScrapeUrl records.
|
|
18
|
+
*
|
|
19
|
+
* @class ScrapeUrlCollection
|
|
20
|
+
* @extends BaseCollection
|
|
21
|
+
*/
|
|
22
|
+
class ScrapeUrlCollection extends BaseCollection {
|
|
23
|
+
// add custom methods here
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
export default ScrapeUrlCollection;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import BaseModel from '../base/base.model.js';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* ScrapeUrl - A class representing a ScrapeUrl entity.
|
|
17
|
+
* Provides methods to access and manipulate ScrapeUrl-specific data.
|
|
18
|
+
*
|
|
19
|
+
* @class ScrapeUrl
|
|
20
|
+
* @extends BaseModel
|
|
21
|
+
*/
|
|
22
|
+
class ScrapeUrl extends BaseModel {
|
|
23
|
+
static SCRAPE_URL_EXPIRES_IN_DAYS = 14;
|
|
24
|
+
|
|
25
|
+
// add your custom methods or overrides here
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
export default ScrapeUrl;
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2025 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/* c8 ignore start */
|
|
14
|
+
|
|
15
|
+
import { isValidUrl } from '@adobe/spacecat-shared-utils';
|
|
16
|
+
|
|
17
|
+
import SchemaBuilder from '../base/schema.builder.js';
|
|
18
|
+
import ScrapeUrl from './scrape-url.model.js';
|
|
19
|
+
import ScrapeUrlCollection from './scrape-url.collection.js';
|
|
20
|
+
import { ScrapeJob } from '../scrape-job/index.js';
|
|
21
|
+
|
|
22
|
+
/*
|
|
23
|
+
Schema Doc: https://electrodb.dev/en/modeling/schema/
|
|
24
|
+
Attribute Doc: https://electrodb.dev/en/modeling/attributes/
|
|
25
|
+
Indexes Doc: https://electrodb.dev/en/modeling/indexes/
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
const schema = new SchemaBuilder(ScrapeUrl, ScrapeUrlCollection)
|
|
29
|
+
.withRecordExpiry(ScrapeUrl.SCRAPE_URL_EXPIRES_IN_DAYS)
|
|
30
|
+
.addReference('belongs_to', 'ScrapeJob', ['status'])
|
|
31
|
+
.addAttribute('file', {
|
|
32
|
+
type: 'string',
|
|
33
|
+
})
|
|
34
|
+
.addAttribute('path', {
|
|
35
|
+
type: 'string',
|
|
36
|
+
})
|
|
37
|
+
.addAttribute('reason', {
|
|
38
|
+
type: 'string',
|
|
39
|
+
})
|
|
40
|
+
.addAttribute('status', {
|
|
41
|
+
type: Object.values(ScrapeJob.ScrapeUrlStatus),
|
|
42
|
+
required: true,
|
|
43
|
+
})
|
|
44
|
+
.addAttribute('url', {
|
|
45
|
+
type: 'string',
|
|
46
|
+
required: true,
|
|
47
|
+
validate: (value) => isValidUrl(value),
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
export default schema.build();
|