@opentermsarchive/engine 5.0.2 → 5.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.yaml +10 -0
- package/package.json +2 -1
- package/src/archivist/extract/index.js +6 -4
- package/src/archivist/fetcher/fullDomFetcher.js +12 -0
- package/src/archivist/fetcher/index.js +10 -10
- package/src/archivist/recorder/record.js +1 -1
- package/src/archivist/recorder/repositories/interface.js +54 -61
- package/src/archivist/recorder/repositories/mongo/index.js +19 -7
- package/src/archivist/recorder/repositories/mongo/index.test.js +140 -0
- package/src/archivist/services/sourceDocument.js +14 -0
- package/src/collection-api/routes/metadata.js +3 -0
- package/src/collection-api/routes/services.js +2 -0
- package/src/collection-api/routes/versions.js +3 -1
- package/src/reporter/github/index.js +4 -1
- package/src/reporter/github/index.test.js +2 -0
- package/src/reporter/index.js +4 -3
package/.eslintrc.yaml
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
extends:
|
|
2
2
|
- airbnb-base
|
|
3
|
+
- plugin:jsdoc/recommended-error
|
|
3
4
|
parserOptions:
|
|
4
5
|
ecmaVersion: 2022
|
|
5
6
|
env:
|
|
@@ -11,7 +12,16 @@ plugins:
|
|
|
11
12
|
- import
|
|
12
13
|
- json-format
|
|
13
14
|
- no-only-tests
|
|
15
|
+
- jsdoc
|
|
14
16
|
rules:
|
|
17
|
+
jsdoc/require-jsdoc: 0
|
|
18
|
+
jsdoc/check-tag-names:
|
|
19
|
+
- error
|
|
20
|
+
- definedTags:
|
|
21
|
+
- swagger
|
|
22
|
+
jsdoc/check-line-alignment:
|
|
23
|
+
- error
|
|
24
|
+
- always
|
|
15
25
|
arrow-parens:
|
|
16
26
|
- error
|
|
17
27
|
- as-needed
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@opentermsarchive/engine",
|
|
3
|
-
"version": "5.0.2",
|
|
3
|
+
"version": "5.0.4",
|
|
4
4
|
"description": "Tracks and makes visible changes to the terms of online services",
|
|
5
5
|
"homepage": "https://opentermsarchive.org",
|
|
6
6
|
"bugs": {
|
|
@@ -108,6 +108,7 @@
|
|
|
108
108
|
"devDependencies": {
|
|
109
109
|
"@commitlint/cli": "^19.0.3",
|
|
110
110
|
"dir-compare": "^4.0.0",
|
|
111
|
+
"eslint-plugin-jsdoc": "^50.6.9",
|
|
111
112
|
"keep-a-changelog": "^2.5.3",
|
|
112
113
|
"nock": "^13.2.1",
|
|
113
114
|
"node-stream-zip": "^1.15.0",
|
|
package/src/archivist/extract/index.js
CHANGED
|
@@ -5,6 +5,8 @@ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
|
|
|
5
5
|
import jsdom from 'jsdom';
|
|
6
6
|
import mime from 'mime';
|
|
7
7
|
|
|
8
|
+
import SourceDocument from '../services/sourceDocument.js';
|
|
9
|
+
|
|
8
10
|
import { ExtractDocumentError } from './errors.js';
|
|
9
11
|
|
|
10
12
|
export { ExtractDocumentError } from './errors.js';
|
|
@@ -23,11 +25,11 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
|
|
|
23
25
|
|
|
24
26
|
/**
|
|
25
27
|
* Extract content from source document and convert it to Markdown
|
|
26
|
-
*
|
|
27
28
|
* @function extract
|
|
28
|
-
* @param
|
|
29
|
-
* @returns {Promise<string>}
|
|
30
|
-
|
|
29
|
+
* @param {string} sourceDocument Source document from which to extract content, see {@link SourceDocument}
|
|
30
|
+
* @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
|
|
31
|
+
* @async
|
|
32
|
+
*/
|
|
31
33
|
export default async function extract(sourceDocument) {
|
|
32
34
|
try {
|
|
33
35
|
if (sourceDocument.mimeType == mime.getType('pdf')) {
|
|
package/src/archivist/fetcher/fullDomFetcher.js
CHANGED
|
@@ -62,6 +62,12 @@ export default async function fetch(url, cssSelectors, config) {
|
|
|
62
62
|
}
|
|
63
63
|
}
|
|
64
64
|
|
|
65
|
+
/**
|
|
66
|
+
* Launches a headless browser instance using Puppeteer if one is not already running. Returns the existing browser instance if one is already running, otherwise creates and returns a new instance.
|
|
67
|
+
* @function launchHeadlessBrowser
|
|
68
|
+
* @returns {Promise<puppeteer.Browser>} The Puppeteer browser instance.
|
|
69
|
+
* @async
|
|
70
|
+
*/
|
|
65
71
|
export async function launchHeadlessBrowser() {
|
|
66
72
|
if (browser) {
|
|
67
73
|
return browser;
|
|
@@ -72,6 +78,12 @@ export async function launchHeadlessBrowser() {
|
|
|
72
78
|
return browser;
|
|
73
79
|
}
|
|
74
80
|
|
|
81
|
+
/**
|
|
82
|
+
* Stops the headless browser instance if one is running. If no instance exists, it does nothing.
|
|
83
|
+
* @function stopHeadlessBrowser
|
|
84
|
+
* @returns {Promise<void>}
|
|
85
|
+
* @async
|
|
86
|
+
*/
|
|
75
87
|
export async function stopHeadlessBrowser() {
|
|
76
88
|
if (!browser) {
|
|
77
89
|
return;
|
|
package/src/archivist/fetcher/index.js
CHANGED
|
@@ -9,17 +9,17 @@ export { FetchDocumentError } from './errors.js';
|
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
11
|
* Fetch a resource from the network, returning a promise which is fulfilled once the response is available
|
|
12
|
-
*
|
|
13
12
|
* @function fetch
|
|
14
|
-
* @param
|
|
15
|
-
* @param
|
|
16
|
-
* @param
|
|
17
|
-
* @param
|
|
18
|
-
* @param
|
|
19
|
-
* @param
|
|
20
|
-
* @param
|
|
21
|
-
* @param
|
|
22
|
-
* @returns {Promise
|
|
13
|
+
* @param {object} params Fetcher parameters
|
|
14
|
+
* @param {string} params.url URL of the resource you want to fetch
|
|
15
|
+
* @param {boolean} [params.executeClientScripts] Enable execution of client scripts. When set to `true`, this property loads the page in a headless browser to load all assets and execute client scripts before returning its content
|
|
16
|
+
* @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled
|
|
17
|
+
* @param {object} [params.config] Fetcher configuration
|
|
18
|
+
* @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed
|
|
19
|
+
* @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers
|
|
20
|
+
* @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled
|
|
21
|
+
* @returns {Promise<{ mimeType: string, content: string | Buffer }>} Promise containing the fetched resource's MIME type and content
|
|
22
|
+
* @async
|
|
23
23
|
*/
|
|
24
24
|
export default async function fetch({
|
|
25
25
|
url, executeClientScripts, cssSelectors,
|
|
package/src/archivist/recorder/repositories/interface.js
CHANGED
|
@@ -4,120 +4,113 @@
|
|
|
4
4
|
* @interface
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
+
import Record from '../record.js';
|
|
8
|
+
|
|
7
9
|
/* eslint-disable require-await */
|
|
8
10
|
class RepositoryInterface {
|
|
9
11
|
/**
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
*/
|
|
12
|
+
* [Optional] Initialize repository
|
|
13
|
+
* Override this method if the repository needs some asynchronous initialization code (open database connection and create collections, initialize Git…)
|
|
14
|
+
* @returns {Promise<RepositoryInterface>} Promise that will be resolved with the current repository instance
|
|
15
|
+
*/
|
|
15
16
|
async initialize() {
|
|
16
17
|
return this;
|
|
17
18
|
}
|
|
18
19
|
|
|
19
20
|
/**
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
*/
|
|
21
|
+
* [Optional] Finalize repository
|
|
22
|
+
* Override this method if the repository needs some asynchronous code to properly close the repository (close database connection, push changes on Git remote…)
|
|
23
|
+
* @returns {Promise<RepositoryInterface>} Promise that will be resolved with the current repository instance
|
|
24
|
+
*/
|
|
25
25
|
async finalize() {
|
|
26
26
|
return this;
|
|
27
27
|
}
|
|
28
28
|
|
|
29
29
|
/**
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
*/
|
|
30
|
+
* Persist the given record if it does not already exist in repository
|
|
31
|
+
* @param {Record} record - Record to persist
|
|
32
|
+
* @returns {Promise<Record>} Promise that will be resolved with the given record when it has been persisted
|
|
33
|
+
*/
|
|
35
34
|
async save(record) {
|
|
36
35
|
throw new Error(`#save method is not implemented in ${this.constructor.name}`);
|
|
37
36
|
}
|
|
38
37
|
|
|
39
38
|
/**
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
*/
|
|
39
|
+
* Find the most recent record that matches the given service ID and terms type and optionally the document ID
|
|
40
|
+
* In case of snapshots, if the record is related to terms extracted from multiple source documents, the document ID is required to find the source snapshot
|
|
41
|
+
* @param {string} serviceId - Service ID of record to find
|
|
42
|
+
* @param {string} termsType - Terms type of record to find
|
|
43
|
+
* @param {string} [documentId] - Document ID of record to find. Used to identify the source in terms extracted from multiple source documents. Not necessary for terms with a single source document
|
|
44
|
+
* @returns {Promise<Record>} Promise that will be resolved with the found record or an empty object if none match the given criteria
|
|
45
|
+
*/
|
|
48
46
|
async findLatest(serviceId, termsType, documentId) {
|
|
49
47
|
throw new Error(`#findLatest method is not implemented in ${this.constructor.name}`);
|
|
50
48
|
}
|
|
51
49
|
|
|
52
50
|
/**
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
*/
|
|
51
|
+
* Find the record that was valid on the given date and that matches the given service ID and terms type and optionally the document ID
|
|
52
|
+
* In case of snapshots, if the record is related to terms extracted from multiple source documents, the document ID is required to find the source snapshot
|
|
53
|
+
* @param {string} serviceId - Service ID of record to find
|
|
54
|
+
* @param {string} termsType - Terms type of record to find
|
|
55
|
+
* @param {Date} date - Datetime on which the record to find was valid
|
|
56
|
+
* @param {string} [documentId] - Document ID of record to find. Used to identify the source in terms extracted from multiple source documents. Not necessary for terms with a single source document
|
|
57
|
+
* @returns {Promise<Record>} Promise that will be resolved with the found record or an empty object if none match the given criteria
|
|
58
|
+
*/
|
|
62
59
|
async findByDate(serviceId, termsType, date, documentId) {
|
|
63
60
|
throw new Error(`#findByDate method is not implemented in ${this.constructor.name}`);
|
|
64
61
|
}
|
|
65
62
|
|
|
66
63
|
/**
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
*/
|
|
64
|
+
* Find the record that matches the given record ID
|
|
65
|
+
* @param {string} recordId - Record ID of the record to find
|
|
66
|
+
* @returns {Promise<Record>} Promise that will be resolved with the found record or an empty object if none match the given ID
|
|
67
|
+
*/
|
|
72
68
|
async findById(recordId) {
|
|
73
69
|
throw new Error(`#findById method is not implemented in ${this.constructor.name}`);
|
|
74
70
|
}
|
|
75
71
|
|
|
76
72
|
/**
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
*/
|
|
73
|
+
* Find all records
|
|
74
|
+
* For performance reasons, the content of the records will not be loaded by default. Use #loadRecordContent to load the content of individual records
|
|
75
|
+
* @see RepositoryInterface#loadRecordContent
|
|
76
|
+
* @returns {Promise<Array<Record>>} Promise that will be resolved with an array of all records
|
|
77
|
+
*/
|
|
83
78
|
async findAll() {
|
|
84
79
|
throw new Error(`#findAll method is not implemented in ${this.constructor.name}`);
|
|
85
80
|
}
|
|
86
81
|
|
|
87
82
|
/**
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
*/
|
|
83
|
+
* Count the total number of records in the repository
|
|
84
|
+
* For performance reasons, use this method rather than counting the number of entries returned by #findAll if you only need the size of a repository
|
|
85
|
+
* @returns {Promise<number>} Promise that will be resolved with the total number of records
|
|
86
|
+
*/
|
|
93
87
|
async count() {
|
|
94
88
|
throw new Error(`#count method is not implemented in ${this.constructor.name}`);
|
|
95
89
|
}
|
|
96
90
|
|
|
91
|
+
/* eslint-disable jsdoc/require-yields-check */
|
|
97
92
|
/**
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
*/
|
|
93
|
+
* Iterate over all records in the repository, from oldest to most recent
|
|
94
|
+
* @yields {Record}
|
|
95
|
+
*/
|
|
102
96
|
async* iterate() {
|
|
103
97
|
throw new Error(`#iterate method is not implemented in ${this.constructor.name}`);
|
|
104
98
|
}
|
|
99
|
+
/* eslint-enable jsdoc/require-yields-check */
|
|
105
100
|
|
|
106
101
|
/**
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
*/
|
|
102
|
+
* Remove all records
|
|
103
|
+
* @returns {Promise} Promise that will be resolved when all records are removed
|
|
104
|
+
*/
|
|
111
105
|
async removeAll() {
|
|
112
106
|
throw new Error(`#removeAll method is not implemented in ${this.constructor.name}`);
|
|
113
107
|
}
|
|
114
108
|
|
|
115
109
|
/**
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
*/
|
|
110
|
+
* Load content of the given record
|
|
111
|
+
* @param {Record} record - Record of which to populate content
|
|
112
|
+
* @returns {Promise<Record>} Promise that will be resolved with the given record when its content has been loaded
|
|
113
|
+
*/
|
|
121
114
|
async loadRecordContent(record) {
|
|
122
115
|
throw new Error(`#loadRecordContent method is not implemented in ${this.constructor.name}`);
|
|
123
116
|
}
|
|
package/src/archivist/recorder/repositories/mongo/index.js
CHANGED
|
@@ -34,14 +34,14 @@ export default class MongoRepository extends RepositoryInterface {
|
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
async save(record) {
|
|
37
|
-
const { serviceId, termsType } = record;
|
|
37
|
+
const { serviceId, termsType, documentId } = record;
|
|
38
38
|
|
|
39
39
|
if (record.isFirstRecord === undefined || record.isFirstRecord === null) {
|
|
40
|
-
record.isFirstRecord = !await this.collection.findOne({ serviceId, termsType });
|
|
40
|
+
record.isFirstRecord = !await this.collection.findOne({ serviceId, termsType, documentId });
|
|
41
41
|
}
|
|
42
42
|
|
|
43
43
|
const documentFields = await this.#toPersistence(record);
|
|
44
|
-
const previousRecord = await this.findLatest(serviceId, termsType);
|
|
44
|
+
const previousRecord = await this.findLatest(serviceId, termsType, documentId);
|
|
45
45
|
|
|
46
46
|
if (previousRecord?.content == documentFields.content) {
|
|
47
47
|
return Object(null);
|
|
@@ -54,14 +54,26 @@ export default class MongoRepository extends RepositoryInterface {
|
|
|
54
54
|
return record;
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
-
async findLatest(serviceId, termsType) {
|
|
58
|
-
const
|
|
57
|
+
async findLatest(serviceId, termsType, documentId) {
|
|
58
|
+
const query = { serviceId, termsType };
|
|
59
|
+
|
|
60
|
+
if (documentId !== undefined) {
|
|
61
|
+
query.documentId = documentId;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
const [mongoDocument] = await this.collection.find(query).limit(1).sort({ fetchDate: -1 }).toArray(); // `findOne` doesn't support the `sort` method, so even for only one mongo document use `find`
|
|
59
65
|
|
|
60
66
|
return this.#toDomain(mongoDocument);
|
|
61
67
|
}
|
|
62
68
|
|
|
63
|
-
async findByDate(serviceId, termsType, date) {
|
|
64
|
-
const
|
|
69
|
+
async findByDate(serviceId, termsType, date, documentId) {
|
|
70
|
+
const query = { serviceId, termsType, fetchDate: { $lte: new Date(date) } };
|
|
71
|
+
|
|
72
|
+
if (documentId !== undefined) {
|
|
73
|
+
query.documentId = documentId;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
const [mongoDocument] = await this.collection.find(query).limit(1).sort({ fetchDate: -1 }).toArray(); // `findOne` doesn't support the `sort` method, so even for only one mongo document use `find`
|
|
65
77
|
|
|
66
78
|
return this.#toDomain(mongoDocument);
|
|
67
79
|
}
|
|
package/src/archivist/recorder/repositories/mongo/index.test.js
CHANGED
|
@@ -332,6 +332,30 @@ describe('MongoRepository', () => {
|
|
|
332
332
|
expect(mongoDocument.termsType).to.include(TERMS_TYPE);
|
|
333
333
|
});
|
|
334
334
|
});
|
|
335
|
+
|
|
336
|
+
context('when document ID is specified', () => {
|
|
337
|
+
before(async () => {
|
|
338
|
+
(record = await subject.save(new Version({
|
|
339
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
340
|
+
termsType: TERMS_TYPE,
|
|
341
|
+
documentId: DOCUMENT_ID,
|
|
342
|
+
content: CONTENT,
|
|
343
|
+
fetchDate: FETCH_DATE,
|
|
344
|
+
snapshotIds: [SNAPSHOT_ID],
|
|
345
|
+
})));
|
|
346
|
+
|
|
347
|
+
(mongoDocument = await collection.findOne({
|
|
348
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
349
|
+
termsType: TERMS_TYPE,
|
|
350
|
+
}));
|
|
351
|
+
});
|
|
352
|
+
|
|
353
|
+
after(() => subject.removeAll());
|
|
354
|
+
|
|
355
|
+
it('stores the document ID', () => {
|
|
356
|
+
expect(mongoDocument.documentId).to.equal(DOCUMENT_ID);
|
|
357
|
+
});
|
|
358
|
+
});
|
|
335
359
|
});
|
|
336
360
|
|
|
337
361
|
describe('#findById', () => {
|
|
@@ -439,6 +463,46 @@ describe('MongoRepository', () => {
|
|
|
439
463
|
expect(recordFound.id).to.include(recordToFindId);
|
|
440
464
|
});
|
|
441
465
|
});
|
|
466
|
+
|
|
467
|
+
context('when document ID is specified', () => {
|
|
468
|
+
let recordFound;
|
|
469
|
+
const DIFFERENT_DOCUMENT_ID = 'other-document';
|
|
470
|
+
const UPDATED_CONTENT = `${CONTENT} (with additional content)`;
|
|
471
|
+
|
|
472
|
+
before(async () => {
|
|
473
|
+
await subject.save(new Version({
|
|
474
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
475
|
+
termsType: TERMS_TYPE,
|
|
476
|
+
documentId: DOCUMENT_ID,
|
|
477
|
+
content: CONTENT,
|
|
478
|
+
fetchDate: FETCH_DATE,
|
|
479
|
+
snapshotIds: [SNAPSHOT_ID],
|
|
480
|
+
}));
|
|
481
|
+
|
|
482
|
+
await subject.save(new Version({
|
|
483
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
484
|
+
termsType: TERMS_TYPE,
|
|
485
|
+
documentId: DIFFERENT_DOCUMENT_ID,
|
|
486
|
+
content: UPDATED_CONTENT,
|
|
487
|
+
fetchDate: FETCH_DATE_LATER,
|
|
488
|
+
snapshotIds: [SNAPSHOT_ID],
|
|
489
|
+
}));
|
|
490
|
+
|
|
491
|
+
recordFound = await subject.findByDate(
|
|
492
|
+
SERVICE_PROVIDER_ID,
|
|
493
|
+
TERMS_TYPE,
|
|
494
|
+
FETCH_DATE_LATER,
|
|
495
|
+
DOCUMENT_ID,
|
|
496
|
+
);
|
|
497
|
+
});
|
|
498
|
+
|
|
499
|
+
after(() => subject.removeAll());
|
|
500
|
+
|
|
501
|
+
it('returns only records matching the document ID', () => {
|
|
502
|
+
expect(recordFound.documentId).to.equal(DOCUMENT_ID);
|
|
503
|
+
expect(recordFound.content).to.equal(CONTENT);
|
|
504
|
+
});
|
|
505
|
+
});
|
|
442
506
|
});
|
|
443
507
|
});
|
|
444
508
|
|
|
@@ -580,6 +644,44 @@ describe('MongoRepository', () => {
|
|
|
580
644
|
expect((await latestRecord.content).toString('utf8')).to.equal(UPDATED_CONTENT);
|
|
581
645
|
});
|
|
582
646
|
});
|
|
647
|
+
|
|
648
|
+
context('when document ID is specified', () => {
|
|
649
|
+
let latestRecord;
|
|
650
|
+
const DIFFERENT_DOCUMENT_ID = 'other-document';
|
|
651
|
+
|
|
652
|
+
before(async () => {
|
|
653
|
+
await subject.save(new Version({
|
|
654
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
655
|
+
termsType: TERMS_TYPE,
|
|
656
|
+
documentId: DOCUMENT_ID,
|
|
657
|
+
content: CONTENT,
|
|
658
|
+
fetchDate: FETCH_DATE_LATER,
|
|
659
|
+
snapshotIds: [SNAPSHOT_ID],
|
|
660
|
+
}));
|
|
661
|
+
|
|
662
|
+
await subject.save(new Version({
|
|
663
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
664
|
+
termsType: TERMS_TYPE,
|
|
665
|
+
documentId: DIFFERENT_DOCUMENT_ID,
|
|
666
|
+
content: CONTENT,
|
|
667
|
+
fetchDate: FETCH_DATE,
|
|
668
|
+
snapshotIds: [SNAPSHOT_ID],
|
|
669
|
+
}));
|
|
670
|
+
|
|
671
|
+
latestRecord = await subject.findLatest(
|
|
672
|
+
SERVICE_PROVIDER_ID,
|
|
673
|
+
TERMS_TYPE,
|
|
674
|
+
DIFFERENT_DOCUMENT_ID,
|
|
675
|
+
);
|
|
676
|
+
});
|
|
677
|
+
|
|
678
|
+
after(() => subject.removeAll());
|
|
679
|
+
|
|
680
|
+
it('returns only records matching the document ID', () => {
|
|
681
|
+
expect(latestRecord.documentId).to.equal(DIFFERENT_DOCUMENT_ID);
|
|
682
|
+
expect(latestRecord.fetchDate).to.deep.equal(FETCH_DATE);
|
|
683
|
+
});
|
|
684
|
+
});
|
|
583
685
|
});
|
|
584
686
|
|
|
585
687
|
context('when there are no records for the given service', () => {
|
|
@@ -1119,6 +1221,44 @@ describe('MongoRepository', () => {
|
|
|
1119
1221
|
expect(latestRecord.mimeType).to.equal(PDF_MIME_TYPE);
|
|
1120
1222
|
});
|
|
1121
1223
|
});
|
|
1224
|
+
|
|
1225
|
+
context('when document ID is specified', () => {
|
|
1226
|
+
let latestRecord;
|
|
1227
|
+
const DIFFERENT_DOCUMENT_ID = 'other-document';
|
|
1228
|
+
|
|
1229
|
+
before(async () => {
|
|
1230
|
+
await subject.save(new Snapshot({
|
|
1231
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
1232
|
+
termsType: TERMS_TYPE,
|
|
1233
|
+
documentId: DOCUMENT_ID,
|
|
1234
|
+
content: CONTENT,
|
|
1235
|
+
fetchDate: FETCH_DATE_LATER,
|
|
1236
|
+
mimeType: HTML_MIME_TYPE,
|
|
1237
|
+
}));
|
|
1238
|
+
|
|
1239
|
+
await subject.save(new Snapshot({
|
|
1240
|
+
serviceId: SERVICE_PROVIDER_ID,
|
|
1241
|
+
termsType: TERMS_TYPE,
|
|
1242
|
+
documentId: DIFFERENT_DOCUMENT_ID,
|
|
1243
|
+
content: CONTENT,
|
|
1244
|
+
fetchDate: FETCH_DATE,
|
|
1245
|
+
mimeType: HTML_MIME_TYPE,
|
|
1246
|
+
}));
|
|
1247
|
+
|
|
1248
|
+
latestRecord = await subject.findLatest(
|
|
1249
|
+
SERVICE_PROVIDER_ID,
|
|
1250
|
+
TERMS_TYPE,
|
|
1251
|
+
DIFFERENT_DOCUMENT_ID,
|
|
1252
|
+
);
|
|
1253
|
+
});
|
|
1254
|
+
|
|
1255
|
+
after(() => subject.removeAll());
|
|
1256
|
+
|
|
1257
|
+
it('returns only records matching the document ID', () => {
|
|
1258
|
+
expect(latestRecord.documentId).to.equal(DIFFERENT_DOCUMENT_ID);
|
|
1259
|
+
expect(latestRecord.fetchDate).to.deep.equal(FETCH_DATE);
|
|
1260
|
+
});
|
|
1261
|
+
});
|
|
1122
1262
|
});
|
|
1123
1263
|
|
|
1124
1264
|
context('when there are no records for the given service', () => {
|
|
package/src/archivist/services/sourceDocument.js
CHANGED
|
@@ -1,4 +1,18 @@
|
|
|
1
1
|
export default class SourceDocument {
|
|
2
|
+
/**
|
|
3
|
+
* Represents a source document containing web content and metadata for extraction.
|
|
4
|
+
* Includes the document location, selectors for content inclusion/exclusion,
|
|
5
|
+
* content filters, raw content data, and MIME type information.
|
|
6
|
+
* @class SourceDocument
|
|
7
|
+
* @param {object} params The source document parameters
|
|
8
|
+
* @param {string} params.location The URL location of the document
|
|
9
|
+
* @param {boolean} params.executeClientScripts Whether to execute client-side scripts
|
|
10
|
+
* @param {(string | object | Array)} params.contentSelectors CSS selectors for content to include
|
|
11
|
+
* @param {(string | object | Array)} params.insignificantContentSelectors CSS selectors for content to exclude
|
|
12
|
+
* @param {Array} params.filters Array of filters to apply
|
|
13
|
+
* @param {string} params.content The document content
|
|
14
|
+
* @param {string} params.mimeType The MIME type of the content
|
|
15
|
+
*/
|
|
2
16
|
constructor({ location, executeClientScripts, contentSelectors, insignificantContentSelectors, filters, content, mimeType }) {
|
|
3
17
|
this.location = location;
|
|
4
18
|
this.executeClientScripts = executeClientScripts;
|
|
package/src/collection-api/routes/metadata.js
CHANGED
|
@@ -10,6 +10,9 @@ const METADATA_FILENAME = 'metadata.yml';
|
|
|
10
10
|
const PACKAGE_JSON_PATH = '../../../package.json';
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
|
+
* @param {string} collectionPath The path to the collection
|
|
14
|
+
* @param {object} services The services of the collection
|
|
15
|
+
* @returns {express.Router} The router instance
|
|
13
16
|
* @swagger
|
|
14
17
|
* tags:
|
|
15
18
|
* name: Metadata
|
|
package/src/collection-api/routes/versions.js
CHANGED
|
@@ -5,6 +5,7 @@ import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'
|
|
|
5
5
|
import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
|
+
* @private
|
|
8
9
|
* @swagger
|
|
9
10
|
* tags:
|
|
10
11
|
* name: Versions
|
|
@@ -24,13 +25,14 @@ import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js';
|
|
|
24
25
|
* description: The ID of the version.
|
|
25
26
|
* content:
|
|
26
27
|
* type: string
|
|
27
|
-
* description: The JSON-escaped Markdown content of the version
|
|
28
|
+
* description: The JSON-escaped Markdown content of the version
|
|
28
29
|
*/
|
|
29
30
|
const router = express.Router();
|
|
30
31
|
|
|
31
32
|
const versionsRepository = await RepositoryFactory.create(config.get('@opentermsarchive/engine.recorder.versions.storage')).initialize();
|
|
32
33
|
|
|
33
34
|
/**
|
|
35
|
+
* @private
|
|
34
36
|
* @swagger
|
|
35
37
|
* /version/{serviceId}/{termsType}/{date}:
|
|
36
38
|
* get:
|
|
package/src/reporter/github/index.js
CHANGED
|
@@ -99,7 +99,10 @@ export default class GitHub {
|
|
|
99
99
|
}
|
|
100
100
|
|
|
101
101
|
async getRepositoryLabels() {
|
|
102
|
-
const
|
|
102
|
+
const labels = await this.octokit.paginate('GET /repos/{owner}/{repo}/labels', {
|
|
103
|
+
...this.commonParams,
|
|
104
|
+
per_page: 100,
|
|
105
|
+
});
|
|
103
106
|
|
|
104
107
|
return labels;
|
|
105
108
|
}
|
|
package/src/reporter/github/index.test.js
CHANGED
|
@@ -33,6 +33,7 @@ describe('GitHub', function () {
|
|
|
33
33
|
|
|
34
34
|
nock('https://api.github.com')
|
|
35
35
|
.get('/repos/owner/repo/labels')
|
|
36
|
+
.query(true)
|
|
36
37
|
.reply(200, existingLabels);
|
|
37
38
|
|
|
38
39
|
const missingLabels = MANAGED_LABELS.slice(-2);
|
|
@@ -61,6 +62,7 @@ describe('GitHub', function () {
|
|
|
61
62
|
before(async () => {
|
|
62
63
|
scope = nock('https://api.github.com')
|
|
63
64
|
.get('/repos/owner/repo/labels')
|
|
65
|
+
.query(true)
|
|
64
66
|
.reply(200, LABELS);
|
|
65
67
|
|
|
66
68
|
result = await github.getRepositoryLabels();
|
package/src/reporter/index.js
CHANGED
|
@@ -44,8 +44,7 @@ export default class Reporter {
|
|
|
44
44
|
|
|
45
45
|
/**
|
|
46
46
|
* Support for legacy config format where reporter configuration was nested under `githubIssues`
|
|
47
|
-
*
|
|
48
|
-
*
|
|
47
|
+
* @example
|
|
49
48
|
* ```json
|
|
50
49
|
* {
|
|
51
50
|
* "githubIssues": {
|
|
@@ -55,8 +54,10 @@ export default class Reporter {
|
|
|
55
54
|
* }
|
|
56
55
|
* }
|
|
57
56
|
* ```
|
|
58
|
-
*
|
|
57
|
+
* @param {object} config - The configuration object to normalize
|
|
58
|
+
* @returns {object} The normalized configuration object
|
|
59
59
|
* @deprecated
|
|
60
|
+
* @private
|
|
60
61
|
*/
|
|
61
62
|
static normalizeConfig(config) {
|
|
62
63
|
if (config.githubIssues) {
|