@opentermsarchive/engine 0.26.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ota-track.js +3 -3
- package/bin/ota-validate.js +2 -2
- package/bin/ota.js +1 -1
- package/config/default.json +1 -1
- package/package.json +3 -4
- package/scripts/dataset/export/index.js +4 -4
- package/scripts/dataset/export/index.test.js +11 -17
- package/scripts/declarations/lint/index.mocha.js +1 -1
- package/scripts/declarations/utils/index.js +12 -12
- package/scripts/declarations/validate/definitions.js +1 -1
- package/scripts/declarations/validate/index.mocha.js +30 -34
- package/scripts/declarations/validate/service.history.schema.js +11 -11
- package/scripts/declarations/validate/service.schema.js +13 -13
- package/scripts/history/migrate-services.js +4 -4
- package/scripts/history/update-to-full-hash.js +2 -2
- package/scripts/import/index.js +14 -14
- package/scripts/rewrite/rewrite-snapshots.js +3 -3
- package/scripts/rewrite/rewrite-versions.js +14 -14
- package/scripts/utils/renamer/README.md +3 -3
- package/scripts/utils/renamer/index.js +13 -13
- package/src/archivist/errors.js +1 -1
- package/src/archivist/extract/exports.js +3 -0
- package/src/archivist/{filter → extract}/index.js +23 -27
- package/src/archivist/extract/index.test.js +516 -0
- package/src/archivist/index.js +101 -140
- package/src/archivist/index.test.js +178 -166
- package/src/archivist/recorder/index.js +11 -55
- package/src/archivist/recorder/index.test.js +310 -356
- package/src/archivist/recorder/record.js +18 -7
- package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
- package/src/archivist/recorder/repositories/git/index.js +11 -15
- package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
- package/src/archivist/recorder/repositories/interface.js +8 -6
- package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
- package/src/archivist/recorder/repositories/mongo/index.js +8 -8
- package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
- package/src/archivist/recorder/snapshot.js +5 -0
- package/src/archivist/recorder/snapshot.test.js +65 -0
- package/src/archivist/recorder/version.js +14 -0
- package/src/archivist/recorder/version.test.js +65 -0
- package/src/archivist/services/index.js +60 -51
- package/src/archivist/services/index.test.js +63 -83
- package/src/archivist/services/service.js +26 -22
- package/src/archivist/services/service.test.js +46 -68
- package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
- package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
- package/src/archivist/services/terms.js +26 -0
- package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
- package/src/exports.js +2 -2
- package/src/index.js +16 -13
- package/src/logger/index.js +35 -36
- package/src/notifier/index.js +8 -8
- package/src/tracker/index.js +6 -6
- package/src/archivist/filter/exports.js +0 -3
- package/src/archivist/filter/index.test.js +0 -564
- package/src/archivist/recorder/record.test.js +0 -91
- package/src/archivist/services/documentDeclaration.js +0 -26
- /package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
- /package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
import fsApi from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
|
|
5
|
+
import chai from 'chai';
|
|
6
|
+
import jsdom from 'jsdom';
|
|
7
|
+
|
|
8
|
+
import { InaccessibleContentError } from '../errors.js';
|
|
9
|
+
import SourceDocument from '../services/sourceDocument.js';
|
|
10
|
+
|
|
11
|
+
import { convertRelativeURLsToAbsolute, extractFromHTML, extractFromPDF } from './index.js';
|
|
12
|
+
|
|
13
|
+
// ES modules do not define `__dirname`; derive it from this module's URL so
// that fixture paths can be resolved relative to this test file.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Promise-based flavour of the file-system API, so reads can be awaited.
const fs = fsApi.promises;

const { JSDOM } = jsdom;
const { expect } = chai;
|
|
17
|
+
|
|
18
|
+
const virtualLocation = 'https://exemple.com/main';
|
|
19
|
+
const rawHTML = `
|
|
20
|
+
<!DOCTYPE html>
|
|
21
|
+
<html>
|
|
22
|
+
<head>
|
|
23
|
+
<meta charset="UTF-8">
|
|
24
|
+
<title>TOS</title>
|
|
25
|
+
</head>
|
|
26
|
+
<body>
|
|
27
|
+
<h1>Title</h1>
|
|
28
|
+
<p><a id="link1" href="/relative/link">link 1</a></p>
|
|
29
|
+
<p><a id="link2" href="#anchor">link 2</a></p>
|
|
30
|
+
<p><a id="link3" href="http://absolute.url/link">link 3</a></p>
|
|
31
|
+
<div id="empty"></div>
|
|
32
|
+
<div id="whitespaceOnly"> </div>
|
|
33
|
+
</body>
|
|
34
|
+
</html>`;
|
|
35
|
+
|
|
36
|
+
const expectedExtracted = `Title
|
|
37
|
+
=====
|
|
38
|
+
|
|
39
|
+
[link 1](https://exemple.com/relative/link)
|
|
40
|
+
|
|
41
|
+
[link 2](#anchor)
|
|
42
|
+
|
|
43
|
+
[link 3](http://absolute.url/link)`;
|
|
44
|
+
|
|
45
|
+
const expectedExtractedWithAdditional = `Title
|
|
46
|
+
=====`;
|
|
47
|
+
|
|
48
|
+
const rawHTMLWithCommonChangingItems = `
|
|
49
|
+
<!DOCTYPE html>
|
|
50
|
+
<html>
|
|
51
|
+
<head>
|
|
52
|
+
<meta charset="UTF-8">
|
|
53
|
+
<title>TOS</title>
|
|
54
|
+
<style>body { background: red }</style>
|
|
55
|
+
<script>console.log("test")</script>
|
|
56
|
+
</head>
|
|
57
|
+
<body>
|
|
58
|
+
<style>body { background: blue }</style>
|
|
59
|
+
<script>console.log("test")</script>
|
|
60
|
+
<h1>Title</h1>
|
|
61
|
+
<p><a id="link1" href="/relative/link">link 1</a></p>
|
|
62
|
+
<p><a id="link2" href="#anchor">link 2</a></p>
|
|
63
|
+
<p><a id="link3" href="http://absolute.url/link">link 3</a></p>
|
|
64
|
+
<p><a id="link4" href="">link 4</a></p>
|
|
65
|
+
<a href="/cdn-cgi/l/email-protection#3b4c52555f484f495e5a56154b49524d5a584215484f5a4f5e565e554f7b4c52555f484f495e5a5615585456">[email protected]</a>
|
|
66
|
+
<p><a href="/cdn-cgi/l/email-protection#2d4e4243594c4e596d4e4459545e4e424259034858">conta<span>[email protected]</span></a></p>
|
|
67
|
+
</body>
|
|
68
|
+
</html>`;
|
|
69
|
+
|
|
70
|
+
/* eslint-disable no-irregular-whitespace */
|
|
71
|
+
const expectedExtractedWithCommonChangingItems = `Title
|
|
72
|
+
=====
|
|
73
|
+
|
|
74
|
+
[link 1](https://exemple.com/relative/link)
|
|
75
|
+
|
|
76
|
+
[link 2](#anchor)
|
|
77
|
+
|
|
78
|
+
[link 3](http://absolute.url/link)
|
|
79
|
+
|
|
80
|
+
link 4
|
|
81
|
+
|
|
82
|
+
[\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)
|
|
83
|
+
|
|
84
|
+
[\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)`;
|
|
85
|
+
/* eslint-enable no-irregular-whitespace */
|
|
86
|
+
|
|
87
|
+
/**
 * Custom filters handed to `extractFromHTML` through the `filters` option in
 * the "Filter" tests. Both strip every `<a>` element from the document they
 * receive; they differ only in being synchronous or asynchronous.
 */
const additionalFilter = {
  /**
   * Synchronously removes every link from the given document.
   *
   * @param {Document} document - DOM document, mutated in place.
   */
  removeLinks(document) {
    document.querySelectorAll('a').forEach(link => link.remove());
  },

  /**
   * Removes every link from the given document after a 300 ms delay.
   *
   * The DOM work runs after the awaited delay rather than inside the timer
   * callback, so an error raised while removing links rejects the returned
   * promise instead of escaping as an uncaught exception.
   *
   * @param {Document} document - DOM document, mutated in place.
   * @returns {Promise<void>} Settles once the links have been removed.
   */
  async removeLinksAsync(document) {
    await new Promise(resolve => {
      setTimeout(resolve, 300);
    });

    document.querySelectorAll('a').forEach(link => link.remove());
  },
};
|
|
108
|
+
|
|
109
|
+
describe('Extract', () => {
|
|
110
|
+
// Checks the `href` values of the links in `rawHTML` after
// `convertRelativeURLsToAbsolute` has been applied with `virtualLocation`.
describe('#convertRelativeURLsToAbsolute', () => {
  let hrefs;

  before(() => {
    const { document: webPageDOM } = new JSDOM(rawHTML).window;

    // The function is called for its effect on the DOM; its return value is not used.
    convertRelativeURLsToAbsolute(webPageDOM, virtualLocation);
    hrefs = Array.from(webPageDOM.querySelectorAll('a[href]'), link => link.href);
  });

  it('converts relative urls', () => {
    expect(hrefs).to.include('https://exemple.com/relative/link');
  });

  it('leaves absolute urls untouched', () => {
    expect(hrefs).to.include('http://absolute.url/link');
  });
});
|
|
128
|
+
|
|
129
|
+
// Exercises `extractFromHTML` in three groups:
//   - Select: the forms accepted by `contentSelectors`
//   - Remove: the forms accepted by `insignificantContentSelectors`
//   - Filter: synchronous and asynchronous functions passed as `filters`
//
// NOTE(review): `rejectedWith` is provided by chai-as-promised, which this file
// does not register itself; presumably a shared mocha setup does. Confirm.
describe('#extractFromHTML', () => {
  /**
   * Runs `extractFromHTML` on a `SourceDocument` built from `rawHTML` located
   * at `virtualLocation`.
   *
   * @param {object} options - Extra `SourceDocument` properties; they are
   *   merged last, so they may also override `location` or `content`.
   * @returns {Promise<string>} Whatever `extractFromHTML` resolves with.
   */
  const extract = options => extractFromHTML(new SourceDocument({
    location: virtualLocation,
    content: rawHTML,
    ...options,
  }));

  describe('Select', () => {
    context('with string selector', () => {
      it('extracts content from the given HTML with common changing items', async () => {
        const result = await extract({
          contentSelectors: 'body',
          content: rawHTMLWithCommonChangingItems,
        });

        expect(result).to.equal(expectedExtractedWithCommonChangingItems);
      });

      it('extracts content from the given HTML', async () => {
        const result = await extract({ contentSelectors: 'body' });

        expect(result).to.equal(expectedExtracted);
      });

      context('with no match for the given selector', () => {
        it('throws an InaccessibleContentError error', async () => {
          await expect(extract({ contentSelectors: '#thisAnchorDoesNotExist' }))
            .to.be.rejectedWith(InaccessibleContentError, /#thisAnchorDoesNotExist/);
        });
      });

      context('with no content for the matching given selector', () => {
        it('throws an InaccessibleContentError error', async () => {
          await expect(extract({ contentSelectors: '#empty' }))
            .to.be.rejectedWith(InaccessibleContentError, /empty content/);
        });
      });

      context('with a whitespace only content for the corresponding given selector', () => {
        it('throws an InaccessibleContentError error', async () => {
          await expect(extract({ contentSelectors: '#whitespaceOnly' }))
            .to.be.rejectedWith(InaccessibleContentError, /empty content/);
        });
      });

      context('with multiple selectors in one string', () => {
        it('extracts content from the given HTML', async () => {
          const result = await extract({ contentSelectors: 'h1, #link2' });

          expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
        });
      });
    });

    context('with an array of selectors', () => {
      it('extracts content from the given HTML', async () => {
        const result = await extract({ contentSelectors: [ 'h1', '#link2' ] });

        expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
      });

      context('when one selector is dependent on another', () => {
        it('extracts content from the given HTML', async () => {
          const result = await extract({ contentSelectors: [ 'h1', 'h1 ~ p' ] });

          expect(result).to.equal('Title\n=====\n\n[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
        });
      });
    });

    context('with range selector', () => {
      context('with startBefore and endBefore', () => {
        it('extracts content from the given HTML', async () => {
          const result = await extract({
            contentSelectors: {
              startBefore: '#link1',
              endBefore: '#link2',
            },
          });

          expect(result).to.equal('[link 1](https://exemple.com/relative/link)');
        });
      });

      context('with startBefore and endAfter', () => {
        it('extracts content from the given HTML', async () => {
          const result = await extract({
            contentSelectors: {
              startBefore: '#link2',
              endAfter: '#link2',
            },
          });

          expect(result).to.equal('[link 2](#anchor)');
        });
      });

      context('with startAfter and endBefore', () => {
        it('extracts content from the given HTML', async () => {
          const result = await extract({
            contentSelectors: {
              startAfter: '#link1',
              endBefore: '#link3',
            },
          });

          expect(result).to.equal('[link 2](#anchor)');
        });
      });

      context('with startAfter and endAfter', () => {
        it('extracts content from the given HTML', async () => {
          const result = await extract({
            contentSelectors: {
              startAfter: '#link2',
              endAfter: '#link3',
            },
          });

          expect(result).to.equal('[link 3](http://absolute.url/link)');
        });
      });

      context('with a "start" selector that has no match', () => {
        it('throws an InaccessibleContentError error', async () => {
          await expect(extract({
            contentSelectors: {
              startAfter: '#paragraph1',
              endAfter: '#link2',
            },
          })).to.be.rejectedWith(InaccessibleContentError, /"start" selector has no match/);
        });
      });

      context('with an "end" selector that has no match', () => {
        it('throws an InaccessibleContentError error', async () => {
          await expect(extract({
            contentSelectors: {
              startAfter: '#link2',
              endAfter: '#paragraph1',
            },
          })).to.be.rejectedWith(InaccessibleContentError, /"end" selector has no match/);
        });
      });
    });

    context('with an array of range selectors', () => {
      it('extracts content from the given HTML', async () => {
        const result = await extract({
          contentSelectors: [
            {
              startAfter: '#link1',
              endAfter: '#link2',
            },
            {
              startAfter: '#link2',
              endAfter: '#link3',
            },
          ],
        });

        expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
      });
    });

    context('with an array of mixed string selectors and range selectors', () => {
      it('extracts content from the given HTML', async () => {
        const result = await extract({
          contentSelectors: [
            'h1',
            {
              startAfter: '#link2',
              endAfter: '#link3',
            },
          ],
        });

        expect(result).to.equal('Title\n=====\n\n[link 3](http://absolute.url/link)');
      });
    });
  });

  describe('Remove', () => {
    // Every "Remove" case selects the whole body and varies only what is removed.
    const extractBodyWithout = insignificantContentSelectors => extract({
      contentSelectors: 'body',
      insignificantContentSelectors,
    });

    context('with a simple selector', () => {
      it('removes the specified elements', async () => {
        const result = await extractBodyWithout('h1');

        expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
      });
    });

    context('with an array of string selectors', () => {
      it('removes the specified elements', async () => {
        const result = await extractBodyWithout([ 'h1', '#link3' ]);

        expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
      });
    });

    context('with a simple range selector', () => {
      it('removes the specified elements', async () => {
        const result = await extractBodyWithout({
          startBefore: '#link1',
          endAfter: '#link3',
        });

        expect(result).to.equal('Title\n=====');
      });

      context('with a "start" selector that has no match', () => {
        it('throws an InaccessibleContentError error', async () => {
          await expect(extractBodyWithout({
            startAfter: '#paragraph1',
            endAfter: '#link2',
          })).to.be.rejectedWith(InaccessibleContentError, /"start" selector has no match/);
        });
      });

      context('with an "end" selector that has no match', () => {
        it('throws an InaccessibleContentError error', async () => {
          await expect(extractBodyWithout({
            startAfter: '#link2',
            endAfter: '#paragraph1',
          })).to.be.rejectedWith(InaccessibleContentError, /"end" selector has no match/);
        });
      });
    });

    context('with an array of range selectors', () => {
      it('removes all the selections', async () => {
        const result = await extractBodyWithout([
          {
            startBefore: 'h1',
            endBefore: '#link1',
          },
          {
            startBefore: '#link3',
            endAfter: '#link3',
          },
        ]);

        expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
      });
    });

    context('with an array of mixed selectors and range selectors', () => {
      it('removes all the selections', async () => {
        const result = await extractBodyWithout([
          'h1',
          {
            startBefore: '#link3',
            endAfter: '#link3',
          },
        ]);

        expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
      });

      context('where one selector is dependent on another', () => {
        it('removes all the selections', async () => {
          const result = await extractBodyWithout([
            'h1',
            {
              startAfter: 'h1',
              endBefore: '#link2',
            },
          ]);

          expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
        });
      });
    });
  });

  describe('Filter', () => {
    context('with a synchronous filter', () => {
      it('extracts content from the given HTML also with given additional filter', async () => {
        const result = await extract({
          contentSelectors: 'body',
          filters: [additionalFilter.removeLinks],
        });

        expect(result).to.equal(expectedExtractedWithAdditional);
      });
    });

    context('with an asynchronous filter', () => {
      it('extracts content from the given HTML also with given additional filter', async () => {
        const result = await extract({
          contentSelectors: 'body',
          filters: [additionalFilter.removeLinksAsync],
        });

        expect(result).to.equal(expectedExtractedWithAdditional);
      });
    });
  });
});
|
|
499
|
+
|
|
500
|
+
// Compares the output of `extractFromPDF` for the `terms.pdf` fixture with
// the content of the `termsFromPDF.md` fixture.
describe('#extractFromPDF', () => {
  let pdfContent;
  let expectedExtractedContent;

  before(async () => {
    // The two fixture reads are independent, so they are issued together.
    // The PDF is read without an encoding; the Markdown is read as UTF-8 text.
    [ pdfContent, expectedExtractedContent ] = await Promise.all([
      fs.readFile(path.resolve(__dirname, '../../../test/fixtures/terms.pdf')),
      fs.readFile(
        path.resolve(__dirname, '../../../test/fixtures/termsFromPDF.md'),
        { encoding: 'utf8' },
      ),
    ]);
  });

  it('extracts content from the given PDF', async () => {
    expect(await extractFromPDF({ content: pdfContent })).to.equal(expectedExtractedContent);
  });
});
|
|
516
|
+
});
|