@opentermsarchive/engine 1.1.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/archivist/extract/errors.js +6 -0
- package/src/archivist/extract/index.js +32 -16
- package/src/archivist/extract/index.test.js +319 -302
- package/src/archivist/fetcher/errors.js +1 -1
- package/src/archivist/fetcher/fullDomFetcher.js +4 -6
- package/src/archivist/fetcher/htmlOnlyFetcher.js +6 -7
- package/src/archivist/fetcher/index.js +9 -4
- package/src/archivist/fetcher/index.test.js +23 -12
- package/src/archivist/index.js +37 -13
- package/src/archivist/index.test.js +22 -22
- package/src/archivist/services/service.js +12 -6
- package/src/archivist/services/service.test.js +60 -39
- package/src/logger/index.js +3 -3
- package/src/reporter/index.js +4 -2
- package/src/reporter/labels.json +10 -0
|
@@ -4,11 +4,13 @@ import { fileURLToPath } from 'url';
|
|
|
4
4
|
|
|
5
5
|
import chai from 'chai';
|
|
6
6
|
import jsdom from 'jsdom';
|
|
7
|
+
import mime from 'mime';
|
|
7
8
|
|
|
8
|
-
import { InaccessibleContentError } from '../errors.js';
|
|
9
9
|
import SourceDocument from '../services/sourceDocument.js';
|
|
10
10
|
|
|
11
|
-
import {
|
|
11
|
+
import { ExtractDocumentError } from './errors.js';
|
|
12
|
+
|
|
13
|
+
import extract, { convertRelativeURLsToAbsolute } from './index.js';
|
|
12
14
|
|
|
13
15
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
14
16
|
const fs = fsApi.promises;
|
|
@@ -28,6 +30,7 @@ const rawHTML = `
|
|
|
28
30
|
<p><a id="link1" href="/relative/link">link 1</a></p>
|
|
29
31
|
<p><a id="link2" href="#anchor">link 2</a></p>
|
|
30
32
|
<p><a id="link3" href="http://absolute.url/link">link 3</a></p>
|
|
33
|
+
<p><a id="link5" href="http://[INVALID_URL=http://www.example.org/">link 5</a></p>
|
|
31
34
|
<div id="empty"></div>
|
|
32
35
|
<div id="whitespaceOnly"> </div>
|
|
33
36
|
</body>
|
|
@@ -40,7 +43,9 @@ const expectedExtracted = `Title
|
|
|
40
43
|
|
|
41
44
|
[link 2](#anchor)
|
|
42
45
|
|
|
43
|
-
[link 3](http://absolute.url/link)
|
|
46
|
+
[link 3](http://absolute.url/link)
|
|
47
|
+
|
|
48
|
+
[link 5](http://[INVALID_URL=http://www.example.org/)`;
|
|
44
49
|
|
|
45
50
|
const expectedExtractedWithAdditional = `Title
|
|
46
51
|
=====`;
|
|
@@ -92,7 +97,7 @@ const additionalFilter = {
|
|
|
92
97
|
link.remove();
|
|
93
98
|
});
|
|
94
99
|
},
|
|
95
|
-
removeLinksAsync:
|
|
100
|
+
removeLinksAsync: function removeLinksAsync(document) {
|
|
96
101
|
return new Promise(resolve => {
|
|
97
102
|
setTimeout(() => {
|
|
98
103
|
const links = document.querySelectorAll('a');
|
|
@@ -117,400 +122,412 @@ describe('Extract', () => {
|
|
|
117
122
|
subject = Array.from(webPageDOM.querySelectorAll('a[href]')).map(el => el.href);
|
|
118
123
|
});
|
|
119
124
|
|
|
120
|
-
it('converts relative urls',
|
|
125
|
+
it('converts relative urls', () => {
|
|
121
126
|
expect(subject).to.include('https://exemple.com/relative/link');
|
|
122
127
|
});
|
|
123
128
|
|
|
124
|
-
it('leaves absolute urls untouched',
|
|
129
|
+
it('leaves absolute urls untouched', () => {
|
|
125
130
|
expect(subject).to.include('http://absolute.url/link');
|
|
126
131
|
});
|
|
127
|
-
});
|
|
128
132
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
const result = await extractFromHTML(new SourceDocument({
|
|
134
|
-
location: virtualLocation,
|
|
135
|
-
contentSelectors: 'body',
|
|
136
|
-
content: rawHTMLWithCommonChangingItems,
|
|
137
|
-
}));
|
|
138
|
-
|
|
139
|
-
expect(result).to.equal(expectedExtractedWithCommonChangingItems);
|
|
140
|
-
});
|
|
133
|
+
it('leaves invalid urls untouched', () => {
|
|
134
|
+
expect(subject).to.include('http://[INVALID_URL=http://www.example.org/');
|
|
135
|
+
});
|
|
136
|
+
});
|
|
141
137
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
138
|
+
describe('#extract', () => {
|
|
139
|
+
context('from HTML content', () => {
|
|
140
|
+
describe('Select', () => {
|
|
141
|
+
context('with string selector', () => {
|
|
142
|
+
it('extracts content from the given HTML with common changing items', async () => {
|
|
143
|
+
const result = await extract(new SourceDocument({
|
|
144
|
+
location: virtualLocation,
|
|
145
|
+
contentSelectors: 'body',
|
|
146
|
+
content: rawHTMLWithCommonChangingItems,
|
|
147
|
+
}));
|
|
148
148
|
|
|
149
|
-
|
|
150
|
-
|
|
149
|
+
expect(result).to.equal(expectedExtractedWithCommonChangingItems);
|
|
150
|
+
});
|
|
151
151
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
await expect(extractFromHTML(new SourceDocument({
|
|
152
|
+
it('extracts content from the given HTML', async () => {
|
|
153
|
+
const result = await extract(new SourceDocument({
|
|
155
154
|
location: virtualLocation,
|
|
156
|
-
contentSelectors: '
|
|
155
|
+
contentSelectors: 'body',
|
|
157
156
|
content: rawHTML,
|
|
158
|
-
}))
|
|
157
|
+
}));
|
|
158
|
+
|
|
159
|
+
expect(result).to.equal(expectedExtracted);
|
|
159
160
|
});
|
|
160
|
-
});
|
|
161
161
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
162
|
+
context('with no match for the given selector', () => {
|
|
163
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
164
|
+
await expect(extract(new SourceDocument({
|
|
165
|
+
location: virtualLocation,
|
|
166
|
+
contentSelectors: '#thisAnchorDoesNotExist',
|
|
167
|
+
content: rawHTML,
|
|
168
|
+
}))).to.be.rejectedWith(Error, /#thisAnchorDoesNotExist/);
|
|
169
|
+
});
|
|
169
170
|
});
|
|
170
|
-
});
|
|
171
171
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
172
|
+
context('with no content for the matching given selector', () => {
|
|
173
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
174
|
+
await expect(extract(new SourceDocument({
|
|
175
|
+
location: virtualLocation,
|
|
176
|
+
contentSelectors: '#empty',
|
|
177
|
+
content: rawHTML,
|
|
178
|
+
}))).to.be.rejectedWith(ExtractDocumentError, /empty content/);
|
|
179
|
+
});
|
|
180
|
+
});
|
|
181
|
+
|
|
182
|
+
context('with a whitespace only content for the corresponding given selector', () => {
|
|
183
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
184
|
+
await expect(extract(new SourceDocument({
|
|
185
|
+
location: virtualLocation,
|
|
186
|
+
contentSelectors: '#whitespaceOnly',
|
|
187
|
+
content: rawHTML,
|
|
188
|
+
}))).to.be.rejectedWith(ExtractDocumentError, /empty content/);
|
|
189
|
+
});
|
|
190
|
+
});
|
|
191
|
+
|
|
192
|
+
context('with multiple selectors in one string', () => {
|
|
193
|
+
it('extracts content from the given HTML', async () => {
|
|
194
|
+
const result = await extract(new SourceDocument({
|
|
195
|
+
location: virtualLocation,
|
|
196
|
+
contentSelectors: 'h1, #link2',
|
|
197
|
+
content: rawHTML,
|
|
198
|
+
}));
|
|
199
|
+
|
|
200
|
+
expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
|
|
201
|
+
});
|
|
179
202
|
});
|
|
180
203
|
});
|
|
181
204
|
|
|
182
|
-
context('with
|
|
205
|
+
context('with an array of selectors', () => {
|
|
183
206
|
it('extracts content from the given HTML', async () => {
|
|
184
|
-
const result = await
|
|
185
|
-
location: virtualLocation,
|
|
186
|
-
contentSelectors: 'h1, #link2',
|
|
207
|
+
const result = await extract(new SourceDocument({
|
|
187
208
|
content: rawHTML,
|
|
209
|
+
location: virtualLocation,
|
|
210
|
+
contentSelectors: [ 'h1', '#link2' ],
|
|
188
211
|
}));
|
|
189
212
|
|
|
190
213
|
expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
|
|
191
214
|
});
|
|
192
|
-
});
|
|
193
|
-
});
|
|
194
215
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
216
|
+
context('when one selector is dependent on another', () => {
|
|
217
|
+
it('extracts content from the given HTML', async () => {
|
|
218
|
+
const result = await extract(new SourceDocument({
|
|
219
|
+
content: rawHTML,
|
|
220
|
+
location: virtualLocation,
|
|
221
|
+
contentSelectors: [ 'h1', 'h1 ~ p' ],
|
|
222
|
+
}));
|
|
202
223
|
|
|
203
|
-
|
|
224
|
+
expect(result).to.equal('Title\n=====\n\n[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
|
|
225
|
+
});
|
|
226
|
+
});
|
|
204
227
|
});
|
|
205
228
|
|
|
206
|
-
context('
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
229
|
+
context('with range selector', () => {
|
|
230
|
+
context('with startBefore and endBefore', () => {
|
|
231
|
+
it('extracts content from the given HTML', async () => {
|
|
232
|
+
const result = await extract(new SourceDocument({
|
|
233
|
+
content: rawHTML,
|
|
234
|
+
location: virtualLocation,
|
|
235
|
+
contentSelectors: {
|
|
236
|
+
startBefore: '#link1',
|
|
237
|
+
endBefore: '#link2',
|
|
238
|
+
},
|
|
239
|
+
}));
|
|
213
240
|
|
|
214
|
-
|
|
241
|
+
expect(result).to.equal('[link 1](https://exemple.com/relative/link)');
|
|
242
|
+
});
|
|
215
243
|
});
|
|
216
|
-
|
|
217
|
-
|
|
244
|
+
context('with startBefore and endAfter', () => {
|
|
245
|
+
it('extracts content from the given HTML', async () => {
|
|
246
|
+
const result = await extract(new SourceDocument({
|
|
247
|
+
content: rawHTML,
|
|
248
|
+
location: virtualLocation,
|
|
249
|
+
contentSelectors: {
|
|
250
|
+
startBefore: '#link2',
|
|
251
|
+
endAfter: '#link2',
|
|
252
|
+
},
|
|
253
|
+
}));
|
|
218
254
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
255
|
+
expect(result).to.equal('[link 2](#anchor)');
|
|
256
|
+
});
|
|
257
|
+
});
|
|
258
|
+
context('with startAfter and endBefore', () => {
|
|
259
|
+
it('extracts content from the given HTML', async () => {
|
|
260
|
+
const result = await extract(new SourceDocument({
|
|
261
|
+
content: rawHTML,
|
|
262
|
+
location: virtualLocation,
|
|
263
|
+
contentSelectors: {
|
|
264
|
+
startAfter: '#link1',
|
|
265
|
+
endBefore: '#link3',
|
|
266
|
+
},
|
|
267
|
+
}));
|
|
230
268
|
|
|
231
|
-
|
|
269
|
+
expect(result).to.equal('[link 2](#anchor)');
|
|
270
|
+
});
|
|
232
271
|
});
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
}
|
|
243
|
-
}));
|
|
272
|
+
context('with startAfter and endAfter', () => {
|
|
273
|
+
it('extracts content from the given HTML', async () => {
|
|
274
|
+
const result = await extract(new SourceDocument({
|
|
275
|
+
content: rawHTML,
|
|
276
|
+
location: virtualLocation,
|
|
277
|
+
contentSelectors: {
|
|
278
|
+
startAfter: '#link2',
|
|
279
|
+
endAfter: '#link3',
|
|
280
|
+
},
|
|
281
|
+
}));
|
|
244
282
|
|
|
245
|
-
|
|
283
|
+
expect(result).to.equal('[link 3](http://absolute.url/link)');
|
|
284
|
+
});
|
|
285
|
+
});
|
|
286
|
+
context('with a "start" selector that has no match', () => {
|
|
287
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
288
|
+
await expect(extract(new SourceDocument({
|
|
289
|
+
content: rawHTML,
|
|
290
|
+
location: virtualLocation,
|
|
291
|
+
contentSelectors: {
|
|
292
|
+
startAfter: '#paragraph1',
|
|
293
|
+
endAfter: '#link2',
|
|
294
|
+
},
|
|
295
|
+
}))).to.be.rejectedWith(ExtractDocumentError, /"start" selector has no match/);
|
|
296
|
+
});
|
|
297
|
+
});
|
|
298
|
+
context('with an "end" selector that has no match', () => {
|
|
299
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
300
|
+
await expect(extract(new SourceDocument({
|
|
301
|
+
content: rawHTML,
|
|
302
|
+
location: virtualLocation,
|
|
303
|
+
contentSelectors: {
|
|
304
|
+
startAfter: '#link2',
|
|
305
|
+
endAfter: '#paragraph1',
|
|
306
|
+
},
|
|
307
|
+
}))).to.be.rejectedWith(ExtractDocumentError, /"end" selector has no match/);
|
|
308
|
+
});
|
|
246
309
|
});
|
|
247
310
|
});
|
|
248
|
-
|
|
311
|
+
|
|
312
|
+
context('with an array of range selectors', () => {
|
|
249
313
|
it('extracts content from the given HTML', async () => {
|
|
250
|
-
const result = await
|
|
314
|
+
const result = await extract(new SourceDocument({
|
|
251
315
|
content: rawHTML,
|
|
252
316
|
location: virtualLocation,
|
|
253
|
-
contentSelectors:
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
317
|
+
contentSelectors: [
|
|
318
|
+
{
|
|
319
|
+
startAfter: '#link1',
|
|
320
|
+
endAfter: '#link2',
|
|
321
|
+
},
|
|
322
|
+
{
|
|
323
|
+
startAfter: '#link2',
|
|
324
|
+
endAfter: '#link3',
|
|
325
|
+
},
|
|
326
|
+
],
|
|
257
327
|
}));
|
|
258
328
|
|
|
259
|
-
expect(result).to.equal('[link 2](#anchor)');
|
|
329
|
+
expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
|
|
260
330
|
});
|
|
261
331
|
});
|
|
262
|
-
|
|
332
|
+
|
|
333
|
+
context('with an array of mixed string selectors and range selectors', () => {
|
|
263
334
|
it('extracts content from the given HTML', async () => {
|
|
264
|
-
const result = await
|
|
335
|
+
const result = await extract(new SourceDocument({
|
|
265
336
|
content: rawHTML,
|
|
266
337
|
location: virtualLocation,
|
|
267
|
-
contentSelectors:
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
338
|
+
contentSelectors: [
|
|
339
|
+
'h1',
|
|
340
|
+
{
|
|
341
|
+
startAfter: '#link2',
|
|
342
|
+
endAfter: '#link3',
|
|
343
|
+
},
|
|
344
|
+
],
|
|
271
345
|
}));
|
|
272
346
|
|
|
273
|
-
expect(result).to.equal('[link 3](http://absolute.url/link)');
|
|
347
|
+
expect(result).to.equal('Title\n=====\n\n[link 3](http://absolute.url/link)');
|
|
274
348
|
});
|
|
275
349
|
});
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
350
|
+
});
|
|
351
|
+
|
|
352
|
+
describe('Remove', () => {
|
|
353
|
+
context('with a simple selector', () => {
|
|
354
|
+
it('removes the specified elements', async () => {
|
|
355
|
+
const result = await extract(new SourceDocument({
|
|
279
356
|
content: rawHTML,
|
|
280
357
|
location: virtualLocation,
|
|
281
|
-
contentSelectors:
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
358
|
+
contentSelectors: 'body',
|
|
359
|
+
insignificantContentSelectors: 'h1',
|
|
360
|
+
}));
|
|
361
|
+
|
|
362
|
+
expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
|
|
286
363
|
});
|
|
287
364
|
});
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
365
|
+
|
|
366
|
+
context('with an array of string selectors', () => {
|
|
367
|
+
it('removes the specified elements', async () => {
|
|
368
|
+
const result = await extract(new SourceDocument({
|
|
291
369
|
content: rawHTML,
|
|
292
370
|
location: virtualLocation,
|
|
293
|
-
contentSelectors:
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
},
|
|
297
|
-
}))).to.be.rejectedWith(InaccessibleContentError, /"end" selector has no match/);
|
|
298
|
-
});
|
|
299
|
-
});
|
|
300
|
-
});
|
|
301
|
-
|
|
302
|
-
context('with an array of range selectors', () => {
|
|
303
|
-
it('extracts content from the given HTML', async () => {
|
|
304
|
-
const result = await extractFromHTML(new SourceDocument({
|
|
305
|
-
content: rawHTML,
|
|
306
|
-
location: virtualLocation,
|
|
307
|
-
contentSelectors: [
|
|
308
|
-
{
|
|
309
|
-
startAfter: '#link1',
|
|
310
|
-
endAfter: '#link2',
|
|
311
|
-
},
|
|
312
|
-
{
|
|
313
|
-
startAfter: '#link2',
|
|
314
|
-
endAfter: '#link3',
|
|
315
|
-
},
|
|
316
|
-
],
|
|
317
|
-
}));
|
|
318
|
-
|
|
319
|
-
expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
|
|
320
|
-
});
|
|
321
|
-
});
|
|
322
|
-
|
|
323
|
-
context('with an array of mixed string selectors and range selectors', () => {
|
|
324
|
-
it('extracts content from the given HTML', async () => {
|
|
325
|
-
const result = await extractFromHTML(new SourceDocument({
|
|
326
|
-
content: rawHTML,
|
|
327
|
-
location: virtualLocation,
|
|
328
|
-
contentSelectors: [
|
|
329
|
-
'h1',
|
|
330
|
-
{
|
|
331
|
-
startAfter: '#link2',
|
|
332
|
-
endAfter: '#link3',
|
|
333
|
-
},
|
|
334
|
-
],
|
|
335
|
-
}));
|
|
336
|
-
|
|
337
|
-
expect(result).to.equal('Title\n=====\n\n[link 3](http://absolute.url/link)');
|
|
338
|
-
});
|
|
339
|
-
});
|
|
340
|
-
});
|
|
341
|
-
|
|
342
|
-
describe('Remove', () => {
|
|
343
|
-
context('with a simple selector', () => {
|
|
344
|
-
it('removes the specified elements', async () => {
|
|
345
|
-
const result = await extractFromHTML(new SourceDocument({
|
|
346
|
-
content: rawHTML,
|
|
347
|
-
location: virtualLocation,
|
|
348
|
-
contentSelectors: 'body',
|
|
349
|
-
insignificantContentSelectors: 'h1',
|
|
350
|
-
}));
|
|
351
|
-
|
|
352
|
-
expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
|
|
353
|
-
});
|
|
354
|
-
});
|
|
355
|
-
|
|
356
|
-
context('with an array of string selectors', () => {
|
|
357
|
-
it('removes the specified elements', async () => {
|
|
358
|
-
const result = await extractFromHTML(new SourceDocument({
|
|
359
|
-
content: rawHTML,
|
|
360
|
-
location: virtualLocation,
|
|
361
|
-
contentSelectors: 'body',
|
|
362
|
-
insignificantContentSelectors: [ 'h1', '#link3' ],
|
|
363
|
-
}));
|
|
371
|
+
contentSelectors: 'body',
|
|
372
|
+
insignificantContentSelectors: [ 'h1', '#link3', '#link5' ],
|
|
373
|
+
}));
|
|
364
374
|
|
|
365
|
-
|
|
375
|
+
expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
|
|
376
|
+
});
|
|
366
377
|
});
|
|
367
|
-
});
|
|
368
378
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
content: rawHTML,
|
|
373
|
-
location: virtualLocation,
|
|
374
|
-
contentSelectors: 'body',
|
|
375
|
-
insignificantContentSelectors: {
|
|
376
|
-
startBefore: '#link1',
|
|
377
|
-
endAfter: '#link3',
|
|
378
|
-
},
|
|
379
|
-
}));
|
|
380
|
-
|
|
381
|
-
expect(result).to.equal('Title\n=====');
|
|
382
|
-
});
|
|
383
|
-
context('with a "start" selector that has no match', () => {
|
|
384
|
-
it('throws an InaccessibleContentError error', async () => {
|
|
385
|
-
await expect(extractFromHTML(new SourceDocument({
|
|
379
|
+
context('with a simple range selector', () => {
|
|
380
|
+
it('removes the specified elements', async () => {
|
|
381
|
+
const result = await extract(new SourceDocument({
|
|
386
382
|
content: rawHTML,
|
|
387
383
|
location: virtualLocation,
|
|
388
384
|
contentSelectors: 'body',
|
|
389
385
|
insignificantContentSelectors: {
|
|
390
|
-
|
|
391
|
-
endAfter: '#
|
|
386
|
+
startBefore: '#link1',
|
|
387
|
+
endAfter: '#link5',
|
|
392
388
|
},
|
|
393
|
-
}))
|
|
389
|
+
}));
|
|
390
|
+
|
|
391
|
+
expect(result).to.equal('Title\n=====');
|
|
392
|
+
});
|
|
393
|
+
context('with a "start" selector that has no match', () => {
|
|
394
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
395
|
+
await expect(extract(new SourceDocument({
|
|
396
|
+
content: rawHTML,
|
|
397
|
+
location: virtualLocation,
|
|
398
|
+
contentSelectors: 'body',
|
|
399
|
+
insignificantContentSelectors: {
|
|
400
|
+
startAfter: '#paragraph1',
|
|
401
|
+
endAfter: '#link2',
|
|
402
|
+
},
|
|
403
|
+
}))).to.be.rejectedWith(ExtractDocumentError, /"start" selector has no match/);
|
|
404
|
+
});
|
|
405
|
+
});
|
|
406
|
+
context('with an "end" selector that has no match', () => {
|
|
407
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
408
|
+
await expect(extract(new SourceDocument({
|
|
409
|
+
content: rawHTML,
|
|
410
|
+
location: virtualLocation,
|
|
411
|
+
contentSelectors: 'body',
|
|
412
|
+
insignificantContentSelectors: {
|
|
413
|
+
startAfter: '#link2',
|
|
414
|
+
endAfter: '#paragraph1',
|
|
415
|
+
},
|
|
416
|
+
}))).to.be.rejectedWith(ExtractDocumentError, /"end" selector has no match/);
|
|
417
|
+
});
|
|
394
418
|
});
|
|
395
419
|
});
|
|
396
|
-
context('with an
|
|
397
|
-
it('
|
|
398
|
-
await
|
|
420
|
+
context('with an array of range selectors', () => {
|
|
421
|
+
it('removes all the selections', async () => {
|
|
422
|
+
const result = await extract(new SourceDocument({
|
|
399
423
|
content: rawHTML,
|
|
400
424
|
location: virtualLocation,
|
|
401
425
|
contentSelectors: 'body',
|
|
402
|
-
insignificantContentSelectors:
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
content: rawHTML,
|
|
414
|
-
location: virtualLocation,
|
|
415
|
-
contentSelectors: 'body',
|
|
416
|
-
insignificantContentSelectors: [
|
|
417
|
-
{
|
|
418
|
-
startBefore: 'h1',
|
|
419
|
-
endBefore: '#link1',
|
|
420
|
-
},
|
|
421
|
-
{
|
|
422
|
-
startBefore: '#link3',
|
|
423
|
-
endAfter: '#link3',
|
|
424
|
-
},
|
|
425
|
-
],
|
|
426
|
-
}));
|
|
427
|
-
|
|
428
|
-
expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
|
|
429
|
-
});
|
|
430
|
-
});
|
|
431
|
-
|
|
432
|
-
context('with an array of mixed selectors and range selectors', () => {
|
|
433
|
-
it('removes all the selections', async () => {
|
|
434
|
-
const result = await extractFromHTML(new SourceDocument({
|
|
435
|
-
content: rawHTML,
|
|
436
|
-
location: virtualLocation,
|
|
437
|
-
contentSelectors: 'body',
|
|
438
|
-
insignificantContentSelectors: [
|
|
439
|
-
'h1',
|
|
440
|
-
{
|
|
441
|
-
startBefore: '#link3',
|
|
442
|
-
endAfter: '#link3',
|
|
443
|
-
},
|
|
444
|
-
],
|
|
445
|
-
}));
|
|
426
|
+
insignificantContentSelectors: [
|
|
427
|
+
{
|
|
428
|
+
startBefore: 'h1',
|
|
429
|
+
endBefore: '#link1',
|
|
430
|
+
},
|
|
431
|
+
{
|
|
432
|
+
startBefore: '#link3',
|
|
433
|
+
endAfter: '#link5',
|
|
434
|
+
},
|
|
435
|
+
],
|
|
436
|
+
}));
|
|
446
437
|
|
|
447
|
-
|
|
438
|
+
expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
|
|
439
|
+
});
|
|
448
440
|
});
|
|
449
441
|
|
|
450
|
-
context('
|
|
442
|
+
context('with an array of mixed selectors and range selectors', () => {
|
|
451
443
|
it('removes all the selections', async () => {
|
|
452
|
-
const result = await
|
|
444
|
+
const result = await extract(new SourceDocument({
|
|
453
445
|
content: rawHTML,
|
|
454
446
|
location: virtualLocation,
|
|
455
447
|
contentSelectors: 'body',
|
|
456
448
|
insignificantContentSelectors: [
|
|
457
449
|
'h1',
|
|
458
450
|
{
|
|
459
|
-
|
|
460
|
-
|
|
451
|
+
startBefore: '#link3',
|
|
452
|
+
endAfter: '#link5',
|
|
461
453
|
},
|
|
462
454
|
],
|
|
463
455
|
}));
|
|
464
456
|
|
|
465
|
-
expect(result).to.equal('[link
|
|
457
|
+
expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
|
|
458
|
+
});
|
|
459
|
+
|
|
460
|
+
context('where one selector is dependent on another', () => {
|
|
461
|
+
it('removes all the selections', async () => {
|
|
462
|
+
const result = await extract(new SourceDocument({
|
|
463
|
+
content: rawHTML,
|
|
464
|
+
location: virtualLocation,
|
|
465
|
+
contentSelectors: 'body',
|
|
466
|
+
insignificantContentSelectors: [
|
|
467
|
+
'h1',
|
|
468
|
+
{
|
|
469
|
+
startAfter: 'h1',
|
|
470
|
+
endBefore: '#link2',
|
|
471
|
+
},
|
|
472
|
+
],
|
|
473
|
+
}));
|
|
474
|
+
|
|
475
|
+
expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
|
|
476
|
+
});
|
|
466
477
|
});
|
|
467
478
|
});
|
|
468
479
|
});
|
|
469
|
-
});
|
|
470
480
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
481
|
+
describe('Filter', () => {
|
|
482
|
+
context('with a synchronous filter', () => {
|
|
483
|
+
it('extracts content from the given HTML also with given additional filter', async () => {
|
|
484
|
+
const result = await extract(new SourceDocument({
|
|
485
|
+
content: rawHTML,
|
|
486
|
+
location: virtualLocation,
|
|
487
|
+
contentSelectors: 'body',
|
|
488
|
+
filters: [additionalFilter.removeLinks],
|
|
489
|
+
}));
|
|
490
|
+
|
|
491
|
+
expect(result).to.equal(expectedExtractedWithAdditional);
|
|
492
|
+
});
|
|
482
493
|
});
|
|
483
|
-
});
|
|
484
494
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
495
|
+
context('with an asynchronous filter', () => {
|
|
496
|
+
it('extracts content from the given HTML also with given additional filter', async () => {
|
|
497
|
+
const result = await extract(new SourceDocument({
|
|
498
|
+
content: rawHTML,
|
|
499
|
+
location: virtualLocation,
|
|
500
|
+
contentSelectors: 'body',
|
|
501
|
+
filters: [additionalFilter.removeLinksAsync],
|
|
502
|
+
}));
|
|
493
503
|
|
|
494
|
-
|
|
504
|
+
expect(result).to.equal(expectedExtractedWithAdditional);
|
|
505
|
+
});
|
|
495
506
|
});
|
|
496
507
|
});
|
|
497
508
|
});
|
|
498
|
-
});
|
|
499
509
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
510
|
+
context('from PDF content', () => {
|
|
511
|
+
let pdfContent;
|
|
512
|
+
let expectedExtractedContent;
|
|
503
513
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
514
|
+
before(async () => {
|
|
515
|
+
pdfContent = await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
|
|
516
|
+
expectedExtractedContent = await fs.readFile(
|
|
517
|
+
path.resolve(__dirname, '../../../test/fixtures/termsFromPDF.md'),
|
|
518
|
+
{ encoding: 'utf8' },
|
|
519
|
+
);
|
|
520
|
+
});
|
|
511
521
|
|
|
512
|
-
|
|
513
|
-
|
|
522
|
+
it('extracts content from the given PDF', async () => {
|
|
523
|
+
expect(await extract({ content: pdfContent, mimeType: mime.getType('pdf') })).to.equal(expectedExtractedContent);
|
|
524
|
+
});
|
|
525
|
+
|
|
526
|
+
context('when PDF contains no text', () => {
|
|
527
|
+
it('throws an ExtractDocumentError error', async () => {
|
|
528
|
+
await expect(extract({ content: await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/termsWithoutText.pdf')), mimeType: mime.getType('pdf') })).to.be.rejectedWith(ExtractDocumentError, /contains no text/);
|
|
529
|
+
});
|
|
530
|
+
});
|
|
514
531
|
});
|
|
515
532
|
});
|
|
516
533
|
});
|