@opentermsarchive/engine 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,13 @@ import { fileURLToPath } from 'url';
4
4
 
5
5
  import chai from 'chai';
6
6
  import jsdom from 'jsdom';
7
+ import mime from 'mime';
7
8
 
8
- import { InaccessibleContentError } from '../errors.js';
9
9
  import SourceDocument from '../services/sourceDocument.js';
10
10
 
11
- import { convertRelativeURLsToAbsolute, extractFromHTML, extractFromPDF } from './index.js';
11
+ import { ExtractDocumentError } from './errors.js';
12
+
13
+ import extract, { convertRelativeURLsToAbsolute } from './index.js';
12
14
 
13
15
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
14
16
  const fs = fsApi.promises;
@@ -28,6 +30,7 @@ const rawHTML = `
28
30
  <p><a id="link1" href="/relative/link">link 1</a></p>
29
31
  <p><a id="link2" href="#anchor">link 2</a></p>
30
32
  <p><a id="link3" href="http://absolute.url/link">link 3</a></p>
33
+ <p><a id="link5" href="http://[INVALID_URL=http://www.example.org/">link 5</a></p>
31
34
  <div id="empty"></div>
32
35
  <div id="whitespaceOnly"> </div>
33
36
  </body>
@@ -40,7 +43,9 @@ const expectedExtracted = `Title
40
43
 
41
44
  [link 2](#anchor)
42
45
 
43
- [link 3](http://absolute.url/link)`;
46
+ [link 3](http://absolute.url/link)
47
+
48
+ [link 5](http://[INVALID_URL=http://www.example.org/)`;
44
49
 
45
50
  const expectedExtractedWithAdditional = `Title
46
51
  =====`;
@@ -92,7 +97,7 @@ const additionalFilter = {
92
97
  link.remove();
93
98
  });
94
99
  },
95
- removeLinksAsync: async function removeLinksAsync(document) {
100
+ removeLinksAsync: function removeLinksAsync(document) {
96
101
  return new Promise(resolve => {
97
102
  setTimeout(() => {
98
103
  const links = document.querySelectorAll('a');
@@ -117,400 +122,412 @@ describe('Extract', () => {
117
122
  subject = Array.from(webPageDOM.querySelectorAll('a[href]')).map(el => el.href);
118
123
  });
119
124
 
120
- it('converts relative urls', async () => {
125
+ it('converts relative urls', () => {
121
126
  expect(subject).to.include('https://exemple.com/relative/link');
122
127
  });
123
128
 
124
- it('leaves absolute urls untouched', async () => {
129
+ it('leaves absolute urls untouched', () => {
125
130
  expect(subject).to.include('http://absolute.url/link');
126
131
  });
127
- });
128
132
 
129
- describe('#extractFromHTML', () => {
130
- describe('Select', () => {
131
- context('with string selector', () => {
132
- it('extracts content from the given HTML with common changing items', async () => {
133
- const result = await extractFromHTML(new SourceDocument({
134
- location: virtualLocation,
135
- contentSelectors: 'body',
136
- content: rawHTMLWithCommonChangingItems,
137
- }));
138
-
139
- expect(result).to.equal(expectedExtractedWithCommonChangingItems);
140
- });
133
+ it('leaves invalid urls untouched', () => {
134
+ expect(subject).to.include('http://[INVALID_URL=http://www.example.org/');
135
+ });
136
+ });
141
137
 
142
- it('extracts content from the given HTML', async () => {
143
- const result = await extractFromHTML(new SourceDocument({
144
- location: virtualLocation,
145
- contentSelectors: 'body',
146
- content: rawHTML,
147
- }));
138
+ describe('#extract', () => {
139
+ context('from HTML content', () => {
140
+ describe('Select', () => {
141
+ context('with string selector', () => {
142
+ it('extracts content from the given HTML with common changing items', async () => {
143
+ const result = await extract(new SourceDocument({
144
+ location: virtualLocation,
145
+ contentSelectors: 'body',
146
+ content: rawHTMLWithCommonChangingItems,
147
+ }));
148
148
 
149
- expect(result).to.equal(expectedExtracted);
150
- });
149
+ expect(result).to.equal(expectedExtractedWithCommonChangingItems);
150
+ });
151
151
 
152
- context('with no match for the given selector', () => {
153
- it('throws an InaccessibleContentError error', async () => {
154
- await expect(extractFromHTML(new SourceDocument({
152
+ it('extracts content from the given HTML', async () => {
153
+ const result = await extract(new SourceDocument({
155
154
  location: virtualLocation,
156
- contentSelectors: '#thisAnchorDoesNotExist',
155
+ contentSelectors: 'body',
157
156
  content: rawHTML,
158
- }))).to.be.rejectedWith(InaccessibleContentError, /#thisAnchorDoesNotExist/);
157
+ }));
158
+
159
+ expect(result).to.equal(expectedExtracted);
159
160
  });
160
- });
161
161
 
162
- context('with no content for the matching given selector', () => {
163
- it('throws an InaccessibleContentError error', async () => {
164
- await expect(extractFromHTML(new SourceDocument({
165
- location: virtualLocation,
166
- contentSelectors: '#empty',
167
- content: rawHTML,
168
- }))).to.be.rejectedWith(InaccessibleContentError, /empty content/);
162
+ context('with no match for the given selector', () => {
163
+ it('throws an ExtractDocumentError error', async () => {
164
+ await expect(extract(new SourceDocument({
165
+ location: virtualLocation,
166
+ contentSelectors: '#thisAnchorDoesNotExist',
167
+ content: rawHTML,
168
+ }))).to.be.rejectedWith(Error, /#thisAnchorDoesNotExist/);
169
+ });
169
170
  });
170
- });
171
171
 
172
- context('with a whitespace only content for the corresponding given selector', () => {
173
- it('throws an InaccessibleContentError error', async () => {
174
- await expect(extractFromHTML(new SourceDocument({
175
- location: virtualLocation,
176
- contentSelectors: '#whitespaceOnly',
177
- content: rawHTML,
178
- }))).to.be.rejectedWith(InaccessibleContentError, /empty content/);
172
+ context('with no content for the matching given selector', () => {
173
+ it('throws an ExtractDocumentError error', async () => {
174
+ await expect(extract(new SourceDocument({
175
+ location: virtualLocation,
176
+ contentSelectors: '#empty',
177
+ content: rawHTML,
178
+ }))).to.be.rejectedWith(ExtractDocumentError, /empty content/);
179
+ });
180
+ });
181
+
182
+ context('with a whitespace only content for the corresponding given selector', () => {
183
+ it('throws an ExtractDocumentError error', async () => {
184
+ await expect(extract(new SourceDocument({
185
+ location: virtualLocation,
186
+ contentSelectors: '#whitespaceOnly',
187
+ content: rawHTML,
188
+ }))).to.be.rejectedWith(ExtractDocumentError, /empty content/);
189
+ });
190
+ });
191
+
192
+ context('with multiple selectors in one string', () => {
193
+ it('extracts content from the given HTML', async () => {
194
+ const result = await extract(new SourceDocument({
195
+ location: virtualLocation,
196
+ contentSelectors: 'h1, #link2',
197
+ content: rawHTML,
198
+ }));
199
+
200
+ expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
201
+ });
179
202
  });
180
203
  });
181
204
 
182
- context('with multiple selectors in one string', () => {
205
+ context('with an array of selectors', () => {
183
206
  it('extracts content from the given HTML', async () => {
184
- const result = await extractFromHTML(new SourceDocument({
185
- location: virtualLocation,
186
- contentSelectors: 'h1, #link2',
207
+ const result = await extract(new SourceDocument({
187
208
  content: rawHTML,
209
+ location: virtualLocation,
210
+ contentSelectors: [ 'h1', '#link2' ],
188
211
  }));
189
212
 
190
213
  expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
191
214
  });
192
- });
193
- });
194
215
 
195
- context('with an array of selectors', () => {
196
- it('extracts content from the given HTML', async () => {
197
- const result = await extractFromHTML(new SourceDocument({
198
- content: rawHTML,
199
- location: virtualLocation,
200
- contentSelectors: [ 'h1', '#link2' ],
201
- }));
216
+ context('when one selector is dependent on another', () => {
217
+ it('extracts content from the given HTML', async () => {
218
+ const result = await extract(new SourceDocument({
219
+ content: rawHTML,
220
+ location: virtualLocation,
221
+ contentSelectors: [ 'h1', 'h1 ~ p' ],
222
+ }));
202
223
 
203
- expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
224
+ expect(result).to.equal('Title\n=====\n\n[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
225
+ });
226
+ });
204
227
  });
205
228
 
206
- context('when one selector is dependent on another', () => {
207
- it('extracts content from the given HTML', async () => {
208
- const result = await extractFromHTML(new SourceDocument({
209
- content: rawHTML,
210
- location: virtualLocation,
211
- contentSelectors: [ 'h1', 'h1 ~ p' ],
212
- }));
229
+ context('with range selector', () => {
230
+ context('with startBefore and endBefore', () => {
231
+ it('extracts content from the given HTML', async () => {
232
+ const result = await extract(new SourceDocument({
233
+ content: rawHTML,
234
+ location: virtualLocation,
235
+ contentSelectors: {
236
+ startBefore: '#link1',
237
+ endBefore: '#link2',
238
+ },
239
+ }));
213
240
 
214
- expect(result).to.equal('Title\n=====\n\n[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
241
+ expect(result).to.equal('[link 1](https://exemple.com/relative/link)');
242
+ });
215
243
  });
216
- });
217
- });
244
+ context('with startBefore and endAfter', () => {
245
+ it('extracts content from the given HTML', async () => {
246
+ const result = await extract(new SourceDocument({
247
+ content: rawHTML,
248
+ location: virtualLocation,
249
+ contentSelectors: {
250
+ startBefore: '#link2',
251
+ endAfter: '#link2',
252
+ },
253
+ }));
218
254
 
219
- context('with range selector', () => {
220
- context('with startBefore and endBefore', () => {
221
- it('extracts content from the given HTML', async () => {
222
- const result = await extractFromHTML(new SourceDocument({
223
- content: rawHTML,
224
- location: virtualLocation,
225
- contentSelectors: {
226
- startBefore: '#link1',
227
- endBefore: '#link2',
228
- },
229
- }));
255
+ expect(result).to.equal('[link 2](#anchor)');
256
+ });
257
+ });
258
+ context('with startAfter and endBefore', () => {
259
+ it('extracts content from the given HTML', async () => {
260
+ const result = await extract(new SourceDocument({
261
+ content: rawHTML,
262
+ location: virtualLocation,
263
+ contentSelectors: {
264
+ startAfter: '#link1',
265
+ endBefore: '#link3',
266
+ },
267
+ }));
230
268
 
231
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)');
269
+ expect(result).to.equal('[link 2](#anchor)');
270
+ });
232
271
  });
233
- });
234
- context('with startBefore and endAfter', () => {
235
- it('extracts content from the given HTML', async () => {
236
- const result = await extractFromHTML(new SourceDocument({
237
- content: rawHTML,
238
- location: virtualLocation,
239
- contentSelectors: {
240
- startBefore: '#link2',
241
- endAfter: '#link2',
242
- },
243
- }));
272
+ context('with startAfter and endAfter', () => {
273
+ it('extracts content from the given HTML', async () => {
274
+ const result = await extract(new SourceDocument({
275
+ content: rawHTML,
276
+ location: virtualLocation,
277
+ contentSelectors: {
278
+ startAfter: '#link2',
279
+ endAfter: '#link3',
280
+ },
281
+ }));
244
282
 
245
- expect(result).to.equal('[link 2](#anchor)');
283
+ expect(result).to.equal('[link 3](http://absolute.url/link)');
284
+ });
285
+ });
286
+ context('with a "start" selector that has no match', () => {
287
+ it('throws an ExtractDocumentError error', async () => {
288
+ await expect(extract(new SourceDocument({
289
+ content: rawHTML,
290
+ location: virtualLocation,
291
+ contentSelectors: {
292
+ startAfter: '#paragraph1',
293
+ endAfter: '#link2',
294
+ },
295
+ }))).to.be.rejectedWith(ExtractDocumentError, /"start" selector has no match/);
296
+ });
297
+ });
298
+ context('with an "end" selector that has no match', () => {
299
+ it('throws an ExtractDocumentError error', async () => {
300
+ await expect(extract(new SourceDocument({
301
+ content: rawHTML,
302
+ location: virtualLocation,
303
+ contentSelectors: {
304
+ startAfter: '#link2',
305
+ endAfter: '#paragraph1',
306
+ },
307
+ }))).to.be.rejectedWith(ExtractDocumentError, /"end" selector has no match/);
308
+ });
246
309
  });
247
310
  });
248
- context('with startAfter and endBefore', () => {
311
+
312
+ context('with an array of range selectors', () => {
249
313
  it('extracts content from the given HTML', async () => {
250
- const result = await extractFromHTML(new SourceDocument({
314
+ const result = await extract(new SourceDocument({
251
315
  content: rawHTML,
252
316
  location: virtualLocation,
253
- contentSelectors: {
254
- startAfter: '#link1',
255
- endBefore: '#link3',
256
- },
317
+ contentSelectors: [
318
+ {
319
+ startAfter: '#link1',
320
+ endAfter: '#link2',
321
+ },
322
+ {
323
+ startAfter: '#link2',
324
+ endAfter: '#link3',
325
+ },
326
+ ],
257
327
  }));
258
328
 
259
- expect(result).to.equal('[link 2](#anchor)');
329
+ expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
260
330
  });
261
331
  });
262
- context('with startAfter and endAfter', () => {
332
+
333
+ context('with an array of mixed string selectors and range selectors', () => {
263
334
  it('extracts content from the given HTML', async () => {
264
- const result = await extractFromHTML(new SourceDocument({
335
+ const result = await extract(new SourceDocument({
265
336
  content: rawHTML,
266
337
  location: virtualLocation,
267
- contentSelectors: {
268
- startAfter: '#link2',
269
- endAfter: '#link3',
270
- },
338
+ contentSelectors: [
339
+ 'h1',
340
+ {
341
+ startAfter: '#link2',
342
+ endAfter: '#link3',
343
+ },
344
+ ],
271
345
  }));
272
346
 
273
- expect(result).to.equal('[link 3](http://absolute.url/link)');
347
+ expect(result).to.equal('Title\n=====\n\n[link 3](http://absolute.url/link)');
274
348
  });
275
349
  });
276
- context('with a "start" selector that has no match', () => {
277
- it('throws an InaccessibleContentError error', async () => {
278
- await expect(extractFromHTML(new SourceDocument({
350
+ });
351
+
352
+ describe('Remove', () => {
353
+ context('with a simple selector', () => {
354
+ it('removes the specified elements', async () => {
355
+ const result = await extract(new SourceDocument({
279
356
  content: rawHTML,
280
357
  location: virtualLocation,
281
- contentSelectors: {
282
- startAfter: '#paragraph1',
283
- endAfter: '#link2',
284
- },
285
- }))).to.be.rejectedWith(InaccessibleContentError, /"start" selector has no match/);
358
+ contentSelectors: 'body',
359
+ insignificantContentSelectors: 'h1',
360
+ }));
361
+
362
+ expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
286
363
  });
287
364
  });
288
- context('with an "end" selector that has no match', () => {
289
- it('throws an InaccessibleContentError error', async () => {
290
- await expect(extractFromHTML(new SourceDocument({
365
+
366
+ context('with an array of string selectors', () => {
367
+ it('removes the specified elements', async () => {
368
+ const result = await extract(new SourceDocument({
291
369
  content: rawHTML,
292
370
  location: virtualLocation,
293
- contentSelectors: {
294
- startAfter: '#link2',
295
- endAfter: '#paragraph1',
296
- },
297
- }))).to.be.rejectedWith(InaccessibleContentError, /"end" selector has no match/);
298
- });
299
- });
300
- });
301
-
302
- context('with an array of range selectors', () => {
303
- it('extracts content from the given HTML', async () => {
304
- const result = await extractFromHTML(new SourceDocument({
305
- content: rawHTML,
306
- location: virtualLocation,
307
- contentSelectors: [
308
- {
309
- startAfter: '#link1',
310
- endAfter: '#link2',
311
- },
312
- {
313
- startAfter: '#link2',
314
- endAfter: '#link3',
315
- },
316
- ],
317
- }));
318
-
319
- expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
320
- });
321
- });
322
-
323
- context('with an array of mixed string selectors and range selectors', () => {
324
- it('extracts content from the given HTML', async () => {
325
- const result = await extractFromHTML(new SourceDocument({
326
- content: rawHTML,
327
- location: virtualLocation,
328
- contentSelectors: [
329
- 'h1',
330
- {
331
- startAfter: '#link2',
332
- endAfter: '#link3',
333
- },
334
- ],
335
- }));
336
-
337
- expect(result).to.equal('Title\n=====\n\n[link 3](http://absolute.url/link)');
338
- });
339
- });
340
- });
341
-
342
- describe('Remove', () => {
343
- context('with a simple selector', () => {
344
- it('removes the specified elements', async () => {
345
- const result = await extractFromHTML(new SourceDocument({
346
- content: rawHTML,
347
- location: virtualLocation,
348
- contentSelectors: 'body',
349
- insignificantContentSelectors: 'h1',
350
- }));
351
-
352
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
353
- });
354
- });
355
-
356
- context('with an array of string selectors', () => {
357
- it('removes the specified elements', async () => {
358
- const result = await extractFromHTML(new SourceDocument({
359
- content: rawHTML,
360
- location: virtualLocation,
361
- contentSelectors: 'body',
362
- insignificantContentSelectors: [ 'h1', '#link3' ],
363
- }));
371
+ contentSelectors: 'body',
372
+ insignificantContentSelectors: [ 'h1', '#link3', '#link5' ],
373
+ }));
364
374
 
365
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
375
+ expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
376
+ });
366
377
  });
367
- });
368
378
 
369
- context('with a simple range selector', () => {
370
- it('removes the specified elements', async () => {
371
- const result = await extractFromHTML(new SourceDocument({
372
- content: rawHTML,
373
- location: virtualLocation,
374
- contentSelectors: 'body',
375
- insignificantContentSelectors: {
376
- startBefore: '#link1',
377
- endAfter: '#link3',
378
- },
379
- }));
380
-
381
- expect(result).to.equal('Title\n=====');
382
- });
383
- context('with a "start" selector that has no match', () => {
384
- it('throws an InaccessibleContentError error', async () => {
385
- await expect(extractFromHTML(new SourceDocument({
379
+ context('with a simple range selector', () => {
380
+ it('removes the specified elements', async () => {
381
+ const result = await extract(new SourceDocument({
386
382
  content: rawHTML,
387
383
  location: virtualLocation,
388
384
  contentSelectors: 'body',
389
385
  insignificantContentSelectors: {
390
- startAfter: '#paragraph1',
391
- endAfter: '#link2',
386
+ startBefore: '#link1',
387
+ endAfter: '#link5',
392
388
  },
393
- }))).to.be.rejectedWith(InaccessibleContentError, /"start" selector has no match/);
389
+ }));
390
+
391
+ expect(result).to.equal('Title\n=====');
392
+ });
393
+ context('with a "start" selector that has no match', () => {
394
+ it('throws an ExtractDocumentError error', async () => {
395
+ await expect(extract(new SourceDocument({
396
+ content: rawHTML,
397
+ location: virtualLocation,
398
+ contentSelectors: 'body',
399
+ insignificantContentSelectors: {
400
+ startAfter: '#paragraph1',
401
+ endAfter: '#link2',
402
+ },
403
+ }))).to.be.rejectedWith(ExtractDocumentError, /"start" selector has no match/);
404
+ });
405
+ });
406
+ context('with an "end" selector that has no match', () => {
407
+ it('throws an ExtractDocumentError error', async () => {
408
+ await expect(extract(new SourceDocument({
409
+ content: rawHTML,
410
+ location: virtualLocation,
411
+ contentSelectors: 'body',
412
+ insignificantContentSelectors: {
413
+ startAfter: '#link2',
414
+ endAfter: '#paragraph1',
415
+ },
416
+ }))).to.be.rejectedWith(ExtractDocumentError, /"end" selector has no match/);
417
+ });
394
418
  });
395
419
  });
396
- context('with an "end" selector that has no match', () => {
397
- it('throws an InaccessibleContentError error', async () => {
398
- await expect(extractFromHTML(new SourceDocument({
420
+ context('with an array of range selectors', () => {
421
+ it('removes all the selections', async () => {
422
+ const result = await extract(new SourceDocument({
399
423
  content: rawHTML,
400
424
  location: virtualLocation,
401
425
  contentSelectors: 'body',
402
- insignificantContentSelectors: {
403
- startAfter: '#link2',
404
- endAfter: '#paragraph1',
405
- },
406
- }))).to.be.rejectedWith(InaccessibleContentError, /"end" selector has no match/);
407
- });
408
- });
409
- });
410
- context('with an array of range selectors', () => {
411
- it('removes all the selections', async () => {
412
- const result = await extractFromHTML(new SourceDocument({
413
- content: rawHTML,
414
- location: virtualLocation,
415
- contentSelectors: 'body',
416
- insignificantContentSelectors: [
417
- {
418
- startBefore: 'h1',
419
- endBefore: '#link1',
420
- },
421
- {
422
- startBefore: '#link3',
423
- endAfter: '#link3',
424
- },
425
- ],
426
- }));
427
-
428
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
429
- });
430
- });
431
-
432
- context('with an array of mixed selectors and range selectors', () => {
433
- it('removes all the selections', async () => {
434
- const result = await extractFromHTML(new SourceDocument({
435
- content: rawHTML,
436
- location: virtualLocation,
437
- contentSelectors: 'body',
438
- insignificantContentSelectors: [
439
- 'h1',
440
- {
441
- startBefore: '#link3',
442
- endAfter: '#link3',
443
- },
444
- ],
445
- }));
426
+ insignificantContentSelectors: [
427
+ {
428
+ startBefore: 'h1',
429
+ endBefore: '#link1',
430
+ },
431
+ {
432
+ startBefore: '#link3',
433
+ endAfter: '#link5',
434
+ },
435
+ ],
436
+ }));
446
437
 
447
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
438
+ expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
439
+ });
448
440
  });
449
441
 
450
- context('where one selector is dependent on another', () => {
442
+ context('with an array of mixed selectors and range selectors', () => {
451
443
  it('removes all the selections', async () => {
452
- const result = await extractFromHTML(new SourceDocument({
444
+ const result = await extract(new SourceDocument({
453
445
  content: rawHTML,
454
446
  location: virtualLocation,
455
447
  contentSelectors: 'body',
456
448
  insignificantContentSelectors: [
457
449
  'h1',
458
450
  {
459
- startAfter: 'h1',
460
- endBefore: '#link2',
451
+ startBefore: '#link3',
452
+ endAfter: '#link5',
461
453
  },
462
454
  ],
463
455
  }));
464
456
 
465
- expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
457
+ expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
458
+ });
459
+
460
+ context('where one selector is dependent on another', () => {
461
+ it('removes all the selections', async () => {
462
+ const result = await extract(new SourceDocument({
463
+ content: rawHTML,
464
+ location: virtualLocation,
465
+ contentSelectors: 'body',
466
+ insignificantContentSelectors: [
467
+ 'h1',
468
+ {
469
+ startAfter: 'h1',
470
+ endBefore: '#link2',
471
+ },
472
+ ],
473
+ }));
474
+
475
+ expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
476
+ });
466
477
  });
467
478
  });
468
479
  });
469
- });
470
480
 
471
- describe('Filter', () => {
472
- context('with a synchronous filter', () => {
473
- it('extracts content from the given HTML also with given additional filter', async () => {
474
- const result = await extractFromHTML(new SourceDocument({
475
- content: rawHTML,
476
- location: virtualLocation,
477
- contentSelectors: 'body',
478
- filters: [additionalFilter.removeLinks],
479
- }));
480
-
481
- expect(result).to.equal(expectedExtractedWithAdditional);
481
+ describe('Filter', () => {
482
+ context('with a synchronous filter', () => {
483
+ it('extracts content from the given HTML also with given additional filter', async () => {
484
+ const result = await extract(new SourceDocument({
485
+ content: rawHTML,
486
+ location: virtualLocation,
487
+ contentSelectors: 'body',
488
+ filters: [additionalFilter.removeLinks],
489
+ }));
490
+
491
+ expect(result).to.equal(expectedExtractedWithAdditional);
492
+ });
482
493
  });
483
- });
484
494
 
485
- context('with an asynchronous filter', () => {
486
- it('extracts content from the given HTML also with given additional filter', async () => {
487
- const result = await extractFromHTML(new SourceDocument({
488
- content: rawHTML,
489
- location: virtualLocation,
490
- contentSelectors: 'body',
491
- filters: [additionalFilter.removeLinksAsync],
492
- }));
495
+ context('with an asynchronous filter', () => {
496
+ it('extracts content from the given HTML also with given additional filter', async () => {
497
+ const result = await extract(new SourceDocument({
498
+ content: rawHTML,
499
+ location: virtualLocation,
500
+ contentSelectors: 'body',
501
+ filters: [additionalFilter.removeLinksAsync],
502
+ }));
493
503
 
494
- expect(result).to.equal(expectedExtractedWithAdditional);
504
+ expect(result).to.equal(expectedExtractedWithAdditional);
505
+ });
495
506
  });
496
507
  });
497
508
  });
498
- });
499
509
 
500
- describe('#extractFromPDF', () => {
501
- let pdfContent;
502
- let expectedExtractedContent;
510
+ context('from PDF content', () => {
511
+ let pdfContent;
512
+ let expectedExtractedContent;
503
513
 
504
- before(async () => {
505
- pdfContent = await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
506
- expectedExtractedContent = await fs.readFile(
507
- path.resolve(__dirname, '../../../test/fixtures/termsFromPDF.md'),
508
- { encoding: 'utf8' },
509
- );
510
- });
514
+ before(async () => {
515
+ pdfContent = await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
516
+ expectedExtractedContent = await fs.readFile(
517
+ path.resolve(__dirname, '../../../test/fixtures/termsFromPDF.md'),
518
+ { encoding: 'utf8' },
519
+ );
520
+ });
511
521
 
512
- it('extracts content from the given PDF', async () => {
513
- expect(await extractFromPDF({ content: pdfContent })).to.equal(expectedExtractedContent);
522
+ it('extracts content from the given PDF', async () => {
523
+ expect(await extract({ content: pdfContent, mimeType: mime.getType('pdf') })).to.equal(expectedExtractedContent);
524
+ });
525
+
526
+ context('when PDF contains no text', () => {
527
+ it('throws an ExtractDocumentError error', async () => {
528
+ await expect(extract({ content: await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/termsNoText.pdf')), mimeType: mime.getType('pdf') })).to.be.rejectedWith(ExtractDocumentError, /contains no text/);
529
+ });
530
+ });
514
531
  });
515
532
  });
516
533
  });