npm - @opentermsarchive/engine - Versions diffs - 1.1.2 → 1.2.0 - Mend

@opentermsarchive/engine 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/.eslintrc.yaml +2 -0
package/package.json +2 -1
package/scripts/declarations/utils/fixtures/serviceATermsUpdated.history.json +9 -0
package/scripts/declarations/utils/index.js +4 -0
package/scripts/declarations/utils/index.test.js +12 -4
package/scripts/declarations/validate/index.mocha.js +1 -1
package/src/archivist/extract/errors.js +6 -0
package/src/archivist/extract/index.js +32 -16
package/src/archivist/extract/index.test.js +319 -302
package/src/archivist/fetcher/errors.js +1 -1
package/src/archivist/fetcher/fullDomFetcher.js +4 -6
package/src/archivist/fetcher/htmlOnlyFetcher.js +6 -7
package/src/archivist/fetcher/index.js +9 -4
package/src/archivist/fetcher/index.test.js +24 -13
package/src/archivist/index.js +37 -13
package/src/archivist/index.test.js +22 -22
package/src/archivist/services/service.js +12 -6
package/src/archivist/services/service.test.js +60 -39
package/src/logger/index.js +3 -3
package/src/reporter/index.js +4 -2
package/src/reporter/labels.json +10 -0

package/src/archivist/extract/index.test.js CHANGED

@@ -4,11 +4,13 @@ import { fileURLToPath } from 'url';
 import chai from 'chai';
 import jsdom from 'jsdom';
+import mime from 'mime';
-import { InaccessibleContentError } from '../errors.js';
 import SourceDocument from '../services/sourceDocument.js';
-import { convertRelativeURLsToAbsolute, extractFromHTML, extractFromPDF } from './index.js';
+import { ExtractDocumentError } from './errors.js';
+import extract, { convertRelativeURLsToAbsolute } from './index.js';
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const fs = fsApi.promises;
@@ -28,6 +30,7 @@ const rawHTML = `
     <p><a id="link1" href="/relative/link">link 1</a></p>
     <p><a id="link2" href="#anchor">link 2</a></p>
     <p><a id="link3" href="http://absolute.url/link">link 3</a></p>
+    <p><a id="link5" href="http://[INVALID_URL=http://www.example.org/">link 5</a></p>
     <div id="empty"></div>
     <div id="whitespaceOnly"> </div>
   </body>
@@ -40,7 +43,9 @@ const expectedExtracted = `Title
 [link 2](#anchor)
-[link 3](http://absolute.url/link)`;
+[link 3](http://absolute.url/link)
+[link 5](http://[INVALID_URL=http://www.example.org/)`;
 const expectedExtractedWithAdditional = `Title
 =====`;
@@ -92,7 +97,7 @@ const additionalFilter = {
       link.remove();
     });
   },
-  removeLinksAsync: async function removeLinksAsync(document) {
+  removeLinksAsync: function removeLinksAsync(document) {
     return new Promise(resolve => {
       setTimeout(() => {
         const links = document.querySelectorAll('a');
@@ -117,400 +122,412 @@ describe('Extract', () => {
       subject = Array.from(webPageDOM.querySelectorAll('a[href]')).map(el => el.href);
     });
-    it('converts relative urls', async () => {
+    it('converts relative urls', () => {
       expect(subject).to.include('https://exemple.com/relative/link');
     });
-    it('leaves absolute urls untouched', async () => {
+    it('leaves absolute urls untouched', () => {
       expect(subject).to.include('http://absolute.url/link');
     });
-  });
-  describe('#extractFromHTML', () => {
-    describe('Select', () => {
-      context('with string selector', () => {
-        it('extracts content from the given HTML with common changing items', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            location: virtualLocation,
-            contentSelectors: 'body',
-            content: rawHTMLWithCommonChangingItems,
-          }));
-          expect(result).to.equal(expectedExtractedWithCommonChangingItems);
-        });
+    it('leaves invalid urls untouched', () => {
+      expect(subject).to.include('http://[INVALID_URL=http://www.example.org/');
+    });
+  });
-        it('extracts content from the given HTML', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            location: virtualLocation,
-            contentSelectors: 'body',
-            content: rawHTML,
-          }));
+  describe('#extract', () => {
+    context('from HTML content', () => {
+      describe('Select', () => {
+        context('with string selector', () => {
+          it('extracts content from the given HTML with common changing items', async () => {
+            const result = await extract(new SourceDocument({
+              location: virtualLocation,
+              contentSelectors: 'body',
+              content: rawHTMLWithCommonChangingItems,
+            }));
-          expect(result).to.equal(expectedExtracted);
-        });
+            expect(result).to.equal(expectedExtractedWithCommonChangingItems);
+          });
-        context('with no match for the given selector', () => {
-          it('throws an InaccessibleContentError error', async () => {
-            await expect(extractFromHTML(new SourceDocument({
+          it('extracts content from the given HTML', async () => {
+            const result = await extract(new SourceDocument({
               location: virtualLocation,
-              contentSelectors: '#thisAnchorDoesNotExist',
+              contentSelectors: 'body',
               content: rawHTML,
-            }))).to.be.rejectedWith(InaccessibleContentError, /#thisAnchorDoesNotExist/);
+            }));
+            expect(result).to.equal(expectedExtracted);
           });
-        });
-        context('with no content for the matching given selector', () => {
-          it('throws an InaccessibleContentError error', async () => {
-            await expect(extractFromHTML(new SourceDocument({
-              location: virtualLocation,
-              contentSelectors: '#empty',
-              content: rawHTML,
-            }))).to.be.rejectedWith(InaccessibleContentError, /empty content/);
+          context('with no match for the given selector', () => {
+            it('throws an ExtractDocumentError error', async () => {
+              await expect(extract(new SourceDocument({
+                location: virtualLocation,
+                contentSelectors: '#thisAnchorDoesNotExist',
+                content: rawHTML,
+              }))).to.be.rejectedWith(Error, /#thisAnchorDoesNotExist/);
+            });
           });
-        });
-        context('with a whitespace only content for the corresponding given selector', () => {
-          it('throws an InaccessibleContentError error', async () => {
-            await expect(extractFromHTML(new SourceDocument({
-              location: virtualLocation,
-              contentSelectors: '#whitespaceOnly',
-              content: rawHTML,
-            }))).to.be.rejectedWith(InaccessibleContentError, /empty content/);
+          context('with no content for the matching given selector', () => {
+            it('throws an ExtractDocumentError error', async () => {
+              await expect(extract(new SourceDocument({
+                location: virtualLocation,
+                contentSelectors: '#empty',
+                content: rawHTML,
+              }))).to.be.rejectedWith(ExtractDocumentError, /empty content/);
+            });
+          });
+          context('with a whitespace only content for the corresponding given selector', () => {
+            it('throws an ExtractDocumentError error', async () => {
+              await expect(extract(new SourceDocument({
+                location: virtualLocation,
+                contentSelectors: '#whitespaceOnly',
+                content: rawHTML,
+              }))).to.be.rejectedWith(ExtractDocumentError, /empty content/);
+            });
+          });
+          context('with multiple selectors in one string', () => {
+            it('extracts content from the given HTML', async () => {
+              const result = await extract(new SourceDocument({
+                location: virtualLocation,
+                contentSelectors: 'h1, #link2',
+                content: rawHTML,
+              }));
+              expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
+            });
           });
         });
-        context('with multiple selectors in one string', () => {
+        context('with an array of selectors', () => {
           it('extracts content from the given HTML', async () => {
-            const result = await extractFromHTML(new SourceDocument({
-              location: virtualLocation,
-              contentSelectors: 'h1, #link2',
+            const result = await extract(new SourceDocument({
               content: rawHTML,
+              location: virtualLocation,
+              contentSelectors: [ 'h1', '#link2' ],
             }));
             expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
           });
-        });
-      });
-      context('with an array of selectors', () => {
-        it('extracts content from the given HTML', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: [ 'h1', '#link2' ],
-          }));
+          context('when one selector is dependent on another', () => {
+            it('extracts content from the given HTML', async () => {
+              const result = await extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: [ 'h1', 'h1 ~ p' ],
+              }));
-          expect(result).to.equal('Title\n=====\n\n[link 2](#anchor)');
+              expect(result).to.equal('Title\n=====\n\n[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
+            });
+          });
         });
-        context('when one selector is dependent on another', () => {
-          it('extracts content from the given HTML', async () => {
-            const result = await extractFromHTML(new SourceDocument({
-              content: rawHTML,
-              location: virtualLocation,
-              contentSelectors: [ 'h1', 'h1 ~ p' ],
-            }));
+        context('with range selector', () => {
+          context('with startBefore and endBefore', () => {
+            it('extracts content from the given HTML', async () => {
+              const result = await extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: {
+                  startBefore: '#link1',
+                  endBefore: '#link2',
+                },
+              }));
-            expect(result).to.equal('Title\n=====\n\n[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
+              expect(result).to.equal('[link 1](https://exemple.com/relative/link)');
+            });
           });
-        });
-      });
+          context('with startBefore and endAfter', () => {
+            it('extracts content from the given HTML', async () => {
+              const result = await extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: {
+                  startBefore: '#link2',
+                  endAfter: '#link2',
+                },
+              }));
-      context('with range selector', () => {
-        context('with startBefore and endBefore', () => {
-          it('extracts content from the given HTML', async () => {
-            const result = await extractFromHTML(new SourceDocument({
-              content: rawHTML,
-              location: virtualLocation,
-              contentSelectors: {
-                startBefore: '#link1',
-                endBefore: '#link2',
-              },
-            }));
+              expect(result).to.equal('[link 2](#anchor)');
+            });
+          });
+          context('with startAfter and endBefore', () => {
+            it('extracts content from the given HTML', async () => {
+              const result = await extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: {
+                  startAfter: '#link1',
+                  endBefore: '#link3',
+                },
+              }));
-            expect(result).to.equal('[link 1](https://exemple.com/relative/link)');
+              expect(result).to.equal('[link 2](#anchor)');
+            });
           });
-        });
-        context('with startBefore and endAfter', () => {
-          it('extracts content from the given HTML', async () => {
-            const result = await extractFromHTML(new SourceDocument({
-              content: rawHTML,
-              location: virtualLocation,
-              contentSelectors: {
-                startBefore: '#link2',
-                endAfter: '#link2',
-              },
-            }));
+          context('with startAfter and endAfter', () => {
+            it('extracts content from the given HTML', async () => {
+              const result = await extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: {
+                  startAfter: '#link2',
+                  endAfter: '#link3',
+                },
+              }));
-            expect(result).to.equal('[link 2](#anchor)');
+              expect(result).to.equal('[link 3](http://absolute.url/link)');
+            });
+          });
+          context('with a "start" selector that has no match', () => {
+            it('throws an ExtractDocumentError error', async () => {
+              await expect(extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: {
+                  startAfter: '#paragraph1',
+                  endAfter: '#link2',
+                },
+              }))).to.be.rejectedWith(ExtractDocumentError, /"start" selector has no match/);
+            });
+          });
+          context('with an "end" selector that has no match', () => {
+            it('throws an ExtractDocumentError error', async () => {
+              await expect(extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: {
+                  startAfter: '#link2',
+                  endAfter: '#paragraph1',
+                },
+              }))).to.be.rejectedWith(ExtractDocumentError, /"end" selector has no match/);
+            });
           });
         });
-        context('with startAfter and endBefore', () => {
+        context('with an array of range selectors', () => {
           it('extracts content from the given HTML', async () => {
-            const result = await extractFromHTML(new SourceDocument({
+            const result = await extract(new SourceDocument({
               content: rawHTML,
               location: virtualLocation,
-              contentSelectors: {
-                startAfter: '#link1',
-                endBefore: '#link3',
-              },
+              contentSelectors: [
+                {
+                  startAfter: '#link1',
+                  endAfter: '#link2',
+                },
+                {
+                  startAfter: '#link2',
+                  endAfter: '#link3',
+                },
+              ],
             }));
-            expect(result).to.equal('[link 2](#anchor)');
+            expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
           });
         });
-        context('with startAfter and endAfter', () => {
+        context('with an array of mixed string selectors and range selectors', () => {
           it('extracts content from the given HTML', async () => {
-            const result = await extractFromHTML(new SourceDocument({
+            const result = await extract(new SourceDocument({
               content: rawHTML,
               location: virtualLocation,
-              contentSelectors: {
-                startAfter: '#link2',
-                endAfter: '#link3',
-              },
+              contentSelectors: [
+                'h1',
+                {
+                  startAfter: '#link2',
+                  endAfter: '#link3',
+                },
+              ],
             }));
-            expect(result).to.equal('[link 3](http://absolute.url/link)');
+            expect(result).to.equal('Title\n=====\n\n[link 3](http://absolute.url/link)');
           });
         });
-        context('with a "start" selector that has no match', () => {
-          it('throws an InaccessibleContentError error', async () => {
-            await expect(extractFromHTML(new SourceDocument({
+      });
+      describe('Remove', () => {
+        context('with a simple selector', () => {
+          it('removes the specified elements', async () => {
+            const result = await extract(new SourceDocument({
               content: rawHTML,
               location: virtualLocation,
-              contentSelectors: {
-                startAfter: '#paragraph1',
-                endAfter: '#link2',
-              },
-            }))).to.be.rejectedWith(InaccessibleContentError, /"start" selector has no match/);
+              contentSelectors: 'body',
+              insignificantContentSelectors: 'h1',
+            }));
+            expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
           });
         });
-        context('with an "end" selector that has no match', () => {
-          it('throws an InaccessibleContentError error', async () => {
-            await expect(extractFromHTML(new SourceDocument({
+        context('with an array of string selectors', () => {
+          it('removes the specified elements', async () => {
+            const result = await extract(new SourceDocument({
               content: rawHTML,
               location: virtualLocation,
-              contentSelectors: {
-                startAfter: '#link2',
-                endAfter: '#paragraph1',
-              },
-            }))).to.be.rejectedWith(InaccessibleContentError, /"end" selector has no match/);
-          });
-        });
-      });
-      context('with an array of range selectors', () => {
-        it('extracts content from the given HTML', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: [
-              {
-                startAfter: '#link1',
-                endAfter: '#link2',
-              },
-              {
-                startAfter: '#link2',
-                endAfter: '#link3',
-              },
-            ],
-          }));
-          expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
-        });
-      });
-      context('with an array of mixed string selectors and range selectors', () => {
-        it('extracts content from the given HTML', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: [
-              'h1',
-              {
-                startAfter: '#link2',
-                endAfter: '#link3',
-              },
-            ],
-          }));
-          expect(result).to.equal('Title\n=====\n\n[link 3](http://absolute.url/link)');
-        });
-      });
-    });
-    describe('Remove', () => {
-      context('with a simple selector', () => {
-        it('removes the specified elements', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: 'body',
-            insignificantContentSelectors: 'h1',
-          }));
-          expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
-        });
-      });
-      context('with an array of string selectors', () => {
-        it('removes the specified elements', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: 'body',
-            insignificantContentSelectors: [ 'h1', '#link3' ],
-          }));
+              contentSelectors: 'body',
+              insignificantContentSelectors: [ 'h1', '#link3', '#link5' ],
+            }));
-          expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
+            expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
+          });
         });
-      });
-      context('with a simple range selector', () => {
-        it('removes the specified elements', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: 'body',
-            insignificantContentSelectors: {
-              startBefore: '#link1',
-              endAfter: '#link3',
-            },
-          }));
-          expect(result).to.equal('Title\n=====');
-        });
-        context('with a "start" selector that has no match', () => {
-          it('throws an InaccessibleContentError error', async () => {
-            await expect(extractFromHTML(new SourceDocument({
+        context('with a simple range selector', () => {
+          it('removes the specified elements', async () => {
+            const result = await extract(new SourceDocument({
               content: rawHTML,
               location: virtualLocation,
               contentSelectors: 'body',
               insignificantContentSelectors: {
-                startAfter: '#paragraph1',
-                endAfter: '#link2',
+                startBefore: '#link1',
+                endAfter: '#link5',
               },
-            }))).to.be.rejectedWith(InaccessibleContentError, /"start" selector has no match/);
+            }));
+            expect(result).to.equal('Title\n=====');
+          });
+          context('with a "start" selector that has no match', () => {
+            it('throws an ExtractDocumentError error', async () => {
+              await expect(extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: 'body',
+                insignificantContentSelectors: {
+                  startAfter: '#paragraph1',
+                  endAfter: '#link2',
+                },
+              }))).to.be.rejectedWith(ExtractDocumentError, /"start" selector has no match/);
+            });
+          });
+          context('with an "end" selector that has no match', () => {
+            it('throws an ExtractDocumentError error', async () => {
+              await expect(extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: 'body',
+                insignificantContentSelectors: {
+                  startAfter: '#link2',
+                  endAfter: '#paragraph1',
+                },
+              }))).to.be.rejectedWith(ExtractDocumentError, /"end" selector has no match/);
+            });
           });
         });
-        context('with an "end" selector that has no match', () => {
-          it('throws an InaccessibleContentError error', async () => {
-            await expect(extractFromHTML(new SourceDocument({
+        context('with an array of range selectors', () => {
+          it('removes all the selections', async () => {
+            const result = await extract(new SourceDocument({
               content: rawHTML,
               location: virtualLocation,
               contentSelectors: 'body',
-              insignificantContentSelectors: {
-                startAfter: '#link2',
-                endAfter: '#paragraph1',
-              },
-            }))).to.be.rejectedWith(InaccessibleContentError, /"end" selector has no match/);
-          });
-        });
-      });
-      context('with an array of range selectors', () => {
-        it('removes all the selections', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: 'body',
-            insignificantContentSelectors: [
-              {
-                startBefore: 'h1',
-                endBefore: '#link1',
-              },
-              {
-                startBefore: '#link3',
-                endAfter: '#link3',
-              },
-            ],
-          }));
-          expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
-        });
-      });
-      context('with an array of mixed selectors and range selectors', () => {
-        it('removes all the selections', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: 'body',
-            insignificantContentSelectors: [
-              'h1',
-              {
-                startBefore: '#link3',
-                endAfter: '#link3',
-              },
-            ],
-          }));
+              insignificantContentSelectors: [
+                {
+                  startBefore: 'h1',
+                  endBefore: '#link1',
+                },
+                {
+                  startBefore: '#link3',
+                  endAfter: '#link5',
+                },
+              ],
+            }));
-          expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
+            expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
+          });
         });
-        context('where one selector is dependent on another', () => {
+        context('with an array of mixed selectors and range selectors', () => {
           it('removes all the selections', async () => {
-            const result = await extractFromHTML(new SourceDocument({
+            const result = await extract(new SourceDocument({
               content: rawHTML,
               location: virtualLocation,
               contentSelectors: 'body',
               insignificantContentSelectors: [
                 'h1',
                 {
-                  startAfter: 'h1',
-                  endBefore: '#link2',
+                  startBefore: '#link3',
+                  endAfter: '#link5',
                 },
               ],
             }));
-            expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)');
+            expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
+          });
+          context('where one selector is dependent on another', () => {
+            it('removes all the selections', async () => {
+              const result = await extract(new SourceDocument({
+                content: rawHTML,
+                location: virtualLocation,
+                contentSelectors: 'body',
+                insignificantContentSelectors: [
+                  'h1',
+                  {
+                    startAfter: 'h1',
+                    endBefore: '#link2',
+                  },
+                ],
+              }));
+              expect(result).to.equal('[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
+            });
           });
         });
       });
-    });
-    describe('Filter', () => {
-      context('with a synchronous filter', () => {
-        it('extracts content from the given HTML also with given additional filter', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: 'body',
-            filters: [additionalFilter.removeLinks],
-          }));
-          expect(result).to.equal(expectedExtractedWithAdditional);
+      describe('Filter', () => {
+        context('with a synchronous filter', () => {
+          it('extracts content from the given HTML also with given additional filter', async () => {
+            const result = await extract(new SourceDocument({
+              content: rawHTML,
+              location: virtualLocation,
+              contentSelectors: 'body',
+              filters: [additionalFilter.removeLinks],
+            }));
+            expect(result).to.equal(expectedExtractedWithAdditional);
+          });
         });
-      });
-      context('with an asynchronous filter', () => {
-        it('extracts content from the given HTML also with given additional filter', async () => {
-          const result = await extractFromHTML(new SourceDocument({
-            content: rawHTML,
-            location: virtualLocation,
-            contentSelectors: 'body',
-            filters: [additionalFilter.removeLinksAsync],
-          }));
+        context('with an asynchronous filter', () => {
+          it('extracts content from the given HTML also with given additional filter', async () => {
+            const result = await extract(new SourceDocument({
+              content: rawHTML,
+              location: virtualLocation,
+              contentSelectors: 'body',
+              filters: [additionalFilter.removeLinksAsync],
+            }));
-          expect(result).to.equal(expectedExtractedWithAdditional);
+            expect(result).to.equal(expectedExtractedWithAdditional);
+          });
         });
       });
     });
-  });
-  describe('#extractFromPDF', () => {
-    let pdfContent;
-    let expectedExtractedContent;
+    context('from PDF content', () => {
+      let pdfContent;
+      let expectedExtractedContent;
-    before(async () => {
-      pdfContent = await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
-      expectedExtractedContent = await fs.readFile(
-        path.resolve(__dirname, '../../../test/fixtures/termsFromPDF.md'),
-        { encoding: 'utf8' },
-      );
-    });
+      before(async () => {
+        pdfContent = await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
+        expectedExtractedContent = await fs.readFile(
+          path.resolve(__dirname, '../../../test/fixtures/termsFromPDF.md'),
+          { encoding: 'utf8' },
+        );
+      });
-    it('extracts content from the given PDF', async () => {
-      expect(await extractFromPDF({ content: pdfContent })).to.equal(expectedExtractedContent);
+      it('extracts content from the given PDF', async () => {
+        expect(await extract({ content: pdfContent, mimeType: mime.getType('pdf') })).to.equal(expectedExtractedContent);
+      });
+      context('when PDF contains no text', () => {
+        it('throws an ExtractDocumentError error', async () => {
+          await expect(extract({ content: await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/termsNoText.pdf')), mimeType: mime.getType('pdf') })).to.be.rejectedWith(ExtractDocumentError, /contains no text/);
+        });
+      });
     });
   });
 });