@opentermsarchive/engine 7.1.0 → 7.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -0,0 +1,194 @@
1
+ import { expect } from 'chai';
2
+
3
+ import createWebPageDOM from './dom.js';
4
+ import filter from './filter.js';
5
+
6
+ const delay = ms => new Promise(resolve => { setTimeout(resolve, ms); });
7
+
8
+ describe('Filter', () => {
9
+ let webPageDOM;
10
+ let sourceDocument;
11
+ const BASE_URL = 'https://example.com';
12
+
13
+ before(() => {
14
+ webPageDOM = createWebPageDOM('<!DOCTYPE html><html><body></body></html>', BASE_URL);
15
+ sourceDocument = {
16
+ location: BASE_URL,
17
+ contentSelectors: [],
18
+ insignificantContentSelectors: [],
19
+ filters: [],
20
+ removeQueryParams: [],
21
+ };
22
+ });
23
+
24
+ describe('#filter', () => {
25
+ it('returns the webPageDOM', async () => {
26
+ const result = await filter(webPageDOM, sourceDocument);
27
+
28
+ expect(result).to.equal(webPageDOM);
29
+ });
30
+
31
+ describe('with custom filters', () => {
32
+ let receivedContext;
33
+
34
+ const contentFilter = (dom, { select }) => {
35
+ const element = dom.querySelector(select[0]);
36
+
37
+ if (element) {
38
+ element.innerHTML = 'Filtered content';
39
+ }
40
+ };
41
+
42
+ const appendFilter = (dom, { select }) => {
43
+ const element = dom.querySelector(select[0]);
44
+
45
+ if (element) {
46
+ element.innerHTML += ' + Appended content';
47
+ }
48
+ };
49
+
50
+ const failingFilter = () => {
51
+ throw new Error('Filter failed');
52
+ };
53
+
54
+ const contextSpyFilter = (dom, context) => {
55
+ receivedContext = context;
56
+ };
57
+
58
+ const asyncFilter = async (dom, { select }) => {
59
+ const element = dom.querySelector(select[0]);
60
+
61
+ if (element) {
62
+ await delay(100);
63
+ element.innerHTML = 'Async content';
64
+ }
65
+ };
66
+
67
+ before(() => {
68
+ const div = webPageDOM.createElement('div');
69
+
70
+ div.className = 'custom-content';
71
+ webPageDOM.body.appendChild(div);
72
+
73
+ sourceDocument.contentSelectors = ['.custom-content'];
74
+ });
75
+
76
+ it('applies single filter to content', async () => {
77
+ sourceDocument.filters = [contentFilter];
78
+
79
+ await filter(webPageDOM, sourceDocument);
80
+
81
+ expect(webPageDOM.querySelector('.custom-content').innerHTML).to.equal('Filtered content');
82
+ });
83
+
84
+ it('applies filters in sequence', async () => {
85
+ sourceDocument.filters = [ contentFilter, appendFilter ];
86
+
87
+ await filter(webPageDOM, sourceDocument);
88
+
89
+ expect(webPageDOM.querySelector('.custom-content').innerHTML).to.equal('Filtered content + Appended content');
90
+ });
91
+
92
+ it('applies async filter and waits for completion', async () => {
93
+ sourceDocument.filters = [asyncFilter];
94
+
95
+ await filter(webPageDOM, sourceDocument);
96
+
97
+ expect(webPageDOM.querySelector('.custom-content').innerHTML).to.equal('Async content');
98
+ });
99
+
100
+ it('throws error on filter failure', async () => {
101
+ sourceDocument.filters = [failingFilter];
102
+
103
+ await expect(filter(webPageDOM, sourceDocument)).to.be.rejectedWith('The filter function "failingFilter" failed: Error: Filter failed');
104
+ });
105
+
106
+ describe('filter parameters', () => {
107
+ before(async () => {
108
+ sourceDocument.filters = [contextSpyFilter];
109
+ sourceDocument.contentSelectors = ['.custom-content'];
110
+ sourceDocument.insignificantContentSelectors = ['.insignificant'];
111
+
112
+ await filter(webPageDOM, sourceDocument);
113
+ });
114
+
115
+ it('provides content selectors', () => {
116
+ expect(receivedContext.select).to.deep.equal(['.custom-content']);
117
+ });
118
+
119
+ it('provides insignificant selectors', () => {
120
+ expect(receivedContext.remove).to.deep.equal(['.insignificant']);
121
+ });
122
+
123
+ it('provides location', () => {
124
+ expect(receivedContext.fetch).to.equal(BASE_URL);
125
+ });
126
+
127
+ it('provides filters list', () => {
128
+ expect(receivedContext.filter).to.deep.equal(['contextSpyFilter']);
129
+ });
130
+ });
131
+ });
132
+ });
133
+
134
+ describe('#convertRelativeURLsToAbsolute', () => {
135
+ let link;
136
+
137
+ before(() => {
138
+ link = webPageDOM.createElement('a');
139
+ webPageDOM.body.appendChild(link);
140
+ });
141
+
142
+ it('converts relative URLs to absolute', async () => {
143
+ link.href = '/path/to/page';
144
+ await filter(webPageDOM, sourceDocument);
145
+
146
+ expect(link.href).to.equal('https://example.com/path/to/page');
147
+ });
148
+
149
+ it('keeps invalid URLs unchanged', async () => {
150
+ link.href = 'invalid://url';
151
+ await filter(webPageDOM, sourceDocument);
152
+
153
+ expect(link.href).to.equal('invalid://url');
154
+ });
155
+ });
156
+
157
+ describe('#removeUnwantedElements', () => {
158
+ before(async () => {
159
+ webPageDOM.body.appendChild(webPageDOM.createElement('script'));
160
+ webPageDOM.body.appendChild(webPageDOM.createElement('style'));
161
+
162
+ await filter(webPageDOM, sourceDocument);
163
+ });
164
+
165
+ it('removes script elements', () => {
166
+ expect(webPageDOM.querySelector('script')).to.be.null;
167
+ });
168
+
169
+ it('removes style elements', () => {
170
+ expect(webPageDOM.querySelector('style')).to.be.null;
171
+ });
172
+ });
173
+
174
+ describe('#updateProtectedLinks', () => {
175
+ before(async () => {
176
+ const link = webPageDOM.createElement('a');
177
+
178
+ link.href = 'https://example.com/email-protection';
179
+ link.className = 'email-protection';
180
+ link.innerHTML = 'Click here';
181
+ webPageDOM.body.appendChild(link);
182
+
183
+ await filter(webPageDOM, sourceDocument);
184
+ });
185
+
186
+ it('updates link destination', () => {
187
+ expect(webPageDOM.querySelector('a.email-protection').href).to.equal('https://example.com/email-protection');
188
+ });
189
+
190
+ it('updates link content', () => {
191
+ expect(webPageDOM.querySelector('a.email-protection').innerHTML).to.equal('[email&nbsp;protected]');
192
+ });
193
+ });
194
+ });
@@ -1,28 +1,14 @@
1
- import ciceroMark from '@accordproject/markdown-cicero';
2
- import mardownPdf from '@accordproject/markdown-pdf';
3
- import TurndownService from '@opentermsarchive/turndown';
4
- import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
5
- import jsdom from 'jsdom';
6
1
  import mime from 'mime';
7
2
 
8
3
  import SourceDocument from '../services/sourceDocument.js';
9
4
 
5
+ import createWebPageDOM from './dom.js';
10
6
  import { ExtractDocumentError } from './errors.js';
7
+ import filter from './filter.js';
8
+ import { transformFromHTML, transformFromPDF } from './markdown.js';
11
9
 
12
10
  export { ExtractDocumentError } from './errors.js';
13
11
 
14
- const { JSDOM } = jsdom;
15
- const turndownService = new TurndownService();
16
-
17
- turndownService.use(turndownPluginGithubFlavouredMarkdown.gfm);
18
-
19
- export const LINKS_TO_CONVERT_SELECTOR = 'a[href]:not([href^="#"]):not([href=""])';
20
-
21
- const { PdfTransformer } = mardownPdf;
22
- const { CiceroMarkTransformer } = ciceroMark;
23
-
24
- const ciceroMarkTransformer = new CiceroMarkTransformer();
25
-
26
12
  /**
27
13
  * Extract content from source document and convert it to Markdown
28
14
  * @function extract
@@ -43,56 +29,18 @@ export default async function extract(sourceDocument) {
43
29
  }
44
30
 
45
31
  export async function extractFromHTML(sourceDocument) {
46
- const {
47
- location,
48
- contentSelectors = [],
49
- insignificantContentSelectors = [],
50
- filters: serviceSpecificFilters = [],
51
- content,
52
- } = sourceDocument;
53
-
54
- const jsdomInstance = new JSDOM(content, {
55
- url: location,
56
- virtualConsole: new jsdom.VirtualConsole(),
57
- });
58
- const { document: webPageDOM } = jsdomInstance.window;
32
+ const { location, content, contentSelectors, insignificantContentSelectors } = sourceDocument;
59
33
 
60
- for (const filterFunction of serviceSpecificFilters) {
61
- try {
62
- await filterFunction(webPageDOM, {
63
- fetch: location,
64
- select: contentSelectors,
65
- remove: insignificantContentSelectors,
66
- filter: serviceSpecificFilters.map(filter => filter.name),
67
- });
68
- } catch (error) {
69
- throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
70
- }
71
- }
72
-
73
- remove(webPageDOM, insignificantContentSelectors); // remove function works in place
34
+ const webPageDOM = createWebPageDOM(content, location);
35
+ const filteredDOM = await filter(webPageDOM, sourceDocument);
36
+ const cleanedDOM = filteredDOM.remove(insignificantContentSelectors);
37
+ const selectedDOM = cleanedDOM.select(contentSelectors);
74
38
 
75
- const domFragment = select(webPageDOM, contentSelectors);
76
-
77
- if (!domFragment.children.length) {
78
- throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
39
+ if (!selectedDOM?.children.length) {
40
+ throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'. This could be due to elements being removed before content selection if "remove" and "select" selectors match the same content.`);
79
41
  }
80
42
 
81
- convertRelativeURLsToAbsolute(domFragment, location);
82
-
83
- domFragment.querySelectorAll('script, style').forEach(node => node.remove());
84
-
85
- // clean code from common changing patterns - initially for Windstream
86
- domFragment.querySelectorAll('a[href*="/email-protection"]').forEach(node => {
87
- const newProtectedLink = webPageDOM.createElement('a');
88
- const [href] = node.href.split('#');
89
-
90
- newProtectedLink.href = href;
91
- newProtectedLink.innerHTML = '[email protected]';
92
- node.parentNode.replaceChild(newProtectedLink, node);
93
- });
94
-
95
- const markdownContent = transform(domFragment);
43
+ const markdownContent = transformFromHTML(selectedDOM);
96
44
 
97
45
  if (!markdownContent) {
98
46
  throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
@@ -102,19 +50,7 @@ export async function extractFromHTML(sourceDocument) {
102
50
  }
103
51
 
104
52
  export async function extractFromPDF({ location, content: pdfBuffer }) {
105
- let markdownContent;
106
-
107
- try {
108
- const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
109
-
110
- markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
111
- } catch (error) {
112
- if (error.parserError) {
113
- throw new Error("Can't parse PDF file");
114
- }
115
-
116
- throw error;
117
- }
53
+ const markdownContent = await transformFromPDF(pdfBuffer);
118
54
 
119
55
  if (!markdownContent) {
120
56
  throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
@@ -122,72 +58,3 @@ export async function extractFromPDF({ location, content: pdfBuffer }) {
122
58
 
123
59
  return markdownContent;
124
60
  }
125
-
126
- function selectRange(webPageDOM, rangeSelector) {
127
- const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;
128
-
129
- const selection = webPageDOM.createRange();
130
- const startNode = webPageDOM.querySelector(startBefore || startAfter);
131
- const endNode = webPageDOM.querySelector(endBefore || endAfter);
132
-
133
- if (!startNode) {
134
- throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
135
- }
136
-
137
- if (!endNode) {
138
- throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
139
- }
140
-
141
- selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
142
- selection[endBefore ? 'setEndBefore' : 'setEndAfter'](endNode);
143
-
144
- return selection;
145
- }
146
-
147
- export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
148
- Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
149
- try {
150
- link.href = new URL(link.href, baseURL).href;
151
- } catch (error) {
152
- // Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
153
- }
154
- });
155
- }
156
-
157
- // Works in place
158
- function remove(webPageDOM, insignificantContentSelectors) {
159
- const rangeSelections = [];
160
- const nodes = [];
161
-
162
- [].concat(insignificantContentSelectors).forEach(selector => {
163
- if (typeof selector === 'object') {
164
- rangeSelections.push(selectRange(webPageDOM, selector));
165
- } else {
166
- nodes.push(...webPageDOM.querySelectorAll(selector));
167
- }
168
- });
169
-
170
- // Removing range selections still works even if the starting or ending node is deleted. So, start by removing all nodes selected by a direct CSS selector, then delete all contents selections.
171
- nodes.forEach(node => node.remove());
172
- rangeSelections.forEach(rangeSelection => rangeSelection.deleteContents());
173
- }
174
-
175
- function select(webPageDOM, contentSelectors) {
176
- const result = webPageDOM.createDocumentFragment();
177
-
178
- [].concat(contentSelectors).forEach(selector => {
179
- if (typeof selector === 'object') {
180
- const rangeSelection = selectRange(webPageDOM, selector);
181
-
182
- result.appendChild(rangeSelection.cloneContents());
183
- } else {
184
- webPageDOM.querySelectorAll(selector).forEach(element => result.appendChild(element.cloneNode(true)));
185
- }
186
- });
187
-
188
- return result;
189
- }
190
-
191
- function transform(domFragment) {
192
- return turndownService.turndown(domFragment);
193
- }
@@ -3,21 +3,19 @@ import path from 'path';
3
3
  import { fileURLToPath } from 'url';
4
4
 
5
5
  import chai from 'chai';
6
- import jsdom from 'jsdom';
7
6
  import mime from 'mime';
8
7
 
9
8
  import SourceDocument from '../services/sourceDocument.js';
10
9
 
11
10
  import { ExtractDocumentError } from './errors.js';
12
11
 
13
- import extract, { convertRelativeURLsToAbsolute } from './index.js';
12
+ import extract from './index.js';
14
13
 
15
14
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
16
15
  const fs = fsApi.promises;
17
- const { JSDOM } = jsdom;
18
16
  const { expect } = chai;
19
17
 
20
- const virtualLocation = 'https://exemple.com/main';
18
+ const virtualLocation = 'https://example.com/main';
21
19
  const rawHTML = `
22
20
  <!DOCTYPE html>
23
21
  <html>
@@ -39,7 +37,7 @@ const rawHTML = `
39
37
  const expectedExtracted = `Title
40
38
  =====
41
39
 
42
- [link 1](https://exemple.com/relative/link)
40
+ [link 1](https://example.com/relative/link)
43
41
 
44
42
  [link 2](#anchor)
45
43
 
@@ -63,10 +61,11 @@ const rawHTMLWithCommonChangingItems = `
63
61
  <style>body { background: blue }</style>
64
62
  <script>console.log("test")</script>
65
63
  <h1>Title</h1>
66
- <p><a id="link1" href="/relative/link">link 1</a></p>
64
+ <p><a id="link1" href="/relative/link?utm_source=test&id=123">link 1</a></p>
67
65
  <p><a id="link2" href="#anchor">link 2</a></p>
68
- <p><a id="link3" href="http://absolute.url/link">link 3</a></p>
66
+ <p><a id="link3" href="http://absolute.url/link?keep=me">link 3</a></p>
69
67
  <p><a id="link4" href="">link 4</a></p>
68
+ <p><img src="https://example.com/image.jpg?width=100&quality=80" alt="test"/></p>
70
69
  <a href="/cdn-cgi/l/email-protection#3b4c52555f484f495e5a56154b49524d5a584215484f5a4f5e565e554f7b4c52555f484f495e5a5615585456">[email&#160;protected]</a>
71
70
  <p><a href="/cdn-cgi/l/email-protection#2d4e4243594c4e596d4e4459545e4e424259034858">conta<span>[email&#160;protected]</span></a></p>
72
71
  </body>
@@ -76,17 +75,19 @@ const rawHTMLWithCommonChangingItems = `
76
75
  const expectedExtractedWithCommonChangingItems = `Title
77
76
  =====
78
77
 
79
- [link 1](https://exemple.com/relative/link)
78
+ [link 1](https://example.com/relative/link?utm_source=test&id=123)
80
79
 
81
80
  [link 2](#anchor)
82
81
 
83
- [link 3](http://absolute.url/link)
82
+ [link 3](http://absolute.url/link?keep=me)
84
83
 
85
84
  link 4
86
85
 
87
- [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)
86
+ ![test](https://example.com/image.jpg?width=100&quality=80)
88
87
 
89
- [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)`;
88
+ [\\[email protected\\]](https://example.com/cdn-cgi/l/email-protection)
89
+
90
+ [\\[email protected\\]](https://example.com/cdn-cgi/l/email-protection)`;
90
91
  /* eslint-enable no-irregular-whitespace */
91
92
 
92
93
  const additionalFilter = {
@@ -112,31 +113,70 @@ const additionalFilter = {
112
113
  };
113
114
 
114
115
  describe('Extract', () => {
115
- describe('#convertRelativeURLsToAbsolute', () => {
116
- let subject;
116
+ describe('#extract', () => {
117
+ context('from HTML content', () => {
118
+ describe('Filter', () => {
119
+ it('converts relative URLs to absolute', async () => {
120
+ const result = await extract(new SourceDocument({
121
+ content: rawHTML,
122
+ location: virtualLocation,
123
+ contentSelectors: 'body',
124
+ }));
125
+
126
+ expect(result).to.include('https://example.com/relative/link');
127
+ expect(result).to.include('http://absolute.url/link');
128
+ });
117
129
 
118
- before(() => {
119
- const { document: webPageDOM } = new JSDOM(rawHTML).window;
130
+ it('discards non-textual elements', async () => {
131
+ const result = await extract(new SourceDocument({
132
+ content: rawHTMLWithCommonChangingItems,
133
+ location: virtualLocation,
134
+ contentSelectors: 'body',
135
+ }));
120
136
 
121
- convertRelativeURLsToAbsolute(webPageDOM, virtualLocation);
122
- subject = Array.from(webPageDOM.querySelectorAll('a[href]')).map(el => el.href);
123
- });
137
+ expect(result).to.not.include('background: red');
138
+ expect(result).to.not.include('console.log');
139
+ });
124
140
 
125
- it('converts relative urls', () => {
126
- expect(subject).to.include('https://exemple.com/relative/link');
127
- });
141
+ it('cleans up protected links', async () => {
142
+ const result = await extract(new SourceDocument({
143
+ content: rawHTMLWithCommonChangingItems,
144
+ location: virtualLocation,
145
+ contentSelectors: 'body',
146
+ }));
128
147
 
129
- it('leaves absolute urls untouched', () => {
130
- expect(subject).to.include('http://absolute.url/link');
131
- });
148
+ expect(result).to.include('email protected');
149
+ expect(result).to.not.include('3b4c52555f484f495e5a56154b49524d5a584215484f5a4f5e565e554f7b4c52555f484f495e5a5615585456');
150
+ expect(result).to.not.include('2d4e4243594c4e596d4e4459545e4e424259034858');
151
+ });
132
152
 
133
- it('leaves invalid urls untouched', () => {
134
- expect(subject).to.include('http://[INVALID_URL=http://www.example.org/');
135
- });
136
- });
153
+ context('with a synchronous filter', () => {
154
+ it('applies all filters', async () => {
155
+ const result = await extract(new SourceDocument({
156
+ content: rawHTML,
157
+ location: virtualLocation,
158
+ contentSelectors: 'body',
159
+ filters: [additionalFilter.removeLinks],
160
+ }));
161
+
162
+ expect(result).to.equal(expectedExtractedWithAdditional);
163
+ });
164
+ });
165
+
166
+ context('with an asynchronous filter', () => {
167
+ it('applies all filters', async () => {
168
+ const result = await extract(new SourceDocument({
169
+ content: rawHTML,
170
+ location: virtualLocation,
171
+ contentSelectors: 'body',
172
+ filters: [additionalFilter.removeLinksAsync],
173
+ }));
174
+
175
+ expect(result).to.equal(expectedExtractedWithAdditional);
176
+ });
177
+ });
178
+ });
137
179
 
138
- describe('#extract', () => {
139
- context('from HTML content', () => {
140
180
  describe('Select', () => {
141
181
  context('with string selector', () => {
142
182
  it('extracts content from the given HTML with common changing items', async () => {
@@ -221,7 +261,7 @@ describe('Extract', () => {
221
261
  contentSelectors: [ 'h1', 'h1 ~ p' ],
222
262
  }));
223
263
 
224
- expect(result).to.equal('Title\n=====\n\n[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
264
+ expect(result).to.equal('Title\n=====\n\n[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
225
265
  });
226
266
  });
227
267
  });
@@ -238,7 +278,7 @@ describe('Extract', () => {
238
278
  },
239
279
  }));
240
280
 
241
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)');
281
+ expect(result).to.equal('[link 1](https://example.com/relative/link)');
242
282
  });
243
283
  });
244
284
  context('with startBefore and endAfter', () => {
@@ -359,7 +399,7 @@ describe('Extract', () => {
359
399
  insignificantContentSelectors: 'h1',
360
400
  }));
361
401
 
362
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
402
+ expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
363
403
  });
364
404
  });
365
405
 
@@ -372,7 +412,7 @@ describe('Extract', () => {
372
412
  insignificantContentSelectors: [ 'h1', '#link3', '#link5' ],
373
413
  }));
374
414
 
375
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
415
+ expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)');
376
416
  });
377
417
  });
378
418
 
@@ -435,7 +475,7 @@ describe('Extract', () => {
435
475
  ],
436
476
  }));
437
477
 
438
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
478
+ expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)');
439
479
  });
440
480
  });
441
481
 
@@ -454,7 +494,7 @@ describe('Extract', () => {
454
494
  ],
455
495
  }));
456
496
 
457
- expect(result).to.equal('[link 1](https://exemple.com/relative/link)\n\n[link 2](#anchor)');
497
+ expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)');
458
498
  });
459
499
 
460
500
  context('where one selector is dependent on another', () => {
@@ -477,34 +517,6 @@ describe('Extract', () => {
477
517
  });
478
518
  });
479
519
  });
480
-
481
- describe('Filter', () => {
482
- context('with a synchronous filter', () => {
483
- it('extracts content from the given HTML also with given additional filter', async () => {
484
- const result = await extract(new SourceDocument({
485
- content: rawHTML,
486
- location: virtualLocation,
487
- contentSelectors: 'body',
488
- filters: [additionalFilter.removeLinks],
489
- }));
490
-
491
- expect(result).to.equal(expectedExtractedWithAdditional);
492
- });
493
- });
494
-
495
- context('with an asynchronous filter', () => {
496
- it('extracts content from the given HTML also with given additional filter', async () => {
497
- const result = await extract(new SourceDocument({
498
- content: rawHTML,
499
- location: virtualLocation,
500
- contentSelectors: 'body',
501
- filters: [additionalFilter.removeLinksAsync],
502
- }));
503
-
504
- expect(result).to.equal(expectedExtractedWithAdditional);
505
- });
506
- });
507
- });
508
520
  });
509
521
 
510
522
  context('from PDF content', () => {
@@ -0,0 +1,29 @@
1
+ import ciceroMark from '@accordproject/markdown-cicero';
2
+ import mardownPdf from '@accordproject/markdown-pdf';
3
+ import TurndownService from '@opentermsarchive/turndown';
4
+ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
5
+
6
+ const turndownService = new TurndownService();
7
+
8
+ turndownService.use(turndownPluginGithubFlavouredMarkdown.gfm);
9
+
10
+ const { PdfTransformer } = mardownPdf;
11
+ const { CiceroMarkTransformer } = ciceroMark;
12
+ const ciceroMarkTransformer = new CiceroMarkTransformer();
13
+
14
+ export function transformFromHTML(html) {
15
+ return turndownService.turndown(html);
16
+ }
17
+
18
+ export async function transformFromPDF(pdfBuffer) {
19
+ try {
20
+ const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
21
+
22
+ return ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
23
+ } catch (error) {
24
+ if (error.parserError) {
25
+ throw new Error("Can't parse PDF file");
26
+ }
27
+ throw error;
28
+ }
29
+ }