@opentermsarchive/engine 7.0.0 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/package.json +3 -2
- package/scripts/declarations/validate/definitions.js +14 -3
- package/scripts/declarations/validate/index.mocha.js +19 -0
- package/src/archivist/extract/dom.js +75 -0
- package/src/archivist/extract/dom.test.js +207 -0
- package/src/archivist/extract/exposedFilters.js +25 -0
- package/src/archivist/extract/exposedFilters.test.js +208 -0
- package/src/archivist/extract/filter.js +59 -0
- package/src/archivist/extract/filter.test.js +194 -0
- package/src/archivist/extract/index.js +12 -145
- package/src/archivist/extract/index.test.js +76 -64
- package/src/archivist/extract/markdown.js +29 -0
- package/src/archivist/recorder/version.js +1 -1
- package/src/archivist/services/index.js +237 -173
- package/src/archivist/services/index.test.js +499 -7
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import { expect } from 'chai';
|
|
2
|
+
|
|
3
|
+
import createWebPageDOM from './dom.js';
|
|
4
|
+
import filter from './filter.js';
|
|
5
|
+
|
|
6
|
+
const delay = ms => new Promise(resolve => { setTimeout(resolve, ms); });
|
|
7
|
+
|
|
8
|
+
describe('Filter', () => {
|
|
9
|
+
let webPageDOM;
|
|
10
|
+
let sourceDocument;
|
|
11
|
+
const BASE_URL = 'https://example.com';
|
|
12
|
+
|
|
13
|
+
before(() => {
|
|
14
|
+
webPageDOM = createWebPageDOM('<!DOCTYPE html><html><body></body></html>', BASE_URL);
|
|
15
|
+
sourceDocument = {
|
|
16
|
+
location: BASE_URL,
|
|
17
|
+
contentSelectors: [],
|
|
18
|
+
insignificantContentSelectors: [],
|
|
19
|
+
filters: [],
|
|
20
|
+
removeQueryParams: [],
|
|
21
|
+
};
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
describe('#filter', () => {
|
|
25
|
+
it('returns the webPageDOM', async () => {
|
|
26
|
+
const result = await filter(webPageDOM, sourceDocument);
|
|
27
|
+
|
|
28
|
+
expect(result).to.equal(webPageDOM);
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
describe('with custom filters', () => {
|
|
32
|
+
let receivedContext;
|
|
33
|
+
|
|
34
|
+
const contentFilter = (dom, { select }) => {
|
|
35
|
+
const element = dom.querySelector(select[0]);
|
|
36
|
+
|
|
37
|
+
if (element) {
|
|
38
|
+
element.innerHTML = 'Filtered content';
|
|
39
|
+
}
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
const appendFilter = (dom, { select }) => {
|
|
43
|
+
const element = dom.querySelector(select[0]);
|
|
44
|
+
|
|
45
|
+
if (element) {
|
|
46
|
+
element.innerHTML += ' + Appended content';
|
|
47
|
+
}
|
|
48
|
+
};
|
|
49
|
+
|
|
50
|
+
const failingFilter = () => {
|
|
51
|
+
throw new Error('Filter failed');
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
const contextSpyFilter = (dom, context) => {
|
|
55
|
+
receivedContext = context;
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
const asyncFilter = async (dom, { select }) => {
|
|
59
|
+
const element = dom.querySelector(select[0]);
|
|
60
|
+
|
|
61
|
+
if (element) {
|
|
62
|
+
await delay(100);
|
|
63
|
+
element.innerHTML = 'Async content';
|
|
64
|
+
}
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
before(() => {
|
|
68
|
+
const div = webPageDOM.createElement('div');
|
|
69
|
+
|
|
70
|
+
div.className = 'custom-content';
|
|
71
|
+
webPageDOM.body.appendChild(div);
|
|
72
|
+
|
|
73
|
+
sourceDocument.contentSelectors = ['.custom-content'];
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
it('applies single filter to content', async () => {
|
|
77
|
+
sourceDocument.filters = [contentFilter];
|
|
78
|
+
|
|
79
|
+
await filter(webPageDOM, sourceDocument);
|
|
80
|
+
|
|
81
|
+
expect(webPageDOM.querySelector('.custom-content').innerHTML).to.equal('Filtered content');
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it('applies filters in sequence', async () => {
|
|
85
|
+
sourceDocument.filters = [ contentFilter, appendFilter ];
|
|
86
|
+
|
|
87
|
+
await filter(webPageDOM, sourceDocument);
|
|
88
|
+
|
|
89
|
+
expect(webPageDOM.querySelector('.custom-content').innerHTML).to.equal('Filtered content + Appended content');
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
it('applies async filter and waits for completion', async () => {
|
|
93
|
+
sourceDocument.filters = [asyncFilter];
|
|
94
|
+
|
|
95
|
+
await filter(webPageDOM, sourceDocument);
|
|
96
|
+
|
|
97
|
+
expect(webPageDOM.querySelector('.custom-content').innerHTML).to.equal('Async content');
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
it('throws error on filter failure', async () => {
|
|
101
|
+
sourceDocument.filters = [failingFilter];
|
|
102
|
+
|
|
103
|
+
await expect(filter(webPageDOM, sourceDocument)).to.be.rejectedWith('The filter function "failingFilter" failed: Error: Filter failed');
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
describe('filter parameters', () => {
|
|
107
|
+
before(async () => {
|
|
108
|
+
sourceDocument.filters = [contextSpyFilter];
|
|
109
|
+
sourceDocument.contentSelectors = ['.custom-content'];
|
|
110
|
+
sourceDocument.insignificantContentSelectors = ['.insignificant'];
|
|
111
|
+
|
|
112
|
+
await filter(webPageDOM, sourceDocument);
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
it('provides content selectors', () => {
|
|
116
|
+
expect(receivedContext.select).to.deep.equal(['.custom-content']);
|
|
117
|
+
});
|
|
118
|
+
|
|
119
|
+
it('provides insignificant selectors', () => {
|
|
120
|
+
expect(receivedContext.remove).to.deep.equal(['.insignificant']);
|
|
121
|
+
});
|
|
122
|
+
|
|
123
|
+
it('provides location', () => {
|
|
124
|
+
expect(receivedContext.fetch).to.equal(BASE_URL);
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
it('provides filters list', () => {
|
|
128
|
+
expect(receivedContext.filter).to.deep.equal(['contextSpyFilter']);
|
|
129
|
+
});
|
|
130
|
+
});
|
|
131
|
+
});
|
|
132
|
+
});
|
|
133
|
+
|
|
134
|
+
describe('#convertRelativeURLsToAbsolute', () => {
|
|
135
|
+
let link;
|
|
136
|
+
|
|
137
|
+
before(() => {
|
|
138
|
+
link = webPageDOM.createElement('a');
|
|
139
|
+
webPageDOM.body.appendChild(link);
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
it('converts relative URLs to absolute', async () => {
|
|
143
|
+
link.href = '/path/to/page';
|
|
144
|
+
await filter(webPageDOM, sourceDocument);
|
|
145
|
+
|
|
146
|
+
expect(link.href).to.equal('https://example.com/path/to/page');
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
it('keeps invalid URLs unchanged', async () => {
|
|
150
|
+
link.href = 'invalid://url';
|
|
151
|
+
await filter(webPageDOM, sourceDocument);
|
|
152
|
+
|
|
153
|
+
expect(link.href).to.equal('invalid://url');
|
|
154
|
+
});
|
|
155
|
+
});
|
|
156
|
+
|
|
157
|
+
describe('#removeUnwantedElements', () => {
|
|
158
|
+
before(async () => {
|
|
159
|
+
webPageDOM.body.appendChild(webPageDOM.createElement('script'));
|
|
160
|
+
webPageDOM.body.appendChild(webPageDOM.createElement('style'));
|
|
161
|
+
|
|
162
|
+
await filter(webPageDOM, sourceDocument);
|
|
163
|
+
});
|
|
164
|
+
|
|
165
|
+
it('removes script elements', () => {
|
|
166
|
+
expect(webPageDOM.querySelector('script')).to.be.null;
|
|
167
|
+
});
|
|
168
|
+
|
|
169
|
+
it('removes style elements', () => {
|
|
170
|
+
expect(webPageDOM.querySelector('style')).to.be.null;
|
|
171
|
+
});
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
describe('#updateProtectedLinks', () => {
|
|
175
|
+
before(async () => {
|
|
176
|
+
const link = webPageDOM.createElement('a');
|
|
177
|
+
|
|
178
|
+
link.href = 'https://example.com/email-protection';
|
|
179
|
+
link.className = 'email-protection';
|
|
180
|
+
link.innerHTML = 'Click here';
|
|
181
|
+
webPageDOM.body.appendChild(link);
|
|
182
|
+
|
|
183
|
+
await filter(webPageDOM, sourceDocument);
|
|
184
|
+
});
|
|
185
|
+
|
|
186
|
+
it('updates link destination', () => {
|
|
187
|
+
expect(webPageDOM.querySelector('a.email-protection').href).to.equal('https://example.com/email-protection');
|
|
188
|
+
});
|
|
189
|
+
|
|
190
|
+
it('updates link content', () => {
|
|
191
|
+
expect(webPageDOM.querySelector('a.email-protection').innerHTML).to.equal('[email protected]');
|
|
192
|
+
});
|
|
193
|
+
});
|
|
194
|
+
});
|
|
@@ -1,28 +1,14 @@
|
|
|
1
|
-
import ciceroMark from '@accordproject/markdown-cicero';
|
|
2
|
-
import mardownPdf from '@accordproject/markdown-pdf';
|
|
3
|
-
import TurndownService from '@opentermsarchive/turndown';
|
|
4
|
-
import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
|
|
5
|
-
import jsdom from 'jsdom';
|
|
6
1
|
import mime from 'mime';
|
|
7
2
|
|
|
8
3
|
import SourceDocument from '../services/sourceDocument.js';
|
|
9
4
|
|
|
5
|
+
import createWebPageDOM from './dom.js';
|
|
10
6
|
import { ExtractDocumentError } from './errors.js';
|
|
7
|
+
import filter from './filter.js';
|
|
8
|
+
import { transformFromHTML, transformFromPDF } from './markdown.js';
|
|
11
9
|
|
|
12
10
|
export { ExtractDocumentError } from './errors.js';
|
|
13
11
|
|
|
14
|
-
const { JSDOM } = jsdom;
|
|
15
|
-
const turndownService = new TurndownService();
|
|
16
|
-
|
|
17
|
-
turndownService.use(turndownPluginGithubFlavouredMarkdown.gfm);
|
|
18
|
-
|
|
19
|
-
export const LINKS_TO_CONVERT_SELECTOR = 'a[href]:not([href^="#"]):not([href=""])';
|
|
20
|
-
|
|
21
|
-
const { PdfTransformer } = mardownPdf;
|
|
22
|
-
const { CiceroMarkTransformer } = ciceroMark;
|
|
23
|
-
|
|
24
|
-
const ciceroMarkTransformer = new CiceroMarkTransformer();
|
|
25
|
-
|
|
26
12
|
/**
|
|
27
13
|
* Extract content from source document and convert it to Markdown
|
|
28
14
|
* @function extract
|
|
@@ -43,56 +29,18 @@ export default async function extract(sourceDocument) {
|
|
|
43
29
|
}
|
|
44
30
|
|
|
45
31
|
export async function extractFromHTML(sourceDocument) {
|
|
46
|
-
const {
|
|
47
|
-
location,
|
|
48
|
-
contentSelectors = [],
|
|
49
|
-
insignificantContentSelectors = [],
|
|
50
|
-
filters: serviceSpecificFilters = [],
|
|
51
|
-
content,
|
|
52
|
-
} = sourceDocument;
|
|
53
|
-
|
|
54
|
-
const jsdomInstance = new JSDOM(content, {
|
|
55
|
-
url: location,
|
|
56
|
-
virtualConsole: new jsdom.VirtualConsole(),
|
|
57
|
-
});
|
|
58
|
-
const { document: webPageDOM } = jsdomInstance.window;
|
|
32
|
+
const { location, content, contentSelectors, insignificantContentSelectors } = sourceDocument;
|
|
59
33
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
select: contentSelectors,
|
|
65
|
-
remove: insignificantContentSelectors,
|
|
66
|
-
filter: serviceSpecificFilters.map(filter => filter.name),
|
|
67
|
-
});
|
|
68
|
-
} catch (error) {
|
|
69
|
-
throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
remove(webPageDOM, insignificantContentSelectors); // remove function works in place
|
|
34
|
+
const webPageDOM = createWebPageDOM(content, location);
|
|
35
|
+
const filteredDOM = await filter(webPageDOM, sourceDocument);
|
|
36
|
+
const cleanedDOM = filteredDOM.remove(insignificantContentSelectors);
|
|
37
|
+
const selectedDOM = cleanedDOM.select(contentSelectors);
|
|
74
38
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
if (!domFragment.children.length) {
|
|
78
|
-
throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
|
|
39
|
+
if (!selectedDOM?.children.length) {
|
|
40
|
+
throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'. This could be due to elements being removed before content selection if "remove" and "select" selectors match the same content.`);
|
|
79
41
|
}
|
|
80
42
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
domFragment.querySelectorAll('script, style').forEach(node => node.remove());
|
|
84
|
-
|
|
85
|
-
// clean code from common changing patterns - initially for Windstream
|
|
86
|
-
domFragment.querySelectorAll('a[href*="/email-protection"]').forEach(node => {
|
|
87
|
-
const newProtectedLink = webPageDOM.createElement('a');
|
|
88
|
-
const [href] = node.href.split('#');
|
|
89
|
-
|
|
90
|
-
newProtectedLink.href = href;
|
|
91
|
-
newProtectedLink.innerHTML = '[email protected]';
|
|
92
|
-
node.parentNode.replaceChild(newProtectedLink, node);
|
|
93
|
-
});
|
|
94
|
-
|
|
95
|
-
const markdownContent = transform(domFragment);
|
|
43
|
+
const markdownContent = transformFromHTML(selectedDOM);
|
|
96
44
|
|
|
97
45
|
if (!markdownContent) {
|
|
98
46
|
throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
|
|
@@ -102,19 +50,7 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
102
50
|
}
|
|
103
51
|
|
|
104
52
|
export async function extractFromPDF({ location, content: pdfBuffer }) {
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
try {
|
|
108
|
-
const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
|
|
109
|
-
|
|
110
|
-
markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
|
|
111
|
-
} catch (error) {
|
|
112
|
-
if (error.parserError) {
|
|
113
|
-
throw new Error("Can't parse PDF file");
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
throw error;
|
|
117
|
-
}
|
|
53
|
+
const markdownContent = await transformFromPDF(pdfBuffer);
|
|
118
54
|
|
|
119
55
|
if (!markdownContent) {
|
|
120
56
|
throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
|
|
@@ -122,72 +58,3 @@ export async function extractFromPDF({ location, content: pdfBuffer }) {
|
|
|
122
58
|
|
|
123
59
|
return markdownContent;
|
|
124
60
|
}
|
|
125
|
-
|
|
126
|
-
function selectRange(webPageDOM, rangeSelector) {
|
|
127
|
-
const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;
|
|
128
|
-
|
|
129
|
-
const selection = webPageDOM.createRange();
|
|
130
|
-
const startNode = webPageDOM.querySelector(startBefore || startAfter);
|
|
131
|
-
const endNode = webPageDOM.querySelector(endBefore || endAfter);
|
|
132
|
-
|
|
133
|
-
if (!startNode) {
|
|
134
|
-
throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
if (!endNode) {
|
|
138
|
-
throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
|
|
142
|
-
selection[endBefore ? 'setEndBefore' : 'setEndAfter'](endNode);
|
|
143
|
-
|
|
144
|
-
return selection;
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
|
|
148
|
-
Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
|
|
149
|
-
try {
|
|
150
|
-
link.href = new URL(link.href, baseURL).href;
|
|
151
|
-
} catch (error) {
|
|
152
|
-
// Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
|
|
153
|
-
}
|
|
154
|
-
});
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
// Works in place
|
|
158
|
-
function remove(webPageDOM, insignificantContentSelectors) {
|
|
159
|
-
const rangeSelections = [];
|
|
160
|
-
const nodes = [];
|
|
161
|
-
|
|
162
|
-
[].concat(insignificantContentSelectors).forEach(selector => {
|
|
163
|
-
if (typeof selector === 'object') {
|
|
164
|
-
rangeSelections.push(selectRange(webPageDOM, selector));
|
|
165
|
-
} else {
|
|
166
|
-
nodes.push(...webPageDOM.querySelectorAll(selector));
|
|
167
|
-
}
|
|
168
|
-
});
|
|
169
|
-
|
|
170
|
-
// Removing range selections still works even if the starting or ending node is deleted. So, start by removing all nodes selected by a direct CSS selector, then delete all contents selections.
|
|
171
|
-
nodes.forEach(node => node.remove());
|
|
172
|
-
rangeSelections.forEach(rangeSelection => rangeSelection.deleteContents());
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
function select(webPageDOM, contentSelectors) {
|
|
176
|
-
const result = webPageDOM.createDocumentFragment();
|
|
177
|
-
|
|
178
|
-
[].concat(contentSelectors).forEach(selector => {
|
|
179
|
-
if (typeof selector === 'object') {
|
|
180
|
-
const rangeSelection = selectRange(webPageDOM, selector);
|
|
181
|
-
|
|
182
|
-
result.appendChild(rangeSelection.cloneContents());
|
|
183
|
-
} else {
|
|
184
|
-
webPageDOM.querySelectorAll(selector).forEach(element => result.appendChild(element.cloneNode(true)));
|
|
185
|
-
}
|
|
186
|
-
});
|
|
187
|
-
|
|
188
|
-
return result;
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
function transform(domFragment) {
|
|
192
|
-
return turndownService.turndown(domFragment);
|
|
193
|
-
}
|
|
@@ -3,21 +3,19 @@ import path from 'path';
|
|
|
3
3
|
import { fileURLToPath } from 'url';
|
|
4
4
|
|
|
5
5
|
import chai from 'chai';
|
|
6
|
-
import jsdom from 'jsdom';
|
|
7
6
|
import mime from 'mime';
|
|
8
7
|
|
|
9
8
|
import SourceDocument from '../services/sourceDocument.js';
|
|
10
9
|
|
|
11
10
|
import { ExtractDocumentError } from './errors.js';
|
|
12
11
|
|
|
13
|
-
import extract
|
|
12
|
+
import extract from './index.js';
|
|
14
13
|
|
|
15
14
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
16
15
|
const fs = fsApi.promises;
|
|
17
|
-
const { JSDOM } = jsdom;
|
|
18
16
|
const { expect } = chai;
|
|
19
17
|
|
|
20
|
-
const virtualLocation = 'https://
|
|
18
|
+
const virtualLocation = 'https://example.com/main';
|
|
21
19
|
const rawHTML = `
|
|
22
20
|
<!DOCTYPE html>
|
|
23
21
|
<html>
|
|
@@ -39,7 +37,7 @@ const rawHTML = `
|
|
|
39
37
|
const expectedExtracted = `Title
|
|
40
38
|
=====
|
|
41
39
|
|
|
42
|
-
[link 1](https://
|
|
40
|
+
[link 1](https://example.com/relative/link)
|
|
43
41
|
|
|
44
42
|
[link 2](#anchor)
|
|
45
43
|
|
|
@@ -63,10 +61,11 @@ const rawHTMLWithCommonChangingItems = `
|
|
|
63
61
|
<style>body { background: blue }</style>
|
|
64
62
|
<script>console.log("test")</script>
|
|
65
63
|
<h1>Title</h1>
|
|
66
|
-
<p><a id="link1" href="/relative/link">link 1</a></p>
|
|
64
|
+
<p><a id="link1" href="/relative/link?utm_source=test&id=123">link 1</a></p>
|
|
67
65
|
<p><a id="link2" href="#anchor">link 2</a></p>
|
|
68
|
-
<p><a id="link3" href="http://absolute.url/link">link 3</a></p>
|
|
66
|
+
<p><a id="link3" href="http://absolute.url/link?keep=me">link 3</a></p>
|
|
69
67
|
<p><a id="link4" href="">link 4</a></p>
|
|
68
|
+
<p><img src="https://example.com/image.jpg?width=100&quality=80" alt="test"/></p>
|
|
70
69
|
<a href="/cdn-cgi/l/email-protection#3b4c52555f484f495e5a56154b49524d5a584215484f5a4f5e565e554f7b4c52555f484f495e5a5615585456">[email protected]</a>
|
|
71
70
|
<p><a href="/cdn-cgi/l/email-protection#2d4e4243594c4e596d4e4459545e4e424259034858">conta<span>[email protected]</span></a></p>
|
|
72
71
|
</body>
|
|
@@ -76,17 +75,19 @@ const rawHTMLWithCommonChangingItems = `
|
|
|
76
75
|
const expectedExtractedWithCommonChangingItems = `Title
|
|
77
76
|
=====
|
|
78
77
|
|
|
79
|
-
[link 1](https://
|
|
78
|
+
[link 1](https://example.com/relative/link?utm_source=test&id=123)
|
|
80
79
|
|
|
81
80
|
[link 2](#anchor)
|
|
82
81
|
|
|
83
|
-
[link 3](http://absolute.url/link)
|
|
82
|
+
[link 3](http://absolute.url/link?keep=me)
|
|
84
83
|
|
|
85
84
|
link 4
|
|
86
85
|
|
|
87
|
-
[
|
|
86
|
+

|
|
88
87
|
|
|
89
|
-
[\\[email protected\\]](https://
|
|
88
|
+
[\\[email protected\\]](https://example.com/cdn-cgi/l/email-protection)
|
|
89
|
+
|
|
90
|
+
[\\[email protected\\]](https://example.com/cdn-cgi/l/email-protection)`;
|
|
90
91
|
/* eslint-enable no-irregular-whitespace */
|
|
91
92
|
|
|
92
93
|
const additionalFilter = {
|
|
@@ -112,31 +113,70 @@ const additionalFilter = {
|
|
|
112
113
|
};
|
|
113
114
|
|
|
114
115
|
describe('Extract', () => {
|
|
115
|
-
describe('#
|
|
116
|
-
|
|
116
|
+
describe('#extract', () => {
|
|
117
|
+
context('from HTML content', () => {
|
|
118
|
+
describe('Filter', () => {
|
|
119
|
+
it('converts relative URLs to absolute', async () => {
|
|
120
|
+
const result = await extract(new SourceDocument({
|
|
121
|
+
content: rawHTML,
|
|
122
|
+
location: virtualLocation,
|
|
123
|
+
contentSelectors: 'body',
|
|
124
|
+
}));
|
|
125
|
+
|
|
126
|
+
expect(result).to.include('https://example.com/relative/link');
|
|
127
|
+
expect(result).to.include('http://absolute.url/link');
|
|
128
|
+
});
|
|
117
129
|
|
|
118
|
-
|
|
119
|
-
|
|
130
|
+
it('discards non-textual elements', async () => {
|
|
131
|
+
const result = await extract(new SourceDocument({
|
|
132
|
+
content: rawHTMLWithCommonChangingItems,
|
|
133
|
+
location: virtualLocation,
|
|
134
|
+
contentSelectors: 'body',
|
|
135
|
+
}));
|
|
120
136
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
137
|
+
expect(result).to.not.include('background: red');
|
|
138
|
+
expect(result).to.not.include('console.log');
|
|
139
|
+
});
|
|
124
140
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
141
|
+
it('cleans up protected links', async () => {
|
|
142
|
+
const result = await extract(new SourceDocument({
|
|
143
|
+
content: rawHTMLWithCommonChangingItems,
|
|
144
|
+
location: virtualLocation,
|
|
145
|
+
contentSelectors: 'body',
|
|
146
|
+
}));
|
|
128
147
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
148
|
+
expect(result).to.include('email protected');
|
|
149
|
+
expect(result).to.not.include('3b4c52555f484f495e5a56154b49524d5a584215484f5a4f5e565e554f7b4c52555f484f495e5a5615585456');
|
|
150
|
+
expect(result).to.not.include('2d4e4243594c4e596d4e4459545e4e424259034858');
|
|
151
|
+
});
|
|
132
152
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
153
|
+
context('with a synchronous filter', () => {
|
|
154
|
+
it('applies all filters', async () => {
|
|
155
|
+
const result = await extract(new SourceDocument({
|
|
156
|
+
content: rawHTML,
|
|
157
|
+
location: virtualLocation,
|
|
158
|
+
contentSelectors: 'body',
|
|
159
|
+
filters: [additionalFilter.removeLinks],
|
|
160
|
+
}));
|
|
161
|
+
|
|
162
|
+
expect(result).to.equal(expectedExtractedWithAdditional);
|
|
163
|
+
});
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
context('with an asynchronous filter', () => {
|
|
167
|
+
it('applies all filters', async () => {
|
|
168
|
+
const result = await extract(new SourceDocument({
|
|
169
|
+
content: rawHTML,
|
|
170
|
+
location: virtualLocation,
|
|
171
|
+
contentSelectors: 'body',
|
|
172
|
+
filters: [additionalFilter.removeLinksAsync],
|
|
173
|
+
}));
|
|
174
|
+
|
|
175
|
+
expect(result).to.equal(expectedExtractedWithAdditional);
|
|
176
|
+
});
|
|
177
|
+
});
|
|
178
|
+
});
|
|
137
179
|
|
|
138
|
-
describe('#extract', () => {
|
|
139
|
-
context('from HTML content', () => {
|
|
140
180
|
describe('Select', () => {
|
|
141
181
|
context('with string selector', () => {
|
|
142
182
|
it('extracts content from the given HTML with common changing items', async () => {
|
|
@@ -221,7 +261,7 @@ describe('Extract', () => {
|
|
|
221
261
|
contentSelectors: [ 'h1', 'h1 ~ p' ],
|
|
222
262
|
}));
|
|
223
263
|
|
|
224
|
-
expect(result).to.equal('Title\n=====\n\n[link 1](https://
|
|
264
|
+
expect(result).to.equal('Title\n=====\n\n[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
|
|
225
265
|
});
|
|
226
266
|
});
|
|
227
267
|
});
|
|
@@ -238,7 +278,7 @@ describe('Extract', () => {
|
|
|
238
278
|
},
|
|
239
279
|
}));
|
|
240
280
|
|
|
241
|
-
expect(result).to.equal('[link 1](https://
|
|
281
|
+
expect(result).to.equal('[link 1](https://example.com/relative/link)');
|
|
242
282
|
});
|
|
243
283
|
});
|
|
244
284
|
context('with startBefore and endAfter', () => {
|
|
@@ -359,7 +399,7 @@ describe('Extract', () => {
|
|
|
359
399
|
insignificantContentSelectors: 'h1',
|
|
360
400
|
}));
|
|
361
401
|
|
|
362
|
-
expect(result).to.equal('[link 1](https://
|
|
402
|
+
expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)\n\n[link 3](http://absolute.url/link)\n\n[link 5](http://[INVALID_URL=http://www.example.org/)');
|
|
363
403
|
});
|
|
364
404
|
});
|
|
365
405
|
|
|
@@ -372,7 +412,7 @@ describe('Extract', () => {
|
|
|
372
412
|
insignificantContentSelectors: [ 'h1', '#link3', '#link5' ],
|
|
373
413
|
}));
|
|
374
414
|
|
|
375
|
-
expect(result).to.equal('[link 1](https://
|
|
415
|
+
expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)');
|
|
376
416
|
});
|
|
377
417
|
});
|
|
378
418
|
|
|
@@ -435,7 +475,7 @@ describe('Extract', () => {
|
|
|
435
475
|
],
|
|
436
476
|
}));
|
|
437
477
|
|
|
438
|
-
expect(result).to.equal('[link 1](https://
|
|
478
|
+
expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)');
|
|
439
479
|
});
|
|
440
480
|
});
|
|
441
481
|
|
|
@@ -454,7 +494,7 @@ describe('Extract', () => {
|
|
|
454
494
|
],
|
|
455
495
|
}));
|
|
456
496
|
|
|
457
|
-
expect(result).to.equal('[link 1](https://
|
|
497
|
+
expect(result).to.equal('[link 1](https://example.com/relative/link)\n\n[link 2](#anchor)');
|
|
458
498
|
});
|
|
459
499
|
|
|
460
500
|
context('where one selector is dependent on another', () => {
|
|
@@ -477,34 +517,6 @@ describe('Extract', () => {
|
|
|
477
517
|
});
|
|
478
518
|
});
|
|
479
519
|
});
|
|
480
|
-
|
|
481
|
-
describe('Filter', () => {
|
|
482
|
-
context('with a synchronous filter', () => {
|
|
483
|
-
it('extracts content from the given HTML also with given additional filter', async () => {
|
|
484
|
-
const result = await extract(new SourceDocument({
|
|
485
|
-
content: rawHTML,
|
|
486
|
-
location: virtualLocation,
|
|
487
|
-
contentSelectors: 'body',
|
|
488
|
-
filters: [additionalFilter.removeLinks],
|
|
489
|
-
}));
|
|
490
|
-
|
|
491
|
-
expect(result).to.equal(expectedExtractedWithAdditional);
|
|
492
|
-
});
|
|
493
|
-
});
|
|
494
|
-
|
|
495
|
-
context('with an asynchronous filter', () => {
|
|
496
|
-
it('extracts content from the given HTML also with given additional filter', async () => {
|
|
497
|
-
const result = await extract(new SourceDocument({
|
|
498
|
-
content: rawHTML,
|
|
499
|
-
location: virtualLocation,
|
|
500
|
-
contentSelectors: 'body',
|
|
501
|
-
filters: [additionalFilter.removeLinksAsync],
|
|
502
|
-
}));
|
|
503
|
-
|
|
504
|
-
expect(result).to.equal(expectedExtractedWithAdditional);
|
|
505
|
-
});
|
|
506
|
-
});
|
|
507
|
-
});
|
|
508
520
|
});
|
|
509
521
|
|
|
510
522
|
context('from PDF content', () => {
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import ciceroMark from '@accordproject/markdown-cicero';
|
|
2
|
+
import mardownPdf from '@accordproject/markdown-pdf';
|
|
3
|
+
import TurndownService from '@opentermsarchive/turndown';
|
|
4
|
+
import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
|
|
5
|
+
|
|
6
|
+
const turndownService = new TurndownService();
|
|
7
|
+
|
|
8
|
+
turndownService.use(turndownPluginGithubFlavouredMarkdown.gfm);
|
|
9
|
+
|
|
10
|
+
const { PdfTransformer } = mardownPdf;
|
|
11
|
+
const { CiceroMarkTransformer } = ciceroMark;
|
|
12
|
+
const ciceroMarkTransformer = new CiceroMarkTransformer();
|
|
13
|
+
|
|
14
|
+
export function transformFromHTML(html) {
|
|
15
|
+
return turndownService.turndown(html);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export async function transformFromPDF(pdfBuffer) {
|
|
19
|
+
try {
|
|
20
|
+
const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
|
|
21
|
+
|
|
22
|
+
return ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
|
|
23
|
+
} catch (error) {
|
|
24
|
+
if (error.parserError) {
|
|
25
|
+
throw new Error("Can't parse PDF file");
|
|
26
|
+
}
|
|
27
|
+
throw error;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
@@ -5,7 +5,7 @@ import Record from './record.js';
|
|
|
5
5
|
export default class Version extends Record {
|
|
6
6
|
static REQUIRED_PARAMS = Object.freeze([ ...Record.REQUIRED_PARAMS, 'snapshotIds' ]);
|
|
7
7
|
|
|
8
|
-
static SOURCE_DOCUMENTS_SEPARATOR = '\n\n';
|
|
8
|
+
static SOURCE_DOCUMENTS_SEPARATOR = '\n\n- - -\n\n'; // Separator used to delimit source documents when concatenating them. The "- - -" produces a horizontal ruler in Markdown
|
|
9
9
|
|
|
10
10
|
constructor(params) {
|
|
11
11
|
super(params);
|