@opentermsarchive/engine 7.1.0 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/package.json +3 -2
- package/scripts/declarations/validate/definitions.js +14 -3
- package/scripts/declarations/validate/index.mocha.js +19 -0
- package/src/archivist/extract/dom.js +75 -0
- package/src/archivist/extract/dom.test.js +207 -0
- package/src/archivist/extract/exposedFilters.js +25 -0
- package/src/archivist/extract/exposedFilters.test.js +208 -0
- package/src/archivist/extract/filter.js +59 -0
- package/src/archivist/extract/filter.test.js +194 -0
- package/src/archivist/extract/index.js +12 -145
- package/src/archivist/extract/index.test.js +76 -64
- package/src/archivist/extract/markdown.js +29 -0
- package/src/archivist/services/index.js +237 -173
- package/src/archivist/services/index.test.js +499 -7
package/README.md
CHANGED
|
@@ -4,6 +4,12 @@ This codebase is a Node.js module enabling downloading, archiving and publishing
|
|
|
4
4
|
|
|
5
5
|
For documentation, visit [docs.opentermsarchive.org](https://docs.opentermsarchive.org/)
|
|
6
6
|
|
|
7
|
+
## Testing
|
|
8
|
+
|
|
9
|
+
Use `npm test` to run all tests.
|
|
10
|
+
|
|
11
|
+
Use `npm run test:only <file.test.js>... [--watch]` to run specific test files. The `--watch` option enables running those tests each time a file changes.
|
|
12
|
+
|
|
7
13
|
- - -
|
|
8
14
|
|
|
9
15
|
## Contribute
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@opentermsarchive/engine",
|
|
3
|
-
"version": "7.1.0",
|
|
3
|
+
"version": "7.2.0",
|
|
4
4
|
"description": "Tracks and makes visible changes to the terms of online services",
|
|
5
5
|
"homepage": "https://opentermsarchive.org",
|
|
6
6
|
"bugs": {
|
|
@@ -45,6 +45,7 @@
|
|
|
45
45
|
"start:api": "node bin/ota.js serve",
|
|
46
46
|
"start:scheduler": "npm start -- --schedule",
|
|
47
47
|
"test": "cross-env NODE_ENV=test mocha --recursive \"./src/**/*.test.js\" \"./scripts/**/*.test.js\" --exit",
|
|
48
|
+
"test:only": "cross-env NODE_ENV=test mocha --recursive",
|
|
48
49
|
"posttest": "npm run lint",
|
|
49
50
|
"test:debug": "npm run test -- --inspect-brk --exit"
|
|
50
51
|
},
|
|
@@ -122,6 +123,6 @@
|
|
|
122
123
|
"@opentermsarchive/terms-types": "^2.0.0"
|
|
123
124
|
},
|
|
124
125
|
"engines": {
|
|
125
|
-
"node": ">=16.0.0"
|
|
126
|
+
"node": ">=16.0.0 < 23.0.0"
|
|
126
127
|
}
|
|
127
128
|
}
|
|
package/scripts/declarations/validate/definitions.js
CHANGED
|
@@ -29,9 +29,20 @@ const definitions = {
|
|
|
29
29
|
filters: {
|
|
30
30
|
type: 'array',
|
|
31
31
|
items: {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
oneOf: [
|
|
33
|
+
{
|
|
34
|
+
type: 'string',
|
|
35
|
+
pattern: '^.+$',
|
|
36
|
+
description: 'Filter function name',
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
type: 'object',
|
|
40
|
+
description: 'Filter function with parameters. The key is the filter function name, the value is the parameters.',
|
|
41
|
+
additionalProperties: false,
|
|
42
|
+
minProperties: 1,
|
|
43
|
+
maxProperties: 1,
|
|
44
|
+
},
|
|
45
|
+
],
|
|
35
46
|
},
|
|
36
47
|
},
|
|
37
48
|
validUntil: {
|
|
package/scripts/declarations/validate/index.mocha.js
CHANGED
|
@@ -7,6 +7,7 @@ import { expect } from 'chai';
|
|
|
7
7
|
import config from 'config';
|
|
8
8
|
import jsonSourceMap from 'json-source-map';
|
|
9
9
|
|
|
10
|
+
import * as exposedFilters from '../../../src/archivist/extract/exposedFilters.js';
|
|
10
11
|
import extract from '../../../src/archivist/extract/index.js';
|
|
11
12
|
import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from '../../../src/archivist/fetcher/index.js';
|
|
12
13
|
import * as services from '../../../src/archivist/services/index.js';
|
|
@@ -76,6 +77,24 @@ export default async options => {
|
|
|
76
77
|
});
|
|
77
78
|
}
|
|
78
79
|
|
|
80
|
+
it('filters do not use reserved names', async () => {
|
|
81
|
+
const filtersFilePath = path.join(declarationsPath, `${serviceId}.filters.js`);
|
|
82
|
+
|
|
83
|
+
if (!fsApi.existsSync(filtersFilePath)) {
|
|
84
|
+
return; // Skip if no filters file exists
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
const serviceFilters = await services.loadServiceFilters(serviceId);
|
|
88
|
+
const reservedFilterNames = Object.keys(exposedFilters);
|
|
89
|
+
const serviceFilterNames = Object.keys(serviceFilters);
|
|
90
|
+
|
|
91
|
+
const conflictingNames = serviceFilterNames.filter(name => reservedFilterNames.includes(name));
|
|
92
|
+
|
|
93
|
+
if (conflictingNames.length) {
|
|
94
|
+
throw new Error(`Service filter file "${serviceId}.filters.js" declares filters with names used by built-in filters: "${conflictingNames.join('", "')}". Rename these filters to avoid a collision.`);
|
|
95
|
+
}
|
|
96
|
+
});
|
|
97
|
+
|
|
79
98
|
if (!schemaOnly && service) {
|
|
80
99
|
service.getTermsTypes()
|
|
81
100
|
.filter(termsType => {
|
|
package/src/archivist/extract/dom.js
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import jsdom from 'jsdom';
|
|
2
|
+
|
|
3
|
+
/**
 * Parses `content` with jsdom, using `location` as the document URL and a
 * fresh `jsdom.VirtualConsole`, and returns the resulting document extended
 * with three helper methods: `select`, `remove` and `selectRange`.
 *
 * A selector is either a CSS selector string or a range selector object with
 * a start key (`startBefore` or `startAfter`) and an end key (`endBefore` or
 * `endAfter`), each holding a CSS selector.
 *
 * `select` and `remove` reach `selectRange` through `this`, so they have to be
 * invoked as methods of the returned document.
 *
 * @param {string} content - HTML markup to parse.
 * @param {string} location - URL handed to jsdom as the document URL.
 * @returns {Document} The jsdom document carrying the extra helper methods.
 */
export default function createWebPageDOM(content, location) {
  const dom = new jsdom.JSDOM(content, {
    url: location,
    virtualConsole: new jsdom.VirtualConsole(),
  });
  const { document } = dom.window;

  // Range selectors are objects; anything else is passed to querySelectorAll.
  const isRangeSelector = selector => typeof selector === 'object';

  const helpers = {
    /**
     * Collects clones of everything matched by one selector or an array of selectors.
     *
     * @returns {DocumentFragment|null} The cloned content, or null when nothing matched.
     */
    select(contentSelectors) {
      const fragment = document.createDocumentFragment();
      let matchedSomething = false;

      for (const selector of [].concat(contentSelectors)) {
        if (isRangeSelector(selector)) {
          const rangeContent = this.selectRange(selector).cloneContents();

          if (rangeContent.hasChildNodes()) {
            fragment.appendChild(rangeContent);
            matchedSomething = true;
          }
        } else {
          const matches = document.querySelectorAll(selector);

          if (matches.length > 0) {
            matches.forEach(match => fragment.appendChild(match.cloneNode(true)));
            matchedSomething = true;
          }
        }
      }

      return matchedSomething ? fragment : null;
    },

    /**
     * Deletes everything matched by one selector or an array of selectors.
     *
     * @returns {Document} The document itself.
     */
    remove(insignificantContentSelectors) {
      const rangesToDelete = [];
      const nodesToRemove = [];

      // Every selector is resolved before anything is deleted from the document.
      for (const selector of [].concat(insignificantContentSelectors)) {
        if (isRangeSelector(selector)) {
          rangesToDelete.push(this.selectRange(selector));
        } else {
          nodesToRemove.push(...document.querySelectorAll(selector));
        }
      }

      for (const node of nodesToRemove) {
        node.remove();
      }

      for (const range of rangesToDelete) {
        range.deleteContents();
      }

      return this;
    },

    /**
     * Builds a DOM Range from a range selector object.
     *
     * @returns {Range} The range delimited by the start and end selectors.
     * @throws {Error} When the start or the end selector matches no element.
     */
    selectRange(rangeSelector) {
      const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;

      const range = document.createRange();
      const startNode = document.querySelector(startBefore || startAfter);
      const endNode = document.querySelector(endBefore || endAfter);

      if (!startNode) {
        throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
      }

      if (!endNode) {
        throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
      }

      if (startBefore) {
        range.setStartBefore(startNode);
      } else {
        range.setStartAfter(startNode);
      }

      if (endBefore) {
        range.setEndBefore(endNode);
      } else {
        range.setEndAfter(endNode);
      }

      return range;
    },
  };

  return Object.assign(document, helpers);
}
|
|
package/src/archivist/extract/dom.test.js
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import { expect } from 'chai';
|
|
2
|
+
|
|
3
|
+
import createWebPageDOM from './dom.js';
|
|
4
|
+
|
|
5
|
+
describe('createWebPageDOM', () => {
|
|
6
|
+
const sampleHTML = `
|
|
7
|
+
<!DOCTYPE html>
|
|
8
|
+
<html>
|
|
9
|
+
<head>
|
|
10
|
+
<title>Test Document</title>
|
|
11
|
+
</head>
|
|
12
|
+
<body>
|
|
13
|
+
<header id="header">
|
|
14
|
+
<h1>Main Title</h1>
|
|
15
|
+
<nav class="navigation">
|
|
16
|
+
<a href="/home">Home</a>
|
|
17
|
+
<a href="/about">About</a>
|
|
18
|
+
</nav>
|
|
19
|
+
</header>
|
|
20
|
+
<main>
|
|
21
|
+
<article id="content">
|
|
22
|
+
<p class="introduction">Introduction paragraph</p>
|
|
23
|
+
<p class="central">Central paragraph</p>
|
|
24
|
+
<p class="conclusion">Conclusion paragraph</p>
|
|
25
|
+
</article>
|
|
26
|
+
<aside class="sidebar">
|
|
27
|
+
<div class="widget">Widget content</div>
|
|
28
|
+
</aside>
|
|
29
|
+
</main>
|
|
30
|
+
<footer id="footer">
|
|
31
|
+
<p>Footer content</p>
|
|
32
|
+
</footer>
|
|
33
|
+
</body>
|
|
34
|
+
</html>
|
|
35
|
+
`;
|
|
36
|
+
const location = 'https://example.com/test';
|
|
37
|
+
let document;
|
|
38
|
+
|
|
39
|
+
before(() => {
|
|
40
|
+
document = createWebPageDOM(sampleHTML, location);
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it('creates a DOM document from HTML content', () => {
|
|
44
|
+
expect(document.documentElement.tagName).to.equal('HTML');
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
it('sets the document location', () => {
|
|
48
|
+
expect(document.location.href).to.equal(location);
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
it('provides access to the DOM API', () => {
|
|
52
|
+
const title = document.querySelector('title');
|
|
53
|
+
|
|
54
|
+
expect(title.textContent).to.equal('Test Document');
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
describe('#select', () => {
|
|
58
|
+
it('returns elements using CSS selectors', () => {
|
|
59
|
+
const fragment = document.select('p.introduction');
|
|
60
|
+
const paragraph = fragment.querySelector('p');
|
|
61
|
+
|
|
62
|
+
expect(paragraph.textContent).to.equal('Introduction paragraph');
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
it('returns multiple elements using CSS selectors', () => {
|
|
66
|
+
const fragment = document.select('p');
|
|
67
|
+
const paragraphs = fragment.querySelectorAll('p');
|
|
68
|
+
|
|
69
|
+
expect(paragraphs.length).to.equal(4);
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
it('returns elements using an array of CSS selectors', () => {
|
|
73
|
+
const fragment = document.select([ 'h1', '.introduction' ]);
|
|
74
|
+
const heading = fragment.querySelector('h1');
|
|
75
|
+
const paragraph = fragment.querySelector('p');
|
|
76
|
+
|
|
77
|
+
expect(heading.textContent).to.equal('Main Title');
|
|
78
|
+
expect(paragraph.textContent).to.equal('Introduction paragraph');
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
it('returns content using a range selector object', () => {
|
|
82
|
+
const rangeSelector = {
|
|
83
|
+
startAfter: '.introduction',
|
|
84
|
+
endBefore: '.conclusion',
|
|
85
|
+
};
|
|
86
|
+
const fragment = document.select(rangeSelector);
|
|
87
|
+
const paragraph = fragment.querySelector('p');
|
|
88
|
+
|
|
89
|
+
expect(paragraph.textContent).to.equal('Central paragraph');
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
it('returns null when the selector matches no element', () => {
|
|
93
|
+
const result = document.select('.nonexistent');
|
|
94
|
+
|
|
95
|
+
expect(result).to.be.null;
|
|
96
|
+
});
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
describe('#remove', () => {
|
|
100
|
+
let testDocument;
|
|
101
|
+
|
|
102
|
+
beforeEach(() => {
|
|
103
|
+
testDocument = createWebPageDOM(sampleHTML, location);
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
it('removes elements using CSS selectors', () => {
|
|
107
|
+
testDocument.remove('.sidebar');
|
|
108
|
+
const sidebar = testDocument.querySelector('.sidebar');
|
|
109
|
+
|
|
110
|
+
expect(sidebar).to.be.null;
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
it('removes multiple elements using CSS selectors', () => {
|
|
114
|
+
testDocument.remove('p');
|
|
115
|
+
const paragraphs = testDocument.querySelectorAll('p');
|
|
116
|
+
|
|
117
|
+
expect(paragraphs.length).to.equal(0);
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
it('removes elements using an array of CSS selectors', () => {
|
|
121
|
+
testDocument.remove([ 'nav', '.widget' ]);
|
|
122
|
+
const nav = testDocument.querySelector('nav');
|
|
123
|
+
const widget = testDocument.querySelector('.widget');
|
|
124
|
+
|
|
125
|
+
expect(nav).to.be.null;
|
|
126
|
+
expect(widget).to.be.null;
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
it('removes content using a range selector object', () => {
|
|
130
|
+
const rangeSelector = {
|
|
131
|
+
startAfter: '.introduction',
|
|
132
|
+
endBefore: '.conclusion',
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
testDocument.remove(rangeSelector);
|
|
136
|
+
const bodyParagraph = testDocument.querySelector('.central');
|
|
137
|
+
|
|
138
|
+
expect(bodyParagraph).to.be.null;
|
|
139
|
+
});
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
describe('#selectRange', () => {
|
|
143
|
+
it('creates a range using startAfter and endBefore', () => {
|
|
144
|
+
const rangeSelector = {
|
|
145
|
+
startAfter: '.introduction',
|
|
146
|
+
endBefore: '.conclusion',
|
|
147
|
+
};
|
|
148
|
+
const range = document.selectRange(rangeSelector);
|
|
149
|
+
const fragment = range.cloneContents();
|
|
150
|
+
const paragraph = fragment.querySelector('p');
|
|
151
|
+
|
|
152
|
+
expect(paragraph.textContent).to.equal('Central paragraph');
|
|
153
|
+
});
|
|
154
|
+
|
|
155
|
+
it('creates a range using startBefore and endAfter', () => {
|
|
156
|
+
const rangeSelector = {
|
|
157
|
+
startBefore: '.central',
|
|
158
|
+
endAfter: '.central',
|
|
159
|
+
};
|
|
160
|
+
const range = document.selectRange(rangeSelector);
|
|
161
|
+
const fragment = range.cloneContents();
|
|
162
|
+
const paragraph = fragment.querySelector('p');
|
|
163
|
+
|
|
164
|
+
expect(paragraph.textContent).to.equal('Central paragraph');
|
|
165
|
+
});
|
|
166
|
+
|
|
167
|
+
it('throws a clear error when the startBefore selector has no match', () => {
|
|
168
|
+
const rangeSelector = {
|
|
169
|
+
startBefore: '.nonexistent',
|
|
170
|
+
endBefore: '.conclusion',
|
|
171
|
+
};
|
|
172
|
+
|
|
173
|
+
expect(() => document.selectRange(rangeSelector)).to.throw('"start" selector has no match');
|
|
174
|
+
expect(() => document.selectRange(rangeSelector)).to.throw(JSON.stringify(rangeSelector));
|
|
175
|
+
});
|
|
176
|
+
|
|
177
|
+
it('throws a clear error when the startAfter selector has no match', () => {
|
|
178
|
+
const rangeSelector = {
|
|
179
|
+
startAfter: '.nonexistent',
|
|
180
|
+
endBefore: '.conclusion',
|
|
181
|
+
};
|
|
182
|
+
|
|
183
|
+
expect(() => document.selectRange(rangeSelector)).to.throw('"start" selector has no match');
|
|
184
|
+
expect(() => document.selectRange(rangeSelector)).to.throw(JSON.stringify(rangeSelector));
|
|
185
|
+
});
|
|
186
|
+
|
|
187
|
+
it('throws a clear error when the endBefore selector has no match', () => {
|
|
188
|
+
const rangeSelector = {
|
|
189
|
+
startAfter: '.introduction',
|
|
190
|
+
endBefore: '.nonexistent',
|
|
191
|
+
};
|
|
192
|
+
|
|
193
|
+
expect(() => document.selectRange(rangeSelector)).to.throw('"end" selector has no match');
|
|
194
|
+
expect(() => document.selectRange(rangeSelector)).to.throw(JSON.stringify(rangeSelector));
|
|
195
|
+
});
|
|
196
|
+
|
|
197
|
+
it('throws a clear error when the endAfter selector has no match', () => {
|
|
198
|
+
const rangeSelector = {
|
|
199
|
+
startAfter: '.introduction',
|
|
200
|
+
endAfter: '.nonexistent',
|
|
201
|
+
};
|
|
202
|
+
|
|
203
|
+
expect(() => document.selectRange(rangeSelector)).to.throw('"end" selector has no match');
|
|
204
|
+
expect(() => document.selectRange(rangeSelector)).to.throw(JSON.stringify(rangeSelector));
|
|
205
|
+
});
|
|
206
|
+
});
|
|
207
|
+
});
|
|
package/src/archivist/extract/exposedFilters.js
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * Strips the given query parameters from the URLs of links (`a[href]`) and
 * images (`img[src]`) found in the document.
 *
 * Only elements whose URL actually carries one of the parameters are written
 * back: mutating `URL#searchParams` re-serializes the whole query string, so
 * rewriting unrelated URLs would alter them (for example `%20` becomes `+`).
 *
 * @param {Document} webPageDOM - Document (or any object exposing `querySelectorAll`) to update in place.
 * @param {string|string[]} [paramsToRemove=[]] - Name, or list of names, of the query parameters to remove.
 * @returns {undefined} Nothing; elements are updated in place.
 */
export function removeQueryParams(webPageDOM, paramsToRemove = []) {
  // Accept a single name as shorthand for a one-element list, without reassigning the argument.
  const params = typeof paramsToRemove === 'string' ? [paramsToRemove] : paramsToRemove;

  if (!params.length) {
    return;
  }

  const elements = [
    ...webPageDOM.querySelectorAll('a[href]'),
    ...webPageDOM.querySelectorAll('img[src]'),
  ];

  elements.forEach(element => {
    // Read and write through the same property so the two can never disagree.
    const urlProperty = element.tagName === 'A' ? 'href' : 'src';

    try {
      const url = new URL(element[urlProperty]);

      if (!params.some(param => url.searchParams.has(param))) {
        return; // nothing to strip: leave the original URL untouched
      }

      params.forEach(param => url.searchParams.delete(param));
      element[urlProperty] = url.toString();
    } catch (error) {
      // Deliberate best effort: ignore elements that do not hold a valid URL
    }
  });
}
|
|
package/src/archivist/extract/exposedFilters.test.js
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import { expect } from 'chai';
|
|
2
|
+
|
|
3
|
+
import createWebPageDOM from './dom.js';
|
|
4
|
+
import { removeQueryParams } from './exposedFilters.js';
|
|
5
|
+
|
|
6
|
+
describe('exposedFilters', () => {
|
|
7
|
+
let webPageDOM;
|
|
8
|
+
|
|
9
|
+
before(() => {
|
|
10
|
+
webPageDOM = createWebPageDOM('<!DOCTYPE html><html><body></body></html>');
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
describe('#removeQueryParams', () => {
|
|
14
|
+
describe('from links', () => {
|
|
15
|
+
let link;
|
|
16
|
+
|
|
17
|
+
before(() => {
|
|
18
|
+
link = webPageDOM.createElement('a');
|
|
19
|
+
link.href = 'https://example.com/page?utm_source=test&keep=value';
|
|
20
|
+
webPageDOM.body.appendChild(link);
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
after(() => {
|
|
24
|
+
link.remove();
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
it('removes the specified query parameters', () => {
|
|
28
|
+
removeQueryParams(webPageDOM, ['utm_source']);
|
|
29
|
+
|
|
30
|
+
expect(link.href).to.equal('https://example.com/page?keep=value');
|
|
31
|
+
});
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
describe('from images', () => {
|
|
35
|
+
let img;
|
|
36
|
+
|
|
37
|
+
before(() => {
|
|
38
|
+
img = webPageDOM.createElement('img');
|
|
39
|
+
img.src = 'https://example.com/image.jpg?width=100&keep=value';
|
|
40
|
+
webPageDOM.body.appendChild(img);
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
after(() => {
|
|
44
|
+
img.remove();
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
it('removes the specified query parameters', () => {
|
|
48
|
+
removeQueryParams(webPageDOM, ['width']);
|
|
49
|
+
|
|
50
|
+
expect(img.src).to.equal('https://example.com/image.jpg?keep=value');
|
|
51
|
+
});
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
describe('with string parameter', () => {
|
|
55
|
+
let link;
|
|
56
|
+
|
|
57
|
+
before(() => {
|
|
58
|
+
link = webPageDOM.createElement('a');
|
|
59
|
+
link.href = 'https://example.com/page?utm_source=test&keep=value';
|
|
60
|
+
webPageDOM.body.appendChild(link);
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
after(() => {
|
|
64
|
+
link.remove();
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it('removes a single query parameter passed as string', () => {
|
|
68
|
+
removeQueryParams(webPageDOM, 'utm_source');
|
|
69
|
+
|
|
70
|
+
expect(link.href).to.equal('https://example.com/page?keep=value');
|
|
71
|
+
});
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
describe('with empty parameters', () => {
|
|
75
|
+
let link;
|
|
76
|
+
|
|
77
|
+
before(() => {
|
|
78
|
+
link = webPageDOM.createElement('a');
|
|
79
|
+
link.href = 'https://example.com/page?utm_source=test&keep=value';
|
|
80
|
+
webPageDOM.body.appendChild(link);
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
after(() => {
|
|
84
|
+
link.remove();
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
it('leaves the URL unchanged', () => {
|
|
88
|
+
removeQueryParams(webPageDOM, []);
|
|
89
|
+
|
|
90
|
+
expect(link.href).to.equal('https://example.com/page?utm_source=test&keep=value');
|
|
91
|
+
});
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
describe('with invalid URLs', () => {
|
|
95
|
+
let link;
|
|
96
|
+
|
|
97
|
+
before(() => {
|
|
98
|
+
link = webPageDOM.createElement('a');
|
|
99
|
+
link.href = 'ht^THIS_IS_WRONG^tp://example.com?utm_source=test';
|
|
100
|
+
webPageDOM.body.appendChild(link);
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
after(() => {
|
|
104
|
+
link.remove();
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
it('ignores elements with invalid URLs', () => {
|
|
108
|
+
removeQueryParams(webPageDOM, ['utm_source']);
|
|
109
|
+
|
|
110
|
+
expect(link.href).to.equal('ht^THIS_IS_WRONG^tp://example.com?utm_source=test');
|
|
111
|
+
});
|
|
112
|
+
});
|
|
113
|
+
|
|
114
|
+
describe('with multiple parameters', () => {
|
|
115
|
+
let link;
|
|
116
|
+
|
|
117
|
+
before(() => {
|
|
118
|
+
link = webPageDOM.createElement('a');
|
|
119
|
+
link.href = 'https://example.com/page?utm_source=test&utm_medium=email&keep=value&remove=me';
|
|
120
|
+
webPageDOM.body.appendChild(link);
|
|
121
|
+
});
|
|
122
|
+
|
|
123
|
+
after(() => {
|
|
124
|
+
link.remove();
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
it('removes all specified query parameters', () => {
|
|
128
|
+
removeQueryParams(webPageDOM, [ 'utm_source', 'utm_medium', 'remove' ]);
|
|
129
|
+
|
|
130
|
+
expect(link.href).to.equal('https://example.com/page?keep=value');
|
|
131
|
+
});
|
|
132
|
+
});
|
|
133
|
+
|
|
134
|
+
describe('with duplicate parameters', () => {
|
|
135
|
+
let link;
|
|
136
|
+
|
|
137
|
+
before(() => {
|
|
138
|
+
link = webPageDOM.createElement('a');
|
|
139
|
+
link.href = 'https://example.com/test?utm_source=to_remove_1&keep=true&utm_source=to_remove_2';
|
|
140
|
+
webPageDOM.body.appendChild(link);
|
|
141
|
+
});
|
|
142
|
+
|
|
143
|
+
after(() => {
|
|
144
|
+
link.remove();
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
it('removes all instances of duplicate query parameters', () => {
|
|
148
|
+
removeQueryParams(webPageDOM, ['utm_source']);
|
|
149
|
+
|
|
150
|
+
expect(link.href).to.equal('https://example.com/test?keep=true');
|
|
151
|
+
});
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
describe('textual content preservation', () => {
|
|
155
|
+
let codeElement;
|
|
156
|
+
let paragraphElement;
|
|
157
|
+
let preElement;
|
|
158
|
+
|
|
159
|
+
before(() => {
|
|
160
|
+
codeElement = webPageDOM.createElement('code');
|
|
161
|
+
codeElement.textContent = 'https://example.com/track?utm_source=newsletter&utm_campaign=winter';
|
|
162
|
+
webPageDOM.body.appendChild(codeElement);
|
|
163
|
+
|
|
164
|
+
paragraphElement = webPageDOM.createElement('p');
|
|
165
|
+
paragraphElement.textContent = 'When users click on links with utm_source=email or utm_medium=social, we track their behavior using https://analytics.example.com?utm_source=website&session_id=abc123.';
|
|
166
|
+
webPageDOM.body.appendChild(paragraphElement);
|
|
167
|
+
|
|
168
|
+
preElement = webPageDOM.createElement('pre');
|
|
169
|
+
preElement.textContent = `
|
|
170
|
+
// Example tracking implementation
|
|
171
|
+
const trackingUrl = 'https://tracker.com/pixel?utm_source=app&user_id=123';
|
|
172
|
+
fetch(trackingUrl);
|
|
173
|
+
`;
|
|
174
|
+
webPageDOM.body.appendChild(preElement);
|
|
175
|
+
});
|
|
176
|
+
|
|
177
|
+
after(() => {
|
|
178
|
+
codeElement.remove();
|
|
179
|
+
paragraphElement.remove();
|
|
180
|
+
preElement.remove();
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
it('preserves code element URLs with tracking parameters', () => {
|
|
184
|
+
const originalCodeContent = codeElement.textContent;
|
|
185
|
+
|
|
186
|
+
removeQueryParams(webPageDOM, [ 'utm_source', 'utm_campaign', 'utm_medium', 'session_id', 'user_id' ]);
|
|
187
|
+
|
|
188
|
+
expect(codeElement.textContent).to.equal(originalCodeContent);
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
it('preserves paragraph element URLs with tracking parameters', () => {
|
|
192
|
+
const originalParagraphContent = paragraphElement.textContent;
|
|
193
|
+
|
|
194
|
+
removeQueryParams(webPageDOM, [ 'utm_source', 'utm_campaign', 'utm_medium', 'session_id', 'user_id' ]);
|
|
195
|
+
|
|
196
|
+
expect(paragraphElement.textContent).to.equal(originalParagraphContent);
|
|
197
|
+
});
|
|
198
|
+
|
|
199
|
+
it('preserves preformatted element URLs with tracking parameters', () => {
|
|
200
|
+
const originalPreContent = preElement.textContent;
|
|
201
|
+
|
|
202
|
+
removeQueryParams(webPageDOM, [ 'utm_source', 'utm_campaign', 'utm_medium', 'session_id', 'user_id' ]);
|
|
203
|
+
|
|
204
|
+
expect(preElement.textContent).to.equal(originalPreContent);
|
|
205
|
+
});
|
|
206
|
+
});
|
|
207
|
+
});
|
|
208
|
+
});
|
|
package/src/archivist/extract/filter.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
// CSS selector for anchors that have an `href` attribute, excluding hrefs that start with "#" and empty hrefs.
export const LINKS_TO_CONVERT_SELECTOR = 'a[href]:not([href^="#"]):not([href=""])';
|
|
2
|
+
|
|
3
|
+
/**
 * Applies the filtering pipeline to a web page DOM, in place.
 *
 * The service-specific filters run first and are awaited; the built-in steps
 * then run in a fixed order: relative link conversion, removal of `script`
 * and `style` elements, and email-protected link clean-up.
 *
 * @param {Document} webPageDOM - Document to filter; it is mutated.
 * @param {object} sourceDocument - Source document declaration; its `location` is used as the base URL for links.
 * @returns {Promise<Document>} The same `webPageDOM` instance.
 */
export default async function filter(webPageDOM, sourceDocument) {
  await applyCustomFilters(webPageDOM, sourceDocument);

  // Read the location only once the custom filters have completed.
  const { location: baseURL } = sourceDocument;

  convertRelativeURLsToAbsolute(webPageDOM, baseURL);
  discardNonTextualElements(webPageDOM);
  cleanEmailProtectedLinks(webPageDOM);

  return webPageDOM;
}
|
|
11
|
+
|
|
12
|
+
/**
 * Runs the filter functions of a source document, one after the other and in
 * declaration order, awaiting each before starting the next.
 *
 * Each filter receives the DOM and a context object built from the source document:
 * `fetch` (its location), `select` (content selectors), `remove` (insignificant
 * content selectors) and `filter` (the names of all the filter functions).
 *
 * @param {Document} webPageDOM - Document handed to every filter function.
 * @param {object} sourceDocument - Provides `location`, `contentSelectors`, `insignificantContentSelectors` and `filters`; the last three default to empty arrays.
 * @returns {Promise<void>}
 * @throws {Error} When a filter throws or rejects; the message names the failing filter and the original error is attached as `cause`.
 */
async function applyCustomFilters(webPageDOM, sourceDocument) {
  const { location, contentSelectors = [], insignificantContentSelectors = [], filters: serviceSpecificFilters = [] } = sourceDocument;

  for (const filterFunction of serviceSpecificFilters) {
    try {
      await filterFunction(webPageDOM, {
        fetch: location,
        select: contentSelectors,
        remove: insignificantContentSelectors,
        // A fresh array per call, so a filter mutating its context cannot affect the next one.
        filter: serviceSpecificFilters.map(({ name }) => name),
      });
    } catch (error) {
      // Keep the original error (and its stack) reachable through `cause` instead of only stringifying it.
      throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`, { cause: error });
    }
  }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Rewrites the `href` of every link matched by LINKS_TO_CONVERT_SELECTOR as an
 * absolute URL resolved against `baseURL`.
 *
 * @param {Document} webPageDOM - Document whose links are updated in place.
 * @param {string} baseURL - Base used to resolve each link.
 */
function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
  for (const anchor of Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR))) {
    try {
      const absoluteURL = new URL(anchor.href, baseURL);

      anchor.href = absoluteURL.href;
    } catch (error) {
      // An href that is invalid in the source document cannot be made absolute: keep it untouched
    }
  }
}
|
|
38
|
+
|
|
39
|
+
/**
 * Removes every `script` and `style` element from the document.
 *
 * @param {Document} webPageDOM - Document to clean in place.
 */
function discardNonTextualElements(webPageDOM) {
  const nonTextualElements = webPageDOM.querySelectorAll('script, style');

  for (const element of nonTextualElements) {
    element.remove();
  }
}
|
|
42
|
+
|
|
43
|
+
/**
 * Replaces every anchor whose `href` contains "/email-protection" with a new
 * anchor that copies all of its attributes, except that the `href` loses its
 * fragment (everything from the first "#"), and whose content is a fixed
 * placeholder.
 *
 * NOTE(review): presumably targets obfuscated e-mail links whose fragment and
 * content vary between fetches — confirm.
 *
 * @param {Document} webPageDOM - Document to update in place.
 */
function cleanEmailProtectedLinks(webPageDOM) {
  for (const protectedLink of webPageDOM.querySelectorAll('a[href*="/email-protection"]')) {
    const cleanLink = webPageDOM.createElement('a');
    const hrefWithoutFragment = protectedLink.href.split('#')[0];

    for (const { name, value } of Array.from(protectedLink.attributes)) {
      cleanLink.setAttribute(name, name === 'href' ? hrefWithoutFragment : value);
    }

    cleanLink.innerHTML = '[email protected]';
    protectedLink.parentNode.replaceChild(cleanLink, protectedLink);
  }
}
|