reffy 15.2.1 → 16.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +4 -4
- package/src/browserlib/extract-links.mjs +3 -2
- package/src/lib/mock-server.js +0 -6
- package/src/lib/specs-crawler.js +22 -33
- package/src/lib/throttled-queue.js +129 -0
- package/src/lib/util.js +1 -9
package/README.md
CHANGED
|
@@ -10,7 +10,7 @@ The code features a generic crawler that can fetch Web specifications and genera
|
|
|
10
10
|
|
|
11
11
|
### Pre-requisites
|
|
12
12
|
|
|
13
|
-
To install Reffy, you need [Node.js](https://nodejs.org/en/)
|
|
13
|
+
To install Reffy, you need [Node.js](https://nodejs.org/en/) 20.12.1 or greater (the crawler itself may still run with earlier versions of Node.js but without any guarantee).
|
|
14
14
|
|
|
15
15
|
### Installation
|
|
16
16
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "reffy",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "16.0.1",
|
|
4
4
|
"description": "W3C/WHATWG spec dependencies exploration companion. Features a short set of tools to study spec references as well as WebIDL term definitions and references found in W3C specifications.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -27,12 +27,12 @@
|
|
|
27
27
|
],
|
|
28
28
|
"license": "MIT",
|
|
29
29
|
"engines": {
|
|
30
|
-
"node": ">=
|
|
30
|
+
"node": ">=20.12.1"
|
|
31
31
|
},
|
|
32
32
|
"main": "index.js",
|
|
33
33
|
"bin": "./reffy.js",
|
|
34
34
|
"dependencies": {
|
|
35
|
-
"ajv": "8.
|
|
35
|
+
"ajv": "8.16.0",
|
|
36
36
|
"ajv-formats": "3.0.1",
|
|
37
37
|
"commander": "12.1.0",
|
|
38
38
|
"fetch-filecache-for-crawling": "5.1.1",
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
},
|
|
44
44
|
"devDependencies": {
|
|
45
45
|
"mocha": "10.4.0",
|
|
46
|
-
"respec": "35.0
|
|
46
|
+
"respec": "35.1.0",
|
|
47
47
|
"respec-hljs": "2.1.1",
|
|
48
48
|
"rollup": "4.18.0",
|
|
49
49
|
"undici": "^6.1.0"
|
|
@@ -27,8 +27,9 @@ export default function () {
|
|
|
27
27
|
// carry their diff (e.g. W3C Recs with candidate corrections)
|
|
28
28
|
if (n.closest('.head, del')) return;
|
|
29
29
|
const pageUrl = n.href.split('#')[0];
|
|
30
|
-
// links generated by authoring tools have data-link-type set
|
|
31
|
-
|
|
30
|
+
// links generated by authoring tools have data-link-type or data-xref-type set
|
|
31
|
+
// Bikeshed also adds automatic untyped links in the generated index ("ul.index aside")
|
|
32
|
+
let linkSet = n.dataset.linkType || n.dataset.xrefType || n.closest("ul.index aside") ? autolinks : rawlinks;
|
|
32
33
|
if (!linkSet[pageUrl]) {
|
|
33
34
|
linkSet[pageUrl] = {anchors: new Set()};
|
|
34
35
|
}
|
package/src/lib/mock-server.js
CHANGED
|
@@ -117,12 +117,6 @@ mockAgent
|
|
|
117
117
|
.reply(200, '')
|
|
118
118
|
.persist();
|
|
119
119
|
|
|
120
|
-
mockAgent
|
|
121
|
-
.get("https://www.w3.org")
|
|
122
|
-
.intercept({ method: "GET", path: "/StyleSheets/TR/2021/base.css" })
|
|
123
|
-
.reply(200, '')
|
|
124
|
-
.persist();
|
|
125
|
-
|
|
126
120
|
mockAgent
|
|
127
121
|
.get("https://www.w3.org")
|
|
128
122
|
.intercept({ method: "GET", path: "/Tools/respec/respec-highlight" })
|
package/src/lib/specs-crawler.js
CHANGED
|
@@ -16,6 +16,7 @@ const specs = require('web-specs');
|
|
|
16
16
|
const inspect = require('util').inspect;
|
|
17
17
|
const cssDfnParser = require('./css-grammar-parser');
|
|
18
18
|
const postProcessor = require('./post-processor');
|
|
19
|
+
const ThrottledQueue = require('./throttled-queue');
|
|
19
20
|
const {
|
|
20
21
|
completeWithAlternativeUrls,
|
|
21
22
|
expandBrowserModules,
|
|
@@ -31,6 +32,7 @@ const {
|
|
|
31
32
|
|
|
32
33
|
const {version: reffyVersion} = require('../../package.json');
|
|
33
34
|
|
|
35
|
+
|
|
34
36
|
/**
|
|
35
37
|
* Return the spec if crawl succeeded or crawl result from given fallback list
|
|
36
38
|
* if crawl yielded an error (and fallback does exist).
|
|
@@ -333,50 +335,37 @@ async function crawlList(speclist, crawlOptions) {
|
|
|
333
335
|
list = list.filter(spec => !!spec.release);
|
|
334
336
|
}
|
|
335
337
|
|
|
336
|
-
const
|
|
337
|
-
|
|
338
|
-
let reject = null;
|
|
339
|
-
let readyToCrawl = new Promise((resolveFunction, rejectFunction) => {
|
|
340
|
-
resolve = resolveFunction;
|
|
341
|
-
reject = rejectFunction;
|
|
342
|
-
});
|
|
343
|
-
return { spec, readyToCrawl, resolve, reject };
|
|
344
|
-
});
|
|
345
|
-
|
|
346
|
-
// In debug mode, specs are processed one by one. In normal mode,
|
|
347
|
-
// specs are processing in chunks
|
|
348
|
-
const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);
|
|
349
|
-
|
|
350
|
-
let pos = 0;
|
|
351
|
-
function flagNextSpecAsReadyToCrawl() {
|
|
352
|
-
if (pos < listAndPromise.length) {
|
|
353
|
-
listAndPromise[pos].resolve();
|
|
354
|
-
pos += 1;
|
|
355
|
-
}
|
|
356
|
-
}
|
|
357
|
-
for (let i = 0; i < chunkSize; i++) {
|
|
358
|
-
flagNextSpecAsReadyToCrawl();
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
const nbStr = '' + listAndPromise.length;
|
|
362
|
-
async function crawlSpecAndPromise(specAndPromise, idx) {
|
|
363
|
-
await specAndPromise.readyToCrawl;
|
|
364
|
-
const spec = specAndPromise.spec;
|
|
338
|
+
const nbStr = '' + list.length;
|
|
339
|
+
async function processSpec(spec, idx) {
|
|
365
340
|
const logCounter = ('' + (idx + 1)).padStart(nbStr.length, ' ') + '/' + nbStr;
|
|
366
341
|
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - crawling`);
|
|
367
342
|
let result = await crawlSpec(spec, crawlOptions);
|
|
368
343
|
result = await saveSpecResults(result, crawlOptions);
|
|
369
344
|
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - done`);
|
|
370
|
-
flagNextSpecAsReadyToCrawl();
|
|
371
|
-
|
|
372
345
|
return result;
|
|
373
346
|
}
|
|
374
347
|
|
|
375
|
-
const
|
|
348
|
+
const crawlQueue = new ThrottledQueue({
|
|
349
|
+
maxParallel: 4,
|
|
350
|
+
sleepInterval: origin => {
|
|
351
|
+
switch (origin) {
|
|
352
|
+
case 'https://csswg.org': return 2000;
|
|
353
|
+
case 'https://www.w3.org': return 1000;
|
|
354
|
+
default: return 100;
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
});
|
|
358
|
+
const results = await Promise.all(list.map((spec, idx) => {
|
|
359
|
+
const versionToCrawl = crawlOptions.publishedVersion ?
|
|
360
|
+
(spec.release ? spec.release : spec.nightly) :
|
|
361
|
+
spec.nightly;
|
|
362
|
+
const urlToCrawl = versionToCrawl?.url;
|
|
363
|
+
return crawlQueue.runThrottledPerOrigin(urlToCrawl, processSpec, spec, idx);
|
|
364
|
+
}));
|
|
376
365
|
|
|
377
366
|
// Close Puppeteer instance
|
|
378
367
|
if (!crawlOptions.useCrawl) {
|
|
379
|
-
teardownBrowser();
|
|
368
|
+
await teardownBrowser();
|
|
380
369
|
}
|
|
381
370
|
|
|
382
371
|
return results;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helper function to sleep for a specified number of milliseconds
|
|
3
|
+
*/
|
|
4
|
+
function sleep(ms) {
|
|
5
|
+
return new Promise(resolve => setTimeout(resolve, ms, 'slept'));
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Helper function that returns the "origin" of a URL, defined in a loose way
|
|
11
|
+
* as the part of the true origin that identifies the server that's going to
|
|
12
|
+
* serve the resource.
|
|
13
|
+
*
|
|
14
|
+
* For example "github.io" for all specs under github.io, "whatwg.org" for
|
|
15
|
+
* all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
|
|
16
|
+
* and FXTF specs since they are served by the same server).
|
|
17
|
+
*/
|
|
18
|
+
function getOrigin(url) {
|
|
19
|
+
if (!url) {
|
|
20
|
+
return '';
|
|
21
|
+
}
|
|
22
|
+
const origin = (new URL(url)).origin;
|
|
23
|
+
if (origin.endsWith('.whatwg.org')) {
|
|
24
|
+
return 'https://whatwg.org';
|
|
25
|
+
}
|
|
26
|
+
else if (origin.endsWith('.github.io')) {
|
|
27
|
+
return 'https://github.io';
|
|
28
|
+
}
|
|
29
|
+
else if (origin.endsWith('.csswg.org') ||
|
|
30
|
+
origin.endsWith('.css-houdini.org') ||
|
|
31
|
+
origin.endsWith('.fxtf.org')) {
|
|
32
|
+
return 'https://csswg.org';
|
|
33
|
+
}
|
|
34
|
+
else {
|
|
35
|
+
return origin;
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* The ThrottledQueue class can be used to run a series of tasks that send
|
|
42
|
+
* network requests to an origin server in parallel, up to a certain limit,
|
|
43
|
+
* while guaranteeing that only one request will be sent to a given origin
|
|
44
|
+
* server at a time.
|
|
45
|
+
*/
|
|
46
|
+
module.exports = class ThrottledQueue {
|
|
47
|
+
originQueue = {};
|
|
48
|
+
maxParallel = 4;
|
|
49
|
+
sleepInterval = 2000;
|
|
50
|
+
ongoing = 0;
|
|
51
|
+
pending = [];
|
|
52
|
+
|
|
53
|
+
constructor(options = { maxParallel: 4, sleepInterval: 2000 }) {
|
|
54
|
+
if (options.maxParallel >= 0) {
|
|
55
|
+
this.maxParallel = options.maxParallel;
|
|
56
|
+
}
|
|
57
|
+
if (options.sleepInterval) {
|
|
58
|
+
this.sleepInterval = options.sleepInterval;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Run the given processing function with the given parameters, immediately
|
|
64
|
+
* if possible or as soon as possible when too many tasks are already running
|
|
65
|
+
* in parallel.
|
|
66
|
+
*
|
|
67
|
+
* Note this function has no notion of origin. Users may call the function
|
|
68
|
+
* directly if they don't need any throttling per origin.
|
|
69
|
+
*/
|
|
70
|
+
async runThrottled(processFunction, ...params) {
|
|
71
|
+
if (this.ongoing >= this.maxParallel) {
|
|
72
|
+
return new Promise((resolve, reject) => {
|
|
73
|
+
this.pending.push({ params, resolve, reject });
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
else {
|
|
77
|
+
this.ongoing += 1;
|
|
78
|
+
const result = await processFunction.call(null, ...params);
|
|
79
|
+
this.ongoing -= 1;
|
|
80
|
+
|
|
81
|
+
// Done with current task, trigger next pending task in the background
|
|
82
|
+
setTimeout(_ => {
|
|
83
|
+
if (this.pending.length && this.ongoing < this.maxParallel) {
|
|
84
|
+
const next = this.pending.shift();
|
|
85
|
+
this.runThrottled(processFunction, ...next.params)
|
|
86
|
+
.then(result => next.resolve(result))
|
|
87
|
+
.catch(err => next.reject(err));
|
|
88
|
+
}
|
|
89
|
+
}, 0);
|
|
90
|
+
|
|
91
|
+
return result;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Run the given processing function with the given parameters, immediately
|
|
97
|
+
* if possible or as soon as possible when too many tasks are already running
|
|
98
|
+
* in parallel, or when there's already a task being run against the same
|
|
99
|
+
* origin as that of the provided URL.
|
|
100
|
+
*
|
|
101
|
+
* Said differently, the function serializes tasks per origin, and calls
|
|
102
|
+
* "runThrottled" to restrict the number of tasks that run in parallel to the
|
|
103
|
+
* requested maximum.
|
|
104
|
+
*
|
|
105
|
+
* Additionally, the function forces a 2 second sleep after processing to
|
|
106
|
+
* keep a low network profile (sleeping time can be adjusted per origin
|
|
107
|
+
 * depending on whether the sleepInterval parameter that was passed to the
|
|
108
|
+
 * constructor is a function).
|
|
109
|
+
*/
|
|
110
|
+
async runThrottledPerOrigin(url, processFunction, ...params) {
|
|
111
|
+
const origin = getOrigin(url);
|
|
112
|
+
if (!this.originQueue[origin]) {
|
|
113
|
+
this.originQueue[origin] = Promise.resolve(true);
|
|
114
|
+
}
|
|
115
|
+
return new Promise((resolve, reject) => {
|
|
116
|
+
this.originQueue[origin] = this.originQueue[origin]
|
|
117
|
+
.then(async _ => this.runThrottled(processFunction, ...params))
|
|
118
|
+
.then(async result => {
|
|
119
|
+
const interval = (typeof this.sleepInterval === 'function') ?
|
|
120
|
+
this.sleepInterval(origin) :
|
|
121
|
+
this.sleepInterval;
|
|
122
|
+
await sleep(interval);
|
|
123
|
+
return result;
|
|
124
|
+
})
|
|
125
|
+
.then(resolve)
|
|
126
|
+
.catch(reject);
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
}
|
package/src/lib/util.js
CHANGED
|
@@ -292,14 +292,6 @@ async function teardownBrowser() {
|
|
|
292
292
|
* done loading), and that does not work with a file cache approach either.
|
|
293
293
|
* These requests get intercepted.
|
|
294
294
|
*
|
|
295
|
-
* A couple of additional notes:
|
|
296
|
-
* - Requests to CSS stylesheets are not intercepted because Respec dynamically
|
|
297
|
-
* loads a few CSS resources, and intercepting them could perhaps impact the
|
|
298
|
-
* rest of the generation.
|
|
299
|
-
* - SVG images are not intercepted because a couple of specs have a PNG
|
|
300
|
-
* fallback mechanism that, when interception is on, make the browser spin
|
|
301
|
-
* forever, see discussion in: https://github.com/w3c/accelerometer/pull/55
|
|
302
|
-
*
|
|
303
295
|
* Strictly speaking, intercepting request is only needed to be able to use the
|
|
304
296
|
* "networkidle0" option. The whole interception logic could be dropped (and
|
|
305
297
|
* "networkidle2" could be used instead) if it proves too unstable.
|
|
@@ -345,7 +337,7 @@ async function processSpecification(spec, processFunction, args, options) {
|
|
|
345
337
|
return async function ({ requestId, request }) {
|
|
346
338
|
try {
|
|
347
339
|
// Abort network requests to common image formats
|
|
348
|
-
if (/\.(gif|ico|jpg|jpeg|png|ttf|woff)$/i.test(request.url)) {
|
|
340
|
+
if (/\.(gif|ico|jpg|jpeg|png|ttf|woff|svg|css)$/i.test(request.url)) {
|
|
349
341
|
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
|
|
350
342
|
return;
|
|
351
343
|
}
|