reffy 15.2.1 → 16.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,7 +10,7 @@ The code features a generic crawler that can fetch Web specifications and genera
10
10
 
11
11
  ### Pre-requisites
12
12
 
13
- To install Reffy, you need [Node.js](https://nodejs.org/en/) 18 or greater (the crawler itself may still run with earlier versions of Node.js, but version 18 is needed to run tests).
13
+ To install Reffy, you need [Node.js](https://nodejs.org/en/) 20.12.1 or greater (the crawler itself may still run with earlier versions of Node.js but without any guarantee).
14
14
 
15
15
  ### Installation
16
16
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "reffy",
3
- "version": "15.2.1",
3
+ "version": "16.0.1",
4
4
  "description": "W3C/WHATWG spec dependencies exploration companion. Features a short set of tools to study spec references as well as WebIDL term definitions and references found in W3C specifications.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -27,12 +27,12 @@
27
27
  ],
28
28
  "license": "MIT",
29
29
  "engines": {
30
- "node": ">=18"
30
+ "node": ">=20.12.1"
31
31
  },
32
32
  "main": "index.js",
33
33
  "bin": "./reffy.js",
34
34
  "dependencies": {
35
- "ajv": "8.15.0",
35
+ "ajv": "8.16.0",
36
36
  "ajv-formats": "3.0.1",
37
37
  "commander": "12.1.0",
38
38
  "fetch-filecache-for-crawling": "5.1.1",
@@ -43,7 +43,7 @@
43
43
  },
44
44
  "devDependencies": {
45
45
  "mocha": "10.4.0",
46
- "respec": "35.0.2",
46
+ "respec": "35.1.0",
47
47
  "respec-hljs": "2.1.1",
48
48
  "rollup": "4.18.0",
49
49
  "undici": "^6.1.0"
@@ -27,8 +27,9 @@ export default function () {
27
27
  // carry their diff (e.g. W3C Recs with candidate corrections)
28
28
  if (n.closest('.head, del')) return;
29
29
  const pageUrl = n.href.split('#')[0];
30
- // links generated by authoring tools have data-link-type set
31
- let linkSet = n.dataset.linkType ? autolinks : rawlinks;
30
+ // links generated by authoring tools have data-link-type or data-xref-type set
31
+ // Bikeshed also adds automatic untyped links in the generated index ("ul.index aside")
32
+ let linkSet = n.dataset.linkType || n.dataset.xrefType || n.closest("ul.index aside") ? autolinks : rawlinks;
32
33
  if (!linkSet[pageUrl]) {
33
34
  linkSet[pageUrl] = {anchors: new Set()};
34
35
  }
@@ -117,12 +117,6 @@ mockAgent
117
117
  .reply(200, '')
118
118
  .persist();
119
119
 
120
- mockAgent
121
- .get("https://www.w3.org")
122
- .intercept({ method: "GET", path: "/StyleSheets/TR/2021/base.css" })
123
- .reply(200, '')
124
- .persist();
125
-
126
120
  mockAgent
127
121
  .get("https://www.w3.org")
128
122
  .intercept({ method: "GET", path: "/Tools/respec/respec-highlight" })
@@ -16,6 +16,7 @@ const specs = require('web-specs');
16
16
  const inspect = require('util').inspect;
17
17
  const cssDfnParser = require('./css-grammar-parser');
18
18
  const postProcessor = require('./post-processor');
19
+ const ThrottledQueue = require('./throttled-queue');
19
20
  const {
20
21
  completeWithAlternativeUrls,
21
22
  expandBrowserModules,
@@ -31,6 +32,7 @@ const {
31
32
 
32
33
  const {version: reffyVersion} = require('../../package.json');
33
34
 
35
+
34
36
  /**
35
37
  * Return the spec if crawl succeeded or crawl result from given fallback list
36
38
  * if crawl yielded an error (and fallback does exist).
@@ -333,50 +335,37 @@ async function crawlList(speclist, crawlOptions) {
333
335
  list = list.filter(spec => !!spec.release);
334
336
  }
335
337
 
336
- const listAndPromise = list.map(spec => {
337
- let resolve = null;
338
- let reject = null;
339
- let readyToCrawl = new Promise((resolveFunction, rejectFunction) => {
340
- resolve = resolveFunction;
341
- reject = rejectFunction;
342
- });
343
- return { spec, readyToCrawl, resolve, reject };
344
- });
345
-
346
- // In debug mode, specs are processed one by one. In normal mode,
347
- // specs are processing in chunks
348
- const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);
349
-
350
- let pos = 0;
351
- function flagNextSpecAsReadyToCrawl() {
352
- if (pos < listAndPromise.length) {
353
- listAndPromise[pos].resolve();
354
- pos += 1;
355
- }
356
- }
357
- for (let i = 0; i < chunkSize; i++) {
358
- flagNextSpecAsReadyToCrawl();
359
- }
360
-
361
- const nbStr = '' + listAndPromise.length;
362
- async function crawlSpecAndPromise(specAndPromise, idx) {
363
- await specAndPromise.readyToCrawl;
364
- const spec = specAndPromise.spec;
338
+ const nbStr = '' + list.length;
339
+ async function processSpec(spec, idx) {
365
340
  const logCounter = ('' + (idx + 1)).padStart(nbStr.length, ' ') + '/' + nbStr;
366
341
  crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - crawling`);
367
342
  let result = await crawlSpec(spec, crawlOptions);
368
343
  result = await saveSpecResults(result, crawlOptions);
369
344
  crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - done`);
370
- flagNextSpecAsReadyToCrawl();
371
-
372
345
  return result;
373
346
  }
374
347
 
375
- const results = await Promise.all(listAndPromise.map(crawlSpecAndPromise));
348
+ const crawlQueue = new ThrottledQueue({
349
+ maxParallel: 4,
350
+ sleepInterval: origin => {
351
+ switch (origin) {
352
+ case 'https://csswg.org': return 2000;
353
+ case 'https://www.w3.org': return 1000;
354
+ default: return 100;
355
+ }
356
+ }
357
+ });
358
+ const results = await Promise.all(list.map((spec, idx) => {
359
+ const versionToCrawl = crawlOptions.publishedVersion ?
360
+ (spec.release ? spec.release : spec.nightly) :
361
+ spec.nightly;
362
+ const urlToCrawl = versionToCrawl?.url;
363
+ return crawlQueue.runThrottledPerOrigin(urlToCrawl, processSpec, spec, idx);
364
+ }));
376
365
 
377
366
  // Close Puppeteer instance
378
367
  if (!crawlOptions.useCrawl) {
379
- teardownBrowser();
368
+ await teardownBrowser();
380
369
  }
381
370
 
382
371
  return results;
@@ -0,0 +1,129 @@
1
+ /**
2
+ * Helper function to sleep for a specified number of milliseconds
3
+ */
4
+ function sleep(ms) {
5
+ return new Promise(resolve => setTimeout(resolve, ms, 'slept'));
6
+ }
7
+
8
+
9
+ /**
10
+ * Helper function that returns the "origin" of a URL, defined in a loose way
11
+ * as the part of the true origin that identifies the server that's going to
12
+ * serve the resource.
13
+ *
14
+ * For example "github.io" for all specs under github.io, "whatwg.org" for
15
+ * all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
16
+ * and FXTF specs since they are served by the same server).
17
+ */
18
+ function getOrigin(url) {
19
+ if (!url) {
20
+ return '';
21
+ }
22
+ const origin = (new URL(url)).origin;
23
+ if (origin.endsWith('.whatwg.org')) {
24
+ return 'https://whatwg.org';
25
+ }
26
+ else if (origin.endsWith('.github.io')) {
27
+ return 'https://github.io';
28
+ }
29
+ else if (origin.endsWith('.csswg.org') ||
30
+ origin.endsWith('.css-houdini.org') ||
31
+ origin.endsWith('.fxtf.org')) {
32
+ return 'https://csswg.org';
33
+ }
34
+ else {
35
+ return origin;
36
+ }
37
+ }
38
+
39
+
40
+ /**
41
+ * The ThrottledQueue class can be used to run a series of tasks that send
42
+ * network requests to an origin server in parallel, up to a certain limit,
43
+ * while guaranteeing that only one request will be sent to a given origin
44
+ * server at a time.
45
+ */
46
+ module.exports = class ThrottledQueue {
47
+ originQueue = {};
48
+ maxParallel = 4;
49
+ sleepInterval = 2000;
50
+ ongoing = 0;
51
+ pending = [];
52
+
53
+ constructor(options = { maxParallel: 4, sleepInterval: 2000 }) {
54
+ if (options.maxParallel >= 0) {
55
+ this.maxParallel = options.maxParallel;
56
+ }
57
+ if (options.sleepInterval) {
58
+ this.sleepInterval = options.sleepInterval;
59
+ }
60
+ }
61
+
62
+ /**
63
+ * Run the given processing function with the given parameters, immediately
64
+ * if possible or as soon as possible when too many tasks are already running
65
+ * in parallel.
66
+ *
67
+ * Note this function has no notion of origin. Users may call the function
68
+ * directly if they don't need any throttling per origin.
69
+ */
70
+ async runThrottled(processFunction, ...params) {
71
+ if (this.ongoing >= this.maxParallel) {
72
+ return new Promise((resolve, reject) => {
73
+ this.pending.push({ params, resolve, reject });
74
+ });
75
+ }
76
+ else {
77
+ this.ongoing += 1;
78
+ const result = await processFunction.call(null, ...params);
79
+ this.ongoing -= 1;
80
+
81
+ // Done with current task, trigger next pending task in the background
82
+ setTimeout(_ => {
83
+ if (this.pending.length && this.ongoing < this.maxParallel) {
84
+ const next = this.pending.shift();
85
+ this.runThrottled(processFunction, ...next.params)
86
+ .then(result => next.resolve(result))
87
+ .catch(err => next.reject(err));
88
+ }
89
+ }, 0);
90
+
91
+ return result;
92
+ }
93
+ }
94
+
95
+ /**
96
+ * Run the given processing function with the given parameters, immediately
97
+ * if possible or as soon as possible when too many tasks are already running
98
+ * in parallel, or when there's already a task being run against the same
99
+ * origin as that of the provided URL.
100
+ *
101
+ * Said differently, the function serializes tasks per origin, and calls
102
+ * "runThrottled" to restrict the number of tasks that run in parallel to the
103
+ * requested maximum.
104
+ *
105
+ * Additionally, the function forces a 2 second sleep after processing to
106
+ * keep a low network profile (sleeping time can be adjusted per origin
107
+ * depending on whether the sleepInterval parameter that was passed to the
108
+ * constructor is a function).
109
+ */
110
+ async runThrottledPerOrigin(url, processFunction, ...params) {
111
+ const origin = getOrigin(url);
112
+ if (!this.originQueue[origin]) {
113
+ this.originQueue[origin] = Promise.resolve(true);
114
+ }
115
+ return new Promise((resolve, reject) => {
116
+ this.originQueue[origin] = this.originQueue[origin]
117
+ .then(async _ => this.runThrottled(processFunction, ...params))
118
+ .then(async result => {
119
+ const interval = (typeof this.sleepInterval === 'function') ?
120
+ this.sleepInterval(origin) :
121
+ this.sleepInterval;
122
+ await sleep(interval);
123
+ return result;
124
+ })
125
+ .then(resolve)
126
+ .catch(reject);
127
+ });
128
+ }
129
+ }
package/src/lib/util.js CHANGED
@@ -292,14 +292,6 @@ async function teardownBrowser() {
292
292
  * done loading), and that does not work with a file cache approach either.
293
293
  * These requests get intercepted.
294
294
  *
295
- * A couple of additional notes:
296
- * - Requests to CSS stylesheets are not intercepted because Respec dynamically
297
- * loads a few CSS resources, and intercepting them could perhaps impact the
298
- * rest of the generation.
299
- * - SVG images are not intercepted because a couple of specs have a PNG
300
- * fallback mechanism that, when interception is on, make the browser spin
301
- * forever, see discussion in: https://github.com/w3c/accelerometer/pull/55
302
- *
303
295
  * Strictly speaking, intercepting request is only needed to be able to use the
304
296
  * "networkidle0" option. The whole interception logic could be dropped (and
305
297
  * "networkidle2" could be used instead) if it proves too unstable.
@@ -345,7 +337,7 @@ async function processSpecification(spec, processFunction, args, options) {
345
337
  return async function ({ requestId, request }) {
346
338
  try {
347
339
  // Abort network requests to common image formats
348
- if (/\.(gif|ico|jpg|jpeg|png|ttf|woff)$/i.test(request.url)) {
340
+ if (/\.(gif|ico|jpg|jpeg|png|ttf|woff|svg|css)$/i.test(request.url)) {
349
341
  await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
350
342
  return;
351
343
  }