reffy 15.2.1 → 16.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +4 -4
- package/src/browserlib/extract-links.mjs +3 -2
- package/src/lib/mock-server.js +0 -6
- package/src/lib/specs-crawler.js +22 -33
- package/src/lib/throttled-queue.js +129 -0
- package/src/lib/util.js +1 -9
package/README.md
CHANGED
|
@@ -10,7 +10,7 @@ The code features a generic crawler that can fetch Web specifications and genera
|
|
|
10
10
|
|
|
11
11
|
### Pre-requisites
|
|
12
12
|
|
|
13
|
-
To install Reffy, you need [Node.js](https://nodejs.org/en/)
|
|
13
|
+
To install Reffy, you need [Node.js](https://nodejs.org/en/) 20.12.1 or greater (the crawler itself may still run with earlier versions of Node.js but without any guarantee).
|
|
14
14
|
|
|
15
15
|
### Installation
|
|
16
16
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "reffy",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "16.0.1",
|
|
4
4
|
"description": "W3C/WHATWG spec dependencies exploration companion. Features a short set of tools to study spec references as well as WebIDL term definitions and references found in W3C specifications.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -27,12 +27,12 @@
|
|
|
27
27
|
],
|
|
28
28
|
"license": "MIT",
|
|
29
29
|
"engines": {
|
|
30
|
-
"node": ">=
|
|
30
|
+
"node": ">=20.12.1"
|
|
31
31
|
},
|
|
32
32
|
"main": "index.js",
|
|
33
33
|
"bin": "./reffy.js",
|
|
34
34
|
"dependencies": {
|
|
35
|
-
"ajv": "8.
|
|
35
|
+
"ajv": "8.16.0",
|
|
36
36
|
"ajv-formats": "3.0.1",
|
|
37
37
|
"commander": "12.1.0",
|
|
38
38
|
"fetch-filecache-for-crawling": "5.1.1",
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
},
|
|
44
44
|
"devDependencies": {
|
|
45
45
|
"mocha": "10.4.0",
|
|
46
|
-
"respec": "35.0
|
|
46
|
+
"respec": "35.1.0",
|
|
47
47
|
"respec-hljs": "2.1.1",
|
|
48
48
|
"rollup": "4.18.0",
|
|
49
49
|
"undici": "^6.1.0"
|
|
@@ -27,8 +27,9 @@ export default function () {
|
|
|
27
27
|
// carry their diff (e.g. W3C Recs with candidate corrections)
|
|
28
28
|
if (n.closest('.head, del')) return;
|
|
29
29
|
const pageUrl = n.href.split('#')[0];
|
|
30
|
-
// links generated by authoring tools have data-link-type set
|
|
31
|
-
|
|
30
|
+
// links generated by authoring tools have data-link-type or data-xref-type set
|
|
31
|
+
// Bikeshed also adds automatic untyped links in the generated index ("ul.index aside")
|
|
32
|
+
let linkSet = n.dataset.linkType || n.dataset.xrefType || n.closest("ul.index aside") ? autolinks : rawlinks;
|
|
32
33
|
if (!linkSet[pageUrl]) {
|
|
33
34
|
linkSet[pageUrl] = {anchors: new Set()};
|
|
34
35
|
}
|
package/src/lib/mock-server.js
CHANGED
|
@@ -117,12 +117,6 @@ mockAgent
|
|
|
117
117
|
.reply(200, '')
|
|
118
118
|
.persist();
|
|
119
119
|
|
|
120
|
-
mockAgent
|
|
121
|
-
.get("https://www.w3.org")
|
|
122
|
-
.intercept({ method: "GET", path: "/StyleSheets/TR/2021/base.css" })
|
|
123
|
-
.reply(200, '')
|
|
124
|
-
.persist();
|
|
125
|
-
|
|
126
120
|
mockAgent
|
|
127
121
|
.get("https://www.w3.org")
|
|
128
122
|
.intercept({ method: "GET", path: "/Tools/respec/respec-highlight" })
|
package/src/lib/specs-crawler.js
CHANGED
|
@@ -16,6 +16,7 @@ const specs = require('web-specs');
|
|
|
16
16
|
const inspect = require('util').inspect;
|
|
17
17
|
const cssDfnParser = require('./css-grammar-parser');
|
|
18
18
|
const postProcessor = require('./post-processor');
|
|
19
|
+
const ThrottledQueue = require('./throttled-queue');
|
|
19
20
|
const {
|
|
20
21
|
completeWithAlternativeUrls,
|
|
21
22
|
expandBrowserModules,
|
|
@@ -31,6 +32,7 @@ const {
|
|
|
31
32
|
|
|
32
33
|
const {version: reffyVersion} = require('../../package.json');
|
|
33
34
|
|
|
35
|
+
|
|
34
36
|
/**
|
|
35
37
|
* Return the spec if crawl succeeded or crawl result from given fallback list
|
|
36
38
|
* if crawl yielded an error (and fallback does exist).
|
|
@@ -333,50 +335,37 @@ async function crawlList(speclist, crawlOptions) {
|
|
|
333
335
|
list = list.filter(spec => !!spec.release);
|
|
334
336
|
}
|
|
335
337
|
|
|
336
|
-
const
|
|
337
|
-
|
|
338
|
-
let reject = null;
|
|
339
|
-
let readyToCrawl = new Promise((resolveFunction, rejectFunction) => {
|
|
340
|
-
resolve = resolveFunction;
|
|
341
|
-
reject = rejectFunction;
|
|
342
|
-
});
|
|
343
|
-
return { spec, readyToCrawl, resolve, reject };
|
|
344
|
-
});
|
|
345
|
-
|
|
346
|
-
// In debug mode, specs are processed one by one. In normal mode,
|
|
347
|
-
// specs are processing in chunks
|
|
348
|
-
const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);
|
|
349
|
-
|
|
350
|
-
let pos = 0;
|
|
351
|
-
function flagNextSpecAsReadyToCrawl() {
|
|
352
|
-
if (pos < listAndPromise.length) {
|
|
353
|
-
listAndPromise[pos].resolve();
|
|
354
|
-
pos += 1;
|
|
355
|
-
}
|
|
356
|
-
}
|
|
357
|
-
for (let i = 0; i < chunkSize; i++) {
|
|
358
|
-
flagNextSpecAsReadyToCrawl();
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
const nbStr = '' + listAndPromise.length;
|
|
362
|
-
async function crawlSpecAndPromise(specAndPromise, idx) {
|
|
363
|
-
await specAndPromise.readyToCrawl;
|
|
364
|
-
const spec = specAndPromise.spec;
|
|
338
|
+
const nbStr = '' + list.length;
|
|
339
|
+
async function processSpec(spec, idx) {
|
|
365
340
|
const logCounter = ('' + (idx + 1)).padStart(nbStr.length, ' ') + '/' + nbStr;
|
|
366
341
|
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - crawling`);
|
|
367
342
|
let result = await crawlSpec(spec, crawlOptions);
|
|
368
343
|
result = await saveSpecResults(result, crawlOptions);
|
|
369
344
|
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - done`);
|
|
370
|
-
flagNextSpecAsReadyToCrawl();
|
|
371
|
-
|
|
372
345
|
return result;
|
|
373
346
|
}
|
|
374
347
|
|
|
375
|
-
const
|
|
348
|
+
const crawlQueue = new ThrottledQueue({
|
|
349
|
+
maxParallel: 4,
|
|
350
|
+
sleepInterval: origin => {
|
|
351
|
+
switch (origin) {
|
|
352
|
+
case 'https://csswg.org': return 2000;
|
|
353
|
+
case 'https://www.w3.org': return 1000;
|
|
354
|
+
default: return 100;
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
});
|
|
358
|
+
const results = await Promise.all(list.map((spec, idx) => {
|
|
359
|
+
const versionToCrawl = crawlOptions.publishedVersion ?
|
|
360
|
+
(spec.release ? spec.release : spec.nightly) :
|
|
361
|
+
spec.nightly;
|
|
362
|
+
const urlToCrawl = versionToCrawl?.url;
|
|
363
|
+
return crawlQueue.runThrottledPerOrigin(urlToCrawl, processSpec, spec, idx);
|
|
364
|
+
}));
|
|
376
365
|
|
|
377
366
|
// Close Puppeteer instance
|
|
378
367
|
if (!crawlOptions.useCrawl) {
|
|
379
|
-
teardownBrowser();
|
|
368
|
+
await teardownBrowser();
|
|
380
369
|
}
|
|
381
370
|
|
|
382
371
|
return results;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helper function to sleep for a specified number of milliseconds
|
|
3
|
+
*/
|
|
4
|
+
function sleep(ms) {
|
|
5
|
+
return new Promise(resolve => setTimeout(resolve, ms, 'slept'));
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Helper function that returns the "origin" of a URL, defined in a loose way
|
|
11
|
+
* as the part of the true origin that identifies the server that's going to
|
|
12
|
+
* serve the resource.
|
|
13
|
+
*
|
|
14
|
+
* For example "github.io" for all specs under github.io, "whatwg.org" for
|
|
15
|
+
* all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
|
|
16
|
+
* and FXTF specs since they are served by the same server).
|
|
17
|
+
*/
|
|
18
|
+
function getOrigin(url) {
|
|
19
|
+
if (!url) {
|
|
20
|
+
return '';
|
|
21
|
+
}
|
|
22
|
+
const origin = (new URL(url)).origin;
|
|
23
|
+
if (origin.endsWith('.whatwg.org')) {
|
|
24
|
+
return 'https://whatwg.org';
|
|
25
|
+
}
|
|
26
|
+
else if (origin.endsWith('.github.io')) {
|
|
27
|
+
return 'https://github.io';
|
|
28
|
+
}
|
|
29
|
+
else if (origin.endsWith('.csswg.org') ||
|
|
30
|
+
origin.endsWith('.css-houdini.org') ||
|
|
31
|
+
origin.endsWith('.fxtf.org')) {
|
|
32
|
+
return 'https://csswg.org';
|
|
33
|
+
}
|
|
34
|
+
else {
|
|
35
|
+
return origin;
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* The ThrottledQueue class can be used to run a series of tasks that send
|
|
42
|
+
* network requests to an origin server in parallel, up to a certain limit,
|
|
43
|
+
* while guaranteeing that only one request will be sent to a given origin
|
|
44
|
+
* server at a time.
|
|
45
|
+
*/
|
|
46
|
+
module.exports = class ThrottledQueue {
|
|
47
|
+
originQueue = {};
|
|
48
|
+
maxParallel = 4;
|
|
49
|
+
sleepInterval = 2000;
|
|
50
|
+
ongoing = 0;
|
|
51
|
+
pending = [];
|
|
52
|
+
|
|
53
|
+
constructor(options = { maxParallel: 4, sleepInterval: 2000 }) {
|
|
54
|
+
if (options.maxParallel >= 0) {
|
|
55
|
+
this.maxParallel = options.maxParallel;
|
|
56
|
+
}
|
|
57
|
+
if (options.sleepInterval) {
|
|
58
|
+
this.sleepInterval = options.sleepInterval;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Run the given processing function with the given parameters, immediately
|
|
64
|
+
* if possible or as soon as possible when too many tasks are already running
|
|
65
|
+
* in parallel.
|
|
66
|
+
*
|
|
67
|
+
* Note this function has no notion of origin. Users may call the function
|
|
68
|
+
* directly if they don't need any throttling per origin.
|
|
69
|
+
*/
|
|
70
|
+
async runThrottled(processFunction, ...params) {
|
|
71
|
+
if (this.ongoing >= this.maxParallel) {
|
|
72
|
+
return new Promise((resolve, reject) => {
|
|
73
|
+
this.pending.push({ params, resolve, reject });
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
else {
|
|
77
|
+
this.ongoing += 1;
|
|
78
|
+
const result = await processFunction.call(null, ...params);
|
|
79
|
+
this.ongoing -= 1;
|
|
80
|
+
|
|
81
|
+
// Done with current task, trigger next pending task in the background
|
|
82
|
+
setTimeout(_ => {
|
|
83
|
+
if (this.pending.length && this.ongoing < this.maxParallel) {
|
|
84
|
+
const next = this.pending.shift();
|
|
85
|
+
this.runThrottled(processFunction, ...next.params)
|
|
86
|
+
.then(result => next.resolve(result))
|
|
87
|
+
.catch(err => next.reject(err));
|
|
88
|
+
}
|
|
89
|
+
}, 0);
|
|
90
|
+
|
|
91
|
+
return result;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Run the given processing function with the given parameters, immediately
|
|
97
|
+
* if possible or as soon as possible when too many tasks are already running
|
|
98
|
+
* in parallel, or when there's already a task being run against the same
|
|
99
|
+
* origin as that of the provided URL.
|
|
100
|
+
*
|
|
101
|
+
* Said differently, the function serializes tasks per origin, and calls
|
|
102
|
+
* "runThrottled" to restrict the number of tasks that run in parallel to the
|
|
103
|
+
* requested maximum.
|
|
104
|
+
*
|
|
105
|
+
* Additionally, the function forces a 2 second sleep after processing to
|
|
106
|
+
* keep a low network profile (sleeping time can be adjusted per origin
|
|
107
|
+
 * depending on whether the sleepInterval parameter that was passed to the
|
|
108
|
+
 * constructor is a function).
|
|
109
|
+
*/
|
|
110
|
+
async runThrottledPerOrigin(url, processFunction, ...params) {
|
|
111
|
+
const origin = getOrigin(url);
|
|
112
|
+
if (!this.originQueue[origin]) {
|
|
113
|
+
this.originQueue[origin] = Promise.resolve(true);
|
|
114
|
+
}
|
|
115
|
+
return new Promise((resolve, reject) => {
|
|
116
|
+
this.originQueue[origin] = this.originQueue[origin]
|
|
117
|
+
.then(async _ => this.runThrottled(processFunction, ...params))
|
|
118
|
+
.then(async result => {
|
|
119
|
+
const interval = (typeof this.sleepInterval === 'function') ?
|
|
120
|
+
this.sleepInterval(origin) :
|
|
121
|
+
this.sleepInterval;
|
|
122
|
+
await sleep(interval);
|
|
123
|
+
return result;
|
|
124
|
+
})
|
|
125
|
+
.then(resolve)
|
|
126
|
+
.catch(reject);
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
}
|
package/src/lib/util.js
CHANGED
|
@@ -292,14 +292,6 @@ async function teardownBrowser() {
|
|
|
292
292
|
* done loading), and that does not work with a file cache approach either.
|
|
293
293
|
* These requests get intercepted.
|
|
294
294
|
*
|
|
295
|
-
* A couple of additional notes:
|
|
296
|
-
* - Requests to CSS stylesheets are not intercepted because Respec dynamically
|
|
297
|
-
* loads a few CSS resources, and intercepting them could perhaps impact the
|
|
298
|
-
* rest of the generation.
|
|
299
|
-
* - SVG images are not intercepted because a couple of specs have a PNG
|
|
300
|
-
* fallback mechanism that, when interception is on, make the browser spin
|
|
301
|
-
* forever, see discussion in: https://github.com/w3c/accelerometer/pull/55
|
|
302
|
-
*
|
|
303
295
|
* Strictly speaking, intercepting request is only needed to be able to use the
|
|
304
296
|
* "networkidle0" option. The whole interception logic could be dropped (and
|
|
305
297
|
* "networkidle2" could be used instead) if it proves too unstable.
|
|
@@ -345,7 +337,7 @@ async function processSpecification(spec, processFunction, args, options) {
|
|
|
345
337
|
return async function ({ requestId, request }) {
|
|
346
338
|
try {
|
|
347
339
|
// Abort network requests to common image formats
|
|
348
|
-
if (/\.(gif|ico|jpg|jpeg|png|ttf|woff)$/i.test(request.url)) {
|
|
340
|
+
if (/\.(gif|ico|jpg|jpeg|png|ttf|woff|svg|css)$/i.test(request.url)) {
|
|
349
341
|
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
|
|
350
342
|
return;
|
|
351
343
|
}
|