reffy 6.1.4 → 6.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/LICENSE +21 -21
  2. package/README.md +158 -158
  3. package/index.js +11 -11
  4. package/package.json +53 -53
  5. package/reffy.js +248 -236
  6. package/src/browserlib/canonicalize-url.mjs +50 -50
  7. package/src/browserlib/create-outline.mjs +352 -352
  8. package/src/browserlib/extract-cssdfn.mjs +319 -319
  9. package/src/browserlib/extract-dfns.mjs +686 -686
  10. package/src/browserlib/extract-elements.mjs +205 -205
  11. package/src/browserlib/extract-headings.mjs +48 -48
  12. package/src/browserlib/extract-ids.mjs +28 -28
  13. package/src/browserlib/extract-links.mjs +28 -28
  14. package/src/browserlib/extract-references.mjs +203 -203
  15. package/src/browserlib/extract-webidl.mjs +134 -134
  16. package/src/browserlib/get-absolute-url.mjs +21 -21
  17. package/src/browserlib/get-generator.mjs +26 -26
  18. package/src/browserlib/get-lastmodified-date.mjs +13 -13
  19. package/src/browserlib/get-title.mjs +11 -11
  20. package/src/browserlib/informative-selector.mjs +16 -16
  21. package/src/browserlib/map-ids-to-headings.mjs +136 -136
  22. package/src/browserlib/reffy.json +53 -53
  23. package/src/cli/check-missing-dfns.js +609 -609
  24. package/src/cli/generate-idlnames.js +430 -430
  25. package/src/cli/generate-idlparsed.js +139 -139
  26. package/src/cli/merge-crawl-results.js +128 -128
  27. package/src/cli/parse-webidl.js +430 -430
  28. package/src/lib/css-grammar-parse-tree.schema.json +109 -109
  29. package/src/lib/css-grammar-parser.js +440 -440
  30. package/src/lib/fetch.js +56 -56
  31. package/src/lib/nock-server.js +127 -112
  32. package/src/lib/specs-crawler.js +622 -552
  33. package/src/lib/util.js +944 -865
  34. package/src/specs/missing-css-rules.json +197 -197
  35. package/src/specs/spec-equivalents.json +149 -149
  36. package/src/browserlib/extract-editors.mjs~ +0 -14
  37. package/src/browserlib/generate-es-dfn-report.sh~ +0 -4
  38. package/src/cli/csstree-grammar-check.js +0 -28
  39. package/src/cli/csstree-grammar-check.js~ +0 -10
  40. package/src/cli/csstree-grammar-parser.js +0 -11
  41. package/src/cli/csstree-grammar-parser.js~ +0 -1
  42. package/src/cli/extract-editors.js~ +0 -38
  43. package/src/cli/process-specs.js~ +0 -28
package/src/lib/util.js CHANGED
@@ -1,865 +1,944 @@
1
- /**
2
- * A bunch of utility functions common to multiple scripts
3
- */
4
-
5
- const fs = require('fs').promises;
6
- const { existsSync } = require('fs');
7
- const path = require('path');
8
- const puppeteer = require('puppeteer');
9
- const crypto = require('crypto');
10
- const { AbortController } = require('abortcontroller-polyfill/dist/cjs-ponyfill');
11
- const fetch = require('./fetch');
12
- const specEquivalents = require('../specs/spec-equivalents.json');
13
-
14
- const reffyModules = require('../browserlib/reffy.json');
15
-
16
-
17
- /**
18
- * Maximum depth difference supported between Reffy's install path and custom
19
- * modules that may be provided on the command-line
20
- *
21
- * TODO: Find a way to get rid of that, there should be no limit
22
- */
23
- const maxPathDepth = 20;
24
-
25
-
26
/**
 * Build the array of consecutive integers that starts at 0 and stops right
 * before the number provided: [0, 1, ..., n - 1]
 */
const range = n => [...Array(n).keys()];
30
-
31
-
32
/**
 * Create a function that reads the given property from whatever object it
 * receives (handy as a callback for "map" and friends)
 */
const prop = p => {
  return function extractProperty(x) {
    return x[p];
  };
};
36
-
37
-
38
/**
 * Load a file through "require", resolving its path against the current
 * working directory (CWD) rather than against the folder of this JS file.
 *
 * This is typically needed to "require" JSON config files whose path was
 * provided as a command-line argument.
 *
 * @function
 * @param {String} filename The path to the file to require
 * @return {Object} The result of requiring the file relative to the current
 *   working directory, or null when requiring the file throws.
 */
function requireFromWorkingDirectory(filename) {
  let loaded = null;
  try {
    const absolutePath = path.resolve(filename);
    loaded = require(absolutePath);
  }
  catch (err) {
    // Best effort on purpose: whatever the failure, report "nothing loaded"
    loaded = null;
  }
  return loaded;
}
59
-
60
-
61
/**
 * Compute the folder under which dependencies (e.g. "webidl2") are looked up
 * when the ES6 browser lib modules import them through relative links.
 *
 * The root folder is the one two levels above this file. If it contains a
 * "node_modules" folder, that folder is returned. Otherwise, the parent of the
 * root folder is returned (which is where sibling packages live when Reffy is
 * itself installed as a library).
 *
 * @function
 * @return {String} Path to the node_modules folder.
 */
function getModulesFolder() {
  const rootFolder = path.resolve(__dirname, '../..');
  const localModules = path.resolve(rootFolder, 'node_modules');
  return existsSync(localModules) ?
    localModules :
    path.resolve(rootFolder, '..');
}
const modulesFolder = getModulesFolder();
79
-
80
-
81
- /**
82
- * Puppeteer browser instance used to load and process specifications
83
- */
84
- let browser = null;
85
-
86
- /**
87
- * Promise resolved when there is no running instance of Puppeteer. This makes
88
- * it possible to serialize calls to setupBrowser (and thus to crawlList and
89
- * crawlSpecs in specs-crawler.js)
90
- */
91
- let browserClosed = Promise.resolve();
92
- let resolveBrowserClosed = null;
93
-
94
- /**
95
- * The browser JS library that will be loaded onto every crawled page
96
- */
97
- let browserlib = null;
98
-
99
-
100
/**
 * Turn a user-provided list of browser modules into a flat list of module
 * descriptions that all carry the same set of properties.
 *
 * Each entry in the list may be:
 * - the "core" keyword, which expands to all modules listed in reffy.json
 * - a name that matches the href, the camelCase name or the property of one of
 *   the modules listed in reffy.json
 * - a relative path to an .mjs file
 * - an object with an "href" property that is a relative path to an .mjs file
 *   (the object is completed in place and returned as such)
 * - an object already flagged as "expanded", returned untouched
 *
 * Relative paths provided by the user are interpreted as relative to the
 * current working directory, and converted to be relative to the browserlib
 * directory (using "/" as separator).
 *
 * When no list is provided, the modules listed in reffy.json are returned.
 *
 * @function
 * @public
 * @param {Array(String|Object)} modules Modules to expand
 * @return {Array(Object)} List of modules with an href, name and property keys
 * @throws {Error} When a name does not match any known module, or when an
 *   object has no "href" property
 */
function expandBrowserModules(modules) {
  // Derive a camelCase name from a module path: the ".mjs" extension is
  // dropped, the rest is split on "-", non-word characters are removed from
  // each part and all parts but the first get capitalized. Processing stops at
  // the first empty part.
  function getCamelCaseName(href) {
    const stem = href.replace(/([^\/\\]+)\.mjs$/, '$1');
    let camelCased;
    for (const rawPart of stem.split('-')) {
      if (!rawPart) {
        break;
      }
      const part = rawPart.replace(/\W/g, '');
      camelCased = camelCased ?
        camelCased + part.substring(0, 1).toUpperCase() + part.substring(1) :
        part;
    }
    return camelCased;
  }

  // Complete the description of a module listed in reffy.json (properties
  // set in reffy.json take precedence over computed ones)
  function describeKnownModule(known) {
    return {
      name: getCamelCaseName(known.href),
      expanded: true,
      ...known
    };
  }

  // Convert a path relative to the current working directory into a path
  // relative to the browserlib folder
  const browserlibPath = path.resolve(__dirname, '..', 'browserlib');
  function toBrowserlibHref(userPath) {
    const absolutePath = path.join(process.cwd(), userPath);
    return path.relative(browserlibPath, absolutePath).replace(/\\/g, '/');
  }

  if (!modules) {
    return reffyModules.map(describeKnownModule);
  }

  return modules.flatMap(mod => {
    if (typeof mod !== 'string') {
      if (mod.expanded) {
        return mod;
      }
      if (!mod.href) {
        throw new Error('Browserlib module does not have an "href" property');
      }
      mod.href = toBrowserlibHref(mod.href);
      if (!mod.name) {
        mod.name = getCamelCaseName(mod.href);
      }
      if (!mod.property) {
        mod.property = mod.name;
      }
      mod.expanded = true;
      return mod;
    }

    if (mod.endsWith('.mjs')) {
      const name = getCamelCaseName(mod);
      return {
        href: toBrowserlibHref(mod),
        name,
        property: name,
        expanded: true
      };
    }

    if (mod === 'core') {
      return reffyModules.map(describeKnownModule);
    }

    const known = reffyModules.find(candidate =>
      candidate.href === mod ||
      getCamelCaseName(candidate.href) === mod ||
      candidate.property === mod);
    if (!known) {
      throw new Error(`Unknown browserlib module ${mod}`);
    }
    return describeKnownModule(known);
  });
}
194
-
195
-
196
/**
 * Generate the source of the browserlib script that gets loaded in every
 * crawled page, and store it in the module-level "browserlib" variable.
 *
 * The generated script is an ES6 module that creates a global "reffy"
 * namespace (unless it already exists), and that imports and exposes each of
 * the requested modules under that namespace. The "mapIdsToHeadings" helper
 * is also exposed when at least one module has a "needsIdToHeadingMap" flag.
 *
 * The function must be called before any attempt to call `processSpecification`
 * and should only be called once. The `setupBrowser` function takes care of it.
 *
 * @function
 * @private
 * @param {Array(String|Object)} modules Modules to expose, in any of the forms
 *   that `expandBrowserModules` supports
 */
function setupBrowserlib(modules) {
  const expandedModules = expandBrowserModules(modules);
  browserlib = 'window.reffy = window.reffy ?? {};\n';

  const needsHeadingMap = expandedModules.some(mod => mod.needsIdToHeadingMap);
  if (needsHeadingMap) {
    browserlib += `
import mapIdsToHeadings from './map-ids-to-headings.mjs';
window.reffy.mapIdsToHeadings = mapIdsToHeadings;\n`;
  }

  const importStatements = expandedModules.map(mod => `
import ${mod.name} from '${mod.href}';
window.reffy.${mod.name} = ${mod.name};
`);
  browserlib += importStatements.join('\n');
}
222
-
223
-
224
/**
 * Setup and launch browser instance to use to load and process specifications.
 *
 * The function must be called before any attempt to call `processSpecification`
 * and should only be called once per crawl. A call waits until the browser
 * instance of the previous crawl, if any, has been closed through
 * `teardownBrowser`.
 *
 * The function also generates the code that will inject the `reffy` namespace
 * in each processed page.
 *
 * If the browser cannot be launched or if the list of modules is invalid, the
 * function closes the browser it may have launched, releases the "one crawl at
 * a time" lock and re-throws the error. Without that, the lock would never be
 * released and all further calls to `setupBrowser` would wait forever.
 *
 * Note: Switch `headless` to `false` to access dev tools and debug processing
 *
 * @function
 * @public
 * @param {Array(String|Object)} modules Browser modules to expose in crawled
 *   pages, in any of the forms that `expandBrowserModules` supports
 * @throws Any error raised while launching the browser or preparing the
 *   browserlib script
 */
async function setupBrowser(modules) {
  // There can be only one crawl running at a time
  await browserClosed;
  browserClosed = new Promise(resolve => resolveBrowserClosed = resolve);

  let launched = null;
  try {
    // Create browser instance
    // Note: switch "headless" to "false" (and comment out the call to
    // "browser.close()") to access dev tools in debug mode
    launched = await puppeteer.launch({ headless: true });
    browser = launched;
    setupBrowserlib(modules);
  }
  catch (err) {
    if (launched) {
      browser = null;
      try {
        await launched.close();
      }
      catch (closeErr) {
        // Ignored on purpose: the setup error is the one worth reporting
      }
    }

    // Release the lock so that next call to setupBrowser may proceed
    resolveBrowserClosed();
    resolveBrowserClosed = null;
    throw err;
  }
}
249
-
250
-
251
/**
 * Close and destroy browser instance.
 *
 * The function should be called once at the end of the processing. It does
 * nothing when there is no browser instance.
 *
 * The "one crawl at a time" lock taken by `setupBrowser` gets released even
 * when closing the browser fails (the error is still reported to the caller),
 * so that a failure to close cannot block all further crawls.
 *
 * @function
 * @public
 */
async function teardownBrowser() {
  const instance = browser;
  if (!instance) {
    return;
  }

  try {
    await instance.close();
  }
  finally {
    // Only clean up if no overlapping call to teardownBrowser already did,
    // not to release a lock that a new crawl may have taken in the meantime
    if (browser === instance) {
      browser = null;
      if (resolveBrowserClosed) {
        resolveBrowserClosed();
        resolveBrowserClosed = null;
      }
    }
  }
}
267
-
268
-
269
/**
 * Load and process the given specification.
 *
 * The method automatically exposes Reffy's library functions in a window.reffy
 * namespace (see setupBrowserlib) so that the callback function can
 * call them directly. Additional callback arguments that would need to be
 * passed to the browser context can be provided through the "args" parameter.
 *
 * A crawl will typically fetch and render hundreds of specs, triggering a lot
 * of network requests. Given that some of these requests (e.g. those on images)
 * are of no interest for the processing, that it is wasteful to fetch the same
 * resource again and again during a crawl, and that it is useful to have an
 * offline mode for debugging purpose, the method will intercept network
 * requests made by the browser, fail those that don't seem needed, and serve
 * requests on resources that have already been fetched from a local file cache
 * (the "cacheRefresh" setting in "config.json" allows to adjust this behavior).
 *
 * This triggers a few hiccups and needs for workarounds though:
 * - Puppeteer's page.setRequestInterception does not play nicely with workers
 * (which Respec typically uses) for the time being, so code uses the Chrome
 * DevTools Protocol (CDP) directly, see:
 * https://github.com/puppeteer/puppeteer/issues/4208
 * - Tampering with network requests means that the loaded page gets
 * automatically flagged as "non secure". That's mostly fine but means that
 * "window.crypto.subtle" is not available and Respec needs that to generate
 * hashes. The code re-creates that method manually.
 * - A few specs send HTTP requests that return "streams". This does not work
 * well with Puppeteer's "networkidle0" option (to detect when a spec is mostly
 * done loading), and that does not work with a file cache approach either.
 * These requests get intercepted.
 *
 * A couple of additional notes:
 * - Requests to CSS stylesheets are not intercepted because Respec dynamically
 * loads a few CSS resources, and intercepting them could perhaps impact the
 * rest of the generation.
 * - SVG images are not intercepted because a couple of specs have a PNG
 * fallback mechanism that, when interception is on, make the browser spin
 * forever, see discussion in: https://github.com/w3c/accelerometer/pull/55
 *
 * Strictly speaking, intercepting request is only needed to be able to use the
 * "networkidle0" option. The whole interception logic could be dropped (and
 * "networkidle2" could be used instead) if it proves too unstable.
 *
 * The page created to load the spec is closed when the function returns,
 * including when processing fails.
 *
 * @function
 * @public
 * @param {Object|String} spec The spec to load. Must either be a URL string or
 * an object with a "url" property. If the object contains an "html" property,
 * the HTML content is loaded instead. If the object contains a "pages"
 * property, the body of each of these pages gets appended to the main page.
 * @param {function} processFunction Processing function that will be evaluated
 * in the browser context where the spec gets loaded
 * @param {Array} args List of arguments to pass to the processing function.
 * These arguments typically make it possible to pass contextual information
 * to the processing function (such as the spec object that describes the
 * spec being processed, or the list of processing modules to run)
 * @param {Object} options Processing options. The "quiet" flag, when truthy,
 * tells the function not to report warnings to the console. The
 * "forceLocalFetch" flag tells the function that all network requests need to
 * be only handled by Node.js's "fetch" function (as opposed to falling back to
 * Puppeteer's network and caching logic), which is useful to keep full control
 * of network requests in tests.
 * @return {Promise} The promise to get the results of the processing function
 * @throws {Error} When `setupBrowser` has not been called beforehand
 */
async function processSpecification(spec, processFunction, args, options) {
  spec = (typeof spec === 'string') ? { url: spec } : spec;
  processFunction = processFunction || function () {};
  args = args || [];
  options = options || {};

  if (!browser) {
    throw new Error('Browser instance not initialized, setupBrowser() must be called before processSpecification().');
  }

  // Report a warning to the console unless caller asked for silence.
  // A truthiness test is needed: `options.quiet ?? console.warn(...)` only
  // logs when "quiet" is null or undefined, and would thus also silence
  // warnings when "quiet" is explicitly set to false.
  function warn(...messageParts) {
    if (!options.quiet) {
      console.warn(...messageParts);
    }
  }

  // Create an abort controller for network requests directly handled by the
  // Node.js code (and not by Puppeteer)
  const abortController = new AbortController();

  // Inner function that returns a network interception method suitable for
  // a given CDP session.
  function interceptRequest(cdp, controller) {
    return async function ({ requestId, request }) {
      try {
        // Abort network requests to common image and font formats
        if (/\.(gif|ico|jpg|jpeg|png|ttf|woff)$/i.test(request.url)) {
          await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
          return;
        }

        // Abort network requests that return a "stream", they won't
        // play well with Puppeteer's "networkidle0" option, and our
        // custom "fetch" function does not handle streams in any case
        if (request.url.startsWith('https://drafts.csswg.org/api/drafts/') ||
            request.url.startsWith('https://drafts.css-houdini.org/api/drafts/') ||
            request.url.startsWith('https://drafts.fxtf.org/api/drafts/') ||
            request.url.startsWith('https://api.csswg.org/shepherd/')) {
          await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
          return;
        }

        // The request needs to be intercepted, either because it
        // targets one of the local script files, or because we would
        // like to use our local cache to avoid sending network requests
        // when possible.
        const reffyPath = '/reffy/scripts/';
        const webidl2Path = '/node_modules/webidl2/';
        if (request.url.includes(reffyPath) || request.url.includes(webidl2Path)) {
          let body;
          if (request.url.endsWith('reffy.mjs')) {
            body = Buffer.from(browserlib);
          }
          else if (request.url.includes(webidl2Path)) {
            const file = path.resolve(modulesFolder, 'webidl2',
              request.url.substring(request.url.indexOf(webidl2Path) + webidl2Path.length));
            body = await fs.readFile(file);
          }
          else {
            // The "__" folders are just a means to resolve
            // relative paths that are higher than the "browserlib"
            // folder on the storage drive
            const requestPath = request.url.substring(request.url.indexOf(reffyPath) + reffyPath.length);
            let depth = requestPath.lastIndexOf('__/') / 3;
            const filename = requestPath.substring(requestPath.lastIndexOf('__/') + 3);
            let filePath = path.resolve(__dirname, '..', 'browserlib');
            while (depth < maxPathDepth - 1) {
              filePath = path.resolve(filePath, '..');
              depth += 1;
            }
            const file = path.resolve(filePath, filename);
            body = await fs.readFile(file);
          }
          await cdp.send('Fetch.fulfillRequest', {
            requestId,
            responseCode: 200,
            responseHeaders: [{ name: 'Content-Type', value: 'application/javascript' }],
            body: body.toString('base64')
          });
        }
        else {
          // Only GET requests on HTTP(S) resources go through our own
          // "fetch" function, the browser handles the rest
          if ((request.method !== 'GET') ||
              (!request.url.startsWith('http:') && !request.url.startsWith('https:'))) {
            await cdp.send('Fetch.continueRequest', { requestId });
            return;
          }

          const response = await fetch(request.url, { signal: controller.signal });
          const body = await response.buffer();
          await cdp.send('Fetch.fulfillRequest', {
            requestId,
            responseCode: response.status,
            responseHeaders: Object.keys(response.headers.raw()).map(header => {
              return {
                name: header,
                value: response.headers.raw()[header].join(',')
              };
            }),
            body: body.toString('base64')
          });
        }
      }
      catch (err) {
        if (controller.signal.aborted) {
          // All is normal, processing was over, page and CDP session
          // have been closed, and network requests have been aborted
          return;
        }

        // Fetch from file cache failed somehow
        // Let Puppeteer handle the request as fallback unless
        // calling function asked us not to do that
        if (options.forceLocalFetch) {
          warn(`[warn] Network request for ${request.url} failed`, err);
          await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
        }
        else {
          warn(`[warn] Fall back to regular network request for ${request.url}`, err);
          try {
            await cdp.send('Fetch.continueRequest', { requestId });
          }
          catch (fallbackErr) {
            if (!controller.signal.aborted) {
              warn(`[warn] Fall back to regular network request for ${request.url} failed`, fallbackErr);
            }
          }
        }
      }
    };
  }

  // Declared outside of the "try" block so that the "finally" block may close
  // the page when processing fails
  let page = null;
  try {
    page = await browser.newPage();

    // Disable cache if caller wants to handle all network requests
    await page.setCacheEnabled(!options.forceLocalFetch);

    // Intercept all network requests to use our own version of "fetch"
    // that makes use of the local file cache.
    const cdp = await page.target().createCDPSession();
    await cdp.send('Fetch.enable');
    cdp.on('Fetch.requestPaused', interceptRequest(cdp, abortController));

    // Quick and dirty workaround to re-create the
    // "window.crypto.subtle.digest" function that Respec needs (context is
    // seen as insecure because we're tampering with network requests)
    // NOTE(review): the replacement always computes a SHA-1 hash, whatever the
    // requested algorithm is. Confirm that this is all Respec needs.
    await page.exposeFunction('hashdigest', (algorithm, buffer) => {
      return crypto.createHash(algorithm).update(Buffer.from(Object.values(buffer))).digest();
    });
    await page.evaluateOnNewDocument(() => {
      window.crypto.subtle = {
        digest: function (algorithm, buffer) {
          const res = window.hashdigest('sha1', buffer);
          return res.then(buf => {
            return Uint8Array.from(buf.data);
          });
        }
      };
    });

    // Common loading option to give the browser enough time to load large
    // specs, and to consider navigation done when there haven't been
    // network connections in the past 500ms. This should be enough to
    // handle "redirection" through JS or meta refresh (which would not
    // have time to run if we used "load").
    const loadOptions = {
      timeout: 120000,
      waitUntil: 'networkidle0'
    };

    // Load the page
    if (spec.html) {
      await page.setContent(spec.html, loadOptions);
    }
    else {
      await page.goto(spec.url, loadOptions);
    }

    // Handle multi-page specs: load each page in turn, and append its body
    // (converted to a section) to the main page
    const pageUrls = spec.pages || [];

    if (pageUrls.length > 0) {
      const pages = [];
      for (const url of pageUrls) {
        const subAbort = new AbortController();
        const subPage = await browser.newPage();
        await subPage.setCacheEnabled(!options.forceLocalFetch);
        const subCdp = await subPage.target().createCDPSession();
        await subCdp.send('Fetch.enable');
        subCdp.on('Fetch.requestPaused', interceptRequest(subCdp, subAbort));
        try {
          await subPage.goto(url, loadOptions);
          const html = await subPage.evaluate(() => {
            return document.body.outerHTML
              .replace(/<body/, '<section')
              .replace(/<\/body/, '</section');
          });
          pages.push({ url, html });
        }
        finally {
          subAbort.abort();
          try {
            await subCdp.detach();
          }
          finally {
            // Close the page even if the CDP session could not be detached
            await subPage.close();
          }
        }
      }
      await page.evaluate(pages => {
        for (const subPage of pages) {
          const section = document.createElement('section');
          section.setAttribute('data-reffy-page', subPage.url);
          section.innerHTML = subPage.html;
          document.body.appendChild(section);
        }
      }, pages);
    }

    // Wait until the generation of the spec is completely over
    await page.evaluate(async () => {
      const usesRespec = (window.respecConfig || window.eval('typeof respecConfig !== "undefined"')) &&
        window.document.head.querySelector("script[src*='respec']");

      function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
      }

      // Poll every second (for about a minute at most) until Respec exposes
      // its "ready" promise, then wait for that promise
      async function isReady(counter) {
        counter = counter || 0;
        if (counter > 60) {
          throw new Error('Respec generation took too long');
        }
        if (window.document.respec?.ready) {
          await window.document.respec.ready;
        }
        else if (usesRespec) {
          await sleep(1000);
          await isReady(counter + 1);
        }
      }

      await isReady();
    });

    // Capture and report Reffy's browserlib warnings
    page.on('console', msg => {
      const text = msg.text();
      if (text.startsWith('[reffy] ')) {
        warn(spec.url, `[${msg.type()}]`, text.substring('[reffy] '.length));
      }
    });

    // Capture and report when page throws an error
    page.on('pageerror', err => {
      warn(err);
    });

    // Expose additional functions defined in src/browserlib/ to the
    // browser context, under a window.reffy namespace, so that processing
    // script may call them. The script is an ES6 module and needs to be
    // loaded as such.
    // Note that we're using a fake relative URL on purpose. In practice,
    // the request will be processed by "interceptRequest", which will
    // respond with the contents of the script file. Also, there are
    // multiple path levels in that fake URL on purpose as well, because
    // scripts import the WebIDL2.js library with a URL like
    // "../../node_modules/[...]" and may import other scripts that are
    // higher in the folder tree.
    await page.addScriptTag({
      url: `reffy/scripts/${range(maxPathDepth).map(n => '__').join('/')}/reffy.mjs`,
      type: 'module'
    });

    // Run the processFunction method in the browser context
    const results = await page.evaluate(processFunction, ...args);

    // Pending network requests may still be in the queue, flag the page
    // as closed not to send commands on a CDP session that's no longer
    // attached to anything
    abortController.abort();

    // Close CDP session and page
    // Note that gets done no matter what when browser.close() gets called.
    await cdp.detach();
    await page.close();

    return results;
  }
  finally {
    // Signal abortion again (in case an exception was thrown)
    abortController.abort();

    // Close the page if processing failed before it could be closed, not to
    // accumulate open pages in the browser during a crawl
    if (page && !page.isClosed()) {
      try {
        await page.close();
      }
      catch (closeErr) {
        // Ignored on purpose: the error that interrupted processing is the
        // one that needs to reach the caller
      }
    }
  }
}
618
-
619
-
620
/**
 * Enrich the spec description with alternative URLs (versions and equivalents)
 *
 * The function sets a "versions" property on the spec: the list, without
 * duplicates, of the spec's URL, of the URL of its release and nightly
 * versions when they are defined, and of the equivalent URLs listed for the
 * spec's URL in spec-equivalents.json. Missing URLs are skipped.
 *
 * TODO: The list used to contain published versions of TR specs retrieved from
 * the W3C API. They are useful to improve the relevance of reported anomalies.
 *
 * @function
 * @param {Object} spec Spec description structure (only the "url", "release"
 *   and "nightly" properties are used)
 * @return {Object} The same structure, enriched with a "versions" array
 */
function completeWithAlternativeUrls(spec) {
  const versions = new Set();
  const addUrl = url => {
    if (url) {
      versions.add(url);
    }
  };

  addUrl(spec.url);
  addUrl(spec.release?.url);
  addUrl(spec.nightly?.url);

  // Equivalents are accepted either as a list of URLs or as a single URL
  // string: spreading a string directly would add its individual characters
  // to the list of versions.
  const equivalents = specEquivalents[spec.url];
  if (equivalents) {
    for (const url of [].concat(equivalents)) {
      addUrl(url);
    }
  }

  spec.versions = [...versions];
  return spec;
}
649
-
650
-
651
/**
 * Returns true when the given spec is the latest "fullest" level of that spec
 * in the given list of specs that passes the given predicate.
 *
 * "Fullest" means "not a delta spec, unless that is the only level that passes
 * the predicate".
 *
 * In practice, the function walks the series from the given spec, following
 * "seriesPrevious" links for a delta spec and "seriesNext" links otherwise,
 * and returns false as soon as it finds a full level that passes the
 * predicate. The walk stops when a level is not in the list.
 *
 * @function
 * @public
 * @param {Object} spec Spec to check
 * @param {Array(Object)} list List of specs (must include the spec to check)
 * @param {function} predicate Predicate function that the spec must pass. Must
 * be a function that takes a spec as argument and returns a boolean. All specs
 * pass when no predicate is given.
 * @return {Boolean} true if the spec is the latest "fullest" level in the list
 * that passes the predicate.
 */
function isLatestLevelThatPasses(spec, list, predicate) {
  predicate = predicate || (_ => true);
  if (!predicate(spec)) {
    return false;
  }

  // A delta spec is superseded by any previous full level that passes the
  // predicate, a full spec by any next full level that passes the predicate
  const link = (spec.seriesComposition === 'delta') ? 'seriesPrevious' : 'seriesNext';

  let current = spec;
  while (current[link]) {
    const shortname = current[link];
    current = list.find(s => s.shortname === shortname);
    if (!current) {
      // The level is not in the list, end of the walk. The check must come
      // right after the lookup, otherwise the property access below throws.
      break;
    }
    if ((current.seriesComposition === 'full') && predicate(current)) {
      return false;
    }
  }
  return true;
}
695
-
696
-
697
/**
 * Takes the results of a crawl (typically the contents of the index.json file)
 * and expands it to include the contents of all referenced files.
 *
 * A property of a spec is considered to reference a file when its value is a
 * string of the form "folder/file.json" or "folder/file.idl".
 *
 * The function handles both files and HTTPS resources, using either filesystem
 * functions (for files) or fetch (for HTTPS resources).
 *
 * Note the crawl object is expanded in place.
 *
 * @function
 * @public
 * @param {Object} crawl Crawl index object that needs to be expanded
 * @param {string} baseFolder The base folder that contains the crawl file, or
 * the base HTTPS URI to resolve relative links in the crawl object.
 * @param {Array(string)} properties An explicit list of properties to expand
 * (no value means "expand all possible properties")
 * @return {Promise(object)} The promise to get an expanded crawl object that
 * contains the entire crawl report (and no longer references external files)
 */
async function expandCrawlResult(crawl, baseFolder, properties) {
  baseFolder = baseFolder || '';

  async function expandSpec(spec) {
    await Promise.all(Object.keys(spec).map(async property => {
      // Only consider properties explicitly requested
      if (properties && !properties.includes(property)) {
        return;
      }

      // Only consider properties that link to an extract, i.e. an IDL
      // or JSON file in subfolder.
      if (!spec[property] ||
          (typeof spec[property] !== 'string') ||
          !spec[property].match(/^[^\/]+\/[^\/]+\.(json|idl)$/)) {
        return;
      }

      let contents = null;
      if (baseFolder.startsWith('https:')) {
        const url = (new URL(spec[property], baseFolder)).toString();
        const response = await fetch(url, { nolog: true });
        contents = await response.text();
      }
      else {
        const filename = path.join(baseFolder, spec[property]);
        contents = await fs.readFile(filename, 'utf8');
      }

      // Force UNIX-style line endings
      // (Git may auto-convert LF to CRLF on Windows machines and we
      // want to store multiline IDL fragments as values of properties
      // in parsed IDL trees)
      contents = contents.replace(/\r\n/g, '\n');

      if (spec[property].endsWith('.json')) {
        contents = JSON.parse(contents);
      }
      if (property === 'css') {
        // Special case for CSS where the "css" level does not exist
        // in the generated files
        const css = Object.assign({}, contents);
        delete css.spec;
        spec[property] = css;
      }
      else if (property === 'idl') {
        // Special case for raw IDL extracts, which are text extracts.
        // Also drop header that may have been added when extract was
        // serialized. The header ends with the first blank line. If there
        // is no blank line, contents are kept as-is (blindly skipping two
        // characters from index -1 would drop the first character).
        if (contents.startsWith('// GENERATED CONTENT - DO NOT EDIT')) {
          const endOfHeader = contents.indexOf('\n\n');
          if (endOfHeader !== -1) {
            contents = contents.substring(endOfHeader + 2);
          }
        }
        spec.idl = contents;
      }
      else {
        // Other extracts wrap the actual contents in a property that has
        // the same name
        spec[property] = contents[property];
      }
    }));
    return spec;
  }

  crawl.results = await Promise.all(crawl.results.map(expandSpec));
  return crawl;
}
780
-
781
-
782
/**
 * Retrieves the list of IDL attribute names that the CSS property generates
 * per the CSSOM spec, see:
 * https://drafts.csswg.org/cssom/#ref-for-css-property-to-idl-attribute
 *
 * @function
 * @param {String} property CSS property name
 * @return {Array(String)} An array of IDL attribute names, dashed attribute
 * first, then camel-cased attribute if different, then webkit-cased attribute
 * name if needed
 */
function getGeneratedIDLNamesByCSSProperty(property) {
  // Converts a CSS property to an IDL attribute name per the CSSOM spec:
  // https://drafts.csswg.org/cssom/#css-property-to-idl-attribute
  // Dashes are dropped and the character that follows a dash gets uppercased.
  // The "lowercaseFirst" flag drops the first character of the property name
  // before conversion (a leading dash in the case of "-webkit-" properties, so
  // that the resulting name starts with a lowercase letter).
  function cssPropertyToIDLAttribute(name, lowercaseFirst) {
    const input = lowercaseFirst ? name.slice(1) : name;
    let output = '';
    let uppercaseNext = false;
    for (const c of input) {
      if (c === '-') {
        uppercaseNext = true;
      }
      else if (uppercaseNext) {
        uppercaseNext = false;
        output += c.toUpperCase();
      }
      else {
        output += c;
      }
    }
    return output;
  }

  // Start with dashed attribute
  const res = [property];

  // Add camel-cased attribute if different
  const camelCased = cssPropertyToIDLAttribute(property, false);
  if (camelCased !== property) {
    res.push(camelCased);
  }

  // Add webkit-cased attribute if needed
  if (property.startsWith('-webkit-')) {
    res.push(cssPropertyToIDLAttribute(property, true));
  }

  return res;
}
831
-
832
-
833
/**
 * Makes sure that the given folder exists, creating it when it is missing.
 *
 * A folder that already exists is not an error. Any other failure reported
 * by the file system is propagated to the caller.
 *
 * @function
 * @public
 * @param {String} folder Path to folder to create
 *   (from current working directory)
 */
async function createFolderIfNeeded(folder) {
    await fs.mkdir(folder).catch(err => {
        const alreadyThere = (err.code === 'EEXIST');
        if (!alreadyThere) {
            throw err;
        }
    });
}
851
-
852
-
853
- module.exports = {
854
- fetch,
855
- requireFromWorkingDirectory,
856
- expandBrowserModules,
857
- setupBrowser,
858
- teardownBrowser,
859
- processSpecification,
860
- completeWithAlternativeUrls,
861
- isLatestLevelThatPasses,
862
- expandCrawlResult,
863
- getGeneratedIDLNamesByCSSProperty,
864
- createFolderIfNeeded
865
- };
1
+ /**
2
+ * A bunch of utility functions common to multiple scripts
3
+ */
4
+
5
+ const fs = require('fs').promises;
6
+ const { existsSync } = require('fs');
7
+ const path = require('path');
8
+ const puppeteer = require('puppeteer');
9
+ const crypto = require('crypto');
10
+ const { AbortController } = require('abortcontroller-polyfill/dist/cjs-ponyfill');
11
+ const fetch = require('./fetch');
12
+ const specEquivalents = require('../specs/spec-equivalents.json');
13
+
14
+ const reffyModules = require('../browserlib/reffy.json');
15
+
16
+ /**
17
+ * Maximum depth difference supported between Reffy's install path and custom
18
+ * modules that may be provided on the command-line
19
+ *
20
+ * TODO: Find a way to get rid of that, there should be no limit
21
+ */
22
+ const maxPathDepth = 20;
23
+
24
+ let prefetchedResponses = {};
25
+
26
/**
 * Builds the array of integers [0, 1, ..., n - 1]
 */
const range = n => [...Array(n).keys()];


/**
 * Creates an accessor function that reads the given property from the
 * object it receives (handy as a callback for "map")
 */
const prop = p => (x => x[p]);
36
+
37
+
38
/**
 * Loads a file through "require", resolving its path against the current
 * working directory (CWD) rather than against the folder of this JS file.
 *
 * This is what makes it possible to "require" JSON config files given as
 * command-line arguments.
 *
 * @function
 * @param {String} filename The path to the file to require
 * @return {Object} The result of requiring the file relative to the current
 *   working directory, or null when the file could not be loaded.
 */
function requireFromWorkingDirectory(filename) {
    let loaded = null;
    try {
        const absolutePath = path.resolve(filename);
        loaded = require(absolutePath);
    }
    catch (err) {
        // Loading failed for some reason, callers get null
        loaded = null;
    }
    return loaded;
}
59
+
60
+
61
/**
 * Locate the folder to use to resolve relative links to "node_modules"
 * in the ES6 browser lib modules.
 *
 * The function returns the "node_modules" folder at the root of Reffy when
 * that folder exists, and the parent folder of Reffy's root folder otherwise.
 *
 * @function
 * @return {String} Path to the node_modules folder.
 */
function getModulesFolder() {
    const rootFolder = path.resolve(__dirname, '../..');
    const localModules = path.resolve(rootFolder, 'node_modules');
    return existsSync(localModules) ?
        localModules :
        path.resolve(rootFolder, '..');
}
78
+ const modulesFolder = getModulesFolder();
79
+
80
+
81
+ /**
82
+ * Puppeteer browser instance used to load and process specifications
83
+ */
84
+ let browser = null;
85
+
86
+ /**
87
+ * Promise resolved when there is no running instance of Puppeteer. This allows
88
+ * to serialize calls to setupBrowser (and thus to crawlList and crawlSpecs in
89
+ * specs-crawler.js)
90
+ */
91
+ let browserClosed = Promise.resolve();
92
+ let resolveBrowserClosed = null;
93
+
94
+ /**
95
+ * The browser JS library that will be loaded onto every crawled page
96
+ */
97
+ let browserlib = null;
98
+
99
+
100
/**
 * Turn a list of browser modules into a list of fully described modules.
 *
 * Each entry in the list may be:
 * - the "core" keyword, which expands to all modules listed in browserlib
 * - a name which must match one of the existing modules in browserlib
 * - a relative path to an .mjs file
 * - an object with an "href" property that is a relative path to an .mjs file
 *   (the object is completed in place)
 * - an object already flagged as "expanded", returned untouched
 *
 * Relative paths provided by the user are interpreted as relative to the
 * current working directory, and converted to be relative to the browserlib
 * directory. When no list is given, all browserlib modules are returned.
 *
 * @function
 * @public
 * @return {Array(Object)} List of modules with an href, name and property keys
 */
function expandBrowserModules(modules) {
    // Derives a camelCase name from a module path. The ".mjs" extension is
    // dropped, the rest is split on dashes and non-word characters are removed
    // from each part. An empty part ends the name.
    const toCamelCase = href => {
        const parts = href.replace(/([^\/\\]+)\.mjs$/, '$1').split('-');
        let name;
        for (const rawPart of parts) {
            if (!rawPart) {
                break;
            }
            const part = rawPart.replace(/\W/g, '');
            name = name ?
                name + part.substring(0, 1).toUpperCase() + part.substring(1) :
                part;
        }
        return name;
    };

    // Completes the description of a module listed in browserlib
    // (properties of the module take precedence over computed ones)
    const describeCoreModule = mod => Object.assign({
        name: toCamelCase(mod.href),
        expanded: true
    }, mod);

    // Converts a path relative to the current working directory into a
    // "/"-separated path relative to the browserlib folder
    const browserlibPath = path.resolve(__dirname, '..', 'browserlib');
    const toBrowserlibHref = href => path
        .relative(browserlibPath, path.join(process.cwd(), href))
        .replace(/\\/g, '/');

    if (!modules) {
        return reffyModules.map(mod => describeCoreModule(mod));
    }

    return modules.flatMap(mod => {
        if (typeof mod !== 'string') {
            if (mod.expanded) {
                return mod;
            }
            if (!mod.href) {
                throw new Error('Browserlib module does not have an "href" property');
            }
            mod.href = toBrowserlibHref(mod.href);
            if (!mod.name) {
                mod.name = toCamelCase(mod.href);
            }
            if (!mod.property) {
                mod.property = mod.name;
            }
            mod.expanded = true;
            return mod;
        }

        if (mod.endsWith('.mjs')) {
            const name = toCamelCase(mod);
            return {
                href: toBrowserlibHref(mod),
                name,
                property: name,
                expanded: true
            };
        }

        if (mod === 'core') {
            return reffyModules.map(coreMod => describeCoreModule(coreMod));
        }

        const known = reffyModules.find(m => m.href === mod ||
            toCamelCase(m.href) === mod || m.property === mod);
        if (!known) {
            throw new Error(`Unknown browserlib module ${mod}`);
        }
        return describeCoreModule(known);
    });
}
194
+
195
+
196
/**
 * Generate the browserlib script that gets loaded in every crawled page.
 *
 * The script is an ES6 module that imports the requested modules and attaches
 * them to a global "window.reffy" namespace. The "mapIdsToHeadings" function
 * is added to the namespace as well when at least one of the modules is
 * flagged as needing it.
 *
 * The function must be called before any attempt to call `processSpecification`
 * and should only be called once. The `setupBrowser` function takes care of it.
 *
 * @function
 * @private
 */
function setupBrowserlib(modules) {
    const expanded = expandBrowserModules(modules);
    let script = 'window.reffy = window.reffy ?? {};\n';

    if (expanded.some(module => module.needsIdToHeadingMap)) {
        script += `
import mapIdsToHeadings from './map-ids-to-headings.mjs';
window.reffy.mapIdsToHeadings = mapIdsToHeadings;\n`;
    }

    const moduleImports = expanded.map(module => `
import ${module.name} from '${module.href}';
window.reffy.${module.name} = ${module.name};
`);
    script += moduleImports.join('\n');

    browserlib = script;
}
222
+
223
+
224
/**
 * Setup and launch browser instance to use to load and process specifications.
 *
 * The function must be called before any attempt to call `processSpecification`
 * and should only be called once per crawl. Calls are serialized: a call waits
 * until the browser instance set up by a previous call has been torn down.
 *
 * The function also generates the code that will inject the `reffy` namespace
 * in each processed page.
 *
 * If the browser cannot be launched, or if the list of modules is invalid, the
 * browser instance is closed and the function releases its hold on the browser
 * before it rethrows the error, so that subsequent calls do not wait forever.
 *
 * Note: Switch `headless` to `false` to access dev tools and debug processing
 *
 * @function
 * @public
 * @param {Array} modules List of browserlib modules to expose in crawled
 *   pages (see expandBrowserModules)
 */
async function setupBrowser(modules) {
    // There can be only one crawl running at a time. Several callers may be
    // waiting on the same promise: only the first one to resume may proceed,
    // the others need to wait on the promise that this first caller creates.
    do {
        await browserClosed;
    } while (resolveBrowserClosed);
    browserClosed = new Promise(resolve => resolveBrowserClosed = resolve);

    try {
        // Create browser instance
        // Note: switch "headless" to "false" (and comment out the call to
        // "browser.close()") to access dev tools in debug mode
        browser = await puppeteer.launch({ headless: true });
        setupBrowserlib(modules);
    }
    catch (err) {
        // Setup failed, nobody is going to call teardownBrowser, so close
        // the browser if it was launched and let pending callers proceed
        const launched = browser;
        browser = null;
        try {
            if (launched) {
                await launched.close();
            }
        }
        catch (closeErr) {
            // The initial error is the one worth reporting
        }
        finally {
            resolveBrowserClosed();
            resolveBrowserClosed = null;
        }
        throw err;
    }
}
249
+
250
+
251
/**
 * Close and destroy browser instance.
 *
 * The function should be called once at the end of the processing. It does
 * nothing when there is no running browser instance. Pending calls to
 * `setupBrowser` are allowed to proceed once the browser is closed, even if
 * closing the browser failed (the error is still reported to the caller).
 *
 * @function
 * @public
 */
async function teardownBrowser() {
    if (!browser) {
        return;
    }

    // Reset the shared reference right away so that a concurrent call to
    // teardownBrowser does not try to close the same instance twice
    const instance = browser;
    browser = null;
    try {
        await instance.close();
    }
    finally {
        if (resolveBrowserClosed) {
            resolveBrowserClosed();
            resolveBrowserClosed = null;
        }
    }
}
267
+
268
+
269
+ /**
270
+ * Load and process the given specification.
271
+ *
272
+ * The method automatically exposes Reffy's library functions in a window.reffy
273
+ * namespace (see setupBrowserlib) so that the callback function can
274
+ * call them directly. Additional callback arguments that would need to be
275
+ * passed to the browser context can be provided through the "args" parameter.
276
+ *
277
+ * A crawl will typically fetch and render hundreds of specs, triggering a lot
278
+ * of network requests. Given that some of these requests (e.g. those on images)
279
+ * are of no interest for the processing, that it is wasteful to fetch the same
280
+ * resource again and again during a crawl, and that it is useful to have an
281
+ * offline mode for debugging purpose, the method will intercept network
282
+ * requests made by the browser, fail those that don't seem needed, and serve
283
+ * requests on resources that have already been fetched from a local file cache
284
+ * (the "cacheRefresh" setting in "config.json" allows to adjust this behavior).
285
+ *
286
+ * This triggers a few hiccups and needs for workarounds though:
287
+ * - Puppeteer's page.setRequestInterception does not play nicely with workers
288
+ * (which Respec typically uses) for the time being, so code uses the Chrome
289
+ * DevTools Protocol (CDP) directly, see:
290
+ * https://github.com/puppeteer/puppeteer/issues/4208
291
+ * - Tampering with network requests means that the loaded page gets
292
+ * automatically flagged as "non secure". That's mostly fine but means that
293
+ * "window.crypto.subtle" is not available and Respec needs that to generate
294
+ * hashes. The code re-creates that method manually.
295
+ * - A few specs send HTTP requests that return "streams". This does not work
296
+ * well with Puppeteer's "networkidle0" option (to detect when a spec is mostly
297
+ * done loading), and that does not work with a file cache approach either.
298
+ * These requests get intercepted.
299
+ *
300
+ * A couple of additional notes:
301
+ * - Requests to CSS stylesheets are not intercepted because Respec dynamically
302
+ * loads a few CSS resources, and intercepting them could perhaps impact the
303
+ * rest of the generation.
304
+ * - SVG images are not intercepted because a couple of specs have a PNG
305
+ * fallback mechanism that, when interception is on, make the browser spin
306
+ * forever, see discussion in: https://github.com/w3c/accelerometer/pull/55
307
+ *
308
+ * Strictly speaking, intercepting request is only needed to be able to use the
309
+ * "networkidle0" option. The whole interception logic could be dropped (and
310
+ * "networkidle2" could be used instead) if it proves too unstable.
311
+ *
312
+ * @function
313
+ * @public
314
+ * @param {Object|String} spec The spec to load. Must either be a URL string or
315
+ * an object with a "url" property. If the object contains an "html" property,
316
+ * the HTML content is loaded instead.
317
+ * @param {function} processFunction Processing function that will be evaluated
318
+ * in the browser context where the spec gets loaded
319
+ * @param {Array} args List of arguments to pass to the processing function.
320
+ * These arguments typically make it possible to pass contextual information
321
+ * to the processing function (such as the spec object that describes the
322
+ * spec being processed, or the list of processing modules to run)
323
+ * @param {Object} options Processing options. The "quiet" flag tells the
324
+ * function not to report warnings to the console. The "forceLocalFetch"
325
+ * flag tells the function that all network requests need to be only handled
326
+ * by Node.js's "fetch" function (as opposed to falling back to Puppeteer's
327
+ * network and caching logic), which is useful to keep full control of network
328
+ * requests in tests. The "etag" and "lastModified" options give input
329
+ * to the conditional fetch request sent for the primary crawled URL
330
+ * @return {Promise} The promise to get the results of the processing function
331
+ */
332
async function processSpecification(spec, processFunction, args, options) {
    spec = (typeof spec === 'string') ? { url: spec } : spec;
    processFunction = processFunction || function () {};
    args = args || [];
    options = options || {};

    if (!browser) {
        throw new Error('Browser instance not initialized, setupBrowser() must be called before processSpecification().');
    }

    // Report a warning to the console, unless the "quiet" flag is set.
    // (An explicit `quiet: false` must still report warnings.)
    function warn(...params) {
        if (!options.quiet) {
            console.warn(...params);
        }
    }

    // Create an abort controller for network requests directly handled by the
    // Node.js code (and not by Puppeteer)
    const abortController = new AbortController();

    // Inner function that returns a network interception method suitable for
    // a given CDP session.
    function interceptRequest(cdp, controller) {
        return async function ({ requestId, request }) {
            try {
                // Abort network requests to common image formats
                if (/\.(gif|ico|jpg|jpeg|png|ttf|woff)$/i.test(request.url)) {
                    await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
                    return;
                }

                // Abort network requests that return a "stream", they won't
                // play well with Puppeteer's "networkidle0" option, and our
                // custom "fetch" function does not handle streams in any case
                if (request.url.startsWith('https://drafts.csswg.org/api/drafts/') ||
                    request.url.startsWith('https://drafts.css-houdini.org/api/drafts/') ||
                    request.url.startsWith('https://drafts.fxtf.org/api/drafts/') ||
                    request.url.startsWith('https://api.csswg.org/shepherd/')) {
                    await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
                    return;
                }

                // The request needs to be intercepted, either because it
                // targets one of the local script files, or because we would
                // like to use our local cache to avoid sending network requests
                // when possible.
                const reffyPath = '/reffy/scripts/';
                const webidl2Path = '/node_modules/webidl2/';
                if (request.url.includes(reffyPath) || request.url.includes(webidl2Path)) {
                    let body;
                    if (request.url.endsWith('reffy.mjs')) {
                        body = Buffer.from(browserlib);
                    }
                    else if (request.url.includes(webidl2Path)) {
                        const file = path.resolve(modulesFolder, 'webidl2',
                            request.url.substring(request.url.indexOf(webidl2Path) + webidl2Path.length));
                        body = await fs.readFile(file);
                    }
                    else {
                        // The "__" folders are just a means to resolve
                        // relative paths that are higher than the "browserlib"
                        // folder on the storage drive
                        const requestPath = request.url.substring(request.url.indexOf(reffyPath) + reffyPath.length);
                        let depth = requestPath.lastIndexOf('__/') / 3;
                        const filename = requestPath.substring(requestPath.lastIndexOf('__/') + 3);
                        let filePath = path.resolve(__dirname, '..', 'browserlib');
                        while (depth < maxPathDepth - 1) {
                            filePath = path.resolve(filePath, '..');
                            depth += 1;
                        }
                        const file = path.resolve(filePath, filename);
                        body = await fs.readFile(file);
                    }
                    await cdp.send('Fetch.fulfillRequest', {
                        requestId,
                        responseCode: 200,
                        responseHeaders: [{ name: 'Content-Type', value: 'application/javascript' }],
                        body: body.toString('base64')
                    });
                }
                else {
                    if ((request.method !== 'GET') ||
                        (!request.url.startsWith('http:') && !request.url.startsWith('https:'))) {
                        await cdp.send('Fetch.continueRequest', { requestId });
                        return;
                    }

                    // Use the response that was prefetched for the spec URL if
                    // there is one. The body of a response can only be read
                    // once, so the prefetched response must not be served
                    // again (and needs to be dropped not to leak memory).
                    let response = prefetchedResponses[request.url];
                    if (response) {
                        delete prefetchedResponses[request.url];
                    }
                    else {
                        response = await fetch(request.url, { signal: controller.signal, headers: request.headers });
                    }

                    const body = await response.buffer();
                    const rawHeaders = response.headers.raw();

                    await cdp.send('Fetch.fulfillRequest', {
                        requestId,
                        responseCode: response.status,
                        responseHeaders: Object.keys(rawHeaders).map(header => {
                            return {
                                name: header,
                                value: rawHeaders[header].join(',')
                            };
                        }),
                        body: body.toString('base64')
                    });
                }
            }
            catch (err) {
                if (controller.signal.aborted) {
                    // All is normal, processing was over, page and CDP session
                    // have been closed, and network requests have been aborted
                    return;
                }

                // Fetch from file cache failed somehow
                // Let Puppeteer handle the request as fallback unless
                // calling function asked us not to do that
                if (options.forceLocalFetch) {
                    warn(`[warn] Network request for ${request.url} failed`, err);
                    await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
                }
                else {
                    try {
                        warn(`[warn] Fall back to regular network request for ${request.url}`, err);
                        await cdp.send('Fetch.continueRequest', { requestId });
                    }
                    catch (fallbackErr) {
                        if (!controller.signal.aborted) {
                            warn(`[warn] Fall back to regular network request for ${request.url} failed`, fallbackErr);
                        }
                    }
                }
            }
        };
    }

    // Resources that need to be released once processing is over, no matter
    // whether processing succeeds or fails
    let page = null;
    let cdp = null;
    let prefetched = null;

    try {
        // Fetch the spec URL up front unless it is a local file. This makes it
        // possible to skip loading the spec in the browser when the spec has
        // not been modified since it was last crawled.
        if (!spec.url.startsWith('file://')) {
            let response;
            // We set a conditional request header
            // Use If-Modified-Since in preference as it is in practice
            // more reliable for conditional requests
            // (note "browser.userAgent()" returns a promise)
            const headers = {
                'Accept-Encoding': 'gzip, deflate, br',
                'Upgrade-Insecure-Requests': 1,
                'User-Agent': await browser.userAgent()
            };
            if (options.lastModified) {
                headers["If-Modified-Since"] = options.lastModified;
            } else if (options.etag) {
                headers["If-None-Match"] = options.etag;
            }
            try {
                response = await fetch(spec.url, {headers});
                if (response.status === 304) {
                    return {status: "notmodified"};
                }
                prefetchedResponses[spec.url] = response;
                prefetched = response;
            } catch (err) {
                throw new Error(`Loading ${spec.url} triggered network error ${err}`);
            }
            if (response.status !== 200) {
                throw new Error(`Loading ${spec.url} triggered HTTP status ${response.status}`);
            }
        }
        page = await browser.newPage();

        // Disable cache if caller wants to handle all network requests
        await page.setCacheEnabled(!options.forceLocalFetch);

        // Intercept all network requests to use our own version of "fetch"
        // that makes use of the local file cache.
        cdp = await page.target().createCDPSession();
        await cdp.send('Fetch.enable');
        cdp.on('Fetch.requestPaused', interceptRequest(cdp, abortController));

        // Quick and dirty workaround to re-create the "window.crypto.digest"
        // function that Respec needs (context is seen as unsecure because we're
        // tampering with network requests)
        await page.exposeFunction('hashdigest', (algorithm, buffer) => {
            return crypto.createHash(algorithm).update(Buffer.from(Object.values(buffer))).digest();
        });
        await page.evaluateOnNewDocument(() => {
            window.crypto.subtle = {
                digest: function (algorithm, buffer) {
                    const res = window.hashdigest('sha1', buffer);
                    return res.then(buf => {
                        return Uint8Array.from(buf.data);
                    });
                }
            };
        });

        // Common loading option to give the browser enough time to load large
        // specs, and to consider navigation done when there haven't been
        // network connections in the past 500ms. This should be enough to
        // handle "redirection" through JS or meta refresh (which would not
        // have time to run if we used "load").
        const loadOptions = {
            timeout: 120000,
            waitUntil: 'networkidle0'
        };

        // Load the page
        // (note HTTP status is 0 when `file://` URLs are loaded)
        let cacheInfo;
        if (spec.html) {
            await page.setContent(spec.html, loadOptions);
        }
        else {
            let result;
            try {
                result = await page.goto(spec.url, loadOptions);
            } catch (err) {
                throw new Error(`Loading ${spec.url} triggered network error ${err}`);
            }
            if ((result.status() !== 200) && (!spec.url.startsWith('file://') || (result.status() !== 0))) {
                throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
            }
            const responseHeaders = result.headers();
            // Use Last-Modified in preference as it is in practice
            // more reliable for conditional requests
            if (responseHeaders['last-modified']) {
                cacheInfo = {lastModified: responseHeaders['last-modified']};
            } else if (responseHeaders.etag) {
                cacheInfo = {etag: responseHeaders.etag};
            }
        }

        // Handle multi-page specs
        const pageUrls = spec.pages || [];

        if (pageUrls.length > 0) {
            const pages = [];
            for (const url of pageUrls) {
                const subAbort = new AbortController();
                const subPage = await browser.newPage();
                await subPage.setCacheEnabled(!options.forceLocalFetch);
                const subCdp = await subPage.target().createCDPSession();
                await subCdp.send('Fetch.enable');
                subCdp.on('Fetch.requestPaused', interceptRequest(subCdp, subAbort));
                try {
                    // (Note HTTP status is 0 when `file://` URLs are loaded)
                    const subresult = await subPage.goto(url, loadOptions);
                    if ((subresult.status() !== 200) && (!url.startsWith('file://') || (subresult.status() !== 0))) {
                        // Report the URL and status of the sub-page itself
                        throw new Error(`Loading ${url} triggered HTTP status ${subresult.status()}`);
                    }
                    const html = await subPage.evaluate(() => {
                        return document.body.outerHTML
                            .replace(/<body/, '<section')
                            .replace(/<\/body/, '</section');
                    });
                    pages.push({ url, html });
                }
                finally {
                    subAbort.abort();
                    await subCdp.detach();
                    await subPage.close();
                }
            }
            await page.evaluate(pages => {
                for (const subPage of pages) {
                    const section = document.createElement('section');
                    section.setAttribute('data-reffy-page', subPage.url);
                    section.innerHTML = subPage.html;
                    document.body.appendChild(section);
                }
            }, pages);
        }

        // Wait until the generation of the spec is completely over
        await page.evaluate(async () => {
            // Detect draft CSS server hiccups as done in browser-specs:
            // https://github.com/w3c/browser-specs/blob/b31fc0b03ba67a19162883afc30e01fcec3c600d/src/fetch-info.js#L292
            const title = (window.document.querySelector('h1')?.textContent || '')
                .replace(/\n/g, '').trim();
            if (title.startsWith('Index of ')) {
                throw new Error(`CSS server issue detected`);
            }

            const usesRespec = (window.respecConfig || window.eval('typeof respecConfig !== "undefined"')) &&
                window.document.head.querySelector("script[src*='respec']");

            function sleep(ms) {
                return new Promise(resolve => setTimeout(resolve, ms));
            }

            async function isReady(counter) {
                counter = counter || 0;
                if (counter > 60) {
                    throw new Error('Respec generation took too long');
                }
                if (window.document.respec?.ready) {
                    await window.document.respec.ready;
                }
                else if (usesRespec) {
                    await sleep(1000);
                    await isReady(counter + 1);
                }
            }

            await isReady();
        });

        // Capture and report Reffy's browserlib warnings
        page.on('console', msg => {
            const text = msg.text();
            if (text.startsWith('[reffy] ')) {
                warn(spec.url, `[${msg.type()}]`, text.slice('[reffy] '.length));
            }
        });

        // Capture and report when page throws an error
        page.on('pageerror', err => {
            warn(err);
        });

        // Expose additional functions defined in src/browserlib/ to the
        // browser context, under a window.reffy namespace, so that processing
        // script may call them. The script is an ES6 module and needs to be
        // loaded as such.
        // Note that we're using a fake relative URL on purpose. In practice,
        // the request will be processed by "interceptRequest", which will
        // respond with the contents of the script file. Also, there are
        // multiple path levels in that fake URL on purpose as well, because
        // scripts import the WebIDL2.js library with a URL like
        // "../../node_modules/[...]" and may import other scripts that are
        // higher in the folder tree.
        await page.addScriptTag({
            url: `reffy/scripts/${range(maxPathDepth).map(n => '__').join('/')}/reffy.mjs`,
            type: 'module'
        });

        // Run the processFunction method in the browser context
        // (the function may not return anything, in which case there is
        // nothing to attach cache info to)
        const results = await page.evaluate(processFunction, ...args);
        if (results && (typeof results === 'object')) {
            results.crawlCacheInfo = cacheInfo;
        }

        // Pending network requests may still be in the queue, flag the page
        // as closed not to send commands on a CDP session that's no longer
        // attached to anything
        abortController.abort();

        // Close CDP session and page
        await cdp.detach();
        cdp = null;
        await page.close();
        page = null;

        return results;
    }
    finally {
        // Signal abortion again (in case an exception was thrown)
        abortController.abort();

        // Drop the prefetched response if no request consumed it
        if (prefetched && (prefetchedResponses[spec.url] === prefetched)) {
            delete prefetchedResponses[spec.url];
        }

        // Processing failed before the CDP session and page could be closed,
        // close them not to accumulate open pages during a crawl. This is done
        // on a best-effort basis, the error worth reporting is the initial one.
        if (cdp) {
            try {
                await cdp.detach();
            }
            catch (err) {
                // Session is already gone
            }
        }
        if (page) {
            try {
                await page.close();
            }
            catch (err) {
                // Page is already gone
            }
        }
    }
}
678
+
679
+
680
/**
 * Complete the spec description with the list of alternative URLs that
 * identify the spec (versions and equivalents).
 *
 * The list contains the URL of the spec, the URLs of its release and nightly
 * versions when they are defined, and the equivalent URLs registered for the
 * spec URL, without duplicates.
 *
 * TODO: The list used to contain published versions of TR specs retrieved from
 * the W3C API. They are useful to improve the relevance of reported anomalies.
 *
 * @function
 * @param {Object} spec Spec description structure
 * @return {Object} The same structure, with a "versions" property set to the
 *   array of alternative URLs
 */
function completeWithAlternativeUrls(spec) {
    const urls = new Set([spec.url]);
    if (spec.release) {
        urls.add(spec.release.url);
    }
    if (spec.nightly) {
        urls.add(spec.nightly.url);
    }

    const equivalents = specEquivalents[spec.url];
    if (equivalents) {
        for (const equivalent of equivalents) {
            urls.add(equivalent);
        }
    }

    spec.versions = Array.from(urls);
    return spec;
}
709
+
710
+
711
/**
 * Returns true when the given spec is the latest "fullest" level of that spec
 * in the given list of specs that passes the given predicate.
 *
 * "Fullest" means "not a delta spec, unless that is the only level that passes
 * the predicate".
 *
 * Levels are followed through the "seriesPrevious" and "seriesNext" properties.
 * The search stops when a level cannot be found in the list.
 *
 * @function
 * @public
 * @param {Object} spec Spec to check
 * @param {Array(Object)} list List of specs (must include the spec to check)
 * @param {function} predicate Predicate function that the spec must pass. Must
 *   be a function that takes a spec as argument and returns a boolean.
 *   Defaults to a function that always returns true.
 * @return {Boolean} true if the spec is the latest "fullest" level in the list
 *   that passes the predicate.
 */
function isLatestLevelThatPasses(spec, list, predicate) {
    predicate = predicate || (_ => true);
    if (!predicate(spec)) {
        return false;
    }

    // A delta spec is followed backwards, a full spec forwards. In both
    // cases, the spec loses as soon as a full level passes the predicate.
    const linkProperty = (spec.seriesComposition === 'delta') ?
        'seriesPrevious' : 'seriesNext';

    let current = spec;
    while (current[linkProperty]) {
        const shortname = current[linkProperty];
        current = list.find(s => s.shortname === shortname);
        if (!current) {
            // Level is not in the list, no way to go any further
            break;
        }
        if ((current.seriesComposition === 'full') && predicate(current)) {
            return false;
        }
    }
    return true;
}
755
+
756
+
757
/**
 * Expands the crawl result of a given spec: properties that link to an
 * extract file get replaced by the contents of that file.
 *
 * Extracts are read from the file system, or fetched when the base folder is
 * an HTTPS URI. A property is seen as a link to an extract when its value is
 * a string of the form "folder/file.json" or "folder/file.idl".
 *
 * Note the spec object is expanded in place.
 *
 * @function
 * @public
 * @param {Object} spec Spec crawl result that needs to be expanded
 * @param {string} baseFolder The base folder that contains the crawl file, or
 *   the base HTTPS URI to resolve relative links in the crawl object.
 * @param {Array(string)} properties An explicit list of properties to expand
 *   (no value means "expand all possible properties")
 * @return {Promise(object)} The promise to get an expanded crawl object that
 *   contains the contents of referenced files and no longer references external
 *   files (for the requested properties)
 */
async function expandSpecResult(spec, baseFolder, properties) {
    baseFolder = baseFolder || '';

    // Tells whether the property was requested and links to an extract,
    // i.e. an IDL or JSON file in a subfolder
    const linksToExtract = property => {
        if (properties && !properties.includes(property)) {
            return false;
        }
        const value = spec[property];
        return !!value &&
            (typeof value === 'string') &&
            /^[^\/]+\/[^\/]+\.(json|idl)$/.test(value);
    };

    // Retrieves the text of an extract, from the network or the file system
    const loadExtract = async link => {
        if (baseFolder.startsWith('https:')) {
            const url = (new URL(link, baseFolder)).toString();
            const response = await fetch(url, { nolog: true });
            return response.text();
        }
        return fs.readFile(path.join(baseFolder, link), 'utf8');
    };

    const expansions = Object.keys(spec)
        .filter(linksToExtract)
        .map(async property => {
            const link = spec[property];
            let contents = await loadExtract(link);
            if (link.endsWith('.json')) {
                contents = JSON.parse(contents);
            }

            switch (property) {
            case 'css': {
                // Special case for CSS where the "css" level does not exist
                // in the generated files
                const css = Object.assign({}, contents);
                delete css.spec;
                spec[property] = css;
                break;
            }
            case 'idl':
                // Special case for raw IDL extracts, which are text extracts.
                // Also drop header that may have been added when extract was
                // serialized.
                if (contents.startsWith('// GENERATED CONTENT - DO NOT EDIT')) {
                    const endOfHeader = contents.indexOf('\n\n');
                    contents = contents.substring(endOfHeader + 2)
                        // remove trailing newline added in saveIdl
                        .slice(0, -1);
                }
                spec.idl = contents;
                break;
            default:
                spec[property] = contents[property];
            }
        });

    await Promise.all(expansions);
    return spec;
}
830
+
831
+
832
/**
 * Expands a whole crawl report (typically the contents of an index.json file)
 * so that each entry in its "results" list embeds the contents of the extract
 * files it references.
 *
 * Extracts are read from the filesystem or retrieved over HTTPS depending on
 * the base folder, as done by expandSpecResult, which does the actual work
 * for each spec.
 *
 * Note the crawl object is expanded in place.
 *
 * @function
 * @public
 * @param {Object} crawl Crawl index object that needs to be expanded
 * @param {string} baseFolder The base folder that contains the crawl file, or
 *   the base HTTPS URI to resolve relative links in the crawl object.
 * @param {Array(string)} properties An explicit list of properties to expand
 *   (no value means "expand all possible properties")
 * @return {Promise(object)} The promise to get an expanded crawl object that
 *   contains the entire crawl report (and no longer references external files)
 */
async function expandCrawlResult(crawl, baseFolder, properties) {
  const folder = baseFolder || '';
  const expandOne = specResult => expandSpecResult(specResult, folder, properties);

  // All specs are expanded in parallel
  const expanded = await Promise.all(crawl.results.map(expandOne));
  crawl.results = expanded;
  return crawl;
}
858
+
859
+
860
/**
 * Retrieves the list of IDL attribute names that the CSS property generates
 * per the CSSOM spec, see:
 * https://drafts.csswg.org/cssom/#ref-for-css-property-to-idl-attribute
 *
 * @function
 * @param {String} property CSS property name
 * @return {Array(String)} An array of IDL attribute names, dashed attribute
 *   first, then camel-cased attribute if different, then webkit-cased attribute
 *   name if needed
 */
function getGeneratedIDLNamesByCSSProperty(property) {
  // Converts a CSS property to an IDL attribute name per the CSSOM spec:
  // https://drafts.csswg.org/cssom/#css-property-to-idl-attribute
  function cssPropertyToIDLAttribute(name, lowercaseFirst) {
    // With the "lowercase first" flag, the first character (the leading dash
    // of a "-webkit-" property) is dropped, so that the letter that follows
    // is not uppercased.
    const input = lowercaseFirst ? name.slice(1) : name;
    let output = '';
    let uppercaseNext = false;
    for (const c of input) {
      if (c === '-') {
        // Dashes are not copied, they uppercase the character that follows
        uppercaseNext = true;
      } else if (uppercaseNext) {
        uppercaseNext = false;
        output += c.toUpperCase();
      } else {
        output += c;
      }
    }
    return output;
  }

  // Start with dashed attribute
  const res = [property];

  // Add camel-cased attribute if different
  const camelCased = cssPropertyToIDLAttribute(property, false);
  if (camelCased !== property) {
    res.push(camelCased);
  }

  // Add webkit-cased attribute if needed
  if (property.startsWith('-webkit-')) {
    res.push(cssPropertyToIDLAttribute(property, true));
  }

  return res;
}
909
+
910
+
911
/**
 * Creates the given folder if it does not exist yet, along with any missing
 * parent folder.
 *
 * Nothing happens if the folder already exists.
 *
 * @function
 * @public
 * @param {String} folder Path to folder to create
 *   (from current working directory)
 * @return {Promise} The promise that the folder exists. The promise is
 *   rejected if the folder cannot be created, including when the path already
 *   exists but is not a folder.
 */
async function createFolderIfNeeded(folder) {
  // With "recursive", mkdir succeeds when the folder already exists, creates
  // missing parent folders, and still fails when the path is a regular file
  // (a case that ignoring EEXIST errors would hide).
  await fs.mkdir(folder, { recursive: true });
}
929
+
930
+
931
// Names exported by this module (CommonJS export list).
module.exports = {
  fetch,
  requireFromWorkingDirectory,
  expandBrowserModules,
  setupBrowser,
  teardownBrowser,
  processSpecification,
  completeWithAlternativeUrls,
  isLatestLevelThatPasses,
  expandCrawlResult,
  expandSpecResult,
  getGeneratedIDLNamesByCSSProperty,
  createFolderIfNeeded
};