reffy 6.1.4 → 6.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/LICENSE +21 -21
  2. package/README.md +158 -158
  3. package/index.js +11 -11
  4. package/package.json +53 -53
  5. package/reffy.js +248 -236
  6. package/src/browserlib/canonicalize-url.mjs +50 -50
  7. package/src/browserlib/create-outline.mjs +352 -352
  8. package/src/browserlib/extract-cssdfn.mjs +319 -319
  9. package/src/browserlib/extract-dfns.mjs +686 -686
  10. package/src/browserlib/extract-elements.mjs +205 -205
  11. package/src/browserlib/extract-headings.mjs +48 -48
  12. package/src/browserlib/extract-ids.mjs +28 -28
  13. package/src/browserlib/extract-links.mjs +28 -28
  14. package/src/browserlib/extract-references.mjs +203 -203
  15. package/src/browserlib/extract-webidl.mjs +134 -134
  16. package/src/browserlib/get-absolute-url.mjs +21 -21
  17. package/src/browserlib/get-generator.mjs +26 -26
  18. package/src/browserlib/get-lastmodified-date.mjs +13 -13
  19. package/src/browserlib/get-title.mjs +11 -11
  20. package/src/browserlib/informative-selector.mjs +16 -16
  21. package/src/browserlib/map-ids-to-headings.mjs +136 -136
  22. package/src/browserlib/reffy.json +53 -53
  23. package/src/cli/check-missing-dfns.js +609 -609
  24. package/src/cli/generate-idlnames.js +430 -430
  25. package/src/cli/generate-idlparsed.js +139 -139
  26. package/src/cli/merge-crawl-results.js +128 -128
  27. package/src/cli/parse-webidl.js +430 -430
  28. package/src/lib/css-grammar-parse-tree.schema.json +109 -109
  29. package/src/lib/css-grammar-parser.js +440 -440
  30. package/src/lib/fetch.js +56 -56
  31. package/src/lib/nock-server.js +127 -112
  32. package/src/lib/specs-crawler.js +622 -552
  33. package/src/lib/util.js +944 -865
  34. package/src/specs/missing-css-rules.json +197 -197
  35. package/src/specs/spec-equivalents.json +149 -149
  36. package/src/browserlib/extract-editors.mjs~ +0 -14
  37. package/src/browserlib/generate-es-dfn-report.sh~ +0 -4
  38. package/src/cli/csstree-grammar-check.js +0 -28
  39. package/src/cli/csstree-grammar-check.js~ +0 -10
  40. package/src/cli/csstree-grammar-parser.js +0 -11
  41. package/src/cli/csstree-grammar-parser.js~ +0 -1
  42. package/src/cli/extract-editors.js~ +0 -38
  43. package/src/cli/process-specs.js~ +0 -28
@@ -1,552 +1,622 @@
1
- #!/usr/bin/env node
2
- /**
3
- * The spec crawler takes a list of spec URLs as input, gathers some knowledge
4
- * about these specs (published versions, URL of the Editor's Draft, etc.),
5
- * fetches these specs, parses them, extracts relevant information that they
6
- * contain (such as the WebIDL they define, the list of specifications that they
7
- * reference, and links to external specs), and produces a crawl report with the
8
- * results of these investigations.
9
- *
10
- * @module crawler
11
- */
12
-
13
- const fs = require('fs');
14
- const path = require('path');
15
- const specs = require('browser-specs');
16
- const cssDfnParser = require('./css-grammar-parser');
17
- const { generateIdlParsed, saveIdlParsed } = require('../cli/generate-idlparsed');
18
- const { generateIdlNames, saveIdlNames } = require('../cli/generate-idlnames');
19
- const {
20
- completeWithAlternativeUrls,
21
- expandBrowserModules,
22
- expandCrawlResult,
23
- getGeneratedIDLNamesByCSSProperty,
24
- isLatestLevelThatPasses,
25
- processSpecification,
26
- setupBrowser,
27
- teardownBrowser,
28
- createFolderIfNeeded
29
- } = require('./util');
30
-
31
-
32
- /**
33
- * Load and parse the given spec.
34
- *
35
- * @function
36
- * @param {Object} spec The spec to load (must already have been completed with
37
- * useful info, as returned by "createInitialSpecDescriptions")
38
- * @param {Object} crawlOptions Crawl options
39
- * @return {Promise<Object>} The promise to get a spec object with crawl info
40
- */
41
- async function crawlSpec(spec, crawlOptions) {
42
- crawlOptions = crawlOptions || {};
43
- spec.crawled = crawlOptions.publishedVersion ?
44
- (spec.release ? spec.release : spec.nightly) :
45
- spec.nightly;
46
-
47
- if (spec.error) {
48
- return spec;
49
- }
50
-
51
- try {
52
- const result = await processSpecification(
53
- spec.crawled,
54
- (spec, modules) => {
55
- const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
56
- window.reffy.mapIdsToHeadings() : null;
57
- const res = {
58
- crawled: window.location.toString()
59
- };
60
- modules.forEach(mod => {
61
- res[mod.property] = window.reffy[mod.name](spec, idToHeading);
62
- });
63
- return res;
64
- },
65
- [spec, crawlOptions.modules],
66
- { quiet: crawlOptions.quiet,
67
- forceLocalFetch: crawlOptions.forceLocalFetch }
68
- );
69
-
70
- // Specific rule for IDL extracts:
71
- // parse the extracted WebIdl content
72
- await generateIdlParsed(result);
73
-
74
- if (result.css) {
75
- // Specific rule for CSS properties:
76
- // Add CSS property definitions that weren't in a table
77
- if (result.dfns) {
78
- result.dfns
79
- .filter(dfn => dfn.type == "property" && !dfn.informative)
80
- .forEach(propDfn => {
81
- propDfn.linkingText.forEach(lt => {
82
- if (!result.css.properties.hasOwnProperty(lt)) {
83
- result.css.properties[lt] = {
84
- name: lt
85
- };
86
- }
87
- });
88
- });
89
- }
90
-
91
- // Specific rule for CSS properties:
92
- // Ideally, the sample definition (property-name) in CSS2 and the custom
93
- // property definition (--*) in CSS Variables would not be flagged as
94
- // real CSS properties. In practice, they are. Let's remove them from
95
- // the extract.
96
- ['property-name', '--*'].forEach(prop => {
97
- if ((result.css.properties || {})[prop]) {
98
- delete result.css.properties[prop];
99
- }
100
- });
101
-
102
- // Specific rule for CSS extracts:
103
- // Parse extracted CSS definitions and add generated IDL attribute names
104
- Object.entries(result.css.properties || {}).forEach(([prop, dfn]) => {
105
- if (dfn.value || dfn.newValues) {
106
- try {
107
- dfn.parsedValue = cssDfnParser.parsePropDefValue(
108
- dfn.value || dfn.newValues);
109
- } catch (e) {
110
- dfn.valueParseError = e.message;
111
- }
112
- }
113
- dfn.styleDeclaration = getGeneratedIDLNamesByCSSProperty(prop);
114
- });
115
- Object.entries(result.css.descriptors || {}).forEach(([desc, dfn]) => {
116
- if (dfn.value) {
117
- try {
118
- dfn.parsedValue = cssDfnParser.parsePropDefValue(
119
- dfn.value);
120
- } catch (e) {
121
- dfn.valueParseError = e.message;
122
- }
123
- }
124
- });
125
- Object.entries(result.css.valuespaces || {}).forEach(([vs, dfn]) => {
126
- if (dfn.value) {
127
- try {
128
- dfn.parsedValue = cssDfnParser.parsePropDefValue(
129
- dfn.value);
130
- } catch (e) {
131
- dfn.valueParseError = e.message;
132
- }
133
- }
134
- });
135
- }
136
-
137
- // Copy results back into initial spec object
138
- spec.crawled = result.crawled;
139
- crawlOptions.modules.forEach(mod => {
140
- if (result[mod.property]) {
141
- spec[mod.property] = result[mod.property];
142
- if (mod.property === 'idl') {
143
- spec.idlparsed = result.idlparsed;
144
- }
145
- }
146
- });
147
- }
148
- catch (err) {
149
- spec.title = spec.title || '[Could not be determined, see error]';
150
- spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
151
- }
152
-
153
- return spec;
154
- }
155
-
156
-
157
- /**
158
- * Saves spec results to extract files as needed and replaces the results with
159
- * links accordingly.
160
- *
161
- * @function
162
- * @param {Object} spec The results of crawling the spec. Object should contain
163
- * metadata about the spec and the crawl processing results in appropriate
164
- * properties.
165
- * @param {Object} settings Crawl settings. Recognized settings: "modules",
166
- * "output" and "quiet". See CLI help (node reffy.js --help) for details.
167
- * The "modules" setting is mandatory and note that the function will not do
168
- * anything if "output" is not set.
169
- * @return {Promise<Object>} The promise to get an updated spec object that
170
- * contains links to created extracts.
171
- */
172
- async function saveSpecResults(spec, settings) {
173
- settings = settings || {};
174
- if (!settings.output) {
175
- return spec;
176
- }
177
-
178
- async function getSubfolder(name) {
179
- let subfolder = path.join(settings.output, name);
180
- await createFolderIfNeeded(subfolder);
181
- return subfolder;
182
- }
183
-
184
- const modules = settings.modules;
185
- const folders = {};
186
- for (const mod of modules) {
187
- if (mod.metadata) {
188
- continue;
189
- }
190
- folders[mod.property] = await getSubfolder(mod.property);
191
-
192
- // Specific rule for IDL:
193
- // Raw IDL goes to "idl" subfolder, parsed IDL goes to "idlparsed"
194
- if (mod.property === 'idl') {
195
- folders.idlparsed = await getSubfolder('idlparsed');
196
- }
197
- }
198
-
199
- function getBaseJSON(spec) {
200
- return {
201
- spec: {
202
- title: spec.title,
203
- url: spec.crawled
204
- }
205
- };
206
- }
207
-
208
- async function saveExtract(spec, property, filter) {
209
- if (filter(spec)) {
210
- const contents = getBaseJSON(spec);
211
- contents[property] = spec[property];
212
- const json = JSON.stringify(contents, null, 2);
213
- const filename = path.join(folders[property], spec.shortname + '.json');
214
- await fs.promises.writeFile(filename, json);
215
- spec[property] = `${property}/${spec.shortname}.json`;
216
- }
217
- else {
218
- delete spec[property];
219
- }
220
- }
221
-
222
- async function saveIdl(spec) {
223
- let idlHeader = `
224
- // GENERATED CONTENT - DO NOT EDIT
225
- // Content was automatically extracted by Reffy into webref
226
- // (https://github.com/w3c/webref)
227
- // Source: ${spec.title} (${spec.crawled})`;
228
- idlHeader = idlHeader.replace(/^\s+/gm, '').trim() + '\n\n';
229
- const idl = idlHeader + spec.idl + '\n';
230
- await fs.promises.writeFile(
231
- path.join(folders.idl, spec.shortname + '.idl'), idl);
232
- return `idl/${spec.shortname}.idl`;
233
- };
234
-
235
- async function saveCss(spec) {
236
- // There are no comments in JSON, so include the spec title+URL as the
237
- // first property instead.
238
- const css = Object.assign(getBaseJSON(spec), spec.css);
239
- const json = JSON.stringify(css, (key, val) => {
240
- if ((key === 'parsedValue') || (key === 'valueParseError')) {
241
- return undefined;
242
- }
243
- else {
244
- return val;
245
- }
246
- }, 2) + '\n';
247
- const pathname = path.join(folders.css, spec.shortname + '.json')
248
- await fs.promises.writeFile(pathname, json);
249
- return `css/${spec.shortname}.json`;
250
- };
251
-
252
- // Save IDL dumps
253
- if (spec.idl) {
254
- spec.idl = await saveIdl(spec);
255
- }
256
- if (spec.idlparsed) {
257
- spec.idlparsed = await saveIdlParsed(spec, settings.output);
258
- }
259
-
260
- // Save CSS dumps
261
- function defineCSSContent(spec) {
262
- return spec.css && (
263
- (Object.keys(spec.css.properties || {}).length > 0) ||
264
- (Object.keys(spec.css.descriptors || {}).length > 0) ||
265
- (Object.keys(spec.css.valuespaces || {}).length > 0));
266
- }
267
- if (defineCSSContent(spec)) {
268
- spec.css = await saveCss(spec);
269
- }
270
-
271
- // Specs that define CSS now have a "css" key that points to the CSS extract.
272
- // Specs that don't define CSS still have a "css" key that points to an
273
- // empty object structure. Let's get rid of it.
274
- if (spec.css && typeof spec.css !== 'string') {
275
- delete spec.css;
276
- }
277
-
278
- // Quick and dirty function to determine whether a variable is "empty"
279
- // (it returns true for falsy values, which is good enough for what we need)
280
- function isEmpty(thing) {
281
- return !thing ||
282
- Array.isArray(thing) && (thing.length === 0) ||
283
- (typeof thing == 'object') && (Object.keys(thing).length === 0);
284
- }
285
-
286
- // Save all other extracts
287
- const remainingModules = modules.filter(mod =>
288
- !mod.metadata && mod.property !== 'css' && mod.property !== 'idl');
289
- for (const mod of remainingModules) {
290
- await saveExtract(spec, mod.property, spec => !isEmpty(spec[mod.property]));
291
- if (spec[mod.property] && typeof spec[mod.property] !== 'string') {
292
- delete spec[mod.property];
293
- }
294
- }
295
-
296
- return spec;
297
- }
298
-
299
-
300
- /**
301
- * Main method that crawls the list of specification URLs and returns a structure
302
- * that fully describes each spec's title, URLs, references, and IDL definitions.
303
- *
304
- * @function
305
- * @param {Array(String)} speclist List of URLs to parse
306
- * @param {Object} crawlOptions Crawl options
307
- * @return {Promise<Array<Object>>} The promise to get an array of complete
308
- * specification descriptions
309
- */
310
- async function crawlList(speclist, crawlOptions) {
311
- crawlOptions = crawlOptions || {};
312
-
313
- // Prepare Puppeteer instance
314
- crawlOptions.modules = expandBrowserModules(crawlOptions.modules);
315
- await setupBrowser(crawlOptions.modules);
316
-
317
- const list = speclist.map(completeWithAlternativeUrls);
318
- const listAndPromise = list.map(spec => {
319
- let resolve = null;
320
- let reject = null;
321
- let readyToCrawl = new Promise((resolveFunction, rejectFunction) => {
322
- resolve = resolveFunction;
323
- reject = rejectFunction;
324
- });
325
- return { spec, readyToCrawl, resolve, reject };
326
- });
327
-
328
- // In debug mode, specs are processed one by one. In normal mode,
329
- // specs are processed in chunks
330
- const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);
331
-
332
- let pos = 0;
333
- function flagNextSpecAsReadyToCrawl() {
334
- if (pos < listAndPromise.length) {
335
- listAndPromise[pos].resolve();
336
- pos += 1;
337
- }
338
- }
339
- for (let i = 0; i < chunkSize; i++) {
340
- flagNextSpecAsReadyToCrawl();
341
- }
342
-
343
- const nbStr = '' + listAndPromise.length;
344
- async function crawlSpecAndPromise(specAndPromise, idx) {
345
- await specAndPromise.readyToCrawl;
346
- const spec = specAndPromise.spec;
347
- const logCounter = ('' + (idx + 1)).padStart(nbStr.length, ' ') + '/' + nbStr;
348
- crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - crawling`);
349
- let result = await crawlSpec(spec, crawlOptions);
350
- result = await saveSpecResults(result, crawlOptions);
351
- crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - done`);
352
- flagNextSpecAsReadyToCrawl();
353
-
354
- return result;
355
- }
356
-
357
- const results = await Promise.all(listAndPromise.map(crawlSpecAndPromise));
358
-
359
- // Close Puppeteer instance
360
- teardownBrowser();
361
-
362
- return results;
363
- }
364
-
365
-
366
- /**
367
- * Merges extracts per series for the given property and adjusts links
368
- *
369
- * @function
370
- * @param {Array(object)} data Crawl results
371
- * @param {string} property The extract property to process
372
- * @param {Object} settings Crawl settings. The function looks at the "output"
373
- * setting to determine where to look for extracts
374
- * @return {Promise(Array)} The promise to get an updated crawl results array
375
- */
376
- async function adjustExtractsPerSeries(data, property, settings) {
377
- if (!settings.output) {
378
- return data;
379
- }
380
-
381
- const fullLevels = data.filter(spec =>
382
- (spec.seriesComposition !== 'delta') &&
383
- isLatestLevelThatPasses(spec, data, spec => spec[property]));
384
- const deltaLevels = data.filter(spec =>
385
- (spec.seriesComposition === 'delta') && spec[property]);
386
-
387
- data.forEach(spec => {
388
- if (fullLevels.includes(spec)) {
389
- // Full level, rename the extract after the series' shortname
390
- const pathname = path.resolve(settings.output, spec[property]);
391
- spec[property] = `${property}/${spec.series.shortname}${path.extname(spec[property])}`;
392
- const newpathname = path.resolve(settings.output, spec[property]);
393
- fs.renameSync(pathname, newpathname);
394
- }
395
- else if (deltaLevels.includes(spec)) {
396
- // Delta level, need to keep the extract as-is
397
- }
398
- else if (spec[property]) {
399
- // Not the right full level in the series, drop created extract
400
- // and link to the series extract instead
401
- const pathname = path.resolve(settings.output, spec[property]);
402
- fs.unlinkSync(pathname);
403
- spec[property] = `${property}/${spec.series.shortname}${path.extname(spec[property])}`;
404
- }
405
- });
406
-
407
- return data;
408
- }
409
-
410
-
411
- /**
412
- * Saves the crawl results to an index.json file.
413
- *
414
- * @function
415
- * @param {Array(Object)} data The list of specification structures to save
416
- * @param {Object} settings Crawl settings. The function does not create any
417
- * save file if the "output" setting is not set.
418
- * @return {Promise<Object>} The promise to get the saved crawl index (or the given data, unchanged, when "output" is not set)
419
- */
420
- async function saveResults(data, settings) {
421
- if (!settings.output) {
422
- return data;
423
- }
424
-
425
- // Save all results to an index.json file
426
- const indexFilename = path.join(settings.output, 'index.json');
427
- const contents = {
428
- type: 'crawl',
429
- title: 'Reffy crawl',
430
- date: (new Date()).toJSON(),
431
- options: settings,
432
- stats: {},
433
- results: data
434
- };
435
- contents.options.modules = contents.options.modules.map(mod => mod.property);
436
- contents.stats = {
437
- crawled: contents.results.length,
438
- errors: contents.results.filter(spec => !!spec.error).length
439
- };
440
-
441
- await fs.promises.writeFile(indexFilename, JSON.stringify(contents, null, 2));
442
- return contents;
443
- }
444
-
445
-
446
- /**
447
- * Crawls the specifications listed in the given JSON file and generates a
448
- * crawl report in the given folder.
449
- *
450
- * @function
451
- * @param {Object} options Crawl options. Possible options are:
452
- * publishedVersion, debug, output, terse, modules and specs.
453
- * See CLI help (node reffy.js --help) for details.
454
- * @return {Promise<void>} The promise that the crawl will have been made
455
- */
456
- function crawlSpecs(options) {
457
- function prepareListOfSpecs(list) {
458
- return list.map(spec => {
459
- if (typeof spec !== 'string') {
460
- return spec;
461
- }
462
- let match = specs.find(s => s.url === spec || s.shortname === spec);
463
- if (!match) {
464
- match = specs.find(s => s.series &&
465
- s.series.shortname === spec &&
466
- s.series.currentSpecification === s.shortname);
467
- }
468
- if (match) {
469
- return match;
470
- }
471
-
472
- let url = null;
473
- try {
474
- url = (new URL(spec)).href;
475
- }
476
- catch {
477
- if (spec.endsWith('.html')) {
478
- url = (new URL(spec, `file://${process.cwd()}/`)).href;
479
- }
480
- else {
481
- const msg = `Spec ID "${spec}" can neither be interpreted as a URL, a valid shortname or a relative path to an HTML file`;
482
- throw new Error(msg);
483
- }
484
- }
485
- return {
486
- url,
487
- nightly: { url },
488
- shortname: spec.replace(/[:\/\\\.]/g, ''),
489
- series: {
490
- shortname: spec.replace(/[:\/\\\.]/g, ''),
491
- }
492
- };
493
- });
494
- }
495
-
496
- const requestedList = (options && options.specs) ?
497
- prepareListOfSpecs(options.specs) :
498
- specs;
499
-
500
- return crawlList(requestedList, options)
501
- .then(async results => {
502
- // Merge extracts per series when necessary (CSS/IDL extracts)
503
- for (const mod of options.modules) {
504
- if (mod.extractsPerSeries) {
505
- await adjustExtractsPerSeries(results, mod.property, options);
506
- }
507
- }
508
- return results;
509
- })
510
- .then(results => {
511
- // Return results to the console or save crawl results to an
512
- // index.json file
513
- if (options.terse) {
514
- const property = options.modules[0].property;
515
- results = results.map(result => {
516
- let res = result[property];
517
- if (property === 'idl') {
518
- res = res?.idl;
519
- }
520
- return res;
521
- });
522
- if (results.length === 1) {
523
- results = results[0];
524
- }
525
- console.log(typeof results === 'string' ?
526
- results : JSON.stringify(results, null, 2));
527
- }
528
- else if (!options.output) {
529
- console.log(JSON.stringify(results, null, 2));
530
- }
531
- else {
532
- return saveResults(results, options);
533
- }
534
- })
535
- .then(async crawlIndex => {
536
- // Generate IDL names extracts from IDL extracts
537
- // (and dfns extracts to create links to definitions)
538
- if (!options.output || !crawlIndex?.options?.modules?.find(mod => mod === 'idl')) {
539
- return;
540
- }
541
- const crawlResults = await expandCrawlResult(crawlIndex, options.output, ['idlparsed', 'dfns']);
542
- const idlNames = generateIdlNames(crawlResults.results, options);
543
- await saveIdlNames(idlNames, options.output);
544
- });
545
- }
546
-
547
-
548
- /**************************************************
549
- Export methods for use as module
550
- **************************************************/
551
- module.exports.crawlList = crawlList;
552
- module.exports.crawlSpecs = crawlSpecs;
1
+ #!/usr/bin/env node
2
+ /**
3
+ * The spec crawler takes a list of spec URLs as input, gathers some knowledge
4
+ * about these specs (published versions, URL of the Editor's Draft, etc.),
5
+ * fetches these specs, parses them, extracts relevant information that they
6
+ * contain (such as the WebIDL they define, the list of specifications that they
7
+ * reference, and links to external specs), and produces a crawl report with the
8
+ * results of these investigations.
9
+ *
10
+ * @module crawler
11
+ */
12
+
13
+ const fs = require('fs');
14
+ const path = require('path');
15
+ const specs = require('browser-specs');
16
+ const cssDfnParser = require('./css-grammar-parser');
17
+ const { generateIdlParsed, saveIdlParsed } = require('../cli/generate-idlparsed');
18
+ const { generateIdlNames, saveIdlNames } = require('../cli/generate-idlnames');
19
+ const {
20
+ completeWithAlternativeUrls,
21
+ expandBrowserModules,
22
+ expandCrawlResult,
23
+ expandSpecResult,
24
+ getGeneratedIDLNamesByCSSProperty,
25
+ isLatestLevelThatPasses,
26
+ processSpecification,
27
+ setupBrowser,
28
+ teardownBrowser,
29
+ createFolderIfNeeded
30
+ } = require('./util');
31
+
32
+ const {version: reffyVersion} = require('../../package.json');
33
+
34
+ /**
35
+ * Return the spec if crawl succeeded or crawl result from given fallback list
36
+ * if crawl yielded an error (and fallback does exist).
37
+ *
38
+ * The function keeps the "error" property on the crawl result it returns so
39
+ * that the error does not get entirely lost.
40
+ *
41
+ * @function
42
+ * @param {Object} spec Actual spec crawl result
43
+ * (its "error" property, when set, triggers the lookup in the fallback list)
44
+ * @param {String} fallbackFolder The folder that contains fallback extracts
45
+ * @param {Array<Object>} fallbackData A list of crawl results to use as
46
+ * fallback when needed
47
+ * @return {Object} The given crawl result or a new one that reuses fallback
48
+ * content if needed
49
+ */
50
+ async function specOrFallback(spec, fallbackFolder, fallbackData) {
51
+ if (spec.error && fallbackData) {
52
+ const fallback = fallbackData.find(s => s.url === spec.url);
53
+ if (fallback) {
54
+ const copy = Object.assign({}, fallback);
55
+ const result = await expandSpecResult(copy, fallbackFolder);
56
+ result.error = spec.error;
57
+ return result;
58
+ }
59
+ }
60
+ return spec;
61
+ }
62
+
63
+
64
+ /**
65
+ * Load and parse the given spec.
66
+ *
67
+ * @function
68
+ * @param {Object} spec The spec to load (must already have been completed with
69
+ * useful info, as returned by "createInitialSpecDescriptions")
70
+ * @param {Object} crawlOptions Crawl options
71
+ * @return {Promise<Object>} The promise to get a spec object with crawl info
72
+ */
73
+ async function crawlSpec(spec, crawlOptions) {
74
+ crawlOptions = crawlOptions || {};
75
+ spec.crawled = crawlOptions.publishedVersion ?
76
+ (spec.release ? spec.release : spec.nightly) :
77
+ spec.nightly;
78
+ const fallbackFolder = crawlOptions.fallback ?
79
+ path.dirname(crawlOptions.fallback) : '';
80
+
81
+ if (spec.error) {
82
+ return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
83
+ }
84
+
85
+ try {
86
+ const fallback = crawlOptions.fallbackData?.results?.find(s => s.url === spec.url);
87
+ let cacheInfo = {};
88
+ if (crawlOptions.fallbackData?.crawler === `reffy-${reffyVersion}`) {
89
+ cacheInfo = Object.assign({}, fallback?.crawlCacheInfo);
90
+ }
91
+ const result = await processSpecification(
92
+ spec.crawled,
93
+ (spec, modules) => {
94
+ const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
95
+ window.reffy.mapIdsToHeadings() : null;
96
+ const res = {
97
+ crawled: window.location.toString()
98
+ };
99
+ modules.forEach(mod => {
100
+ res[mod.property] = window.reffy[mod.name](spec, idToHeading);
101
+ });
102
+ return res;
103
+ },
104
+ [spec, crawlOptions.modules],
105
+ { quiet: crawlOptions.quiet,
106
+ forceLocalFetch: crawlOptions.forceLocalFetch,
107
+ ...cacheInfo}
108
+ );
109
+ if (result.status === "notmodified" && fallback) {
110
+ crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
111
+ const copy = Object.assign({}, fallback);
112
+ return expandSpecResult(copy, fallbackFolder);
113
+ }
114
+
115
+ // Specific rule for IDL extracts:
116
+ // parse the extracted WebIdl content
117
+ await generateIdlParsed(result);
118
+
119
+ if (result.css) {
120
+ // Specific rule for CSS properties:
121
+ // Add CSS property definitions that weren't in a table
122
+ if (result.dfns) {
123
+ result.dfns
124
+ .filter(dfn => dfn.type == "property" && !dfn.informative)
125
+ .forEach(propDfn => {
126
+ propDfn.linkingText.forEach(lt => {
127
+ if (!result.css.properties.hasOwnProperty(lt)) {
128
+ result.css.properties[lt] = {
129
+ name: lt
130
+ };
131
+ }
132
+ });
133
+ });
134
+ }
135
+
136
+ // Specific rule for CSS properties:
137
+ // Ideally, the sample definition (property-name) in CSS2 and the custom
138
+ // property definition (--*) in CSS Variables would not be flagged as
139
+ // real CSS properties. In practice, they are. Let's remove them from
140
+ // the extract.
141
+ ['property-name', '--*'].forEach(prop => {
142
+ if ((result.css.properties || {})[prop]) {
143
+ delete result.css.properties[prop];
144
+ }
145
+ });
146
+
147
+ // Specific rule for CSS extracts:
148
+ // Parse extracted CSS definitions and add generated IDL attribute names
149
+ Object.entries(result.css.properties || {}).forEach(([prop, dfn]) => {
150
+ if (dfn.value || dfn.newValues) {
151
+ try {
152
+ dfn.parsedValue = cssDfnParser.parsePropDefValue(
153
+ dfn.value || dfn.newValues);
154
+ } catch (e) {
155
+ dfn.valueParseError = e.message;
156
+ }
157
+ }
158
+ dfn.styleDeclaration = getGeneratedIDLNamesByCSSProperty(prop);
159
+ });
160
+ Object.entries(result.css.descriptors || {}).forEach(([desc, dfn]) => {
161
+ if (dfn.value) {
162
+ try {
163
+ dfn.parsedValue = cssDfnParser.parsePropDefValue(
164
+ dfn.value);
165
+ } catch (e) {
166
+ dfn.valueParseError = e.message;
167
+ }
168
+ }
169
+ });
170
+ Object.entries(result.css.valuespaces || {}).forEach(([vs, dfn]) => {
171
+ if (dfn.value) {
172
+ try {
173
+ dfn.parsedValue = cssDfnParser.parsePropDefValue(
174
+ dfn.value);
175
+ } catch (e) {
176
+ dfn.valueParseError = e.message;
177
+ }
178
+ }
179
+ });
180
+ }
181
+
182
+ // Copy results back into initial spec object
183
+ spec.crawled = result.crawled;
184
+ if (result.crawlCacheInfo) {
185
+ spec.crawlCacheInfo = result.crawlCacheInfo;
186
+ }
187
+ crawlOptions.modules.forEach(mod => {
188
+ if (result[mod.property]) {
189
+ spec[mod.property] = result[mod.property];
190
+ if (mod.property === 'idl') {
191
+ spec.idlparsed = result.idlparsed;
192
+ }
193
+ }
194
+ });
195
+ }
196
+ catch (err) {
197
+ spec.title = spec.title || '[Could not be determined, see error]';
198
+ spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
199
+ }
200
+
201
+ return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
202
+ }
203
+
204
+
205
+ /**
206
+ * Saves spec results to extract files as needed and replaces the results with
207
+ * links accordingly.
208
+ *
209
+ * @function
210
+ * @param {Object} spec The results of crawling the spec. Object should contain
211
+ * metadata about the spec and the crawl processing results in appropriate
212
+ * properties.
213
+ * @param {Object} settings Crawl settings. Recognized settings: "modules",
214
+ * "output" and "quiet". See CLI help (node reffy.js --help) for details.
215
+ * The "modules" setting is mandatory and note that the function will not do
216
+ * anything if "output" is not set.
217
+ * @return {Promise<Object>} The promise to get an updated spec object that
218
+ * contains links to created extracts.
219
+ */
220
+ async function saveSpecResults(spec, settings) {
221
+ settings = settings || {};
222
+ if (!settings.output) {
223
+ return spec;
224
+ }
225
+
226
+ async function getSubfolder(name) {
227
+ let subfolder = path.join(settings.output, name);
228
+ await createFolderIfNeeded(subfolder);
229
+ return subfolder;
230
+ }
231
+
232
+ const modules = settings.modules;
233
+ const folders = {};
234
+ for (const mod of modules) {
235
+ if (mod.metadata) {
236
+ continue;
237
+ }
238
+ folders[mod.property] = await getSubfolder(mod.property);
239
+
240
+ // Specific rule for IDL:
241
+ // Raw IDL goes to "idl" subfolder, parsed IDL goes to "idlparsed"
242
+ if (mod.property === 'idl') {
243
+ folders.idlparsed = await getSubfolder('idlparsed');
244
+ }
245
+ }
246
+
247
+ function getBaseJSON(spec) {
248
+ return {
249
+ spec: {
250
+ title: spec.title,
251
+ url: spec.crawled
252
+ }
253
+ };
254
+ }
255
+
256
+ async function saveExtract(spec, property, filter) {
257
+ if (filter(spec)) {
258
+ const contents = getBaseJSON(spec);
259
+ contents[property] = spec[property];
260
+ const json = JSON.stringify(contents, null, 2);
261
+ const filename = path.join(folders[property], spec.shortname + '.json');
262
+ await fs.promises.writeFile(filename, json);
263
+ spec[property] = `${property}/${spec.shortname}.json`;
264
+ }
265
+ else {
266
+ delete spec[property];
267
+ }
268
+ }
269
+
270
+ async function saveIdl(spec) {
271
+ let idlHeader = `
272
+ // GENERATED CONTENT - DO NOT EDIT
273
+ // Content was automatically extracted by Reffy into webref
274
+ // (https://github.com/w3c/webref)
275
+ // Source: ${spec.title} (${spec.crawled})`;
276
+ idlHeader = idlHeader.replace(/^\s+/gm, '').trim() + '\n\n';
277
+ const idl = idlHeader + spec.idl + '\n';
278
+ await fs.promises.writeFile(
279
+ path.join(folders.idl, spec.shortname + '.idl'), idl);
280
+ return `idl/${spec.shortname}.idl`;
281
+ };
282
+
283
+ async function saveCss(spec) {
284
+ // There are no comments in JSON, so include the spec title+URL as the
285
+ // first property instead.
286
+ const css = Object.assign(getBaseJSON(spec), spec.css);
287
+ const json = JSON.stringify(css, (key, val) => {
288
+ if ((key === 'parsedValue') || (key === 'valueParseError')) {
289
+ return undefined;
290
+ }
291
+ else {
292
+ return val;
293
+ }
294
+ }, 2) + '\n';
295
+ const pathname = path.join(folders.css, spec.shortname + '.json')
296
+ await fs.promises.writeFile(pathname, json);
297
+ return `css/${spec.shortname}.json`;
298
+ };
299
+
300
+ // Save IDL dumps
301
+ if (spec.idl) {
302
+ spec.idl = await saveIdl(spec);
303
+ }
304
+ if (spec.idlparsed) {
305
+ spec.idlparsed = await saveIdlParsed(spec, settings.output);
306
+ }
307
+
308
+ // Save CSS dumps
309
/**
 * Tells whether the spec's CSS extract contains at least one property,
 * descriptor or valuespace.
 *
 * @param {Object} spec Spec crawl result
 * @return {*} The falsy "css" value itself when there is no CSS extract,
 *   a boolean otherwise
 */
function defineCSSContent(spec) {
    const css = spec.css;
    if (!css) {
        return css;
    }
    return ['properties', 'descriptors', 'valuespaces']
        .some(section => Object.keys(css[section] || {}).length > 0);
}
315
+ if (defineCSSContent(spec)) {
316
+ spec.css = await saveCss(spec);
317
+ }
318
+
319
+ // Specs that define CSS now have a "css" key that point to the CSS extract.
320
+ // Specs that don't define CSS still have a "css" key that points to an
321
+ // empty object structure. Let's get rid of it.
322
+ if (spec.css && typeof spec.css !== 'string') {
323
+ delete spec.css;
324
+ }
325
+
326
// Rough emptiness check: true for any falsy value, for an array of length 0
// and for an object without enumerable keys of its own (this is not meant to
// be a generic helper, it is good enough for extracts)
function isEmpty(thing) {
    if (!thing) {
        return true;
    }
    if (Array.isArray(thing) && (thing.length === 0)) {
        return true;
    }
    return (typeof thing === 'object') && (Object.keys(thing).length === 0);
}
333
+
334
+ // Save all other extracts
335
+ const remainingModules = modules.filter(mod =>
336
+ !mod.metadata && mod.property !== 'css' && mod.property !== 'idl');
337
+ for (const mod of remainingModules) {
338
+ await saveExtract(spec, mod.property, spec => !isEmpty(spec[mod.property]));
339
+ if (spec[mod.property] && typeof spec[mod.property] !== 'string') {
340
+ delete spec[mod.property];
341
+ }
342
+ }
343
+
344
+ return spec;
345
+ }
346
+
347
+
348
/**
 * Main method that crawls the given list of specifications and returns, for
 * each of them, the result of crawling the spec (see crawlSpec) once its
 * extracts have been saved (see saveSpecResults).
 *
 * At most 4 specs are crawled at the same time (only 1 in debug mode): a new
 * crawl starts each time a running crawl ends.
 *
 * @function
 * @param {Array} speclist List of specs to crawl. Each entry is completed
 *   through completeWithAlternativeUrls before it gets crawled
 * @param {Object} crawlOptions Crawl options. The function itself looks at
 *   the "modules", "fallback", "debug" and "quiet" options, and passes the
 *   options on to crawlSpec and saveSpecResults
 * @return {Promise<Array(Object)>} The promise to get an array of crawl
 *   results, in the same order as the given list
 * @throws {Error} When the fallback data file cannot be read or parsed (the
 *   underlying error is attached as "cause")
 */
async function crawlList(speclist, crawlOptions) {
    // Make a shallow copy of crawl options object since we're going
    // to modify properties in place
    crawlOptions = Object.assign({}, crawlOptions);

    // Expand list of processing modules to use if not already done
    crawlOptions.modules = expandBrowserModules(crawlOptions.modules);

    // Load fallback data if necessary
    if (crawlOptions.fallback) {
        try {
            crawlOptions.fallbackData = JSON.parse(
                await fs.promises.readFile(crawlOptions.fallback));
        }
        catch (e) {
            throw new Error(
                `Could not parse fallback data file ${crawlOptions.fallback}`,
                { cause: e });
        }
    }

    // Progress messages go to stderr unless the "quiet" option is truthy.
    // (Testing truthiness matters: `quiet ?? log()` would also silence
    // messages when the option is explicitly set to false)
    function log(msg) {
        if (!crawlOptions.quiet) {
            console.warn(msg);
        }
    }

    // Prepare Puppeteer instance
    await setupBrowser(crawlOptions.modules);

    try {
        // Associate each spec with a promise that gets resolved when the
        // spec may start to be crawled
        const list = speclist.map(completeWithAlternativeUrls);
        const listAndPromise = list.map(spec => {
            let resolve = null;
            const readyToCrawl = new Promise(resolveFunction => {
                resolve = resolveFunction;
            });
            return { spec, readyToCrawl, resolve };
        });

        // In debug mode, specs are processed one by one. In normal mode,
        // specs are processed in chunks
        const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);

        let pos = 0;
        function flagNextSpecAsReadyToCrawl() {
            if (pos < listAndPromise.length) {
                listAndPromise[pos].resolve();
                pos += 1;
            }
        }
        for (let i = 0; i < chunkSize; i++) {
            flagNextSpecAsReadyToCrawl();
        }

        const nbStr = '' + listAndPromise.length;
        async function crawlSpecAndPromise(specAndPromise, idx) {
            await specAndPromise.readyToCrawl;
            const spec = specAndPromise.spec;
            const logCounter = ('' + (idx + 1)).padStart(nbStr.length, ' ') + '/' + nbStr;
            log(`${logCounter} - ${spec.url} - crawling`);
            let result = await crawlSpec(spec, crawlOptions);
            result = await saveSpecResults(result, crawlOptions);
            log(`${logCounter} - ${spec.url} - done`);

            // One slot got freed, the next spec in the list may start
            flagNextSpecAsReadyToCrawl();

            return result;
        }

        return await Promise.all(listAndPromise.map(crawlSpecAndPromise));
    }
    finally {
        // Close Puppeteer instance, also when a crawl failed, so that the
        // browser does not linger around
        await teardownBrowser();
    }
}
425
+
426
+
427
/**
 * Reorganizes the extract files of the given property per series, and
 * updates the links to these files in the crawl results accordingly:
 * - the extract of a non-delta spec selected by isLatestLevelThatPasses gets
 *   renamed after the shortname of the series;
 * - the extract of a delta spec is left untouched;
 * - the extract of any other spec gets deleted, along with the link to it.
 *
 * @function
 * @param {Array(object)} data Crawl results, updated in place
 * @param {string} property The extract property to process
 * @param {Object} settings Crawl settings. Extract files are looked for under
 *   the folder set in "output". Nothing happens when "output" is not set
 * @return {Promise(Array)} The promise to get the updated crawl results array
 */
async function adjustExtractsPerSeries(data, property, settings) {
    if (!settings.output) {
        return data;
    }

    const hasExtract = spec => spec[property];
    const toAbsolute = relative => path.resolve(settings.output, relative);

    // Both sets need to be computed before the loop below starts to update
    // extract links
    const fullLevels = new Set(data.filter(spec =>
        (spec.seriesComposition !== 'delta') &&
        isLatestLevelThatPasses(spec, data, hasExtract)));
    const deltaLevels = new Set(data.filter(spec =>
        (spec.seriesComposition === 'delta') && spec[property]));

    data.forEach(spec => {
        if (fullLevels.has(spec)) {
            // Full level, the extract takes the name of the series
            const previous = toAbsolute(spec[property]);
            const extension = path.extname(spec[property]);
            spec[property] = `${property}/${spec.series.shortname}${extension}`;
            fs.renameSync(previous, toAbsolute(spec[property]));
            return;
        }

        // Delta levels keep their extract as-is, and there is nothing to
        // clean up for specs that do not have an extract
        if (deltaLevels.has(spec) || !spec[property]) {
            return;
        }

        // Not the right full level in the series, drop created extract
        fs.unlinkSync(toAbsolute(spec[property]));
        delete spec[property];
    });

    return data;
}
469
+
470
+
471
/**
 * Saves the crawl results to an index.json file.
 *
 * The index contains the crawl settings, in which the list of modules gets
 * replaced by the list of their "property" names. The given settings object
 * is left untouched.
 *
 * @function
 * @param {Array(Object)} data The list of specification structures to save
 * @param {Object} settings Crawl settings. The function does not create any
 *   save file if the "output" setting is not set. The "modules" setting must
 *   be an array of objects that have a "property" key
 * @return {Promise<Object|Array>} The promise to get the contents saved to
 *   the index.json file, or the given data when "output" is not set
 */
async function saveResults(data, settings) {
    if (!settings.output) {
        return data;
    }

    // Save all results to an index.json file
    const indexFilename = path.join(settings.output, 'index.json');

    const contents = {
        type: 'crawl',
        title: 'Reffy crawl',
        date: (new Date()).toJSON(),
        // Work on a copy of the settings: the caller's list of modules must
        // not be turned into a list of names behind its back
        options: Object.assign({}, settings, {
            modules: settings.modules.map(mod => mod.property)
        }),
        stats: {
            crawled: data.length,
            errors: data.filter(spec => !!spec.error).length
        },
        crawler: `reffy-${reffyVersion}`,
        results: data
    };

    await fs.promises.writeFile(indexFilename, JSON.stringify(contents, null, 2));
    return contents;
}
506
+
507
+
508
/**
 * Crawls the requested specifications (or all known specs when none are
 * requested), then reports the crawl results: to the console in terse mode
 * or when no output folder is set, to an index.json file in the output
 * folder otherwise. IDL names extracts also get generated when results were
 * saved to the output folder and the crawl ran the "idl" module.
 *
 * @function
 * @param {Object} options Crawl options. Possible options are:
 *   publishedVersion, debug, output, terse, modules and specs.
 *   See CLI help (node reffy.js --help) for details.
 * @return {Promise<void>} The promise that the crawl will have been made
 * @throws {Error} Synchronously, when a requested spec ID is neither a known
 *   spec, a URL nor a relative path to an HTML file
 */
function crawlSpecs(options) {
    // Turns a spec ID into something that can be used as shortname
    const toShortname = id => id.replace(/[:\/\\\.]/g, '');

    // Maps a requested spec to a spec object. Objects are returned as-is.
    // Strings are matched against known specs (URL or shortname first, then
    // shortname of a series, which maps to the current spec in the series),
    // and otherwise interpreted as a URL or as a path to an HTML file.
    function toSpecObject(spec) {
        if (typeof spec !== 'string') {
            return spec;
        }

        const known =
            specs.find(s => s.url === spec || s.shortname === spec) ||
            specs.find(s => s.series &&
                s.series.shortname === spec &&
                s.series.currentSpecification === s.shortname);
        if (known) {
            return known;
        }

        let url = null;
        try {
            url = (new URL(spec)).href;
        }
        catch {
            if (!spec.endsWith('.html')) {
                const msg = `Spec ID "${spec}" can neither be interpreted as a URL, a valid shortname or a relative path to an HTML file`;
                throw new Error(msg);
            }
            url = (new URL(spec, `file://${process.cwd()}/`)).href;
        }

        const shortname = toShortname(spec);
        return {
            url,
            nightly: { url },
            shortname,
            series: { shortname }
        };
    }

    const requestedList = options?.specs ?
        options.specs.map(spec => toSpecObject(spec)) :
        specs;

    // Make a shallow copy of passed options parameter and expand modules
    // in place.
    options = Object.assign({}, options);
    options.modules = expandBrowserModules(options.modules);

    const run = async () => {
        const results = await crawlList(requestedList, options);

        // Merge extracts per series when necessary (CSS/IDL extracts)
        for (const mod of options.modules) {
            if (!mod.extractsPerSeries) {
                continue;
            }
            await adjustExtractsPerSeries(results, mod.property, options);
            if (mod.property === 'idl') {
                await adjustExtractsPerSeries(results, 'idlparsed', options);
            }
        }

        // Return results to the console or save crawl results to an
        // index.json file
        let crawlIndex;
        if (options.terse) {
            // Terse mode only reports the extract of the first module, and
            // unwraps the list when there is only one spec
            const property = options.modules[0].property;
            let report = results.map(result =>
                (property === 'idl') ? result[property]?.idl : result[property]);
            if (report.length === 1) {
                report = report[0];
            }
            console.log(typeof report === 'string' ?
                report : JSON.stringify(report, null, 2));
        }
        else if (!options.output) {
            console.log(JSON.stringify(results, null, 2));
        }
        else {
            crawlIndex = await saveResults(results, options);
        }

        // Generate IDL names extracts from IDL extracts
        // (and dfns extracts to create links to definitions)
        if (!options.output || !crawlIndex?.options?.modules?.find(mod => mod === 'idl')) {
            return;
        }
        const crawlResults = await expandCrawlResult(crawlIndex, options.output, ['idlparsed', 'dfns']);
        const idlNames = generateIdlNames(crawlResults.results, options);
        await saveIdlNames(idlNames, options.output);
    };

    return run();
}
616
+
617
+
618
+ /**************************************************
619
+ Export methods for use as module
620
+ **************************************************/
621
// Public API of the module: crawl an explicit list of specs, or run a full
// crawl driven by crawl options
Object.assign(module.exports, { crawlList, crawlSpecs });