reffy 6.2.0 → 6.2.1

This diff compares the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (43)
  1. package/LICENSE +21 -21
  2. package/README.md +158 -158
  3. package/index.js +11 -11
  4. package/package.json +53 -53
  5. package/reffy.js +248 -248
  6. package/src/browserlib/canonicalize-url.mjs +50 -50
  7. package/src/browserlib/create-outline.mjs +352 -352
  8. package/src/browserlib/extract-cssdfn.mjs +319 -319
  9. package/src/browserlib/extract-dfns.mjs +686 -686
  10. package/src/browserlib/extract-elements.mjs +205 -205
  11. package/src/browserlib/extract-headings.mjs +48 -48
  12. package/src/browserlib/extract-ids.mjs +28 -28
  13. package/src/browserlib/extract-links.mjs +28 -28
  14. package/src/browserlib/extract-references.mjs +203 -203
  15. package/src/browserlib/extract-webidl.mjs +134 -134
  16. package/src/browserlib/get-absolute-url.mjs +21 -21
  17. package/src/browserlib/get-generator.mjs +26 -26
  18. package/src/browserlib/get-lastmodified-date.mjs +13 -13
  19. package/src/browserlib/get-title.mjs +11 -11
  20. package/src/browserlib/informative-selector.mjs +16 -16
  21. package/src/browserlib/map-ids-to-headings.mjs +136 -136
  22. package/src/browserlib/reffy.json +53 -53
  23. package/src/cli/check-missing-dfns.js +609 -609
  24. package/src/cli/generate-idlnames.js +430 -430
  25. package/src/cli/generate-idlparsed.js +139 -139
  26. package/src/cli/merge-crawl-results.js +128 -128
  27. package/src/cli/parse-webidl.js +430 -430
  28. package/src/lib/css-grammar-parse-tree.schema.json +109 -109
  29. package/src/lib/css-grammar-parser.js +440 -440
  30. package/src/lib/fetch.js +55 -55
  31. package/src/lib/nock-server.js +119 -119
  32. package/src/lib/specs-crawler.js +605 -603
  33. package/src/lib/util.js +898 -898
  34. package/src/specs/missing-css-rules.json +197 -197
  35. package/src/specs/spec-equivalents.json +149 -149
  36. package/src/browserlib/extract-editors.mjs~ +0 -14
  37. package/src/browserlib/generate-es-dfn-report.sh~ +0 -4
  38. package/src/cli/csstree-grammar-check.js +0 -28
  39. package/src/cli/csstree-grammar-check.js~ +0 -10
  40. package/src/cli/csstree-grammar-parser.js +0 -11
  41. package/src/cli/csstree-grammar-parser.js~ +0 -1
  42. package/src/cli/extract-editors.js~ +0 -38
  43. package/src/cli/process-specs.js~ +0 -28
package/src/lib/specs-crawler.js

@@ -443,9 +443,8 @@ async function adjustExtractsPerSeries(data, property, settings) {
     }
     else if (spec[property]) {
       // Not the right full level in the series, drop created extract
-      // and link to the series extract instead
       const pathname = path.resolve(settings.output, spec[property]);
       fs.unlinkSync(pathname);
-      spec[property] = `${property}/${spec.series.shortname}${path.extname(spec[property])}`;
+      delete spec[property];
     }
   });

@@ -554,6 +553,9 @@ function crawlSpecs(options) {
       for (const mod of options.modules) {
        if (mod.extractsPerSeries) {
          await adjustExtractsPerSeries(results, mod.property, options);
+         if (mod.property === 'idl') {
+           await adjustExtractsPerSeries(results, 'idlparsed', options);
+         }
        }
      }
      return results;
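
For context on where the patched code runs: crawlSpecs() is the module's public entry point (exported at the bottom of specs-crawler.js). A minimal usage sketch, assuming reffy 6.2.1 is installed locally and that the package root re-exports crawlSpecs; the spec shortname and output folder below are illustrative, not defaults:

// Minimal sketch: drive the crawler programmatically.
// Assumes `npm install reffy@6.2.1`; if the package root does not
// re-export crawlSpecs, require src/lib/specs-crawler directly.
const { crawlSpecs } = require('reffy');

crawlSpecs({
  specs: ['fetch'],        // shortnames, URLs, or local .html files
  output: 'crawl-output'   // extracts go to per-module subfolders,
                           // plus an index.json crawl report
})
  .then(() => console.log('Crawl complete'))
  .catch(err => console.error(err));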