reffy 4.0.5 → 5.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,534 +0,0 @@
1
- #!/usr/bin/env node
2
- /**
3
- * The backrefs analyzer takes paths to an ED crawl folder and a TR crawl folder,
4
- * and creates a report that lists, for each spec:
5
- *
6
- * - Links to anchors that do not exist
7
- * - Links to anchors that no longer exist in the ED of the target spec
8
- * - Links to anchors that are not definitions or headings
9
- * - Links to definitions that are not exported
10
- * - Links to dated TR URLs
11
- * - Links to specs that should no longer be referenced
12
- * - Links to documents that look like specs but are unknown in Reffy
13
- * (likely not an anomaly per se)
14
- *
15
- * It also flags links that look like specs but that do not appear in the crawl
16
- * (most of these should be false positives).
17
- *
18
- * The backrefs analyzer can be called directly through:
19
- *
20
- * `node study-backrefs.js [root crawl folder]`
21
- *
22
- * where `root crawl folder` is the path to the root folder that contains `ed`
23
- * and `tr` subfolders. Alternatively, the analyzer may be called with two
24
- * arguments, one being the path to the ED crawl folder, another being the path
25
- * to the TR crawl folder.
26
- *
27
- * @module backrefs
28
- */
29
-
30
- const {expandCrawlResult, requireFromWorkingDirectory} = require("../lib/util");
31
- const path = require("path");
32
-
33
/**
 * The backrefs analyzer only checks links to other specs. This function
 * returns a truthy value when a link does target a spec, and a falsy value if
 * it targets something else (e.g. a test suite, a wiki page, an issue, etc.)
 *
 * The returned value is either the result of the first `String.match` call
 * that succeeded, `true`/`false` for GitHub Pages URLs, or `null` when no
 * pattern matched. Callers should only rely on its truthiness.
 *
 * TODO: Consider matching specs from the Khronos Group (but note there are
 * different types of resources under www.khronos.org)
 * TODO: Consider matching TC39 specs... once we have dfns and IDs for them!
 *
 * @param {string} url The URL of the link to check
 * @return {*} A truthy value when the URL looks like the URL of a spec
 */
const matchSpecUrl = url =>
  url.match(/spec\.whatwg\.org/) ||
  url.match(/www\.w3\.org\/TR\/[a-z0-9]/) ||

  // CSS drafts tend to link to various non-spec resources under *.csswg.org
  // (e.g. log.csswg.org or wiki.csswg.org)
  url.match(/(?<!log|hg|test|wiki)\.csswg\.org(?!\/issues)/) ||
  url.match(/\.fxtf\.org/) ||
  url.match(/\.css-houdini\.org/) ||

  // SVG drafts are served from the bare "svgwg.org" host (that is the form
  // that computeShortname parses), so the host may be preceded by "/" as well
  // as by "." (e.g. "www.svgwg.org").
  url.match(/[\/.]svgwg\.org/) ||

  // Anything on GitHub Pages is considered to be a spec, except test results
  (url.match(/\.github\.io/) && !url.match(/w3c\.github\.io\/test-results\//));
54
-
55
/*
  TODO: DRY
  Copied from browser-specs/src/compute-shortname.js
*/
/**
 * Computes the shortname of a spec from its URL (or returns the given string
 * as-is when it does not contain any "/", in which case it is assumed to
 * already be a name).
 *
 * @param {string} url The URL of the spec, or a name
 * @return {string} The shortname extracted from the URL
 * @throws {Error} When no name can be extracted from the URL, or when the
 *   extracted name contains unexpected characters
 */
function computeShortname(url) {
  function parseUrl(url) {
    // Handle /TR/ URLs
    const w3cTr = url.match(/^https?:\/\/(?:www\.)?w3\.org\/TR\/([^\/]+)\/$/);
    if (w3cTr) {
      return w3cTr[1];
    }

    // Handle WHATWG specs
    const whatwg = url.match(/\/\/(.+)\.spec\.whatwg\.org\/?/);
    if (whatwg) {
      return whatwg[1];
    }

    // Handle TC39 Proposals
    const tc39 = url.match(/\/\/tc39\.es\/proposal-([^\/]+)\/$/);
    if (tc39) {
      return "tc39-" + tc39[1];
    }

    // Handle Khronos extensions
    const khronos = url.match(/https:\/\/www\.khronos\.org\/registry\/webgl\/extensions\/([^\/]+)\/$/);
    if (khronos) {
      return khronos[1];
    }

    // Handle extension specs defined in the same repo as the main spec
    // (e.g. generate a "gamepad-extensions" name for
    // "https://w3c.github.io/gamepad/extensions.html")
    const ext = url.match(/\/.*\.github\.io\/([^\/]+)\/(extensions?)\.html$/);
    if (ext) {
      return ext[1] + '-' + ext[2];
    }

    // Handle draft specs on GitHub, excluding the "webappsec-" prefix for
    // specifications developed by the Web Application Security Working Group
    const github = url.match(/\/.*\.github\.io\/(?:webappsec-)?([^\/]+)\//);
    if (github) {
      return github[1];
    }

    // Handle CSS WG specs
    const css = url.match(/\/drafts\.(?:csswg|fxtf|css-houdini)\.org\/([^\/]+)\//);
    if (css) {
      return css[1];
    }

    // Handle SVG drafts
    const svg = url.match(/\/svgwg\.org\/specs\/(?:svg-)?([^\/]+)\//);
    if (svg) {
      return "svg-" + svg[1];
    }

    // Return name when one was given
    if (!url.match(/\//)) {
      return url;
    }

    // Throw a real Error (and not a bare string) so that the failure comes
    // with a stack trace and can be recognized with `instanceof Error`
    throw new Error(`Cannot extract meaningful name from ${url}`);
  }

  // Parse the URL to extract the name
  const name = parseUrl(url);

  // Make sure name looks legit, in other words that it is composed of basic
  // Latin characters (a-z letters, digits, underscore and "-"), and that it
  // only contains a dot for fractional levels at the end of the name
  // (e.g. "blah-1.2" is good but "blah.blah" and "blah-3.1-blah" are not)
  if (!name.match(/^[\w\-]+((?<=\-\d+)\.\d+)?$/)) {
    throw new Error(`Specification name contains unexpected characters: ${name} (extracted from ${url})`);
  }

  return name;
}
134
-
135
-
136
// Shortnames of specs that should no longer be linked to, mapped to the
// shortname of the spec that should be referenced instead.
//
// The table is a prototype-less dictionary: it gets indexed with shortnames
// extracted from arbitrary URLs, and a regular object would report inherited
// members of Object.prototype (e.g. "constructor", "toString") as entries.
const shortNamesOfOutdatedSpecs = Object.assign(Object.create(null), {
  "2dcontext": "html",
  "2dcontext2": "html",
  "cors": "fetch",
  "custom-elements": "html",
  "domcore": "dom",
  "eventsource": "html",
  "html5": "html",
  "html50": "html",
  "html51": "html",
  "html52": "html",
  "selectors-api": "dom",
  "webmessaging": "html",
  "websockets": "html",
  "webstorage": "html",
  "workers": "html",
  "worklets-1": "html"
});
155
-
156
// Maps shortnames that may be extracted from links (old names, leveled names,
// names of GitHub repositories) to the shortname under which the spec is
// looked up in the crawl results.
//
// The table is a prototype-less dictionary: it gets indexed with shortnames
// extracted from arbitrary URLs, and a regular object would report inherited
// members of Object.prototype (e.g. "constructor", "toString") as entries.
const shortnameMap = Object.assign(Object.create(null), {
  "accname-1.1": "accname",
  "accname-aam-1.1": "accname",
  "BackgroundSync": "background-sync",
  "content-security-policy": "CSP",
  "core-aam-1.1": "core-aam",
  "csp": "CSP",
  "CSP2": "CSP",
  "css-color-3": "css-color",
  "css-contain-1": "css-contain",
  "css-fonts-3": "css-fonts",
  "css-grid-1": "css-grid",
  "css-selectors": "selectors",
  "css-selectors-3": "selectors",
  "css-ui-3": "css-ui",
  "css-writing-modes-3": "css-writing-modes",
  "css2": "CSS21",
  "css3-align": "css-align",
  "css3-animations": "css-animations",
  "css3-background": "css-backgrounds",
  "css3-box": "css-box",
  "css3-break": "css-break",
  "css3-color": "css-color",
  "css3-flexbox": "css-flexbox",
  "css3-fonts": "css-fonts",
  "css3-grid-layout": "css-grid",
  "css3-images": "css-images",
  "css3-mediaqueries": "mediaqueries",
  "css3-multicol": "css-multicol",
  "css3-namespace": "css-namespaces",
  "css3-page": "css-page",
  "css3-positioning": "css-position",
  "css3-regions": "css-regions",
  "css3-selectors": "selectors",
  "css3-speech": "css-speech",
  "css3-syntax": "css-syntax",
  "css3-text": "css-text",
  "css3-transforms": "css-transforms",
  "css3-transitions": "css-transitions",
  "css3-values": "css-values",
  "css3-writing-modes": "css-writing-modes",
  "feature-policy": "permissions-policy",
  "hr-time-2": "hr-time",
  "html-aam": "html-aam-1.0",
  "input-events-1": "input-events",
  "InputDeviceCapabilities": "input-device-capabilities",
  "IntersectionObserver": "intersection-observer",
  "mixedcontent": "mixed-content",
  "pointerevents2": "pointerevents",
  "powerfulfeatures": "secure-contexts",
  "resource-timing": "resource-timing-2",
  "resource-timing-1": "resource-timing",
  "selectors-3": "selectors",
  "selectors4": "selectors",
  "ServiceWorker": "service-workers",
  "wai-aria-1.1": "wai-aria-1.2",
  "wasm-core-1": "wasm-core",
  "webauthn-1": "webauthn",
  "webdriver": "webdriver2",
  "webdriver1": "webdriver2"
});
217
-
218
// Shortnames of documents that are not expected to appear in the crawl
// results. Links to these documents are not reported as links to unknown
// specs.
// TODO: check the link is non-normative (somehow)
const shortnameOfNonNormativeDocs = [
  "accept-encoding-range-test", "aria-practices", "Audio-EQ-Cookbook",
  "books",
  "capability-urls", "clreq", "css-2017", "css-print", "css3-marquee",
  "css3-preslev",
  "design-principles", "discovery-api", "dpub-latinreq", "dpub-pagination",
  "file-system-api", "fingerprinting-guidance",
  "html-design-principles",
  "ilreq", "installable-webapps",
  "jlreq",
  "klreq",
  "media-accessibility-reqs", "media-source-testcoverage", "motion-sensors",
  "predefined-counter-styles",
  "rdf11-primer",
  "security-privacy-questionnaire", "security-questionnaire",
  "sensor-polyfills", "sensors", "sniffly", "spatial-navigation",
  "ssml-sayas", "storage-partitioning", "streamproc",
  "touch-events-extensions", "typography",
  "using-aria",
  "wai-aria-implementation", "wai-aria-practices", "wai-aria-practices-1.1",
  "wai-aria-practices-1.2", "wai-aria-roadmap", "wake-lock-use-cases",
  "web-audio-perf", "web-intents", "webaudio-usecases", "webdatabase",
  "webrtc-interop-reports", "webrtc-nv-use-cases"
];
271
-
272
-
273
/**
 * Studies the links that specs of the ED crawl make to other specs, and
 * reports anomalies.
 *
 * @param {Array} edResults Results of the ED crawl. The function reads the
 *   `url`, `title`, `shortname`, `series`, `release`, `nightly`, `links`,
 *   `ids`, `headings` and `dfns` properties of each entry.
 * @param {Array} [trResults] Results of the TR crawl, only used to tell links
 *   to anchors that still exist in the TR version of the target spec (reported
 *   under `evolvingLinks`) from plain broken links. May be omitted or `null`.
 * @return {Object} A report indexed by spec URL. Each entry contains the
 *   `title` of the spec and one array of links per type of anomaly:
 *   `notExported`, `notDfn`, `brokenLinks`, `evolvingLinks`, `outdatedSpecs`,
 *   `unknownSpecs` and `datedUrls`. Specs without anomalies are not listed.
 */
function studyBackrefs(edResults, trResults = []) {
  // The default parameter value only covers `undefined`, callers may also
  // pass `null` explicitly.
  const trSpecs = trResults || [];
  const report = {};

  // The lookup tables get indexed with names extracted from arbitrary URLs.
  // Only consider own properties, so that a name such as "constructor" does
  // not match a member inherited from Object.prototype.
  const hasEntry = (table, key) =>
    Object.prototype.hasOwnProperty.call(table, key);

  function recordAnomaly(spec, anomalyType, link) {
    if (!report[spec.url]) {
      report[spec.url] = {
        title: spec.title,
        notExported: [],
        notDfn: [],
        brokenLinks: [],
        evolvingLinks: [],
        outdatedSpecs: [],
        unknownSpecs: [],
        datedUrls: []
      };
    }
    report[spec.url][anomalyType].push(link);
  }

  // Finds the spec with the given shortname, matching the exact spec
  // shortname if possible, or the series shortname otherwise (in which case
  // the current spec in the series is returned). Entries without series info
  // can only match on their own shortname.
  function findSpecByShortname(specs, shortname) {
    return specs.find(s => s.shortname === shortname) ||
      specs.find(s => s.series &&
        s.series.shortname === shortname &&
        s.series.currentSpecification === s.shortname);
  }

  edResults.forEach(spec => {
    Object.keys(spec.links || {})
      .filter(matchSpecUrl)
      .forEach(link => {
        // Drop the final page name and make sure that the link ends with "/"
        let nakedLink = link;
        if (nakedLink.endsWith(".html")) {
          nakedLink = nakedLink.replace(/\/[^/]*\.html$/, '/');
        }
        if (nakedLink[nakedLink.length - 1] !== '/') {
          nakedLink += '/';
        }

        // Detect links to dated specs
        const match = nakedLink.match(/www\.w3\.org\/TR\/[0-9]{4}\/[A-Z]+-(.+)-[0-9]{8}\//);
        if (match) {
          // ED should not link to dated versions of the spec, unless it
          // voluntarily links to previous versions of itself
          if (match[1] !== spec.shortname) {
            recordAnomaly(spec, "datedUrls", link);
          }

          // TODO: consider pursuing the analysis with the non-dated version,
          // but note this may trigger some obscure broken fragment messages
          // when a fragment exists in the dated version but no longer exists
          // in the ED.
          return;
        }

        // Check whether the naked link matches any known URL in the crawl
        // (`release`, `nightly` and `series` info may be missing)
        const knownSpec = edResults.find(r =>
          r.url === nakedLink ||
          (r.release && r.release.url === nakedLink) ||
          (r.nightly && r.nightly.url === nakedLink) ||
          (r.series && nakedLink === `https://www.w3.org/TR/${r.series.shortname}/`));
        let shortname = knownSpec && knownSpec.shortname;

        // If it does not match any known URL, try to compute a shortname out of
        // it directly.
        if (!shortname) {
          try {
            shortname = computeShortname(nakedLink);
          }
          catch (e) {
            // No name could be extracted from the link, which is precisely
            // what gets reported here
            recordAnomaly(spec, "unknownSpecs", link);
            return;
          }
        }

        if (hasEntry(shortNamesOfOutdatedSpecs, shortname)) {
          // The specification should no longer be referenced.
          // In theory, we could still try to match the anchor against the
          // right spec. In practice, these outdated specs are sufficiently
          // outdated that it does not make a lot of sense to do so.
          recordAnomaly(spec, "outdatedSpecs", link);
          return;
        }

        if (hasEntry(shortnameMap, shortname)) {
          // TODO: Consider reporting that as a "non ideal" link.
          shortname = shortnameMap[shortname];
        }

        // At this point, we managed to associate the link with a shortname,
        // let's check whether the shortname matches a spec in the crawl
        const sourceSpec = findSpecByShortname(edResults, shortname);
        if (!sourceSpec) {
          if (!shortnameOfNonNormativeDocs.includes(shortname)) {
            recordAnomaly(spec, "unknownSpecs", link);
          }
          return;
        }

        // Self-references might be broken because of ED vs TR, ignore that
        if (shortname === spec.shortname ||
            (spec.series && shortname === spec.series.shortname)) {
          return;
        }

        // Look for a corresponding entry in the TR crawl, which we'll use to
        // distinguish between broken links and "evolving" links (meaning links
        // that exist in the TR version but no longer exist in the ED)
        const trSourceSpec = findSpecByShortname(trSpecs, shortname) || {};
        const headings = sourceSpec.headings || [];
        const dfns = sourceSpec.dfns || [];
        const ids = sourceSpec.ids || [];
        const trIds = trSourceSpec.ids || [];

        // IDs of the ED crawl are compared as absolute URLs based on the
        // nightly URL of the spec (falling back to the URL of the spec when
        // there is no nightly info).
        const nightlyUrl = (sourceSpec.nightly || sourceSpec).url;

        // NOTE(review): IDs of the TR crawl are presumably absolute URLs as
        // well, based on the URL of the TR version - to be confirmed against
        // actual crawl results. Bare anchors are still accepted as before.
        const trBaseUrls = [
          trSourceSpec.url,
          trSourceSpec.release && trSourceSpec.release.url
        ].filter(Boolean);
        const existsInTr = anchor =>
          trIds.includes(anchor) ||
          trBaseUrls.some(base => trIds.includes(base + "#" + anchor));

        const isTrLink = !!link.match(/w3\.org\/TR\//);

        // Check anchors
        const anchors = spec.links[link] || [];
        for (const anchor of anchors) {
          const isKnownId = ids.includes(nightlyUrl + "#" + anchor);
          const heading = headings.find(h => h.id === anchor);
          const dfn = dfns.find(d => d.id === anchor);
          if (!isKnownId) {
            if (isTrLink && existsInTr(anchor)) {
              recordAnomaly(spec, "evolvingLinks", link + "#" + anchor);
            } else {
              recordAnomaly(spec, "brokenLinks", link + "#" + anchor);
            }
          } else if (!heading && !dfn) {
            recordAnomaly(spec, "notDfn", link + "#" + anchor);
          } else if (dfn && dfn.access !== "public") {
            recordAnomaly(spec, "notExported", link + "#" + anchor);
          }
        }
      });
  });
  return report;
}
407
-
408
-
409
/**
 * Loads the index files of the ED and TR crawls and expands them through
 * `expandCrawlResult`.
 *
 * @param {string} edCrawlResultsPath Path to the index file of the ED crawl
 * @param {string} trCrawlResultsPath Path to the index file of the TR crawl
 * @return {Promise<Object>} Promise to get an object with an `ed` and a `tr`
 *   property, set to the `results` of the respective expanded crawl
 * @throws {Error} When one of the index files cannot be read (the returned
 *   promise is rejected with the error)
 */
async function loadCrawlResults(edCrawlResultsPath, trCrawlResultsPath) {
  // Reads an index file, reporting failures through a proper Error (and not
  // a bare string) whose message tells which file could not be read
  function readIndex(indexPath) {
    try {
      return requireFromWorkingDirectory(indexPath);
    } catch (e) {
      throw new Error("Impossible to read " + indexPath + ": " + e);
    }
  }

  // The second argument passed to `expandCrawlResult` is the path without the
  // final "index.json"
  const dropIndexFile = indexPath => indexPath.replace(/index\.json$/, '');

  // Read both index files first, so that a missing file gets reported before
  // any expansion work starts
  const edIndex = readIndex(edCrawlResultsPath);
  const trIndex = readIndex(trCrawlResultsPath);

  const edCrawlResults = await expandCrawlResult(edIndex, dropIndexFile(edCrawlResultsPath));
  const trCrawlResults = await expandCrawlResult(trIndex, dropIndexFile(trCrawlResultsPath));

  return {
    ed: edCrawlResults.results,
    tr: trCrawlResults.results
  };
}
430
-
431
/**
 * Serializes an anomalies report and writes it to the console.
 *
 * Specs are sorted by title. Each spec gets rendered as a `<details>` element
 * that contains one list of links per type of anomaly, types of anomalies
 * without links being skipped.
 *
 * @param {Object} results The anomalies report, indexed by spec URL, each
 *   entry having a `title` and one array of links per type of anomaly
 */
function reportToConsole(results) {
  // Types of anomalies, in reporting order, along with the line that
  // introduces the list of links
  const sections = [
    ["brokenLinks", "Links to anchors that don't exist:\n"],
    ["evolvingLinks", "Links to anchors that no longer exist in the editor draft of the target spec:\n"],
    ["notDfn", "Links to anchors that are not definitions or headings:\n"],
    ["notExported", "Links to definitions that are not exported:\n"],
    ["datedUrls", "Links to dated TR URLs:\n"],
    ["outdatedSpecs", "Links to specs that should no longer be referenced:\n"],
    ["unknownSpecs", "Links to things that look like specs but that aren't recognized in reffy data:\n"]
  ];

  const byTitle = (url1, url2) =>
    results[url1].title.localeCompare(results[url2].title);

  const chunks = [];
  for (const specUrl of Object.keys(results).sort(byTitle)) {
    const entry = results[specUrl];
    chunks.push(`<details><summary><a href="${specUrl}">${entry.title}</a></summary>\n\n`);
    for (const [anomalyType, intro] of sections) {
      const links = entry[anomalyType];
      if (links.length) {
        chunks.push(intro);
        links.forEach(l => {
          chunks.push("* " + l + "\n");
        });
        chunks.push("\n\n");
      }
    }
    chunks.push("</details>\n");
  }
  console.log(chunks.join(""));
}
491
-
492
-
493
/**************************************************
Public interface of the module
**************************************************/
module.exports.studyBackrefs = studyBackrefs;


/**************************************************
Command-line entry point, only run when the file is
the main module of the process
**************************************************/
if (require.main === module) {
  const firstArg = process.argv[2];
  const secondArg = process.argv[3];

  if (!firstArg) {
    console.error('Backrefs analyzer must be called with a paths to crawl results as first parameter');
    process.exit(2);
  }

  // With a single argument, the argument is the root folder of a crawl, with
  // "ed" and "tr" subfolders. With two arguments, the first one targets the
  // ED crawl and the second one the TR crawl.
  const edFolder = secondArg ? firstArg : path.join(firstArg, 'ed');
  const trFolder = secondArg ? secondArg : path.join(firstArg, 'tr');

  // Paths may target the crawl folder or the index file in that folder
  const toIndexFile = crawlPath => crawlPath.endsWith('index.json') ?
    crawlPath :
    path.join(crawlPath, 'index.json');

  // Load the crawl results, study them, and output the report
  loadCrawlResults(toIndexFile(edFolder), toIndexFile(trFolder))
    .then(crawl => studyBackrefs(crawl.ed, crawl.tr))
    .then(reportToConsole)
    .catch(e => {
      console.error(e);
      process.exit(3);
    });
}