reffy 6.2.0 → 6.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/LICENSE +21 -21
  2. package/README.md +158 -158
  3. package/index.js +11 -11
  4. package/package.json +53 -53
  5. package/reffy.js +248 -248
  6. package/src/browserlib/canonicalize-url.mjs +50 -50
  7. package/src/browserlib/create-outline.mjs +352 -352
  8. package/src/browserlib/extract-cssdfn.mjs +319 -319
  9. package/src/browserlib/extract-dfns.mjs +686 -686
  10. package/src/browserlib/extract-elements.mjs +205 -205
  11. package/src/browserlib/extract-headings.mjs +48 -48
  12. package/src/browserlib/extract-ids.mjs +28 -28
  13. package/src/browserlib/extract-links.mjs +28 -28
  14. package/src/browserlib/extract-references.mjs +203 -203
  15. package/src/browserlib/extract-webidl.mjs +134 -134
  16. package/src/browserlib/get-absolute-url.mjs +21 -21
  17. package/src/browserlib/get-generator.mjs +26 -26
  18. package/src/browserlib/get-lastmodified-date.mjs +13 -13
  19. package/src/browserlib/get-title.mjs +11 -11
  20. package/src/browserlib/informative-selector.mjs +16 -16
  21. package/src/browserlib/map-ids-to-headings.mjs +136 -136
  22. package/src/browserlib/reffy.json +53 -53
  23. package/src/cli/check-missing-dfns.js +609 -609
  24. package/src/cli/generate-idlnames.js +430 -430
  25. package/src/cli/generate-idlparsed.js +139 -139
  26. package/src/cli/merge-crawl-results.js +128 -128
  27. package/src/cli/parse-webidl.js +430 -430
  28. package/src/lib/css-grammar-parse-tree.schema.json +109 -109
  29. package/src/lib/css-grammar-parser.js +440 -440
  30. package/src/lib/fetch.js +56 -56
  31. package/src/lib/nock-server.js +127 -120
  32. package/src/lib/specs-crawler.js +622 -603
  33. package/src/lib/util.js +943 -898
  34. package/src/specs/missing-css-rules.json +197 -197
  35. package/src/specs/spec-equivalents.json +149 -149
  36. package/src/browserlib/extract-editors.mjs~ +0 -14
  37. package/src/browserlib/generate-es-dfn-report.sh~ +0 -4
  38. package/src/cli/csstree-grammar-check.js +0 -28
  39. package/src/cli/csstree-grammar-check.js~ +0 -10
  40. package/src/cli/csstree-grammar-parser.js +0 -11
  41. package/src/cli/csstree-grammar-parser.js~ +0 -1
  42. package/src/cli/extract-editors.js~ +0 -38
  43. package/src/cli/process-specs.js~ +0 -28
package/reffy.js CHANGED
@@ -1,248 +1,248 @@
1
- #!/usr/bin/env node
2
- /**
3
- * The spec crawler takes a list of spec URLs as input, gathers some knowledge
4
- * about these specs (published versions, URL of the Editor's Draft, etc.),
5
- * fetches these specs, parses them, extracts relevant information that they
6
- * contain (such as the WebIDL they define, the list of specifications that they
7
- * reference, and links to external specs), and produces a crawl report with the
8
- * results of these investigations.
9
- *
10
- * Provided Reffy was installed as a global package, the spec crawler can be
11
- * called directly through:
12
- *
13
- * `reffy [options]`
14
- *
15
- * Use the `--help` option for usage instructions.
16
- *
17
- * If Reffy was not installed as a global package, call:
18
- *
19
- * `node reffy.js [options]`
20
- *
21
- * @module crawler
22
- */
23
-
24
- const commander = require('commander');
25
- const satisfies = require('semver/functions/satisfies');
26
- const specs = require('browser-specs');
27
- const { version, engines } = require('./package.json');
28
- const { requireFromWorkingDirectory } = require('./src/lib/util');
29
- const { crawlSpecs } = require('./src/lib/specs-crawler');
30
-
31
- // Warn if version of Node.js does not satisfy requirements
32
- if (engines && engines.node && !satisfies(process.version, engines.node)) {
33
- console.warn(`
34
- [WARNING] Node.js ${process.version} detected but Reffy needs Node.js ${engines.node}.
35
- Please consider upgrading Node.js if the program crashes!`);
36
- }
37
-
38
-
39
- function parseModuleOption(input) {
40
- const parts = input.split(':');
41
- if (parts.length > 2) {
42
- console.error('Module input cannot have more than one ":" character');
43
- process.exit(2);
44
- }
45
- if (parts.length === 2) {
46
- return {
47
- href: parts[1],
48
- property: parts[0]
49
- };
50
- }
51
- else {
52
- return parts[0];
53
- }
54
- }
55
-
56
- function parseSpecOption(input) {
57
- if (input === 'all') {
58
- return specs.map(s => s.shortname);
59
- }
60
- else {
61
- const list = requireFromWorkingDirectory(input);
62
- return list ?? input;
63
- }
64
- }
65
-
66
-
67
- const program = new commander.Command();
68
- program
69
- .version(version)
70
- .usage('[options]')
71
- .description('Crawls and processes a list of Web specifications')
72
- .option('-d, --debug', 'debug mode, crawl one spec at a time')
73
- .option('-f, --fallback <json>', 'fallback data to use when a spec crawl fails')
74
- .option('-m, --module <modules...>', 'spec processing modules')
75
- .option('-o, --output <folder>', 'existing folder/file where crawl results are to be saved')
76
- .option('-q, --quiet', 'do not report progress and other warnings to the console')
77
- .option('-r, --release', 'crawl release (TR) version of specs')
78
- .option('-s, --spec <specs...>', 'specs to crawl')
79
- .option('-t, --terse', 'output crawl results without metadata')
80
- .action(options => {
81
- if (!(options.output || options.module || options.spec)) {
82
- console.error(`
83
- At least one of the --output, --module or --spec options needs to be specified.
84
- For usage notes, run:
85
- reffy --help
86
-
87
- If you really want to crawl all specs, run all processing modules and report the
88
- JSON outcome to the console, you may run the following command but note that it
89
- will dump ~100MB of data to the console:
90
- reffy --spec all
91
- `);
92
- process.exit(2);
93
- }
94
- const crawlOptions = {
95
- debug: options.debug,
96
- fallback: options.fallback,
97
- output: options.output,
98
- publishedVersion: options.release,
99
- quiet: options.quiet,
100
- terse: options.terse
101
- };
102
- if (options.module) {
103
- crawlOptions.modules = options.module.map(parseModuleOption);
104
- }
105
- if (options.spec) {
106
- crawlOptions.specs = options.spec.map(parseSpecOption).flat();
107
- }
108
-
109
- if (crawlOptions.terse && crawlOptions.output) {
110
- console.error('The --terse option cannot be combined with the --output option');
111
- process.exit(2);
112
- }
113
- if (crawlOptions.terse && (!crawlOptions.modules || crawlOptions.modules.length === 0 || crawlOptions.modules.length > 1)) {
114
- console.error('The --terse option can be only be set when only one core processing module runs');
115
- process.exit(2);
116
- }
117
- crawlSpecs(crawlOptions)
118
- .then(_ => {
119
- process.exit(0);
120
- })
121
- .catch(err => {
122
- console.error(err);
123
- process.exit(1);
124
- });
125
- })
126
- .showHelpAfterError('(run with --help for usage information)')
127
- .addHelpText('after', `
128
- Minimal usage example:
129
- To crawl all known specs, run all processing modules, and save generated
130
- extracts to the current folder, run:
131
- $ reffy -o .
132
-
133
- Description:
134
- Crawls a set of specifications and runs processing modules against each of
135
- them to generate extracts.
136
-
137
- Crawl results are written to the console as a serialized JSON array with one
138
- entry per spec by default. The order of the specs in the array matches the
139
- order of the specs provided as input (or the order of the specs in
140
- browser-specs if no explicit spec was provided).
141
-
142
- Resulting array may be large. Crawling all specs with core processing module
143
- produces ~100MB of serialized JSON for instance. To avoid janking the console
144
- or running into possible memory issues, setting the --output option is
145
- strongly recommended.
146
-
147
- Usage notes for some of the options:
148
- -f, --fallback <jsondata>
149
- Provides an existing JSON crawl data file to use as a source of fallback data
150
- for specs that fail to be crawled.
151
-
152
- The fallback data gets copied as-is. It is the responsibility of the caller
153
- to make sure that extracts it may link to actually exist and match the ones
154
- that the crawl would produce in the absence of errors (e.g. same modules).
155
-
156
- The "error" property is set on specs for which fallback data was used.
157
-
158
- -m, --module <modules...>
159
- If processing modules are not specified, the crawler runs all core processing
160
- modules defined in:
161
- https://github.com/w3c/reffy/blob/main/src/browserlib/reffy.json
162
-
163
- Modules must be specified using a relative path to an ".mjs" file that defines
164
- the processing logic to run on the spec's page in a browser context. For
165
- instance:
166
- $ reffy reports/test --module extract-editors.mjs
167
-
168
- Absolute paths to modules are not properly handled and will likely result in a
169
- crawling error.
170
-
171
- Multiple modules can be specified, repeating the option name or not:
172
- $ reffy reports/test -m extract-words.mjs extract-editors.mjs
173
- $ reffy reports/test -m extract-words.mjs -m extract-editors.mjs
174
-
175
- The option cannot appear before <folder>, unless you use "--" to flag the end
176
- of the list:
177
- $ reffy --module extract-editors.mjs -- reports/test
178
-
179
- Core processing modules may be referenced using the name of the extract folder
180
- or property that they would create:
181
- $ reffy reports/test --module dfns
182
-
183
- To run all core processing modules, use "core". For instance, to apply a
184
- processing module on top of core processing modules, use:
185
- $ reffy reports/test --module core extract-editors.mjs
186
-
187
- Each module must export a function that takes a spec object as input and
188
- return a result that can be serialized as JSON. A typical module code looks
189
- like:
190
- https://github.com/w3c/reffy/blob/main/src/browserlib/extract-ids.mjs
191
-
192
- Individual extracts will be created under "<folder>/[camelCaseModule]" where
193
- "[camelCaseModule]" is derived from the module's filename. For instance:
194
- "extract-editors.mjs" creates extracts under "<folder>/extractEditors"
195
-
196
- The name of the folder where extracts get created may be specified for custom
197
- modules by prefixing the path to the module with the folder name followed by
198
- ":". For instance, to save extracts to "reports/test/editors", use:
199
- $ reffy reports/test --module editors:extract-editors.mjs
200
-
201
- -o, --output <folder>
202
- By default, crawl results are written to the console as a serialized JSON
203
- array with one entry per spec, and module processing results attached as
204
- property values in each of these entries.
205
-
206
- If an output <folder> is specified, crawl results are rather saved to that
207
- folder, with module processing results created under subfolders (see the
208
- --module option) and linked from an index.json file created under <folder>.
209
-
210
- Additionally, if an output <folder> is specified and if the IDL processing
211
- module is run, the crawler will also creates an index of IDL names named
212
- "idlnames.json" that links to relevant extracts in subfolders.
213
-
214
- The folder targeted by <folder> must exist.
215
-
216
- -r, --release
217
- If the flag is not set, the crawler defaults to crawl nightly versions of the
218
- specs.
219
-
220
- -s, --spec <specs...>
221
- If specs to crawl are not specified, all specs in browser-specs get crawled:
222
- https://github.com/w3c/browser-specs/
223
-
224
- Valid spec values may be a shortname, a URL, or a relative path to a file that
225
- contains a list of spec URLs and/or shortnames. All shortnames must exist in
226
- browser-specs. Shortname may be the shortname of the spec series, in which
227
- case the spec identified as the current specification in the series is used.
228
- For instance, as of September 2021, "pointerlock" will map to "pointerlock-2"
229
- because Pointer Lock 2.0 is the current level in the series.
230
-
231
- Use "all" to include all specs in browser-specs in the crawl. For instance, to
232
- crawl all specs plus one custom spec that does not exist in browser-specs:
233
- $ reffy reports/test -s all https://example.org/myspec
234
-
235
- -t, --terse
236
- This flag cannot be combined with the --output option and cannot be set if
237
- more than one processing module gets run. When set, the crawler writes the
238
- processing module results to the console directly without wrapping them with
239
- spec metadata. In other words, the spec entry in the crawl results directly
240
- contains the outcome of the processing module when the flag is set.
241
-
242
- Additionally, if crawl runs on a single specification, the array is omitted
243
- and the processing module results are thus written to the console directly.
244
- For instance:
245
- $ reffy --spec fetch --module idl --terse
246
- `);
247
-
248
- program.parse(process.argv);
1
+ #!/usr/bin/env node
2
+ /**
3
+ * The spec crawler takes a list of spec URLs as input, gathers some knowledge
4
+ * about these specs (published versions, URL of the Editor's Draft, etc.),
5
+ * fetches these specs, parses them, extracts relevant information that they
6
+ * contain (such as the WebIDL they define, the list of specifications that they
7
+ * reference, and links to external specs), and produces a crawl report with the
8
+ * results of these investigations.
9
+ *
10
+ * Provided Reffy was installed as a global package, the spec crawler can be
11
+ * called directly through:
12
+ *
13
+ * `reffy [options]`
14
+ *
15
+ * Use the `--help` option for usage instructions.
16
+ *
17
+ * If Reffy was not installed as a global package, call:
18
+ *
19
+ * `node reffy.js [options]`
20
+ *
21
+ * @module crawler
22
+ */
23
+
24
+ const commander = require('commander');
25
+ const satisfies = require('semver/functions/satisfies');
26
+ const specs = require('web-specs');
27
+ const { version, engines } = require('./package.json');
28
+ const { requireFromWorkingDirectory } = require('./src/lib/util');
29
+ const { crawlSpecs } = require('./src/lib/specs-crawler');
30
+
31
+ // Warn if version of Node.js does not satisfy requirements
32
+ if (engines && engines.node && !satisfies(process.version, engines.node)) {
33
+ console.warn(`
34
+ [WARNING] Node.js ${process.version} detected but Reffy needs Node.js ${engines.node}.
35
+ Please consider upgrading Node.js if the program crashes!`);
36
+ }
37
+
38
+
39
+ function parseModuleOption(input) {
40
+ const parts = input.split(':');
41
+ if (parts.length > 2) {
42
+ console.error('Module input cannot have more than one ":" character');
43
+ process.exit(2);
44
+ }
45
+ if (parts.length === 2) {
46
+ return {
47
+ href: parts[1],
48
+ property: parts[0]
49
+ };
50
+ }
51
+ else {
52
+ return parts[0];
53
+ }
54
+ }
55
+
56
+ function parseSpecOption(input) {
57
+ if (input === 'all') {
58
+ return specs.map(s => s.shortname);
59
+ }
60
+ else {
61
+ const list = requireFromWorkingDirectory(input);
62
+ return list ?? input;
63
+ }
64
+ }
65
+
66
+
67
+ const program = new commander.Command();
68
+ program
69
+ .version(version)
70
+ .usage('[options]')
71
+ .description('Crawls and processes a list of Web specifications')
72
+ .option('-d, --debug', 'debug mode, crawl one spec at a time')
73
+ .option('-f, --fallback <json>', 'fallback data to use when a spec crawl fails')
74
+ .option('-m, --module <modules...>', 'spec processing modules')
75
+ .option('-o, --output <folder>', 'existing folder/file where crawl results are to be saved')
76
+ .option('-q, --quiet', 'do not report progress and other warnings to the console')
77
+ .option('-r, --release', 'crawl release (TR) version of specs')
78
+ .option('-s, --spec <specs...>', 'specs to crawl')
79
+ .option('-t, --terse', 'output crawl results without metadata')
80
+ .action(options => {
81
+ if (!(options.output || options.module || options.spec)) {
82
+ console.error(`
83
+ At least one of the --output, --module or --spec options needs to be specified.
84
+ For usage notes, run:
85
+ reffy --help
86
+
87
+ If you really want to crawl all specs, run all processing modules and report the
88
+ JSON outcome to the console, you may run the following command but note that it
89
+ will dump ~100MB of data to the console:
90
+ reffy --spec all
91
+ `);
92
+ process.exit(2);
93
+ }
94
+ const crawlOptions = {
95
+ debug: options.debug,
96
+ fallback: options.fallback,
97
+ output: options.output,
98
+ publishedVersion: options.release,
99
+ quiet: options.quiet,
100
+ terse: options.terse
101
+ };
102
+ if (options.module) {
103
+ crawlOptions.modules = options.module.map(parseModuleOption);
104
+ }
105
+ if (options.spec) {
106
+ crawlOptions.specs = options.spec.map(parseSpecOption).flat();
107
+ }
108
+
109
+ if (crawlOptions.terse && crawlOptions.output) {
110
+ console.error('The --terse option cannot be combined with the --output option');
111
+ process.exit(2);
112
+ }
113
+ if (crawlOptions.terse && (!crawlOptions.modules || crawlOptions.modules.length === 0 || crawlOptions.modules.length > 1)) {
114
+ console.error('The --terse option can be only be set when only one core processing module runs');
115
+ process.exit(2);
116
+ }
117
+ crawlSpecs(crawlOptions)
118
+ .then(_ => {
119
+ process.exit(0);
120
+ })
121
+ .catch(err => {
122
+ console.error(err);
123
+ process.exit(1);
124
+ });
125
+ })
126
+ .showHelpAfterError('(run with --help for usage information)')
127
+ .addHelpText('after', `
128
+ Minimal usage example:
129
+ To crawl all known specs, run all processing modules, and save generated
130
+ extracts to the current folder, run:
131
+ $ reffy -o .
132
+
133
+ Description:
134
+ Crawls a set of specifications and runs processing modules against each of
135
+ them to generate extracts.
136
+
137
+ Crawl results are written to the console as a serialized JSON array with one
138
+ entry per spec by default. The order of the specs in the array matches the
139
+ order of the specs provided as input (or the order of the specs in
140
+ browser-specs if no explicit spec was provided).
141
+
142
+ Resulting array may be large. Crawling all specs with core processing module
143
+ produces ~100MB of serialized JSON for instance. To avoid janking the console
144
+ or running into possible memory issues, setting the --output option is
145
+ strongly recommended.
146
+
147
+ Usage notes for some of the options:
148
+ -f, --fallback <jsondata>
149
+ Provides an existing JSON crawl data file to use as a source of fallback data
150
+ for specs that fail to be crawled.
151
+
152
+ The fallback data gets copied as-is. It is the responsibility of the caller
153
+ to make sure that extracts it may link to actually exist and match the ones
154
+ that the crawl would produce in the absence of errors (e.g. same modules).
155
+
156
+ The "error" property is set on specs for which fallback data was used.
157
+
158
+ -m, --module <modules...>
159
+ If processing modules are not specified, the crawler runs all core processing
160
+ modules defined in:
161
+ https://github.com/w3c/reffy/blob/main/src/browserlib/reffy.json
162
+
163
+ Modules must be specified using a relative path to an ".mjs" file that defines
164
+ the processing logic to run on the spec's page in a browser context. For
165
+ instance:
166
+ $ reffy reports/test --module extract-editors.mjs
167
+
168
+ Absolute paths to modules are not properly handled and will likely result in a
169
+ crawling error.
170
+
171
+ Multiple modules can be specified, repeating the option name or not:
172
+ $ reffy reports/test -m extract-words.mjs extract-editors.mjs
173
+ $ reffy reports/test -m extract-words.mjs -m extract-editors.mjs
174
+
175
+ The option cannot appear before <folder>, unless you use "--" to flag the end
176
+ of the list:
177
+ $ reffy --module extract-editors.mjs -- reports/test
178
+
179
+ Core processing modules may be referenced using the name of the extract folder
180
+ or property that they would create:
181
+ $ reffy reports/test --module dfns
182
+
183
+ To run all core processing modules, use "core". For instance, to apply a
184
+ processing module on top of core processing modules, use:
185
+ $ reffy reports/test --module core extract-editors.mjs
186
+
187
+ Each module must export a function that takes a spec object as input and
188
+ return a result that can be serialized as JSON. A typical module code looks
189
+ like:
190
+ https://github.com/w3c/reffy/blob/main/src/browserlib/extract-ids.mjs
191
+
192
+ Individual extracts will be created under "<folder>/[camelCaseModule]" where
193
+ "[camelCaseModule]" is derived from the module's filename. For instance:
194
+ "extract-editors.mjs" creates extracts under "<folder>/extractEditors"
195
+
196
+ The name of the folder where extracts get created may be specified for custom
197
+ modules by prefixing the path to the module with the folder name followed by
198
+ ":". For instance, to save extracts to "reports/test/editors", use:
199
+ $ reffy reports/test --module editors:extract-editors.mjs
200
+
201
+ -o, --output <folder>
202
+ By default, crawl results are written to the console as a serialized JSON
203
+ array with one entry per spec, and module processing results attached as
204
+ property values in each of these entries.
205
+
206
+ If an output <folder> is specified, crawl results are rather saved to that
207
+ folder, with module processing results created under subfolders (see the
208
+ --module option) and linked from an index.json file created under <folder>.
209
+
210
+ Additionally, if an output <folder> is specified and if the IDL processing
211
+ module is run, the crawler will also creates an index of IDL names named
212
+ "idlnames.json" that links to relevant extracts in subfolders.
213
+
214
+ The folder targeted by <folder> must exist.
215
+
216
+ -r, --release
217
+ If the flag is not set, the crawler defaults to crawl nightly versions of the
218
+ specs.
219
+
220
+ -s, --spec <specs...>
221
+ If specs to crawl are not specified, all specs in browser-specs get crawled:
222
+ https://github.com/w3c/browser-specs/
223
+
224
+ Valid spec values may be a shortname, a URL, or a relative path to a file that
225
+ contains a list of spec URLs and/or shortnames. All shortnames must exist in
226
+ browser-specs. Shortname may be the shortname of the spec series, in which
227
+ case the spec identified as the current specification in the series is used.
228
+ For instance, as of September 2021, "pointerlock" will map to "pointerlock-2"
229
+ because Pointer Lock 2.0 is the current level in the series.
230
+
231
+ Use "all" to include all specs in browser-specs in the crawl. For instance, to
232
+ crawl all specs plus one custom spec that does not exist in browser-specs:
233
+ $ reffy reports/test -s all https://example.org/myspec
234
+
235
+ -t, --terse
236
+ This flag cannot be combined with the --output option and cannot be set if
237
+ more than one processing module gets run. When set, the crawler writes the
238
+ processing module results to the console directly without wrapping them with
239
+ spec metadata. In other words, the spec entry in the crawl results directly
240
+ contains the outcome of the processing module when the flag is set.
241
+
242
+ Additionally, if crawl runs on a single specification, the array is omitted
243
+ and the processing module results are thus written to the console directly.
244
+ For instance:
245
+ $ reffy --spec fetch --module idl --terse
246
+ `);
247
+
248
+ program.parse(process.argv);
package/src/browserlib/canonicalize-url.mjs CHANGED
@@ -1,51 +1,51 @@
1
- /**
2
- * Return a canonicalized version of the given URL.
3
- *
4
- * By default, the canonicalized URL should represent the same resource and
5
- * typically de-reference to the same document (or a subpage of it).
6
- *
7
- * Canonicalization can be made a bit stronger through options, in particular
8
- * to canonicalize dated W3C URLs to the Latest version, and to use a list of
9
- * equivalent URLs (that the crawler typically generates).
10
- */
11
- export function canonicalizeUrl(url, options) {
12
- options = options || {};
13
-
14
- let canon = url.replace(/^http:/, 'https:')
15
- .split('#')[0]
16
- .replace('index.html', '')
17
- .replace('Overview.html', '')
18
- .replace('cover.html', '')
19
- .replace(/spec.whatwg.org\/.*/, 'spec.whatwg.org/') // subpage to main document in whatwg
20
- .replace(/w3.org\/TR\/(([^\/]+\/)+)[^\/]+\.[^\/]+$/, 'w3.org/TR/$1') // subpage to main document in w3c
21
- .replace(/w3.org\/TR\/([^\/]+)$/, 'w3.org/TR/$1/') // enforce trailing slash
22
- .replace(/w3c.github.io\/([^\/]+)$/, 'w3c.github.io/$1/') // enforce trailing slash for ED on GitHub
23
- ;
24
-
25
- if (options.datedToLatest) {
26
- canon = canon.replace(
27
- /w3.org\/TR\/[0-9]{4}\/[A-Z]+-(.*)-[0-9]{8}\/?/,
28
- 'w3.org/TR/$1/');
29
- }
30
-
31
- let equivalentUrls = (options.equivalents) ? options.equivalents[canon] : null;
32
- if (Array.isArray(equivalentUrls)) {
33
- return (options.returnAlternatives ? equivalentUrls : equivalentUrls[0]);
34
- }
35
- else {
36
- return (equivalentUrls ? equivalentUrls : canon);
37
- }
38
- }
39
-
40
-
41
- export function canonicalizesTo(url, refUrl, options) {
42
- let newOptions = {
43
- datedToLatest: (options ? options.datedToLatest : false),
44
- equivalents: (options ? options.equivalents : null),
45
- returnAlternatives: true
46
- };
47
- let canon = canonicalizeUrl(url, newOptions);
48
- return Array.isArray(refUrl) ?
49
- refUrl.some(u => canon.includes(u)) :
50
- canon.includes(refUrl);
1
+ /**
2
+ * Return a canonicalized version of the given URL.
3
+ *
4
+ * By default, the canonicalized URL should represent the same resource and
5
+ * typically de-reference to the same document (or a subpage of it).
6
+ *
7
+ * Canonicalization can be made a bit stronger through options, in particular
8
+ * to canonicalize dated W3C URLs to the Latest version, and to use a list of
9
+ * equivalent URLs (that the crawler typically generates).
10
+ */
11
+ export function canonicalizeUrl(url, options) {
12
+ options = options || {};
13
+
14
+ let canon = url.replace(/^http:/, 'https:')
15
+ .split('#')[0]
16
+ .replace('index.html', '')
17
+ .replace('Overview.html', '')
18
+ .replace('cover.html', '')
19
+ .replace(/spec.whatwg.org\/.*/, 'spec.whatwg.org/') // subpage to main document in whatwg
20
+ .replace(/w3.org\/TR\/(([^\/]+\/)+)[^\/]+\.[^\/]+$/, 'w3.org/TR/$1') // subpage to main document in w3c
21
+ .replace(/w3.org\/TR\/([^\/]+)$/, 'w3.org/TR/$1/') // enforce trailing slash
22
+ .replace(/w3c.github.io\/([^\/]+)$/, 'w3c.github.io/$1/') // enforce trailing slash for ED on GitHub
23
+ ;
24
+
25
+ if (options.datedToLatest) {
26
+ canon = canon.replace(
27
+ /w3.org\/TR\/[0-9]{4}\/[A-Z]+-(.*)-[0-9]{8}\/?/,
28
+ 'w3.org/TR/$1/');
29
+ }
30
+
31
+ let equivalentUrls = (options.equivalents) ? options.equivalents[canon] : null;
32
+ if (Array.isArray(equivalentUrls)) {
33
+ return (options.returnAlternatives ? equivalentUrls : equivalentUrls[0]);
34
+ }
35
+ else {
36
+ return (equivalentUrls ? equivalentUrls : canon);
37
+ }
38
+ }
39
+
40
+
41
+ export function canonicalizesTo(url, refUrl, options) {
42
+ let newOptions = {
43
+ datedToLatest: (options ? options.datedToLatest : false),
44
+ equivalents: (options ? options.equivalents : null),
45
+ returnAlternatives: true
46
+ };
47
+ let canon = canonicalizeUrl(url, newOptions);
48
+ return Array.isArray(refUrl) ?
49
+ refUrl.some(u => canon.includes(u)) :
50
+ canon.includes(refUrl);
51
51
  }