reffy 6.2.2 → 6.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -5
- package/src/lib/fetch.js +2 -2
- package/src/lib/nock-server.js +9 -2
- package/src/lib/specs-crawler.js +21 -4
- package/src/lib/util.js +55 -11
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "reffy",
|
|
3
|
-
"version": "6.2.2",
|
|
3
|
+
"version": "6.3.0",
|
|
4
4
|
"description": "W3C/WHATWG spec dependencies exploration companion. Features a short set of tools to study spec references as well as WebIDL term definitions and references found in W3C specifications.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"abortcontroller-polyfill": "1.7.3",
|
|
35
35
|
"browser-specs": "2.27.0",
|
|
36
36
|
"commander": "9.0.0",
|
|
37
|
-
"fetch-filecache-for-crawling": "4.0
|
|
37
|
+
"fetch-filecache-for-crawling": "4.1.0",
|
|
38
38
|
"puppeteer": "13.1.3",
|
|
39
39
|
"semver": "^7.3.5",
|
|
40
40
|
"webidl2": "24.2.0"
|
|
@@ -42,10 +42,10 @@
|
|
|
42
42
|
"devDependencies": {
|
|
43
43
|
"chai": "4.3.6",
|
|
44
44
|
"mocha": "9.2.0",
|
|
45
|
-
"nock": "13.2.
|
|
46
|
-
"respec": "29.0.
|
|
45
|
+
"nock": "13.2.4",
|
|
46
|
+
"respec": "29.0.6",
|
|
47
47
|
"respec-hljs": "2.1.1",
|
|
48
|
-
"rollup": "2.67.
|
|
48
|
+
"rollup": "2.67.1"
|
|
49
49
|
},
|
|
50
50
|
"scripts": {
|
|
51
51
|
"test": "mocha --recursive tests/"
|
package/src/lib/fetch.js
CHANGED
|
@@ -33,7 +33,7 @@ catch (err) {
|
|
|
33
33
|
* @return {Promise(Response)} Promise to get an HTTP response
|
|
34
34
|
*/
|
|
35
35
|
async function fetch(url, options) {
|
|
36
|
-
options = Object.assign({}, options);
|
|
36
|
+
options = Object.assign({headers: {}}, options);
|
|
37
37
|
['cacheFolder', 'resetCache', 'cacheRefresh', 'logToConsole'].forEach(param => {
|
|
38
38
|
let fetchParam = (param === 'cacheRefresh') ? 'refresh' : param;
|
|
39
39
|
if (config[param] && !options.hasOwnProperty(fetchParam)) {
|
|
@@ -53,4 +53,4 @@ async function fetch(url, options) {
|
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
|
|
56
|
-
module.exports = fetch;
|
|
56
|
+
module.exports = fetch;
|
package/src/lib/nock-server.js
CHANGED
|
@@ -98,7 +98,14 @@ nock("https://www.w3.org")
|
|
|
98
98
|
.get("/Tools/respec/respec-w3c").replyWithFile(200,
|
|
99
99
|
path.join(modulesFolder, "respec", "builds", "respec-w3c.js"),
|
|
100
100
|
{ "Content-Type": "application/js" })
|
|
101
|
-
.get("/TR/idontexist/").reply(404, '')
|
|
101
|
+
.get("/TR/idontexist/").reply(404, '')
|
|
102
|
+
.get("/TR/ididnotchange/").reply(function() {
|
|
103
|
+
if (this.req.headers['if-modified-since'][0] === "Fri, 11 Feb 2022 00:00:42 GMT") {
|
|
104
|
+
return [304, ''];
|
|
105
|
+
} else {
|
|
106
|
+
return [200, 'Unexpected path'];
|
|
107
|
+
}
|
|
108
|
+
});
|
|
102
109
|
|
|
103
110
|
nock("https://drafts.csswg.org")
|
|
104
111
|
.persist()
|
|
@@ -117,4 +124,4 @@ nock.emitter.on('no match', function(req, options, requestBody) {
|
|
|
117
124
|
}
|
|
118
125
|
});
|
|
119
126
|
|
|
120
|
-
module.exports = nock;
|
|
127
|
+
module.exports = nock;
|
package/src/lib/specs-crawler.js
CHANGED
|
@@ -29,6 +29,7 @@ const {
|
|
|
29
29
|
createFolderIfNeeded
|
|
30
30
|
} = require('./util');
|
|
31
31
|
|
|
32
|
+
const {version: reffyVersion} = require('../../package.json');
|
|
32
33
|
|
|
33
34
|
/**
|
|
34
35
|
* Return the spec if crawl succeeded or crawl result from given fallback list
|
|
@@ -78,10 +79,15 @@ async function crawlSpec(spec, crawlOptions) {
|
|
|
78
79
|
path.dirname(crawlOptions.fallback) : '';
|
|
79
80
|
|
|
80
81
|
if (spec.error) {
|
|
81
|
-
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
|
|
82
|
+
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
|
|
82
83
|
}
|
|
83
84
|
|
|
84
85
|
try {
|
|
86
|
+
const fallback = crawlOptions.fallbackData?.results?.find(s => s.url === spec.url);
|
|
87
|
+
let cacheInfo = {};
|
|
88
|
+
if (crawlOptions.fallbackData?.crawler === `reffy-${reffyVersion}`) {
|
|
89
|
+
cacheInfo = Object.assign({}, fallback?.crawlCacheInfo);
|
|
90
|
+
}
|
|
85
91
|
const result = await processSpecification(
|
|
86
92
|
spec.crawled,
|
|
87
93
|
(spec, modules) => {
|
|
@@ -97,8 +103,14 @@ async function crawlSpec(spec, crawlOptions) {
|
|
|
97
103
|
},
|
|
98
104
|
[spec, crawlOptions.modules],
|
|
99
105
|
{ quiet: crawlOptions.quiet,
|
|
100
|
-
forceLocalFetch: crawlOptions.forceLocalFetch
|
|
106
|
+
forceLocalFetch: crawlOptions.forceLocalFetch,
|
|
107
|
+
...cacheInfo}
|
|
101
108
|
);
|
|
109
|
+
if (result.status === "notmodified" && fallback) {
|
|
110
|
+
crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
|
|
111
|
+
const copy = Object.assign({}, fallback);
|
|
112
|
+
return expandSpecResult(copy, fallbackFolder);
|
|
113
|
+
}
|
|
102
114
|
|
|
103
115
|
// Specific rule for IDL extracts:
|
|
104
116
|
// parse the extracted WebIdl content
|
|
@@ -169,6 +181,9 @@ async function crawlSpec(spec, crawlOptions) {
|
|
|
169
181
|
|
|
170
182
|
// Copy results back into initial spec object
|
|
171
183
|
spec.crawled = result.crawled;
|
|
184
|
+
if (result.crawlCacheInfo) {
|
|
185
|
+
spec.crawlCacheInfo = result.crawlCacheInfo;
|
|
186
|
+
}
|
|
172
187
|
crawlOptions.modules.forEach(mod => {
|
|
173
188
|
if (result[mod.property]) {
|
|
174
189
|
spec[mod.property] = result[mod.property];
|
|
@@ -183,7 +198,7 @@ async function crawlSpec(spec, crawlOptions) {
|
|
|
183
198
|
spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
|
|
184
199
|
}
|
|
185
200
|
|
|
186
|
-
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
|
|
201
|
+
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
|
|
187
202
|
}
|
|
188
203
|
|
|
189
204
|
|
|
@@ -351,7 +366,7 @@ async function crawlList(speclist, crawlOptions) {
|
|
|
351
366
|
// Load fallback data if necessary
|
|
352
367
|
if (crawlOptions.fallback) {
|
|
353
368
|
try {
|
|
354
|
-
crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback))
|
|
369
|
+
crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback));
|
|
355
370
|
} catch (e) {
|
|
356
371
|
throw new Error(`Could not parse fallback data file ${crawlOptions.fallback}`);
|
|
357
372
|
}
|
|
@@ -469,12 +484,14 @@ async function saveResults(data, settings) {
|
|
|
469
484
|
|
|
470
485
|
// Save all results to an index.json file
|
|
471
486
|
const indexFilename = path.join(settings.output, 'index.json');
|
|
487
|
+
|
|
472
488
|
const contents = {
|
|
473
489
|
type: 'crawl',
|
|
474
490
|
title: 'Reffy crawl',
|
|
475
491
|
date: (new Date()).toJSON(),
|
|
476
492
|
options: settings,
|
|
477
493
|
stats: {},
|
|
494
|
+
crawler: `reffy-${reffyVersion}`,
|
|
478
495
|
results: data
|
|
479
496
|
};
|
|
480
497
|
contents.options.modules = contents.options.modules.map(mod => mod.property);
|
package/src/lib/util.js
CHANGED
|
@@ -13,7 +13,6 @@ const specEquivalents = require('../specs/spec-equivalents.json');
|
|
|
13
13
|
|
|
14
14
|
const reffyModules = require('../browserlib/reffy.json');
|
|
15
15
|
|
|
16
|
-
|
|
17
16
|
/**
|
|
18
17
|
* Maximum depth difference supported between Reffy's install path and custom
|
|
19
18
|
* modules that may be provided on the command-line
|
|
@@ -22,6 +21,7 @@ const reffyModules = require('../browserlib/reffy.json');
|
|
|
22
21
|
*/
|
|
23
22
|
const maxPathDepth = 20;
|
|
24
23
|
|
|
24
|
+
let prefetchedResponses = {};
|
|
25
25
|
|
|
26
26
|
/**
|
|
27
27
|
* Returns a range array from 0 to the number provided (not included)
|
|
@@ -325,7 +325,8 @@ async function teardownBrowser() {
|
|
|
325
325
|
* flag tells the function that all network requests need to be only handled
|
|
326
326
|
* by Node.js's "fetch" function (as opposed to falling back to Puppeteer's
|
|
327
327
|
* network and caching logic), which is useful to keep full control of network
|
|
328
|
-
* requests in tests.
|
|
328
|
+
* requests in tests. The "etag" and "lastModified" options give input
|
|
329
|
+
* to the conditional fetch request sent for the primary crawled URL
|
|
329
330
|
* @return {Promise} The promise to get the results of the processing function
|
|
330
331
|
*/
|
|
331
332
|
async function processSpecification(spec, processFunction, args, options) {
|
|
@@ -409,17 +410,18 @@ async function processSpecification(spec, processFunction, args, options) {
|
|
|
409
410
|
await cdp.send('Fetch.continueRequest', { requestId });
|
|
410
411
|
return;
|
|
411
412
|
}
|
|
413
|
+
const response = prefetchedResponses[request.url] ?? await fetch(request.url, { signal: controller.signal, headers: request.headers });
|
|
412
414
|
|
|
413
|
-
const response = await fetch(request.url, { signal: controller.signal });
|
|
414
415
|
const body = await response.buffer();
|
|
416
|
+
|
|
415
417
|
await cdp.send('Fetch.fulfillRequest', {
|
|
416
418
|
requestId,
|
|
417
419
|
responseCode: response.status,
|
|
418
420
|
responseHeaders: Object.keys(response.headers.raw()).map(header => {
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
421
|
+
return {
|
|
422
|
+
name: header,
|
|
423
|
+
value: response.headers.raw()[header].join(',')
|
|
424
|
+
};
|
|
423
425
|
}),
|
|
424
426
|
body: body.toString('base64')
|
|
425
427
|
});
|
|
@@ -442,8 +444,8 @@ async function processSpecification(spec, processFunction, args, options) {
|
|
|
442
444
|
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
|
|
443
445
|
}
|
|
444
446
|
else {
|
|
445
|
-
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
|
|
446
447
|
try {
|
|
448
|
+
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
|
|
447
449
|
await cdp.send('Fetch.continueRequest', { requestId });
|
|
448
450
|
}
|
|
449
451
|
catch (err) {
|
|
@@ -457,6 +459,34 @@ async function processSpecification(spec, processFunction, args, options) {
|
|
|
457
459
|
}
|
|
458
460
|
|
|
459
461
|
try {
|
|
462
|
+
// Fetch the spec URL if using https
|
|
463
|
+
// This allow to skip launching a browser
|
|
464
|
+
// if we have a fallback data source
|
|
465
|
+
// with a defined cache target for the spec
|
|
466
|
+
if (!spec.url.startsWith('file://')) {
|
|
467
|
+
let response;
|
|
468
|
+
// We set a conditional request header
|
|
469
|
+
// Use If-Modified-Since in preference as it is in practice
|
|
470
|
+
// more reliable for conditional requests
|
|
471
|
+
let headers = {'Accept-Encoding': 'gzip, deflate, br', 'Upgrade-Insecure-Requests': 1, 'User-Agent': browser.userAgent()};
|
|
472
|
+
if (options.lastModified) {
|
|
473
|
+
headers["If-Modified-Since"] = options.lastModified;
|
|
474
|
+
} else if (options.etag) {
|
|
475
|
+
headers["If-None-Match"] = options.etag;
|
|
476
|
+
}
|
|
477
|
+
try {
|
|
478
|
+
response = await fetch(spec.url, {headers});
|
|
479
|
+
if (response.status === 304) {
|
|
480
|
+
return {status: "notmodified"};
|
|
481
|
+
}
|
|
482
|
+
prefetchedResponses[spec.url] = response;
|
|
483
|
+
} catch (err) {
|
|
484
|
+
throw new Error(`Loading ${spec.url} triggered network error ${err}`);
|
|
485
|
+
}
|
|
486
|
+
if (response.status !== 200) {
|
|
487
|
+
throw new Error(`Loading ${spec.url} triggered HTTP status ${response.status}`);
|
|
488
|
+
}
|
|
489
|
+
}
|
|
460
490
|
const page = await browser.newPage();
|
|
461
491
|
|
|
462
492
|
// Disable cache if caller wants to handle all network requests
|
|
@@ -497,13 +527,27 @@ async function processSpecification(spec, processFunction, args, options) {
|
|
|
497
527
|
|
|
498
528
|
// Load the page
|
|
499
529
|
// (note HTTP status is 0 when `file://` URLs are loaded)
|
|
530
|
+
let cacheInfo;
|
|
500
531
|
if (spec.html) {
|
|
501
532
|
await page.setContent(spec.html, loadOptions);
|
|
502
533
|
}
|
|
503
534
|
else {
|
|
504
|
-
|
|
535
|
+
let result;
|
|
536
|
+
try {
|
|
537
|
+
result = await page.goto(spec.url, loadOptions);
|
|
538
|
+
} catch (err) {
|
|
539
|
+
throw new Error(`Loading ${spec.url} triggered network error ${err}`);
|
|
540
|
+
}
|
|
505
541
|
if ((result.status() !== 200) && (!spec.url.startsWith('file://') || (result.status() !== 0))) {
|
|
506
|
-
|
|
542
|
+
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
|
|
543
|
+
}
|
|
544
|
+
const responseHeaders = result.headers();
|
|
545
|
+
// Use Last-Modified in preference as it is in practice
|
|
546
|
+
// more reliable for conditional requests
|
|
547
|
+
if (responseHeaders['last-modified']) {
|
|
548
|
+
cacheInfo = {lastModified: responseHeaders['last-modified']};
|
|
549
|
+
} else if (responseHeaders.etag) {
|
|
550
|
+
cacheInfo = {etag: responseHeaders.etag};
|
|
507
551
|
}
|
|
508
552
|
}
|
|
509
553
|
|
|
@@ -613,7 +657,7 @@ async function processSpecification(spec, processFunction, args, options) {
|
|
|
613
657
|
|
|
614
658
|
// Run the processFunction method in the browser context
|
|
615
659
|
const results = await page.evaluate(processFunction, ...args);
|
|
616
|
-
|
|
660
|
+
results.crawlCacheInfo = cacheInfo;
|
|
617
661
|
// Pending network requests may still be in the queue, flag the page
|
|
618
662
|
// as closed not to send commands on a CDP session that's no longer
|
|
619
663
|
// attached to anything
|