lint-wiki-dumps 0.0.0 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/package.json +4 -4
- package/parser-parallel.js +90 -0
- package/parser.js +28 -60
- package/processor.js +117 -0
- package/report.js +88 -48
- package/reports/dist/rule.js +3 -2
- package/reports/dist/wiki.js +2 -2
- package/scan-parallel.sh +23 -0
- package/scan.sh +3 -3
package/README.md
CHANGED
@@ -23,15 +23,15 @@ npm i vscode-css-languageservice
 ## Usage
 
 ```sh
-npx lint-wiki-dumps <language> <path to download
+npx lint-wiki-dumps <language> <path to download> [path to HTML output]
 # For example:
 npx lint-wiki-dumps zh-yue ~/Downloads/dumps
 ```
 
-or execute the Bash script `scan.sh` directly:
+or execute the Bash script `scan.sh` (single thread) or `scan-parallel.sh` (multi-core cluster) directly:
 
 ```sh
-bash scan.sh <language> <path to download>
+bash scan.sh <language> <path to download> [path to HTML output]
 # For example:
 bash scan.sh zh-yue ~/Downloads/dumps
 ```
@@ -49,11 +49,11 @@ node parser.js zh-yue ~/Downloads/dumps/zh-yuewiki-lastest-pages-articles.xml.bz
 To generate HTML reports, you can use the following command:
 
 ```sh
-node report.js <language>
+node report.js <language> [path to HTML output]
 # For example:
 node report.js zh-yue
 ```
 
 ## Report
 
-The tool will generate reports in two formats: JSON and HTML. The JSON report will be saved in the `results` folder, while the HTML report will be available at `reports/index.html
+The tool will generate reports in two formats: JSON and HTML. The JSON report will be saved in the `results` folder, while the HTML report will be available at `reports/index.html` or the specified path.
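For orientation, a full invocation with the new optional output argument looks like the sketch below; the report directory shown is only an illustrative placeholder, not a path the tool requires:

```sh
# npx entry point (now backed by scan-parallel.sh, per the package.json change below),
# writing HTML reports to a custom directory (placeholder path)
npx lint-wiki-dumps zh-yue ~/Downloads/dumps ~/Downloads/wiki-reports

# single-threaded equivalent
bash scan.sh zh-yue ~/Downloads/dumps ~/Downloads/wiki-reports
```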
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "lint-wiki-dumps",
-  "version": "0.
+  "version": "0.1.0",
   "description": "Lint Wikipedia dumps",
   "keywords": [
     "lint",
@@ -20,7 +20,7 @@
     "reports/*.css"
   ],
   "bin": {
-    "lint-wiki-dumps": "scan.sh"
+    "lint-wiki-dumps": "scan-parallel.sh"
   },
   "repository": {
     "type": "git",
@@ -32,10 +32,10 @@
     "lint": "tsc --noEmit && tsc --project reports/tsconfig.json --noEmit && eslint --cache ."
   },
   "dependencies": {
-    "@bhsd/common": "^0.9.
+    "@bhsd/common": "^0.9.1",
     "chalk": "^4.1.2",
     "unbzip2-stream": "^1.4.3",
-    "wikilint": "^2.18.
+    "wikilint": "^2.18.4",
     "xml-stream": "^0.4.5"
   },
   "optionalDependencies": {
package/parser-parallel.js
ADDED
@@ -0,0 +1,90 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const cluster_1 = __importDefault(require("cluster"));
+const fs_1 = __importDefault(require("fs"));
+const path_1 = __importDefault(require("path"));
+const os_1 = __importDefault(require("os"));
+const chalk_1 = __importDefault(require("chalk"));
+const common_1 = require("@bhsd/common");
+const processor_1 = require("./processor");
+const [, , site, dir] = process.argv, target = `${site}wiki`;
+if (cluster_1.default.isPrimary) {
+    (0, processor_1.init)();
+    const dumpDir = dir.replace(/^~/u, os_1.default.homedir()), prefix = target.replaceAll('-', '_'), files = fs_1.default.readdirSync(dumpDir).filter(file => file.endsWith('.bz2') && file.startsWith(prefix))
+        .map(file => {
+        const filePath = path_1.default.join(dumpDir, file);
+        return [filePath, fs_1.default.statSync(filePath).size];
+    })
+        .sort(([, a], [, b]) => b - a),
+    // eslint-disable-next-line n/no-unsupported-features/node-builtins
+    workers = new Array(Math.min(os_1.default.availableParallelism(), files.length)).fill(undefined)
+        .map(() => cluster_1.default.fork());
+    let i = 0, n = 0;
+    console.time('parse');
+    for (; i < workers.length; i++) {
+        const worker = workers[i];
+        worker.on('message', count => {
+            n += count;
+            if (i < files.length) {
+                worker.send([files[i], i]);
+                i++;
+            }
+            else {
+                worker.disconnect();
+            }
+        }).send([files[i], i]);
+    }
+    process.on('exit', () => {
+        console.timeEnd('parse');
+        console.log(chalk_1.default.green(`Parsed ${n} pages in total`));
+    });
+}
+else {
+    process.on('message', ([[file], j]) => {
+        const results = fs_1.default.createWriteStream(path_1.default.join(processor_1.resultDir, `${site}-${j}.json`)), processor = new processor_1.Processor(site, results);
+        let i = 0;
+        results.write('{');
+        results.on('close', () => {
+            process.send(i);
+        });
+        const stop = () => {
+            processor.stop(`parse ${file}`, `Parsed ${i} pages from ${file}`);
+        };
+        const lint = ($text, ns, title, date, retry = 0) => {
+            try {
+                processor.lint($text, ns, title, date);
+                return true;
+            }
+            catch (e) {
+                if (e instanceof RangeError && e.message === 'Maximum heap size exceeded') {
+                    if (retry === 0) {
+                        stream.pause();
+                    }
+                    else if (retry > 5) {
+                        processor.error(e, title);
+                        return true;
+                    }
+                    setTimeout(() => {
+                        if (lint($text, ns, title, date, retry + 1)) {
+                            stream.resume();
+                        }
+                    }, 1e4);
+                    return false;
+                }
+                throw e;
+            }
+        };
+        console.time(`parse ${file}`);
+        const stream = (0, processor_1.getXmlStream)(file);
+        stream.on('endElement: page', ({ title, ns, revision: { model, timestamp, text: { $text } } }) => {
+            if (model === 'wikitext' && $text && ns === '0') {
+                (0, common_1.refreshStdout)(`${i++} ${title}`);
+                lint($text, ns, title, new Date(timestamp));
+            }
+        });
+        stream.on('end', stop);
+    });
+}
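The primary/worker handshake in the file above is easier to follow in isolation. The sketch below reproduces just that dispatch loop with placeholder task names instead of dump files; it is an illustration of the pattern, not part of the package:

```js
'use strict';
// Stand-alone sketch of the dispatch pattern used by parser-parallel.js:
// the primary forks min(CPUs, tasks) workers, seeds each with one task,
// and hands out the next task whenever a worker reports a page count back.
const cluster = require('cluster');
const os = require('os');

const tasks = ['part-1', 'part-2', 'part-3', 'part-4', 'part-5']; // placeholders for .bz2 dump files

if (cluster.isPrimary) {
    const workers = new Array(Math.min(os.availableParallelism(), tasks.length)).fill(undefined)
        .map(() => cluster.fork());
    let i = 0,
        total = 0;
    for (; i < workers.length; i++) {
        const worker = workers[i];
        worker.on('message', count => {
            total += count;
            if (i < tasks.length) {
                worker.send(tasks[i]); // hand out the next pending task
                i++;
            }
            else {
                worker.disconnect(); // no work left for this worker
            }
        }).send(tasks[i]); // seed the worker with its first task
    }
    process.on('exit', () => {
        console.log(`Parsed ${total} pages in total`);
    });
}
else {
    process.on('message', () => {
        // A real worker streams and lints the dump here; this sketch just
        // reports a dummy page count so the primary can schedule the next task.
        process.send(1);
    });
}
```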
package/parser.js
CHANGED
@@ -6,26 +6,30 @@ Object.defineProperty(exports, "__esModule", { value: true });
 const fs_1 = __importDefault(require("fs"));
 const path_1 = __importDefault(require("path"));
 const os_1 = __importDefault(require("os"));
-const perf_hooks_1 = require("perf_hooks");
-const chalk_1 = __importDefault(require("chalk"));
-const unbzip2_stream_1 = __importDefault(require("unbzip2-stream"));
-const xml_stream_1 = __importDefault(require("xml-stream"));
-const wikilint_1 = __importDefault(require("wikilint"));
 const common_1 = require("@bhsd/common");
-const
-
-
-
-
-
-
-
-
-
-
-
-
-
+const processor_1 = require("./processor");
+const n = Number(process.argv[4]) || Infinity, [, , site, file, , restart] = process.argv, filePath = path_1.default.join(processor_1.resultDir, `${site}.json`), data = fs_1.default.existsSync(filePath) && fs_1.default.readFileSync(filePath, 'utf8');
+const getTimestamp = () => {
+    if (!data) {
+        return undefined;
+    }
+    const i = data.indexOf('"#timestamp": "') + 15;
+    return data.slice(i, data.indexOf('"', i));
+};
+const getErrors = (page) => {
+    if (!data) {
+        return undefined;
+    }
+    const str = JSON.stringify(page), i = data.indexOf(`${str}: [`);
+    if (i === -1) {
+        return undefined;
+    }
+    const j = i + str.length + 2;
+    return JSON.parse(data.slice(j, data.indexOf('\n]', j) + 2));
+};
+(0, processor_1.init)();
+const time = getTimestamp(), last = time && new Date(time), results = fs_1.default.createWriteStream(path_1.default.join(processor_1.resultDir, `${site}.json`), { flags: restart ? 'a' : 'w' }), processor = new processor_1.Processor(site, results, last);
+let i = 0, stopping = false, restarted = !restart;
 if (!restart) {
     results.write('{');
 }
@@ -34,22 +38,10 @@ results.on('close', () => {
 });
 const stop = () => {
     stopping = true;
-
-    console.log(chalk_1.default.green(`Parsed ${i} pages`));
-    if (failed) {
-        console.error(chalk_1.default.red(`${failed} pages failed to parse`));
-    }
-    if (worst) {
-        console.info(chalk_1.default.yellow(`Worst page: ${worst.title} (${worst.duration.toFixed(3)} ms)`));
-    }
-    results.write(`${comma}\n"#timestamp": ${JSON.stringify(latest)}\n}`);
-    results.close();
-};
-const newEntry = (title, errors) => {
-    results.write(`${comma}\n${JSON.stringify(title)}: ${JSON.stringify(errors, null, '\t')}`);
-    comma ||= ',';
+    processor.stop('parse', `Parsed ${i} pages`);
 };
 console.time('parse');
+const stream = (0, processor_1.getXmlStream)(file.replace(/^~/u, os_1.default.homedir()));
 stream.on('endElement: page', ({ title, ns, revision: { model, timestamp, text: { $text } } }) => {
     if (i === n) {
         if (!stopping) {
@@ -60,37 +52,13 @@ stream.on('endElement: page', ({ title, ns, revision: { model, timestamp, text:
     (0, common_1.refreshStdout)(`${i++} ${title}`);
     const date = new Date(timestamp);
     if (last && date <= last) {
-        const previous =
+        const previous = getErrors(title);
         if (previous) {
-            newEntry(title, previous);
+            processor.newEntry(title, previous);
         }
     }
     else {
-
-        try {
-            const start = perf_hooks_1.performance.now(), errors = wikilint_1.default.parse($text).lint()
-                .filter(({ severity, rule }) => severity === 'error' && !ignore.has(rule)), duration = perf_hooks_1.performance.now() - start;
-            if (errors.length > 0) {
-                newEntry(title, errors.map(({ severity, suggestions, fix, ...e }) => ({
-                    ...e,
-                    ...suggestions && {
-                        suggestions: suggestions.map(action => ({
-                            ...action,
-                            original: $text.slice(...action.range),
-                        })),
-                    },
-                    ...fix && { fix: { ...fix, original: $text.slice(...fix.range) } },
-                    excerpt: $text.slice(e.startIndex, e.endIndex),
-                })));
-            }
-            if (!worst || duration > worst.duration) {
-                worst = { title, duration };
-            }
-        }
-        catch (e) {
-            console.error(chalk_1.default.red(`Error parsing ${title}`), e);
-            failed++;
-        }
+        processor.lint($text, ns, title, date);
     }
 }
 else if (title === restart) {
package/processor.js
ADDED
@@ -0,0 +1,117 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Processor = exports.getXmlStream = exports.init = exports.resultDir = exports.MAX = void 0;
+const fs_1 = __importDefault(require("fs"));
+const path_1 = __importDefault(require("path"));
+const perf_hooks_1 = require("perf_hooks");
+const cluster_1 = __importDefault(require("cluster"));
+const chalk_1 = __importDefault(require("chalk"));
+const unbzip2_stream_1 = __importDefault(require("unbzip2-stream"));
+const xml_stream_1 = __importDefault(require("xml-stream"));
+const wikilint_1 = __importDefault(require("wikilint"));
+exports.MAX = 100, exports.resultDir = path_1.default.join(__dirname, 'results');
+const ignore = new Set(['no-arg', 'url-encoding', 'h1', 'var-anchor']);
+const init = () => {
+    if (!fs_1.default.existsSync(exports.resultDir)) {
+        fs_1.default.mkdirSync(exports.resultDir);
+    }
+};
+exports.init = init;
+const getXmlStream = (file) => {
+    const stream = new xml_stream_1.default(fs_1.default.createReadStream(file).pipe((0, unbzip2_stream_1.default)()));
+    stream.preserve('text', true);
+    return stream;
+};
+exports.getXmlStream = getXmlStream;
+class Processor {
+    #failed = 0;
+    #comma = '';
+    #worst;
+    #results;
+    #latest;
+    /** @param site site nickname */
+    constructor(site, results, latest) {
+        wikilint_1.default.config = `${site}wiki`;
+        this.#results = results;
+        this.#latest = latest;
+    }
+    /**
+     * Stop the processing and log the results.
+     * @param timer timer name
+     * @param msg message to log
+     */
+    stop(timer, msg) {
+        console.log();
+        console.timeEnd(timer);
+        console.log(chalk_1.default.green(msg));
+        if (this.#failed) {
+            console.error(chalk_1.default.red(`${this.#failed} pages failed to parse`));
+        }
+        if (this.#worst) {
+            console.info(chalk_1.default.yellow(`Worst page: ${this.#worst.title} (${this.#worst.duration.toFixed(3)} ms)`));
+        }
+        this.#results.write(`${this.#comma}\n"#timestamp": ${JSON.stringify(this.#latest)}\n}`);
+        this.#results.end();
+    }
+    /**
+     * Write a new entry to the results file.
+     * @param title page title
+     * @param errors lint errors
+     */
+    newEntry(title, errors) {
+        this.#results.write(`${this.#comma}\n${JSON.stringify(title)}: ${JSON.stringify(errors, null, '\t')}`);
+        this.#comma ||= ',';
+    }
+    /**
+     * Parse a page and lint it.
+     * @param $text page text
+     * @param ns page namespace
+     * @param title page title
+     * @param date page revision date
+     * @throws `RangeError` maximum heap size exceeded
+     */
+    lint($text, ns, title, date) {
+        if (!this.#latest || date > this.#latest) {
+            this.#latest = date;
+        }
+        try {
+            const start = perf_hooks_1.performance.now(), errors = wikilint_1.default.parse($text, ns === '828').lint()
+                .filter(({ severity, rule }) => severity === 'error' && !ignore.has(rule)), duration = perf_hooks_1.performance.now() - start;
+            if (errors.length > 0) {
+                this.newEntry(title, errors.map(({ severity, suggestions, fix, ...e }) => ({
+                    ...e,
+                    ...suggestions && {
+                        suggestions: suggestions.map(action => ({
+                            ...action,
+                            original: $text.slice(...action.range),
+                        })),
+                    },
+                    ...fix && { fix: { ...fix, original: $text.slice(...fix.range) } },
+                    excerpt: $text.slice(e.startIndex, e.endIndex).slice(0, exports.MAX),
+                })));
+            }
+            if (!this.#worst || duration > this.#worst.duration) {
+                this.#worst = { title, duration };
+            }
+        }
+        catch (e) {
+            if (cluster_1.default.isWorker && e instanceof RangeError && e.message === 'Maximum heap size exceeded') {
+                throw e;
+            }
+            this.error(e, title);
+        }
+    }
+    /**
+     * Log an error message.
+     * @param e error object
+     * @param title page title
+     */
+    error(e, title) {
+        console.error(chalk_1.default.red(`Error parsing ${title}`), e);
+        this.#failed++;
+    }
+}
+exports.Processor = Processor;
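To see how the pieces of this new shared module fit together, here is a condensed single-file driver in the style of parser.js; the dump filename is a placeholder and restart/heap-pressure handling is omitted:

```js
'use strict';
// Condensed driver for processor.js, mirroring how parser.js and
// parser-parallel.js use it; the dump path below is a placeholder.
const fs = require('fs');
const path = require('path');
const { init, getXmlStream, resultDir, Processor } = require('./processor');

const site = 'zh-yue';
const dump = 'zh_yuewiki-latest-pages-articles.xml.bz2'; // placeholder
init(); // ensure the results/ directory exists
const results = fs.createWriteStream(path.join(resultDir, `${site}.json`));
const processor = new Processor(site, results);
let i = 0;
results.write('{');
console.time('parse');
const stream = getXmlStream(dump);
stream.on('endElement: page', ({ title, ns, revision: { model, timestamp, text: { $text } } }) => {
    // Only main-namespace wikitext revisions are linted, as in both parsers.
    if (model === 'wikitext' && $text && ns === '0') {
        i++;
        processor.lint($text, ns, title, new Date(timestamp));
    }
});
// stop() prints the summary, appends the "#timestamp" entry and closes the JSON object.
stream.on('end', () => processor.stop('parse', `Parsed ${i} pages`));
```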
package/report.js
CHANGED
@@ -7,65 +7,105 @@ const fs_1 = __importDefault(require("fs"));
 const path_1 = __importDefault(require("path"));
 const crypto_1 = require("crypto");
 const chalk_1 = __importDefault(require("chalk"));
-const
-const
-
-
+const processor_1 = require("./processor");
+const { argv } = process, [, , lang] = argv, defaultOurDir = path_1.default.join(__dirname, 'reports');
+let [, , , outDir] = argv;
+const mkdir = (dir, empty) => {
+    if (fs_1.default.existsSync(dir)) {
+        if (!empty) {
+            return;
+        }
+        fs_1.default.rmSync(dir, { recursive: true });
     }
+    fs_1.default.mkdirSync(dir);
 };
-
+if (outDir) {
+    mkdir(outDir);
+    // eslint-disable-next-line n/no-unsupported-features/node-builtins
+    fs_1.default.cpSync(defaultOurDir, outDir, { recursive: true, force: true });
+}
+else {
+    outDir = defaultOurDir;
+}
+const dataDir = path_1.default.join(outDir, 'data');
 mkdir(dataDir);
 const writeJS = (data, file) => {
     fs_1.default.writeFileSync(path_1.default.join(dataDir, `${file}.js`), `window.data=${JSON.stringify(data)}`);
 };
-const
+const initJS = (file) => {
+    const stream = fs_1.default.createWriteStream(`${file}.js`);
+    stream.write('window.data={"articles":[');
+    return stream;
+};
+const compare = (a, b) => a.localeCompare(b);
+const resultDir = path_1.default.join(__dirname, 'results'), dir = fs_1.default.readdirSync(resultDir), summary = new Set(), ruleRecords = new Map(), wiki = {}, siteDir = path_1.default.join(dataDir, lang), articlesDir = path_1.default.join(siteDir, 'pages');
+let latest;
+mkdir(siteDir, true);
+mkdir(articlesDir);
 for (const file of dir) {
-
-
-
-
-
-
-
-
-
+    if (!file.endsWith('.json')) {
+        continue;
+    }
+    const fileDir = path_1.default.join(resultDir, file);
+    if (!fs_1.default.existsSync(fileDir)) {
+        console.error(chalk_1.default.red(`Failed to read ${file}`));
+        continue;
+    }
+    const k = file.search(/-\d+\.json$/u), site = k === -1 ? file.slice(0, -5) : file.slice(0, k);
+    summary.add(site);
+    if (lang !== site) {
+        continue;
+    }
+    const data = fs_1.default.readFileSync(fileDir, 'utf8'), date = new Date(data.substr(data.indexOf('\n"#timestamp": "') + 16, 10));
+    latest = !latest || date > latest ? date : latest;
+    for (const mt of data.matchAll(/^(".+"): \[$/gmu)) {
+        const page = JSON.parse(mt[1]), hash = (0, crypto_1.createHash)('sha256').update(page).digest('hex')
+            .slice(0, 8), errors = JSON.parse(data.slice(mt.index + mt[0].length - 1, data.indexOf('\n]', mt.index) + 2)), rules = new Set(), info = [];
+        for (const { rule, startLine, startCol, message, excerpt } of errors) {
+            // article
+            const line = startLine + 1, col = startCol + 1;
+            info.push([rule, line, col, message, excerpt]);
             // wiki
-
-
-            wiki.push([rule, values.filter(errors => errors.some(({ rule: r }) => r === rule)).length]);
-            // rule
-            const articles = Object.entries(data).filter(([, errors]) => errors.some(({ rule: r }) => r === rule))
-                .sort(([a], [b]) => a.localeCompare(b))
-                .map(([page, errors]) => {
-                const { startLine, startCol, message, excerpt } = errors.find(({ rule: r }) => r === rule);
-                return [page, startLine + 1, startCol + 1, message, excerpt.slice(0, MAX * 0.8)];
-            }), batches = Math.ceil(articles.length / 200);
-            for (let i = 0; i < batches; i++) {
-                writeJS({
-                    articles: articles.slice(i * 200, (i + 1) * 200),
-                    batches,
-                }, path_1.default.join(site, `${rule}-${i}`));
-            }
+            if (!(rule in wiki)) {
+                wiki[rule] = 0;
             }
-
-
-
-
-
-
-
-
-
-
-
-
-            ]);
-
+            // rule
+            if (!rules.has(rule)) {
+                rules.add(rule);
+                wiki[rule]++;
+                let ruleRecord;
+                if (ruleRecords.has(rule)) {
+                    ruleRecord = ruleRecords.get(rule);
+                }
+                else {
+                    ruleRecord = ['', []];
+                    ruleRecords.set(rule, ruleRecord);
+                }
+                ruleRecord[1].push(page);
+                ruleRecord[0] += `${JSON.stringify([page, line, col, message, excerpt.slice(0, processor_1.MAX * 0.8)], null, '\t')},`;
             }
         }
+        writeJS(info, path_1.default.join(site, 'pages', hash));
     }
-
-
+}
+const timestamp = latest.toISOString().slice(0, 10);
+// rule
+for (const [rule, [str, pages]] of ruleRecords) {
+    const batches = Math.ceil(pages.length / 200);
+    pages.sort(compare);
+    for (let i = 0; i < batches; i++) {
+        const stream = initJS(path_1.default.join(siteDir, `${rule}-${i}`));
+        for (let j = i * 200; j < (i + 1) * 200; j++) {
+            const page = pages[j];
+            if (!page) {
+                break;
+            }
+            const index = str.indexOf(`[\n\t${JSON.stringify(page)}`);
+            stream.write(str.slice(index, str.indexOf('\n]', index) + 3));
+        }
+        stream.write(`],batches:${batches},timestamp:"${timestamp}"}`);
+        stream.end();
     }
 }
-writeJS(summary, 'index');
+writeJS([...summary].sort(compare), 'index');
+writeJS([...Object.entries(wiki).sort(([a], [b]) => a.localeCompare(b)), timestamp], path_1.default.join(lang, 'index'));
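The browser pages load these generated files through plain `<script>` tags, so each file assigns a global `window.data`. Based on `initJS` and `writeJS` above, a per-rule batch file such as `data/<lang>/<rule>-0.js` would look roughly like the sketch below; the page title, message and excerpt are invented for illustration:

```js
window.data={"articles":[
[
	"Example page",
	12,
	3,
	"example lint message",
	"{{example excerpt}}"
],
],batches:1,timestamp:"2024-01-01"}
```

reports/dist/rule.js (diffed below) then reads `data.articles`, `data.batches` and `data.timestamp` from that global.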
package/reports/dist/rule.js
CHANGED
@@ -1,7 +1,6 @@
 "use strict";
 (() => {
-    const search = new URLSearchParams(location.search), lang = search.get('lang'), rule = search.get('rule'), batch = Math.floor(Number(search.get('start') || 0) / 200), endStr = String((batch + 1) * 200), prev = document.getElementById('prev'), next = document.getElementById('next'), start = document.getElementById('start'), end = document.getElementById('end'), title = document.querySelector('title'), h2 = document.querySelector('h2'), wiki = document.getElementById('wiki'), tbody = document.querySelector('tbody'), script = document.createElement('script');
-    h2.textContent = h2.textContent.replace('Wikipedia', `${lang}.wikipedia.org: ${rule}`);
+    const search = new URLSearchParams(location.search), lang = search.get('lang'), rule = search.get('rule'), batch = Math.floor(Number(search.get('start') || 0) / 200), endStr = String((batch + 1) * 200), nav = document.getElementById('nav'), prev = document.getElementById('prev'), next = document.getElementById('next'), start = document.getElementById('start'), end = document.getElementById('end'), title = document.querySelector('title'), h2 = document.querySelector('h2'), wiki = document.getElementById('wiki'), table = document.querySelector('table'), tbody = document.querySelector('tbody'), script = document.createElement('script');
     title.textContent = title.textContent.replace('Wikipedia', `${lang}.wikipedia.org`);
     wiki.textContent = `${lang}wiki`;
     wiki.href += `?lang=${lang}`;
@@ -18,6 +17,7 @@
     next.href = `${location.pathname}?${search}`;
     script.src = `./data/${lang}/${rule}-${batch}.js`;
     script.addEventListener('load', () => {
+        h2.textContent = `${h2.textContent.replace('Wikipedia', `${lang}.wikipedia.org: ${rule}`)} (${data.timestamp})`;
         if (data.batches === batch + 1) {
             next.removeAttribute('href');
             end.textContent = String(batch * 200 + data.articles.length);
@@ -43,6 +43,7 @@
             tr.append(article, edit, line, column, detail, notice, more);
             tbody.append(tr);
         }
+        table.after(nav.cloneNode(true));
     });
     document.head.append(script);
 })();
package/reports/dist/wiki.js
CHANGED
@@ -1,11 +1,11 @@
 "use strict";
 (() => {
     const lang = new URLSearchParams(location.search).get('lang'), script = document.createElement('script'), title = document.querySelector('title'), h2 = document.querySelector('h2'), tbody = document.querySelector('tbody');
-    h2.textContent = h2.textContent.replace('Wikipedia', `${lang}.wikipedia.org`);
     title.textContent = title.textContent.replace('Wikipedia', `${lang}.wikipedia.org`);
     script.src = `./data/${lang}/index.js`;
     script.addEventListener('load', () => {
-
+        h2.textContent = `${h2.textContent.replace('Wikipedia', `${lang}.wikipedia.org`)} (${data.slice(-1)[0]})`;
+        for (const [rule, count] of data.slice(0, -1)) {
             const tr = document.createElement('tr'), description = document.createElement('td'), pages = document.createElement('td'), a = document.createElement('a');
             a.textContent = rule;
             a.href = `./rule.html?lang=${lang}&rule=${rule}`;
package/scan-parallel.sh
ADDED
@@ -0,0 +1,23 @@
+#!/usr/local/bin/bash
+if (( $# < 2 ))
+then
+    echo 'Usage: npx lint-wiki-dumps <language> <path to download> [path to HTML output]'
+    echo 'Example: npx lint-wiki-dumps zh-yue ~/Downloads/dumps'
+    exit 1
+fi
+site="${1}wiki" # example: zh-yuewiki
+target="${1//-/_}wiki" # example: zh_yuewiki
+files=$( \
+    curl -s "https://dumps.wikimedia.org/$target/latest/" \
+    | grep -o "href=\"$target-latest-pages-articles[0-9].*\.bz2\">" \
+    | gsed "s|href=\"|https://dumps.wikimedia.org/$target/latest/|;s|\">||" \
+)
+if (( ${#files} < 2 ))
+then
+    echo 'Switching to single-threaded mode'
+    bash scan.sh "$1" "$2" "$3"
+else
+    curl --output-dir "$2" --remote-name-all $files
+    npx getParserConfig "$site" "https://$1.wikipedia.org/w/"
+    node parser-parallel.js "$1" "$2" && node report.js "$1" "$3"
+fi
package/scan.sh
CHANGED
@@ -1,16 +1,16 @@
 #!/usr/local/bin/bash
 if (( $# < 2 ))
 then
-    echo 'Usage: npx lint-wiki-dumps <language> <path to download>'
+    echo 'Usage: npx lint-wiki-dumps <language> <path to download> [path to HTML output]'
     echo 'Example: npx lint-wiki-dumps zh-yue ~/Downloads/dumps'
     exit 1
 fi
 site="${1}wiki" # example: zh-yuewiki
 target="${1//-/_}wiki" # example: zh_yuewiki
-file="$
+file="$target-latest-pages-articles.xml.bz2"
 if (( $# < 3 ))
 then
     curl --output-dir "$2" -O "https://dumps.wikimedia.org/$target/latest/$file"
     npx getParserConfig "$site" "https://$1.wikipedia.org/w/"
 fi
-node parser.js "$1" "$2/$file" "$
+node parser.js "$1" "$2/$file" "$4" "$5" && node report.js "$1" "$3"