taraskevizer 10.4.9 → 10.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/index.js +121 -92
- package/package.json +1 -1
package/dist/bin/index.js
CHANGED
|
@@ -1,22 +1,33 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { cpus } from 'node:os';
|
|
3
|
+
import * as path from 'node:path';
|
|
4
|
+
import { fileURLToPath } from 'node:url';
|
|
3
5
|
import { Worker } from 'node:worker_threads';
|
|
4
6
|
import { pipelines } from '../index.js';
|
|
5
7
|
import { parseArgs } from './parse-args.js';
|
|
6
|
-
const
|
|
7
|
-
|
|
8
|
+
/**
 * Build a printer bound to `stream`.
 *
 * The returned function writes `msg` to the stream prefixed with the
 * "[taraskevizer]" tag wrapped in ANSI colour codes (34 = blue, 0 = reset).
 * No newline is appended.
 *
 * @param {{ write(chunk: string): unknown }} stream - Destination with a `write` method.
 * @returns {(msg: string) => void}
 */
const getPrint = (stream) => (msg) => {
    // The ESC byte is written as an explicit \u001b escape so the colour
    // sequence survives editors and tooling that strip raw control characters.
    stream.write("\u001b[34m[taraskevizer]\u001b[0m" + ' ' + msg);
};
/**
 * Wrap a printer so that every message is terminated with a newline.
 *
 * @param {(msg: string) => void} printFn - Printer to delegate to.
 * @returns {(msg: string) => void}
 */
const getPrintLn = (printFn) => (msg) => {
    printFn(msg + '\n');
};
// Ready-made printers: results and help go to stdout, diagnostics to stderr.
const print = getPrint(process.stdout);
const printErr = getPrint(process.stderr);
const printLn = getPrintLn(print);
const printErrLn = getPrintLn(printErr);
|
|
18
|
+
// AI-written, may need improvements
|
|
9
19
|
const splitIntoChunks = (text, n) => {
|
|
10
20
|
const size = Math.ceil(text.length / n);
|
|
11
21
|
const chunks = [];
|
|
12
22
|
let start = 0;
|
|
13
23
|
for (let i = 0; i < n; i++) {
|
|
14
24
|
let end = start + size;
|
|
25
|
+
// Adjust end to the nearest newline character
|
|
15
26
|
if (end < text.length) {
|
|
16
27
|
const forward = text.indexOf('\n', end);
|
|
17
28
|
const backward = text.lastIndexOf('\n', end);
|
|
18
29
|
if (forward === -1 && backward === -1) {
|
|
19
|
-
// no-op
|
|
30
|
+
// no-op
|
|
20
31
|
}
|
|
21
32
|
else if (forward === -1) {
|
|
22
33
|
end = backward;
|
|
@@ -28,6 +39,17 @@ const splitIntoChunks = (text, n) => {
|
|
|
28
39
|
end = forward - end < end - backward ? forward : backward;
|
|
29
40
|
}
|
|
30
41
|
}
|
|
42
|
+
// Ensure we don't split inside HTML tags or special syntax tags
|
|
43
|
+
if (end < text.length) {
|
|
44
|
+
const lastOpen = text.lastIndexOf('<', end);
|
|
45
|
+
const lastClose = text.lastIndexOf('>', end);
|
|
46
|
+
if (lastOpen > lastClose) {
|
|
47
|
+
const nextClose = text.indexOf('>', end);
|
|
48
|
+
if (nextClose !== -1) {
|
|
49
|
+
end = nextClose + 1;
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
31
53
|
if (end > text.length)
|
|
32
54
|
end = text.length;
|
|
33
55
|
chunks.push(text.slice(start, end));
|
|
@@ -37,122 +59,129 @@ const splitIntoChunks = (text, n) => {
|
|
|
37
59
|
}
|
|
38
60
|
return chunks;
|
|
39
61
|
};
|
|
62
|
+
/**
 * Format a byte count for humans.
 *
 * Values below 1024 are printed as whole bytes; larger values are scaled to
 * KB (below 1024 * 1024) or MB and shown with two decimal places.
 *
 * @param {number} n - Size in bytes.
 * @returns {string} For example "512 B", "1.50 KB" or "3.25 MB".
 */
const getPrettyByteSize = (n) => {
    const kilo = 1024;
    const mega = 1024 * 1024;
    if (n < kilo) {
        return `${n} B`;
    }
    if (n < mega) {
        return `${(n / kilo).toFixed(2)} KB`;
    }
    return `${(n / mega).toFixed(2)} MB`;
};
|
|
40
67
|
// Drop the node executable and script path so process.argv holds only the
// arguments supplied by the user.
process.argv.splice(0, 2);
const [firstArg] = process.argv;
// --version and --help are only recognised as the very first argument and
// terminate the process immediately.
switch (firstArg) {
    case '-v':
    case '--version':
        printLn("10.4.11");
        process.exit(0);
        break;
    case '-h':
    case '--help':
        printLn("Usage: \u001b[34mtarask\u001b[0m [options] text\nIf text is not passed, interactive mode is enabled\n\nEXAMPLES\n\nConvert and latinize a word\n\t\u001b[34mtarask\u001b[0m \u001b[35m--latin\u001b[0m 'планета'\nPrints \"p\u001b[32ml\u001b[0ma\u001b[32mne\u001b[0mta\"\n\nRead from one file and write converted text to another\n\t\u001b[34mtarask\u001b[0m < ./cyr-text.txt > ./lat-text.txt\n\nEnter interactive mode\n\t\u001b[34mtarask\u001b[0m\nPrints \"\u001b[34m[taraskevizer]\u001b[0m Enter the text:\" and waits until you enter a new line\n\nOPTIONS\n\n\u001b[33mGeneral\u001b[0m:\n \u001b[35m-h\u001b[0m \u001b[35m--help\u001b[0m\n \u001b[35m-v\u001b[0m \u001b[35m--version\u001b[0m\n\n\u001b[33mAlphabet\u001b[0m:\n \u001b[35m-l\u001b[0m \u001b[35m--latin\u001b[0m\n \u001b[35m-lj\u001b[0m \u001b[35m--latin-ji\u001b[0m\n \u001b[35m-a\u001b[0m \u001b[35m--arabic\u001b[0m\n\n\u001b[33mWhen to replace і(i) by й(j) after vowels\u001b[0m:\n \u001b[35m-jr\u001b[0m \u001b[35m--jrandom\u001b[0m\n \u001b[35m-ja\u001b[0m \u001b[35m--jalways\u001b[0m\n\n\u001b[33mReplace ґ(g) by г(h) in cyrillic alphabet\u001b[0m:\n \u001b[35m--h\u001b[0m\n\n\u001b[33mVariations\u001b[0m:\n \u001b[35m-nv\u001b[0m \u001b[35m--no-variations\u001b[0m\n \u001b[35m-fv\u001b[0m \u001b[35m--first-variation\u001b[0m\n\n\u001b[33mMode (only one can be used)\u001b[0m:\n \u001b[35m-html\u001b[0m \u001b[35m--html\u001b[0m\n \u001b[35m-abc\u001b[0m \u001b[35m--alphabet-only\u001b[0m\n\n\u001b[33mOther\u001b[0m:\n \u001b[35m-nec\u001b[0m \u001b[35m--not-escape-caps\u001b[0m\n \u001b[35m-nc\u001b[0m \u001b[35m--no-color\u001b[0m\n \u001b[35m-st\u001b[0m \u001b[35m--single-thread\u001b[0m\n");
        process.exit(0);
        break;
    default:
        break;
}
// Snapshot of the user arguments taken before parseArgs sees process.argv;
// this copy is what gets handed to worker threads.
// NOTE(review): presumably parseArgs consumes option flags from the array it
// is given, which is why the copy is made first — confirm against parse-args.js.
const argv = process.argv.slice();
const { mode, cfg, doForceSingleThread } = parseArgs(process.argv);
|
|
99
|
-
|
|
81
|
+
/**
 * Lazily created pool of worker threads used to convert large inputs in
 * parallel.
 *
 * - `size`    number of workers to spawn (one per reported CPU, at least 1).
 * - `workers` the spawned Worker instances, or null until `init()` has run.
 */
const workers = {
    size: cpus()?.length || 1,
    workers: null,
    /**
     * Spawn the workers on first use; later calls are no-ops.
     * Progress is reported on stderr so stdout stays reserved for results.
     */
    init() {
        if (this.workers)
            return;
        process.stderr.write(`(Initializing ${this.size} workers... `);
        const dirname = path.dirname(fileURLToPath(import.meta.url));
        // Each worker evaluates a small script that re-parses the CLI
        // arguments and then answers every posted chunk with its conversion.
        this.workers = Array.from({ length: this.size }, () => new Worker("const {\n\tparentPort,\n\tworkerData: { argv, dirname },\n} = require('node:worker_threads');\nconst { resolve } = require('node:path');\nconst { pipelines } = require(resolve(dirname, '..'));\nconst { parseArgs } = require(resolve(dirname, 'parse-args'));\n\nconst { mode, cfg } = parseArgs(argv);\n\nparentPort.on('message', (chunk) => {\n\tparentPort.postMessage(pipelines[mode](chunk, cfg));\n});\n", {
            eval: true,
            workerData: { argv, dirname },
        }));
        process.stderr.write('done.) ');
    },
    /**
     * Convert `chunks` on the pool. `init()` must have been called first.
     *
     * Chunk `i` is handled by worker `i % pool size`. Each worker receives
     * its chunks strictly one at a time, so a reply can never be attributed
     * to the wrong chunk when there are more chunks than workers.
     *
     * @param {string[]} chunks - Pieces of text to convert.
     * @returns {Promise<string[]>} Converted pieces, in the order of `chunks`.
     *   Rejects with the worker's error if any worker fails.
     */
    process(chunks) {
        const pool = this.workers;
        const results = new Array(chunks.length);
        // Send one chunk and wait for exactly one reply, removing the
        // opposite listener afterwards so listeners do not pile up.
        const runJob = (worker, chunk) => new Promise((resolve, reject) => {
            const onMessage = (result) => {
                worker.off('error', onError);
                resolve(result);
            };
            const onError = (error) => {
                worker.off('message', onMessage);
                reject(error);
            };
            worker.once('message', onMessage);
            worker.once('error', onError);
            worker.postMessage(chunk);
        });
        // Workers run concurrently; the chunks of one worker run in sequence.
        const drainWorkerQueue = async (worker, workerIndex) => {
            for (let i = workerIndex; i < chunks.length; i += pool.length) {
                results[i] = await runJob(worker, chunks[i]);
            }
        };
        return Promise.all(pool.map(drainWorkerQueue)).then(() => results);
    },
};
|
|
104
|
+
/**
 * Convert `text` with the selected pipeline and write the result to stdout.
 *
 * Inputs longer than 50 000 characters are split and converted on the worker
 * pool, unless single-thread mode was requested or only one worker is
 * available; everything else is converted on the current thread.
 *
 * The returned promise settles only once stdout has accepted the output, so
 * callers can safely process further batches or exit afterwards.
 *
 * @param {string} text - Text to convert.
 * @returns {Promise<void>}
 */
const processText = async (text) => {
    let result = '';
    if (!doForceSingleThread && workers.size > 1 && text.length > 50_000) {
        workers.init();
        const chunks = splitIntoChunks(text, workers.size);
        const results = await workers.process(chunks);
        result = results.join('');
    }
    else {
        result = pipelines[mode](text, cfg);
    }
    if (!process.stdout.write(result)) {
        // write() returned false: stdout is applying backpressure. Wait for
        // the buffer to drain rather than exiting from the drain handler,
        // which would discard any input batches that are still to come.
        await new Promise((resolve) => {
            process.stdout.once('drain', resolve);
        });
    }
};
|
|
100
122
|
// Entry point: pick the text source (remaining CLI arguments, an interactive
// terminal, or piped stdin), convert it and exit.
if (process.argv.length) {
    printErrLn('Processing the rest of command-line arguments as text...');
    // NOTE(review): the remaining arguments are reversed before joining;
    // presumably parseArgs leaves them in reverse order — confirm.
    await processText(process.argv.reverse().join(' '));
}
else if (process.stdin.isTTY) {
    // Interactive mode: read until the first chunk that contains a newline.
    printErrLn('Enter the text');
    const chunks = [];
    let length = 0;
    for await (const chunk of process.stdin) {
        chunks.push(chunk);
        length += chunk.length;
        if (chunk.includes('\n'))
            break;
    }
    await processText(Buffer.concat(chunks, length).toString());
    process.stdout.write('\n');
}
else {
    // Piped mode: convert the input in batches of roughly MAX_BYTE_LENGTH
    // bytes so arbitrarily large input never has to be held in memory at once.
    printErrLn('Reading from stdin...');
    const MAX_BYTE_LENGTH = 64 * 1024 * 1024;
    // A streaming decoder holds back an incomplete multi-byte UTF-8 sequence
    // at the end of a chunk until the next chunk completes it, so characters
    // straddling a chunk or batch boundary are not corrupted. ignoreBOM keeps
    // a leading byte-order mark in the text, as Buffer#toString() does.
    const decoder = new TextDecoder('utf-8', { ignoreBOM: true });
    const processTextWithLogs = async (text) => {
        printErr(`Processing ${text.length
            .toString()
            .replace(/\B(?=(\d{3})+(?!\d))/g, ' ')} characters (${getPrettyByteSize(Buffer.byteLength(text))}) chunk... `);
        const startTime = performance.now();
        await processText(text);
        process.stderr.write(`done in ${((performance.now() - startTime) / 1000).toFixed(2)} seconds.\n`);
    };
    let pending = '';
    let pendingBytes = 0;
    for await (const chunk of process.stdin) {
        pending += decoder.decode(chunk, { stream: true });
        pendingBytes += chunk.byteLength;
        if (pendingBytes < MAX_BYTE_LENGTH)
            continue;
        // Batches are cut at the last newline so no line is split in two.
        const lastNewlineIndex = pending.lastIndexOf('\n');
        if (lastNewlineIndex === -1) {
            printErrLn('\nInput exceeded maximum size of ' +
                MAX_BYTE_LENGTH +
                ' bytes without a newline. Stopping.');
            process.exit(1);
        }
        let batch = pending.slice(0, lastNewlineIndex + 1);
        let carry = pending.slice(lastNewlineIndex + 1);
        // If the batch ends inside an unclosed '<...' tag, move the tag to
        // the next batch so it is converted in one piece.
        const lastOpeningTagIndex = batch.lastIndexOf('<');
        const lastClosingTagIndex = batch.lastIndexOf('>');
        if (lastOpeningTagIndex !== -1 &&
            lastOpeningTagIndex > lastClosingTagIndex) {
            carry = batch.slice(lastOpeningTagIndex) + carry;
            batch = batch.slice(0, lastOpeningTagIndex);
        }
        await processTextWithLogs(batch);
        pending = carry;
        // Count the carried-over remainder towards the next batch's size.
        pendingBytes = Buffer.byteLength(pending);
    }
    // Flush whatever the decoder is still holding, then convert the tail.
    pending += decoder.decode();
    await processTextWithLogs(pending);
}
// Exit explicitly: idle worker threads would otherwise keep the process alive.
process.exit(0);
|