taraskevizer 10.4.9 → 10.4.10

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (2)
  1. package/dist/bin/index.js +118 -92
  2. package/package.json +1 -1
package/dist/bin/index.js CHANGED
@@ -3,20 +3,29 @@ import { cpus } from 'node:os';
3
3
  import { Worker } from 'node:worker_threads';
4
4
  import { pipelines } from '../index.js';
5
5
  import { parseArgs } from './parse-args.js';
6
- const printWithPrefix = (msg) => {
7
- process.stdout.write("[taraskevizer]" + ' ' + msg + '\n');
6
+ const getPrint = (stream) => (msg) => {
7
+ stream.write("[taraskevizer]" + ' ' + msg);
8
8
  };
9
+ const getPrintLn = (printFn) => (msg) => {
10
+ printFn(msg + '\n');
11
+ };
12
+ const print = getPrint(process.stdout);
13
+ const printErr = getPrint(process.stderr);
14
+ const printLn = getPrintLn(print);
15
+ const printErrLn = getPrintLn(printErr);
16
+ // AI-written, may need improvements
9
17
  const splitIntoChunks = (text, n) => {
10
18
  const size = Math.ceil(text.length / n);
11
19
  const chunks = [];
12
20
  let start = 0;
13
21
  for (let i = 0; i < n; i++) {
14
22
  let end = start + size;
23
+ // Adjust end to the nearest newline character
15
24
  if (end < text.length) {
16
25
  const forward = text.indexOf('\n', end);
17
26
  const backward = text.lastIndexOf('\n', end);
18
27
  if (forward === -1 && backward === -1) {
19
- // no-op, use raw end
28
+ // no-op
20
29
  }
21
30
  else if (forward === -1) {
22
31
  end = backward;
@@ -28,6 +37,17 @@ const splitIntoChunks = (text, n) => {
28
37
  end = forward - end < end - backward ? forward : backward;
29
38
  }
30
39
  }
40
+ // Ensure we don't split inside HTML tags or special syntax tags
41
+ if (end < text.length) {
42
+ const lastOpen = text.lastIndexOf('<', end);
43
+ const lastClose = text.lastIndexOf('>', end);
44
+ if (lastOpen > lastClose) {
45
+ const nextClose = text.indexOf('>', end);
46
+ if (nextClose !== -1) {
47
+ end = nextClose + 1;
48
+ }
49
+ }
50
+ }
31
51
  if (end > text.length)
32
52
  end = text.length;
33
53
  chunks.push(text.slice(start, end));
@@ -37,122 +57,128 @@ const splitIntoChunks = (text, n) => {
37
57
  }
38
58
  return chunks;
39
59
  };
60
+ const getPrettyByteSize = (n) => n < 1024
61
+ ? `${n} B`
62
+ : n < 1024 * 1024
63
+ ? `${(n / 1024).toFixed(2)} KB`
64
+ : `${(n / (1024 * 1024)).toFixed(2)} MB`;
40
65
  process.argv.splice(0, 2);
41
66
  const firstArg = process.argv[0];
42
67
  if (firstArg) {
43
68
  if (firstArg === '-v' || firstArg === '--version') {
44
- printWithPrefix("10.4.9");
69
+ printLn("10.4.10");
45
70
  process.exit(0);
46
71
  }
47
72
  if (firstArg === '-h' || firstArg === '--help') {
48
- printWithPrefix(`Usage: tarask [options] text
49
- If text is not passed, interactive mode is enabled
50
-
51
- EXAMPLES
52
-
53
- Convert and latinize a word
54
- tarask --latin 'планета'
55
- Prints "planeta"
56
-
57
- Read from one file and write converted text to another
58
- tarask < ./cyr-text.txt > ./lat-text.txt
59
-
60
- Enter interactive mode
61
- tarask
62
- Prints "[taraskevizer] Enter the text:" and waits until you enter a new line
63
-
64
- OPTIONS
65
-
66
- General:
67
- -h --help
68
- -v --version
69
-
70
- Alphabet:
71
- -l --latin
72
- -lj --latin-ji
73
- -a --arabic
74
-
75
- When to replace і(i) by й(j) after vowels:
76
- -jr --jrandom
77
- -ja --jalways
78
-
79
- Replace ґ(g) by г(h) in cyrillic alphabet:
80
- --h
81
-
82
- Variations:
83
- -nv --no-variations
84
- -fv --first-variation
85
-
86
- Mode (only one can be used):
87
- -html --html
88
- -abc --alphabet-only
89
-
90
- Other:
91
- -nec --not-escape-caps
92
- -nc --no-color
93
- -st --single-thread
94
- `);
73
+ printLn("Usage: \u001b[34mtarask\u001b[0m [options] text\nIf text is not passed, interactive mode is enabled\n\nEXAMPLES\n\nConvert and latinize a word\n\t\u001b[34mtarask\u001b[0m \u001b[35m--latin\u001b[0m 'планета'\nPrints \"p\u001b[32ml\u001b[0ma\u001b[32mne\u001b[0mta\"\n\nRead from one file and write converted text to another\n\t\u001b[34mtarask\u001b[0m < ./cyr-text.txt > ./lat-text.txt\n\nEnter interactive mode\n\t\u001b[34mtarask\u001b[0m\nPrints \"\u001b[34m[taraskevizer]\u001b[0m Enter the text:\" and waits until you enter a new line\n\nOPTIONS\n\n\u001b[33mGeneral\u001b[0m:\n \u001b[35m-h\u001b[0m \u001b[35m--help\u001b[0m\n \u001b[35m-v\u001b[0m \u001b[35m--version\u001b[0m\n\n\u001b[33mAlphabet\u001b[0m:\n \u001b[35m-l\u001b[0m \u001b[35m--latin\u001b[0m\n \u001b[35m-lj\u001b[0m \u001b[35m--latin-ji\u001b[0m\n \u001b[35m-a\u001b[0m \u001b[35m--arabic\u001b[0m\n\n\u001b[33mWhen to replace і(i) by й(j) after vowels\u001b[0m:\n \u001b[35m-jr\u001b[0m \u001b[35m--jrandom\u001b[0m\n \u001b[35m-ja\u001b[0m \u001b[35m--jalways\u001b[0m\n\n\u001b[33mReplace ґ(g) by г(h) in cyrillic alphabet\u001b[0m:\n \u001b[35m--h\u001b[0m\n\n\u001b[33mVariations\u001b[0m:\n \u001b[35m-nv\u001b[0m \u001b[35m--no-variations\u001b[0m\n \u001b[35m-fv\u001b[0m \u001b[35m--first-variation\u001b[0m\n\n\u001b[33mMode (only one can be used)\u001b[0m:\n \u001b[35m-html\u001b[0m \u001b[35m--html\u001b[0m\n \u001b[35m-abc\u001b[0m \u001b[35m--alphabet-only\u001b[0m\n\n\u001b[33mOther\u001b[0m:\n \u001b[35m-nec\u001b[0m \u001b[35m--not-escape-caps\u001b[0m\n \u001b[35m-nc\u001b[0m \u001b[35m--no-color\u001b[0m\n \u001b[35m-st\u001b[0m \u001b[35m--single-thread\u001b[0m\n");
95
74
  process.exit(0);
96
75
  }
97
76
  }
77
+ const argv = process.argv.slice();
98
78
  const { mode, cfg, doForceSingleThread } = parseArgs(process.argv);
99
- let text = '';
79
+ const workers = {
80
+ size: cpus()?.length || 1,
81
+ workers: null,
82
+ init() {
83
+ if (this.workers)
84
+ return;
85
+ process.stderr.write(`(Initializing ${this.size} workers... `);
86
+ this.workers = Array.from({ length: this.size }, () => new Worker("const { parentPort, workerData } = require('node:worker_threads');\nconst { pipelines } = require('./dist');\nconst { parseArgs } = require('./dist/bin/parse-args');\n\nconst { mode, cfg } = parseArgs(workerData.argv);\n\nparentPort.on('message', (chunk) => {\n\tparentPort.postMessage(pipelines[mode](chunk, cfg));\n});\n", {
87
+ eval: true,
88
+ workerData: { argv },
89
+ }));
90
+ process.stderr.write('done.) ');
91
+ },
92
+ process(chunks) {
93
+ return Promise.all(chunks.map((chunk, i) => new Promise((resolve, reject) => {
94
+ const worker = this.workers[i % this.size];
95
+ worker.postMessage(chunk);
96
+ worker.once('message', resolve);
97
+ worker.once('error', reject);
98
+ })));
99
+ },
100
+ };
101
+ const processText = async (text) => {
102
+ let result = '';
103
+ if (!doForceSingleThread && workers.size > 1 && text.length > 50_000) {
104
+ workers.init();
105
+ const chunks = splitIntoChunks(text, workers.size);
106
+ const results = await workers.process(chunks);
107
+ result = results.join('');
108
+ }
109
+ else {
110
+ result = pipelines[mode](text, cfg);
111
+ }
112
+ if (!process.stdout.write(result)) {
113
+ process.stdout.once('drain', () => {
114
+ printErrLn('Drain event fired, exiting.');
115
+ process.exit(0);
116
+ });
117
+ }
118
+ };
100
119
  if (process.argv.length) {
101
- text = process.argv.reverse().join(' ');
120
+ printErrLn('Processing the rest of command-line arguments as text...');
121
+ await processText(process.argv.reverse().join(' '));
102
122
  }
103
123
  else {
104
124
  const chunks = [];
125
+ let value = '';
105
126
  let length = 0;
127
+ let byteLength = 0;
128
+ const MAX_BYTE_LENGTH = 64 * 1024 * 1024;
129
+ const getChunksString = () => Buffer.concat(chunks, length).toString();
106
130
  if (process.stdin.isTTY) {
107
- printWithPrefix('Enter the text');
131
+ printErrLn('Enter the text');
108
132
  for await (const chunk of process.stdin) {
109
133
  chunks.push(chunk);
110
134
  length += chunk.length;
111
135
  if (chunk.includes('\n'))
112
136
  break;
113
137
  }
138
+ await processText(getChunksString());
139
+ process.stdout.write('\n');
114
140
  }
115
141
  else {
142
+ printErrLn('Reading from stdin...');
143
+ const processTextWithLogs = async (value) => {
144
+ printErr(`Processing ${value.length
145
+ .toString()
146
+ .replace(/\B(?=(\d{3})+(?!\d))/g, ' ')} characters (${getPrettyByteSize(Buffer.byteLength(value))}) chunk... `);
147
+ const startTime = performance.now();
148
+ await processText(value);
149
+ process.stderr.write(`done in ${((performance.now() - startTime) / 1000).toFixed(2)} seconds.\n`);
150
+ };
116
151
  for await (const chunk of process.stdin) {
117
- chunks.push(chunk);
152
+ byteLength += chunk.byteLength;
153
+ if (byteLength >= MAX_BYTE_LENGTH) {
154
+ value += getChunksString();
155
+ const lastNewlineIndex = value.lastIndexOf('\n');
156
+ if (lastNewlineIndex === -1) {
157
+ printErrLn('\nInput exceeded maximum size of ' +
158
+ MAX_BYTE_LENGTH +
159
+ ' bytes without a newline. Stopping.');
160
+ process.exit(1);
161
+ }
162
+ let valueForNextBatch = value.slice(lastNewlineIndex + 1);
163
+ value = value.slice(0, lastNewlineIndex + 1);
164
+ const lastOpeningTagIndex = value.lastIndexOf('<');
165
+ const lastClosingTagIndex = value.lastIndexOf('>');
166
+ if (lastOpeningTagIndex !== -1 &&
167
+ lastOpeningTagIndex > lastClosingTagIndex) {
168
+ const incompleteTag = value.slice(lastOpeningTagIndex);
169
+ value = value.slice(0, lastOpeningTagIndex);
170
+ valueForNextBatch = incompleteTag + valueForNextBatch;
171
+ }
172
+ await processTextWithLogs(value);
173
+ value = valueForNextBatch;
174
+ byteLength = chunk.byteLength;
175
+ length = 0;
176
+ chunks.length = 0;
177
+ }
118
178
  length += chunk.length;
179
+ chunks.push(chunk);
119
180
  }
181
+ await processTextWithLogs(value + getChunksString());
120
182
  }
121
- text = Buffer.concat(chunks, length).toString();
122
- }
123
- let result = '';
124
- if (text.length > 50_000 && !doForceSingleThread) {
125
- const cpuCount = Math.max(1, cpus()?.length || 1);
126
- const chunks = splitIntoChunks(text, cpuCount);
127
- const WORKER_CODE = `
128
- const { parentPort, workerData } = require('node:worker_threads');
129
- const { pipelines } = require('./dist');
130
- const { parseArgs } = require('./dist/bin/parse-args');
131
- const { argv, chunk } = workerData;
132
- const { mode, cfg } = parseArgs(argv);
133
- parentPort.postMessage(pipelines[mode](chunk, cfg));`;
134
- const results = await Promise.all(chunks.map((chunk) => new Promise((resolve, reject) => {
135
- const worker = new Worker(WORKER_CODE, {
136
- eval: true,
137
- workerData: { argv: process.argv, chunk },
138
- });
139
- worker.on('message', resolve);
140
- worker.on('error', reject);
141
- worker.on('exit', (code) => {
142
- if (code !== 0)
143
- reject(new Error('Worker exit code ' + code));
144
- });
145
- })));
146
- result = results.join('\n') + '\n';
147
- }
148
- else {
149
- result = pipelines[mode](text, cfg) + '\n';
150
- }
151
- if (process.stdout.write(result)) {
152
- process.exit(0);
153
- }
154
- else {
155
- process.stdout.once('drain', () => {
156
- process.exit(0);
157
- });
158
183
  }
184
+ process.exit(0);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "taraskevizer",
3
- "version": "10.4.9",
3
+ "version": "10.4.10",
4
4
  "author": "GooseOb",
5
5
  "repository": {
6
6
  "type": "git",