@iteratech/iteraocr-cli 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -0
- package/bin/iteraocr.js +808 -0
- package/package.json +19 -0
package/README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# IteraOCR CLI
|
|
2
|
+
|
|
3
|
+
Terminal utility for folder-based OCR ingestion with IteraOCR.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
- Node.js 20+
|
|
8
|
+
- npm
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
Run directly from npm without a global install:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
npx @iteratech/iteraocr-cli start --config ./iteraocr-activation.json
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Optional global install:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
npm install -g @iteratech/iteraocr-cli
|
|
22
|
+
iteraocr start --config ./iteraocr-activation.json
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
iteraocr start [--config file] [--input dir] [--output dir] [--api-base-url url] [--api-key key] [--routing-preset fast|balanced|quality]
|
|
29
|
+
iteraocr init [--config file] [--input dir] [--output dir] [--api-base-url url] [--api-key key] [--routing-preset fast|balanced|quality]
|
|
30
|
+
iteraocr watch
|
|
31
|
+
iteraocr status
|
|
32
|
+
iteraocr reset [--yes]
|
|
33
|
+
iteraocr help
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Activation
|
|
37
|
+
|
|
38
|
+
The CLI does not contain customer-specific API addresses or keys.
|
|
39
|
+
|
|
40
|
+
Those values come from:
|
|
41
|
+
|
|
42
|
+
- `iteraocr-activation.json`, or
|
|
43
|
+
- explicit CLI arguments such as `--api-base-url` and `--api-key`
|
|
44
|
+
|
|
45
|
+
## Supported formats
|
|
46
|
+
|
|
47
|
+
- `jpg`
|
|
48
|
+
- `jpeg`
|
|
49
|
+
- `png`
|
|
50
|
+
- `tif`
|
|
51
|
+
- `tiff`
|
|
52
|
+
- `jp2`
|
package/bin/iteraocr.js
ADDED
|
@@ -0,0 +1,808 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const fsp = require('fs/promises');
|
|
5
|
+
const os = require('os');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const readline = require('readline/promises');
|
|
8
|
+
const { stdin, stdout } = require('process');
|
|
9
|
+
const crypto = require('crypto');
|
|
10
|
+
|
|
11
|
+
// Human-readable product name used in banners and help output.
const APP_NAME = 'IteraOCR CLI';

// Directory holding the CLI's own config and state; ITERAOCR_HOME overrides it.
const HOME_DIR = process.env.ITERAOCR_HOME || path.join(os.homedir(), '.config', 'iteraocr');
const CONFIG_PATH = path.join(HOME_DIR, 'config.json');
const STATE_PATH = path.join(HOME_DIR, 'state.json');

// Default working folders offered during first-time setup.
const DEFAULT_WORK_ROOT = path.join(os.homedir(), 'IteraOCR');
const DEFAULT_INPUT_DIR = path.join(DEFAULT_WORK_ROOT, 'input');
const DEFAULT_OUTPUT_DIR = path.join(DEFAULT_WORK_ROOT, 'output');

// Menu answers accepted by the folder prompt.
const CHOICE_DEFAULT = '1';
const CHOICE_CURRENT = '2';
const CHOICE_CUSTOM = '3';

// Batch sizes used while the queue is large; ordered from the highest
// threshold down because the first matching tier wins.
const BURST_TIERS = Object.freeze([
  Object.freeze({ minQueued: 1000, batchSize: 400 }),
  Object.freeze({ minQueued: 400, batchSize: 250 }),
  Object.freeze({ minQueued: 150, batchSize: 125 }),
]);

// Queue length at or above which another batch is submitted in the same cycle.
const BURST_REQUEUE_THRESHOLD = 150;

const ROUTING_PRESETS = new Set(['fast', 'balanced', 'quality']);

// Values merged underneath whatever the saved configuration provides.
const DEFAULTS = Object.freeze({
  header_name: 'x-api-key',
  upload_path: '/v1/utility/upload-submit',
  status_path_template: '/v1/gate/jobs/{job_id}',
  result_path_template: '/v1/gate/jobs/{job_id}/result',
  routing_preset: 'balanced',
  batch_size: 50,
  stable_delay_seconds: 10,
  scan_interval_seconds: 5,
  supported_extensions: Object.freeze(['jpg', 'jpeg', 'png', 'tif', 'tiff', 'jp2']),
});

// Dotted, lower-case extensions accepted by the folder scan. Derived from the
// single list in DEFAULTS so the two can never drift apart.
const SUPPORTED_EXTENSIONS = new Set(DEFAULTS.supported_extensions.map((ext) => `.${ext}`));
|
|
40
|
+
|
|
41
|
+
/**
 * Writes the command overview and the default folder locations to stdout.
 */
function printHelp() {
  // `start` and `init` accept the same set of setup options.
  const setupOptions = '[--config file] [--input dir] [--output dir] [--api-base-url url] [--api-key key] [--routing-preset fast|balanced|quality]';
  const lines = [
    `${APP_NAME}\n`,
    'Commands:',
    ` iteraocr start ${setupOptions}`,
    ` iteraocr init ${setupOptions}`,
    ' iteraocr watch',
    ' iteraocr status',
    ' iteraocr reset [--yes]',
    ' iteraocr help',
    '\nDefaults:',
    ` Input folder: ${DEFAULT_INPUT_DIR}`,
    ` Output folder: ${DEFAULT_OUTPUT_DIR}`,
    ' Or choose the current folder or a custom path during first-time setup.',
  ];
  for (const line of lines) {
    console.log(line);
  }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Parses CLI tokens into an options object.
 *
 * Supported forms: `--key value`, `--key=value`, and bare `--flag` (stored as
 * `true`). Everything that does not start with `--` is collected, in order,
 * in the `_` array.
 *
 * @param {string[]} argv - Tokens after the node binary and script path.
 * @returns {{ _: string[] } & Record<string, string | true>} Parsed options.
 */
function parseArgs(argv) {
  // Flags that never take a value, so `--yes reset` keeps `reset` positional
  // instead of storing it as the value of `--yes`.
  const booleanFlags = new Set(['yes', 'help']);
  // Keys that would overwrite the positional array or the prototype link.
  const reservedKeys = new Set(['_', '__proto__']);
  const args = { _: [] };

  for (let i = 0; i < argv.length; i += 1) {
    const token = argv[i];
    if (!token.startsWith('--')) {
      args._.push(token);
      continue;
    }

    const body = token.slice(2);
    const equalsAt = body.indexOf('=');
    const key = equalsAt > 0 ? body.slice(0, equalsAt) : body;
    if (reservedKeys.has(key)) {
      continue;
    }

    // `--key=value`: only the first `=` separates key from value.
    if (equalsAt > 0) {
      args[key] = body.slice(equalsAt + 1);
      continue;
    }

    const next = argv[i + 1];
    if (booleanFlags.has(key) || !next || next.startsWith('--')) {
      args[key] = true;
      continue;
    }

    args[key] = next;
    i += 1;
  }
  return args;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Creates the CLI home directory (and any missing parents) if it is absent.
 */
async function ensureHome() {
  const mkdirOptions = { recursive: true };
  await fsp.mkdir(HOME_DIR, mkdirOptions);
}
|
|
79
|
+
|
|
80
|
+
/**
 * Reads and parses a JSON file, returning `fallback` when that is not possible.
 *
 * A missing file is the expected "nothing saved yet" case and stays silent.
 * Any other failure (unreadable file, malformed JSON) still yields `fallback`
 * so callers keep working, but is reported on stderr because it usually means
 * saved configuration or history is being ignored.
 *
 * @param {string} filePath - File to read.
 * @param {*} fallback - Value returned when the file cannot be read or parsed.
 * @returns {Promise<*>} Parsed JSON value or `fallback`.
 */
async function readJson(filePath, fallback) {
  try {
    const raw = await fsp.readFile(filePath, 'utf-8');
    // Files saved by some Windows editors start with a BOM, which JSON.parse rejects.
    return JSON.parse(raw.replace(/^\uFEFF/, ''));
  } catch (error) {
    if (error?.code !== 'ENOENT') {
      console.error(`Warning: could not read ${filePath}: ${error?.message || error}`);
    }
    return fallback;
  }
}
|
|
88
|
+
|
|
89
|
+
/**
 * Serialises `payload` as pretty-printed JSON and stores it at `filePath`.
 *
 * The data is written to a sibling temporary file and then renamed over the
 * target, so an interrupted write cannot leave a truncated file behind. The
 * file is created with owner-only permissions because the configuration
 * contains the API key (the mode is ignored on platforms without POSIX modes).
 *
 * @param {string} filePath - Destination file; its directory is created if needed.
 * @param {*} payload - JSON-serialisable value.
 * @returns {Promise<void>}
 */
async function writeJson(filePath, payload) {
  await fsp.mkdir(path.dirname(filePath), { recursive: true });
  const tempPath = `${filePath}.${process.pid}.tmp`;
  const body = `${JSON.stringify(payload, null, 2)}\n`;
  try {
    await fsp.writeFile(tempPath, body, { encoding: 'utf-8', mode: 0o600 });
    await fsp.rename(tempPath, filePath);
  } catch (error) {
    // Best-effort removal of the partial file; the original error is what matters.
    await fsp.rm(tempPath, { force: true }).catch(() => {});
    throw error;
  }
}
|
|
93
|
+
|
|
94
|
+
/**
 * Loads the saved configuration layered over the built-in defaults.
 *
 * @returns {Promise<object>} Defaults overridden by the saved values.
 * @throws {Error} When no configuration could be read from CONFIG_PATH.
 */
async function loadConfig() {
  const saved = await readJson(CONFIG_PATH, null);
  if (saved) {
    return { ...DEFAULTS, ...saved };
  }
  throw new Error('Config is missing. Run `iteraocr init` first.');
}
|
|
101
|
+
|
|
102
|
+
/**
 * Loads the persisted processing state, always returning a usable object.
 *
 * A state file that is missing, unreadable, or whose top-level value is not a
 * plain object (for example `null` or an array) is replaced by an empty state
 * so later property access on `state.files` / `state.jobs` cannot throw.
 *
 * @returns {Promise<{ files: object, jobs: object }>} Normalised state.
 */
async function loadState() {
  const loaded = await readJson(STATE_PATH, null);
  const usable = loaded !== null && typeof loaded === 'object' && !Array.isArray(loaded);
  const state = usable ? loaded : { files: {}, jobs: {} };
  normalizeState(state);
  return state;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Persists the processing state to STATE_PATH.
 *
 * @param {object} state - State object to store.
 * @returns {Promise<void>}
 */
async function saveState(state) {
  const target = STATE_PATH;
  await writeJson(target, state);
}
|
|
111
|
+
|
|
112
|
+
/**
 * Repairs a loaded state object in place so the rest of the CLI can rely on
 * its shape.
 *
 * - `files` and `jobs` are reset to empty maps unless they are plain objects
 *   (arrays are rejected too, since entries are keyed by path / job id).
 * - File entries that are not objects are removed; keeping them would make
 *   later `entry.status` reads throw.
 * - Entries recorded as `failed` with a quota error are reclassified as
 *   `rejected`.
 *
 * @param {*} state - Value loaded from the state file; non-objects are ignored.
 * @returns {void}
 */
function normalizeState(state) {
  const isRecord = (value) => Boolean(value) && typeof value === 'object' && !Array.isArray(value);

  if (!isRecord(state)) {
    return;
  }
  if (!isRecord(state.files)) {
    state.files = {};
  }
  if (!isRecord(state.jobs)) {
    state.jobs = {};
  }

  for (const [key, entry] of Object.entries(state.files)) {
    if (!isRecord(entry)) {
      delete state.files[key];
      continue;
    }
    if (entry.status === 'failed' && isQuotaRejected(entry.error || '')) {
      entry.status = 'rejected';
    }
  }
}
|
|
131
|
+
|
|
132
|
+
/**
 * Tells whether an error detail carries the monthly-quota marker.
 *
 * @param {*} detail - Error text (any value; falsy values count as empty).
 * @returns {boolean} True when the text contains `monthly_limit_exceeded`.
 */
function isQuotaRejected(detail) {
  const quotaMarker = 'monthly_limit_exceeded';
  const text = String(detail || '');
  return text.indexOf(quotaMarker) !== -1;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Flags a file entry as rejected and records why.
 *
 * @param {object} entry - File entry, mutated in place.
 * @param {*} detail - Reason; falls back to a generic message when falsy.
 * @returns {void}
 */
function markEntryRejected(entry, detail) {
  const reason = String(detail || 'request rejected');
  Object.assign(entry, {
    status: 'rejected',
    error: reason,
    updatedAt: Date.now(),
  });
}
|
|
142
|
+
|
|
143
|
+
/**
 * Flags a file entry as failed and records why.
 *
 * @param {object} entry - File entry, mutated in place.
 * @param {*} detail - Reason; falls back to a generic message when falsy.
 * @returns {void}
 */
function markEntryFailed(entry, detail) {
  const reason = String(detail || 'ocr failed');
  Object.assign(entry, {
    status: 'failed',
    error: reason,
    updatedAt: Date.now(),
  });
}
|
|
148
|
+
|
|
149
|
+
/**
 * Canonicalises a routing preset name (trimmed, lower-cased).
 *
 * @param {*} value - Requested preset; falsy values mean `balanced`.
 * @returns {string} A member of ROUTING_PRESETS.
 * @throws {Error} When the value is not a known preset.
 */
function normalizeRoutingPreset(value) {
  const requested = value || 'balanced';
  const candidate = String(requested).trim().toLowerCase();
  if (ROUTING_PRESETS.has(candidate)) {
    return candidate;
  }
  throw new Error(`Invalid routing preset: ${value}`);
}
|
|
156
|
+
|
|
157
|
+
/**
 * Returns `currentValue` trimmed, or asks the user on the terminal when it is
 * falsy.
 *
 * @param {string} question - Prompt text shown before the colon.
 * @param {*} currentValue - Value already known, if any.
 * @returns {Promise<string>} Trimmed value or trimmed answer.
 */
async function promptIfMissing(question, currentValue) {
  if (!currentValue) {
    const prompt = readline.createInterface({ input: stdin, output: stdout });
    try {
      const typed = await prompt.question(`${question}: `);
      return typed.trim();
    } finally {
      // Always release stdin so the process can exit or prompt again.
      prompt.close();
    }
  }
  return String(currentValue).trim();
}
|
|
169
|
+
|
|
170
|
+
/**
 * Looks for activation files in the current directory and in the user's
 * Downloads folder.
 *
 * @returns {string[]} Existing candidate paths, de-duplicated, with the
 *   current directory checked before Downloads.
 */
function findActivationConfigCandidates() {
  const searchDirs = [process.cwd(), path.join(os.homedir(), 'Downloads')];
  const fileNames = ['iteraocr-activation.json', 'iteratech-activation.json'];
  const existing = new Set();
  searchDirs.forEach((dir) => {
    fileNames.forEach((fileName) => {
      const fullPath = path.join(dir, fileName);
      if (fs.existsSync(fullPath)) {
        existing.add(fullPath);
      }
    });
  });
  return [...existing];
}
|
|
184
|
+
|
|
185
|
+
/**
 * Resolves the activation file location, prompting when none was supplied.
 *
 * A supplied value gets the same `~` expansion as a typed answer, so a quoted
 * `--config "~/file.json"` resolves the same way as typing it at the prompt.
 *
 * @param {*} currentValue - Path given on the command line, or a falsy value.
 * @returns {Promise<string>} Absolute path to the activation file.
 */
async function promptActivationConfigPath(currentValue) {
  if (currentValue) {
    // Fall back to the raw text if expansion trims it down to nothing.
    return path.resolve(expandUserPath(currentValue) || String(currentValue));
  }
  const candidates = findActivationConfigCandidates();
  const suggested = candidates[0] || path.join(process.cwd(), 'iteraocr-activation.json');
  const rl = readline.createInterface({ input: stdin, output: stdout });
  try {
    const answer = (await rl.question(`Aktivační konfigurace [${prettyPath(suggested)}]: `)).trim();
    // An empty answer accepts the suggestion shown in brackets.
    return path.resolve(expandUserPath(answer || suggested));
  } finally {
    rl.close();
  }
}
|
|
199
|
+
|
|
200
|
+
/**
 * Trims a user-supplied path and expands a leading `~` to the home directory.
 *
 * `~` alone and `~/rest` are expanded on every platform; `~\rest` is also
 * expanded where the backslash is the native separator. Other forms such as
 * `~user/...` are returned unchanged.
 *
 * @param {*} value - Raw path text; falsy values are treated as empty.
 * @returns {string} Expanded path, or '' when the input is empty.
 */
function expandUserPath(value) {
  const trimmed = String(value || '').trim();
  if (!trimmed) {
    return '';
  }
  if (trimmed === '~') {
    return os.homedir();
  }
  const homePrefixes = path.sep === '/' ? ['~/'] : ['~/', `~${path.sep}`];
  if (homePrefixes.some((prefix) => trimmed.startsWith(prefix))) {
    return path.join(os.homedir(), trimmed.slice(2));
  }
  return trimmed;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Shortens a path for display by replacing the home directory with `~`.
 *
 * The home directory itself is shown as `~`, matching the `~` input that
 * `expandUserPath` accepts. The result is for display only.
 *
 * @param {*} value - Path to display; non-strings are stringified.
 * @returns {string} Abbreviated path, or the input text when it is not under home.
 */
function prettyPath(value) {
  const text = String(value ?? '');
  const home = os.homedir();
  if (text === home) {
    return '~';
  }
  if (text.startsWith(`${home}${path.sep}`)) {
    return `~${path.sep}${path.relative(home, text)}`;
  }
  return text;
}
|
|
221
|
+
|
|
222
|
+
/**
 * Determines a folder path, either from a known value or through an
 * interactive three-way menu (default folder, current folder, custom path).
 *
 * @param {string} question - Heading printed above the menu.
 * @param {*} currentValue - Already-known path; when truthy no prompt is shown.
 * @param {string} defaultValue - Folder offered as the default choice.
 * @returns {Promise<string>} Absolute folder path.
 */
async function promptPath(question, currentValue, defaultValue) {
  if (currentValue) {
    return path.resolve(expandUserPath(currentValue));
  }

  const preset = defaultValue;
  const menuPrompt = `Vyberte možnost [${CHOICE_DEFAULT}/${CHOICE_CURRENT}/${CHOICE_CUSTOM}] (Enter = ${CHOICE_DEFAULT}): `;
  const rl = readline.createInterface({ input: stdin, output: stdout });
  try {
    const currentDir = process.cwd();
    // Keep asking until a valid choice yields a path.
    for (;;) {
      console.log(`${question}:`);
      console.log(` ${CHOICE_DEFAULT}) Výchozí složka (${prettyPath(preset)})`);
      console.log(` ${CHOICE_CURRENT}) Aktuální složka (${prettyPath(currentDir)})`);
      console.log(` ${CHOICE_CUSTOM}) Vlastní cesta`);
      const typed = (await rl.question(menuPrompt)).trim();
      // Pressing Enter selects the default entry.
      const choice = typed || CHOICE_DEFAULT;

      switch (choice) {
        case CHOICE_DEFAULT:
          return path.resolve(preset);
        case CHOICE_CURRENT:
          return path.resolve(currentDir);
        case CHOICE_CUSTOM: {
          const customValue = expandUserPath(await rl.question('Zadejte cestu ke složce: '));
          if (customValue) {
            return path.resolve(customValue);
          }
          console.log('Vlastní cesta nesmí být prázdná.');
          break;
        }
        default:
          console.log(`Neplatná volba: ${choice}`);
      }
    }
  } finally {
    rl.close();
  }
}
|
|
257
|
+
|
|
258
|
+
/**
 * Reports whether any setup-related option was passed on the command line.
 *
 * @param {object} args - Parsed CLI options.
 * @returns {boolean} True when at least one setup option is truthy.
 */
function hasSetupOverrides(args) {
  const setupKeys = ['config', 'input', 'output', 'api-base-url', 'api-key', 'routing-preset'];
  return setupKeys.some((key) => Boolean(args[key]));
}
|
|
261
|
+
|
|
262
|
+
/**
 * Checks that a configuration carries every value needed to run the watcher.
 *
 * @param {*} config - Configuration object (may be null/undefined).
 * @returns {boolean} True when the service URL, API key and both folders are
 *   non-blank.
 */
function configIsComplete(config) {
  if (!config) {
    return false;
  }
  const requiredKeys = ['api_base_url', 'api_key', 'input_dir', 'output_dir'];
  return requiredKeys.every((key) => String(config[key] || '').trim() !== '');
}
|
|
271
|
+
|
|
272
|
+
/**
 * Ensures the input and output folders do not resolve to the same directory.
 *
 * Blank values are skipped: resolving an empty string yields the current
 * working directory, which would otherwise make two unset folders look
 * identical. On Windows the comparison ignores case because paths there are
 * case-insensitive.
 *
 * @param {*} inputDir - Input folder path.
 * @param {*} outputDir - Output folder path.
 * @returns {void}
 * @throws {Error} When both folders resolve to the same path.
 */
function assertDistinctFolders(inputDir, outputDir) {
  const rawInput = String(inputDir || '');
  const rawOutput = String(outputDir || '');
  if (!rawInput || !rawOutput) {
    return;
  }
  const resolvedInput = path.resolve(rawInput);
  const resolvedOutput = path.resolve(rawOutput);
  const samePath = process.platform === 'win32'
    ? resolvedInput.toLowerCase() === resolvedOutput.toLowerCase()
    : resolvedInput === resolvedOutput;
  if (samePath) {
    throw new Error('Input and output folders must be different.');
  }
}
|
|
279
|
+
|
|
280
|
+
/**
 * Creates an AbortController that aborts after `ms` milliseconds or as soon
 * as any of `extraSignals` aborts, whichever comes first.
 *
 * The cleanup listener is registered before the sources are linked, so a
 * source that is already aborted still clears the timer. The timer is
 * unref'ed so that it alone never keeps the process running. The returned
 * controller also carries a `dispose()` method that releases the timer and
 * the listeners on the source signals without aborting; call it once the
 * guarded operation has finished.
 *
 * @param {number} ms - Timeout in milliseconds.
 * @param {Array<AbortSignal | null | undefined>} [extraSignals=[]] - Signals
 *   whose abort is forwarded; falsy items are ignored.
 * @returns {AbortController} Controller whose `signal` is passed to the operation.
 */
function createTimeoutSignal(ms, extraSignals = []) {
  const controller = new AbortController();
  const cleanups = [];
  let timeout = null;

  const dispose = () => {
    clearTimeout(timeout);
    // splice(0) empties the list, which makes dispose safe to call repeatedly.
    for (const cleanup of cleanups.splice(0)) {
      cleanup();
    }
  };
  controller.signal.addEventListener('abort', dispose, { once: true });

  timeout = setTimeout(() => {
    controller.abort(new Error(`request timed out after ${ms}ms`));
  }, ms);
  timeout.unref?.();

  for (const source of extraSignals) {
    if (!source) {
      continue;
    }
    if (source.aborted) {
      controller.abort(source.reason || new Error('request aborted'));
      // Already aborted: attaching further listeners would only leak them.
      break;
    }
    const handler = () => controller.abort(source.reason || new Error('request aborted'));
    source.addEventListener('abort', handler, { once: true });
    cleanups.push(() => source.removeEventListener('abort', handler));
  }

  controller.dispose = dispose;
  return controller;
}
|
|
313
|
+
|
|
314
|
+
/**
 * Builds the initial configuration from defaults, an optional activation
 * file, and command-line options (options win over the file).
 *
 * - An activation file named with `--config <path>` must be readable and
 *   contain a JSON object; otherwise an error is raised instead of silently
 *   continuing without it.
 * - `--config` without a value, or no `--api-base-url`/`--api-key` at all,
 *   prompts for the activation file; a file that cannot be read is treated
 *   as empty.
 * - Options given without a value (parsed as `true`) are treated as absent
 *   for the URL, key and folder settings.
 *
 * @param {object} args - Parsed CLI options.
 * @returns {Promise<object>} Merged configuration (folders may still be blank).
 * @throws {Error} On an unreadable explicit activation file or an invalid
 *   routing preset.
 */
async function collectInitConfig(args) {
  const isRecord = (value) => Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  const textOption = (value) => (typeof value === 'string' ? value : '');
  const pick = (cliValue, fileValue) => String(textOption(cliValue) || fileValue || '').trim();

  const explicitConfig = textOption(args.config).trim();
  let fromConfig = {};
  if (explicitConfig) {
    const activationPath = await promptActivationConfigPath(explicitConfig);
    const loaded = await readJson(activationPath, null);
    if (!isRecord(loaded)) {
      throw new Error(`Activation config could not be read: ${activationPath}`);
    }
    fromConfig = loaded;
  } else if (args.config || (!args['api-base-url'] && !args['api-key'])) {
    const activationPath = await promptActivationConfigPath('');
    const loaded = await readJson(activationPath, {});
    fromConfig = isRecord(loaded) ? loaded : {};
  }

  return {
    ...DEFAULTS,
    ...fromConfig,
    // Trailing slashes are dropped so the stored URL has one canonical form.
    api_base_url: pick(args['api-base-url'], fromConfig.api_base_url).replace(/\/+$/, ''),
    api_key: pick(args['api-key'], fromConfig.api_key),
    routing_preset: normalizeRoutingPreset(args['routing-preset'] || fromConfig.routing_preset || DEFAULTS.routing_preset),
    input_dir: pick(args.input, fromConfig.input_dir),
    output_dir: pick(args.output, fromConfig.output_dir),
  };
}
|
|
334
|
+
|
|
335
|
+
/**
 * Runs interactive setup: gathers the service URL, API key and folders,
 * creates the folders and saves the configuration.
 *
 * The service URL is validated before the folder prompts, so a typo is
 * reported during setup rather than as a failure on every processing cycle.
 *
 * @param {object} args - Parsed CLI options.
 * @param {{ skipNextStepHint?: boolean }} [options={}] - Set
 *   `skipNextStepHint` to suppress the closing "run start" hint.
 * @returns {Promise<void>}
 * @throws {Error} When the URL or key is missing or invalid, or when the
 *   input and output folders are the same.
 */
async function cmdInit(args, options = {}) {
  const merged = await collectInitConfig(args);
  merged.api_base_url = (await promptIfMissing('OCR service base URL', merged.api_base_url)).replace(/\/+$/, '');
  merged.api_key = await promptIfMissing('API key', merged.api_key);

  if (!merged.api_base_url || !merged.api_key) {
    throw new Error('API base URL and API key are required.');
  }
  let parsedUrl;
  try {
    parsedUrl = new URL(merged.api_base_url);
  } catch {
    throw new Error(`API base URL is not a valid URL: ${merged.api_base_url}`);
  }
  if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
    throw new Error(`API base URL must use http or https: ${merged.api_base_url}`);
  }

  merged.input_dir = await promptPath('Input folder', merged.input_dir, DEFAULT_INPUT_DIR);
  merged.output_dir = await promptPath('Output folder', merged.output_dir, DEFAULT_OUTPUT_DIR);
  assertDistinctFolders(merged.input_dir, merged.output_dir);

  await fsp.mkdir(merged.input_dir, { recursive: true });
  await fsp.mkdir(merged.output_dir, { recursive: true });
  await writeJson(CONFIG_PATH, merged);
  console.log(`Saved configuration to ${CONFIG_PATH}`);
  console.log(`Input folder ready at ${merged.input_dir}`);
  console.log(`Output folder ready at ${merged.output_dir}`);
  if (!options.skipNextStepHint) {
    console.log('Run `iteraocr start` to begin automatic processing.');
  }
}
|
|
357
|
+
|
|
358
|
+
/**
 * Derives a short identifier from a file's relative path, size and
 * modification time.
 *
 * @param {string} relPath - Path relative to the input folder.
 * @param {{ size: number, mtimeMs: number }} stats - Size and mtime of the file.
 * @returns {string} First 16 hex characters of the SHA-1 of `path|size|mtime`.
 */
function fileFingerprint(relPath, stats) {
  const material = [relPath, stats.size, stats.mtimeMs].join('|');
  const hash = crypto.createHash('sha1');
  hash.update(material);
  return hash.digest('hex').substring(0, 16);
}
|
|
361
|
+
|
|
362
|
+
/**
 * Recursively lists supported image files below `rootDir`.
 *
 * Only regular files whose lower-cased extension is in SUPPORTED_EXTENSIONS
 * are returned. A subdirectory that disappears or cannot be read while the
 * walk is in progress is skipped so one bad folder does not abort the scan;
 * a failure on `rootDir` itself is still raised.
 *
 * @param {string} rootDir - Directory to scan.
 * @returns {Promise<string[]>} Matching file paths, sorted.
 * @throws {Error} When `rootDir` cannot be read.
 */
async function walkFiles(rootDir) {
  const skippableCodes = new Set(['ENOENT', 'ENOTDIR', 'EACCES', 'EPERM']);
  const out = [];
  const stack = [rootDir];
  while (stack.length) {
    const current = stack.pop();
    let entries;
    try {
      entries = await fsp.readdir(current, { withFileTypes: true });
    } catch (error) {
      if (current !== rootDir && skippableCodes.has(error?.code)) {
        continue;
      }
      throw error;
    }
    for (const entry of entries) {
      const fullPath = path.join(current, entry.name);
      if (entry.isDirectory()) {
        stack.push(fullPath);
        continue;
      }
      if (!entry.isFile()) {
        continue;
      }
      if (SUPPORTED_EXTENSIONS.has(path.extname(entry.name).toLowerCase())) {
        out.push(fullPath);
      }
    }
  }
  return out.sort();
}
|
|
385
|
+
|
|
386
|
+
/**
 * Synchronises `state.files` with the current contents of the input folder.
 *
 * - A new file is recorded as `discovering` with a stability deadline.
 * - A file whose size or mtime changed is reset to `discovering`, clearing
 *   its item/job ids, output path and error.
 * - A `discovering` file that stayed unchanged past its deadline becomes
 *   `queued`.
 * - A file that vanishes between listing and `stat` is skipped.
 * - Entries under the input folder that are still pending (`discovering` or
 *   `queued`) but no longer exist on disk are removed, so a deleted file is
 *   never handed to the uploader.
 *
 * @param {object} config - Uses `input_dir` and `stable_delay_seconds`.
 * @param {{ files: object }} state - Mutated in place.
 * @returns {Promise<void>}
 */
async function scanInput(config, state) {
  const now = Date.now();
  const inputDir = path.resolve(config.input_dir);
  const configuredDelay = Number(config.stable_delay_seconds);
  // A missing or invalid delay falls back to 10 s; NaN would keep files in
  // `discovering` forever.
  const delaySeconds = Number.isFinite(configuredDelay) && configuredDelay >= 0 ? configuredDelay : 10;
  const stableAfter = now + delaySeconds * 1000;

  const files = await walkFiles(inputDir);
  const present = new Set();
  for (const filePath of files) {
    let stats;
    try {
      stats = await fsp.stat(filePath);
    } catch (error) {
      if (error?.code === 'ENOENT') {
        continue;
      }
      throw error;
    }
    present.add(filePath);

    const relPath = path.relative(inputDir, filePath);
    const mtimeMs = Math.round(stats.mtimeMs);
    const current = state.files[filePath];
    if (!current) {
      state.files[filePath] = {
        relPath,
        size: stats.size,
        mtimeMs,
        status: 'discovering',
        firstSeenTs: now,
        stableAfterTs: stableAfter,
        updatedAt: now,
        itemId: null,
        jobId: null,
        outputPath: null,
        error: null,
      };
      continue;
    }

    const changed = current.size !== stats.size || current.mtimeMs !== mtimeMs;
    if (changed) {
      Object.assign(current, {
        relPath,
        size: stats.size,
        mtimeMs,
        status: 'discovering',
        stableAfterTs: stableAfter,
        updatedAt: now,
        itemId: null,
        jobId: null,
        outputPath: null,
        error: null,
      });
      continue;
    }

    if (current.status === 'discovering' && current.stableAfterTs <= now) {
      current.status = 'queued';
      current.updatedAt = now;
    }
  }

  // Forget pending entries whose file is gone. Only paths inside the current
  // input folder are considered; entries from elsewhere are left untouched.
  const inputPrefix = inputDir.endsWith(path.sep) ? inputDir : `${inputDir}${path.sep}`;
  for (const [filePath, entry] of Object.entries(state.files)) {
    if (present.has(filePath) || !filePath.startsWith(inputPrefix)) {
      continue;
    }
    if (entry && (entry.status === 'discovering' || entry.status === 'queued')) {
      delete state.files[filePath];
    }
  }
}
|
|
433
|
+
|
|
434
|
+
/**
 * Selects the queued files that have waited longest.
 *
 * @param {{ files: object }} state - Current state.
 * @param {number} batchSize - Maximum number of entries to return.
 * @returns {Array<[string, object]>} `[filePath, entry]` pairs ordered by
 *   ascending `updatedAt`.
 */
function queuedEntries(state, batchSize) {
  const waiting = [];
  for (const pair of Object.entries(state.files)) {
    if (pair[1].status === 'queued') {
      waiting.push(pair);
    }
  }
  waiting.sort(([, left], [, right]) => left.updatedAt - right.updatedAt);
  return waiting.slice(0, batchSize);
}
|
|
440
|
+
|
|
441
|
+
/**
 * Counts the files currently waiting for upload.
 *
 * @param {{ files: object }} state - Current state.
 * @returns {number} Number of entries with status `queued`.
 */
function queuedCount(state) {
  let waiting = 0;
  for (const item of Object.values(state.files)) {
    if (item.status === 'queued') {
      waiting += 1;
    }
  }
  return waiting;
}
|
|
444
|
+
|
|
445
|
+
/**
 * Picks the upload batch size for the current queue length.
 *
 * @param {object} config - Supplies `batch_size` for the non-burst case.
 * @param {number} queued - Number of queued files.
 * @returns {number} Batch size of the first burst tier whose threshold is
 *   met, otherwise the configured size (50 when unset).
 */
function selectBatchSize(config, queued) {
  const matchingTier = BURST_TIERS.find((tier) => queued >= tier.minQueued);
  if (matchingTier) {
    return matchingTier.batchSize;
  }
  return Number(config.batch_size || 50);
}
|
|
453
|
+
|
|
454
|
+
/**
 * Decides whether another batch should be submitted within the same cycle.
 *
 * @param {number} queued - Number of files still queued.
 * @returns {boolean} True when the queue is at or above the burst threshold.
 */
function shouldStayInBurstMode(queued) {
  const threshold = BURST_REQUEUE_THRESHOLD;
  return threshold <= queued;
}
|
|
457
|
+
|
|
458
|
+
/**
 * Returns the name of the HTTP header that carries the API key.
 *
 * @param {object} config - May provide `header_name`.
 * @returns {string} Configured header name, or `x-api-key` when unset.
 */
function authHeaderName(config) {
  const { header_name: configured } = config;
  return String(configured || 'x-api-key');
}
|
|
461
|
+
|
|
462
|
+
/**
 * Uploads up to `batchSize` queued files in one multipart request and records
 * the outcome on each file entry.
 *
 * - A file that cannot be read (for example deleted after it was queued) is
 *   marked `failed` and the rest of the batch is still sent; previously one
 *   such file threw and blocked every later cycle.
 * - On a non-2xx response every uploaded entry is marked `rejected` (quota
 *   error) or `failed`.
 * - On success, entries listed in `payload.rejected` are marked accordingly
 *   and entries listed in `payload.items` become `submitted` under
 *   `payload.job_id`. An accepted entry without a job id is marked `failed`,
 *   because it could never be polled.
 * - Entries the response does not mention stay `queued`.
 *
 * @param {object} config - Connection settings and routing preset.
 * @param {{ files: object, jobs: object }} state - Mutated in place.
 * @param {number} batchSize - Maximum number of files to send.
 * @param {AbortSignal} [abortSignal] - Cancels the request when aborted.
 * @returns {Promise<{ accepted: number, attempted: number, batchSize: number }>}
 * @throws {Error} On network failure, timeout, abort, or an unparsable response.
 */
async function submitBatch(config, state, batchSize, abortSignal) {
  const queued = queuedEntries(state, batchSize);
  if (!queued.length) {
    return { accepted: 0, attempted: 0, batchSize };
  }

  const form = new FormData();
  form.append('routing_preset', normalizeRoutingPreset(config.routing_preset));
  const uploaded = [];
  for (const [filePath, entry] of queued) {
    let buffer;
    try {
      buffer = await fsp.readFile(filePath);
    } catch (error) {
      markEntryFailed(entry, `read failed: ${error?.message || error}`);
      continue;
    }
    const itemId = `item-${fileFingerprint(entry.relPath, { size: entry.size, mtimeMs: entry.mtimeMs })}`;
    form.append('item_ids', itemId);
    form.append('relative_paths', entry.relPath);
    form.append('files', new Blob([buffer], { type: mimeType(filePath) }), path.basename(filePath));
    entry.itemId = itemId;
    uploaded.push(entry);
  }
  if (!uploaded.length) {
    return { accepted: 0, attempted: queued.length, batchSize };
  }

  const timeoutController = createTimeoutSignal(30000, [abortSignal]);
  let response;
  let failureDetail = '';
  let payload = {};
  try {
    response = await fetch(new URL(config.upload_path, config.api_base_url), {
      method: 'POST',
      headers: {
        [authHeaderName(config)]: config.api_key,
      },
      body: form,
      signal: timeoutController.signal,
    });
    if (response.ok) {
      payload = (await response.json()) || {};
    } else {
      failureDetail = await response.text();
    }
  } finally {
    // The body has been consumed (or the request failed), so aborting is a
    // no-op for the request; it releases the timeout timer and the listener
    // attached to abortSignal.
    timeoutController.abort();
  }

  if (!response.ok) {
    const message = `upload failed: http ${response.status} ${failureDetail.slice(0, 200)}`;
    const mark = isQuotaRejected(message) ? markEntryRejected : markEntryFailed;
    for (const entry of uploaded) {
      mark(entry, message);
    }
    return { accepted: 0, attempted: queued.length, batchSize };
  }

  const rejected = new Map((payload.rejected || []).map((item) => [String(item.item_id || ''), String(item.error || 'upload rejected')]));
  const acceptedItems = new Map((payload.items || []).map((item) => [String(item.item_id || ''), item]));
  const jobId = String(payload.job_id || '');

  for (const entry of uploaded) {
    if (rejected.has(entry.itemId)) {
      const message = rejected.get(entry.itemId);
      if (isQuotaRejected(message)) {
        markEntryRejected(entry, message);
      } else {
        markEntryFailed(entry, message);
      }
      continue;
    }
    if (!acceptedItems.has(entry.itemId)) {
      continue;
    }
    if (!jobId) {
      markEntryFailed(entry, 'upload response did not include a job id');
      continue;
    }
    entry.status = 'submitted';
    entry.jobId = jobId;
    entry.updatedAt = Date.now();
    state.jobs[jobId] = {
      status: 'submitted',
      updatedAt: Date.now(),
    };
  }

  return { accepted: Number(payload.accepted || 0), attempted: queued.length, batchSize };
}
|
|
530
|
+
|
|
531
|
+
/**
 * Submits queued files, sending several batches in a row while the queue
 * stays at or above the burst threshold.
 *
 * The loop stops when the queue is empty, when a batch attempted nothing,
 * when the queue drops below the burst threshold, or when a batch did not
 * shrink the queue at all. The last condition prevents an endless request
 * loop when the service answers successfully but neither accepts nor rejects
 * the uploaded items.
 *
 * @param {object} config - Connection settings and `batch_size`.
 * @param {{ files: object, jobs: object }} state - Mutated in place.
 * @param {AbortSignal} [abortSignal] - Cancels in-flight requests.
 * @returns {Promise<{ accepted: number, submittedJobs: number, lastBatchSize: number, burstMode: boolean }>}
 *   Totals for this cycle; `burstMode` is true when any batch exceeded the
 *   configured batch size.
 */
async function submitQueuedWork(config, state, abortSignal) {
  const configuredBatchSize = Number(config.batch_size || 50);
  let totalAccepted = 0;
  let submittedJobs = 0;
  let lastBatchSize = 0;
  let burstMode = false;

  while (true) {
    const queuedBefore = queuedCount(state);
    if (!queuedBefore) {
      break;
    }
    const batchSize = selectBatchSize(config, queuedBefore);
    lastBatchSize = batchSize;
    burstMode = burstMode || batchSize > configuredBatchSize;

    const result = await submitBatch(config, state, batchSize, abortSignal);
    totalAccepted += result.accepted;
    if (!result.attempted) {
      break;
    }
    if (result.accepted > 0) {
      submittedJobs += 1;
    }

    const queuedAfter = queuedCount(state);
    if (queuedAfter >= queuedBefore || !shouldStayInBurstMode(queuedAfter)) {
      break;
    }
  }

  return {
    accepted: totalAccepted,
    submittedJobs,
    lastBatchSize,
    burstMode,
  };
}
|
|
565
|
+
|
|
566
|
+
/**
 * Fetches results for every job that still has `submitted` files and writes
 * the outcome into the output folder.
 *
 * For a completed item the recognised text is written to
 * `<output>/<relPath with extension replaced by .txt>`; for an errored item
 * an `.error.json` file is written and the entry is marked `failed`.
 *
 * Submitted entries are indexed by job and item id up front instead of
 * searching all files for every returned item. Only entries that are still
 * `submitted` are updated, so results already written are not rewritten on
 * later polls of the same job.
 *
 * NOTE(review): two inputs that differ only by extension (`a.jpg`, `a.png`)
 * map to the same `.txt` path and overwrite each other — confirm whether the
 * output name should keep the original extension.
 *
 * @param {object} config - Connection settings, `result_path_template`, `output_dir`.
 * @param {{ files: object, jobs: object }} state - Mutated in place.
 * @param {AbortSignal} [abortSignal] - Cancels in-flight requests.
 * @returns {Promise<void>}
 * @throws {Error} On network failure, timeout, abort, or file-system errors.
 */
async function pollJobs(config, state, abortSignal) {
  const outputDir = path.resolve(config.output_dir);
  await fsp.mkdir(outputDir, { recursive: true });

  // jobId -> (itemId -> entry); Map keeps jobs in first-seen order.
  const pendingByJob = new Map();
  for (const entry of Object.values(state.files)) {
    if (!entry || entry.status !== 'submitted' || !entry.jobId) {
      continue;
    }
    let pendingItems = pendingByJob.get(entry.jobId);
    if (!pendingItems) {
      pendingItems = new Map();
      pendingByJob.set(entry.jobId, pendingItems);
    }
    if (!pendingItems.has(entry.itemId)) {
      pendingItems.set(entry.itemId, entry);
    }
  }

  for (const [jobId, pendingItems] of pendingByJob) {
    const url = new URL(config.result_path_template.replace('{job_id}', encodeURIComponent(jobId)), config.api_base_url);
    const timeoutController = createTimeoutSignal(30000, [abortSignal]);
    let response;
    let payload = {};
    try {
      response = await fetch(url, {
        headers: {
          [authHeaderName(config)]: config.api_key,
        },
        signal: timeoutController.signal,
      });
      if (response.ok) {
        payload = (await response.json()) || {};
      }
    } finally {
      // Releases the timeout timer and the abortSignal listener; for an
      // error response this also discards the unread body.
      timeoutController.abort();
    }

    if (!response.ok) {
      state.jobs[jobId] = { status: `poll_http_${response.status}`, updatedAt: Date.now() };
      continue;
    }
    state.jobs[jobId] = {
      status: String(payload.job?.status || 'unknown'),
      updatedAt: Date.now(),
    };

    for (const item of payload.items || []) {
      const entry = item ? pendingItems.get(item.item_id) : undefined;
      if (!entry) {
        continue;
      }
      if (item.status === 'completed') {
        const text = String((item.response_body || {}).text || '');
        const outputPath = path.join(outputDir, entry.relPath).replace(/\.[^.]+$/, '.txt');
        await fsp.mkdir(path.dirname(outputPath), { recursive: true });
        await fsp.writeFile(outputPath, text, 'utf-8');
        entry.status = 'completed';
        entry.outputPath = outputPath;
        entry.error = null;
        entry.updatedAt = Date.now();
      } else if (item.status === 'error') {
        const errorPath = path.join(outputDir, entry.relPath).replace(/\.[^.]+$/, '.error.json');
        await fsp.mkdir(path.dirname(errorPath), { recursive: true });
        await fsp.writeFile(errorPath, `${JSON.stringify({ error: item.error || 'ocr failed' }, null, 2)}\n`, 'utf-8');
        markEntryFailed(entry, item.error || 'ocr failed');
      }
    }
  }
}
|
|
613
|
+
|
|
614
|
+
/**
 * Maps a file name to the MIME type sent with the upload.
 *
 * @param {string} filePath - File path or name; the extension is matched
 *   case-insensitively.
 * @returns {string} Image MIME type, or `application/octet-stream` for an
 *   unknown extension.
 */
function mimeType(filePath) {
  const typesByExtension = new Map([
    ['.jpg', 'image/jpeg'],
    ['.jpeg', 'image/jpeg'],
    ['.png', 'image/png'],
    ['.tif', 'image/tiff'],
    ['.tiff', 'image/tiff'],
    ['.jp2', 'image/jp2'],
  ]);
  const extension = path.extname(filePath).toLowerCase();
  return typesByExtension.has(extension) ? typesByExtension.get(extension) : 'application/octet-stream';
}
|
|
622
|
+
|
|
623
|
+
/**
 * Counts file entries per processing status.
 *
 * Only the six known statuses are counted. The check uses own properties, so
 * a stored status such as `toString` or `constructor` cannot match an
 * inherited member and corrupt the result; malformed (non-object) entries
 * are skipped.
 *
 * @param {{ files: object }} state - Current state.
 * @returns {{ discovering: number, queued: number, submitted: number, completed: number, failed: number, rejected: number }}
 */
function summarize(state) {
  const summary = { discovering: 0, queued: 0, submitted: 0, completed: 0, failed: 0, rejected: 0 };
  for (const item of Object.values(state.files)) {
    const status = item?.status;
    if (typeof status === 'string' && Object.hasOwn(summary, status)) {
      summary[status] += 1;
    }
  }
  return summary;
}
|
|
632
|
+
|
|
633
|
+
/**
 * Prints a one-line per-status count of all known files.
 *
 * @param {{ files: object }} state - Current state.
 * @returns {void}
 */
function printSummary(state) {
  const { discovering, queued, submitted, completed, rejected, failed } = summarize(state);
  const line = `Nalezeno=${discovering} Čeká=${queued} Odesláno=${submitted} Hotovo=${completed} Odmítnuto=${rejected} Chyba=${failed}`;
  console.log(line);
}
|
|
637
|
+
|
|
638
|
+
/**
 * Prints the active configuration, the per-status file counts and the ten
 * most recently updated jobs.
 *
 * @returns {Promise<void>}
 * @throws {Error} When no configuration has been saved yet.
 */
async function cmdStatus() {
  const config = await loadConfig();
  const state = await loadState();

  const header = [
    `Nastavení: ${CONFIG_PATH}`,
    `OCR služba: ${config.api_base_url}`,
    `Režim: ${normalizeRoutingPreset(config.routing_preset)}`,
    `Vstup: ${config.input_dir}`,
    `Výstup: ${config.output_dir}`,
  ];
  for (const line of header) {
    console.log(line);
  }
  printSummary(state);

  // Newest jobs first, limited to ten.
  const byNewest = ([, left], [, right]) => right.updatedAt - left.updatedAt;
  const recentJobs = Object.entries(state.jobs).sort(byNewest).slice(0, 10);
  recentJobs.forEach(([jobId, job]) => {
    console.log(`job ${jobId} ${job.status}`);
  });
}
|
|
654
|
+
|
|
655
|
+
/**
 * Asks the user to confirm a reset unless confirmation is skipped.
 *
 * @param {boolean} skipPrompt - When true, no question is asked.
 * @returns {Promise<boolean>} True when the reset may proceed; the answers
 *   `y`, `yes`, `a` and `ano` (case-insensitive) confirm.
 */
async function confirmReset(skipPrompt) {
  if (skipPrompt) {
    return true;
  }
  const affirmative = new Set(['y', 'yes', 'a', 'ano']);
  const prompt = readline.createInterface({ input: stdin, output: stdout });
  try {
    const reply = await prompt.question('Opravdu chcete vymazat nastavení utility a místní historii? [y/N]: ');
    return affirmative.has(reply.trim().toLowerCase());
  } finally {
    prompt.close();
  }
}
|
|
667
|
+
|
|
668
|
+
/**
 * Deletes the saved configuration and local history after confirmation.
 *
 * Both files are always attempted. If either removal fails, an error is
 * raised instead of reporting success. A file that does not exist is not a
 * failure, because removal is forced.
 *
 * @param {object} args - Parsed CLI options; `yes` skips the confirmation.
 * @returns {Promise<void>}
 * @throws {Error} When a file could not be removed.
 */
async function cmdReset(args) {
  const confirmed = await confirmReset(Boolean(args.yes));
  if (!confirmed) {
    console.log('Reset zrušen.');
    return;
  }

  const targets = [CONFIG_PATH, STATE_PATH];
  const results = await Promise.allSettled(targets.map((target) => fsp.rm(target, { force: true })));
  const failures = results
    .map((result, index) => ({ result, target: targets[index] }))
    .filter(({ result }) => result.status === 'rejected')
    .map(({ result, target }) => `${target}: ${result.reason?.message || result.reason}`);
  if (failures.length) {
    throw new Error(`Could not remove ${failures.join('; ')}`);
  }

  console.log('Nastavení utility a místní historie byly smazány.');
  console.log('Vaše vstupní a výstupní soubory zůstaly beze změny.');
  console.log('Příště spusťte `iteraocr start` a nastavte utilitu znovu.');
}
|
|
682
|
+
|
|
683
|
+
/**
 * Runs the processing loop until SIGINT or SIGTERM: scan the input folder,
 * submit queued files, poll submitted jobs, save state, sleep, repeat.
 *
 * Differences from a plain loop that matter for correctness:
 * - State is saved after every cycle, including cycles that failed or were
 *   interrupted. Otherwise files marked `submitted` just before a failure
 *   would be forgotten and uploaded again on the next cycle.
 * - The pause between cycles ends as soon as a stop is requested.
 * - A missing or invalid `scan_interval_seconds` falls back to 5 seconds.
 *
 * A second SIGINT during shutdown exits immediately with status 130.
 *
 * @param {object} config - Complete configuration.
 * @returns {Promise<void>} Resolves after a requested stop.
 * @throws {Error} When the configuration is incomplete or both folders are
 *   the same.
 */
async function runWatch(config) {
  if (!configIsComplete(config)) {
    throw new Error('Configuration is incomplete. Run `iteraocr init` again.');
  }
  assertDistinctFolders(config.input_dir, config.output_dir);
  console.log(`${APP_NAME} běží`);
  console.log(`Vstupní složka: ${config.input_dir}`);
  console.log(`Výstupní složka: ${config.output_dir}`);
  console.log(`OCR služba: ${config.api_base_url}`);
  console.log(`Režim: ${normalizeRoutingPreset(config.routing_preset)}`);
  console.log('Ukončení: Ctrl+C');

  const configuredInterval = Number(config.scan_interval_seconds);
  const intervalMs = (Number.isFinite(configuredInterval) && configuredInterval > 0 ? configuredInterval : 5) * 1000;

  let stop = false;
  let shuttingDown = false;
  let currentController = null;
  let wakeSleep = null;

  const requestStop = (reason) => {
    stop = true;
    currentController?.abort(new Error(reason));
    wakeSleep?.();
  };
  const onSigint = () => {
    if (shuttingDown) {
      console.error('Vynucené ukončení.');
      process.exit(130);
    }
    shuttingDown = true;
    console.log('\nUkončuji…');
    requestStop('interrupted');
  };
  const onSigterm = () => requestStop('terminated');
  process.on('SIGINT', onSigint);
  process.on('SIGTERM', onSigterm);

  try {
    while (!stop) {
      const state = await loadState();
      const controller = new AbortController();
      currentController = controller;

      let queueBeforeSubmit = 0;
      let submitSummary = null;
      let cycleError = null;
      try {
        await scanInput(config, state);
        queueBeforeSubmit = queuedCount(state);
        submitSummary = await submitQueuedWork(config, state, controller.signal);
        await pollJobs(config, state, controller.signal);
      } catch (error) {
        cycleError = error;
      } finally {
        currentController = null;
      }

      // Persist whatever progress was made, even after a failure.
      try {
        await saveState(state);
      } catch (error) {
        cycleError = cycleError || error;
      }

      if (cycleError) {
        // An abort caused by a requested stop is expected and not reported.
        const interrupted = stop && controller.signal.aborted;
        if (!interrupted) {
          const base = String(cycleError?.message || cycleError);
          const detail = cycleError?.cause?.message ? `${base} (${cycleError.cause.message})` : base;
          console.error(`Zpracování se nepodařilo: ${detail}`);
        }
      } else {
        const summary = summarize(state);
        const stamp = new Date().toISOString();
        console.log(
          `[${stamp}] Přijato=${submitSummary.accepted} Úlohy=${submitSummary.submittedJobs} Dávka=${submitSummary.lastBatchSize || Number(config.batch_size || 50)} Režim=${submitSummary.burstMode ? 'rychlý' : 'běžný'} Čekalo=${queueBeforeSubmit} Čeká=${summary.queued} Odesláno=${summary.submitted} Hotovo=${summary.completed} Odmítnuto=${summary.rejected} Chyba=${summary.failed}`
        );
      }

      if (stop) {
        break;
      }
      await new Promise((resolve) => {
        const timer = setTimeout(resolve, intervalMs);
        wakeSleep = () => {
          clearTimeout(timer);
          resolve();
        };
      });
      wakeSleep = null;
    }
  } finally {
    process.off('SIGINT', onSigint);
    process.off('SIGTERM', onSigterm);
  }
  console.log('Ukončeno.');
}
|
|
747
|
+
|
|
748
|
+
/**
 * Starts the processing loop with the saved configuration.
 *
 * @returns {Promise<void>}
 * @throws {Error} When no configuration has been saved yet.
 */
async function cmdWatch() {
  const savedConfig = await loadConfig();
  await runWatch(savedConfig);
}
|
|
752
|
+
|
|
753
|
+
/**
 * Starts processing, running setup first when it is needed.
 *
 * Setup runs when any setup option was passed, or when no complete
 * configuration can be loaded.
 *
 * @param {object} args - Parsed CLI options.
 * @returns {Promise<void>}
 */
async function cmdStart(args) {
  // Resolves to the saved configuration, or null when it is missing or incomplete.
  const loadUsableConfig = async () => {
    try {
      const saved = await loadConfig();
      return configIsComplete(saved) ? saved : null;
    } catch {
      return null;
    }
  };

  let config = hasSetupOverrides(args) ? null : await loadUsableConfig();
  if (config === null) {
    console.log('První spuštění utility.');
    console.log('Nejprve nastavíme aktivační soubor, vstupní složku a výstupní složku.\n');
    await cmdInit(args, { skipNextStepHint: true });
    config = await loadConfig();
  }
  await runWatch(config);
}
|
|
772
|
+
|
|
773
|
+
/**
 * CLI entry point: parses the arguments, dispatches to the requested command
 * and reports failures on stderr with exit code 1.
 *
 * Help is shown for `help`, for `-h` anywhere among the positional tokens,
 * and for the `--help` option. The option has to be checked separately
 * because the argument parser stores it as a flag rather than as the
 * command, which previously made `iteraocr --help` start the watcher.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const args = parseArgs(process.argv.slice(2));
    const command = args._[0] || 'start';

    if (args.help || command === 'help' || command === '--help' || args._.includes('-h')) {
      printHelp();
      return;
    }

    const commands = {
      init: () => cmdInit(args),
      start: () => cmdStart(args),
      watch: () => cmdWatch(),
      status: () => cmdStatus(),
      reset: () => cmdReset(args),
    };
    if (!Object.hasOwn(commands, command)) {
      throw new Error(`Unknown command: ${command}`);
    }
    await commands[command]();
  } catch (error) {
    console.error(String(error?.message || error));
    process.exitCode = 1;
  }
}

main();
|
package/package.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@iteratech/iteraocr-cli",
|
|
3
|
+
"version": "0.1.7",
|
|
4
|
+
"description": "Terminal utility for IteraOCR folder ingestion",
|
|
5
|
+
"publishConfig": {
|
|
6
|
+
"access": "public"
|
|
7
|
+
},
|
|
8
|
+
"bin": {
|
|
9
|
+
"iteraocr": "bin/iteraocr.js"
|
|
10
|
+
},
|
|
11
|
+
"type": "commonjs",
|
|
12
|
+
"engines": {
|
|
13
|
+
"node": ">=20"
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"bin"
|
|
17
|
+
],
|
|
18
|
+
"license": "UNLICENSED"
|
|
19
|
+
}
|