@woladi/sortai 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Adrian Wołczuk
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,193 @@
1
+ # sortai
2
+
3
+ > macOS CLI that walks a folder, OCRs every file with Apple Vision, and writes inferred Finder tags + comments — using a local Ollama model by default, or a cloud LLM with optional PII pseudonymisation.
4
+
5
+ This is the TypeScript successor to the legacy Python `tagger_*.py` scripts. Native Swift OCR is now delegated to the [`macos-vision`](https://www.npmjs.com/package/macos-vision) package, so there is **no Python, no `swiftc`, no manual setup** — just `npx sortai`.
6
+
7
+ ## Requirements
8
+
9
+ - macOS 12+
10
+ - Node.js 20+
11
+ - Xcode Command Line Tools (`xcode-select --install`) — needed by `macos-vision` to build its Swift binary at install time
12
+ - One of:
13
+ - [Ollama](https://ollama.com) running locally (default) — keeps everything offline
14
+ - Anthropic or OpenAI API key — for cloud LLM with optional `--mask`
15
+
16
+ ## Quick start
17
+
18
+ ```bash
19
+ # First run creates ~/.config/sortai/config.json with the default taxonomy
20
+ npx sortai
21
+
22
+ # Dry-run on the Desktop with local Ollama (default mistral-nemo)
23
+ npx sortai ~/Desktop --dry-run
24
+
25
+ # Actually write tags & comments
26
+ npx sortai ~/Desktop
27
+ ```
28
+
29
+ > The first invocation only writes the config and exits. Edit the file to fit your taxonomy, then re-run.
30
+
31
+ ### Cloud mode (optional)
32
+
33
+ ```bash
34
+ # Anthropic Claude, with PII masked locally via pseudonym-mcp before the upstream call
35
+ npx sortai ~/Desktop --cloud anthropic --mask --api-key sk-ant-...
36
+
37
+ # OpenAI without masking (everything in the OCR'd text goes to the provider)
38
+ OPENAI_API_KEY=sk-... npx sortai ~/Desktop --cloud openai
39
+ ```
40
+
41
+ When `--mask` is set, `sortai` spawns [`pseudonym-mcp`](https://www.npmjs.com/package/pseudonym-mcp) over stdio, runs `mask_text` on the OCR'd text, sends the masked version to the cloud LLM, then `unmask_text` on the returned comment. Tags are taxonomy-bound and never round-trip through the cloud as user values.
42
+
43
+ > **Pseudonymisation is a defence-in-depth control, not a compliance silver bullet.** Pseudonymised data is still personal data under GDPR Art. 4(5). Read the `pseudonym-mcp` README for the honest limitations.
44
+
45
+ ## How it works
46
+
47
+ ```
48
+ folder (recursive walk, .dotfiles + excluded dirs skipped)
49
+
50
+
51
+ dedup.ts: SHA256 over file bytes ← byte-identical groups → #Duplikat preTag
52
+
53
+ ▼ for each file
54
+ macos-vision: ocr(path, { startPage, maxPages })
55
+ ← Apple Vision OCR (PDF auto-rasterised, page-bounded)
56
+
57
+
58
+ pretag.ts: PATH_TAG_RULES ← regex rules from config
59
+
60
+ ▼ ≥4 pre-tags AND no OCR text → skip LLM (fast path)
61
+ LLM tag/comment inference:
62
+ ├── default: local Ollama (mistral-nemo) — fully offline
63
+ └── --cloud anthropic|openai:
64
+ ├── --mask → pseudonym-mcp.mask_text(ocr)
65
+ ├── cloud LLM gets masked OCR text
66
+ └── --mask → pseudonym-mcp.unmask_text(comment)
67
+
68
+ ▼ strict-evidence validation, contextual guards (#CV vs financial, noOcr → no strict)
69
+ │ per-file 180s watchdog → fallback if a single call hangs
70
+ macos.ts: xattr -wx + binary plist
71
+ ├── com.apple.metadata:_kMDItemUserTags (Finder tags)
72
+ └── com.apple.metadata:kMDItemFinderComment (Finder comment)
73
+ ├── mdimport <file> (Spotlight reindex, fire-and-forget)
74
+ ```
75
+
76
+ > Why not `osascript` + Finder `set tags`? It returns `-10006` on macOS 26+ (Tahoe). `xattr` + a binary plist is the same path the Python tagger used and works on every macOS version.
77
+
78
+ ## CLI flags
79
+
80
+ | Flag | Default | Description |
81
+ |------|---------|-------------|
82
+ | `<folder>` | from config | Folder to scan recursively |
83
+ | `--config <path>` | `~/.config/sortai/config.json` | Alternative config file |
84
+ | `--dry-run` | off | Print results without writing tags/comments |
85
+ | `--model <name>` | `mistral-nemo` (Ollama); `claude-sonnet-4-6` / `gpt-4o-mini` with `--cloud anthropic` / `--cloud openai` | LLM model name |
86
+ | `--ollama-url <url>` | `http://localhost:11434` | Ollama server |
87
+ | `--cloud anthropic\|openai` | — | Switch to a cloud LLM |
88
+ | `--api-key <key>` | env | API key for the cloud provider |
89
+ | `--mask` | off | Pseudonymise OCR via pseudonym-mcp (only with `--cloud`) |
90
+ | `--lang en\|pl` | `pl` | Language for pseudonym-mcp regex rules |
91
+ | `--exclude <names>` | from config | Comma-separated folder names to skip |
92
+ | `--limit <n>` | — | Process at most N files |
93
+ | `--skip-tagged` | off | Skip files that already carry `cfg.tags.autoTag` (`#AI_Sorted` by default) |
94
+ | `--no-dedup` | off | Skip SHA256 hashing pre-pass (no hash-based `#Duplikat`) |
95
+ | `--verbose` | off | Extra logs |
96
+
97
+ Environment variables: `SORTAI_API_KEY`, `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`.
98
+
99
+ ## Configuration
100
+
101
+ The default taxonomy that ships in `defaults.ts` is intentionally generic (`#Bank`, `#Faktura`, `#Umowa`, `#CV`, `#Wniosek`, `#Screenshot`, …) and is meant as a starting point. **Edit `~/.config/sortai/config.json` after the first run** to match your own categories — vendors, projects, clients, recurring matters.
102
+
103
+ The config file is plain JSON. Sections:
104
+
105
+ ```json
106
+ {
107
+ "scan": {
108
+ "folder": "~/Desktop",
109
+ "excludeFolders": ["node_modules", ".git", ".cache"],
110
+ "skipExtensions": [".ds_store", ".sig", ".localized", ".tmp", ".lock", ".pyc"],
111
+ "ocrExtensions": [".pdf", ".png", ".jpg", ".jpeg", ".webp", ".heic"],
112
+ "videoExtensions": [".mov", ".mp4", ".m4v"]
113
+ },
114
+ "ocr": { "maxChars": 4000, "llmMaxChars": 1500, "startPage": 1, "maxPages": 2 },
115
+ "llm": {
116
+ "provider": "ollama",
117
+ "model": "mistral-nemo",
118
+ "temperature": 0.15,
119
+ "numPredict": 300,
120
+ "ollamaUrl": "http://localhost:11434"
121
+ },
122
+ "mask": { "enabled": false, "lang": "pl" },
123
+ "dedup": { "enabled": true, "maxFileSizeMB": 200 },
124
+ "tags": {
125
+ "allowed": ["#Bank", "#Faktura", "#Umowa", "#CV", "#Wniosek", "#AI_Sorted"],
126
+ "strict": ["#Bank", "#Faktura", "#KartaKredytowa", "#Kredyt", "#RODO"],
127
+ "aliases": { "#Invoice": "#Faktura", "#Mortgage": "#Kredyt", "#GDPR": "#RODO" },
128
+ "strictEvidence": {
129
+ "#Bank": ["bank", "iban", "rachunek bankowy"],
130
+ "#Faktura": ["faktura", "invoice", " vat "]
131
+ },
132
+ "pathRules": [
133
+ { "pattern": "\\bbank\\b|iban|rachunek", "flags": "i", "tags": ["#Bank"] },
134
+ { "pattern": "faktura|invoice", "flags": "i", "tags": ["#Faktura"] }
135
+ ],
136
+ "autoTag": "#AI_Sorted"
137
+ },
138
+ "context": "1-2 sentence description of yourself and ongoing matters — used by the LLM as background. Example: 'Self-employed designer in Warsaw, clients AcmeCorp + BetaInc.'"
139
+ }
140
+ ```
141
+
142
+ - `scan.folder` / `scan.excludeFolders` / `scan.skipExtensions` — what to walk, skip, and ignore by extension.
143
+ - `ocr.maxChars` / `ocr.llmMaxChars` — cap on OCR text fed to the post-filter and to the LLM prompt.
144
+ - `ocr.startPage` / `ocr.maxPages` — PDF page range (1-based). The default (`startPage: 1`, `maxPages: 2`) OCRs only the first two pages; raise `maxPages` for content-heavy documents.
145
+ - `mask` — pseudonymisation toggle for `--cloud` (no-op without `--cloud`).
146
+ - `dedup` — SHA256 duplicate detection (see below).
147
+ - `tags.allowed` — set of tags the LLM is allowed to return; anything else is dropped.
148
+ - `tags.strict` — subset of `allowed`. Strict tags only land on the file if at least one `strictEvidence` keyword appears verbatim in the OCR or filename.
149
+ - `tags.aliases` — model-friendly normalisation (`#Invoice` → `#Faktura`).
150
+ - `tags.pathRules` — regex patterns over `path.replace(/[\\/_-]/g, " ") + " " + ocrText`. Multiple rules can match; results merge into `preTags`.
151
+ - `tags.autoTag` — appended to every successfully tagged file (sentinel so you can find "already processed" items in Finder and `--skip-tagged` works).
152
+ - `context` — pinned to the system prompt as background knowledge. **Edit this** — the default is a placeholder.
153
+
154
+ ## Duplicate detection
155
+
156
+ `sortai` ships two independent duplicate signals:
157
+
158
+ - **`#Duplikat`** — SHA256 over file bytes, computed for every file before the main pipeline. Files in a group of ≥2 identical hashes all get this tag. Catches `cp foo bar`, sync conflicts, etc. — anything bit-identical regardless of name. Skipped for files > `cfg.dedup.maxFileSizeMB` (200 by default) and 0-byte files.
159
+ - **`#PrawdopodobnaKopia`** — heuristic over filename + OCR: matches `copy`, `kopia`, `duplikat`, `(N)` in parentheses. Catches macOS Finder "Duplicate", Preview "Save As" copies, manual versioning — where the bytes differ (different `mtime`, repacked PDF, embedded timestamp) but the file is logically a copy.
160
+
161
+ A file can carry both, one, or neither. Skip the hash pre-pass with `--no-dedup` if it's too slow on huge media libraries.
162
+
163
+ ## What about Markdown export?
164
+
165
+ `sortai` is the *tagger*. If you want image/PDF → Markdown, use `macos-vision` directly:
166
+
167
+ ```bash
168
+ npx macos-vision --markdown invoice.pdf -o invoice.md
169
+ ```
170
+
171
+ That's the same Apple Vision + Ollama pipeline (VisionScribe), without the file-tagging layer.
172
+
173
+ ## Privacy
174
+
175
+ - **Default (Ollama)**: nothing leaves your machine.
176
+ - **`--cloud` without `--mask`**: the *full* OCR text of every scanned file is sent to your chosen provider. Use only when you trust the provider with the documents.
177
+ - **`--cloud --mask`**: the OCR text is masked locally first; tokens like `[PERSON:1]`, `[PESEL:1]` flow to the cloud instead of literals. Structure, dates, amounts, and any PII the regex/LLM detector misses still travel. See [`pseudonym-mcp`](https://www.npmjs.com/package/pseudonym-mcp) for the full caveats.
178
+ - File metadata is written locally with `xattr` (binary-plist extended attributes), followed by a local `mdimport` Spotlight reindex. `sortai` makes no network calls beyond your chosen LLM provider.
179
+
180
+ ## Development
181
+
182
+ ```bash
183
+ git clone https://github.com/woladi/sortai.git
184
+ cd sortai
185
+ npm install # macOS only; Linux/Windows needs --ignore-scripts to skip the native build
186
+ npm run typecheck
187
+ npm run build
188
+ node dist/cli.js --help
189
+ ```
190
+
191
+ ## License
192
+
193
+ MIT — Adrian Wołczuk
package/dist/cli.js ADDED
@@ -0,0 +1,261 @@
1
+ #!/usr/bin/env node
2
+ import { Command } from 'commander';
3
+ import chalk from 'chalk';
4
+ import ora from 'ora';
5
+ import path from 'node:path';
6
+ import { existsSync } from 'node:fs';
7
+ import { expandHome, loadConfig } from './config.js';
8
+ import { walkFiles } from './walker.js';
9
+ import { extractOcrText } from './ocr.js';
10
+ import { preTagFromPath } from './pretag.js';
11
+ import { mergeTags } from './tags.js';
12
+ import { writeFileMetadata } from './macos.js';
13
+ import { Masker } from './mask.js';
14
+ import { inferTagsAndComment } from './llm/index.js';
15
+ import { findDuplicates } from './dedup.js';
16
/**
 * Layers command-line options on top of the loaded configuration.
 *
 * Returns a new object; `cfg` itself is not mutated. Sections that have no
 * CLI counterpart are carried over unchanged via spread.
 *
 * Precedence rules:
 * - API key: `--api-key`, then `SORTAI_API_KEY`, then the provider-specific
 *   variable (`ANTHROPIC_API_KEY` / `OPENAI_API_KEY`), then `cfg.llm.apiKey`.
 * - Provider: `--cloud` if given, otherwise `'ollama'`.
 * - Model: `--model`, then the built-in default for the chosen cloud
 *   provider, then `cfg.llm.model`.
 * - `mask.enabled`: on when `--mask` is passed OR the config enables it.
 * - `dedup.enabled`: off when `--no-dedup` is passed, otherwise the config value.
 *
 * @param {object} cfg  Validated configuration (see ConfigSchema in config.js).
 * @param {object} opts Parsed commander options.
 * @returns {object} Effective configuration for this run.
 */
function applyOverrides(cfg, opts) {
    const apiKey = opts.apiKey
        ?? process.env.SORTAI_API_KEY
        ?? (opts.cloud === 'anthropic' ? process.env.ANTHROPIC_API_KEY : undefined)
        ?? (opts.cloud === 'openai' ? process.env.OPENAI_API_KEY : undefined)
        // Last resort: a key stored in the config file. Without this fallback the
        // optional `llm.apiKey` config field was always overwritten with `undefined`.
        ?? cfg.llm.apiKey;
    const provider = opts.cloud ?? 'ollama';
    const defaultCloudModels = {
        anthropic: 'claude-sonnet-4-6',
        openai: 'gpt-4o-mini',
    };
    const excludeFolders = opts.exclude
        ? opts.exclude.split(',').map(s => s.trim()).filter(Boolean)
        : cfg.scan.excludeFolders;
    return {
        ...cfg,
        scan: {
            ...cfg.scan,
            excludeFolders,
        },
        llm: {
            ...cfg.llm,
            provider,
            model: opts.model ?? (opts.cloud ? defaultCloudModels[opts.cloud] ?? cfg.llm.model : cfg.llm.model),
            ollamaUrl: opts.ollamaUrl ?? cfg.llm.ollamaUrl,
            apiKey,
        },
        mask: {
            ...cfg.mask,
            // `--mask` defaults to false, so assigning it directly would silently
            // switch off masking that the config file asked for. The flag can only
            // turn masking on.
            enabled: opts.mask === true || cfg.mask.enabled,
            lang: opts.lang ?? cfg.mask.lang,
        },
        dedup: {
            ...cfg.dedup,
            // `--no-dedup` makes commander report `dedup: false`; in every other
            // case the config decides, so `dedup.enabled: false` is honoured.
            enabled: opts.dedup === false ? false : cfg.dedup.enabled,
        },
    };
}
52
/**
 * CLI entry point.
 *
 * Flow, in order:
 *  1. Parse argv and validate `--cloud`.
 *  2. Load the config; if it had to be created, print a hint and return.
 *  3. Merge CLI overrides, require an API key for cloud providers, and check
 *     that the scan root exists.
 *  4. When masking applies, start pseudonym-mcp. A start-up failure aborts the
 *     run rather than silently falling back to unmasked uploads.
 *  5. Walk the folder, optionally drop already-tagged files (`--skip-tagged`),
 *     apply `--limit`, and run the duplicate pre-pass.
 *  6. For each file: OCR (if the extension qualifies), derive pre-tags, then
 *     either take the pre-tag fast path or ask the LLM under a per-file
 *     watchdog; finally write the metadata unless `--dry-run`.
 *  7. Close the masker and print the summary.
 *
 * Fatal conditions are reported on stderr and end the process with exit code 1.
 */
async function main() {
    const program = new Command();
    program
        .name('sortai')
        .description('macOS CLI that tags files based on OCR + LLM-inferred Finder tags & comments')
        .version('0.1.0')
        .argument('[folder]', 'folder to scan (overrides config.scan.folder)')
        .option('--config <path>', 'path to config JSON (default: ~/.config/sortai/config.json)')
        .option('--dry-run', 'do not write tags/comments; just log', false)
        .option('--model <name>', 'LLM model name (default depends on provider)')
        .option('--ollama-url <url>', 'Ollama base URL (default: http://localhost:11434)')
        .option('--cloud <provider>', "use a cloud LLM: 'anthropic' or 'openai'")
        .option('--api-key <key>', 'API key for cloud provider (or env SORTAI_API_KEY / ANTHROPIC_API_KEY / OPENAI_API_KEY)')
        .option('--mask', 'pseudonymise OCR via pseudonym-mcp before sending to cloud LLM', false)
        .option('--lang <code>', "pseudonym-mcp language: 'en' | 'pl' (default: pl)")
        .option('--exclude <patterns>', 'comma-separated folder names to skip (overrides config)')
        .option('--limit <n>', 'process at most N files', v => parseInt(v, 10))
        .option('--skip-tagged', 'skip files that already have the auto-tag (cfg.tags.autoTag)', false)
        .option('--no-dedup', 'skip SHA256 duplicate detection (hash-based #Duplikat tag)')
        .option('--verbose', 'extra logs', false)
        .parse(process.argv);
    const opts = program.opts();
    const folderArg = program.args[0];
    if (opts.cloud && !['anthropic', 'openai'].includes(opts.cloud)) {
        process.stderr.write(chalk.red(`Unknown --cloud provider: ${opts.cloud}\n`));
        process.exit(1);
    }
    if (opts.mask && !opts.cloud) {
        process.stderr.write(chalk.yellow('⚠️ --mask without --cloud is a no-op (local Ollama already keeps data offline).\n'));
    }
    let configResult;
    try {
        configResult = await loadConfig(opts.config);
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        process.stderr.write(chalk.red(`Config error: ${msg}\n`));
        process.exit(1);
    }
    // First run: the config file was just written. Stop here so the user can
    // edit it before anything gets tagged.
    if (configResult.created) {
        process.stdout.write(chalk.green(`✨ Created default config at ${configResult.path}\n`));
        process.stdout.write(' Edit it to customise tags, then re-run.\n');
        return;
    }
    const cfg = applyOverrides(configResult.config, opts);
    if (cfg.llm.provider !== 'ollama' && !cfg.llm.apiKey) {
        process.stderr.write(chalk.red(`Missing API key for ${cfg.llm.provider}. Pass --api-key or set the env var.\n`));
        process.exit(1);
    }
    const rawFolder = folderArg ?? cfg.scan.folder;
    const root = path.resolve(expandHome(rawFolder));
    if (!existsSync(root)) {
        process.stderr.write(chalk.red(`Folder does not exist: ${root}\n`));
        process.exit(1);
    }
    let masker;
    if (cfg.mask.enabled && cfg.llm.provider !== 'ollama') {
        masker = new Masker(cfg);
        const spin = ora('Starting pseudonym-mcp…').start();
        try {
            await masker.connect();
            spin.succeed('pseudonym-mcp ready');
        }
        catch (err) {
            spin.fail(err instanceof Error ? err.message : String(err));
            // Fail closed: the user explicitly asked for masking. Carrying on
            // without a masker would upload the raw OCR text to the cloud provider.
            process.stderr.write(chalk.red(`--mask was requested but pseudonym-mcp could not be started; refusing to send unmasked OCR text to ${cfg.llm.provider}.\n`));
            process.exit(1);
        }
    }
    process.stdout.write(chalk.cyan(`🚀 Scanning ${root}\n`));
    process.stdout.write(` Provider: ${cfg.llm.provider} (${cfg.llm.model})`);
    if (cfg.mask.enabled && masker)
        process.stdout.write(chalk.gray(' [masked]'));
    if (opts.dryRun)
        process.stdout.write(chalk.yellow(' [dry-run]'));
    process.stdout.write('\n');
    if (cfg.scan.excludeFolders.length) {
        process.stdout.write(chalk.gray(` Excluded: ${cfg.scan.excludeFolders.join(', ')}\n`));
    }
    process.stdout.write('\n');
    let allFiles = await walkFiles(root, cfg);
    process.stdout.write(`📁 Files: ${allFiles.length}\n`);
    if (opts.skipTagged) {
        const { execFile } = await import('node:child_process');
        const { promisify } = await import('node:util');
        const exec = promisify(execFile);
        const before = allFiles.length;
        const filtered = [];
        for (const f of allFiles) {
            try {
                const { stdout: md } = await exec('mdls', ['-name', 'kMDItemUserTags', '-raw', f], { timeout: 3_000 });
                if (!md.includes(cfg.tags.autoTag))
                    filtered.push(f);
            }
            catch {
                // If mdls fails or times out, keep the file so it still gets processed.
                filtered.push(f);
            }
        }
        allFiles = filtered;
        process.stdout.write(chalk.gray(` Skip-tagged: ${before - allFiles.length} pominięte, ${allFiles.length} do przetworzenia\n`));
    }
    // A NaN limit (non-numeric --limit) is falsy and therefore ignored.
    if (opts.limit && opts.limit > 0 && allFiles.length > opts.limit) {
        allFiles = allFiles.slice(0, opts.limit);
        process.stdout.write(chalk.gray(` Limit: ${opts.limit} plików\n`));
    }
    let dedup;
    if (cfg.dedup.enabled && allFiles.length > 1) {
        process.stdout.write(chalk.gray(`🔢 Hashing ${allFiles.length} files for dedup…\n`));
        dedup = await findDuplicates(allFiles, cfg);
        process.stdout.write(chalk.gray(` Hashed: ${dedup.hashedFiles}, skipped >${cfg.dedup.maxFileSizeMB}MB: ${dedup.skippedLarge}, ` +
            `duplicate groups: ${dedup.totalGroups}, files in groups: ${dedup.totalDuplicates}\n`));
    }
    process.stdout.write('\n');
    const stats = { ok: 0, preOnly: 0, skipped: 0, errors: 0, total: allFiles.length };
    const skipExt = new Set(cfg.scan.skipExtensions);
    const ocrExt = new Set(cfg.scan.ocrExtensions);
    const videoExt = new Set(cfg.scan.videoExtensions);
    for (const filePath of allFiles) {
        const rel = path.relative(root, filePath);
        const name = path.basename(filePath);
        const ext = path.extname(filePath).toLowerCase();
        if (skipExt.has(ext)) {
            stats.skipped++;
            continue;
        }
        process.stdout.write(chalk.bold(`🔍 ${rel}\n`));
        let ocrText = '';
        if (ocrExt.has(ext)) {
            process.stdout.write(' 📖 OCR…');
            ocrText = await extractOcrText(filePath, cfg);
            const words = ocrText.split(/\s+/).filter(Boolean).length;
            process.stdout.write(` ${words} words\n`);
        }
        else if (videoExt.has(ext)) {
            process.stdout.write(' 🎬 Video\n');
        }
        else {
            process.stdout.write(` 📄 ${ext}\n`);
        }
        const preTagsBase = preTagFromPath(filePath, ocrText, cfg);
        const dupGroup = dedup?.groupByFile.get(filePath);
        const preTags = dupGroup ? mergeTags(cfg, preTagsBase, ['#Duplikat']) : preTagsBase;
        if (dupGroup) {
            const others = dupGroup.files.filter(f => f !== filePath).map(f => path.basename(f));
            process.stdout.write(chalk.magenta(` 🧬 Duplicate of: ${others.join(', ')}\n`));
        }
        let finalTags;
        let finalComment;
        if (preTags.length >= 4 && !ocrText.trim()) {
            // Fast path: enough path-derived tags and no OCR text, so skip the LLM.
            finalTags = mergeTags(cfg, preTags, [cfg.tags.autoTag]);
            finalComment = `Auto z nazwy/ścieżki: ${name}.`;
            process.stdout.write(chalk.gray(` ⚡ Pre-only: ${preTags.join(' ')}\n`));
            stats.preOnly++;
        }
        else {
            process.stdout.write(chalk.gray(` 🧠 ${cfg.llm.provider}…\n`));
            // Watchdog: race the inference against a timer so one hung call
            // cannot stall the whole run.
            const PER_FILE_TIMEOUT_MS = 180_000;
            let timer;
            const timeout = new Promise((_, reject) => {
                timer = setTimeout(() => reject(new Error(`per-file timeout after ${PER_FILE_TIMEOUT_MS}ms`)), PER_FILE_TIMEOUT_MS);
            });
            try {
                const result = await Promise.race([
                    inferTagsAndComment({ fileName: name, ext, preTags, ocrText }, cfg, masker),
                    timeout,
                ]);
                finalTags = mergeTags(cfg, result.tags, [cfg.tags.autoTag]);
                finalComment = result.comment || `Plik: ${name}.`;
            }
            catch (err) {
                // Timeout or inference error: fall back to the pre-tags alone.
                const msg = err instanceof Error ? err.message : String(err);
                process.stdout.write(chalk.yellow(` ⏱ ${msg} — fallback\n`));
                finalTags = mergeTags(cfg, preTags, [cfg.tags.autoTag]).slice(0, 6);
                finalComment = `Plik: ${name}.`;
            }
            finally {
                // Always clear the timer, otherwise it keeps the event loop alive.
                if (timer)
                    clearTimeout(timer);
            }
        }
        if (opts.dryRun) {
            process.stdout.write(chalk.green(` ✅ ${finalTags.join(' ')}\n`));
            process.stdout.write(chalk.gray(` 📝 ${finalComment}\n\n`));
            stats.ok++;
            continue;
        }
        try {
            await writeFileMetadata(filePath, finalTags, finalComment);
            process.stdout.write(chalk.green(` ✅ ${finalTags.join(' ')}\n`));
            process.stdout.write(chalk.gray(` 📝 ${finalComment}\n\n`));
            stats.ok++;
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            process.stdout.write(chalk.red(` ❌ Write failed: ${msg}\n\n`));
            stats.errors++;
        }
    }
    if (masker)
        await masker.close();
    process.stdout.write('═══════════════════════════════════════════════════════\n');
    process.stdout.write(chalk.bold('✨ Done\n'));
    process.stdout.write(chalk.green(` ✅ Success: ${stats.ok}\n`));
    process.stdout.write(chalk.gray(` ⚡ Pre-only: ${stats.preOnly}\n`));
    process.stdout.write(chalk.gray(` ⏭ Skipped: ${stats.skipped}\n`));
    process.stdout.write(chalk.red(` ❌ Errors: ${stats.errors}\n`));
}
// Anything that escapes main() is fatal: report it and exit non-zero.
main().catch(err => {
    process.stderr.write(chalk.red(`Fatal: ${err instanceof Error ? err.message : String(err)}\n`));
    process.exit(1);
});
package/dist/config.js ADDED
@@ -0,0 +1,70 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import { existsSync } from 'node:fs';
3
+ import path from 'node:path';
4
+ import os from 'node:os';
5
+ import { z } from 'zod';
6
+ import { DEFAULT_CONFIG } from './defaults.js';
7
// Reusable building blocks shared by several sections below.
const stringList = z.array(z.string());
const positiveInt = z.number().int().positive();

/** One `tags.pathRules` entry: a pattern, optional flags, and the tags it yields. */
const PathRuleSchema = z.object({
    pattern: z.string(),
    flags: z.string().optional(),
    tags: stringList,
});

/** `scan`: which folder to walk and how file extensions are classified. */
const ScanSchema = z.object({
    folder: z.string(),
    excludeFolders: stringList,
    skipExtensions: stringList,
    ocrExtensions: stringList,
    videoExtensions: stringList,
});

/** `ocr`: character caps plus the page window; the page fields have defaults. */
const OcrSchema = z.object({
    maxChars: positiveInt,
    llmMaxChars: positiveInt,
    startPage: positiveInt.default(1),
    maxPages: positiveInt.default(2),
});

/** `llm`: provider selection and generation parameters. */
const LlmSchema = z.object({
    provider: z.enum(['ollama', 'anthropic', 'openai']),
    model: z.string(),
    temperature: z.number(),
    numPredict: positiveInt,
    ollamaUrl: z.string().url(),
    apiKey: z.string().optional(),
});

/** `mask`: pseudonymisation toggle and language. */
const MaskSchema = z.object({
    enabled: z.boolean(),
    lang: z.enum(['en', 'pl']),
});

/** `dedup`: the whole section is optional and falls back to the defaults shown. */
const DedupSchema = z
    .object({
        enabled: z.boolean().default(true),
        maxFileSizeMB: positiveInt.default(200),
    })
    .default({ enabled: true, maxFileSizeMB: 200 });

/** `tags`: the taxonomy, strict-tag evidence, aliases, path rules and the auto-tag. */
const TagsSchema = z.object({
    allowed: stringList,
    strict: stringList,
    aliases: z.record(z.string(), z.string()),
    strictEvidence: z.record(z.string(), stringList),
    pathRules: z.array(PathRuleSchema),
    autoTag: z.string(),
});

/** Shape of the on-disk config file; used to validate the parsed JSON. */
const ConfigSchema = z.object({
    scan: ScanSchema,
    ocr: OcrSchema,
    llm: LlmSchema,
    mask: MaskSchema,
    dedup: DedupSchema,
    tags: TagsSchema,
    context: z.string(),
});
52
/**
 * Picks the base directory for user configuration.
 *
 * Per the XDG Base Directory specification, `XDG_CONFIG_HOME` is honoured only
 * when it is set, non-empty and absolute; otherwise `~/.config` is used. A
 * plain `??` would accept an empty string and produce a path relative to the
 * current working directory.
 *
 * @param {Record<string, string | undefined>} [env] Environment to read from.
 * @returns {string} Absolute base directory for config files.
 */
function resolveConfigHome(env = process.env) {
    const xdg = env.XDG_CONFIG_HOME;
    if (xdg && path.isAbsolute(xdg)) {
        return xdg;
    }
    return path.join(os.homedir(), '.config');
}
/** Default location of the sortai config file. */
export const DEFAULT_CONFIG_PATH = path.join(resolveConfigHome(), 'sortai', 'config.json');
53
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Only a bare `~` or a `~/` prefix (or `~` followed by the platform separator)
 * is expanded. Other tilde forms such as `~otheruser/docs` or a file literally
 * named `~backup` are returned unchanged rather than being rewritten into a
 * path under the current user's home.
 *
 * @param {string} p Path that may start with `~`.
 * @returns {string} The path with the home prefix expanded, or `p` unchanged.
 */
export function expandHome(p) {
    const isHomeRelative = p === '~' || p.startsWith('~/') || p.startsWith(`~${path.sep}`);
    if (!isHomeRelative) {
        return p;
    }
    // slice(1) keeps the separator; path.join collapses it.
    return path.join(os.homedir(), p.slice(1));
}
59
/**
 * Loads the sortai configuration, creating it from the defaults on first use.
 *
 * If the file does not exist, its parent directory is created, the default
 * config is written as pretty-printed JSON, and the defaults are returned with
 * `created: true`. Otherwise the file is read, parsed and validated against
 * `ConfigSchema`.
 *
 * @param {string} [customPath] Alternative config path; `~` is expanded and
 *   the result resolved against the current working directory.
 * @returns {Promise<{config: object, path: string, created: boolean}>}
 * @throws {Error} If the file cannot be read or written, is not valid JSON, or
 *   fails schema validation. The message names the offending file and the
 *   original error is attached as `cause`.
 */
export async function loadConfig(customPath) {
    const cfgPath = customPath ? path.resolve(expandHome(customPath)) : DEFAULT_CONFIG_PATH;
    if (!existsSync(cfgPath)) {
        await fs.mkdir(path.dirname(cfgPath), { recursive: true });
        await fs.writeFile(cfgPath, JSON.stringify(DEFAULT_CONFIG, null, 2), 'utf8');
        return { config: DEFAULT_CONFIG, path: cfgPath, created: true };
    }
    const raw = await fs.readFile(cfgPath, 'utf8');
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (err) {
        // A bare "Unexpected token …" gives no hint which file is broken.
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`${cfgPath} is not valid JSON: ${reason}`, { cause: err });
    }
    let config;
    try {
        config = ConfigSchema.parse(parsed);
    }
    catch (err) {
        // Validation errors expose an `issues` array; render it one issue per
        // line instead of surfacing the raw JSON dump in the message.
        const issues = Array.isArray(err?.issues) ? err.issues : null;
        const detail = issues
            ? issues.map(issue => `  - ${issue.path.join('.') || '(root)'}: ${issue.message}`).join('\n')
            : (err instanceof Error ? err.message : String(err));
        throw new Error(`${cfgPath} does not match the expected config shape:\n${detail}`, { cause: err });
    }
    return { config, path: cfgPath, created: false };
}
package/dist/dedup.js ADDED
@@ -0,0 +1,64 @@
1
+ import { promises as fs, createReadStream } from 'node:fs';
2
+ import { createHash } from 'node:crypto';
3
/**
 * Computes the SHA-256 digest of a file by streaming its contents.
 *
 * @param {string} filePath File to hash.
 * @returns {Promise<string>} Lower-case hex digest.
 * Rejects with the stream's error if the file cannot be read.
 */
async function hashFile(filePath) {
    const digest = createHash('sha256');
    // Feed the file chunk by chunk so it is never held in memory as a whole.
    for await (const piece of createReadStream(filePath)) {
        digest.update(piece);
    }
    return digest.digest('hex');
}
12
/**
 * Groups byte-identical files by SHA-256.
 *
 * Files are examined in batches of 8 concurrently. Non-regular files, empty
 * files and files that cannot be stat'ed or read are ignored; files larger
 * than `cfg.dedup.maxFileSizeMB` are counted in `skippedLarge` and not hashed.
 *
 * Results are folded in input order after each batch settles, so the order of
 * `files` inside a group follows the order of the `files` argument instead of
 * whichever hash happened to finish first.
 *
 * @param {string[]} files File paths to compare.
 * @param {object} cfg Config; reads `cfg.dedup.enabled` and `cfg.dedup.maxFileSizeMB`.
 * @returns {Promise<{groupByFile: Map<string, {hash: string, files: string[]}>,
 *   totalGroups: number, totalDuplicates: number, skippedLarge: number,
 *   hashedFiles: number}>} `groupByFile` maps every file that has at least one
 *   identical sibling to its group; `totalDuplicates` counts all files in groups.
 */
export async function findDuplicates(files, cfg) {
    if (!cfg.dedup.enabled || files.length < 2) {
        return {
            groupByFile: new Map(),
            totalGroups: 0,
            totalDuplicates: 0,
            skippedLarge: 0,
            hashedFiles: 0,
        };
    }
    const maxBytes = cfg.dedup.maxFileSizeMB * 1024 * 1024;
    const hashes = new Map();
    let skippedLarge = 0;
    let hashedFiles = 0;
    const BATCH = 8;
    for (let i = 0; i < files.length; i += BATCH) {
        const batch = files.slice(i, i + BATCH);
        // Each task only reports an outcome; shared state is updated afterwards,
        // in input order, so the grouping is deterministic.
        const outcomes = await Promise.all(batch.map(async (file) => {
            try {
                const st = await fs.stat(file);
                if (!st.isFile())
                    return null;
                if (st.size > maxBytes)
                    return { file, tooLarge: true };
                if (st.size === 0)
                    return null;
                return { file, hash: await hashFile(file) };
            }
            catch {
                // Deliberately best-effort: an unreadable or vanished file simply
                // does not take part in duplicate detection.
                return null;
            }
        }));
        for (const outcome of outcomes) {
            if (outcome === null)
                continue;
            if (outcome.tooLarge) {
                skippedLarge++;
                continue;
            }
            const bucket = hashes.get(outcome.hash);
            if (bucket)
                bucket.push(outcome.file);
            else
                hashes.set(outcome.hash, [outcome.file]);
            hashedFiles++;
        }
    }
    const groupByFile = new Map();
    let totalGroups = 0;
    let totalDuplicates = 0;
    for (const [hash, paths] of hashes) {
        if (paths.length < 2)
            continue;
        totalGroups++;
        totalDuplicates += paths.length;
        const group = { hash, files: paths };
        for (const p of paths)
            groupByFile.set(p, group);
    }
    return { groupByFile, totalGroups, totalDuplicates, skippedLarge, hashedFiles };
}