@woladi/sortai 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/mask.js CHANGED
@@ -51,9 +51,15 @@ export class Masker {
51
51
  name: 'unmask_text',
52
52
  arguments: { text, session_id: sessionId },
53
53
  });
54
- const parsed = extractJson(result);
55
- return typeof parsed.unmasked_text === 'string' ? parsed.unmasked_text :
56
- typeof parsed.text === 'string' ? parsed.text : text;
54
+ const raw = extractRawText(result);
55
+ const parsed = tryParseJsonObject(raw);
56
+ if (parsed) {
57
+ if (typeof parsed.unmasked_text === 'string')
58
+ return parsed.unmasked_text;
59
+ if (typeof parsed.text === 'string')
60
+ return parsed.text;
61
+ }
62
+ return raw ?? text;
57
63
  }
58
64
  async close() {
59
65
  if (this.client) {
@@ -63,21 +69,30 @@ export class Masker {
63
69
  this.transport = null;
64
70
  }
65
71
  }
66
- function extractJson(result) {
72
/**
 * Return the text of the first textual content block of a tool result.
 *
 * @param {unknown} result - Value returned by the tool call.
 * @returns {string | null} The `text` of the first entry in `result.content`
 *   whose `type` is `'text'` and whose `text` is a string; `null` when the
 *   result is not an object, `content` is not an array, or no such entry exists.
 */
function extractRawText(result) {
    if (!result || typeof result !== 'object')
        return null;
    const { content } = result;
    // A malformed response may carry a non-array `content` or null entries;
    // treat both as "no text" instead of throwing a TypeError.
    if (!Array.isArray(content))
        return null;
    const textBlock = content.find(b => b && b.type === 'text' && typeof b.text === 'string');
    return textBlock?.text ?? null;
}
79
/**
 * Parse `text` as JSON and return the value only when it is a non-null
 * `typeof 'object'` value (arrays included).
 *
 * @param {string | null | undefined} text - Candidate JSON text.
 * @returns {object | null} Parsed value, or `null` for empty input, invalid
 *   JSON, or a JSON primitive.
 */
function tryParseJsonObject(text) {
    if (text) {
        let value;
        try {
            value = JSON.parse(text);
        }
        catch {
            return null;
        }
        if (value && typeof value === 'object') {
            return value;
        }
    }
    return null;
}
90
/**
 * Obtain an object payload from a tool result.
 *
 * Prefers `result.structuredContent` when it is a truthy object; otherwise
 * parses the first text block as JSON.
 *
 * @param {unknown} result - Value returned by the tool call.
 * @returns {object} The structured content, the parsed JSON object, or `{}`
 *   when neither is available.
 */
function extractJson(result) {
    const isObject = Boolean(result) && typeof result === 'object';
    if (!isObject) {
        return {};
    }
    const structured = result.structuredContent;
    if (structured && typeof structured === 'object') {
        return structured;
    }
    const parsed = tryParseJsonObject(extractRawText(result));
    return parsed ?? {};
}
@@ -0,0 +1,20 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import path from 'node:path';
3
+ import { execFile } from 'node:child_process';
4
/**
 * Move a file to its planned destination.
 *
 * Creates the destination directory, renames the file, and falls back to
 * copy + unlink when the rename fails with `EXDEV`. Afterwards `mdimport` is
 * launched for the destination; its outcome is ignored.
 *
 * @param {{ from: string, to: string }} op - Source and destination paths.
 * @returns {Promise<void>}
 * @throws Any filesystem error other than `EXDEV` from the rename, and any
 *   error from the copy/unlink fallback.
 */
export async function executeMove(op) {
    const source = op.from;
    const destination = op.to;
    await fs.mkdir(path.dirname(destination), { recursive: true });
    try {
        await fs.rename(source, destination);
    }
    catch (err) {
        if (err.code !== 'EXDEV') {
            throw err;
        }
        // rename() failed with EXDEV: copy the bytes, then remove the source.
        await fs.copyFile(source, destination);
        await fs.unlink(source);
    }
    // Fire-and-forget: the callback deliberately discards error and output.
    const ignoreOutcome = () => { };
    execFile('mdimport', [destination], { timeout: 5_000 }, ignoreOutcome);
}
@@ -0,0 +1,84 @@
1
+ import path from 'node:path';
2
+ import { existsSync } from 'node:fs';
3
+ import { walkFiles } from '../walker.js';
4
+ import { isMetaTag } from '../tags.js';
5
+ import { expandHome } from '../config.js';
6
+ import { readMacosTags } from './read-tags.js';
7
/**
 * Pick a destination path that collides neither with a path already claimed
 * in this run nor with an existing filesystem entry.
 *
 * Claimed paths are compared by their lower-cased form. When `desired` is
 * unavailable, `<base>_2<ext>` … `<base>_9999<ext>` are tried in order. If
 * every candidate is taken, `desired` itself is returned.
 *
 * @param {string} desired - Preferred destination path.
 * @param {Set<string>} taken - Lower-cased claimed paths; the chosen path is added.
 * @returns {string} The path to use.
 */
function dedupName(desired, taken) {
    const isFree = (candidate) => !taken.has(candidate.toLowerCase()) && !existsSync(candidate);
    const claim = (candidate) => {
        taken.add(candidate.toLowerCase());
        return candidate;
    };
    if (isFree(desired)) {
        return claim(desired);
    }
    const folder = path.dirname(desired);
    const suffix = path.extname(desired);
    const stem = path.basename(desired, suffix);
    let counter = 2;
    while (counter < 10_000) {
        const candidate = path.join(folder, `${stem}_${counter}${suffix}`);
        if (isFree(candidate)) {
            return claim(candidate);
        }
        counter += 1;
    }
    // Every numbered candidate is in use: fall back to the requested path.
    return claim(desired);
}
27
/**
 * Choose the tag that decides where a file is filed.
 *
 * Meta tags (per `isMetaTag`) are ignored. Of the remaining tags, the first
 * entry of `cfg.organize.priority` that is present wins; if none is present,
 * the first remaining tag is used.
 *
 * @param {string[]} tags - Tags found on the file.
 * @param {object} cfg - Configuration; reads `cfg.organize.priority`.
 * @returns {string | null} The primary tag, or `null` when only meta tags (or
 *   no tags) are present.
 */
export function pickPrimaryTag(tags, cfg) {
    const candidates = tags.filter(tag => !isMetaTag(tag, cfg));
    if (candidates.length === 0) {
        return null;
    }
    const priority = [...cfg.organize.priority];
    const hit = priority.findIndex(preferred => candidates.includes(preferred));
    return hit === -1 ? candidates[0] : priority[hit];
}
37
/**
 * Map a tag to the folder (relative to the organize target) its files go to.
 *
 * An explicit, non-empty entry in `cfg.organize.folderMap` wins. Otherwise the
 * folder is the tag without its leading `#`. Any `..` path segment in that
 * derived name is replaced by `_`, so a tag read from a file cannot make the
 * joined destination climb out of the target directory.
 *
 * @param {string} tag - Tag such as `#Faktura`.
 * @param {object} cfg - Configuration; reads `cfg.organize.folderMap`.
 * @returns {string} Folder path relative to the organize target.
 */
export function folderForTag(tag, cfg) {
    const { folderMap } = cfg.organize;
    // Own-property lookup only: inherited keys such as "constructor" must not
    // be mistaken for a configured mapping.
    const mapped = Object.prototype.hasOwnProperty.call(folderMap, tag) ? folderMap[tag] : undefined;
    if (mapped) {
        return mapped;
    }
    return tag
        .replace(/^#/, '')
        .replace(/(^|[\\/])\.\.(?=$|[\\/])/g, '$1_');
}
42
/**
 * Build the list of moves needed to file every scanned file under the
 * organize target, without touching the filesystem.
 *
 * For each file returned by `walkFiles`, its tags are read and a primary tag
 * chosen. Tagged files go to `<target>/<folderForTag(primary)>/`. Untagged
 * files are skipped when `cfg.organize.unsorted` is `'skip'` or `'keep'`, and
 * otherwise go to `<target>/<cfg.organize.unsortedFolder>/`. Files already at
 * their destination are skipped. Name collisions are resolved by `dedupName`
 * and counted.
 *
 * @param {string} root - Directory to scan.
 * @param {object} cfg - Configuration; reads `cfg.organize`.
 * @returns {Promise<{ moves: object[], skips: object[], conflicts: number }>}
 */
export async function buildOrganizePlan(root, cfg) {
    const files = await walkFiles(root, cfg);
    const moves = [];
    const skips = [];
    const claimed = new Set();
    let conflicts = 0;
    const targetRoot = path.resolve(expandHome(cfg.organize.target));
    // Skip reasons for untagged files, keyed by the `unsorted` policy.
    const untaggedSkipReasons = new Map([
        ['skip', 'brak tagów (skip)'],
        ['keep', 'brak tagów (zostawiam w miejscu)'],
    ]);
    for (const file of files) {
        const allTags = await readMacosTags(file);
        const primary = pickPrimaryTag(allTags, cfg);
        const untagged = !primary;
        if (untagged && untaggedSkipReasons.has(cfg.organize.unsorted)) {
            skips.push({ path: file, reason: untaggedSkipReasons.get(cfg.organize.unsorted) });
            continue;
        }
        const folder = untagged ? cfg.organize.unsortedFolder : folderForTag(primary, cfg);
        const desired = path.join(targetRoot, folder, path.basename(file));
        if (path.resolve(desired) === path.resolve(file)) {
            skips.push({ path: file, reason: 'już w docelowym miejscu' });
            continue;
        }
        const destination = dedupName(desired, claimed);
        if (destination !== desired) {
            conflicts += 1;
        }
        moves.push({ from: file, to: destination, primaryTag: untagged ? '' : primary, allTags });
    }
    return { moves, skips, conflicts };
}
@@ -0,0 +1,18 @@
1
+ import { execFile } from 'node:child_process';
2
+ import { promisify } from 'node:util';
3
const execFileAsync = promisify(execFile);

/**
 * Decode the backslash escapes of one quoted string taken from `mdls` output.
 *
 * Handles `\Uxxxx` (UTF-16 code unit in hex), `\n`, `\t`, `\r`, and treats any
 * other escaped character (for example `\"` or `\\`) as that character.
 *
 * @param {string} raw - Text between the quotes, escapes still present.
 * @returns {string} The decoded string.
 */
function decodeMdlsString(raw) {
    return raw.replace(/\\(?:[Uu]([0-9a-fA-F]{4})|([\s\S]))/g, (_match, hex, ch) => {
        if (hex !== undefined)
            return String.fromCharCode(Number.parseInt(hex, 16));
        switch (ch) {
            case 'n': return '\n';
            case 't': return '\t';
            case 'r': return '\r';
            default: return ch;
        }
    });
}

/**
 * Extract `#`-prefixed tags from the raw output of
 * `mdls -name kMDItemUserTags -raw`.
 *
 * Only double-quoted entries are considered. Each is decoded, cut at its
 * first newline, and kept when it starts with `#`.
 *
 * @param {string} stdout - Raw command output.
 * @returns {string[]} Tags in output order.
 */
function parseMdlsTags(stdout) {
    const trimmed = stdout.trim();
    if (!trimmed || trimmed === '(null)')
        return [];
    return [...trimmed.matchAll(/"((?:[^"\\]|\\.)*?)"/g)]
        .map(m => decodeMdlsString(m[1]).split('\n')[0])
        .filter(t => t.startsWith('#'));
}

/**
 * Read the `#`-prefixed user tags of a file by running `mdls`.
 *
 * Best effort: any failure (command missing, non-zero exit, the 3 s timeout,
 * unexpected output) yields an empty list rather than an error.
 *
 * @param {string} filePath - File to inspect.
 * @returns {Promise<string[]>} Tags starting with `#`, possibly empty.
 */
export async function readMacosTags(filePath) {
    try {
        const { stdout } = await execFileAsync('mdls', ['-name', 'kMDItemUserTags', '-raw', filePath], { timeout: 3_000 });
        return parseMdlsTags(stdout);
    }
    catch {
        return [];
    }
}
package/dist/tags.js CHANGED
@@ -1,4 +1,5 @@
1
- export function normalizeTag(raw, cfg) {
1
+ export const TAG_SHAPE = /^#[\p{L}\p{N}_-]+$/u;
2
+ export function normalizeTag(raw, cfg, freeForm = false) {
2
3
  if (typeof raw !== 'string')
3
4
  return null;
4
5
  let tag = raw.trim();
@@ -8,7 +9,14 @@ export function normalizeTag(raw, cfg) {
8
9
  tag = '#' + tag;
9
10
  tag = cfg.tags.aliases[tag] ?? tag;
10
11
  const allowed = new Set([...cfg.tags.allowed, cfg.tags.autoTag]);
11
- return allowed.has(tag) ? tag : null;
12
+ if (allowed.has(tag))
13
+ return tag;
14
+ if (freeForm || cfg.tags.freeForm) {
15
+ if (TAG_SHAPE.test(tag))
16
+ return tag;
17
+ return null;
18
+ }
19
+ return null;
12
20
  }
13
21
  export function mergeTags(cfg, ...lists) {
14
22
  const seen = [];
@@ -16,7 +24,7 @@ export function mergeTags(cfg, ...lists) {
16
24
  if (!list)
17
25
  continue;
18
26
  for (const raw of list) {
19
- const normalized = normalizeTag(raw, cfg);
27
+ const normalized = normalizeTag(raw, cfg, cfg.tags.freeForm);
20
28
  if (normalized && !seen.includes(normalized)) {
21
29
  seen.push(normalized);
22
30
  }
@@ -34,3 +42,24 @@ export function strictTagHasEvidence(tag, evidence, cfg) {
34
42
  const haystack = evidence.toLowerCase();
35
43
  return keywords.some(kw => haystack.includes(kw.toLowerCase()));
36
44
  }
45
/**
 * Tell whether a tag is bookkeeping rather than a content category.
 *
 * @param {string} tag - Tag to test.
 * @param {object} cfg - Configuration; reads `cfg.tags.autoTag`.
 * @returns {boolean} `true` for the configured auto tag, `#Duplikat`, and
 *   `#PrawdopodobnaKopia`; `false` otherwise.
 */
export function isMetaTag(tag, cfg) {
    return tag === cfg.tags.autoTag
        || tag === '#Duplikat'
        || tag === '#PrawdopodobnaKopia';
}
52
/**
 * Tally of how often each tag has been recorded.
 */
export class TagDiscovery {
    /** Occurrence count per tag. */
    counts = new Map();
    /**
     * Count one more occurrence of `tag`.
     * @param {string} tag
     */
    record(tag) {
        const seen = this.counts.get(tag);
        this.counts.set(tag, (seen ?? 0) + 1);
    }
    /**
     * List the recorded tags with their counts, most frequent first.
     * @returns {{ tag: string, count: number }[]}
     */
    entries() {
        const rows = [];
        for (const [tag, count] of this.counts) {
            rows.push({ tag, count });
        }
        rows.sort((left, right) => right.count - left.count);
        return rows;
    }
    /** Number of distinct tags recorded. */
    get size() {
        return this.counts.size;
    }
}
@@ -0,0 +1,359 @@
1
+ import path from 'node:path';
2
+ import { existsSync } from 'node:fs';
3
+ import chalk from 'chalk';
4
+ import ora from 'ora';
5
+ import { select, input, confirm, password } from '@inquirer/prompts';
6
+ import { DEFAULT_CONFIG, DEFAULT_ORGANIZE } from '../defaults.js';
7
+ import { expandHome, saveConfig, resolveConfigPath } from '../config.js';
8
+ import { probeOllama, modelSizeLabel, SUGGESTED_OLLAMA_MODELS } from '../llm/ollama-detect.js';
9
+ import { pickSampleFiles, ocrSamples } from './sample.js';
10
+ import { detectLanguages, languageList } from './languages.js';
11
+ import { generateTaxonomy, applyTaxonomyToConfig } from './taxonomy.js';
12
+ import { refineTaxonomyLoop, renderTaxonomyTable } from './refine.js';
13
/**
 * Models offered in the wizard for each cloud provider.
 * `name` is the value stored as the model; `label` is the text shown in the
 * selection prompt.
 */
const CLOUD_MODELS = {
    anthropic: [
        {
            name: 'claude-opus-4-7',
            label: 'Claude Opus 4.7 — najmocniejszy',
        },
        {
            name: 'claude-sonnet-4-6',
            label: 'Claude Sonnet 4.6 — szybki + mocny (rekomendowany)',
        },
        {
            name: 'claude-haiku-4-5-20251001',
            label: 'Claude Haiku 4.5 — najszybszy',
        },
    ],
    openai: [
        {
            name: 'gpt-4o',
            label: 'GPT-4o',
        },
        {
            name: 'gpt-4o-mini',
            label: 'GPT-4o mini — szybki, tani',
        },
    ],
};
24
/**
 * Ask which workflow the wizard should configure.
 *
 * @returns {Promise<string>} One of `'tag'`, `'organize'`, `'tag+organize'`,
 *   `'discovery'`.
 */
async function askMode() {
    const choices = [
        {
            name: 'Tagowanie (Finder tagi + komentarze)',
            value: 'tag',
            description: 'Zapisuje tagi i komentarze do xattr plików, indeksuje Spotlight.',
        },
        {
            name: 'Sortowanie do folderów (na bazie istniejących tagów)',
            value: 'organize',
            description: 'Przenosi pliki do folderów zgodnie z tagami które już mają.',
        },
        {
            name: 'Tagowanie + sortowanie',
            value: 'tag+organize',
            description: 'Najpierw nakłada tagi, potem przenosi do folderów.',
        },
        {
            name: 'Tylko discovery (próbka, bez modyfikacji)',
            value: 'discovery',
            description: 'Pokaż jak LLM widzi Twoją kolekcję, bez zapisywania niczego.',
        },
    ];
    return select({ message: 'Co chcesz robić z plikami?', choices });
}
35
/**
 * Ask for the folder to scan and the sub-directories to exclude.
 *
 * The extension lists are taken unchanged from `DEFAULT_CONFIG.scan`.
 *
 * @param {string} defaultFolder - Pre-filled answer for the folder prompt.
 * @returns {Promise<object>} Scan settings: `folder`, `excludeFolders`,
 *   `skipExtensions`, `ocrExtensions`, `videoExtensions`.
 */
async function askScan(defaultFolder) {
    const folder = await input({
        message: 'Folder do skanowania (rekurencyjnie):',
        default: defaultFolder,
    });
    const excludeCsv = await input({
        message: 'Wyklucz podkatalogi (CSV):',
        default: DEFAULT_CONFIG.scan.excludeFolders.join(','),
    });
    // Split the comma-separated answer, dropping blanks and padding.
    const excludeFolders = [];
    for (const part of excludeCsv.split(',')) {
        const name = part.trim();
        if (name) {
            excludeFolders.push(name);
        }
    }
    const defaults = DEFAULT_CONFIG.scan;
    return {
        folder,
        excludeFolders,
        skipExtensions: defaults.skipExtensions,
        ocrExtensions: defaults.ocrExtensions,
        videoExtensions: defaults.videoExtensions,
    };
}
49
/**
 * Ask where the LLM should run.
 *
 * @returns {Promise<string>} `'ollama'`, `'anthropic'` or `'openai'`.
 */
async function askProvider() {
    const choices = [
        { name: 'Lokalnie (Ollama) — 100% offline, wolniejsze', value: 'ollama' },
        { name: 'Anthropic Claude — szybkie, wysyła OCR do API', value: 'anthropic' },
        { name: 'OpenAI — szybkie, wysyła OCR do API', value: 'openai' },
    ];
    return select({ message: 'Gdzie ma działać LLM?', choices });
}
59
/**
 * Ask for the Ollama endpoint and the model to use.
 *
 * Probes the endpoint first. When it is unreachable the user may continue and
 * type a model name. When it is reachable but has no models, suggestions are
 * printed and a name is typed. Otherwise the user picks from the installed
 * models (names matching /embed/i are listed last and labelled) or types a
 * custom name. A warning is printed for names matching the small-model list.
 *
 * @returns {Promise<{ model: string, ollamaUrl: string }>}
 * @throws {Error} When the endpoint is unreachable and the user declines to continue.
 */
async function askOllamaModel() {
    const ollamaUrl = await input({
        message: 'Adres Ollama:',
        default: DEFAULT_CONFIG.llm.ollamaUrl,
    });
    const spinner = ora('Sprawdzam Ollamę…').start();
    const probe = await probeOllama(ollamaUrl);

    if (!probe.reachable) {
        spinner.warn(`Ollama nieosiągalna: ${probe.error ?? 'brak odpowiedzi'}`);
        const proceed = await confirm({
            message: 'Kontynuować mimo wszystko? (możesz uruchomić `ollama serve` później)',
            default: true,
        });
        if (!proceed) {
            throw new Error('Anulowano przez użytkownika');
        }
        const typed = await input({
            message: 'Model Ollamy (np. mistral-nemo):',
            default: DEFAULT_CONFIG.llm.model,
        });
        return { model: typed, ollamaUrl };
    }

    spinner.succeed(`Ollama OK — znaleziono ${probe.models.length} modeli`);

    if (probe.models.length === 0) {
        process.stdout.write(chalk.yellow('\n⚠️ Brak zainstalowanych modeli. Sugerowane:\n'));
        for (const suggestion of SUGGESTED_OLLAMA_MODELS) {
            process.stdout.write(` ${chalk.cyan(`ollama pull ${suggestion.name}`)} — ${suggestion.note}\n`);
        }
        process.stdout.write('\n');
        const typed = await input({
            message: 'Model którego użyjesz (wpisz nazwę):',
            default: 'mistral-nemo',
        });
        return { model: typed, ollamaUrl };
    }

    // Separate embedding models so they can be listed after the usable ones.
    const usable = [];
    const embedders = [];
    for (const candidate of probe.models) {
        (/embed/i.test(candidate.name) ? embedders : usable).push(candidate);
    }
    const toChoice = (entry, note) => ({
        name: `${entry.name} ${chalk.gray('(' + modelSizeLabel(entry.size) + ')' + note)}`,
        value: entry.name,
    });
    const choices = [
        ...usable.map(entry => toChoice(entry, '')),
        ...embedders.map(entry => toChoice(entry, ' — embedding, nie do tagowania')),
        { name: chalk.gray('— wpisz inny —'), value: '__custom__' },
    ];
    const picked = await select({ message: 'Wybierz model Ollamy:', choices });

    let model = picked;
    if (picked === '__custom__') {
        model = await input({ message: 'Nazwa modelu:', default: 'mistral-nemo' });
    }

    const smallNames = ['llama3.1', 'llama3', 'phi', 'gemma:2b', 'mistral', 'tinyllama'];
    const lowered = model.toLowerCase();
    if (smallNames.some(fragment => lowered.includes(fragment))) {
        process.stdout.write(chalk.yellow('\n⚠️ Mały model — wygenerowana taksonomia może być szorstka.\n' +
            ' Możesz później użyć trybu cloud do init, a Ollamy do tagowania.\n\n'));
    }
    return { model, ollamaUrl };
}
121
/**
 * Resolve the API key and model for a cloud provider.
 *
 * Key precedence: a non-empty `existingKey`, then the provider's environment
 * variable (`ANTHROPIC_API_KEY` or `OPENAI_API_KEY`), then a password prompt.
 * An empty string counts as "no key" at every step, so it never shadows a key
 * available from the next source.
 *
 * @param {'anthropic' | 'openai'} provider - Cloud provider; selects the
 *   environment variable and the `CLOUD_MODELS` list.
 * @param {string} [existingKey] - Key supplied by the caller, if any.
 * @returns {Promise<{ model: string, apiKey: string }>}
 */
async function askCloud(provider, existingKey) {
    const envKey = provider === 'anthropic' ? process.env.ANTHROPIC_API_KEY : process.env.OPENAI_API_KEY;
    let apiKey = existingKey || envKey || '';
    if (!apiKey) {
        apiKey = await password({
            message: `Klucz API ${provider} (zostanie zapisany w configu):`,
            mask: '*',
        });
    }
    else if (existingKey) {
        // The key was handed in by the caller, so do not claim it came from env.
        process.stdout.write(chalk.gray(` Używam wcześniej podanego klucza API: ${apiKey.slice(0, 8)}…\n`));
    }
    else {
        process.stdout.write(chalk.gray(` Klucz API wykryty z env: ${apiKey.slice(0, 8)}…\n`));
    }
    const model = await select({
        message: `Model ${provider}:`,
        choices: CLOUD_MODELS[provider].map(m => ({ name: m.label, value: m.name })),
    });
    return { model, apiKey };
}
139
/**
 * Ask whether PII pseudonymisation should run before text is sent to a cloud
 * provider, and for which language.
 *
 * @returns {Promise<{ enabled: boolean, lang: string }>} `lang` is `'pl'`
 *   when masking is declined, otherwise the chosen `'pl'` or `'en'`.
 */
async function askMask() {
    const wantsMasking = await confirm({
        message: 'Włączyć pseudonimizację PII (pseudonym-mcp) przed wysyłką do cloud?',
        default: false,
    });
    if (wantsMasking) {
        const languageChoices = [
            { name: 'Polski', value: 'pl' },
            { name: 'English', value: 'en' },
        ];
        const lang = await select({
            message: 'Język danych (dla regułek pseudonimizacji):',
            choices: languageChoices,
        });
        return { enabled: true, lang };
    }
    return { enabled: false, lang: 'pl' };
}
155
/**
 * Print a short hint and ask for free-text background about the user.
 *
 * @returns {Promise<string>} The entered context; the prompt default is `''`.
 */
async function askContext() {
    const hint = 'Krótki opis siebie/branży/aktualnych spraw. LLM użyje tego jako tła przy tagowaniu.\n' +
        'Przykład: "Freelancer-projektant w Warszawie. Klienci: AcmeCorp. Aktualne: rozliczenia 2024."\n';
    process.stdout.write(chalk.gray(hint));
    const prompt = {
        message: 'Kontekst (możesz pominąć, edytujesz później):',
        default: '',
    };
    return input(prompt);
}
163
/**
 * Ask whether the LLM may propose new (free-form) tags.
 *
 * @returns {Promise<boolean>} The user's answer; the prompt default is `false`.
 */
async function askFreeForm() {
    const prompt = {
        message: 'Pozwolić LLM-owi proponować NOWE tagi (free-form)? Pokażę je w podsumowaniu.',
        default: false,
    };
    return confirm(prompt);
}
169
/**
 * Ask how files should be sorted into folders.
 *
 * @param {string[]} allowed - Tag list stored as the `priority` order.
 * @returns {Promise<object>} Organize settings: the three answers (`target`,
 *   `strategy`, `unsorted`) plus fixed values `enabled: true`, an empty
 *   `folderMap`, `unsortedFolder: '_unsorted'` and `multiTag: 'primary'`.
 */
async function askOrganize(allowed) {
    const strategyChoices = [
        { name: 'Flat — jeden folder per tag (Faktura/, Bank/)', value: 'flat' },
        { name: 'Nested — priorytet wygrywa, plik trafia tylko do jednego folderu', value: 'nested' },
        { name: 'Custom — niestandardowe mapowanie tag→folder', value: 'custom' },
    ];
    const unsortedChoices = [
        { name: 'Przenieś do _unsorted/', value: 'move' },
        { name: 'Zostaw w miejscu', value: 'keep' },
        { name: 'Pomiń całkowicie', value: 'skip' },
    ];
    const target = await input({
        message: 'Folder docelowy dla sortowania:',
        default: DEFAULT_ORGANIZE.target,
    });
    const strategy = await select({
        message: 'Strategia układu folderów:',
        choices: strategyChoices,
    });
    const unsorted = await select({
        message: 'Co z plikami bez tagów?',
        choices: unsortedChoices,
    });
    const settings = {
        enabled: true,
        target,
        strategy,
        priority: allowed,
        folderMap: {},
        unsorted,
        unsortedFolder: '_unsorted',
        multiTag: 'primary',
    };
    return settings;
}
201
/**
 * Pick sample files from the scan folder and OCR them, reporting progress on
 * a spinner.
 *
 * @param {object} scan - Scan settings; `scan.folder` is the folder to sample.
 * @param {number} count - Number of files to request from `pickSampleFiles`.
 * @returns {Promise<{ files: string[], samples: object[], cfgBase: object }>}
 *   `cfgBase` is `DEFAULT_CONFIG` with `scan` replaced.
 * @throws {Error} When the folder does not exist or no files were selected.
 */
async function runDiscoverySampling(scan, count) {
    const cfgBase = { ...DEFAULT_CONFIG, scan };
    const root = path.resolve(expandHome(scan.folder));
    if (!existsSync(root)) {
        throw new Error(`Folder nie istnieje: ${root}`);
    }
    const files = await pickSampleFiles(root, cfgBase, { count, ocrEligibleOnly: true });
    if (files.length === 0) {
        throw new Error(`Brak plików do OCR w ${root}. Sprawdź rozszerzenia / wykluczenia.`);
    }
    const spinner = ora(`OCR próbki (${files.length} plików)…`).start();
    const reportProgress = (done, total, file) => {
        const label = file ? path.basename(file) : '';
        spinner.text = `OCR ${done}/${total}: ${label}`;
    };
    const samples = await ocrSamples(files, cfgBase, reportProgress);
    spinner.succeed(`OCR gotowy: ${samples.length} plików`);
    return { files, samples, cfgBase };
}
217
/**
 * Run the interactive setup wizard and save the resulting configuration.
 *
 * Steps: choose a mode, choose the scan folder, then (for every mode except
 * `'organize'`) choose the LLM, context and free-form setting, sample files,
 * and generate a taxonomy. Organize settings are asked for `'organize'` and
 * `'tag+organize'`. If sampling or taxonomy generation fails, the config built
 * so far is saved and the wizard returns early.
 *
 * @param {object} opts - Reads `opts.folderHint` (default scan folder),
 *   `opts.apiKey` (known cloud key) and `opts.configPath` (passed to `finalize`).
 * @returns {Promise<object>} The value returned by `finalize`.
 * @throws {Error} Propagates errors from the prompts and from `finalize`.
 */
export async function runWizard(opts) {
    process.stdout.write(chalk.bold.cyan('\n✨ sortai — interaktywny wizard\n\n'));
    const mode = await askMode();
    process.stdout.write('\n' + chalk.bold('📁 Folder\n'));
    const scan = await askScan(opts.folderHint ?? DEFAULT_CONFIG.scan.folder);
    let provider = DEFAULT_CONFIG.llm.provider;
    let model = DEFAULT_CONFIG.llm.model;
    let ollamaUrl = DEFAULT_CONFIG.llm.ollamaUrl;
    let apiKey;
    let mask = { ...DEFAULT_CONFIG.mask };
    let context = DEFAULT_CONFIG.context;
    let freeForm = false;
    // Every mode except plain 'organize' talks to the LLM.
    const needsLlm = mode === 'tag' || mode === 'tag+organize' || mode === 'discovery';
    if (needsLlm) {
        process.stdout.write('\n' + chalk.bold('🧠 LLM\n'));
        provider = await askProvider();
        if (provider === 'ollama') {
            const o = await askOllamaModel();
            model = o.model;
            ollamaUrl = o.ollamaUrl;
        }
        else {
            const c = await askCloud(provider, opts.apiKey);
            model = c.model;
            apiKey = c.apiKey || undefined;
            mask = await askMask();
        }
        // The section header is printed only when its prompts follow, so
        // organize-only runs no longer show an empty "Kontekst" section.
        process.stdout.write('\n' + chalk.bold('💬 Kontekst\n'));
        context = await askContext();
        freeForm = await askFreeForm();
    }
    let cfg = {
        ...DEFAULT_CONFIG,
        scan,
        llm: {
            provider,
            model,
            temperature: DEFAULT_CONFIG.llm.temperature,
            numPredict: DEFAULT_CONFIG.llm.numPredict,
            ollamaUrl,
            apiKey,
        },
        mask,
        tags: { ...DEFAULT_CONFIG.tags, freeForm },
        organize: { ...DEFAULT_ORGANIZE },
        // An empty answer falls back to the default context.
        context: context || DEFAULT_CONFIG.context,
    };
    if (needsLlm) {
        process.stdout.write('\n' + chalk.bold('🔬 Próbkowanie\n'));
        const sampleCount = Number(await input({
            message: 'Ile plików zsamplować do generacji taksonomii?',
            default: '30',
            validate: v => /^\d+$/.test(v) && Number(v) > 0 ? true : 'Liczba dodatnia',
        }));
        let sampling;
        try {
            sampling = await runDiscoverySampling(scan, sampleCount);
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            process.stdout.write(chalk.red(`\nProblem z próbkowaniem: ${msg}\n`));
            process.stdout.write(chalk.yellow('Zapisuję config bez taksonomii — uzupełnisz po dograniu plików.\n\n'));
            return finalize(cfg, opts, mode);
        }
        const { samples } = sampling;
        const lang = detectLanguages(samples.map(s => s.ocrText));
        const langs = languageList(lang);
        process.stdout.write(chalk.gray(` Wykryty język: ${langs.join(' + ')} ` +
            `(PL ${(lang.scores.pl * 100).toFixed(0)}% / EN ${(lang.scores.en * 100).toFixed(0)}%)\n`));
        process.stdout.write('\n' + chalk.bold('🪄 Generacja taksonomii (LLM)\n'));
        const taxSpin = ora(`Pytam ${provider}:${model} o taksonomię…`).start();
        let taxonomy;
        try {
            taxonomy = await generateTaxonomy(samples, langs, cfg.context, cfg);
            taxSpin.succeed(`Wygenerowano ${taxonomy.categories.length} kategorii`);
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            taxSpin.fail(`Nieudane: ${msg}`);
            process.stdout.write(chalk.yellow('Zapisuję config z domyślną taksonomią.\n\n'));
            return finalize(cfg, opts, mode);
        }
        if (provider === 'ollama') {
            process.stdout.write(chalk.yellow('\n⚠️ To DRAFT z lokalnego modelu — przejrzyj kategorie i popraw co trzeba.\n'));
        }
        if (mode !== 'discovery') {
            taxonomy = await refineTaxonomyLoop(taxonomy, {
                samples,
                langs,
                userContext: cfg.context,
                baseCfg: cfg,
            });
            cfg = applyTaxonomyToConfig(taxonomy, cfg);
        }
        else {
            // Discovery only shows the proposal; it is saved only on request.
            process.stdout.write('\n' + chalk.bold('🔎 Discovery — proponowana taksonomia:\n\n'));
            process.stdout.write(renderTaxonomyTable(taxonomy) + '\n');
            const save = await confirm({ message: 'Zapisać tę taksonomię do configu?', default: false });
            if (save)
                cfg = applyTaxonomyToConfig(taxonomy, cfg);
        }
    }
    if (mode === 'organize' || mode === 'tag+organize') {
        process.stdout.write('\n' + chalk.bold('📂 Sortowanie\n'));
        cfg.organize = await askOrganize(cfg.tags.allowed);
    }
    return finalize(cfg, opts, mode);
}
324
/**
 * Save the configuration and, unless in discovery mode, ask whether to run
 * the configured steps now.
 *
 * Prints a notice when a config already exists at the resolved path. When the
 * user declines to run now, the equivalent `sortai` commands are printed.
 *
 * @param {object} cfg - Configuration to save.
 * @param {object} opts - Reads `opts.configPath`.
 * @param {string} mode - Wizard mode.
 * @returns {Promise<{ config: object, configPath: string, mode: string,
 *   shouldRunTag: boolean, shouldRunOrganize: boolean }>}
 */
async function finalize(cfg, opts, mode) {
    process.stdout.write('\n' + chalk.bold('💾 Zapis\n'));
    const cfgPath = resolveConfigPath(opts.configPath);
    if (existsSync(cfgPath)) {
        process.stdout.write(chalk.gray(` Istniejący config zostanie zarchiwizowany: ${cfgPath}.bak.*\n`));
    }
    const savedPath = await saveConfig(cfg, opts.configPath);
    process.stdout.write(chalk.green(` ✓ Zapisano ${savedPath}\n`));

    const tagging = mode === 'tag' || mode === 'tag+organize';
    const organizing = mode === 'organize' || mode === 'tag+organize';
    const outcome = {
        config: cfg,
        configPath: savedPath,
        mode,
        shouldRunTag: false,
        shouldRunOrganize: false,
    };
    if (mode === 'discovery') {
        return outcome;
    }

    let question = 'Uruchomić teraz tagowanie?';
    if (tagging && organizing) {
        question = 'Uruchomić teraz tagowanie + sortowanie?';
    }
    else if (organizing) {
        question = 'Uruchomić teraz sortowanie?';
    }
    const runNow = await confirm({ message: question, default: false });
    if (runNow) {
        outcome.shouldRunTag = tagging;
        outcome.shouldRunOrganize = organizing;
        return outcome;
    }

    process.stdout.write(chalk.gray('\nMożesz uruchomić później:\n'));
    if (tagging) {
        process.stdout.write(` ${chalk.cyan(`sortai tag ${cfg.scan.folder}`)}\n`);
    }
    if (organizing) {
        process.stdout.write(` ${chalk.cyan(`sortai organize ${cfg.scan.folder} --apply`)}\n`);
    }
    return outcome;
}
@@ -0,0 +1,34 @@
1
/**
 * Stopwords per language. Every entry is padded with a space on both sides so
 * that only whole words match.
 */
const STOPWORDS = {
    pl: [' i ', ' w ', ' na ', ' z ', ' do ', ' się ', ' nie ', ' jest ', ' są ', ' oraz ', ' lub ', ' za ', ' od ', ' dla ', ' co ', ' to ', ' już ', ' tym ', ' przez ', ' przy ', ' aby '],
    en: [' the ', ' and ', ' of ', ' to ', ' for ', ' in ', ' on ', ' is ', ' are ', ' with ', ' by ', ' as ', ' at ', ' from ', ' this ', ' that ', ' be ', ' or ', ' has ', ' have '],
};

/**
 * Count occurrences of a space-padded word in `haystack`.
 * Adjacent hits may share their boundary space, so the scan resumes on the
 * last character of the previous hit.
 */
function countPaddedWord(haystack, word) {
    const step = Math.max(1, word.length - 1);
    let hits = 0;
    let idx = haystack.indexOf(word);
    while (idx !== -1) {
        hits++;
        idx = haystack.indexOf(word, idx + step);
    }
    return hits;
}

/**
 * Estimate the Polish/English mix of a set of texts by counting stopwords.
 *
 * Each text is lower-cased, has whitespace runs collapsed to single spaces,
 * and is padded with a space at both ends before counting. Falsy entries are
 * ignored.
 *
 * @param {Array<string | null | undefined>} texts - Texts to analyse.
 * @returns {{ dominant: 'pl' | 'en', scores: { pl: number, en: number }, bilingual: boolean }}
 *   `scores` are each language's share of all stopword hits (both 0 when there
 *   are none); `dominant` is `'pl'` on a tie; `bilingual` is true when the
 *   smaller share is at least 0.2.
 */
export function detectLanguages(texts) {
    const counts = { pl: 0, en: 0 };
    for (const txt of texts) {
        if (!txt)
            continue;
        const padded = ' ' + txt.toLowerCase().replace(/\s+/g, ' ') + ' ';
        for (const lang of Object.keys(STOPWORDS)) {
            for (const sw of STOPWORDS[lang]) {
                counts[lang] += countPaddedWord(padded, sw);
            }
        }
    }
    const total = counts.pl + counts.en;
    const ratios = {
        pl: total > 0 ? counts.pl / total : 0,
        en: total > 0 ? counts.en / total : 0,
    };
    const dominant = ratios.pl >= ratios.en ? 'pl' : 'en';
    const bilingual = total > 0 && Math.min(ratios.pl, ratios.en) >= 0.2;
    return { dominant, scores: ratios, bilingual };
}
30
/**
 * Turn a language-detection result into the list of languages to use.
 *
 * @param {{ bilingual: boolean, dominant: string }} result - Detection result.
 * @returns {string[]} `['pl', 'en']` when bilingual, otherwise a one-element
 *   list holding the dominant language.
 */
export function languageList(result) {
    return result.bilingual ? ['pl', 'en'] : [result.dominant];
}