@woladi/sortai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +193 -0
- package/dist/cli.js +261 -0
- package/dist/config.js +70 -0
- package/dist/dedup.js +64 -0
- package/dist/defaults.js +123 -0
- package/dist/llm/cloud.js +67 -0
- package/dist/llm/index.js +127 -0
- package/dist/llm/local.js +43 -0
- package/dist/llm/prompt.js +75 -0
- package/dist/macos.js +55 -0
- package/dist/mask.js +83 -0
- package/dist/ocr.js +19 -0
- package/dist/pretag.js +33 -0
- package/dist/tags.js +36 -0
- package/dist/types.js +1 -0
- package/dist/walker.js +34 -0
- package/package.json +61 -0
package/dist/defaults.js
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
 * Default whitelist of tags. `normalizeTag` rejects every tag that is not in
 * the configured allowed list (or equal to the configured auto tag).
 * Order is preserved exactly as shipped.
 */
export const DEFAULT_ALLOWED_TAGS = [
    '#Bank',
    '#Faktura',
    '#FakturaProforma',
    '#Wyciag',
    '#Kredyt',
    '#KartaKredytowa',
    '#Podatki',
    '#CV',
    '#Kariera',
    '#Umowa',
    '#Oferta',
    '#Korespondencja',
    '#Pismo',
    '#Wniosek',
    '#Reklamacja',
    '#Skarga',
    '#Decyzja',
    '#Oswiadczenie',
    '#Ugoda',
    '#Protokol',
    '#Regulamin',
    '#Harmonogram',
    '#Nieruchomosc',
    '#Zdrowie',
    '#RODO',
    '#Wyslane',
    '#Odebrane',
    '#Draft',
    '#Zalacznik',
    '#Duplikat',
    '#PrawdopodobnaKopia',
    '#Skan',
    '#Screenshot',
    '#Nagranie',
    '#Email',
    '#Foto',
    '#Grafika',
    '#AI_Sorted',
];
|
|
12
|
+
/**
 * Default "strict" tags: tags the LLM may only assign when
 * `strictTagHasEvidence` finds one of the tag's evidence keywords in the
 * file name or OCR text.
 */
export const DEFAULT_STRICT_TAGS = [
    '#Bank',
    '#Faktura',
    '#KartaKredytowa',
    '#Kredyt',
    '#Podatki',
    '#RODO',
    '#Zdrowie',
    '#PrawdopodobnaKopia',
];
|
|
17
|
+
/**
 * Default alias table: maps alternative tag spellings (keys) to the
 * canonical tag (values). `normalizeTag` looks a tag up here before checking
 * it against the allowed list.
 */
export const DEFAULT_TAG_ALIASES = Object.fromEntries([
    ['#Karta', '#KartaKredytowa'],
    ['#Invoice', '#Faktura'],
    ['#Mortgage', '#Kredyt'],
    ['#Loan', '#Kredyt'],
    ['#Tax', '#Podatki'],
    ['#GDPR', '#RODO'],
    ['#Health', '#Zdrowie'],
    ['#Screen', '#Screenshot'],
    ['#Photo', '#Foto'],
    ['#Graphic', '#Grafika'],
    ['#Video', '#Nagranie'],
    ['#Resume', '#CV'],
    ['#Contract', '#Umowa'],
]);
|
|
32
|
+
/**
 * Default evidence keywords per strict tag. `strictTagHasEvidence` accepts a
 * strict tag only when at least one of its keywords occurs (case-insensitive
 * substring match) in the evidence text. Keywords padded with spaces are
 * presumably meant to match whole words only.
 */
export const DEFAULT_STRICT_EVIDENCE = {
    '#Bank': [
        'bank',
        'iban',
        'rachunek bankowy',
        'numer konta',
        'account number',
    ],
    '#Faktura': [
        'faktura',
        'invoice',
        ' vat ',
        'nip ',
    ],
    '#KartaKredytowa': [
        'karta kredytowa',
        'kartą kredytową',
        'credit card',
    ],
    '#Kredyt': [
        'kredyt',
        'pożyczka',
        'mortgage',
        'loan',
    ],
    '#Podatki': [
        'podatek',
        ' pit ',
        ' cit ',
        'urząd skarbowy',
        'tax return',
    ],
    '#RODO': [
        'rodo',
        'gdpr',
        'dane osobowe',
        'rozporządzenie o ochronie danych',
        'personal data',
    ],
    '#Zdrowie': [
        'nfz',
        ' zus ',
        'lekarz',
        'recepta',
        'apteka',
        'medical',
        'prescription',
    ],
    '#PrawdopodobnaKopia': [
        'copy',
        ' kopia',
        'duplikat',
        'duplicate',
        '(1)',
        '(2)',
        '(3)',
        '(4)',
        '(5)',
        ' 2.',
        ' 3.',
    ],
};
|
|
42
|
+
/**
 * Default pre-tagging rules. Each rule is `{ pattern, flags, tags }`;
 * `preTagFromPath` compiles `pattern` with `flags` into a RegExp and, when it
 * matches the path/OCR haystack, adds `tags`. Every default rule is
 * case-insensitive (`flags: 'i'`), so the table below lists only the pattern
 * source and the tags.
 */
export const DEFAULT_PATH_RULES = [
    ['\\bbank\\b|iban|rachunek', ['#Bank']],
    ['faktura|invoice', ['#Faktura']],
    ['proforma', ['#FakturaProforma']],
    ['wyci.g|wyciag|statement', ['#Wyciag', '#Bank']],
    ['karta.kredyt|kredyt.kart|credit.card', ['#KartaKredytowa', '#Bank']],
    ['kredyt|po.yczka|loan|mortgage', ['#Kredyt']],
    ['podatek|\\bpit\\b|\\bcit\\b|urz.d.skarbow|tax.return', ['#Podatki']],
    ['RODO|GDPR|ochrona.danych|personal.data', ['#RODO']],
    ['NFZ|ZUS|recepta|apteka|prescription|medical', ['#Zdrowie']],
    ['reklamacj|complaint', ['#Reklamacja']],
    ['\\bskarg', ['#Skarga']],
    ['umow[ae]|contract', ['#Umowa']],
    ['oferta|offer|proposal', ['#Oferta']],
    ['wniosek|wniosk|application', ['#Wniosek']],
    ['harmonogram|schedule|timeline', ['#Harmonogram']],
    ['oswiadczen|o.wiadczen|statement.of', ['#Oswiadczenie']],
    ['decyzja|decyzj|decision', ['#Decyzja']],
    ['protokol|protok..|protocol|minutes', ['#Protokol']],
    ['ugoda|settlement', ['#Ugoda']],
    ['regulamin|terms|\\btos\\b', ['#Regulamin']],
    ['pismo|letter|correspondence', ['#Pismo']],
    ['\\bcv\\b|resume|curriculum.vitae', ['#CV', '#Kariera']],
    ['linkedin|kariera|career', ['#Kariera']],
    ['mieszkanie|nieruchomo|apartment|property|real.estate', ['#Nieruchomosc']],
    ['screenshot|zrzut.ekranu', ['#Screenshot']],
    ['nagranie|recording|screen.rec', ['#Nagranie']],
    ['za..[aą]cznik|attachment|enclosure', ['#Zalacznik']],
    ['\\bIMG[\\ _-]?\\d+|\\bphoto\\b|\\bfoto\\b', ['#Foto']],
    ['[\\ \\-_](?:kopia|copy|duplikat)[\\ \\-_.]|\\(\\d+\\)', ['#PrawdopodobnaKopia']],
].map(([pattern, tags]) => ({ pattern, flags: 'i', tags }));
|
|
73
|
+
/**
 * Placeholder user context shipped with the default config. `buildPrompt`
 * inserts the configured context verbatim into the LLM prompt, so users are
 * expected to replace this text in their own config file.
 */
export const DEFAULT_CONTEXT = [
    'EDIT ME in ~/.config/sortai/config.json. 1-2 sentence description of yourself and ongoing matters —',
    'used by the LLM as background context to prefer the right tags.',
    'Example: "Self-employed graphic designer in Warsaw. Recurring clients: AcmeCorp, BetaInc.',
    'Documents in PL and EN. Active: tax filings 2024, AcmeCorp branding project."',
].join(' ');
|
|
77
|
+
/**
 * Built-in configuration. The `tags` section reuses the DEFAULT_* tables
 * defined above by reference (they are not copied).
 */
export const DEFAULT_CONFIG = {
    scan: {
        folder: '~/Desktop',
        excludeFolders: ['node_modules', '.git', '.cache'],
        skipExtensions: ['.ds_store', '.sig', '.localized', '.tmp', '.lock', '.pyc'],
        // Extensions for which extractOcrText actually runs OCR.
        ocrExtensions: ['.pdf', '.png', '.jpg', '.jpeg', '.webp', '.heic'],
        // Extensions that preTagFromPath tags as '#Nagranie'.
        videoExtensions: ['.mov', '.mp4', '.m4v'],
    },
    ocr: {
        // Cap applied to the OCR result itself.
        maxChars: 4000,
        // Cap applied to the OCR text embedded in the LLM prompt.
        llmMaxChars: 1500,
        // Page window handed to the OCR call.
        startPage: 1,
        maxPages: 2,
    },
    llm: {
        provider: 'ollama',
        model: 'mistral-nemo',
        temperature: 0.15,
        // Sent as num_predict to Ollama; cloud providers use max(numPredict, 512).
        numPredict: 300,
        ollamaUrl: 'http://localhost:11434',
    },
    // `lang` is forwarded to pseudonym-mcp as `--lang`.
    mask: { enabled: false, lang: 'pl' },
    dedup: { enabled: true, maxFileSizeMB: 200 },
    tags: {
        allowed: DEFAULT_ALLOWED_TAGS,
        strict: DEFAULT_STRICT_TAGS,
        aliases: DEFAULT_TAG_ALIASES,
        strictEvidence: DEFAULT_STRICT_EVIDENCE,
        pathRules: DEFAULT_PATH_RULES,
        autoTag: '#AI_Sorted',
    },
    context: DEFAULT_CONTEXT,
};
|
|
116
|
+
/**
 * Lower-case phrases that mark an LLM comment as useless ("no data",
 * "no text", "graphic file", ...). `isBadComment` replaces any comment
 * containing one of them with a file-name based fallback.
 */
export const BAD_COMMENT_PHRASES = [
    'brak danych',
    'brak pewnych danych',
    'brak tekstu',
    'plik graficzny',
    'plik wideo',
    // Was 'rak informacji' (missing leading "b"), inconsistent with the
    // other "brak ..." phrases.
    'brak informacji',
];
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import Anthropic from '@anthropic-ai/sdk';
|
|
2
|
+
/**
 * Error raised by the cloud LLM providers (Anthropic / OpenAI).
 */
export class CloudLlmError extends Error {
    /**
     * @param {string} message - Human-readable description.
     * @param {{ cause?: unknown }} [options] - Standard Error options; lets a
     *   wrapper keep the underlying failure as `cause`. Optional, so existing
     *   single-argument callers are unaffected.
     */
    constructor(message, options) {
        super(message, options);
        this.name = 'CloudLlmError';
    }
}
|
|
8
|
+
// System message shared by both cloud providers: asks for a bare JSON object
// of the shape {"tags": [...], "comment": "..."}.
const SYSTEM_PROMPT = [
    'You receive a Polish/English file-classification task. Respond with ONLY a valid JSON object',
    'matching {"tags": ["#Tag", ...], "comment": "..."}. No markdown, no commentary.',
].join(' ');
|
|
10
|
+
/**
 * Sends the prompt to Anthropic's Messages API and returns the model's text.
 *
 * @param {string} prompt - User message content.
 * @param {object} cfg - Config; reads `cfg.llm.apiKey`, `model`,
 *   `temperature` and `numPredict`.
 * @returns {Promise<string>} Text blocks of the reply joined with newlines
 *   and trimmed.
 * @throws {CloudLlmError} When the API key is missing or the call fails; in
 *   the latter case the original error is kept as `cause`.
 */
export async function callAnthropic(prompt, cfg) {
    if (!cfg.llm.apiKey) {
        throw new CloudLlmError('Missing API key for Anthropic provider. Pass --api-key or set ANTHROPIC_API_KEY.');
    }
    const client = new Anthropic({ apiKey: cfg.llm.apiKey });
    try {
        const msg = await client.messages.create({
            model: cfg.llm.model,
            // Never request fewer than 512 output tokens.
            max_tokens: Math.max(cfg.llm.numPredict, 512),
            temperature: cfg.llm.temperature,
            system: SYSTEM_PROMPT,
            messages: [{ role: 'user', content: prompt }],
        });
        // Only text blocks carry the answer; other block types are ignored.
        return msg.content
            .filter((b) => b.type === 'text')
            .map((b) => b.text)
            .join('\n')
            .trim();
    }
    catch (err) {
        const m = err instanceof Error ? err.message : String(err);
        const wrapped = new CloudLlmError(`Anthropic call failed: ${m}`);
        // Keep the underlying error (status, stack) reachable for debugging.
        wrapped.cause = err;
        throw wrapped;
    }
}
|
|
35
|
+
/**
 * Sends the prompt to OpenAI's chat-completions endpoint in JSON mode and
 * returns the content of the first choice.
 *
 * @param {string} prompt - User message content.
 * @param {object} cfg - Config; reads `cfg.llm.apiKey`, `model`,
 *   `temperature` and `numPredict`.
 * @param {number} [timeoutMs=90000] - Abort the request after this many
 *   milliseconds (same default as `callOllama`), so a stalled connection
 *   cannot hang the run indefinitely.
 * @returns {Promise<string>} Trimmed message content, or '' when absent.
 * @throws {CloudLlmError} When the API key is missing, the request times
 *   out, the HTTP status is not OK, or the response cannot be read. The
 *   original error is kept as `cause`.
 */
export async function callOpenAi(prompt, cfg, timeoutMs = 90_000) {
    if (!cfg.llm.apiKey) {
        throw new CloudLlmError('Missing API key for OpenAI provider. Pass --api-key or set OPENAI_API_KEY.');
    }
    try {
        const res = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                Authorization: `Bearer ${cfg.llm.apiKey}`,
            },
            body: JSON.stringify({
                model: cfg.llm.model,
                temperature: cfg.llm.temperature,
                // Never request fewer than 512 output tokens.
                max_tokens: Math.max(cfg.llm.numPredict, 512),
                response_format: { type: 'json_object' },
                messages: [
                    { role: 'system', content: SYSTEM_PROMPT },
                    { role: 'user', content: prompt },
                ],
            }),
            signal: AbortSignal.timeout(timeoutMs),
        });
        if (!res.ok) {
            // Include the response body when it can be read; it usually explains the failure.
            throw new Error(`OpenAI HTTP ${res.status}: ${await res.text().catch(() => '')}`);
        }
        const data = await res.json();
        return (data.choices?.[0]?.message?.content ?? '').trim();
    }
    catch (err) {
        const m = err instanceof Error ? err.message : String(err);
        const wrapped = new CloudLlmError(`OpenAI call failed: ${m}`);
        wrapped.cause = err;
        throw wrapped;
    }
}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import { mergeTags, normalizeTag, isStrictTag, strictTagHasEvidence } from '../tags.js';
|
|
2
|
+
import { BAD_COMMENT_PHRASES } from '../defaults.js';
|
|
3
|
+
import { buildPrompt, parseJsonSafe } from './prompt.js';
|
|
4
|
+
import { callOllama } from './local.js';
|
|
5
|
+
import { callAnthropic, callOpenAi } from './cloud.js';
|
|
6
|
+
/**
 * Routes the prompt to the provider named in `cfg.llm.provider`.
 * 'anthropic' and 'openai' go to their cloud clients; 'ollama' and every
 * other value go to the local Ollama client.
 *
 * @returns {Promise<string>} Raw text returned by the selected provider.
 */
async function dispatchProvider(prompt, cfg) {
    const provider = cfg.llm.provider;
    if (provider === 'anthropic') {
        return callAnthropic(prompt, cfg);
    }
    if (provider === 'openai') {
        return callOpenAi(prompt, cfg);
    }
    return callOllama(prompt, cfg);
}
|
|
17
|
+
/**
 * Asks the configured LLM for tags and a one-line comment for a file, then
 * validates the answer.
 *
 * Steps: optionally mask the OCR text, build and send the prompt, parse the
 * JSON reply, drop unknown tags and strict tags without evidence, apply the
 * contextual guards, merge with the pre-tags, and unmask the comment.
 *
 * @param {object} req - Reads `fileName`, `ext`, `ocrText` and `preTags`.
 * @param {object} cfg - Active configuration.
 * @param {object} [masker] - Optional object with `mask(text)` and
 *   `unmask(text, sessionId)`; when falsy the OCR text is sent as-is.
 * @returns {Promise<{tags: string[], comment: string}>} At most 6 tags and a
 *   comment of at most 500 characters. When the provider fails or returns
 *   nothing, the result is the pre-tags plus a file-name comment.
 */
export async function inferTagsAndComment(req, cfg, masker) {
    const fallback = {
        tags: mergeTags(cfg, req.preTags).slice(0, 6),
        comment: `Plik: ${req.fileName}.`,
    };
    let promptOcr = req.ocrText;
    let sessionId = '';
    if (masker && req.ocrText.trim()) {
        try {
            const m = await masker.mask(req.ocrText);
            promptOcr = m.maskedText;
            sessionId = m.sessionId;
        }
        catch (err) {
            // Best effort: a masking failure is reported and the unmasked text is used.
            const msg = err instanceof Error ? err.message : String(err);
            process.stderr.write(` ⚠️ Mask failed, sending raw OCR: ${msg}\n`);
        }
    }
    const prompt = buildPrompt(req, cfg, promptOcr);
    let raw;
    try {
        raw = await dispatchProvider(prompt, cfg);
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        process.stderr.write(` ⚠️ ${msg}\n`);
        return fallback;
    }
    if (!raw)
        return fallback;
    // A model may legally answer with JSON `null` or a bare scalar; reading
    // `.tags` off `null` would throw, so treat any non-object as "no data".
    const parsed = parseJsonSafe(raw);
    const data = parsed !== null && typeof parsed === 'object' ? parsed : {};
    const rawTags = Array.isArray(data.tags) ? data.tags : [];
    // Evidence is taken from the original (unmasked) file name and OCR text.
    const evidence = (req.fileName + ' ' + req.ocrText).toLowerCase();
    const cleaned = [];
    for (const t of rawTags) {
        const n = normalizeTag(t, cfg);
        if (!n)
            continue;
        // Strict tags are kept only when an evidence keyword is present.
        if (!isStrictTag(n, cfg) || strictTagHasEvidence(n, evidence, cfg)) {
            cleaned.push(n);
        }
    }
    applyContextualGuards(cleaned, req, cfg, evidence);
    // Merge order decides which tags survive the 6-tag cap:
    // evidenced strict tags, then pre-tags, then the remaining LLM tags.
    const strictFound = cleaned.filter(t => isStrictTag(t, cfg));
    const restLlm = cleaned.filter(t => !isStrictTag(t, cfg));
    let final = mergeTags(cfg, strictFound, req.preTags, restLlm);
    if (req.ext === '.pdf' && req.ocrText.trim() && !final.includes('#Skan')) {
        final = mergeTags(cfg, final, ['#Skan']);
    }
    let comment = typeof data.comment === 'string' ? data.comment.trim() : '';
    if (isBadComment(comment)) {
        comment = `Plik: ${req.fileName}.`;
    }
    if (masker && sessionId && comment) {
        try {
            comment = await masker.unmask(comment, sessionId);
        }
        catch (err) {
            // Keep the (still masked) comment rather than failing the file.
            const msg = err instanceof Error ? err.message : String(err);
            process.stderr.write(` ⚠️ Unmask failed: ${msg}\n`);
        }
    }
    return {
        tags: final.slice(0, 6),
        comment: comment.slice(0, 500),
    };
}
|
|
88
|
+
/**
 * Decides whether an LLM comment should be discarded.
 *
 * @param {string} c - Candidate comment.
 * @returns {boolean} true when the comment is empty, contains one of
 *   BAD_COMMENT_PHRASES (case-insensitive), or is shorter than 10 characters.
 */
function isBadComment(c) {
    if (!c) {
        return true;
    }
    const lowered = c.toLowerCase();
    const hasBannedPhrase = BAD_COMMENT_PHRASES.some((phrase) => lowered.includes(phrase));
    return hasBannedPhrase || lowered.length < 10;
}
|
|
98
|
+
/**
 * Removes tags from `cleaned` (in place) that contradict the file's context.
 *
 * Guards, in order:
 *  1. A file pre-tagged '#CV' loses all financial tags.
 *  2. An image with no OCR text loses all strict tags.
 *  3. '#Bank' is dropped when no banking keyword occurs in the evidence.
 *  4. '#CV' is dropped when '#Grafika' is also present.
 *
 * Every guard removes ALL occurrences of the offending tag, so a duplicate
 * returned by the LLM cannot slip through.
 *
 * @param {string[]} cleaned - Normalised LLM tags; mutated.
 * @param {object} req - Reads `preTags`, `ocrText` and `ext`.
 * @param {object} cfg - Reads `cfg.tags.strict`.
 * @param {string} evidence - Lower-cased file name + OCR text.
 * @returns {void}
 */
function applyContextualGuards(cleaned, req, cfg, evidence) {
    const financialBlock = new Set(['#Bank', '#Kredyt', '#KartaKredytowa', '#Wyciag', '#Podatki']);
    const imageExts = ['.png', '.jpg', '.jpeg', '.webp', '.heic'];
    // Iterate backwards so splicing does not skip elements.
    const remove = (predicate) => {
        for (let i = cleaned.length - 1; i >= 0; i--) {
            if (predicate(cleaned[i]))
                cleaned.splice(i, 1);
        }
    };
    const isCv = new Set(req.preTags).has('#CV');
    if (isCv) {
        remove(t => financialBlock.has(t));
    }
    const noOcr = !req.ocrText.trim() && imageExts.includes(req.ext);
    if (noOcr) {
        const strictSet = new Set(cfg.tags.strict);
        remove(t => strictSet.has(t));
    }
    if (cleaned.includes('#Bank')) {
        const bankKeywords = ['bank', 'iban', 'rachunek', 'wyciąg', 'prowizja', 'account'];
        if (!bankKeywords.some(k => evidence.includes(k))) {
            remove(t => t === '#Bank');
        }
    }
    if (cleaned.includes('#Grafika') && cleaned.includes('#CV')) {
        remove(t => t === '#CV');
    }
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
 * Error raised when the local Ollama server cannot be reached, times out,
 * or answers with an error.
 */
export class OllamaUnavailableError extends Error {
    /**
     * @param {string} message - Human-readable description.
     * @param {{ cause?: unknown }} [options] - Standard Error options; lets a
     *   wrapper keep the underlying failure as `cause`. Optional, so existing
     *   single-argument callers are unaffected.
     */
    constructor(message, options) {
        super(message, options);
        this.name = 'OllamaUnavailableError';
    }
}
|
|
7
|
+
/**
 * Sends the prompt to a local Ollama server (`/api/generate`, non-streaming,
 * JSON format) and returns the generated text.
 *
 * @param {string} prompt - Full prompt text.
 * @param {object} cfg - Config; reads `cfg.llm.ollamaUrl`, `model`,
 *   `temperature` and `numPredict`.
 * @param {number} [timeoutMs=90000] - Abort the request after this many ms.
 * @returns {Promise<string>} Trimmed `response` field, or '' when absent.
 * @throws {OllamaUnavailableError} On timeout, network failure, non-OK HTTP
 *   status or unreadable body. The original error is kept as `cause`.
 */
export async function callOllama(prompt, cfg, timeoutMs = 90_000) {
    // Tolerate a single trailing slash in the configured base URL.
    const url = `${cfg.llm.ollamaUrl.replace(/\/$/, '')}/api/generate`;
    const controller = new AbortController();
    const t = setTimeout(() => controller.abort(), timeoutMs);
    try {
        const res = await fetch(url, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                model: cfg.llm.model,
                prompt,
                stream: false,
                format: 'json',
                options: {
                    temperature: cfg.llm.temperature,
                    num_predict: cfg.llm.numPredict,
                },
            }),
            signal: controller.signal,
        });
        if (!res.ok) {
            throw new Error(`Ollama HTTP ${res.status}: ${await res.text().catch(() => '')}`);
        }
        const data = await res.json();
        return (data.response ?? '').trim();
    }
    catch (err) {
        const timedOut = err instanceof Error && err.name === 'AbortError';
        const detail = err instanceof Error ? err.message : String(err);
        const wrapped = new OllamaUnavailableError(timedOut
            ? `Ollama timeout after ${timeoutMs}ms at ${url}`
            : `Ollama call failed: ${detail}`);
        // Keep the underlying error reachable for debugging.
        wrapped.cause = err;
        throw wrapped;
    }
    finally {
        // Always disarm the timer so it cannot keep the process alive.
        clearTimeout(t);
    }
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
 * Builds the (Polish) classification prompt sent to the LLM.
 *
 * @param {object} req - Reads `fileName`, `ext` and `preTags`.
 * @param {object} cfg - Reads `cfg.ocr.llmMaxChars`, `cfg.tags.allowed`,
 *   `cfg.tags.strict`, `cfg.tags.autoTag` and `cfg.context`.
 * @param {string} ocrTextForPrompt - OCR text to embed (possibly masked);
 *   truncated to `llmMaxChars` and trimmed, with a placeholder when empty.
 * @returns {string} Prompt lines joined with '\n'.
 */
export function buildPrompt(req, cfg, ocrTextForPrompt) {
    const { strict, allowed, autoTag } = cfg.tags;
    const preTagsText = req.preTags.length > 0 ? req.preTags.join(', ') : 'brak';
    const ocrSnippet = ocrTextForPrompt.slice(0, cfg.ocr.llmMaxChars).trim()
        || '(brak tekstu – plik graficzny/wideo)';
    // "Safe" tags are the allowed tags that are neither strict nor the auto tag.
    const strictLookup = new Set(strict);
    const safeTags = allowed.filter((tag) => !strictLookup.has(tag) && tag !== autoTag);
    safeTags.sort();
    // Sort a copy so the configured array keeps its order.
    const strictTags = Array.from(strict);
    strictTags.sort();

    const header = [
        'Klasyfikujesz prywatne pliki w języku polskim i angielskim. Bądź OSTROŻNY i PRECYZYJNY.',
        'Zwróć WYŁĄCZNIE poprawny JSON (bez markdown, bez komentarzy):',
        '{"tags": ["#Tag1", "#Tag2"], "comment": "..."}',
        '',
    ];
    const fileSection = [
        'PLIK:',
        `- nazwa: ${req.fileName}`,
        `- rozszerzenie: ${req.ext}`,
        `- pre_tags (z nazwy/ścieżki/OCR – ZAWSZE uwzględnij): ${preTagsText}`,
        `- tekst z dokumentu (OCR):`,
        ocrSnippet,
        '',
    ];
    const contextSection = [
        'KONTEKST (tylko informacyjnie):',
        cfg.context,
        '',
    ];
    const taggingRules = [
        'ZASADY TAGOWANIA:',
        '1. Zawsze uwzględnij wszystkie pre_tags.',
        '2. Tagi BEZPIECZNE dodaj jeśli wynikają z OCR lub nazwy.',
        '3. Tagi STRICT dodaj TYLKO gdy słowo kluczowe DOSŁOWNIE występuje w OCR/nazwie',
        ' (lista evidence keywords per-tag jest w configu w polu strictEvidence).',
        ' BEZ DOWODU W TEKŚCIE – NIE DODAWAJ tagu strict.',
        '4. Zwróć 2-5 tagów.',
        '5. Bez tekstu OCR: użyj tylko pre_tags + #Grafika/#Foto.',
        '',
    ];
    const commentRules = [
        'ZASADY KOMENTARZA:',
        '- Napisz JEDNO konkretne zdanie po polsku opisujące WYŁĄCZNIE ten plik.',
        '- Komentarz musi wynikać z nazwy pliku lub treści OCR – nie wymyślaj.',
        '- NIE używaj żadnych przykładów z tej instrukcji jako komentarza.',
        "- Format: co to jest + czego dotyczy.",
        " Dla CV: 'CV z [rok] – [krótki kontekst, np. branża].'",
        " Dla faktury: 'Faktura od [wystawca] za [usługa].'",
        " Dla pisma z banku: 'Pismo z [bank] dotyczące [temat].'",
        " Dla grafiki bez tekstu: 'Grafika: [co widać na podstawie nazwy pliku].'",
        '',
    ];
    const tagLists = [
        'TAGI BEZPIECZNE:',
        safeTags.join('\n'),
        '',
        'TAGI STRICT (tylko z dowodem w OCR/nazwie):',
        strictTags.join('\n'),
        '',
    ];
    const restrictions = [
        'RESTRYKCJE:',
        '1. Jeśli OCR ma < 10 słów, użyj #Grafika i krótkiego komentarza opartego o nazwę pliku – nie zgaduj treści dokumentu.',
        '2. Komentarz musi być UNIKALNY. Nie używaj frazy "[rok]". Jeśli nie znasz daty, nie pisz o niej.',
        '3. BĄDŹ SCEPTYCZNY. Lepiej dać 1 tag (#Grafika) niż 5 błędnych.',
    ];
    return [
        ...header,
        ...fileSection,
        ...contextSection,
        ...taggingRules,
        ...commentRules,
        ...tagLists,
        ...restrictions,
    ].join('\n');
}
|
|
57
|
+
/**
 * Leniently parses an LLM reply that is supposed to be a JSON object.
 *
 * Tries the whole (trimmed) text first, then the widest `{ ... }` span found
 * inside it, which tolerates markdown fences or prose around the JSON.
 *
 * @param {string} raw - Raw model output.
 * @returns {object} The parsed plain object, or `{}` when nothing usable is
 *   found. Never returns `null`, an array or a primitive, so callers can read
 *   properties off the result without guarding.
 */
export function parseJsonSafe(raw) {
    const text = typeof raw === 'string' ? raw.trim() : '';
    const candidates = [text];
    const embedded = text.match(/\{[\s\S]*\}/);
    if (embedded && embedded[0] !== text) {
        candidates.push(embedded[0]);
    }
    for (const candidate of candidates) {
        let value;
        try {
            value = JSON.parse(candidate);
        }
        catch {
            // Not valid JSON; try the next candidate.
            continue;
        }
        // `JSON.parse` happily yields null, numbers, strings and arrays.
        if (value !== null && typeof value === 'object' && !Array.isArray(value)) {
            return value;
        }
    }
    return {};
}
|
package/dist/macos.js
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { execFile } from 'node:child_process';
|
|
2
|
+
import { promisify } from 'node:util';
|
|
3
|
+
import { promises as fs } from 'node:fs';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import os from 'node:os';
|
|
6
|
+
import { randomBytes } from 'node:crypto';
|
|
7
|
+
const execFileAsync = promisify(execFile);
|
|
8
|
+
const XATTR_TAGS = 'com.apple.metadata:_kMDItemUserTags';
|
|
9
|
+
const XATTR_COMMENT = 'com.apple.metadata:kMDItemFinderComment';
|
|
10
|
+
/**
 * Escapes text for use as XML element content in the generated plist.
 *
 * @param {string} s - Raw text (tag name or Finder comment).
 * @returns {string} Text with `&`, `<` and `>` replaced by their entities.
 */
function escapeXml(s) {
    // `&` must be replaced first, otherwise the ampersands introduced by the
    // other two replacements would be escaped a second time.
    return s
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}
|
|
16
|
+
/**
 * Stores a property list in an extended attribute of a file.
 *
 * The XML plist is written to a uniquely named temp file, converted in place
 * to binary format with `plutil`, read back as hex and handed to
 * `xattr -wx`. The temp file is removed afterwards whether or not the
 * commands succeed.
 *
 * @param {string} filePath - File that receives the attribute.
 * @param {string} attr - Extended attribute name.
 * @param {string} plistXml - XML property list source.
 * @returns {Promise<void>}
 */
async function writeBinaryPlistXattr(filePath, attr, plistXml) {
    const suffix = randomBytes(6).toString('hex');
    const scratchFile = path.join(os.tmpdir(), `sortai-${suffix}.plist`);
    await fs.writeFile(scratchFile, plistXml, 'utf8');
    try {
        await execFileAsync('plutil', ['-convert', 'binary1', scratchFile], { timeout: 5_000 });
        const binaryPlist = await fs.readFile(scratchFile);
        await execFileAsync('xattr', ['-wx', attr, binaryPlist.toString('hex'), filePath], { timeout: 5_000 });
    }
    finally {
        // Cleanup is best effort; a leftover temp file must not mask the real error.
        await fs.unlink(scratchFile).catch(() => { });
    }
}
|
|
28
|
+
/**
 * Deletes the tag and Finder-comment extended attributes from a file.
 * Failures of `xattr -d` are ignored (for example when the attribute is not
 * set).
 *
 * @param {string} filePath - File to clean.
 * @returns {Promise<void>}
 */
export async function clearMacosMetadata(filePath) {
    // One after the other, in the same order as before: tags, then comment.
    for (const attr of [XATTR_TAGS, XATTR_COMMENT]) {
        await execFileAsync('xattr', ['-d', attr, filePath], { timeout: 5_000 }).catch(() => { });
    }
}
|
|
32
|
+
/**
 * Writes the given tags into the file's user-tags extended attribute as a
 * binary plist array. Does nothing for an empty list.
 *
 * @param {string} filePath - Target file (resolved to an absolute path).
 * @param {string[]} tags - Tag names to store.
 * @returns {Promise<void>}
 */
export async function setMacosTags(filePath, tags) {
    if (tags.length === 0) {
        return;
    }
    // Each entry is "<name>\n0"; the trailing number is presumably the Finder
    // colour index (0 = none) — confirm against the tag attribute format.
    const entries = [];
    for (const tag of tags) {
        entries.push(` <string>${escapeXml(tag)}\n0</string>`);
    }
    const items = entries.join('\n');
    const plistXml = [
        `<?xml version="1.0" encoding="UTF-8"?>\n`,
        `<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n`,
        `<plist version="1.0">\n<array>\n${items}\n</array>\n</plist>\n`,
    ].join('');
    await writeBinaryPlistXattr(path.resolve(filePath), XATTR_TAGS, plistXml);
}
|
|
41
|
+
/**
 * Writes a comment into the file's Finder-comment extended attribute as a
 * binary plist string. The comment is cut to 500 characters and trimmed;
 * nothing is written when the result is empty.
 *
 * @param {string} filePath - Target file (resolved to an absolute path).
 * @param {string} comment - Comment text.
 * @returns {Promise<void>}
 */
export async function setFinderComment(filePath, comment) {
    const text = comment.slice(0, 500).trim();
    if (text.length === 0) {
        return;
    }
    const plistXml = [
        `<?xml version="1.0" encoding="UTF-8"?>\n`,
        `<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n`,
        `<plist version="1.0">\n<string>${escapeXml(text)}</string>\n</plist>\n`,
    ].join('');
    await writeBinaryPlistXattr(path.resolve(filePath), XATTR_COMMENT, plistXml);
}
|
|
50
|
+
/**
 * Replaces a file's tags and Finder comment: clears both attributes, writes
 * the new values, then launches `mdimport` on the file without waiting for
 * it or checking its outcome.
 *
 * @param {string} filePath - Target file.
 * @param {string[]} tags - Tags to store.
 * @param {string} comment - Finder comment to store.
 * @returns {Promise<void>}
 */
export async function writeFileMetadata(filePath, tags, comment) {
    // Clear first so that an empty tag list or comment leaves nothing stale behind.
    await clearMacosMetadata(filePath);
    await setMacosTags(filePath, tags);
    await setFinderComment(filePath, comment);
    // Fire and forget: the callback deliberately ignores errors and output.
    const ignoreOutcome = () => { };
    execFile('mdimport', [filePath], { timeout: 5_000 }, ignoreOutcome);
}
|
package/dist/mask.js
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
2
|
+
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
|
3
|
+
/**
 * Error raised by the pseudonymisation client (`Masker`).
 */
export class MaskError extends Error {
    /**
     * @param {string} message - Human-readable description.
     * @param {{ cause?: unknown }} [options] - Standard Error options; lets a
     *   wrapper keep the underlying failure as `cause`. Optional, so existing
     *   single-argument callers are unaffected.
     */
    constructor(message, options) {
        super(message, options);
        this.name = 'MaskError';
    }
}
|
|
9
|
+
/**
 * Client for the `pseudonym-mcp` MCP server, used to mask text before it is
 * sent to an LLM and to unmask the LLM's answer afterwards.
 *
 * Usage: `connect()` once, `mask()` / `unmask()` as needed, then `close()`.
 */
export class Masker {
    /** Connected MCP client, or null while disconnected. */
    client = null;
    /** Transport of the connected client, or null while disconnected. */
    transport = null;
    /** Configuration; `cfg.mask.lang` is passed to the server as `--lang`. */
    cfg;
    constructor(cfg) {
        this.cfg = cfg;
    }
    /**
     * Starts `pseudonym-mcp` through `npx` and connects to it over stdio.
     * No-op when already connected.
     *
     * `client` and `transport` are published only after the handshake has
     * succeeded, so `mask()` can never run against a half-open client and a
     * failed attempt leaves the instance cleanly disconnected.
     *
     * @returns {Promise<void>}
     * @throws {MaskError} When the server cannot be started or connected to;
     *   the original error is kept as `cause`.
     */
    async connect() {
        if (this.client)
            return;
        const transport = new StdioClientTransport({
            command: 'npx',
            args: ['-y', 'pseudonym-mcp', '--lang', this.cfg.mask.lang, '--engines', 'hybrid'],
        });
        const client = new Client({ name: 'sortai', version: '0.1.0' }, { capabilities: {} });
        try {
            await client.connect(transport);
        }
        catch (err) {
            // Release whatever the failed attempt started; errors here are irrelevant.
            try {
                await transport.close();
            }
            catch {
                // ignore
            }
            const m = err instanceof Error ? err.message : String(err);
            const wrapped = new MaskError(`Failed to start pseudonym-mcp: ${m}`);
            wrapped.cause = err;
            throw wrapped;
        }
        this.transport = transport;
        this.client = client;
    }
    /**
     * Masks `text` with the server's `mask_text` tool.
     *
     * @param {string} text - Text to pseudonymise.
     * @returns {Promise<{maskedText: string, sessionId: string}>} Falls back
     *   to the original text and an empty session id when the reply lacks
     *   the expected string fields.
     * @throws {MaskError} When not connected.
     */
    async mask(text) {
        if (!this.client)
            throw new MaskError('Masker not connected. Call connect() first.');
        const result = await this.client.callTool({
            name: 'mask_text',
            arguments: { text },
        });
        const parsed = extractJson(result);
        const maskedText = typeof parsed.masked_text === 'string' ? parsed.masked_text : text;
        const sessionId = typeof parsed.session_id === 'string' ? parsed.session_id : '';
        return { maskedText, sessionId };
    }
    /**
     * Restores the original values in `text` with the `unmask_text` tool.
     *
     * @param {string} text - Text containing pseudonyms.
     * @param {string} sessionId - Session id returned by `mask()`; when
     *   empty the text is returned unchanged.
     * @returns {Promise<string>} `unmasked_text`, else `text` from the
     *   reply, else the input unchanged.
     * @throws {MaskError} When not connected.
     */
    async unmask(text, sessionId) {
        if (!this.client)
            throw new MaskError('Masker not connected. Call connect() first.');
        if (!sessionId)
            return text;
        const result = await this.client.callTool({
            name: 'unmask_text',
            arguments: { text, session_id: sessionId },
        });
        const parsed = extractJson(result);
        if (typeof parsed.unmasked_text === 'string')
            return parsed.unmasked_text;
        return typeof parsed.text === 'string' ? parsed.text : text;
    }
    /**
     * Closes the connection, ignoring close errors. Safe to call repeatedly
     * or when never connected.
     *
     * @returns {Promise<void>}
     */
    async close() {
        if (this.client) {
            await this.client.close().catch(() => undefined);
            this.client = null;
        }
        this.transport = null;
    }
}
|
|
66
|
+
/**
 * Pulls a JSON payload out of an MCP tool result.
 *
 * Prefers `structuredContent` when it is an object; otherwise parses the
 * first text block in `content` as JSON.
 *
 * @param {unknown} result - Value returned by `client.callTool`.
 * @returns {object} The payload, or `{}` when there is none, it cannot be
 *   parsed, or it parses to something that is not an object.
 */
function extractJson(result) {
    const isObject = (value) => Boolean(value) && typeof value === 'object';
    if (!isObject(result)) {
        return {};
    }
    if (isObject(result.structuredContent)) {
        return result.structuredContent;
    }
    const firstText = result.content?.find((block) => block.type === 'text' && typeof block.text === 'string');
    const source = firstText?.text;
    if (!source) {
        return {};
    }
    let decoded;
    try {
        decoded = JSON.parse(source);
    }
    catch {
        return {};
    }
    return isObject(decoded) ? decoded : {};
}
|
package/dist/ocr.js
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { ocr } from 'macos-vision';
|
|
3
|
+
/**
 * Runs OCR on a file when its extension is listed in
 * `cfg.scan.ocrExtensions`.
 *
 * @param {string} filePath - File to read.
 * @param {object} cfg - Reads `cfg.scan.ocrExtensions` and `cfg.ocr`
 *   (`startPage`, `maxPages`, `maxChars`).
 * @returns {Promise<string>} Recognised text cut to `maxChars`; '' for
 *   other extensions or when OCR fails (the failure is reported on stderr).
 */
export async function extractOcrText(filePath, cfg) {
    const extension = path.extname(filePath).toLowerCase();
    const ocrSupported = cfg.scan.ocrExtensions.includes(extension);
    if (!ocrSupported) {
        return '';
    }
    const { startPage, maxPages, maxChars } = cfg.ocr;
    try {
        const recognised = await ocr(filePath, { startPage, maxPages });
        return recognised.slice(0, maxChars);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(` ⚠️ OCR error for ${filePath}: ${reason}\n`);
        return '';
    }
}
|
package/dist/pretag.js
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { mergeTags } from './tags.js';
|
|
3
|
+
/**
 * Derives deterministic "pre-tags" from the file path, OCR text and
 * extension, before any LLM is involved.
 *
 * @param {string} filePath - Path of the file.
 * @param {string} ocrText - OCR text ('' when none).
 * @param {object} cfg - Reads `cfg.tags.pathRules` and
 *   `cfg.scan.videoExtensions`; passed on to `mergeTags`.
 * @returns {string[]} Tags of every matching rule plus one
 *   extension-based tag where applicable.
 */
export function preTagFromPath(filePath, ocrText, cfg) {
    // Path separators, underscores and hyphens become spaces so that word
    // boundaries in the rules work on file names; OCR text is appended as-is.
    const searchable = `${filePath.replace(/[\\/_-]/g, ' ')} ${ocrText}`;
    // Rules come from user config, so an invalid pattern is skipped, not fatal.
    const compile = (rule) => {
        try {
            return new RegExp(rule.pattern, rule.flags ?? '');
        }
        catch {
            return null;
        }
    };
    let tags = [];
    for (const rule of cfg.tags.pathRules) {
        const matcher = compile(rule);
        if (matcher !== null && matcher.test(searchable)) {
            tags = mergeTags(cfg, tags, rule.tags);
        }
    }
    const extension = path.extname(filePath).toLowerCase();
    const imageExtensions = ['.png', '.jpg', '.jpeg', '.webp', '.heic'];
    let extensionTag = null;
    if (extension === '.eml') {
        extensionTag = '#Email';
    }
    else if (cfg.scan.videoExtensions.includes(extension)) {
        extensionTag = '#Nagranie';
    }
    else if (imageExtensions.includes(extension) && !ocrText.trim()) {
        // Only images without any recognised text count as plain graphics.
        extensionTag = '#Grafika';
    }
    return extensionTag === null ? tags : mergeTags(cfg, tags, [extensionTag]);
}
|
package/dist/tags.js
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
 * Canonicalises a raw tag: trims it, ensures the leading '#', resolves
 * aliases, and checks it against the allowed list (plus the auto tag).
 *
 * @param {unknown} raw - Candidate tag.
 * @param {object} cfg - Reads `cfg.tags.aliases`, `allowed` and `autoTag`.
 * @returns {string|null} The canonical tag, or null when `raw` is not a
 *   string, is blank, or is not allowed.
 */
export function normalizeTag(raw, cfg) {
    if (typeof raw !== 'string') {
        return null;
    }
    const trimmed = raw.trim();
    if (trimmed === '') {
        return null;
    }
    const hashed = trimmed.startsWith('#') ? trimmed : `#${trimmed}`;
    const canonical = cfg.tags.aliases[hashed] ?? hashed;
    const permitted = cfg.tags.allowed.includes(canonical) || canonical === cfg.tags.autoTag;
    return permitted ? canonical : null;
}
|
|
13
|
+
/**
 * Concatenates several tag lists into one list of normalised, unique tags,
 * keeping first-seen order. Falsy lists and tags rejected by `normalizeTag`
 * are skipped.
 *
 * @param {object} cfg - Configuration handed to `normalizeTag`.
 * @param {...(Iterable<unknown>|null|undefined)} lists - Tag lists in
 *   priority order.
 * @returns {string[]} Merged tags.
 */
export function mergeTags(cfg, ...lists) {
    // A Set keeps insertion order, which gives the same first-seen ordering.
    const unique = new Set();
    for (const list of lists) {
        if (!list) {
            continue;
        }
        for (const candidate of list) {
            const tag = normalizeTag(candidate, cfg);
            if (tag) {
                unique.add(tag);
            }
        }
    }
    return [...unique];
}
|
|
27
|
+
/**
 * Tells whether a tag is in the configured strict list.
 *
 * @param {string} tag - Canonical tag.
 * @param {object} cfg - Reads `cfg.tags.strict`.
 * @returns {boolean} true when `tag` is a strict tag.
 */
export function isStrictTag(tag, cfg) {
    const { strict } = cfg.tags;
    return strict.includes(tag);
}
|
|
30
|
+
/**
 * Checks whether the evidence text contains at least one of the keywords
 * configured for a strict tag.
 *
 * Matching is a case-insensitive substring test on whitespace-normalised
 * text: every run of whitespace (newlines, tabs, repeated spaces) counts as
 * one space and the evidence is padded with a space at both ends. Without
 * this, space-delimited keywords such as ' pit ' or ' vat ' would never
 * match a word that sits at a line break, which is the normal case in OCR
 * output, or at the very start or end of the text.
 *
 * @param {string} tag - Strict tag to verify.
 * @param {string} evidence - File name and OCR text.
 * @param {object} cfg - Reads `cfg.tags.strictEvidence`.
 * @returns {boolean} false when the tag has no keywords configured or none
 *   of them occurs in the evidence.
 */
export function strictTagHasEvidence(tag, evidence, cfg) {
    const keywords = cfg.tags.strictEvidence[tag] ?? [];
    if (keywords.length === 0)
        return false;
    const collapse = (text) => text.toLowerCase().replace(/\s+/g, ' ');
    const haystack = ` ${collapse(evidence)} `;
    return keywords.some(kw => haystack.includes(collapse(kw)));
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|