plasalid 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/ai/agent.d.ts +6 -7
- package/dist/ai/agent.js +27 -11
- package/dist/ai/personas.js +48 -46
- package/dist/ai/system-prompt.js +1 -1
- package/dist/ai/tools/account-mutex.d.ts +1 -0
- package/dist/ai/tools/account-mutex.js +16 -0
- package/dist/ai/tools/index.js +4 -12
- package/dist/ai/tools/ingest.d.ts +1 -1
- package/dist/ai/tools/ingest.js +282 -242
- package/dist/ai/tools/merchants.js +1 -28
- package/dist/ai/tools/read.js +8 -8
- package/dist/ai/tools/record.js +3 -36
- package/dist/ai/tools/resolve.js +25 -22
- package/dist/ai/tools/scan.js +0 -1
- package/dist/ai/tools/types.d.ts +14 -21
- package/dist/cli/commands/record.js +1 -82
- package/dist/cli/commands/resolve.d.ts +5 -2
- package/dist/cli/commands/resolve.js +36 -5
- package/dist/cli/commands/revert.js +4 -2
- package/dist/cli/commands/rules.js +2 -2
- package/dist/cli/commands/scan.js +199 -128
- package/dist/cli/commands/status.js +5 -5
- package/dist/cli/index.js +8 -29
- package/dist/cli/ink/ScanDashboard.d.ts +49 -0
- package/dist/cli/ink/ScanDashboard.js +214 -0
- package/dist/cli/ink/scan_dashboard.d.ts +40 -25
- package/dist/cli/ink/scan_dashboard.js +139 -44
- package/dist/db/queries/account-balance.d.ts +1 -1
- package/dist/db/queries/questions.d.ts +62 -0
- package/dist/db/queries/questions.js +110 -0
- package/dist/db/queries/transactions.d.ts +1 -1
- package/dist/db/queries/unknowns.d.ts +17 -15
- package/dist/db/queries/unknowns.js +35 -39
- package/dist/db/schema.js +6 -28
- package/dist/scanner/audit/auditor.d.ts +31 -0
- package/dist/scanner/audit/auditor.js +72 -0
- package/dist/scanner/audit/engine.d.ts +10 -0
- package/dist/scanner/audit/engine.js +98 -0
- package/dist/scanner/audit/eventBus.d.ts +60 -0
- package/dist/scanner/audit/eventBus.js +35 -0
- package/dist/scanner/audit/passes/index.d.ts +11 -0
- package/dist/scanner/audit/passes/index.js +9 -0
- package/dist/scanner/audit/passes/types.d.ts +23 -0
- package/dist/scanner/audit/passes/types.js +1 -0
- package/dist/scanner/audit/types.d.ts +27 -0
- package/dist/scanner/audit/types.js +1 -0
- package/dist/scanner/auditor.d.ts +51 -0
- package/dist/scanner/auditor.js +80 -0
- package/dist/scanner/buffer/engine.d.ts +9 -0
- package/dist/scanner/buffer/engine.js +110 -0
- package/dist/scanner/buffer/sharedBuffer.d.ts +78 -0
- package/dist/scanner/buffer/sharedBuffer.js +130 -0
- package/dist/scanner/buffer/types.d.ts +67 -0
- package/dist/scanner/buffer/types.js +1 -0
- package/dist/scanner/buffer.d.ts +45 -38
- package/dist/scanner/buffer.js +93 -61
- package/dist/scanner/bus/engine.d.ts +11 -0
- package/dist/scanner/bus/engine.js +42 -0
- package/dist/scanner/bus/types.d.ts +53 -0
- package/dist/scanner/bus/types.js +1 -0
- package/dist/scanner/bus.d.ts +38 -0
- package/dist/scanner/bus.js +37 -0
- package/dist/scanner/chunk-worker.d.ts +19 -0
- package/dist/scanner/chunk-worker.js +67 -0
- package/dist/scanner/chunkWorker.d.ts +20 -0
- package/dist/scanner/chunkWorker.js +59 -0
- package/dist/scanner/chunker/chunker.d.ts +7 -0
- package/dist/scanner/chunker/chunker.js +60 -0
- package/dist/scanner/chunker.d.ts +7 -0
- package/dist/scanner/chunker.js +60 -0
- package/dist/scanner/converge.d.ts +29 -0
- package/dist/scanner/converge.js +15 -0
- package/dist/scanner/decrypt.d.ts +10 -0
- package/dist/scanner/decrypt.js +80 -0
- package/dist/scanner/engine/scanEngine.d.ts +24 -0
- package/dist/scanner/engine/scanEngine.js +87 -0
- package/dist/scanner/engine/types.d.ts +90 -0
- package/dist/scanner/engine/types.js +1 -0
- package/dist/scanner/engine.d.ts +90 -0
- package/dist/scanner/engine.js +84 -0
- package/dist/scanner/file-worker.d.ts +33 -0
- package/dist/scanner/file-worker.js +28 -0
- package/dist/scanner/fileWorker.d.ts +33 -0
- package/dist/scanner/fileWorker.js +22 -0
- package/dist/scanner/hooks/types.d.ts +25 -0
- package/dist/scanner/hooks/types.js +1 -0
- package/dist/scanner/hooks.d.ts +23 -0
- package/dist/scanner/hooks.js +1 -0
- package/dist/scanner/parse.d.ts +10 -0
- package/dist/scanner/parse.js +47 -0
- package/dist/scanner/passes/index.d.ts +8 -0
- package/dist/scanner/passes/index.js +6 -0
- package/dist/scanner/passes/types.d.ts +22 -0
- package/dist/scanner/passes/types.js +1 -0
- package/dist/scanner/pdf/chunker.d.ts +7 -0
- package/dist/scanner/pdf/chunker.js +60 -0
- package/dist/scanner/pdf/password-store.d.ts +34 -0
- package/dist/scanner/pdf/password-store.js +83 -0
- package/dist/scanner/pdf/pdf-unlock.d.ts +17 -0
- package/dist/scanner/pdf/pdf-unlock.js +50 -0
- package/dist/scanner/pdf/pdf.d.ts +17 -0
- package/dist/scanner/pdf/pdf.js +36 -0
- package/dist/scanner/pdf/state-machine.d.ts +60 -0
- package/dist/scanner/pdf/state-machine.js +64 -0
- package/dist/scanner/pdf/unlock.d.ts +22 -0
- package/dist/scanner/pdf/unlock.js +121 -0
- package/dist/scanner/phase-decrypt.d.ts +10 -0
- package/dist/scanner/phase-decrypt.js +80 -0
- package/dist/scanner/phase-parse.d.ts +10 -0
- package/dist/scanner/phase-parse.js +46 -0
- package/dist/scanner/phases/chunk.d.ts +8 -0
- package/dist/scanner/phases/chunk.js +13 -0
- package/dist/scanner/phases/commit.d.ts +12 -0
- package/dist/scanner/phases/commit.js +140 -0
- package/dist/scanner/phases/decrypt.d.ts +10 -0
- package/dist/scanner/phases/decrypt.js +80 -0
- package/dist/scanner/phases/parse.d.ts +10 -0
- package/dist/scanner/phases/parse.js +46 -0
- package/dist/scanner/phases/resolve.d.ts +10 -0
- package/dist/scanner/phases/resolve.js +17 -0
- package/dist/scanner/phases/review.d.ts +10 -0
- package/dist/scanner/phases/review.js +12 -0
- package/dist/scanner/progress.d.ts +14 -0
- package/dist/scanner/progress.js +21 -0
- package/dist/scanner/resolver-memory.d.ts +8 -0
- package/dist/scanner/resolver-memory.js +24 -0
- package/dist/scanner/resolver.d.ts +39 -0
- package/dist/scanner/resolver.js +196 -0
- package/dist/scanner/result.d.ts +17 -0
- package/dist/scanner/result.js +19 -0
- package/dist/scanner/run-passes.d.ts +30 -0
- package/dist/scanner/run-passes.js +15 -0
- package/dist/scanner/unlock.js +1 -1
- package/dist/scanner/worker.d.ts +19 -0
- package/dist/scanner/worker.js +67 -0
- package/dist/scanner/workers/chunkWorker.d.ts +20 -0
- package/dist/scanner/workers/chunkWorker.js +65 -0
- package/dist/scanner/workers/fileWorker.d.ts +32 -0
- package/dist/scanner/workers/fileWorker.js +22 -0
- package/package.json +1 -1
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure state machine for the unlock phase of a single file scan. Side effects
|
|
3
|
+
* (mupdf calls, prompts, DB reads) live in the orchestrator; this module only
|
|
4
|
+
* encodes the transition logic so it can be exhaustively unit-tested.
|
|
5
|
+
*/
|
|
6
|
+
/** Upper bound on user password attempts for a single encrypted file. */
export const MAX_PASSWORD_ATTEMPTS = 10;
/**
 * Report whether `state` is an end state of the unlock machine.
 *
 * @param state - Current unlock state; only its `kind` tag is inspected.
 * @returns `true` for `"done"` and `"failed"`, `false` for every other kind.
 */
export function isTerminal(state) {
    switch (state.kind) {
        case "done":
        case "failed":
            return true;
        default:
            return false;
    }
}
|
|
10
|
+
/**
 * Compute the next unlock state from the current state and an incoming event.
 * The function performs no I/O; it only maps (state, event) pairs to states.
 *
 * @param state - Current state, discriminated by `kind`.
 * @param event - Event produced by the orchestrator, discriminated by `kind`.
 * @returns The successor state.
 * @throws Error when the event is not valid for the current state, including
 * any event delivered to a terminal (`done` / `failed`) state.
 */
export function transition(state, event) {
    const from = state.kind;
    const on = event.kind;
    if (from === "init") {
        if (on === "INSPECTED_PLAINTEXT") {
            return { kind: "done", decrypted: event.bytes, outcome: { kind: "plaintext" } };
        }
        if (on === "INSPECTED_ENCRYPTED") {
            return { kind: "trying-stored", candidates: event.candidates };
        }
    }
    else if (from === "trying-stored") {
        if (on === "STORED_UNLOCK_OK") {
            const outcome = { kind: "from-store", storedId: event.usedStoredId };
            return { kind: "done", decrypted: event.decrypted, outcome };
        }
        if (on === "STORED_UNLOCK_EXHAUSTED") {
            // Stored candidates ran out: hand over to the user, starting at attempt 1.
            return { kind: "awaiting-user", attempt: 1 };
        }
    }
    else if (from === "awaiting-user") {
        if (on === "USER_CANCELLED") {
            return { kind: "failed", reason: "password required" };
        }
        if (on === "UNLOCK_OK") {
            const outcome = { kind: "from-user", password: event.password };
            return { kind: "done", decrypted: event.decrypted, outcome };
        }
        if (on === "UNLOCK_FAIL") {
            // The failing attempt is counted before deciding whether to re-prompt.
            const exhausted = state.attempt >= MAX_PASSWORD_ATTEMPTS;
            return exhausted
                ? { kind: "failed", reason: `incorrect password after ${MAX_PASSWORD_ATTEMPTS} attempts` }
                : { kind: "awaiting-user", attempt: state.attempt + 1 };
        }
    }
    // Terminal states and unmatched pairs all end up here.
    throw new Error(`Invalid unlock transition: ${state.kind} + ${event.kind}`);
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type Database from "libsql";
|
|
2
|
+
import { type UnlockOutcome } from "./state-machine.js";
|
|
3
|
+
/** Everything the unlock flow needs to process a single file. */
export interface UnlockCtx {
    /** Database handle used for stored-password lookups. */
    db: Database.Database;
    /** Path of the file being unlocked; also used to match stored passwords. */
    filePath: string;
    /** Raw file contents as read from disk (possibly encrypted). */
    bytes: Buffer;
    /** When false, the flow never prompts and reports a cancelled unlock instead. */
    interactive: boolean;
}
|
|
9
|
+
/**
 * Run the unlock flow for one file until the state machine reaches a
 * terminal state.
 *
 * @param ctx - Database handle, file path, raw bytes and interactivity flag.
 * @returns The decrypted bytes plus the outcome describing how they were
 * obtained (plaintext / from-store / from-user), so the caller can persist a
 * new password or record use of a stored one.
 */
export declare function unlockIfNeeded(ctx: UnlockCtx): Promise<{
    decrypted: Buffer;
    outcome: UnlockOutcome;
}>;
|
|
18
|
+
/**
 * Record the bookkeeping that follows a successful unlock.
 *
 * A stored-password hit bumps that entry's usage, a user-supplied password is
 * saved under a pattern derived from the file name, and plaintext files need
 * nothing.
 *
 * @param db - Database handle for the password store.
 * @param filePath - Path of the file that was unlocked.
 * @param outcome - Outcome returned by `unlockIfNeeded`.
 */
export declare function persistUnlockOutcome(db: Database.Database, filePath: string, outcome: UnlockOutcome): void;
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import inquirer from "inquirer";
|
|
2
|
+
import { basename } from "path";
|
|
3
|
+
import { config } from "../../config.js";
|
|
4
|
+
import { statusSpinner } from "../../cli/ux.js";
|
|
5
|
+
import { findCandidates, savePassword, recordUse, suggestPattern, } from "./password-store.js";
|
|
6
|
+
import { transition, isTerminal, MAX_PASSWORD_ATTEMPTS, } from "./state-machine.js";
|
|
7
|
+
import { isEncrypted, unlock } from "./pdf-unlock.js";
|
|
8
|
+
/**
 * Step the unlock state machine from `init` until it reports a terminal
 * state, performing the side effect for each intermediate state on the way.
 *
 * @param ctx - Database handle, file path, raw bytes and interactivity flag.
 * @returns `{ decrypted, outcome }` taken from the final `done` state.
 * @throws Error carrying the machine's `reason` when the final state is
 * `failed`, or a diagnostic error if the loop ends in any other state.
 */
export async function unlockIfNeeded(ctx) {
    let current = { kind: "init" };
    while (true) {
        if (isTerminal(current)) {
            break;
        }
        const nextEvent = await stepUnlock(current, ctx);
        current = transition(current, nextEvent);
    }
    switch (current.kind) {
        case "failed":
            throw new Error(current.reason);
        case "done": {
            const { decrypted, outcome } = current;
            return { decrypted, outcome };
        }
        default:
            throw new Error(`unlock loop exited in non-terminal state ${current.kind}`);
    }
}
|
|
27
|
+
/**
 * Perform the side effect that belongs to the current non-terminal state and
 * return the event describing what happened.
 *
 * - `init`: inspect the bytes; for encrypted files, look up stored candidates.
 * - `trying-stored`: try each stored candidate in order.
 * - `awaiting-user`: prompt for a password (interactive mode only) and try it.
 *
 * @param state - Current non-terminal unlock state.
 * @param ctx - Database handle, file path, raw bytes and interactivity flag.
 * @returns The event to feed into `transition`.
 * @throws Error if called with a terminal state; rethrows any error raised
 * while inspecting or decrypting, after marking the spinner as failed.
 */
async function stepUnlock(state, ctx) {
    switch (state.kind) {
        case "init": {
            const spinner = statusSpinner(`Inspecting ${basename(ctx.filePath)}...`);
            try {
                const encrypted = await isEncrypted(ctx.bytes);
                if (!encrypted) {
                    spinner.succeed(`${basename(ctx.filePath)} is not encrypted.`);
                    return { kind: "INSPECTED_PLAINTEXT", bytes: ctx.bytes };
                }
                const candidates = findCandidates(ctx.db, ctx.filePath, config.dbEncryptionKey);
                spinner.info(`${basename(ctx.filePath)} is encrypted (${candidates.length} saved password${candidates.length === 1 ? "" : "s"} match).`);
                return { kind: "INSPECTED_ENCRYPTED", candidates };
            }
            catch (err) {
                spinner.fail("Inspection failed.");
                throw err;
            }
        }
        case "trying-stored":
            return await tryStoredPasswords(ctx.bytes, state.candidates);
        case "awaiting-user": {
            // Non-interactive runs cannot prompt, so they count as a cancel.
            if (!ctx.interactive) {
                return { kind: "USER_CANCELLED" };
            }
            const password = await promptForPassword(basename(ctx.filePath), state.attempt);
            // An empty answer is treated as the user giving up.
            if (!password) {
                return { kind: "USER_CANCELLED" };
            }
            const spinner = statusSpinner("Decrypting...");
            let result;
            try {
                result = await unlock(ctx.bytes, password);
            }
            catch (err) {
                // Mirror the `init` case: never leave a spinner running on error.
                spinner.fail("Decryption failed.");
                throw err;
            }
            if (result.ok && result.decrypted) {
                spinner.succeed("Decrypted.");
                return { kind: "UNLOCK_OK", decrypted: result.decrypted, password };
            }
            spinner.fail(`Incorrect password (attempt ${state.attempt}/${MAX_PASSWORD_ATTEMPTS}).`);
            return { kind: "UNLOCK_FAIL" };
        }
        default:
            throw new Error(`stepUnlock called with terminal state ${state.kind}`);
    }
}
|
|
69
|
+
/**
 * Try each stored password candidate against the encrypted bytes, in order.
 *
 * @param bytes - Encrypted file contents.
 * @param candidates - Stored candidates; each provides `id`, `pattern` and
 * `password`.
 * @returns `STORED_UNLOCK_OK` with the decrypted bytes and the id of the
 * candidate that worked, or `STORED_UNLOCK_EXHAUSTED` when none did (or the
 * list was empty).
 * @throws Rethrows any error raised by `unlock`, after marking the spinner as
 * failed so it does not keep running.
 */
async function tryStoredPasswords(bytes, candidates) {
    if (candidates.length === 0) {
        return { kind: "STORED_UNLOCK_EXHAUSTED" };
    }
    const spinner = statusSpinner(`Trying saved password 1/${candidates.length}...`);
    try {
        for (let i = 0; i < candidates.length; i++) {
            const cand = candidates[i];
            spinner.text = `Trying saved password ${i + 1}/${candidates.length} (pattern ${cand.pattern})`;
            const result = await unlock(bytes, cand.password);
            if (result.ok && result.decrypted) {
                spinner.succeed(`Unlocked with saved password (pattern ${cand.pattern}).`);
                return {
                    kind: "STORED_UNLOCK_OK",
                    decrypted: result.decrypted,
                    usedStoredId: cand.id,
                };
            }
        }
    }
    catch (err) {
        // Stop the spinner before propagating, matching the inspection step.
        spinner.fail("Saved password check failed.");
        throw err;
    }
    spinner.info("No saved password matched. Asking the user.");
    return { kind: "STORED_UNLOCK_EXHAUSTED" };
}
|
|
90
|
+
/**
 * Ask the user for the password of `fileName` via a masked prompt.
 *
 * @param fileName - Display name of the file.
 * @param attempt - 1-based attempt number; later attempts show a counter.
 * @returns The entered password with surrounding whitespace trimmed; an empty
 * string when nothing was entered.
 */
async function promptForPassword(fileName, attempt) {
    let message;
    if (attempt === 1) {
        message = `This PDF is encrypted. Password for ${fileName}:`;
    }
    else {
        message = `Password for ${fileName} (attempt ${attempt}/${MAX_PASSWORD_ATTEMPTS}):`;
    }
    const question = { type: "password", name: "password", mask: "*", message };
    const answers = await inquirer.prompt([question]);
    const entered = answers.password ?? "";
    return String(entered).trim();
}
|
|
99
|
+
/**
 * Persistence handlers keyed by unlock outcome kind. Each handler receives
 * `(db, filePath, outcome)`.
 */
const PERSIST = {
    /** Plaintext files have nothing to persist. */
    plaintext() { },
    /** A stored password worked: record that it was used. */
    "from-store"(db, _filePath, o) {
        recordUse(db, o.storedId);
    },
    /** The user typed a working password: save it under a suggested pattern. */
    "from-user"(db, filePath, o) {
        const pattern = suggestPattern(filePath);
        const spinner = statusSpinner(`Saving password for pattern ${pattern}...`);
        try {
            savePassword(db, pattern, o.password, config.dbEncryptionKey);
            spinner.succeed(`Saved password for pattern ${pattern} in secure vault.`);
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            spinner.fail(`Could not save password: ${detail}`);
            throw err;
        }
    },
};
|
|
115
|
+
/**
|
|
116
|
+
* After a successful unlock: bump usage on a stored hit, save a fresh user
|
|
117
|
+
* password under a filename-pattern key, or no-op for plaintext.
|
|
118
|
+
*/
|
|
119
|
+
export function persistUnlockOutcome(db, filePath, outcome) {
|
|
120
|
+
PERSIST[outcome.kind](db, filePath, outcome);
|
|
121
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type Database from "libsql";
|
|
2
|
+
import type { ScanState } from "./engine.js";
|
|
3
|
+
import type { ScanHooks } from "./hooks.js";
|
|
4
|
+
/**
 * Phase 1 of a scan: discover files in the data directory, apply the optional
 * regex filter, and decrypt them one at a time (password prompts cannot share
 * a TTY, so this phase is sequential).
 *
 * Each file ends up in exactly one of `state.decrypted`, `state.skipped` or
 * `state.failed`. Every decrypted file also gets a freshly bootstrapped
 * `scanned_files` row whose id is attached to it.
 *
 * @param db - Database handle.
 * @param state - Mutable scan state that receives the results.
 * @param hooks - Optional before/after callbacks for this phase.
 */
export declare function decryptPhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { randomUUID } from "crypto";
|
|
2
|
+
import { readPdf } from "./pdf/pdf.js";
|
|
3
|
+
import { unlockIfNeeded, persistUnlockOutcome } from "./pdf/unlock.js";
|
|
4
|
+
import { scanDataDir } from "./walker.js";
|
|
5
|
+
import { tryExecute } from "./result.js";
|
|
6
|
+
function findScannedByHash(db, hash) {
|
|
7
|
+
return db
|
|
8
|
+
.prepare(`SELECT id FROM scanned_files WHERE file_hash = ?`)
|
|
9
|
+
.get(hash) ?? null;
|
|
10
|
+
}
|
|
11
|
+
/**
 * Read, de-duplicate and unlock a single file, reporting the result as a
 * tagged outcome instead of throwing.
 *
 * @param db - Database handle.
 * @param file - Walker entry providing `path`, `name` and `relPath`.
 * @param opts - `force` re-processes files whose hash is already recorded;
 * `interactive` allows password prompts.
 * @returns One of:
 * - `{ kind: "failed", error }` when reading, unlocking or persisting the
 *   unlock outcome fails;
 * - `{ kind: "skipped", existingScannedFileId }` when the hash is already
 *   recorded and `force` is off;
 * - `{ kind: "decrypted", file }` otherwise.
 */
async function decryptOne(db, file, opts) {
    const read = await tryExecute(() => readPdf(file.path));
    if (!read.ok)
        return { kind: "failed", error: `read failed: ${read.error}` };
    const pdf = read.value;
    const existing = findScannedByHash(db, pdf.hash);
    if (existing && !opts.force) {
        return { kind: "skipped", existingScannedFileId: existing.id };
    }
    const unlock = await tryExecute(() => unlockIfNeeded({
        db,
        filePath: file.path,
        bytes: pdf.bytes,
        interactive: opts.interactive,
    }));
    if (!unlock.ok)
        return { kind: "failed", error: unlock.error || "unlock failed" };
    // Persisting can throw (e.g. saving the password fails). Route that through
    // tryExecute like the other steps so one file's bookkeeping error is
    // reported as a per-file failure rather than aborting the whole phase.
    const persisted = await tryExecute(() => persistUnlockOutcome(db, file.path, unlock.value.outcome));
    if (!persisted.ok)
        return { kind: "failed", error: `could not persist unlock outcome: ${persisted.error}` };
    return {
        kind: "decrypted",
        file: {
            path: file.path,
            fileName: file.name,
            relPath: file.relPath,
            hash: pdf.hash,
            mime: pdf.mime,
            decryptedBytes: unlock.value.decrypted,
            // Set only when a forced re-scan supersedes an existing row.
            replacesPriorScannedFileId: existing?.id,
        },
    };
}
|
|
42
|
+
const APPLY = {
|
|
43
|
+
decrypted: (state, _file, o) => { state.decrypted.push(o.file); },
|
|
44
|
+
skipped: (state, file, o) => { state.skipped.push({ file, existingScannedFileId: o.existingScannedFileId }); },
|
|
45
|
+
failed: (state, file, o) => { state.failed.push({ file, error: o.error }); },
|
|
46
|
+
};
|
|
47
|
+
/**
|
|
48
|
+
* Bootstrap one scanned_files row per decrypted file. Chunk workers later
|
|
49
|
+
* stamp transactions with source_file_id, so the row must exist before any
|
|
50
|
+
* tool writes hit the DB. Status flips to 'scanned' after parse completes.
|
|
51
|
+
*/
|
|
52
|
+
function bootstrapScannedFiles(db, state) {
|
|
53
|
+
for (const file of state.decrypted) {
|
|
54
|
+
if (file.replacesPriorScannedFileId) {
|
|
55
|
+
db.prepare(`DELETE FROM scanned_files WHERE id = ?`).run(file.replacesPriorScannedFileId);
|
|
56
|
+
}
|
|
57
|
+
const sfId = `sf:${randomUUID()}`;
|
|
58
|
+
db.prepare(`INSERT INTO scanned_files (id, path, file_hash, mime, status) VALUES (?, ?, ?, ?, 'pending')`).run(sfId, file.path, file.hash, file.mime);
|
|
59
|
+
file.scannedFileId = sfId;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
/**
 * Phase 1: collect the files to scan, decrypt them one after another, sort
 * each result into `state.decrypted` / `state.skipped` / `state.failed`, then
 * bootstrap `scanned_files` rows for the decrypted ones.
 *
 * Files are processed sequentially because password prompts cannot share a
 * TTY. `state.options.regex`, when set, is applied case-insensitively to each
 * file's `relPath`; `interactive` defaults to `true`.
 *
 * @param db - Database handle.
 * @param state - Mutable scan state (reads `options`, writes `files` and the
 * three result buckets).
 * @param hooks - Optional `beforeDecrypt` / `afterDecrypt` callbacks.
 */
export async function decryptPhase(db, state, hooks) {
    await hooks.beforeDecrypt?.(state);
    let matcher = null;
    if (state.options.regex) {
        matcher = new RegExp(state.options.regex, "i");
    }
    state.files = scanDataDir().filter(entry => matcher === null || matcher.test(entry.relPath));
    const interactive = state.options.interactive ?? true;
    const force = Boolean(state.options.force);
    for (const file of state.files) {
        const outcome = await decryptOne(db, file, { force, interactive });
        const apply = APPLY[outcome.kind];
        apply(state, file, outcome);
    }
    bootstrapScannedFiles(db, state);
    await hooks.afterDecrypt?.(state);
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type Database from "libsql";
|
|
2
|
+
import type { ScanState } from "./engine.js";
|
|
3
|
+
import type { ScanHooks } from "./hooks.js";
|
|
4
|
+
/**
 * Phase 3 of a scan: run one file worker per decrypted file, several at a
 * time. Each file worker in turn processes that file's chunks in parallel.
 *
 * The scan id and progress sink travel on `ScanState`; chunk-worker tools
 * write to the database themselves and tick the progress sink as they go.
 *
 * @param db - Database handle.
 * @param state - Scan state providing decrypted files, chunks and options.
 * @param hooks - Optional before/after callbacks for this phase.
 */
export declare function parsePhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { runWithConcurrency } from "./concurrency.js";
|
|
2
|
+
import { runFileWorker } from "./file-worker.js";
|
|
3
|
+
import { errorMessage } from "./result.js";
|
|
4
|
+
/** File workers run concurrently when the caller does not specify a limit. */
const DEFAULT_MAX_FILE_WORKERS = 5;
/** Chunk workers per file when the caller does not specify a limit. */
const DEFAULT_MAX_CHUNK_WORKERS_PER_FILE = 5;
/** Largest worker count `clamp` will ever return. */
const HARD_CAP = 8;
/**
 * Turn a requested worker count into a whole number in `[1, HARD_CAP]`.
 *
 * `null` / `undefined` and values that coerce to NaN fall back to `fallback`
 * (previously NaN leaked through `Math.max` / `Math.min`). Fractional results
 * are truncated so the value is usable as a worker count.
 *
 * @param n - Requested count; may be missing.
 * @param fallback - Count to use when `n` is missing or not a number.
 * @returns An integer between 1 and `HARD_CAP` inclusive.
 */
const clamp = (n, fallback) => {
    const numeric = Number(n ?? fallback);
    const requested = Number.isNaN(numeric) ? fallback : numeric;
    return Math.trunc(Math.min(HARD_CAP, Math.max(1, requested)));
};
|
|
8
|
+
/**
 * Phase 3: run one file worker per decrypted file that has chunks, with
 * bounded concurrency, then mark the bootstrapped `scanned_files` rows as
 * 'scanned'.
 *
 * Chunks are grouped by `fileId` in a single pass (instead of re-filtering
 * the full chunk list for every file), and the status UPDATE is prepared
 * once. Worker failures are appended to `state.errors` with phase "parse".
 *
 * NOTE(review): every decrypted file with a `scannedFileId` is flipped to
 * 'scanned', including files whose worker failed — confirm this is intended.
 *
 * @param db - Database handle.
 * @param state - Scan state (reads `options`, `decrypted`, `chunks`,
 * `scanId`, `progress`; appends to `errors`).
 * @param hooks - Optional `beforeParse` / `afterParse` callbacks; also passed
 * to each file worker.
 */
export async function parsePhase(db, state, hooks) {
    await hooks.beforeParse?.(state);
    const maxFile = clamp(state.options.maxFileWorkers, DEFAULT_MAX_FILE_WORKERS);
    const maxChunk = clamp(state.options.maxChunkWorkersPerFile, DEFAULT_MAX_CHUNK_WORKERS_PER_FILE);
    // Single pass over the chunks; order within each file is preserved.
    const chunksByFileId = new Map();
    for (const chunk of state.chunks) {
        const bucket = chunksByFileId.get(chunk.fileId);
        if (bucket) {
            bucket.push(chunk);
        }
        else {
            chunksByFileId.set(chunk.fileId, [chunk]);
        }
    }
    const fileGroups = state.decrypted
        .map(file => ({
        fileId: file.path,
        scannedFileId: file.scannedFileId,
        // Copy so each group owns its array, as the per-file filter used to give.
        chunks: (chunksByFileId.get(file.path) ?? []).slice(),
    }))
        .filter(g => g.chunks.length > 0);
    const tasks = fileGroups.map(group => () => runFileWorker({
        db,
        scanId: state.scanId,
        scannedFileId: group.scannedFileId,
        progress: state.progress,
        fileId: group.fileId,
        chunks: group.chunks,
        maxChunkWorkers: maxChunk,
    }, hooks));
    const settled = await runWithConcurrency(tasks, maxFile);
    // settled[i] corresponds to fileGroups[i].
    for (let i = 0; i < settled.length; i++) {
        const r = settled[i];
        if (!r.ok)
            state.errors.push({ phase: "parse", target: fileGroups[i].fileId, error: errorMessage(r.error) });
    }
    const scannedIds = state.decrypted
        .map(file => file.scannedFileId)
        .filter(Boolean);
    if (scannedIds.length > 0) {
        const markScanned = db.prepare(`UPDATE scanned_files SET status = 'scanned', scanned_at = datetime('now') WHERE id = ?`);
        for (const id of scannedIds) {
            markScanned.run(id);
        }
    }
    await hooks.afterParse?.(state);
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type Database from "libsql";
|
|
2
|
+
import type { ScanState } from "../engine.js";
|
|
3
|
+
import type { ScanHooks } from "../hooks.js";
|
|
4
|
+
/**
 * Phase 2 of a scan: split every decrypted file into single-page chunks.
 * Files are handled one after another, since this is a cheap in-memory step
 * with nothing to contend over.
 *
 * @param _db - Unused; present so all phases share one signature.
 * @param state - Scan state; chunks are appended to `state.chunks`.
 * @param hooks - Optional before/after callbacks for this phase.
 */
export declare function chunkPhase(_db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { chunkPdf } from "../chunker.js";
|
|
2
|
+
/**
 * Phase 2: chunk each decrypted file in turn and append the resulting chunks
 * to `state.chunks`.
 *
 * @param _db - Unused; present so all phases share one signature.
 * @param state - Scan state (reads `decrypted`, appends to `chunks`).
 * @param hooks - Optional `beforeChunk` / `afterChunk` callbacks.
 */
export async function chunkPhase(_db, state, hooks) {
    await hooks.beforeChunk?.(state);
    for (const decryptedFile of state.decrypted) {
        const pages = await chunkPdf(decryptedFile);
        for (const page of pages) {
            state.chunks.push(page);
        }
    }
    await hooks.afterChunk?.(state);
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type Database from "libsql";
|
|
2
|
+
import { type TransactionInput } from "../../db/queries/transactions.js";
|
|
3
|
+
import type { ScanState } from "../engine.js";
|
|
4
|
+
import type { ScanHooks } from "../hooks.js";
|
|
5
|
+
/**
 * Phase 5 of a scan: write the shared buffer to the database.
 *
 * Each buffered transaction is committed in its own DB transaction, so a bad
 * row drops only itself and is reported as a `scan_commit_failure` unknown.
 * Every successful write also appends an action-log entry keyed to the scan
 * id, which lets the whole run be reverted as a unit.
 *
 * @param db - Database handle.
 * @param state - Scan state holding the buffer and the review decision.
 * @param hooks - Optional before/after callbacks for this phase.
 */
export declare function commitPhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
|
|
12
|
+
export type { TransactionInput };
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import { randomUUID } from "crypto";
|
|
2
|
+
import { validateTransaction, insertTransactionRows, } from "../../db/queries/transactions.js";
|
|
3
|
+
import { recordUnknown } from "../../db/queries/unknowns.js";
|
|
4
|
+
import { appendAction } from "../../db/queries/action-log.js";
|
|
5
|
+
import { tryExecute } from "../result.js";
|
|
6
|
+
/** Builders for action-log payloads, keyed by action type. */
const SERIALIZE = {
    /**
     * Payload for a recorded transaction: the header fields (with missing
     * `source_page` / `raw_descriptor` normalised to `null`) plus its postings.
     */
    record_transaction(t) {
        const header = {
            date: t.date,
            description: t.description,
            source_page: t.source_page ?? null,
            raw_descriptor: t.raw_descriptor ?? null,
        };
        return { transaction: header, postings: t.postings };
    },
};
|
|
17
|
+
/**
 * Commit a single buffered transaction inside its own DB transaction, so a
 * failure (for example a constraint violation) drops only this row.
 *
 * The row insert and its action-log entry are written together; the log
 * entry is keyed to `scanId` and targets the inserted transaction id.
 *
 * @param db - Database handle.
 * @param scanId - Correlation id stored on the action-log entry.
 * @param bt - Buffered transaction (`input`, `transaction_id`, `chunkId`).
 * @param fileIdByChunkId - Map from chunk id to scanned-file id; a miss
 * leaves `source_file_id` undefined.
 * @returns `{ kind: "ok", transactionId }` on success, otherwise
 * `{ kind: "failed", description, date, error }`.
 */
async function commitOneTransaction(db, scanId, bt, fileIdByChunkId) {
    const sourceFileId = fileIdByChunkId.get(bt.chunkId);
    const attempt = await tryExecute(() => {
        const validated = validateTransaction({
            ...bt.input,
            id: bt.transaction_id,
            source_file_id: sourceFileId,
        });
        const persist = db.transaction(() => {
            insertTransactionRows(db, validated);
            appendAction(db, {
                correlation_id: scanId,
                command: "scan",
                action_type: "record_transaction",
                target_id: validated.id,
                payload: SERIALIZE.record_transaction(validated),
            });
        });
        persist();
        return validated.id;
    });
    if (!attempt.ok) {
        return {
            kind: "failed",
            description: bt.input.description,
            date: bt.input.date,
            error: attempt.error,
        };
    }
    return { kind: "ok", transactionId: attempt.value };
}
|
|
51
|
+
function bootstrapScannedFiles(db, decrypted) {
|
|
52
|
+
const ids = [];
|
|
53
|
+
const fileIdByPath = new Map();
|
|
54
|
+
for (const file of decrypted) {
|
|
55
|
+
if (file.replacesPriorScannedFileId) {
|
|
56
|
+
db.prepare(`DELETE FROM scanned_files WHERE id = ?`).run(file.replacesPriorScannedFileId);
|
|
57
|
+
}
|
|
58
|
+
const sfId = `sf:${randomUUID()}`;
|
|
59
|
+
db.prepare(`INSERT INTO scanned_files (id, path, file_hash, mime, status) VALUES (?, ?, ?, ?, 'pending')`).run(sfId, file.path, file.hash, file.mime);
|
|
60
|
+
ids.push(sfId);
|
|
61
|
+
fileIdByPath.set(file.path, sfId);
|
|
62
|
+
}
|
|
63
|
+
return { ids, fileIdByPath };
|
|
64
|
+
}
|
|
65
|
+
/**
 * Build a Map from chunk id to scanned-file id.
 *
 * @param chunks - Chunks carrying `chunkId` and the `fileId` of their file.
 * @param fileIdByPath - Map from a chunk's `fileId` to a scanned-file id.
 * @returns A Map with one entry per chunk whose file has a truthy
 * scanned-file id; other chunks are left out.
 */
function buildChunkLookup(chunks, fileIdByPath) {
    const lookup = new Map();
    for (const { chunkId, fileId } of chunks) {
        const scannedFileId = fileIdByPath.get(fileId);
        if (!scannedFileId) {
            continue;
        }
        lookup.set(chunkId, scannedFileId);
    }
    return lookup;
}
|
|
74
|
+
function failuresToUnknowns(failures) {
|
|
75
|
+
return failures.map(f => ({
|
|
76
|
+
unknown_id: `bu:${randomUUID()}`,
|
|
77
|
+
chunkId: null,
|
|
78
|
+
transaction_id: null,
|
|
79
|
+
account_id: null,
|
|
80
|
+
kind: "scan_commit_failure",
|
|
81
|
+
prompt: `Could not record "${f.description}" on ${f.date}: ${f.error}. Review the source statement and re-enter via the record flow.`,
|
|
82
|
+
answer: null,
|
|
83
|
+
}));
|
|
84
|
+
}
|
|
85
|
+
function writeUnknowns(db, unknowns, fileIdByChunkId) {
|
|
86
|
+
const op = db.transaction(() => {
|
|
87
|
+
for (const u of unknowns) {
|
|
88
|
+
// Closed unknowns: their resolution side-effects already landed via other
|
|
89
|
+
// buffer mutations (update_posting / delete_transaction / etc.) during
|
|
90
|
+
// the audit pass. Skip persisting — leaving them out keeps the DB free
|
|
91
|
+
// of orphan rows masquerading as open work.
|
|
92
|
+
if (u.answer !== null)
|
|
93
|
+
continue;
|
|
94
|
+
const sfId = u.chunkId ? fileIdByChunkId.get(u.chunkId) ?? null : null;
|
|
95
|
+
recordUnknown(db, {
|
|
96
|
+
file_id: sfId,
|
|
97
|
+
transaction_id: u.transaction_id,
|
|
98
|
+
account_id: u.account_id,
|
|
99
|
+
kind: u.kind ?? null,
|
|
100
|
+
prompt: u.prompt,
|
|
101
|
+
options: u.options,
|
|
102
|
+
});
|
|
103
|
+
}
|
|
104
|
+
});
|
|
105
|
+
op();
|
|
106
|
+
}
|
|
107
|
+
/**
 * Phase 5: write the buffered scan results to the database. Does nothing
 * unless `state.review` is `"commit"`.
 *
 * Steps, in order: snapshot the buffer, bootstrap `scanned_files` rows,
 * commit each buffered transaction in its own DB transaction, persist the
 * open unknowns (including one per failed transaction), flip the new
 * `scanned_files` rows to 'scanned', and store a summary on
 * `state.committed`.
 *
 * @param db - Database handle.
 * @param state - Scan state (reads `review`, `buffer`, `decrypted`, `chunks`,
 * `scanId`; writes `committed`).
 * @param hooks - Optional `beforeCommit` / `afterCommit` callbacks.
 */
export async function commitPhase(db, state, hooks) {
    if (state.review !== "commit") {
        return;
    }
    await hooks.beforeCommit?.(state);
    const snapshot = state.buffer.snapshot();
    const bootstrapped = bootstrapScannedFiles(db, state.decrypted);
    const scannedFileIds = bootstrapped.ids;
    const fileIdByChunkId = buildChunkLookup(state.chunks, bootstrapped.fileIdByPath);
    const pending = snapshot.transactions.map(bt => commitOneTransaction(db, state.scanId, bt, fileIdByChunkId));
    const outcomes = await Promise.all(pending);
    let insertedTx = 0;
    const failures = [];
    for (const o of outcomes) {
        if (o.kind === "ok") {
            insertedTx += 1;
        }
        else if (o.kind === "failed") {
            failures.push(o);
        }
    }
    const failureUnknowns = failuresToUnknowns(failures);
    writeUnknowns(db, [...snapshot.unknowns, ...failureUnknowns], fileIdByChunkId);
    for (const sfId of scannedFileIds) {
        db.prepare(`UPDATE scanned_files SET status = 'scanned', scanned_at = datetime('now') WHERE id = ?`).run(sfId);
    }
    let unknownsResolved = 0;
    for (const u of snapshot.unknowns) {
        if (u.answer !== null) {
            unknownsResolved += 1;
        }
    }
    // Open = unanswered buffered unknowns plus one per failed transaction.
    const unknownsOpen = snapshot.unknowns.length - unknownsResolved + failureUnknowns.length;
    const outcome = {
        transactions: insertedTx,
        accounts: snapshot.accountsCreated.length,
        merchants: snapshot.merchantsCreated.length,
        unknownsOpen,
        unknownsResolved,
        scannedFileIds,
    };
    state.committed = outcome;
    await hooks.afterCommit?.(state, outcome);
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type Database from "libsql";
|
|
2
|
+
import type { ScanState } from "../engine.js";
|
|
3
|
+
import type { ScanHooks } from "../hooks.js";
|
|
4
|
+
/**
 * Phase 1 — walk the data dir, optionally filter by regex, decrypt each file
 * sequentially (password prompts can't share a TTY). Output partitions into
 * decrypted / skipped / failed via a kind-keyed dispatch map. Bootstrapped
 * scanned_files rows are tagged onto each DecryptedFile.
 *
 * @param db - Database handle used for the duplicate-hash lookup, for
 *   persisting unlock outcomes, and for inserting scanned_files rows.
 * @param state - Scan state. `state.options` (`regex`, `interactive`, `force`)
 *   is read; `state.files`, `state.decrypted`, `state.skipped` and
 *   `state.failed` are populated in place.
 * @param hooks - Optional `beforeDecrypt` / `afterDecrypt` callbacks, each
 *   awaited with the scan state.
 * @returns A promise that resolves once every file has been processed and the
 *   `afterDecrypt` hook (if any) has completed.
 */
export declare function decryptPhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { randomUUID } from "crypto";
|
|
2
|
+
import { readPdf } from "../pdf.js";
|
|
3
|
+
import { unlockIfNeeded, persistUnlockOutcome } from "../unlock.js";
|
|
4
|
+
import { scanDataDir } from "../walker.js";
|
|
5
|
+
import { tryExecute } from "../result.js";
|
|
6
|
+
function findScannedByHash(db, hash) {
|
|
7
|
+
return db
|
|
8
|
+
.prepare(`SELECT id FROM scanned_files WHERE file_hash = ?`)
|
|
9
|
+
.get(hash) ?? null;
|
|
10
|
+
}
|
|
11
|
+
/**
 * Read, de-duplicate and unlock a single candidate file.
 *
 * Resolves to one of three tagged outcomes:
 *  - `{ kind: "failed", error }` when reading or unlocking does not succeed;
 *  - `{ kind: "skipped", existingScannedFileId }` when a scanned_files row
 *    with the same hash exists and `opts.force` is falsy;
 *  - `{ kind: "decrypted", file }` otherwise, where `file` carries the path
 *    info, hash, mime, decrypted bytes and (if any) the id of the prior row
 *    it replaces.
 *
 * @param db - database handle passed to the hash lookup and unlock helpers.
 * @param file - walker entry; `path`, `name` and `relPath` are read.
 * @param opts - `{ force, interactive }` flags.
 */
async function decryptOne(db, file, opts) {
    const readResult = await tryExecute(() => readPdf(file.path));
    if (!readResult.ok) {
        return { kind: "failed", error: `read failed: ${readResult.error}` };
    }
    const doc = readResult.value;
    // A hash match means this content was recorded before; honour it unless forced.
    const prior = findScannedByHash(db, doc.hash);
    const reuseExisting = prior && !opts.force;
    if (reuseExisting) {
        return { kind: "skipped", existingScannedFileId: prior.id };
    }
    const unlockResult = await tryExecute(() => {
        const request = {
            db,
            filePath: file.path,
            bytes: doc.bytes,
            interactive: opts.interactive,
        };
        return unlockIfNeeded(request);
    });
    if (!unlockResult.ok) {
        const reason = unlockResult.error || "unlock failed";
        return { kind: "failed", error: reason };
    }
    const unlocked = unlockResult.value;
    persistUnlockOutcome(db, file.path, unlocked.outcome);
    const decryptedFile = {
        path: file.path,
        fileName: file.name,
        relPath: file.relPath,
        hash: doc.hash,
        mime: doc.mime,
        decryptedBytes: unlocked.decrypted,
        // Undefined unless a forced re-scan is superseding an existing row.
        replacesPriorScannedFileId: prior?.id,
    };
    return { kind: "decrypted", file: decryptedFile };
}
|
|
42
|
+
const APPLY = {
|
|
43
|
+
decrypted: (state, _file, o) => { state.decrypted.push(o.file); },
|
|
44
|
+
skipped: (state, file, o) => { state.skipped.push({ file, existingScannedFileId: o.existingScannedFileId }); },
|
|
45
|
+
failed: (state, file, o) => { state.failed.push({ file, error: o.error }); },
|
|
46
|
+
};
|
|
47
|
+
/**
|
|
48
|
+
* Bootstrap one scanned_files row per decrypted file. Chunk workers later
|
|
49
|
+
* stamp transactions with source_file_id, so the row must exist before any
|
|
50
|
+
* tool writes hit the DB. Status flips to 'scanned' after parse completes.
|
|
51
|
+
*/
|
|
52
|
+
function bootstrapScannedFiles(db, state) {
|
|
53
|
+
for (const file of state.decrypted) {
|
|
54
|
+
if (file.replacesPriorScannedFileId) {
|
|
55
|
+
db.prepare(`DELETE FROM scanned_files WHERE id = ?`).run(file.replacesPriorScannedFileId);
|
|
56
|
+
}
|
|
57
|
+
const sfId = `sf:${randomUUID()}`;
|
|
58
|
+
db.prepare(`INSERT INTO scanned_files (id, path, file_hash, mime, status) VALUES (?, ?, ?, ?, 'pending')`).run(sfId, file.path, file.hash, file.mime);
|
|
59
|
+
file.scannedFileId = sfId;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
/**
 * Phase 1 of a scan: discover candidate files, decrypt them one at a time,
 * and sort each result into the decrypted / skipped / failed buckets on the
 * scan state through the kind-keyed APPLY table. Files are handled strictly
 * in sequence (password prompts can't share a TTY). Once all files are
 * processed, a scanned_files row is bootstrapped for every decrypted file
 * and its id is tagged onto that file.
 *
 * Options read from `state.options`:
 *  - `regex`: when truthy, a case-insensitive filter applied to `relPath`;
 *  - `interactive`: defaults to true when null/undefined;
 *  - `force`: coerced to boolean.
 */
export async function decryptPhase(db, state, hooks) {
    await hooks.beforeDecrypt?.(state);
    const pattern = state.options.regex;
    const matcher = pattern ? new RegExp(pattern, "i") : null;
    const keep = matcher ? (f) => matcher.test(f.relPath) : () => true;
    state.files = scanDataDir().filter(keep);
    const opts = {
        force: Boolean(state.options.force),
        interactive: state.options.interactive ?? true,
    };
    // Deliberately sequential: each file is awaited before the next starts.
    for (const candidate of state.files) {
        const result = await decryptOne(db, candidate, opts);
        const apply = APPLY[result.kind];
        apply(state, candidate, result);
    }
    bootstrapScannedFiles(db, state);
    await hooks.afterDecrypt?.(state);
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type Database from "libsql";
|
|
2
|
+
import type { ScanState } from "../engine.js";
|
|
3
|
+
import type { ScanHooks } from "../hooks.js";
|
|
4
|
+
/**
 * Phase 3 — fan out FileWorkers in parallel. Each FileWorker fans out its
 * file's chunks in parallel internally. The scanId + progress sink are
 * threaded through ScanState; chunk-worker tools write to the DB directly
 * and tick the progress sink as they go.
 *
 * @param db - Database handle that chunk-worker tools write to directly.
 * @param state - Scan state carrying the scanId and the progress sink.
 * @param hooks - Scan lifecycle hooks. NOTE(review): which hooks this phase
 *   invokes is not visible from this declaration — confirm against the
 *   implementation.
 * @returns A promise that resolves with no value when the phase finishes.
 */
export declare function parsePhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
|