plasalid 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/README.md +2 -2
  2. package/dist/ai/agent.d.ts +6 -7
  3. package/dist/ai/agent.js +27 -11
  4. package/dist/ai/personas.js +48 -46
  5. package/dist/ai/system-prompt.js +1 -1
  6. package/dist/ai/tools/account-mutex.d.ts +1 -0
  7. package/dist/ai/tools/account-mutex.js +16 -0
  8. package/dist/ai/tools/index.js +4 -12
  9. package/dist/ai/tools/ingest.d.ts +1 -1
  10. package/dist/ai/tools/ingest.js +282 -242
  11. package/dist/ai/tools/merchants.js +1 -28
  12. package/dist/ai/tools/read.js +8 -8
  13. package/dist/ai/tools/record.js +3 -36
  14. package/dist/ai/tools/resolve.js +25 -22
  15. package/dist/ai/tools/scan.js +0 -1
  16. package/dist/ai/tools/types.d.ts +14 -21
  17. package/dist/cli/commands/record.js +1 -82
  18. package/dist/cli/commands/resolve.d.ts +5 -2
  19. package/dist/cli/commands/resolve.js +36 -5
  20. package/dist/cli/commands/revert.js +4 -2
  21. package/dist/cli/commands/rules.js +2 -2
  22. package/dist/cli/commands/scan.js +199 -128
  23. package/dist/cli/commands/status.js +5 -5
  24. package/dist/cli/index.js +8 -29
  25. package/dist/cli/ink/ScanDashboard.d.ts +49 -0
  26. package/dist/cli/ink/ScanDashboard.js +214 -0
  27. package/dist/cli/ink/scan_dashboard.d.ts +40 -25
  28. package/dist/cli/ink/scan_dashboard.js +139 -44
  29. package/dist/db/queries/account-balance.d.ts +1 -1
  30. package/dist/db/queries/questions.d.ts +62 -0
  31. package/dist/db/queries/questions.js +110 -0
  32. package/dist/db/queries/transactions.d.ts +1 -1
  33. package/dist/db/queries/unknowns.d.ts +17 -15
  34. package/dist/db/queries/unknowns.js +35 -39
  35. package/dist/db/schema.js +6 -28
  36. package/dist/scanner/audit/auditor.d.ts +31 -0
  37. package/dist/scanner/audit/auditor.js +72 -0
  38. package/dist/scanner/audit/engine.d.ts +10 -0
  39. package/dist/scanner/audit/engine.js +98 -0
  40. package/dist/scanner/audit/eventBus.d.ts +60 -0
  41. package/dist/scanner/audit/eventBus.js +35 -0
  42. package/dist/scanner/audit/passes/index.d.ts +11 -0
  43. package/dist/scanner/audit/passes/index.js +9 -0
  44. package/dist/scanner/audit/passes/types.d.ts +23 -0
  45. package/dist/scanner/audit/passes/types.js +1 -0
  46. package/dist/scanner/audit/types.d.ts +27 -0
  47. package/dist/scanner/audit/types.js +1 -0
  48. package/dist/scanner/auditor.d.ts +51 -0
  49. package/dist/scanner/auditor.js +80 -0
  50. package/dist/scanner/buffer/engine.d.ts +9 -0
  51. package/dist/scanner/buffer/engine.js +110 -0
  52. package/dist/scanner/buffer/sharedBuffer.d.ts +78 -0
  53. package/dist/scanner/buffer/sharedBuffer.js +130 -0
  54. package/dist/scanner/buffer/types.d.ts +67 -0
  55. package/dist/scanner/buffer/types.js +1 -0
  56. package/dist/scanner/buffer.d.ts +45 -38
  57. package/dist/scanner/buffer.js +93 -61
  58. package/dist/scanner/bus/engine.d.ts +11 -0
  59. package/dist/scanner/bus/engine.js +42 -0
  60. package/dist/scanner/bus/types.d.ts +53 -0
  61. package/dist/scanner/bus/types.js +1 -0
  62. package/dist/scanner/bus.d.ts +38 -0
  63. package/dist/scanner/bus.js +37 -0
  64. package/dist/scanner/chunk-worker.d.ts +19 -0
  65. package/dist/scanner/chunk-worker.js +67 -0
  66. package/dist/scanner/chunkWorker.d.ts +20 -0
  67. package/dist/scanner/chunkWorker.js +59 -0
  68. package/dist/scanner/chunker/chunker.d.ts +7 -0
  69. package/dist/scanner/chunker/chunker.js +60 -0
  70. package/dist/scanner/chunker.d.ts +7 -0
  71. package/dist/scanner/chunker.js +60 -0
  72. package/dist/scanner/converge.d.ts +29 -0
  73. package/dist/scanner/converge.js +15 -0
  74. package/dist/scanner/decrypt.d.ts +10 -0
  75. package/dist/scanner/decrypt.js +80 -0
  76. package/dist/scanner/engine/scanEngine.d.ts +24 -0
  77. package/dist/scanner/engine/scanEngine.js +87 -0
  78. package/dist/scanner/engine/types.d.ts +90 -0
  79. package/dist/scanner/engine/types.js +1 -0
  80. package/dist/scanner/engine.d.ts +90 -0
  81. package/dist/scanner/engine.js +84 -0
  82. package/dist/scanner/file-worker.d.ts +33 -0
  83. package/dist/scanner/file-worker.js +28 -0
  84. package/dist/scanner/fileWorker.d.ts +33 -0
  85. package/dist/scanner/fileWorker.js +22 -0
  86. package/dist/scanner/hooks/types.d.ts +25 -0
  87. package/dist/scanner/hooks/types.js +1 -0
  88. package/dist/scanner/hooks.d.ts +23 -0
  89. package/dist/scanner/hooks.js +1 -0
  90. package/dist/scanner/parse.d.ts +10 -0
  91. package/dist/scanner/parse.js +47 -0
  92. package/dist/scanner/passes/index.d.ts +8 -0
  93. package/dist/scanner/passes/index.js +6 -0
  94. package/dist/scanner/passes/types.d.ts +22 -0
  95. package/dist/scanner/passes/types.js +1 -0
  96. package/dist/scanner/pdf/chunker.d.ts +7 -0
  97. package/dist/scanner/pdf/chunker.js +60 -0
  98. package/dist/scanner/pdf/password-store.d.ts +34 -0
  99. package/dist/scanner/pdf/password-store.js +83 -0
  100. package/dist/scanner/pdf/pdf-unlock.d.ts +17 -0
  101. package/dist/scanner/pdf/pdf-unlock.js +50 -0
  102. package/dist/scanner/pdf/pdf.d.ts +17 -0
  103. package/dist/scanner/pdf/pdf.js +36 -0
  104. package/dist/scanner/pdf/state-machine.d.ts +60 -0
  105. package/dist/scanner/pdf/state-machine.js +64 -0
  106. package/dist/scanner/pdf/unlock.d.ts +22 -0
  107. package/dist/scanner/pdf/unlock.js +121 -0
  108. package/dist/scanner/phase-decrypt.d.ts +10 -0
  109. package/dist/scanner/phase-decrypt.js +80 -0
  110. package/dist/scanner/phase-parse.d.ts +10 -0
  111. package/dist/scanner/phase-parse.js +46 -0
  112. package/dist/scanner/phases/chunk.d.ts +8 -0
  113. package/dist/scanner/phases/chunk.js +13 -0
  114. package/dist/scanner/phases/commit.d.ts +12 -0
  115. package/dist/scanner/phases/commit.js +140 -0
  116. package/dist/scanner/phases/decrypt.d.ts +10 -0
  117. package/dist/scanner/phases/decrypt.js +80 -0
  118. package/dist/scanner/phases/parse.d.ts +10 -0
  119. package/dist/scanner/phases/parse.js +46 -0
  120. package/dist/scanner/phases/resolve.d.ts +10 -0
  121. package/dist/scanner/phases/resolve.js +17 -0
  122. package/dist/scanner/phases/review.d.ts +10 -0
  123. package/dist/scanner/phases/review.js +12 -0
  124. package/dist/scanner/progress.d.ts +14 -0
  125. package/dist/scanner/progress.js +21 -0
  126. package/dist/scanner/resolver-memory.d.ts +8 -0
  127. package/dist/scanner/resolver-memory.js +24 -0
  128. package/dist/scanner/resolver.d.ts +39 -0
  129. package/dist/scanner/resolver.js +196 -0
  130. package/dist/scanner/result.d.ts +17 -0
  131. package/dist/scanner/result.js +19 -0
  132. package/dist/scanner/run-passes.d.ts +30 -0
  133. package/dist/scanner/run-passes.js +15 -0
  134. package/dist/scanner/unlock.js +1 -1
  135. package/dist/scanner/worker.d.ts +19 -0
  136. package/dist/scanner/worker.js +67 -0
  137. package/dist/scanner/workers/chunkWorker.d.ts +20 -0
  138. package/dist/scanner/workers/chunkWorker.js +65 -0
  139. package/dist/scanner/workers/fileWorker.d.ts +32 -0
  140. package/dist/scanner/workers/fileWorker.js +22 -0
  141. package/package.json +1 -1
@@ -0,0 +1,28 @@
1
+ import { randomUUID } from "crypto";
2
+ import { runWithConcurrency } from "./concurrency.js";
3
+ import { runChunkWorker } from "./chunk-worker.js";
4
/**
 * Runs the chunk worker over every chunk of one file, keeping at most
 * `deps.maxChunkWorkers` chunks in flight at a time.
 *
 * Each chunk worker is handed the DB handle, scan id, scanned-file id and
 * progress sink from `deps`, plus the chunk itself. Per the original notes,
 * chunk-worker tools write transactions and unknowns straight to the DB
 * (scoped to `scanId`) and report per-row ticks through the progress sink.
 *
 * @param deps  File-level dependencies and the chunks to process. When
 *              `deps.chunkWorkerFn` is set it replaces `runChunkWorker`.
 * @param hooks Lifecycle hooks, forwarded untouched to every chunk worker.
 * @returns A fresh `fw:`-prefixed worker id, the file id, and the number of
 *          chunks that succeeded and failed.
 */
export async function runFileWorker(deps, hooks) {
    const workerId = `fw:${randomUUID()}`;
    const processChunk = deps.chunkWorkerFn ?? runChunkWorker;
    const jobs = deps.chunks.map(chunk => {
        return () => processChunk({
            db: deps.db,
            scanId: deps.scanId,
            scannedFileId: deps.scannedFileId,
            progress: deps.progress,
            chunk,
        }, hooks);
    });
    const outcomes = await runWithConcurrency(jobs, deps.maxChunkWorkers);
    let ok = 0;
    let failed = 0;
    for (const outcome of outcomes) {
        // Success requires both a settled task and a worker-reported ok flag.
        const succeeded = outcome.ok && outcome.value.ok;
        if (succeeded) {
            ok += 1;
        }
        else {
            failed += 1;
        }
    }
    return { workerId, fileId: deps.fileId, ok, failed };
}
@@ -0,0 +1,33 @@
1
+ import type Database from "libsql";
2
+ import { type ChunkWorkerDeps, type ChunkWorkerResult } from "./chunkWorker.js";
3
+ import type { ScanBuffer } from "./buffer.js";
4
+ import type { Chunk } from "./engine.js";
5
+ import type { ScanHooks } from "./hooks.js";
6
/**
 * Strategy used to process a single chunk. The file worker falls back to the
 * LLM-driven `runChunkWorker` when none is supplied; tests and alternate
 * flows (OCR-only, mock, dry-run) can inject their own implementation
 * without touching the file worker.
 */
export type ChunkWorkerFn = (deps: ChunkWorkerDeps, hooks: ScanHooks) => Promise<ChunkWorkerResult>;
/** Inputs `runFileWorker` needs to process all chunks of one file. */
export interface FileWorkerDeps {
    /** Database handle handed to every chunk worker. */
    readonly db: Database.Database;
    /** Scan buffer handed to every chunk worker. */
    readonly buffer: ScanBuffer;
    /** Identifier of the file being processed; echoed back in the result. */
    readonly fileId: string;
    /** The chunks to process. */
    readonly chunks: readonly Chunk[];
    /** Upper bound on chunks processed concurrently. */
    readonly maxChunkWorkers: number;
    /** Optional override; defaults to the LLM-backed runChunkWorker. */
    readonly chunkWorkerFn?: ChunkWorkerFn;
}
/** Per-file tally returned by `runFileWorker`. */
export interface FileWorkerResult {
    /** Identifier generated for this file-worker run. */
    readonly workerId: string;
    /** The `fileId` taken from the deps. */
    readonly fileId: string;
    /** Number of chunks whose worker reported success. */
    readonly ok: number;
    /** Number of chunks that threw or reported failure. */
    readonly failed: number;
}
/**
 * Processes every chunk of one file in parallel, up to `maxChunkWorkers` at
 * once. All collaborators arrive through `deps` and `hooks`; per the original
 * notes the function holds no global state and never reaches outside its
 * arguments.
 */
export declare function runFileWorker(deps: FileWorkerDeps, hooks: ScanHooks): Promise<FileWorkerResult>;
@@ -0,0 +1,22 @@
1
+ import { randomUUID } from "crypto";
2
+ import { runWithConcurrency } from "./concurrency.js";
3
+ import { runChunkWorker } from "./chunkWorker.js";
4
/**
 * Runs the chunk worker over every chunk of one file, keeping at most
 * `deps.maxChunkWorkers` chunks in flight at a time.
 *
 * The DB handle and scan buffer are injected through `deps` and handed to
 * each chunk worker together with the chunk; nothing is read from module or
 * global state.
 *
 * @param deps  File-level dependencies and the chunks to process. When
 *              `deps.chunkWorkerFn` is set it replaces `runChunkWorker`.
 * @param hooks Lifecycle hooks, forwarded untouched to every chunk worker.
 * @returns A fresh `fw:`-prefixed worker id, the file id, and the number of
 *          chunks that succeeded and failed.
 */
export async function runFileWorker(deps, hooks) {
    const workerId = `fw:${randomUUID()}`;
    const processChunk = deps.chunkWorkerFn ?? runChunkWorker;
    const jobs = deps.chunks.map(chunk => {
        return () => processChunk({ db: deps.db, buffer: deps.buffer, chunk }, hooks);
    });
    const outcomes = await runWithConcurrency(jobs, deps.maxChunkWorkers);
    let ok = 0;
    let failed = 0;
    for (const outcome of outcomes) {
        // Success requires both a settled task and a worker-reported ok flag.
        const succeeded = outcome.ok && outcome.value.ok;
        if (succeeded) {
            ok += 1;
        }
        else {
            failed += 1;
        }
    }
    return { workerId, fileId: deps.fileId, ok, failed };
}
@@ -0,0 +1,25 @@
1
+ import type { Chunk, ScanState, CommitOutcome, PhaseName } from "../engine/types.js";
2
+ import type { BufferSnapshot } from "../buffer/types.js";
3
/** A value that may be returned directly or through a promise. */
export type MaybePromise<T> = T | Promise<T>;
/**
 * Optional callbacks the scanner engine invokes at phase boundaries. The CLI
 * plugs in spinner/Ink rendering here and tests plug in assertions. Hooks are
 * best-effort: one that throws is logged and the phase carries on.
 *
 * All hooks except the two worker hooks may return a promise; the worker
 * hooks are declared synchronous (`void`).
 */
export interface ScanHooks {
    onStart?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
    afterDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeChunk?(s: Readonly<ScanState>): MaybePromise<void>;
    afterChunk?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeParse?(s: Readonly<ScanState>): MaybePromise<void>;
    /** Per-chunk worker start notification. */
    onWorkerStart?(workerId: string, chunk: Chunk): void;
    /** Per-chunk worker completion notification; `ok` carries the outcome. */
    onWorkerEnd?(workerId: string, chunk: Chunk, ok: boolean): void;
    afterParse?(s: Readonly<ScanState>): MaybePromise<void>;
    /** Receives the buffer snapshot alongside the scan state. */
    beforeReview?(s: Readonly<ScanState>, snapshot: BufferSnapshot): MaybePromise<void>;
    afterReview?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeCommit?(s: Readonly<ScanState>): MaybePromise<void>;
    /** Receives the commit outcome alongside the scan state. */
    afterCommit?(s: Readonly<ScanState>, outcome: CommitOutcome): MaybePromise<void>;
    /** Receives the error, the phase it is attributed to, and the scan state. */
    onError?(err: unknown, phase: PhaseName, s: Readonly<ScanState>): MaybePromise<void>;
    onFinish?(s: Readonly<ScanState>): MaybePromise<void>;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,23 @@
1
+ import type { Chunk, ScanState, PhaseName } from "./engine.js";
2
+ import type { ResolveSummary } from "./resolver.js";
3
/** A value that may be returned directly or through a promise. */
export type MaybePromise<T> = T | Promise<T>;
/**
 * Optional callbacks the engine invokes at phase boundaries. The CLI plugs in
 * spinner/Ink rendering here and tests plug in assertions. Hooks are
 * best-effort: one that throws is logged and the phase carries on.
 *
 * All hooks except the two worker hooks may return a promise; the worker
 * hooks are declared synchronous (`void`).
 */
export interface ScanHooks {
    onStart?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
    afterDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeChunk?(s: Readonly<ScanState>): MaybePromise<void>;
    afterChunk?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeParse?(s: Readonly<ScanState>): MaybePromise<void>;
    /** Per-chunk worker start notification. */
    onWorkerStart?(workerId: string, chunk: Chunk): void;
    /** Per-chunk worker completion notification; `ok` carries the outcome. */
    onWorkerEnd?(workerId: string, chunk: Chunk, ok: boolean): void;
    afterParse?(s: Readonly<ScanState>): MaybePromise<void>;
    beforeResolve?(s: Readonly<ScanState>): MaybePromise<void>;
    /** Receives the resolve summary alongside the scan state. */
    afterResolve?(s: Readonly<ScanState>, summary: ResolveSummary): MaybePromise<void>;
    /** Receives the error, the phase it is attributed to, and the scan state. */
    onError?(err: unknown, phase: PhaseName, s: Readonly<ScanState>): MaybePromise<void>;
    onFinish?(s: Readonly<ScanState>): MaybePromise<void>;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,10 @@
1
+ import type Database from "libsql";
2
+ import type { ScanState } from "./engine.js";
3
+ import type { ScanHooks } from "./hooks.js";
4
/**
 * Phase 3 of the scan: a two-level fan-out. Up to `maxFile` files run in
 * parallel, and within each file up to `maxChunk` chunks run in parallel.
 * Chunk-worker tools persist transactions and questions directly to the DB
 * (scoped to `scanId`) and tick the shared progress sink as they go.
 *
 * @param db    Database handle shared by all workers.
 * @param state Mutable scan state; parse errors are appended to it.
 * @param hooks Lifecycle hooks fired around the phase and per worker.
 */
export declare function parsePhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
@@ -0,0 +1,47 @@
1
+ import { runWithConcurrency } from "./concurrency.js";
2
+ import { runScanWorker } from "./worker.js";
3
+ import { errorMessage } from "./result.js";
4
/** File-level parallelism used when `options.maxFileWorkers` is not set. */
const DEFAULT_MAX_FILE_WORKERS = 5;
/** Per-file chunk parallelism used when `options.maxScanWorkersPerFile` is not set. */
const DEFAULT_MAX_SCAN_WORKERS_PER_FILE = 5;
/** Absolute ceiling applied to either worker limit, whatever the caller asks for. */
const HARD_CAP = 8;
/**
 * Normalises a requested worker count into a whole number in `[1, HARD_CAP]`.
 *
 * - `null` / `undefined` fall back to `fallback`.
 * - Values that are not numeric (e.g. `NaN`, or an unparsable string coming
 *   from CLI options) also fall back instead of leaking `NaN` into the
 *   concurrency limit.
 * - Fractional values are rounded down so the limit is always an integer.
 *
 * @param n        Requested worker count, possibly missing or malformed.
 * @param fallback Default used when `n` is missing or not numeric.
 * @returns An integer between 1 and `HARD_CAP` inclusive.
 */
const clamp = (n, fallback) => {
    const requested = Number(n ?? fallback);
    const limit = Math.floor(Number.isNaN(requested) ? fallback : requested);
    const bounded = Math.min(HARD_CAP, Math.max(1, limit));
    // Only reachable when the fallback itself is not numeric; never return NaN.
    return Number.isNaN(bounded) ? 1 : bounded;
};
8
/**
 * Phase 3 — two-tier fan-out: up to `maxFile` files in parallel, each file
 * processing up to `maxChunk` chunks in parallel. Chunk-worker tools write
 * transactions and questions directly to the DB (scoped to `scanId`) and tick
 * the shared progress sink.
 *
 * Failures are collected into `state.errors` rather than thrown: a rejected
 * file task is recorded against the file id, and a rejected chunk task is
 * recorded against the chunk id (previously chunk-level rejections were
 * silently dropped because the per-file results were never inspected).
 *
 * @param db    Database handle shared with every scan worker.
 * @param state Scan state; reads `options`, `decrypted`, `chunks`, `scanId`,
 *              `progress` and appends to `errors`.
 * @param hooks Lifecycle hooks; `beforeParse` / `afterParse` are awaited here
 *              and the object is forwarded to each scan worker.
 */
export async function parsePhase(db, state, hooks) {
    await hooks.beforeParse?.(state);
    const maxFile = clamp(state.options.maxFileWorkers, DEFAULT_MAX_FILE_WORKERS);
    const maxChunk = clamp(state.options.maxScanWorkersPerFile, DEFAULT_MAX_SCAN_WORKERS_PER_FILE);
    // Bucket chunks by owning file in one pass instead of re-filtering the
    // whole chunk list for every decrypted file.
    const chunksByFile = new Map();
    for (const chunk of state.chunks) {
        const bucket = chunksByFile.get(chunk.fileId);
        if (bucket)
            bucket.push(chunk);
        else
            chunksByFile.set(chunk.fileId, [chunk]);
    }
    const fileGroups = state.decrypted
        .map(file => ({
        fileId: file.path,
        scannedFileId: file.scannedFileId,
        chunks: chunksByFile.get(file.path) ?? [],
    }))
        .filter(g => g.chunks.length > 0);
    const fileTasks = fileGroups.map(group => () => {
        const chunkTasks = group.chunks.map(chunk => () => runScanWorker({
            db,
            scanId: state.scanId,
            scannedFileId: group.scannedFileId,
            progress: state.progress,
            chunk,
        }, hooks));
        return runWithConcurrency(chunkTasks, maxChunk);
    });
    const settled = await runWithConcurrency(fileTasks, maxFile);
    for (let i = 0; i < settled.length; i++) {
        const r = settled[i];
        const group = fileGroups[i];
        if (!r.ok) {
            state.errors.push({ phase: "parse", target: group.fileId, error: errorMessage(r.error) });
            continue;
        }
        // The file task resolves to the per-chunk settled results; surface any
        // chunk task that rejected.
        // NOTE(review): a worker that resolves with its own failure flag is not
        // reported here — confirm whether runScanWorker records those itself.
        const chunkResults = Array.isArray(r.value) ? r.value : [];
        for (let j = 0; j < chunkResults.length; j++) {
            const c = chunkResults[j];
            if (c && !c.ok) {
                const target = group.chunks[j]?.chunkId ?? group.fileId;
                state.errors.push({ phase: "parse", target, error: errorMessage(c.error) });
            }
        }
    }
    // NOTE(review): every decrypted file with a scannedFileId is marked
    // 'scanned' even if its parse failed above — confirm this is intended.
    for (const file of state.decrypted) {
        if (!file.scannedFileId)
            continue;
        db.prepare(`UPDATE scanned_files SET status = 'scanned', scanned_at = datetime('now') WHERE id = ?`).run(file.scannedFileId);
    }
    await hooks.afterParse?.(state);
}
@@ -0,0 +1,8 @@
1
+ import type { AuditPass } from "./types.js";
2
/**
 * Registry of audit passes. At startup the audit engine indexes the entries
 * by the event `kinds` each one declares, then dispatches only the matching
 * passes for every buffer event. To add behaviour, append a pass to this
 * array — the engine itself should not need editing.
 */
export declare const AUDIT_PASSES: readonly AuditPass[];
export type { AuditPass };
@@ -0,0 +1,6 @@
1
/**
 * Registry of audit passes. At startup the audit engine indexes the entries
 * by the event `kinds` each one declares, then dispatches only the matching
 * passes for every buffer event. To add behaviour, append a pass to this
 * array — the engine itself should not need editing.
 *
 * The registry currently ships empty.
 */
export const AUDIT_PASSES = [];
@@ -0,0 +1,22 @@
1
+ import type Database from "libsql";
2
+ import type { BufferEvent, EventKind } from "../bus.js";
3
+ import type { ScanBuffer } from "../buffer.js";
4
/** Shared resources handed to every audit pass invocation. */
export interface AuditContext {
    /** Database handle available to the pass. */
    readonly db: Database.Database;
    /** The scan buffer the pass may mutate. */
    readonly buffer: ScanBuffer;
    /** Optional tally bucket — passes increment their own count for the review summary. */
    readonly tally: Record<string, number>;
}
/**
 * A deterministic, rule-based reaction to a `BufferEvent`. Passes mutate the
 * `ScanBuffer` — applying memory rules, de-duplicating, linking to
 * recurrences and so on — so the data is already clean by the time the
 * review TUI shows it.
 *
 * Each pass states the event kinds it handles up front in `kinds`; the audit
 * engine builds its index once and only dispatches matching passes per event.
 */
export interface AuditPass {
    /** Name identifying the pass. */
    readonly name: string;
    /** Event kinds this pass wants to receive. */
    readonly kinds: readonly EventKind[];
    /** Handles one event using the shared context. */
    apply(event: BufferEvent, ctx: AuditContext): Promise<void>;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,7 @@
1
+ import type { Chunk, DecryptedFile } from "../engine.js";
2
/**
 * Splits a decrypted PDF into one `Chunk` per page. Every chunk is itself a
 * complete, valid PDF, so the per-chunk LLM agent sees a single clean page
 * with no neighbouring pages attached.
 *
 * @param file The decrypted file to split.
 * @returns The chunks in page order; empty when the document has no pages.
 */
export declare function chunkPdf(file: DecryptedFile): Promise<Chunk[]>;
@@ -0,0 +1,60 @@
1
/** Memoised dynamic import of the mupdf module; `null` until first requested. */
let mupdfPromise = null;
/**
 * Returns the promise for the lazily imported `mupdf` module. The import is
 * started on the first call and the same promise is handed back afterwards.
 */
function getMupdf() {
    if (mupdfPromise) {
        return mupdfPromise;
    }
    mupdfPromise = import("mupdf");
    return mupdfPromise;
}
7
/**
 * Produces the Chunk containing only page `pageIndex` (zero-based) of `file`.
 *
 * mupdf offers no direct "extract page range" call, so the source bytes are
 * opened as a fresh document and every other page is deleted. Deletion runs
 * from the last page towards the first so earlier indices are not shifted by
 * a removal. The document is destroyed in `finally`, so a failing
 * `saveToBuffer` cannot leak it.
 *
 * @param file      Decrypted file providing bytes, mime and path metadata.
 * @param pageIndex Zero-based index of the page to keep.
 * @param pageCount Total number of pages in the document.
 * @returns A chunk whose `pageNumber` is one-based.
 */
async function extractPage(file, pageIndex, pageCount) {
    const mupdf = await getMupdf();
    const doc = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    try {
        for (let idx = pageCount - 1; idx >= 0; idx -= 1) {
            if (idx === pageIndex) {
                continue;
            }
            doc.deletePage(idx);
        }
        const saved = doc.saveToBuffer("decrypt");
        const pageNumber = pageIndex + 1;
        return {
            chunkId: `${file.path}#p${pageNumber}`,
            fileId: file.path,
            fileName: file.fileName,
            relPath: file.relPath,
            pageNumber,
            totalPages: pageCount,
            bytes: Buffer.from(saved.asUint8Array()),
            mime: file.mime,
        };
    }
    finally {
        doc.destroy();
    }
}
38
/**
 * Splits a decrypted PDF into one Chunk per page. Every chunk is itself a
 * complete, valid PDF, so the per-chunk LLM agent sees a single clean page
 * with no neighbouring pages attached.
 *
 * The document is opened once just to count pages and destroyed straight
 * away; pages are then extracted one after another, in order.
 *
 * @param file The decrypted file to split.
 * @returns The chunks in page order; empty when the page count is zero or less.
 */
export async function chunkPdf(file) {
    const mupdf = await getMupdf();
    const probe = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    let pageCount;
    try {
        pageCount = probe.countPages();
    }
    finally {
        probe.destroy();
    }
    const pages = [];
    if (pageCount <= 0) {
        return pages;
    }
    let index = 0;
    while (index < pageCount) {
        // Sequential on purpose: each extraction opens its own document copy.
        pages.push(await extractPage(file, index, pageCount));
        index += 1;
    }
    return pages;
}
@@ -0,0 +1,34 @@
1
+ import type Database from "libsql";
2
/** A saved PDF password together with the filename pattern it applies to. */
export interface StoredPassword {
    /** Row identifier. */
    id: string;
    /** Regular-expression source matched against a file's basename. */
    pattern: string;
    /** The password in plaintext (already decrypted). */
    password: string;
    /** How many times this password has been recorded as used. */
    useCount: number;
    /** Timestamp of the most recent recorded use, or `null` if never used. */
    lastUsedAt: string | null;
}
/**
 * Builds a regex source from a filename. The leading prefix — everything up
 * to the first separator (underscore, hyphen, whitespace or dot) — is kept
 * and whatever follows is wildcarded, so `AcctSt_May26.pdf` and
 * `AcctSt_Jun26.pdf` yield the same pattern.
 *
 * When the prefix is shorter than 3 characters or does not begin with a
 * letter, the older strategy is used instead: the whole name is escaped and
 * each run of digits is collapsed to `\d+`. This avoids overly generic
 * patterns such as `^a.*` or `^\d+.*`.
 *
 * Examples:
 * `AcctSt_May26.pdf` → `^acctst.*`
 * `KBank-Savings-2026-01.pdf` → `^kbank.*`
 * `statement.pdf` → `^statement.*`
 * `1234567890.pdf` → `^\d+\.pdf$` (fallback)
 * `e-statement.pdf` → `^e-statement\.pdf$` (fallback — prefix too short)
 */
export declare function suggestPattern(filename: string): string;
/**
 * Returns the stored passwords whose pattern matches the basename of
 * `filePath`, with each password decrypted using `dbKey`.
 */
export declare function findCandidates(db: Database.Database, filePath: string, dbKey: string): StoredPassword[];
/**
 * Upserts a password keyed by pattern and returns the row id. An existing
 * row for the same pattern is overwritten — handy when a bank rotates the
 * password for a recurring statement series.
 */
export declare function savePassword(db: Database.Database, pattern: string, password: string, dbKey: string): string;
/** Records one use of the stored password identified by `id`. */
export declare function recordUse(db: Database.Database, id: string): void;
@@ -0,0 +1,83 @@
1
+ import { randomUUID } from "crypto";
2
+ import { basename } from "path";
3
+ import { encryptSecret, decryptSecret } from "../../db/encryption.js";
4
/** Characters with special meaning in a regular expression; each gets backslash-escaped. */
const REGEX_META = /[.*+?^${}()|[\]\\]/g;
/** Separators that end the leading filename prefix: underscore, hyphen, whitespace, dot. */
const SEPARATORS = /[_\-\s.]/;
/** Shortest prefix considered specific enough to wildcard after. */
const MIN_PREFIX_LEN = 3;
/**
 * Builds a regex source from a filename. The basename is lower-cased, then
 * the leading prefix — everything up to the first separator — is kept and
 * whatever follows is wildcarded, so `AcctSt_May26.pdf` and
 * `AcctSt_Jun26.pdf` yield the same pattern.
 *
 * When the prefix is shorter than 3 characters or does not begin with a
 * letter, the older strategy is used instead: the whole name is escaped and
 * each run of digits is collapsed to `\d+`. This avoids overly generic
 * patterns such as `^a.*` or `^\d+.*`.
 *
 * Examples:
 * `AcctSt_May26.pdf` → `^acctst.*`
 * `KBank-Savings-2026-01.pdf` → `^kbank.*`
 * `statement.pdf` → `^statement.*`
 * `1234567890.pdf` → `^\d+\.pdf$` (fallback)
 * `e-statement.pdf` → `^e-statement\.pdf$` (fallback — prefix too short)
 */
export function suggestPattern(filename) {
    const lowered = basename(filename).toLowerCase();
    const cut = lowered.search(SEPARATORS);
    const prefix = cut === -1 ? lowered : lowered.slice(0, cut);
    const prefixUsable = prefix.length >= MIN_PREFIX_LEN && /^[a-z]/.test(prefix);
    if (!prefixUsable) {
        // Fallback: literal match on the whole name with digit runs generalised.
        const literal = lowered.replace(REGEX_META, "\\$&").replace(/\d+/g, "\\d+");
        return `^${literal}$`;
    }
    return `^${prefix.replace(REGEX_META, "\\$&")}.*`;
}
34
/**
 * Looks up the stored passwords whose pattern matches the basename of
 * `filePath`.
 *
 * All rows are read ordered by use count (highest first), then most recent
 * use, then creation time; rows whose pattern does not match — or is not a
 * valid regex — are skipped. Matching rows have their password decrypted
 * with `dbKey`.
 *
 * @param db       Database handle exposing `prepare(...).all()`.
 * @param filePath Path of the file to unlock; only its basename is matched.
 * @param dbKey    Key passed to `decryptSecret`.
 * @returns Matching passwords in query order.
 */
export function findCandidates(db, filePath, dbKey) {
    const fileName = basename(filePath);
    const stored = db
        .prepare(`SELECT id, pattern, password_encrypted, use_count, last_used_at
         FROM file_passwords
         ORDER BY use_count DESC, last_used_at DESC NULLS LAST, created_at ASC`)
        .all();
    const matching = stored.filter(row => safeTest(row.pattern, fileName));
    return matching.map(row => {
        const password = decryptSecret(row.password_encrypted, dbKey);
        return {
            id: row.id,
            pattern: row.pattern,
            password,
            useCount: row.use_count,
            lastUsedAt: row.last_used_at,
        };
    });
}
52
/**
 * Case-insensitively tests `target` against the regex source `pattern`.
 * Any error — typically an invalid stored pattern — is treated as "no match"
 * rather than propagated.
 */
function safeTest(pattern, target) {
    let matched = false;
    try {
        const matcher = new RegExp(pattern, "i");
        matched = matcher.test(target);
    }
    catch {
        matched = false;
    }
    return matched;
}
60
/**
 * Upserts a password keyed by pattern and returns the row id.
 *
 * The password is encrypted with `dbKey` before it is stored. If a row with
 * the same pattern already exists its encrypted password is replaced and its
 * usage counters are reset (`use_count = 0`, `last_used_at = NULL`) — handy
 * when a bank rotates the password for a recurring statement series.
 * Otherwise a new row with an `fp:`-prefixed UUID id is inserted.
 *
 * @returns The id of the updated or newly inserted row.
 */
export function savePassword(db, pattern, password, dbKey) {
    const encrypted = encryptSecret(password, dbKey);
    const existing = db
        .prepare(`SELECT id FROM file_passwords WHERE pattern = ?`)
        .get(pattern);
    if (!existing) {
        const id = `fp:${randomUUID()}`;
        const insert = db.prepare(`INSERT INTO file_passwords (id, pattern, password_encrypted) VALUES (?, ?, ?)`);
        insert.run(id, pattern, encrypted);
        return id;
    }
    const update = db.prepare(`UPDATE file_passwords
         SET password_encrypted = ?, use_count = 0, last_used_at = NULL
         WHERE id = ?`);
    update.run(encrypted, existing.id);
    return existing.id;
}
79
+ export function recordUse(db, id) {
80
+ db.prepare(`UPDATE file_passwords
81
+ SET use_count = use_count + 1, last_used_at = datetime('now')
82
+ WHERE id = ?`).run(id);
83
+ }
@@ -0,0 +1,17 @@
1
/**
 * Small facade over the mupdf WASM library. The library is imported lazily
 * on first use, so data directories holding only plaintext PDFs never pay
 * for loading the WASM module.
 */
/** Reports whether the PDF in `bytes` requires a password to open. */
export declare function isEncrypted(bytes: Buffer): Promise<boolean>;
/** Outcome of an `unlock` attempt. */
export interface UnlockResult {
    /** `true` when the document is readable after the attempt. */
    ok: boolean;
    /** Set when `ok === true`. Plaintext (decrypted) PDF bytes ready to forward. */
    decrypted?: Buffer;
}
/**
 * Tries to open `bytes` with `password` and re-save the document without
 * encryption.
 *
 * - Wrong password or non-PDF input: `{ ok: false }`.
 * - Success: `{ ok: true, decrypted }`.
 * - Input that was never encrypted: `{ ok: true, decrypted: bytes }`, i.e.
 *   the original bytes are handed back unchanged.
 */
export declare function unlock(bytes: Buffer, password: string): Promise<UnlockResult>;
@@ -0,0 +1,50 @@
1
/**
 * Small facade over the mupdf WASM library. The library is imported lazily
 * on first use, so data directories holding only plaintext PDFs never pay
 * for loading the WASM module.
 */
/** Memoised dynamic import of the mupdf module; `null` until first requested. */
let mupdfPromise = null;
/** mupdf's authenticatePassword returns 0 on a wrong password, non-zero on success. */
const MUPDF_AUTH_FAILED = 0;
/**
 * Returns the promise for the lazily imported `mupdf` module. The import is
 * started on the first call and the same promise is handed back afterwards.
 */
function getMupdf() {
    if (mupdfPromise) {
        return mupdfPromise;
    }
    mupdfPromise = import("mupdf");
    return mupdfPromise;
}
14
/**
 * Reports whether the PDF in `bytes` requires a password to open. The
 * document is opened as `application/pdf` and always destroyed afterwards.
 */
export async function isEncrypted(bytes) {
    const mupdf = await getMupdf();
    const doc = mupdf.Document.openDocument(bytes, "application/pdf");
    let locked;
    try {
        locked = doc.needsPassword();
    }
    finally {
        doc.destroy();
    }
    return locked;
}
24
/**
 * Tries to open `bytes` with `password` and re-save the document without
 * encryption.
 *
 * - Document is not a PDF, or the password is rejected: `{ ok: false }`.
 * - Document needs no password: `{ ok: true, decrypted: bytes }`, i.e. the
 *   original bytes are handed back unchanged.
 * - Password accepted: `{ ok: true, decrypted }` with the re-saved bytes.
 *
 * The opened document is destroyed on every path.
 * NOTE(review): `openDocument` runs before the `try`; if it throws on
 * unreadable input the error propagates instead of yielding `{ ok: false }`
 * — confirm against mupdf's behaviour.
 */
export async function unlock(bytes, password) {
    const mupdf = await getMupdf();
    const doc = mupdf.Document.openDocument(bytes, "application/pdf");
    try {
        const isPdf = doc instanceof mupdf.PDFDocument;
        if (!isPdf) {
            return { ok: false };
        }
        const locked = doc.needsPassword();
        if (!locked) {
            return { ok: true, decrypted: bytes };
        }
        if (doc.authenticatePassword(password) === MUPDF_AUTH_FAILED) {
            return { ok: false };
        }
        const plain = doc.saveToBuffer("decrypt");
        const decrypted = Buffer.from(plain.asUint8Array());
        return { ok: true, decrypted };
    }
    finally {
        doc.destroy();
    }
}
@@ -0,0 +1,17 @@
1
+ import type { DocumentBlock } from "../../ai/provider.js";
2
/** A PDF read from disk together with the metadata the scan pipeline needs. */
export interface LoadedFile {
    /** Raw file contents exactly as read from disk. */
    bytes: Buffer;
    /** Hex-encoded sha256 of `bytes`. */
    hash: string;
    /** MIME type derived from the file extension. */
    mime: string;
    /** Basename of the path that was read. */
    fileName: string;
}
/**
 * Reads a local PDF, hashes it, and returns what the scan pipeline needs to
 * decide between skipping, re-scanning, or unlocking the file. The hash is
 * the sha256 of the bytes as they sit on disk — still encrypted when the PDF
 * is password-protected. The dedup contract depends on that, because it lets
 * the same file be recognised across re-scans whatever its unlock state.
 */
export declare function readPdf(path: string): LoadedFile;
/** Wraps PDF bytes in an Anthropic-compatible document content block. */
export declare function buildDocumentBlock(bytes: Buffer, fileName: string, mime?: string): DocumentBlock;
@@ -0,0 +1,36 @@
1
+ import { readFileSync, statSync } from "fs";
2
+ import { createHash } from "crypto";
3
+ import { basename, extname } from "path";
4
/** Supported file extensions (lower-case, with leading dot) mapped to MIME types. */
const MIME_BY_EXT = {
    ".pdf": "application/pdf",
};
/** Largest file accepted, in bytes (30 MiB). */
const MAX_BYTES = 30 * 1024 * 1024;
/**
 * Reads a local PDF, hashes it, and returns what the scan pipeline needs to
 * decide between skipping, re-scanning, or unlocking the file. The hash is
 * the sha256 of the bytes as they sit on disk — still encrypted when the PDF
 * is password-protected. The dedup contract depends on that, because it lets
 * the same file be recognised across re-scans whatever its unlock state.
 *
 * @param path Path of the file to read; the extension check is case-insensitive.
 * @returns The bytes, their hex sha256, the MIME type and the basename.
 * @throws Error when the extension is unsupported or the file exceeds
 *         `MAX_BYTES`; file-system errors from stat/read also propagate.
 */
export function readPdf(path) {
    const extension = extname(path).toLowerCase();
    const mime = MIME_BY_EXT[extension];
    if (!mime) {
        throw new Error(`Unsupported file extension: ${extension}. Plasalid v1 only ingests PDFs.`);
    }
    // Check the size before reading so oversized files are never loaded.
    const { size } = statSync(path);
    if (size > MAX_BYTES) {
        throw new Error(`File too large (${size} bytes). Limit is ${MAX_BYTES} bytes.`);
    }
    const bytes = readFileSync(path);
    const hasher = createHash("sha256");
    hasher.update(bytes);
    const hash = hasher.digest("hex");
    return { bytes, hash, mime, fileName: basename(path) };
}
29
/**
 * Wraps PDF bytes in an Anthropic-compatible document content block.
 *
 * @param bytes    Document bytes; embedded as base64.
 * @param fileName Used as the block's title.
 * @param mime     Media type of the document; defaults to `application/pdf`.
 */
export function buildDocumentBlock(bytes, fileName, mime = "application/pdf") {
    const data = bytes.toString("base64");
    const source = { type: "base64", media_type: mime, data };
    return { type: "document", source, title: fileName };
}
@@ -0,0 +1,60 @@
1
+ import type { StoredPassword } from "./password-store.js";
2
/**
 * Side-effect-free state machine for the unlock step of scanning one file.
 * Everything with side effects — mupdf calls, prompts, DB reads — belongs to
 * the orchestrator; only the transition rules live here, which keeps them
 * exhaustively unit-testable.
 */
/** Presumably the cap on user password prompts per file — confirm against the implementation. */
export declare const MAX_PASSWORD_ATTEMPTS = 10;
/** How a finished unlock obtained readable bytes. */
export type UnlockOutcome =
    | { kind: "plaintext" }
    | { kind: "from-store"; storedId: string }
    | { kind: "from-user"; password: string };
/** States of the machine; `done` and `failed` carry the final payload. */
export type UnlockState =
    | { kind: "init" }
    | { kind: "trying-stored"; candidates: StoredPassword[] }
    | { kind: "awaiting-user"; attempt: number }
    | { kind: "done"; decrypted: Buffer; outcome: UnlockOutcome }
    | { kind: "failed"; reason: string };
/** Events the orchestrator feeds into `transition`. */
export type UnlockEvent =
    | { kind: "INSPECTED_PLAINTEXT"; bytes: Buffer }
    | { kind: "INSPECTED_ENCRYPTED"; candidates: StoredPassword[] }
    | { kind: "STORED_UNLOCK_OK"; decrypted: Buffer; usedStoredId: string }
    | { kind: "STORED_UNLOCK_EXHAUSTED" }
    | { kind: "USER_CANCELLED" }
    | { kind: "UNLOCK_OK"; decrypted: Buffer; password: string }
    | { kind: "UNLOCK_FAIL" };
/** Reports whether `state` is terminal. */
export declare function isTerminal(state: UnlockState): boolean;
/**
 * Pure transition function. An event that is meaningless for the current
 * state makes it throw; the orchestrator never emits such a pairing, so
 * hitting the throw signals a programmer error that should fail loudly.
 */
export declare function transition(state: UnlockState, event: UnlockEvent): UnlockState;