plasalid 0.7.1 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. package/README.md +15 -15
  2. package/dist/accounts/taxonomy.d.ts +1 -1
  3. package/dist/accounts/taxonomy.js +1 -1
  4. package/dist/ai/agent.d.ts +9 -10
  5. package/dist/ai/agent.js +31 -15
  6. package/dist/ai/personas.d.ts +1 -1
  7. package/dist/ai/personas.js +57 -55
  8. package/dist/ai/prompt-sections.d.ts +4 -4
  9. package/dist/ai/prompt-sections.js +1 -1
  10. package/dist/ai/system-prompt.d.ts +2 -2
  11. package/dist/ai/system-prompt.js +5 -5
  12. package/dist/ai/tools/account-mutex.d.ts +1 -0
  13. package/dist/ai/tools/account-mutex.js +16 -0
  14. package/dist/ai/tools/clarify.d.ts +2 -0
  15. package/dist/ai/tools/clarify.js +169 -0
  16. package/dist/ai/tools/index.js +10 -18
  17. package/dist/ai/tools/ingest.d.ts +2 -2
  18. package/dist/ai/tools/ingest.js +284 -244
  19. package/dist/ai/tools/merchants.js +1 -28
  20. package/dist/ai/tools/read.js +8 -8
  21. package/dist/ai/tools/record.js +7 -40
  22. package/dist/ai/tools/resolve.js +25 -22
  23. package/dist/ai/tools/scan.js +0 -1
  24. package/dist/ai/tools/types.d.ts +14 -21
  25. package/dist/cli/commands/clarify.d.ts +5 -0
  26. package/dist/cli/commands/clarify.js +44 -0
  27. package/dist/cli/commands/record.js +1 -82
  28. package/dist/cli/commands/resolve.d.ts +5 -2
  29. package/dist/cli/commands/resolve.js +36 -5
  30. package/dist/cli/commands/revert.js +4 -2
  31. package/dist/cli/commands/rules.js +2 -2
  32. package/dist/cli/commands/scan.js +199 -128
  33. package/dist/cli/commands/status.js +6 -6
  34. package/dist/cli/index.js +8 -29
  35. package/dist/cli/ink/ScanDashboard.d.ts +49 -0
  36. package/dist/cli/ink/ScanDashboard.js +214 -0
  37. package/dist/cli/ink/scan_dashboard.d.ts +40 -25
  38. package/dist/cli/ink/scan_dashboard.js +139 -44
  39. package/dist/cli/setup.js +1 -1
  40. package/dist/cli/ux.js +1 -1
  41. package/dist/db/queries/account-balance.d.ts +1 -1
  42. package/dist/db/queries/questions.d.ts +62 -0
  43. package/dist/db/queries/questions.js +110 -0
  44. package/dist/db/queries/transactions.d.ts +1 -1
  45. package/dist/db/queries/unknowns.d.ts +17 -15
  46. package/dist/db/queries/unknowns.js +35 -39
  47. package/dist/db/schema.js +6 -28
  48. package/dist/scanner/audit/auditor.d.ts +31 -0
  49. package/dist/scanner/audit/auditor.js +72 -0
  50. package/dist/scanner/audit/engine.d.ts +10 -0
  51. package/dist/scanner/audit/engine.js +98 -0
  52. package/dist/scanner/audit/eventBus.d.ts +60 -0
  53. package/dist/scanner/audit/eventBus.js +35 -0
  54. package/dist/scanner/audit/passes/index.d.ts +11 -0
  55. package/dist/scanner/audit/passes/index.js +9 -0
  56. package/dist/scanner/audit/passes/types.d.ts +23 -0
  57. package/dist/scanner/audit/passes/types.js +1 -0
  58. package/dist/scanner/audit/types.d.ts +27 -0
  59. package/dist/scanner/audit/types.js +1 -0
  60. package/dist/scanner/auditor.d.ts +51 -0
  61. package/dist/scanner/auditor.js +80 -0
  62. package/dist/scanner/buffer/engine.d.ts +9 -0
  63. package/dist/scanner/buffer/engine.js +110 -0
  64. package/dist/scanner/buffer/sharedBuffer.d.ts +78 -0
  65. package/dist/scanner/buffer/sharedBuffer.js +130 -0
  66. package/dist/scanner/buffer/types.d.ts +67 -0
  67. package/dist/scanner/buffer/types.js +1 -0
  68. package/dist/scanner/buffer.d.ts +45 -38
  69. package/dist/scanner/buffer.js +93 -61
  70. package/dist/scanner/bus/engine.d.ts +11 -0
  71. package/dist/scanner/bus/engine.js +42 -0
  72. package/dist/scanner/bus/types.d.ts +53 -0
  73. package/dist/scanner/bus/types.js +1 -0
  74. package/dist/scanner/bus.d.ts +38 -0
  75. package/dist/scanner/bus.js +37 -0
  76. package/dist/scanner/chunk-worker.d.ts +19 -0
  77. package/dist/scanner/chunk-worker.js +67 -0
  78. package/dist/scanner/chunkWorker.d.ts +20 -0
  79. package/dist/scanner/chunkWorker.js +59 -0
  80. package/dist/scanner/chunker/chunker.d.ts +7 -0
  81. package/dist/scanner/chunker/chunker.js +60 -0
  82. package/dist/scanner/chunker.d.ts +7 -0
  83. package/dist/scanner/chunker.js +60 -0
  84. package/dist/scanner/clarifier-memory.d.ts +8 -0
  85. package/dist/scanner/clarifier-memory.js +24 -0
  86. package/dist/scanner/clarifier.d.ts +39 -0
  87. package/dist/scanner/clarifier.js +196 -0
  88. package/dist/scanner/converge.d.ts +29 -0
  89. package/dist/scanner/converge.js +15 -0
  90. package/dist/scanner/decrypt.d.ts +10 -0
  91. package/dist/scanner/decrypt.js +80 -0
  92. package/dist/scanner/engine/scanEngine.d.ts +24 -0
  93. package/dist/scanner/engine/scanEngine.js +87 -0
  94. package/dist/scanner/engine/types.d.ts +90 -0
  95. package/dist/scanner/engine/types.js +1 -0
  96. package/dist/scanner/engine.d.ts +90 -0
  97. package/dist/scanner/engine.js +84 -0
  98. package/dist/scanner/file-worker.d.ts +33 -0
  99. package/dist/scanner/file-worker.js +28 -0
  100. package/dist/scanner/fileWorker.d.ts +33 -0
  101. package/dist/scanner/fileWorker.js +22 -0
  102. package/dist/scanner/hooks/types.d.ts +25 -0
  103. package/dist/scanner/hooks/types.js +1 -0
  104. package/dist/scanner/hooks.d.ts +23 -0
  105. package/dist/scanner/hooks.js +1 -0
  106. package/dist/scanner/parse.d.ts +10 -0
  107. package/dist/scanner/parse.js +47 -0
  108. package/dist/scanner/passes/index.d.ts +8 -0
  109. package/dist/scanner/passes/index.js +6 -0
  110. package/dist/scanner/passes/types.d.ts +22 -0
  111. package/dist/scanner/passes/types.js +1 -0
  112. package/dist/scanner/pdf/chunker.d.ts +7 -0
  113. package/dist/scanner/pdf/chunker.js +60 -0
  114. package/dist/scanner/pdf/password-store.d.ts +34 -0
  115. package/dist/scanner/pdf/password-store.js +83 -0
  116. package/dist/scanner/pdf/pdf-unlock.d.ts +17 -0
  117. package/dist/scanner/pdf/pdf-unlock.js +50 -0
  118. package/dist/scanner/pdf/pdf.d.ts +17 -0
  119. package/dist/scanner/pdf/pdf.js +36 -0
  120. package/dist/scanner/pdf/state-machine.d.ts +60 -0
  121. package/dist/scanner/pdf/state-machine.js +64 -0
  122. package/dist/scanner/pdf/unlock.d.ts +22 -0
  123. package/dist/scanner/pdf/unlock.js +121 -0
  124. package/dist/scanner/phase-decrypt.d.ts +10 -0
  125. package/dist/scanner/phase-decrypt.js +80 -0
  126. package/dist/scanner/phase-parse.d.ts +10 -0
  127. package/dist/scanner/phase-parse.js +46 -0
  128. package/dist/scanner/phases/chunk.d.ts +8 -0
  129. package/dist/scanner/phases/chunk.js +13 -0
  130. package/dist/scanner/phases/commit.d.ts +12 -0
  131. package/dist/scanner/phases/commit.js +140 -0
  132. package/dist/scanner/phases/decrypt.d.ts +10 -0
  133. package/dist/scanner/phases/decrypt.js +80 -0
  134. package/dist/scanner/phases/parse.d.ts +10 -0
  135. package/dist/scanner/phases/parse.js +46 -0
  136. package/dist/scanner/phases/resolve.d.ts +10 -0
  137. package/dist/scanner/phases/resolve.js +17 -0
  138. package/dist/scanner/phases/review.d.ts +10 -0
  139. package/dist/scanner/phases/review.js +12 -0
  140. package/dist/scanner/progress.d.ts +14 -0
  141. package/dist/scanner/progress.js +21 -0
  142. package/dist/scanner/resolver-memory.d.ts +8 -0
  143. package/dist/scanner/resolver-memory.js +24 -0
  144. package/dist/scanner/resolver.d.ts +39 -0
  145. package/dist/scanner/resolver.js +196 -0
  146. package/dist/scanner/result.d.ts +17 -0
  147. package/dist/scanner/result.js +19 -0
  148. package/dist/scanner/run-passes.d.ts +30 -0
  149. package/dist/scanner/run-passes.js +15 -0
  150. package/dist/scanner/unlock.js +1 -1
  151. package/dist/scanner/worker.d.ts +19 -0
  152. package/dist/scanner/worker.js +67 -0
  153. package/dist/scanner/workers/chunkWorker.d.ts +20 -0
  154. package/dist/scanner/workers/chunkWorker.js +65 -0
  155. package/dist/scanner/workers/fileWorker.d.ts +32 -0
  156. package/dist/scanner/workers/fileWorker.js +22 -0
  157. package/package.json +1 -1
@@ -0,0 +1,90 @@
1
+ import type Database from "libsql";
2
+ import type { ScannedFile } from "../walker.js";
3
+ import type { ScanBuffer } from "../buffer/types.js";
4
+ import type { Bus } from "../bus/types.js";
5
+ export type MaybePromise<T> = T | Promise<T>;
6
+ export interface Chunk {
7
+ readonly chunkId: string;
8
+ readonly fileId: string;
9
+ readonly fileName: string;
10
+ readonly relPath: string;
11
+ readonly pageNumber: number;
12
+ readonly totalPages: number;
13
+ readonly bytes: Buffer;
14
+ readonly mime: string;
15
+ }
16
+ export interface DecryptedFile {
17
+ readonly path: string;
18
+ readonly fileName: string;
19
+ readonly relPath: string;
20
+ readonly hash: string;
21
+ readonly mime: string;
22
+ readonly decryptedBytes: Buffer;
23
+ readonly replacesPriorScannedFileId?: string;
24
+ }
25
+ export interface SkippedFile {
26
+ readonly file: ScannedFile;
27
+ readonly existingScannedFileId: string;
28
+ }
29
+ export interface FailedFile {
30
+ readonly file: ScannedFile;
31
+ readonly error: string;
32
+ }
33
+ export interface PhaseError {
34
+ readonly phase: PhaseName;
35
+ readonly target?: string;
36
+ readonly error: unknown;
37
+ }
38
+ export type PhaseName = "decrypt" | "chunk" | "parse" | "review" | "commit";
39
+ export type ReviewDecision = "commit" | "abort";
40
+ export interface CommitOutcome {
41
+ readonly transactions: number;
42
+ readonly accounts: number;
43
+ readonly merchants: number;
44
+ readonly unknowns: number;
45
+ readonly scannedFileIds: readonly string[];
46
+ }
47
+ export interface RunScanOptions {
48
+ regex?: string;
49
+ force?: boolean;
50
+ interactive?: boolean;
51
+ /** Max FileWorkers running concurrently. Default 5, hard cap 8. */
52
+ maxFileWorkers?: number;
53
+ /** Max ChunkWorkers per FileWorker. Default 5, hard cap 8. */
54
+ maxChunkWorkersPerFile?: number;
55
+ review?: boolean;
56
+ autoCommit?: boolean;
57
+ /**
58
+ * Override the phase chain. Default = the five built-in phases. Extending
59
+ * tests / alternate flows (dry-run, OCR-only) inject their own without
60
+ * editing the engine. Open for extension; closed for modification.
61
+ */
62
+ phases?: ReadonlyArray<{
63
+ name: PhaseName;
64
+ phase: Phase;
65
+ }>;
66
+ }
67
+ /**
68
+ * The state object threaded through every phase. Phases mutate it in place;
69
+ * hooks read it. The buffer + bus are interfaces — the engine owns the
70
+ * factory instances and injects them here.
71
+ */
72
+ export interface ScanState {
73
+ readonly scanId: string;
74
+ readonly startedAt: number;
75
+ readonly options: RunScanOptions;
76
+ readonly buffer: ScanBuffer;
77
+ readonly bus: Bus;
78
+ files: ScannedFile[];
79
+ decrypted: DecryptedFile[];
80
+ skipped: SkippedFile[];
81
+ failed: FailedFile[];
82
+ chunks: Chunk[];
83
+ review: ReviewDecision | null;
84
+ committed: CommitOutcome | null;
85
+ errors: PhaseError[];
86
+ auditApplied: Record<string, number>;
87
+ }
88
+ import type { ScanHooks } from "../hooks/types.js";
89
+ export type Phase = (db: Database.Database, state: ScanState, hooks: ScanHooks) => Promise<void>;
90
+ export type { ScanHooks } from "../hooks/types.js";
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,90 @@
1
+ import type Database from "libsql";
2
+ import type { ScannedFile } from "./walker.js";
3
+ import type { ScanHooks } from "./hooks.js";
4
+ import type { ScanProgress } from "./progress.js";
5
+ import type { ClarifySummary } from "./clarifier.js";
6
+ export interface Chunk { // one page of a decrypted file, as produced by pdf/chunker's chunkPdf
7
+ readonly chunkId: string; // built by the chunker as "<file path>#p<pageNumber>"
8
+ readonly fileId: string; // the owning DecryptedFile.path; the parse phase groups chunks by this value
9
+ readonly fileName: string; // copied from the owning DecryptedFile
10
+ readonly relPath: string; // copied from the owning DecryptedFile
11
+ readonly pageNumber: number; // 1-based page index within the source file
12
+ readonly totalPages: number; // page count of the source file
13
+ readonly bytes: Buffer; // the source document re-saved with every other page deleted
14
+ readonly mime: string; // copied from the owning DecryptedFile
15
+ }
16
+ export interface DecryptedFile {
17
+ readonly path: string;
18
+ readonly fileName: string;
19
+ readonly relPath: string;
20
+ readonly hash: string;
21
+ readonly mime: string;
22
+ readonly decryptedBytes: Buffer;
23
+ readonly replacesPriorScannedFileId?: string;
24
+ /** scanned_files.id assigned in decryptPhase so scan-worker tools can stamp source_file_id. */
25
+ scannedFileId?: string;
26
+ }
27
+ export interface SkippedFile {
28
+ readonly file: ScannedFile;
29
+ readonly existingScannedFileId: string;
30
+ }
31
+ export interface FailedFile {
32
+ readonly file: ScannedFile;
33
+ readonly error: string;
34
+ }
35
+ export interface PhaseError {
36
+ readonly phase: PhaseName;
37
+ readonly target?: string;
38
+ readonly error: unknown;
39
+ }
40
+ export type PhaseName = "decrypt" | "chunk" | "parse" | "clarify";
41
+ export interface RunScanOptions {
42
+ regex?: string;
43
+ force?: boolean;
44
+ interactive?: boolean; // the clarify phase treats an omitted value as true
45
+ /** Max files processed concurrently. Default 5, hard cap 8; values below 1 are raised to 1. */
46
+ maxFileWorkers?: number;
47
+ /** Max scan workers per file (one per chunk). Default 5, hard cap 8; values below 1 are raised to 1. */
48
+ maxScanWorkersPerFile?: number;
49
+ /**
50
+ * Override the phase chain. Default = the four built-ins (decrypt, chunk,
51
+ * parse, clarify). Phases run in array order and the chain stops at the first
+ * phase that throws. Tests and alternate flows (dry-run, OCR-only) inject
+ * their own without editing this file.
52
+ */
53
+ phases?: ReadonlyArray<{
54
+ name: PhaseName;
55
+ phase: Phase;
56
+ }>;
57
+ }
58
+ /**
59
+ * The state object threaded through every phase. Phases mutate it in place;
60
+ * hooks read it. `progress` is the single-consumer event sink scan-worker
61
+ * tools emit into; the CLI subscribes to drive the dashboard.
62
+ */
63
+ export interface ScanState {
64
+ readonly scanId: string;
65
+ readonly startedAt: number;
66
+ readonly options: RunScanOptions;
67
+ readonly progress: ScanProgress;
68
+ files: ScannedFile[];
69
+ decrypted: DecryptedFile[];
70
+ skipped: SkippedFile[];
71
+ failed: FailedFile[];
72
+ chunks: Chunk[];
73
+ clarifySummary: ClarifySummary | null;
74
+ errors: PhaseError[];
75
+ }
76
+ export type Phase = (db: Database.Database, state: ScanState, hooks: ScanHooks) => Promise<void>;
77
+ export interface ScanResult {
78
+ readonly scanId: string;
79
+ readonly state: ScanState;
80
+ }
81
+ export declare const DEFAULT_PHASES: readonly {
82
+ name: PhaseName;
83
+ phase: Phase;
84
+ }[];
85
+ /**
86
+ * Composition root. Builds the progress sink once per scan run, threads it
87
+ * through ScanState, then runs the phase chain. Nothing survives between
88
+ * scans.
89
+ */
90
+ export declare function runScan(db: Database.Database, opts?: RunScanOptions, hooks?: ScanHooks): Promise<ScanResult>;
@@ -0,0 +1,84 @@
1
+ import { randomUUID } from "crypto";
2
+ import { createProgress } from "./progress.js";
3
+ import { decryptPhase } from "./decrypt.js";
4
+ import { parsePhase } from "./parse.js";
5
+ import { chunkPdf } from "./pdf/chunker.js";
6
+ import { runClarify } from "./clarifier.js";
7
+ import { errorMessage } from "./result.js";
8
/**
 * Chunk phase: split every decrypted file into per-page chunks and append
 * them to `state.chunks`, one file at a time in `state.decrypted` order.
 *
 * `beforeChunk` / `afterChunk` go through `fire` so that, per the ScanHooks
 * contract, a hook that throws is logged and the phase continues instead of
 * aborting the whole phase chain. The hooks are bound to `hooks` so their
 * `this` is the same as with a direct `hooks.beforeChunk(...)` call.
 *
 * @param _db   Unused; present so the function matches the Phase signature.
 * @param state Scan state; reads `decrypted`, appends to `chunks`.
 * @param hooks Lifecycle hooks.
 */
const chunkPhase = async (_db, state, hooks) => {
    await fire(hooks.beforeChunk?.bind(hooks), state);
    for (const file of state.decrypted) {
        const pages = await chunkPdf(file);
        state.chunks.push(...pages);
    }
    await fire(hooks.afterChunk?.bind(hooks), state);
};
14
/**
 * Clarify phase: run the clarifier for this scan and store its summary on
 * `state.clarifySummary`. `interactive` defaults to true when the option is
 * omitted.
 *
 * `beforeClarify` / `afterClarify` go through `fire` so that, per the
 * ScanHooks contract, a hook that throws is logged and the phase continues
 * instead of aborting the phase chain. A failure inside `runClarify` itself
 * still propagates to the caller.
 *
 * @param db    Database handle forwarded to the clarifier.
 * @param state Scan state; reads `scanId` and `options`, writes `clarifySummary`.
 * @param hooks Lifecycle hooks.
 */
const clarifyPhase = async (db, state, hooks) => {
    await fire(hooks.beforeClarify?.bind(hooks), state);
    const summary = await runClarify({
        db,
        scanId: state.scanId,
        interactive: state.options.interactive ?? true,
    });
    state.clarifySummary = summary;
    await fire(hooks.afterClarify?.bind(hooks), state, summary);
};
24
+ export const DEFAULT_PHASES = [ // built-in chain; runScan uses it when opts.phases is not supplied
25
+ { name: "decrypt", phase: decryptPhase }, // entries run in array order; the chain stops at the first phase that throws
26
+ { name: "chunk", phase: chunkPhase },
27
+ { name: "parse", phase: parsePhase },
28
+ { name: "clarify", phase: clarifyPhase }, // fills state.clarifySummary
29
+ ];
30
/**
 * Entry point for one scan run. Creates a fresh scan id and progress sink,
 * assembles the mutable ScanState, then fires `onStart`, runs the phase
 * chain (`opts.phases`, or DEFAULT_PHASES when omitted) and fires
 * `onFinish`. `onFinish` fires even when a phase failed and the chain
 * stopped early; failures are reported through `state.errors`.
 *
 * @param db    Database handle handed to every phase.
 * @param opts  Scan options; stored on the state as `options`.
 * @param hooks Lifecycle hooks.
 * @returns The scan id together with the final state.
 */
export async function runScan(db, opts = {}, hooks = {}) {
    const runId = `sc:${randomUUID()}`;
    const sink = createProgress();
    const state = {
        scanId: runId,
        startedAt: Date.now(),
        options: opts,
        progress: sink,
        files: [],
        decrypted: [],
        skipped: [],
        failed: [],
        chunks: [],
        clarifySummary: null,
        errors: [],
    };
    await fire(hooks.onStart, state);
    await runPhaseChain(db, state, hooks, opts.phases ?? DEFAULT_PHASES);
    await fire(hooks.onFinish, state);
    return { scanId: runId, state };
}
57
/**
 * Run the phases one after another, in array order, and stop at the first
 * phase that `tryPhase` reports as failed.
 */
async function runPhaseChain(db, state, hooks, phases) {
    for (const step of phases) {
        const failed = await tryPhase(db, state, hooks, step.name, step.phase);
        if (failed) {
            break;
        }
    }
}
64
/**
 * Run a single phase and turn a throw into a recorded failure.
 *
 * @returns `false` when the phase completed, `true` when it threw. In the
 *          latter case the error is appended to `state.errors` and reported
 *          through the `onError` hook.
 */
async function tryPhase(db, state, hooks, name, phase) {
    let threw = false;
    let failure;
    try {
        await phase(db, state, hooks);
    }
    catch (err) {
        threw = true;
        failure = err;
    }
    if (!threw) {
        return false;
    }
    state.errors.push({ phase: name, error: failure });
    await fire(hooks.onError, failure, name, state);
    return true;
}
75
/**
 * Best-effort hook invocation: call `fn` with `args` when it is set, await
 * its result, and log (rather than propagate) anything it throws.
 */
async function fire(fn, ...args) {
    if (fn) {
        try {
            await fn(...args);
        }
        catch (err) {
            console.error(`[scan-engine] ${errorMessage(err)}`);
        }
    }
}
@@ -0,0 +1,33 @@
1
+ import type Database from "libsql";
2
+ import { type ChunkWorkerDeps, type ChunkWorkerResult } from "./chunk-worker.js";
3
+ import type { Chunk } from "./engine.js";
4
+ import type { ScanHooks } from "./hooks.js";
5
+ import type { ScanProgress } from "./progress.js";
6
+ /**
7
+ * Pluggable chunk-parser strategy. Default is the LLM-driven runChunkWorker;
8
+ * tests and alternate flows (OCR-only, mock, dry-run) inject their own.
9
+ */
10
+ export type ChunkWorkerFn = (deps: ChunkWorkerDeps, hooks: ScanHooks) => Promise<ChunkWorkerResult>;
11
+ export interface FileWorkerDeps {
12
+ readonly db: Database.Database;
13
+ readonly scanId: string;
14
+ readonly scannedFileId: string | undefined;
15
+ readonly progress: ScanProgress;
16
+ readonly fileId: string;
17
+ readonly chunks: readonly Chunk[];
18
+ readonly maxChunkWorkers: number;
19
+ /** Optional override; defaults to the LLM-backed runChunkWorker. */
20
+ readonly chunkWorkerFn?: ChunkWorkerFn;
21
+ }
22
+ export interface FileWorkerResult {
23
+ readonly workerId: string;
24
+ readonly fileId: string;
25
+ readonly ok: number;
26
+ readonly failed: number;
27
+ }
28
+ /**
29
+ * Process every chunk of one file in parallel up to `maxChunkWorkers`.
30
+ * Chunk-worker tools write transactions + unknowns directly to the DB,
31
+ * scoped to `scanId`; per-row ticks fan out via the shared progress sink.
32
+ */
33
+ export declare function runFileWorker(deps: FileWorkerDeps, hooks: ScanHooks): Promise<FileWorkerResult>;
@@ -0,0 +1,28 @@
1
+ import { randomUUID } from "crypto";
2
+ import { runWithConcurrency } from "./concurrency.js";
3
+ import { runChunkWorker } from "./chunk-worker.js";
4
/**
 * Run one chunk worker per chunk of a single file, at most
 * `deps.maxChunkWorkers` at a time, and tally the outcomes.
 *
 * Each worker receives `db`, `scanId`, `scannedFileId`, the shared progress
 * sink and its chunk. A chunk counts as ok only when its task settled
 * successfully AND the worker's own result reports `ok`; everything else
 * counts as failed.
 *
 * @param deps  File-level dependencies; `chunkWorkerFn` overrides the default runChunkWorker.
 * @param hooks Lifecycle hooks forwarded to every chunk worker.
 * @returns A fresh worker id, the file id, and the ok/failed counts.
 */
export async function runFileWorker(deps, hooks) {
    const workerId = `fw:${randomUUID()}`;
    const parseChunk = deps.chunkWorkerFn ?? runChunkWorker;
    const jobs = [];
    for (const chunk of deps.chunks) {
        jobs.push(() => parseChunk({
            db: deps.db,
            scanId: deps.scanId,
            scannedFileId: deps.scannedFileId,
            progress: deps.progress,
            chunk,
        }, hooks));
    }
    const outcomes = await runWithConcurrency(jobs, deps.maxChunkWorkers);
    const ok = outcomes.filter(outcome => outcome.ok && outcome.value.ok).length;
    return { workerId, fileId: deps.fileId, ok, failed: outcomes.length - ok };
}
@@ -0,0 +1,33 @@
1
+ import type Database from "libsql";
2
+ import { type ChunkWorkerDeps, type ChunkWorkerResult } from "./chunkWorker.js";
3
+ import type { ScanBuffer } from "./buffer.js";
4
+ import type { Chunk } from "./engine.js";
5
+ import type { ScanHooks } from "./hooks.js";
6
+ /**
7
+ * Pluggable chunk-parser strategy. Default is the LLM-driven runChunkWorker;
8
+ * tests and alternate flows (OCR-only, mock, dry-run) inject their own
9
+ * without modifying the file worker. Open for extension; closed for
10
+ * modification.
11
+ */
12
+ export type ChunkWorkerFn = (deps: ChunkWorkerDeps, hooks: ScanHooks) => Promise<ChunkWorkerResult>;
13
+ export interface FileWorkerDeps {
14
+ readonly db: Database.Database;
15
+ readonly buffer: ScanBuffer;
16
+ readonly fileId: string;
17
+ readonly chunks: readonly Chunk[];
18
+ readonly maxChunkWorkers: number;
19
+ /** Optional override; defaults to the LLM-backed runChunkWorker. */
20
+ readonly chunkWorkerFn?: ChunkWorkerFn;
21
+ }
22
+ export interface FileWorkerResult {
23
+ readonly workerId: string;
24
+ readonly fileId: string;
25
+ readonly ok: number;
26
+ readonly failed: number;
27
+ }
28
+ /**
29
+ * Process every chunk of one file in parallel up to `maxChunkWorkers`. The
30
+ * shared buffer + bus are dependency-injected; this function has no global
31
+ * state and never reaches outside its args.
32
+ */
33
+ export declare function runFileWorker(deps: FileWorkerDeps, hooks: ScanHooks): Promise<FileWorkerResult>;
@@ -0,0 +1,22 @@
1
+ import { randomUUID } from "crypto";
2
+ import { runWithConcurrency } from "./concurrency.js";
3
+ import { runChunkWorker } from "./chunkWorker.js";
4
/**
 * Run one chunk worker per chunk of a single file, at most
 * `deps.maxChunkWorkers` at a time, and tally the outcomes. Everything the
 * workers need (`db`, `buffer`, the chunk) is passed in through `deps`.
 *
 * A chunk counts as ok only when its task settled successfully AND the
 * worker's own result reports `ok`; everything else counts as failed.
 *
 * @param deps  File-level dependencies; `chunkWorkerFn` overrides the default runChunkWorker.
 * @param hooks Lifecycle hooks forwarded to every chunk worker.
 * @returns A fresh worker id, the file id, and the ok/failed counts.
 */
export async function runFileWorker(deps, hooks) {
    const workerId = `fw:${randomUUID()}`;
    const parseChunk = deps.chunkWorkerFn ?? runChunkWorker;
    const jobs = [];
    for (const chunk of deps.chunks) {
        jobs.push(() => parseChunk({ db: deps.db, buffer: deps.buffer, chunk }, hooks));
    }
    const outcomes = await runWithConcurrency(jobs, deps.maxChunkWorkers);
    const ok = outcomes.filter(outcome => outcome.ok && outcome.value.ok).length;
    return { workerId, fileId: deps.fileId, ok, failed: outcomes.length - ok };
}
@@ -0,0 +1,25 @@
1
+ import type { Chunk, ScanState, CommitOutcome, PhaseName } from "../engine/types.js";
2
+ import type { BufferSnapshot } from "../buffer/types.js";
3
+ export type MaybePromise<T> = T | Promise<T>;
4
+ /**
5
+ * Lifecycle hooks the scanner engine fires at phase edges. CLI registers
6
+ * spinner/Ink hooks; tests register assertions. Every hook is optional and
7
+ * best-effort — a hook that throws gets logged and the phase continues.
8
+ */
9
+ export interface ScanHooks {
10
+ onStart?(s: Readonly<ScanState>): MaybePromise<void>;
11
+ beforeDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
12
+ afterDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
13
+ beforeChunk?(s: Readonly<ScanState>): MaybePromise<void>;
14
+ afterChunk?(s: Readonly<ScanState>): MaybePromise<void>;
15
+ beforeParse?(s: Readonly<ScanState>): MaybePromise<void>;
16
+ onWorkerStart?(workerId: string, chunk: Chunk): void;
17
+ onWorkerEnd?(workerId: string, chunk: Chunk, ok: boolean): void;
18
+ afterParse?(s: Readonly<ScanState>): MaybePromise<void>;
19
+ beforeReview?(s: Readonly<ScanState>, snapshot: BufferSnapshot): MaybePromise<void>;
20
+ afterReview?(s: Readonly<ScanState>): MaybePromise<void>;
21
+ beforeCommit?(s: Readonly<ScanState>): MaybePromise<void>;
22
+ afterCommit?(s: Readonly<ScanState>, outcome: CommitOutcome): MaybePromise<void>;
23
+ onError?(err: unknown, phase: PhaseName, s: Readonly<ScanState>): MaybePromise<void>;
24
+ onFinish?(s: Readonly<ScanState>): MaybePromise<void>;
25
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,23 @@
1
+ import type { Chunk, ScanState, PhaseName } from "./engine.js";
2
+ import type { ClarifySummary } from "./clarifier.js";
3
+ export type MaybePromise<T> = T | Promise<T>;
4
+ /**
5
+ * Lifecycle hooks the engine fires at phase edges. CLI registers spinner/Ink
6
+ * hooks; tests register assertions. Every hook is optional and best-effort —
7
+ * a hook that throws gets logged and the phase continues.
8
+ */
9
+ export interface ScanHooks {
10
+ onStart?(s: Readonly<ScanState>): MaybePromise<void>;
11
+ beforeDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
12
+ afterDecrypt?(s: Readonly<ScanState>): MaybePromise<void>;
13
+ beforeChunk?(s: Readonly<ScanState>): MaybePromise<void>;
14
+ afterChunk?(s: Readonly<ScanState>): MaybePromise<void>;
15
+ beforeParse?(s: Readonly<ScanState>): MaybePromise<void>;
16
+ onWorkerStart?(workerId: string, chunk: Chunk): void;
17
+ onWorkerEnd?(workerId: string, chunk: Chunk, ok: boolean): void;
18
+ afterParse?(s: Readonly<ScanState>): MaybePromise<void>;
19
+ beforeClarify?(s: Readonly<ScanState>): MaybePromise<void>;
20
+ afterClarify?(s: Readonly<ScanState>, summary: ClarifySummary): MaybePromise<void>;
21
+ onError?(err: unknown, phase: PhaseName, s: Readonly<ScanState>): MaybePromise<void>;
22
+ onFinish?(s: Readonly<ScanState>): MaybePromise<void>;
23
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,10 @@
1
+ import type Database from "libsql";
2
+ import type { ScanState } from "./engine.js";
3
+ import type { ScanHooks } from "./hooks.js";
4
+ /**
5
+ * Phase 3 — two-tier fan-out: up to `maxFile` files in parallel, each file
6
+ * processing up to `maxChunk` chunks in parallel. Chunk-worker tools write
7
+ * transactions and questions directly to the DB (scoped to `scanId`) and tick
8
+ * the shared progress sink.
9
+ */
10
+ export declare function parsePhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
@@ -0,0 +1,47 @@
1
+ import { runWithConcurrency } from "./concurrency.js";
2
+ import { runScanWorker } from "./worker.js";
3
+ import { errorMessage } from "./result.js";
4
/** Files processed concurrently when `maxFileWorkers` is not set. */
const DEFAULT_MAX_FILE_WORKERS = 5;
/** Scan workers per file when `maxScanWorkersPerFile` is not set. */
const DEFAULT_MAX_SCAN_WORKERS_PER_FILE = 5;
/** Upper bound applied to both concurrency options. */
const HARD_CAP = 8;
/**
 * Resolve a concurrency option to a whole number in [1, HARD_CAP].
 *
 * `null`/`undefined` select `fallback`. NaN (e.g. an unparsable CLI value)
 * also selects `fallback` — previously it flowed through Math.min/Math.max
 * and produced a NaN worker count. Fractional values are truncated so the
 * result is always a usable integer limit.
 *
 * @param n        Requested limit, possibly missing or invalid.
 * @param fallback Limit to use when `n` is missing or NaN.
 * @returns The limit clamped to [1, HARD_CAP].
 */
const clamp = (n, fallback) => {
    const requested = Number(n ?? fallback);
    const usable = Number.isNaN(requested) ? fallback : Math.trunc(requested);
    return Math.min(HARD_CAP, Math.max(1, usable));
};
8
/**
 * Parse phase: two-tier fan-out. Up to `maxFile` files run in parallel and
 * each file runs up to `maxChunk` scan workers in parallel (one per chunk).
 * Every worker receives `db`, `scanId`, the file's `scannedFileId`, the
 * shared progress sink and its chunk.
 *
 * Failures are recorded in `state.errors` at two levels: a file task that
 * fails is recorded against the file path, and a chunk task that fails is
 * recorded against its chunk id. (Each file task resolves to the settled
 * results of its chunk tasks, so chunk failures must be read from that inner
 * array — they never surface as a failed file task.)
 *
 * `beforeParse` / `afterParse` are best-effort, per the ScanHooks contract:
 * a hook that throws is logged and the phase continues.
 *
 * @param db    Database handle passed to workers and used to stamp `scanned_files`.
 * @param state Scan state; reads `decrypted`, `chunks`, `options`, `scanId`, `progress`; appends to `errors`.
 * @param hooks Lifecycle hooks; also forwarded to every scan worker.
 */
export async function parsePhase(db, state, hooks) {
    // Invoke a lifecycle hook with `this` bound to `hooks`; log instead of throwing.
    const fireHook = async (fn) => {
        if (!fn)
            return;
        try {
            await fn.call(hooks, state);
        }
        catch (err) {
            console.error(`[scan-engine] ${errorMessage(err)}`);
        }
    };
    await fireHook(hooks.beforeParse);
    const maxFile = clamp(state.options.maxFileWorkers, DEFAULT_MAX_FILE_WORKERS);
    const maxChunk = clamp(state.options.maxScanWorkersPerFile, DEFAULT_MAX_SCAN_WORKERS_PER_FILE);
    // Group chunks by owning file in one pass instead of re-filtering per file.
    const chunksByFile = new Map();
    for (const chunk of state.chunks) {
        const bucket = chunksByFile.get(chunk.fileId);
        if (bucket)
            bucket.push(chunk);
        else
            chunksByFile.set(chunk.fileId, [chunk]);
    }
    const fileGroups = state.decrypted
        .map(file => ({
        fileId: file.path,
        scannedFileId: file.scannedFileId,
        chunks: chunksByFile.get(file.path) ?? [],
    }))
        .filter(g => g.chunks.length > 0);
    const fileTasks = fileGroups.map(group => () => {
        const chunkTasks = group.chunks.map(chunk => () => runScanWorker({
            db,
            scanId: state.scanId,
            scannedFileId: group.scannedFileId,
            progress: state.progress,
            chunk,
        }, hooks));
        return runWithConcurrency(chunkTasks, maxChunk);
    });
    const settled = await runWithConcurrency(fileTasks, maxFile);
    settled.forEach((fileResult, i) => {
        const group = fileGroups[i];
        if (!fileResult.ok) {
            state.errors.push({ phase: "parse", target: group.fileId, error: errorMessage(fileResult.error) });
            return;
        }
        // Results are assumed to line up with the task order, as the file-level lookup above already does.
        fileResult.value.forEach((chunkResult, j) => {
            const target = group.chunks[j].chunkId;
            if (!chunkResult.ok) {
                state.errors.push({ phase: "parse", target, error: errorMessage(chunkResult.error) });
            }
            else if (chunkResult.value?.ok === false) {
                // NOTE(review): assumes the scan worker result carries an `ok` flag, as the
                // file-worker tally (`r.ok && r.value.ok`) does — confirm against worker.js.
                state.errors.push({ phase: "parse", target, error: "scan worker reported failure" });
            }
        });
    });
    // NOTE(review): every file with a scannedFileId is stamped 'scanned', including
    // files whose chunks failed above — confirm that is the intended status.
    let markScanned = null;
    for (const file of state.decrypted) {
        if (!file.scannedFileId)
            continue;
        // Prepare lazily so nothing touches the DB when no file carries an id.
        if (!markScanned)
            markScanned = db.prepare(`UPDATE scanned_files SET status = 'scanned', scanned_at = datetime('now') WHERE id = ?`);
        markScanned.run(file.scannedFileId);
    }
    await fireHook(hooks.afterParse);
}
@@ -0,0 +1,8 @@
1
+ import type { AuditPass } from "./types.js";
2
+ /**
3
+ * Audit-pass registry. The audit engine indexes these by their declared
4
+ * `kinds` at startup and dispatches matching passes per buffer event.
5
+ * Append to the array to extend; do not edit the engine.
6
+ */
7
+ export declare const AUDIT_PASSES: readonly AuditPass[];
8
+ export type { AuditPass };
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Audit-pass registry. The audit engine indexes these by their declared
3
+ * `kinds` at startup and dispatches matching passes per buffer event.
4
+ * Append to the array to extend; do not edit the engine.
5
+ */
6
+ export const AUDIT_PASSES = [];
@@ -0,0 +1,22 @@
1
+ import type Database from "libsql";
2
+ import type { BufferEvent, EventKind } from "../bus.js";
3
+ import type { ScanBuffer } from "../buffer.js";
4
+ export interface AuditContext {
5
+ readonly db: Database.Database;
6
+ readonly buffer: ScanBuffer;
7
+ /** Optional tally bucket — passes increment their own count for the review summary. */
8
+ readonly tally: Record<string, number>;
9
+ }
10
+ /**
11
+ * Deterministic rule-based reaction to a BufferEvent. Passes mutate the
12
+ * ScanBuffer (apply memory rules, dedup, link to recurrences, etc.) so data
13
+ * lands clean before the review TUI sees it.
14
+ *
15
+ * Declare which event kinds you care about up front via `kinds` — the audit
16
+ * engine indexes once and dispatches only the matching passes per event.
17
+ */
18
+ export interface AuditPass {
19
+ readonly name: string;
20
+ readonly kinds: readonly EventKind[];
21
+ apply(event: BufferEvent, ctx: AuditContext): Promise<void>;
22
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,7 @@
1
+ import type { Chunk, DecryptedFile } from "../engine.js";
2
+ /**
3
+ * Split one decrypted PDF into N single-page Chunks. Each chunk is a
4
+ * standalone, valid PDF so the per-chunk LLM agent gets a clean document
5
+ * without siblings.
6
+ */
7
+ export declare function chunkPdf(file: DecryptedFile): Promise<Chunk[]>;
@@ -0,0 +1,60 @@
1
/** Cached promise for the lazily imported `mupdf` module; null until first use. */
let mupdfPromise = null;
/**
 * Load `mupdf` on first call and hand back the same promise on every later
 * call, so the dynamic import is started at most once.
 */
function getMupdf() {
    return mupdfPromise ? mupdfPromise : (mupdfPromise = import("mupdf"));
}
7
/**
 * Build one Chunk holding exactly page `pageIndex` of `file`. The source
 * document is re-opened and every other page is deleted, back-to-front so
 * the remaining indices stay stable while pages are removed.
 *
 * Both native resources are released deterministically: the cloned document
 * in the outer `finally`, and the buffer returned by `saveToBuffer` in the
 * inner `finally` once its bytes have been copied into a Node Buffer
 * (`Buffer.from` on a typed array copies, so the chunk does not alias the
 * released memory).
 *
 * @param file      Decrypted source file (`decryptedBytes`, `mime`, `path`, ...).
 * @param pageIndex Zero-based index of the page to keep.
 * @param pageCount Total number of pages in the source document.
 * @returns The chunk for that page; `pageNumber` is 1-based.
 */
async function extractPage(file, pageIndex, pageCount) {
    const mupdf = await getMupdf();
    const clone = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    try {
        for (let j = pageCount - 1; j >= 0; j--) {
            if (j !== pageIndex)
                clone.deletePage(j);
        }
        const out = clone.saveToBuffer("decrypt");
        try {
            return {
                chunkId: `${file.path}#p${pageIndex + 1}`,
                fileId: file.path,
                fileName: file.fileName,
                relPath: file.relPath,
                pageNumber: pageIndex + 1,
                totalPages: pageCount,
                bytes: Buffer.from(out.asUint8Array()),
                mime: file.mime,
            };
        }
        finally {
            out.destroy();
        }
    }
    finally {
        clone.destroy();
    }
}
38
/**
 * Split one decrypted document into single-page Chunks, in page order.
 * The document is opened once just to count its pages (and released
 * straight away); each page is then extracted on its own. A document that
 * reports no pages yields an empty array.
 *
 * @param file Decrypted source file.
 * @returns One Chunk per page.
 */
export async function chunkPdf(file) {
    const mupdf = await getMupdf();
    const probe = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    let total;
    try {
        total = probe.countPages();
    }
    finally {
        probe.destroy();
    }
    const pages = [];
    // A non-positive count never enters the loop, so the result is simply empty.
    for (let page = 0; page < total; page += 1) {
        pages.push(await extractPage(file, page, total));
    }
    return pages;
}
@@ -0,0 +1,34 @@
1
+ import type Database from "libsql";
2
+ export interface StoredPassword {
3
+ id: string;
4
+ pattern: string;
5
+ password: string;
6
+ useCount: number;
7
+ lastUsedAt: string | null;
8
+ }
9
+ /**
10
+ * Derive a regex from a filename. Strategy: take the leading alphabetic-ish
11
+ * prefix (up to the first separator: underscore, hyphen, space, or dot) and
12
+ * wildcard everything after it. Looser than a literal match — `AcctSt_May26.pdf`
13
+ * and `AcctSt_Jun26.pdf` share the same pattern.
14
+ *
15
+ * Falls back to the older digit-collapse strategy when the prefix is too short
16
+ * (<3 chars) or doesn't start with a letter, so we don't end up with overly
17
+ * generic patterns like `^a.*` or `^\d+.*`.
18
+ *
19
+ * Examples:
20
+ * `AcctSt_May26.pdf` → `^acctst.*`
21
+ * `KBank-Savings-2026-01.pdf` → `^kbank.*`
22
+ * `statement.pdf` → `^statement.*`
23
+ * `1234567890.pdf` → `^\d+\.pdf$` (fallback)
24
+ * `e-statement.pdf` → `^e\-statement\.pdf$` (fallback — prefix too short)
25
+ */
26
+ export declare function suggestPattern(filename: string): string;
27
+ /** Stored passwords whose pattern matches the basename of `filePath`. */
28
+ export declare function findCandidates(db: Database.Database, filePath: string, dbKey: string): StoredPassword[];
29
+ /**
30
+ * Upsert by pattern. If the pattern already exists the row is replaced — useful
31
+ * when the bank rotates the password for a recurring statement series.
32
+ */
33
+ export declare function savePassword(db: Database.Database, pattern: string, password: string, dbKey: string): string;
34
+ export declare function recordUse(db: Database.Database, id: string): void;