plasalid 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/README.md +3 -4
  2. package/dist/ai/agent.d.ts +6 -7
  3. package/dist/ai/agent.js +27 -11
  4. package/dist/ai/personas.js +48 -46
  5. package/dist/ai/system-prompt.js +1 -1
  6. package/dist/ai/tools/account-mutex.d.ts +1 -0
  7. package/dist/ai/tools/account-mutex.js +16 -0
  8. package/dist/ai/tools/index.js +4 -12
  9. package/dist/ai/tools/ingest.d.ts +1 -1
  10. package/dist/ai/tools/ingest.js +282 -242
  11. package/dist/ai/tools/merchants.js +1 -28
  12. package/dist/ai/tools/read.js +8 -8
  13. package/dist/ai/tools/record.js +3 -36
  14. package/dist/ai/tools/resolve.js +25 -22
  15. package/dist/ai/tools/scan.js +0 -1
  16. package/dist/ai/tools/types.d.ts +14 -21
  17. package/dist/cli/commands/record.js +1 -82
  18. package/dist/cli/commands/resolve.d.ts +5 -2
  19. package/dist/cli/commands/resolve.js +36 -5
  20. package/dist/cli/commands/revert.js +4 -2
  21. package/dist/cli/commands/rules.js +2 -2
  22. package/dist/cli/commands/scan.js +199 -128
  23. package/dist/cli/commands/status.js +5 -5
  24. package/dist/cli/index.js +8 -29
  25. package/dist/cli/ink/ScanDashboard.d.ts +49 -0
  26. package/dist/cli/ink/ScanDashboard.js +214 -0
  27. package/dist/cli/ink/scan_dashboard.d.ts +40 -25
  28. package/dist/cli/ink/scan_dashboard.js +139 -44
  29. package/dist/db/queries/account-balance.d.ts +1 -1
  30. package/dist/db/queries/questions.d.ts +62 -0
  31. package/dist/db/queries/questions.js +110 -0
  32. package/dist/db/queries/transactions.d.ts +1 -1
  33. package/dist/db/queries/unknowns.d.ts +17 -15
  34. package/dist/db/queries/unknowns.js +35 -39
  35. package/dist/db/schema.js +6 -28
  36. package/dist/scanner/audit/auditor.d.ts +31 -0
  37. package/dist/scanner/audit/auditor.js +72 -0
  38. package/dist/scanner/audit/engine.d.ts +10 -0
  39. package/dist/scanner/audit/engine.js +98 -0
  40. package/dist/scanner/audit/eventBus.d.ts +60 -0
  41. package/dist/scanner/audit/eventBus.js +35 -0
  42. package/dist/scanner/audit/passes/index.d.ts +11 -0
  43. package/dist/scanner/audit/passes/index.js +9 -0
  44. package/dist/scanner/audit/passes/types.d.ts +23 -0
  45. package/dist/scanner/audit/passes/types.js +1 -0
  46. package/dist/scanner/audit/types.d.ts +27 -0
  47. package/dist/scanner/audit/types.js +1 -0
  48. package/dist/scanner/auditor.d.ts +51 -0
  49. package/dist/scanner/auditor.js +80 -0
  50. package/dist/scanner/buffer/engine.d.ts +9 -0
  51. package/dist/scanner/buffer/engine.js +110 -0
  52. package/dist/scanner/buffer/sharedBuffer.d.ts +78 -0
  53. package/dist/scanner/buffer/sharedBuffer.js +130 -0
  54. package/dist/scanner/buffer/types.d.ts +67 -0
  55. package/dist/scanner/buffer/types.js +1 -0
  56. package/dist/scanner/buffer.d.ts +45 -38
  57. package/dist/scanner/buffer.js +93 -61
  58. package/dist/scanner/bus/engine.d.ts +11 -0
  59. package/dist/scanner/bus/engine.js +42 -0
  60. package/dist/scanner/bus/types.d.ts +53 -0
  61. package/dist/scanner/bus/types.js +1 -0
  62. package/dist/scanner/bus.d.ts +38 -0
  63. package/dist/scanner/bus.js +37 -0
  64. package/dist/scanner/chunk-worker.d.ts +19 -0
  65. package/dist/scanner/chunk-worker.js +67 -0
  66. package/dist/scanner/chunkWorker.d.ts +20 -0
  67. package/dist/scanner/chunkWorker.js +59 -0
  68. package/dist/scanner/chunker/chunker.d.ts +7 -0
  69. package/dist/scanner/chunker/chunker.js +60 -0
  70. package/dist/scanner/chunker.d.ts +7 -0
  71. package/dist/scanner/chunker.js +60 -0
  72. package/dist/scanner/converge.d.ts +29 -0
  73. package/dist/scanner/converge.js +15 -0
  74. package/dist/scanner/decrypt.d.ts +10 -0
  75. package/dist/scanner/decrypt.js +80 -0
  76. package/dist/scanner/engine/scanEngine.d.ts +24 -0
  77. package/dist/scanner/engine/scanEngine.js +87 -0
  78. package/dist/scanner/engine/types.d.ts +90 -0
  79. package/dist/scanner/engine/types.js +1 -0
  80. package/dist/scanner/engine.d.ts +90 -0
  81. package/dist/scanner/engine.js +84 -0
  82. package/dist/scanner/file-worker.d.ts +33 -0
  83. package/dist/scanner/file-worker.js +28 -0
  84. package/dist/scanner/fileWorker.d.ts +33 -0
  85. package/dist/scanner/fileWorker.js +22 -0
  86. package/dist/scanner/hooks/types.d.ts +25 -0
  87. package/dist/scanner/hooks/types.js +1 -0
  88. package/dist/scanner/hooks.d.ts +23 -0
  89. package/dist/scanner/hooks.js +1 -0
  90. package/dist/scanner/parse.d.ts +10 -0
  91. package/dist/scanner/parse.js +47 -0
  92. package/dist/scanner/passes/index.d.ts +8 -0
  93. package/dist/scanner/passes/index.js +6 -0
  94. package/dist/scanner/passes/types.d.ts +22 -0
  95. package/dist/scanner/passes/types.js +1 -0
  96. package/dist/scanner/pdf/chunker.d.ts +7 -0
  97. package/dist/scanner/pdf/chunker.js +60 -0
  98. package/dist/scanner/pdf/password-store.d.ts +34 -0
  99. package/dist/scanner/pdf/password-store.js +83 -0
  100. package/dist/scanner/pdf/pdf-unlock.d.ts +17 -0
  101. package/dist/scanner/pdf/pdf-unlock.js +50 -0
  102. package/dist/scanner/pdf/pdf.d.ts +17 -0
  103. package/dist/scanner/pdf/pdf.js +36 -0
  104. package/dist/scanner/pdf/state-machine.d.ts +60 -0
  105. package/dist/scanner/pdf/state-machine.js +64 -0
  106. package/dist/scanner/pdf/unlock.d.ts +22 -0
  107. package/dist/scanner/pdf/unlock.js +121 -0
  108. package/dist/scanner/phase-decrypt.d.ts +10 -0
  109. package/dist/scanner/phase-decrypt.js +80 -0
  110. package/dist/scanner/phase-parse.d.ts +10 -0
  111. package/dist/scanner/phase-parse.js +46 -0
  112. package/dist/scanner/phases/chunk.d.ts +8 -0
  113. package/dist/scanner/phases/chunk.js +13 -0
  114. package/dist/scanner/phases/commit.d.ts +12 -0
  115. package/dist/scanner/phases/commit.js +140 -0
  116. package/dist/scanner/phases/decrypt.d.ts +10 -0
  117. package/dist/scanner/phases/decrypt.js +80 -0
  118. package/dist/scanner/phases/parse.d.ts +10 -0
  119. package/dist/scanner/phases/parse.js +46 -0
  120. package/dist/scanner/phases/resolve.d.ts +10 -0
  121. package/dist/scanner/phases/resolve.js +17 -0
  122. package/dist/scanner/phases/review.d.ts +10 -0
  123. package/dist/scanner/phases/review.js +12 -0
  124. package/dist/scanner/progress.d.ts +14 -0
  125. package/dist/scanner/progress.js +21 -0
  126. package/dist/scanner/resolver-memory.d.ts +8 -0
  127. package/dist/scanner/resolver-memory.js +24 -0
  128. package/dist/scanner/resolver.d.ts +39 -0
  129. package/dist/scanner/resolver.js +196 -0
  130. package/dist/scanner/result.d.ts +17 -0
  131. package/dist/scanner/result.js +19 -0
  132. package/dist/scanner/run-passes.d.ts +30 -0
  133. package/dist/scanner/run-passes.js +15 -0
  134. package/dist/scanner/unlock.js +1 -1
  135. package/dist/scanner/worker.d.ts +19 -0
  136. package/dist/scanner/worker.js +67 -0
  137. package/dist/scanner/workers/chunkWorker.d.ts +20 -0
  138. package/dist/scanner/workers/chunkWorker.js +65 -0
  139. package/dist/scanner/workers/fileWorker.d.ts +32 -0
  140. package/dist/scanner/workers/fileWorker.js +22 -0
  141. package/package.json +1 -1
@@ -0,0 +1,60 @@
1
// Memoised promise for the lazily imported mupdf module; stays null until first use.
let mupdfPromise = null;
/**
 * Resolve the mupdf module, importing it on the first call only. Every later
 * call receives the same promise, so the dynamic import is issued at most once.
 */
function getMupdf() {
    if (mupdfPromise === null) {
        mupdfPromise = import("mupdf");
    }
    return mupdfPromise;
}
7
/**
 * Build one Chunk holding exactly page `pageIndex` of `file`. mupdf has no
 * native page-range extract, so we clone the source doc and delete every
 * other page, back-to-front so indices stay stable as we splice.
 *
 * Both native handles are released deterministically: the serialized buffer
 * is destroyed as soon as its bytes have been copied into a Node Buffer, and
 * the cloned doc is destroyed in the outer `finally`, so neither a deletePage
 * nor a saveToBuffer failure can leak it.
 *
 * @param file Decrypted source file; reads `decryptedBytes`, `mime`, `path`,
 *   `fileName` and `relPath`.
 * @param pageIndex Zero-based index of the page to keep.
 * @param pageCount Total number of pages in the source document.
 * @returns A Chunk whose `pageNumber` is 1-based and whose `bytes` hold the
 *   re-serialized single-page document.
 */
async function extractPage(file, pageIndex, pageCount) {
    const mupdf = await getMupdf();
    const clone = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    try {
        for (let j = pageCount - 1; j >= 0; j--) {
            if (j !== pageIndex)
                clone.deletePage(j);
        }
        const out = clone.saveToBuffer("decrypt");
        try {
            return {
                chunkId: `${file.path}#p${pageIndex + 1}`,
                fileId: file.path,
                fileName: file.fileName,
                relPath: file.relPath,
                pageNumber: pageIndex + 1,
                totalPages: pageCount,
                // Buffer.from(Uint8Array) copies, so the chunk does not alias
                // the native memory released just below.
                bytes: Buffer.from(out.asUint8Array()),
                mime: file.mime,
            };
        }
        finally {
            out.destroy();
        }
    }
    finally {
        clone.destroy();
    }
}
38
/**
 * Turn one decrypted PDF into a list of single-page Chunks, one per page and
 * in page order. A short-lived probe document is opened only to read the page
 * count and is destroyed before any page is extracted. Pages are extracted
 * one after another; a non-positive page count yields an empty list.
 */
export async function chunkPdf(file) {
    const mupdf = await getMupdf();
    const probeDoc = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    let total;
    try {
        total = probeDoc.countPages();
    }
    finally {
        probeDoc.destroy();
    }
    const pages = [];
    let index = 0;
    while (index < total) {
        pages.push(await extractPage(file, index, total));
        index += 1;
    }
    return pages;
}
@@ -0,0 +1,7 @@
1
+ import type { Chunk, DecryptedFile } from "./engine.js";
2
+ /**
3
+ * Split one decrypted PDF into N single-page Chunks. Each chunk is a
4
+ * standalone, valid PDF so the per-chunk LLM agent gets a clean document
5
+ * without siblings.
+ *
+ * @param file Decrypted source document; its `decryptedBytes` and `mime` are handed to mupdf.
+ * @returns One Chunk per page in ascending page order; an empty array when the page count is zero or less.
6
+ */
7
+ export declare function chunkPdf(file: DecryptedFile): Promise<Chunk[]>;
@@ -0,0 +1,60 @@
1
// Memoised promise for the lazily imported mupdf module; stays null until first use.
let mupdfPromise = null;
/**
 * Resolve the mupdf module, importing it on the first call only. Every later
 * call receives the same promise, so the dynamic import is issued at most once.
 */
function getMupdf() {
    if (mupdfPromise === null) {
        mupdfPromise = import("mupdf");
    }
    return mupdfPromise;
}
7
/**
 * Build one Chunk holding exactly page `pageIndex` of `file`. mupdf has no
 * native page-range extract, so we clone the source doc and delete every
 * other page, back-to-front so indices stay stable as we splice.
 *
 * Both native handles are released deterministically: the serialized buffer
 * is destroyed as soon as its bytes have been copied into a Node Buffer, and
 * the cloned doc is destroyed in the outer `finally`, so neither a deletePage
 * nor a saveToBuffer failure can leak it.
 *
 * @param file Decrypted source file; reads `decryptedBytes`, `mime`, `path`,
 *   `fileName` and `relPath`.
 * @param pageIndex Zero-based index of the page to keep.
 * @param pageCount Total number of pages in the source document.
 * @returns A Chunk whose `pageNumber` is 1-based and whose `bytes` hold the
 *   re-serialized single-page document.
 */
async function extractPage(file, pageIndex, pageCount) {
    const mupdf = await getMupdf();
    const clone = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    try {
        for (let j = pageCount - 1; j >= 0; j--) {
            if (j !== pageIndex)
                clone.deletePage(j);
        }
        const out = clone.saveToBuffer("decrypt");
        try {
            return {
                chunkId: `${file.path}#p${pageIndex + 1}`,
                fileId: file.path,
                fileName: file.fileName,
                relPath: file.relPath,
                pageNumber: pageIndex + 1,
                totalPages: pageCount,
                // Buffer.from(Uint8Array) copies, so the chunk does not alias
                // the native memory released just below.
                bytes: Buffer.from(out.asUint8Array()),
                mime: file.mime,
            };
        }
        finally {
            out.destroy();
        }
    }
    finally {
        clone.destroy();
    }
}
38
/**
 * Turn one decrypted PDF into a list of single-page Chunks, one per page and
 * in page order. A short-lived probe document is opened only to read the page
 * count and is destroyed before any page is extracted. Pages are extracted
 * one after another; a non-positive page count yields an empty list.
 */
export async function chunkPdf(file) {
    const mupdf = await getMupdf();
    const probeDoc = mupdf.Document.openDocument(file.decryptedBytes, file.mime);
    let total;
    try {
        total = probeDoc.countPages();
    }
    finally {
        probeDoc.destroy();
    }
    const pages = [];
    let index = 0;
    while (index < total) {
        pages.push(await extractPage(file, index, total));
        index += 1;
    }
    return pages;
}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Drive a stateful loop toward convergence: keep running passes until the
3
+ * caller's `isDone` predicate is true (success), `isStalled` returns true
4
+ * across two passes (stall), or `maxAttempts` is exhausted (fail).
5
+ *
6
+ * The driver owns counting passes, stall detection, and the iteration cap.
7
+ * Everything else (work per pass, callbacks per terminal state) lives in the
8
+ * hooks the caller supplies. `S` is whatever quantity decides "are we done?".
9
+ */
10
+ export interface ConvergeOpts<S> {
11
+ /** Initial state (e.g. `countQuestions(db)`). */
12
+ initial: S;
13
+ /** Maximum number of passes before declaring failure. Must be >= 1. */
14
+ maxAttempts: number;
15
+ /** True when the work is finished and the loop should stop cleanly. */
16
+ isDone: (state: S) => boolean;
17
+ /**
18
+ * True when this pass made no progress vs the previous pass. Fires after
19
+ * the first pass at the earliest.
20
+ */
21
+ isStalled: (curr: S, prev: S) => boolean;
22
+ /** Run one pass; return the new state. Pass numbers are 1-indexed. */
23
+ onPass: (pass: number, state: S) => Promise<S>;
24
+ onStart?: (state: S) => void; // Called once with `initial`, before `isDone` is consulted or any pass runs.
25
+ onStall?: (state: S) => void; // Called when `isStalled` reports no progress; converge then returns without calling onSuccess or onFail.
26
+ onSuccess?: (state: S) => void; // Called when the loop ends with `isDone(state)` true, including when `initial` is already done.
27
+ onFail?: (state: S) => void; // Called when the loop ends with `isDone(state)` still false, i.e. the attempt budget ran out.
28
+ }
29
+ export declare function converge<S>(opts: ConvergeOpts<S>): Promise<S>; // Resolves with the last state reached, whichever way the loop ended (success, stall, or failure).
@@ -0,0 +1,15 @@
1
+ export async function converge(opts) {
2
+ let state = opts.initial;
3
+ let prev = state;
4
+ opts.onStart?.(state);
5
+ for (let pass = 1; pass <= opts.maxAttempts && !opts.isDone(state); pass++) {
6
+ if (pass > 1 && opts.isStalled(state, prev)) {
7
+ opts.onStall?.(state);
8
+ return state;
9
+ }
10
+ prev = state;
11
+ state = await opts.onPass(pass, state);
12
+ }
13
+ (opts.isDone(state) ? opts.onSuccess : opts.onFail)?.(state);
14
+ return state;
15
+ }
@@ -0,0 +1,10 @@
1
+ import type Database from "libsql";
2
+ import type { ScanState } from "./engine.js";
3
+ import type { ScanHooks } from "./hooks.js";
4
+ /**
5
+ * Phase 1 — walk the data dir, optionally filter by regex, decrypt each file
6
+ * sequentially (password prompts can't share a TTY). Output partitions into
7
+ * decrypted / skipped / failed via a kind-keyed dispatch map. Bootstrapped
8
+ * scanned_files rows are tagged onto each DecryptedFile.
+ *
+ * @param db Database handle used for the scanned_files hash lookup and the bootstrap inserts.
+ * @param state Mutated in place: `files`, `decrypted`, `skipped` and `failed` are filled.
+ * @param hooks `beforeDecrypt` / `afterDecrypt` are awaited around the phase when present.
9
+ */
10
+ export declare function decryptPhase(db: Database.Database, state: ScanState, hooks: ScanHooks): Promise<void>;
@@ -0,0 +1,80 @@
1
+ import { randomUUID } from "crypto";
2
+ import { readPdf } from "./pdf/pdf.js";
3
+ import { unlockIfNeeded, persistUnlockOutcome } from "./pdf/unlock.js";
4
+ import { scanDataDir } from "./walker.js";
5
+ import { tryExecute } from "./result.js";
6
+ function findScannedByHash(db, hash) {
7
+ return db
8
+ .prepare(`SELECT id FROM scanned_files WHERE file_hash = ?`)
9
+ .get(hash) ?? null;
10
+ }
11
/**
 * Read, de-duplicate and unlock a single file, reporting the result as a
 * kind-tagged outcome instead of throwing.
 *
 * - `failed`: reading or unlocking did not succeed.
 * - `skipped`: a scanned_files row with the same hash exists and `opts.force`
 *   is not set.
 * - `decrypted`: carries the DecryptedFile; `replacesPriorScannedFileId` is
 *   the id of the existing row, if any.
 */
async function decryptOne(db, file, opts) {
    const readResult = await tryExecute(() => readPdf(file.path));
    if (!readResult.ok) {
        return { kind: "failed", error: `read failed: ${readResult.error}` };
    }
    const pdf = readResult.value;
    const prior = findScannedByHash(db, pdf.hash);
    const alreadyScanned = Boolean(prior) && !opts.force;
    if (alreadyScanned) {
        return { kind: "skipped", existingScannedFileId: prior.id };
    }
    const unlockRequest = () => unlockIfNeeded({
        db,
        filePath: file.path,
        bytes: pdf.bytes,
        interactive: opts.interactive,
    });
    const unlockResult = await tryExecute(unlockRequest);
    if (!unlockResult.ok) {
        return { kind: "failed", error: unlockResult.error || "unlock failed" };
    }
    // Record the unlock outcome before handing the bytes on.
    persistUnlockOutcome(db, file.path, unlockResult.value.outcome);
    const decryptedFile = {
        path: file.path,
        fileName: file.name,
        relPath: file.relPath,
        hash: pdf.hash,
        mime: pdf.mime,
        decryptedBytes: unlockResult.value.decrypted,
        replacesPriorScannedFileId: prior?.id,
    };
    return { kind: "decrypted", file: decryptedFile };
}
42
+ const APPLY = {
43
+ decrypted: (state, _file, o) => { state.decrypted.push(o.file); },
44
+ skipped: (state, file, o) => { state.skipped.push({ file, existingScannedFileId: o.existingScannedFileId }); },
45
+ failed: (state, file, o) => { state.failed.push({ file, error: o.error }); },
46
+ };
47
+ /**
48
+ * Bootstrap one scanned_files row per decrypted file. Chunk workers later
49
+ * stamp transactions with source_file_id, so the row must exist before any
50
+ * tool writes hit the DB. Status flips to 'scanned' after parse completes.
51
+ */
52
+ function bootstrapScannedFiles(db, state) {
53
+ for (const file of state.decrypted) {
54
+ if (file.replacesPriorScannedFileId) {
55
+ db.prepare(`DELETE FROM scanned_files WHERE id = ?`).run(file.replacesPriorScannedFileId);
56
+ }
57
+ const sfId = `sf:${randomUUID()}`;
58
+ db.prepare(`INSERT INTO scanned_files (id, path, file_hash, mime, status) VALUES (?, ?, ?, ?, 'pending')`).run(sfId, file.path, file.hash, file.mime);
59
+ file.scannedFileId = sfId;
60
+ }
61
+ }
62
/**
 * First phase of a scan: list the data dir, keep the files whose `relPath`
 * matches the optional case-insensitive `options.regex`, then decrypt them
 * one at a time (password prompts can't share a TTY). Each outcome is routed
 * into `state.decrypted`, `state.skipped` or `state.failed` through the APPLY
 * table. Afterwards, scanned_files rows are bootstrapped for every decrypted
 * file. `beforeDecrypt` / `afterDecrypt` hooks wrap the whole phase.
 */
export async function decryptPhase(db, state, hooks) {
    await hooks.beforeDecrypt?.(state);
    const pattern = state.options.regex ? new RegExp(state.options.regex, "i") : null;
    state.files = scanDataDir().filter(entry => pattern === null || pattern.test(entry.relPath));
    const interactive = state.options.interactive ?? true;
    const force = Boolean(state.options.force);
    for (const candidate of state.files) {
        const result = await decryptOne(db, candidate, { force, interactive });
        APPLY[result.kind](state, candidate, result);
    }
    bootstrapScannedFiles(db, state);
    await hooks.afterDecrypt?.(state);
}
@@ -0,0 +1,24 @@
1
+ import type Database from "libsql";
2
+ import type { CommitOutcome, Phase, PhaseName, RunScanOptions, ScanState } from "./types.js";
3
+ import type { ScanHooks } from "../hooks/types.js";
4
+ export interface ScanResult { // What runScan resolves with once the phase chain has stopped.
5
+ readonly scanId: string; // `sc:` followed by a random UUID, minted at the start of the run.
6
+ readonly state: ScanState; // The state object every phase mutated during the run.
7
+ readonly committed: CommitOutcome | null; // Mirrors `state.committed` at return time.
8
+ readonly aborted: boolean; // True when a phase threw or the review phase left `state.review` as "abort".
9
+ }
10
+ export declare const DEFAULT_PHASES: readonly { // Built-in chain, in run order: decrypt, chunk, parse, review, commit.
11
+ name: PhaseName; // Label recorded in `state.errors` and passed to `onError` if the phase throws.
12
+ phase: Phase; // The phase function itself.
13
+ }[];
14
+ /**
15
+ * Composition root for a scan run. Builds the singleton subdomain instances
16
+ * (bus, buffer, audit engine) once, threads them through ScanState, then
17
+ * runs the phase chain. Auditor lifecycle wraps the whole chain so it sees
18
+ * every event from decrypt through commit.
19
+ *
20
+ * Per-run isolation: every call to runScan creates fresh instances. Nothing
21
+ * survives between scans.
+ *
+ * @param db Database handle passed to the audit engine and to every phase.
+ * @param opts Run options; `opts.phases` replaces DEFAULT_PHASES when provided. Defaults to `{}`.
+ * @param hooks Optional lifecycle callbacks (`onStart`, `onError`, `onFinish`, plus per-phase hooks). Defaults to `{}`.
+ * @returns The scan id, the final state, `state.committed`, and whether the chain stopped early.
22
+ */
23
+ export declare function runScan(db: Database.Database, opts?: RunScanOptions, hooks?: ScanHooks): Promise<ScanResult>;
24
+ export type { ScanState, ScanHooks, RunScanOptions, CommitOutcome, } from "./types.js";
@@ -0,0 +1,87 @@
1
+ import { randomUUID } from "crypto";
2
+ import { createBus } from "../bus/engine.js";
3
+ import { createBuffer } from "../buffer/engine.js";
4
+ import { createAuditEngine } from "../audit/engine.js";
5
+ import { AUDIT_PASSES } from "../audit/passes/index.js";
6
+ import { decryptPhase } from "../phases/decrypt.js";
7
+ import { chunkPhase } from "../phases/chunk.js";
8
+ import { parsePhase } from "../phases/parse.js";
9
+ import { reviewPhase } from "../phases/review.js";
10
+ import { commitPhase } from "../phases/commit.js";
11
/**
 * Built-in phase chain, listed in execution order. runScan falls back to it
 * when `opts.phases` is not supplied.
 */
export const DEFAULT_PHASES = [
    ["decrypt", decryptPhase],
    ["chunk", chunkPhase],
    ["parse", parsePhase],
    ["review", reviewPhase],
    ["commit", commitPhase],
].map(([name, phase]) => ({ name, phase }));
18
/**
 * Entry point for one scan run. A fresh bus, buffer and audit engine are
 * created per call and threaded through the ScanState handed to each phase,
 * so nothing is shared between runs.
 *
 * The audit engine is started before the first phase and stopped in a
 * `finally`, after which its tally is copied into `state.auditApplied` and
 * `onFinish` fires. The chain stops early when a phase throws (the error is
 * recorded in `state.errors` and reported via `onError`) or when the review
 * phase leaves `state.review` set to "abort".
 */
export async function runScan(db, opts = {}, hooks = {}) {
    const scanId = `sc:${randomUUID()}`;
    const bus = createBus();
    const buffer = createBuffer(scanId, bus);
    const audit = createAuditEngine({ db, bus, buffer, passes: AUDIT_PASSES });
    const state = {
        scanId,
        startedAt: Date.now(),
        options: opts,
        buffer,
        bus,
        files: [],
        decrypted: [],
        skipped: [],
        failed: [],
        chunks: [],
        review: null,
        committed: null,
        errors: [],
        auditApplied: {},
    };
    await fire(hooks.onStart, state);
    audit.start();
    const chain = opts.phases ?? DEFAULT_PHASES;
    let aborted = false;
    try {
        for (const { name: phaseName, phase: runPhase } of chain) {
            try {
                await runPhase(db, state, hooks);
            }
            catch (err) {
                state.errors.push({ phase: phaseName, error: err });
                await fire(hooks.onError, err, phaseName, state);
                aborted = true;
            }
            // A rejected review stops the chain just like a thrown phase does.
            if (!aborted && phaseName === "review" && state.review === "abort") {
                aborted = true;
            }
            if (aborted) {
                break;
            }
        }
    }
    finally {
        audit.stop();
        Object.entries(audit.tally).forEach(([passName, applied]) => {
            state.auditApplied[passName] = applied;
        });
        await fire(hooks.onFinish, state);
    }
    return { scanId, state, committed: state.committed, aborted };
}
78
+ async function fire(fn, ...args) {
79
+ if (!fn)
80
+ return;
81
+ try {
82
+ await fn(...args);
83
+ }
84
+ catch (err) {
85
+ console.error(`[scan-engine hook] ${err.message}`);
86
+ }
87
+ }
@@ -0,0 +1,90 @@
1
+ import type Database from "libsql";
2
+ import type { ScannedFile } from "../walker.js";
3
+ import type { ScanBuffer } from "../buffer/types.js";
4
+ import type { Bus } from "../bus/types.js";
5
+ export type MaybePromise<T> = T | Promise<T>; // A value that may be delivered directly or through a promise.
6
+ export interface Chunk { // One page of a source file, packaged as its own document.
7
+ readonly chunkId: string; // The PDF chunker in this package builds it as `<file path>#p<pageNumber>`.
8
+ readonly fileId: string; // The PDF chunker sets this to the source file's `path`.
9
+ readonly fileName: string; // Copied from the source DecryptedFile.
10
+ readonly relPath: string; // Copied from the source DecryptedFile.
11
+ readonly pageNumber: number; // 1-based page position within the source file.
12
+ readonly totalPages: number; // Page count of the source file.
13
+ readonly bytes: Buffer; // Serialized single-page document.
14
+ readonly mime: string; // Copied from the source DecryptedFile.
15
+ }
16
+ export interface DecryptedFile { // A file that was read and unlocked successfully by the decrypt phase.
17
+ readonly path: string; // Path the file was read from.
18
+ readonly fileName: string; // The walker entry's `name`.
19
+ readonly relPath: string; // The walker entry's `relPath`; this is what `options.regex` is tested against.
20
+ readonly hash: string; // Content hash reported by the PDF reader; compared with scanned_files.file_hash.
21
+ readonly mime: string; // MIME type reported by the PDF reader.
22
+ readonly decryptedBytes: Buffer; // Bytes returned by the unlock step.
23
+ readonly replacesPriorScannedFileId?: string; // Id of an existing scanned_files row with the same hash; only present on forced re-scans.
24
+ }
25
+ export interface SkippedFile { // A file left alone because its hash already has a scanned_files row and `force` was off.
26
+ readonly file: ScannedFile; // The walker entry that was skipped.
27
+ readonly existingScannedFileId: string; // Id of the scanned_files row that matched the hash.
28
+ }
29
+ export interface FailedFile { // A file the decrypt phase could not read or unlock.
30
+ readonly file: ScannedFile; // The walker entry that failed.
31
+ readonly error: string; // Failure text; read failures are prefixed with "read failed: ".
32
+ }
33
+ export interface PhaseError { // Record of an exception that escaped a phase.
34
+ readonly phase: PhaseName; // Name of the phase that threw.
35
+ readonly target?: string; // NOTE(review): runScan does not set this; presumably identifies the file or chunk involved — confirm against the phases that populate it.
36
+ readonly error: unknown; // The thrown value exactly as caught.
37
+ }
38
+ export type PhaseName = "decrypt" | "chunk" | "parse" | "review" | "commit"; // Names of the five built-in phases, in their default run order.
39
+ export type ReviewDecision = "commit" | "abort"; // "abort" makes runScan stop the chain right after the review phase.
40
+ export interface CommitOutcome { // NOTE(review): produced by the commit phase, which is not visible here — the numeric fields are presumably per-entity row counts; confirm.
41
+ readonly transactions: number;
42
+ readonly accounts: number;
43
+ readonly merchants: number;
44
+ readonly unknowns: number;
45
+ readonly scannedFileIds: readonly string[];
46
+ }
47
+ export interface RunScanOptions { // Caller-supplied knobs for one runScan call; every field is optional.
48
+ regex?: string; // Compiled case-insensitively and tested against each file's `relPath`; unset keeps every file.
49
+ force?: boolean; // When truthy, files whose hash already has a scanned_files row are processed again instead of skipped.
50
+ interactive?: boolean; // Forwarded to the unlock step; treated as true when omitted.
51
+ /** Max FileWorkers running concurrently. Default 5, hard cap 8. */
52
+ maxFileWorkers?: number;
53
+ /** Max ChunkWorkers per FileWorker. Default 5, hard cap 8. */
54
+ maxChunkWorkersPerFile?: number;
55
+ review?: boolean;
56
+ autoCommit?: boolean;
57
+ /**
58
+ * Override the phase chain. Default = the five built-in phases. Extending
59
+ * tests / alternate flows (dry-run, OCR-only) inject their own without
60
+ * editing the engine. Open for extension; closed for modification.
61
+ */
62
+ phases?: ReadonlyArray<{
63
+ name: PhaseName;
64
+ phase: Phase;
65
+ }>;
66
+ }
67
+ /**
68
+ * The state object threaded through every phase. Phases mutate it in place;
69
+ * hooks read it. The buffer + bus are interfaces — the engine owns the
70
+ * factory instances and injects them here.
71
+ */
72
+ export interface ScanState {
73
+ readonly scanId: string; // `sc:` followed by a random UUID.
74
+ readonly startedAt: number; // `Date.now()` captured when the state was built.
75
+ readonly options: RunScanOptions; // The options object exactly as passed to runScan.
76
+ readonly buffer: ScanBuffer; // Created per run and bound to this scanId and bus.
77
+ readonly bus: Bus; // Created per run; also handed to the buffer and the audit engine.
78
+ files: ScannedFile[]; // Walker entries that survived the optional regex filter.
79
+ decrypted: DecryptedFile[]; // Files that were read and unlocked.
80
+ skipped: SkippedFile[]; // Files skipped because they were already scanned.
81
+ failed: FailedFile[]; // Files that could not be read or unlocked.
82
+ chunks: Chunk[]; // Starts empty; filled by the chunk phase.
83
+ review: ReviewDecision | null; // Starts null; "abort" after the review phase stops the chain.
84
+ committed: CommitOutcome | null; // Starts null; echoed back as `ScanResult.committed`.
85
+ errors: PhaseError[]; // One entry per phase that threw.
86
+ auditApplied: Record<string, number>; // Copied from the audit engine's tally when the run finishes.
87
+ }
88
+ import type { ScanHooks } from "../hooks/types.js";
89
+ export type Phase = (db: Database.Database, state: ScanState, hooks: ScanHooks) => Promise<void>; // Signature shared by every phase; runScan awaits each one in order with the same db, state and hooks.
90
+ export type { ScanHooks } from "../hooks/types.js";
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,90 @@
1
+ import type Database from "libsql";
2
+ import type { ScannedFile } from "./walker.js";
3
+ import type { ScanHooks } from "./hooks.js";
4
+ import type { ScanProgress } from "./progress.js";
5
+ import type { ResolveSummary } from "./resolver.js";
6
+ export interface Chunk { // One page of a source file, packaged as its own document.
7
+ readonly chunkId: string; // Built by the PDF chunker as `<file path>#p<pageNumber>`.
8
+ readonly fileId: string; // The source file's `path`.
9
+ readonly fileName: string; // Copied from the source DecryptedFile.
10
+ readonly relPath: string; // Copied from the source DecryptedFile.
11
+ readonly pageNumber: number; // 1-based page position within the source file.
12
+ readonly totalPages: number; // Page count of the source file.
13
+ readonly bytes: Buffer; // Serialized single-page document.
14
+ readonly mime: string; // Copied from the source DecryptedFile.
15
+ }
16
+ export interface DecryptedFile { // A file that was read and unlocked successfully by the decrypt phase.
17
+ readonly path: string; // Path the file was read from.
18
+ readonly fileName: string; // The walker entry's `name`.
19
+ readonly relPath: string; // The walker entry's `relPath`; this is what `options.regex` is tested against.
20
+ readonly hash: string; // Content hash reported by the PDF reader; compared with scanned_files.file_hash.
21
+ readonly mime: string; // MIME type reported by the PDF reader.
22
+ readonly decryptedBytes: Buffer; // Bytes returned by the unlock step.
23
+ readonly replacesPriorScannedFileId?: string; // Id of an existing scanned_files row with the same hash; only present on forced re-scans.
24
+ /** scanned_files.id assigned in decryptPhase so scan-worker tools can stamp source_file_id. */
25
+ scannedFileId?: string; // Format is `sf:` followed by a random UUID; the only mutable field, written after the row is inserted.
26
+ }
27
+ export interface SkippedFile { // A file left alone because its hash already has a scanned_files row and `force` was off.
28
+ readonly file: ScannedFile; // The walker entry that was skipped.
29
+ readonly existingScannedFileId: string; // Id of the scanned_files row that matched the hash.
30
+ }
31
+ export interface FailedFile { // A file the decrypt phase could not read or unlock.
32
+ readonly file: ScannedFile; // The walker entry that failed.
33
+ readonly error: string; // Failure text; read failures are prefixed with "read failed: ".
34
+ }
35
+ export interface PhaseError { // Record of an exception that escaped a phase.
36
+ readonly phase: PhaseName; // Name of the phase that threw.
37
+ readonly target?: string; // NOTE(review): the engine's own error handler does not set this; presumably identifies the file or chunk involved — confirm.
38
+ readonly error: unknown; // The thrown value exactly as caught.
39
+ }
40
+ export type PhaseName = "decrypt" | "chunk" | "parse" | "resolve"; // Names of the four built-in phases, in their default run order.
41
+ export interface RunScanOptions { // Caller-supplied knobs for one runScan call; every field is optional.
42
+ regex?: string; // Compiled case-insensitively and tested against each file's `relPath`; unset keeps every file.
43
+ force?: boolean; // When truthy, files whose hash already has a scanned_files row are processed again instead of skipped.
44
+ interactive?: boolean; // Forwarded to the unlock step and to the resolver; treated as true when omitted.
45
+ /** Max files processed concurrently. Default 5, hard cap 8. */
46
+ maxFileWorkers?: number;
47
+ /** Max scan workers per file (one per chunk). Default 5, hard cap 8. */
48
+ maxScanWorkersPerFile?: number;
49
+ /**
50
+ * Override the phase chain. Default = the four built-ins. Tests and alternate
51
+ * flows (dry-run, OCR-only) inject their own without editing this file.
52
+ */
53
+ phases?: ReadonlyArray<{
54
+ name: PhaseName;
55
+ phase: Phase;
56
+ }>;
57
+ }
58
+ /**
59
+ * The state object threaded through every phase. Phases mutate it in place;
60
+ * hooks read it. `progress` is the single-consumer event sink scan-worker
61
+ * tools emit into; the CLI subscribes to drive the dashboard.
62
+ */
63
+ export interface ScanState {
64
+ readonly scanId: string; // `sc:` followed by a random UUID.
65
+ readonly startedAt: number; // `Date.now()` captured when the state was built.
66
+ readonly options: RunScanOptions; // The options object exactly as passed to runScan.
67
+ readonly progress: ScanProgress; // Created once per run by `createProgress()`.
68
+ files: ScannedFile[]; // Walker entries that survived the optional regex filter.
69
+ decrypted: DecryptedFile[]; // Files that were read and unlocked.
70
+ skipped: SkippedFile[]; // Files skipped because they were already scanned.
71
+ failed: FailedFile[]; // Files that could not be read or unlocked.
72
+ chunks: Chunk[]; // Single-page chunks appended by the chunk phase, file by file.
73
+ resolveSummary: ResolveSummary | null; // Starts null; set by the resolve phase to the resolver's result.
74
+ errors: PhaseError[]; // One entry per phase that threw.
75
+ }
76
+ export type Phase = (db: Database.Database, state: ScanState, hooks: ScanHooks) => Promise<void>; // Signature shared by every phase; the chain awaits each one in order with the same db, state and hooks.
77
+ export interface ScanResult { // What runScan resolves with.
78
+ readonly scanId: string; // Same id as `state.scanId`.
79
+ readonly state: ScanState; // The state object every phase mutated; inspect `state.errors` to see whether a phase threw.
80
+ }
81
+ export declare const DEFAULT_PHASES: readonly { // Built-in chain, in run order: decrypt, chunk, parse, resolve.
82
+ name: PhaseName; // Label recorded in `state.errors` and passed to `onError` if the phase throws.
83
+ phase: Phase; // The phase function itself.
84
+ }[];
85
+ /**
86
+ * Composition root. Builds the progress sink once per scan run, threads it
87
+ * through ScanState, then runs the phase chain. Nothing survives between
88
+ * scans.
+ *
+ * The chain stops at the first phase that throws; the error is recorded in
+ * `state.errors` and reported through `hooks.onError`, and runScan itself
+ * still resolves.
+ *
+ * @param db Database handle passed to every phase.
+ * @param opts Run options; `opts.phases` replaces DEFAULT_PHASES when provided. Defaults to `{}`.
+ * @param hooks Optional lifecycle callbacks; `onStart` and `onFinish` bracket the chain. Defaults to `{}`.
+ * @returns The scan id together with the final state.
89
+ */
90
+ export declare function runScan(db: Database.Database, opts?: RunScanOptions, hooks?: ScanHooks): Promise<ScanResult>;
@@ -0,0 +1,84 @@
1
+ import { randomUUID } from "crypto";
2
+ import { createProgress } from "./progress.js";
3
+ import { decryptPhase } from "./decrypt.js";
4
+ import { parsePhase } from "./parse.js";
5
+ import { chunkPdf } from "./pdf/chunker.js";
6
+ import { runResolve } from "./resolver.js";
7
+ import { errorMessage } from "./result.js";
8
/**
 * Chunk phase: split every decrypted file into single-page chunks, one file
 * at a time, appending them to `state.chunks`. The database handle is not
 * used. `beforeChunk` / `afterChunk` hooks wrap the phase.
 */
const chunkPhase = async (_db, state, hooks) => {
    await hooks.beforeChunk?.(state);
    for (const source of state.decrypted) {
        const pages = await chunkPdf(source);
        state.chunks.push(...pages);
    }
    await hooks.afterChunk?.(state);
};
14
/**
 * Resolve phase: run the resolver for this scan and store its summary on
 * `state.resolveSummary`. `interactive` defaults to true when the option is
 * omitted. `afterResolve` receives the summary as its second argument.
 */
const resolvePhase = async (db, state, hooks) => {
    await hooks.beforeResolve?.(state);
    const interactive = state.options.interactive ?? true;
    const outcome = await runResolve({ db, scanId: state.scanId, interactive });
    state.resolveSummary = outcome;
    await hooks.afterResolve?.(state, outcome);
};
24
/**
 * Built-in phase chain, listed in execution order. runScan falls back to it
 * when `opts.phases` is not supplied.
 */
export const DEFAULT_PHASES = [
    ["decrypt", decryptPhase],
    ["chunk", chunkPhase],
    ["parse", parsePhase],
    ["resolve", resolvePhase],
].map(([name, phase]) => ({ name, phase }));
30
/**
 * Entry point for one scan run. Creates a fresh progress sink and ScanState,
 * fires `onStart`, runs the phase chain (`opts.phases` or DEFAULT_PHASES),
 * fires `onFinish`, and resolves with the scan id and final state. Nothing is
 * shared between runs.
 */
export async function runScan(db, opts = {}, hooks = {}) {
    const scanId = `sc:${randomUUID()}`;
    const progress = createProgress();
    const state = {
        scanId,
        startedAt: Date.now(),
        options: opts,
        progress,
        files: [],
        decrypted: [],
        skipped: [],
        failed: [],
        chunks: [],
        resolveSummary: null,
        errors: [],
    };
    const chain = opts.phases ?? DEFAULT_PHASES;
    await fire(hooks.onStart, state);
    await runPhaseChain(db, state, hooks, chain);
    await fire(hooks.onFinish, state);
    return { scanId, state };
}
57
/**
 * Run the phases one after another, stopping at the first one that tryPhase
 * reports as aborted.
 */
async function runPhaseChain(db, state, hooks, phases) {
    for (const { name: phaseName, phase: runPhase } of phases) {
        if (await tryPhase(db, state, hooks, phaseName, runPhase)) {
            break;
        }
    }
}
64
/**
 * Run a single phase, containing any exception it throws. A failure is
 * appended to `state.errors` and reported through `hooks.onError`.
 *
 * @returns true when the phase threw (the chain should stop), false otherwise.
 */
async function tryPhase(db, state, hooks, name, phase) {
    let aborted = false;
    try {
        await phase(db, state, hooks);
    }
    catch (err) {
        aborted = true;
        state.errors.push({ phase: name, error: err });
        await fire(hooks.onError, err, name, state);
    }
    return aborted;
}
75
/**
 * Call an optional hook with the given arguments, awaiting its result. A
 * missing hook is a no-op; a throwing or rejecting hook is logged to stderr
 * and otherwise ignored, so hooks cannot break the scan.
 */
async function fire(fn, ...args) {
    if (fn) {
        try {
            await fn(...args);
        }
        catch (err) {
            console.error(`[scan-engine] ${errorMessage(err)}`);
        }
    }
}
@@ -0,0 +1,33 @@
1
+ import type Database from "libsql";
2
+ import { type ChunkWorkerDeps, type ChunkWorkerResult } from "./chunk-worker.js";
3
+ import type { Chunk } from "./engine.js";
4
+ import type { ScanHooks } from "./hooks.js";
5
+ import type { ScanProgress } from "./progress.js";
6
+ /**
7
+ * Pluggable chunk-parser strategy. Default is the LLM-driven runChunkWorker;
8
+ * tests and alternate flows (OCR-only, mock, dry-run) inject their own.
+ *
+ * Receives the per-chunk dependencies plus the scan hooks and resolves with
+ * that chunk's result.
9
+ */
10
+ export type ChunkWorkerFn = (deps: ChunkWorkerDeps, hooks: ScanHooks) => Promise<ChunkWorkerResult>;
11
+ export interface FileWorkerDeps { // Everything runFileWorker needs to process one file's chunks.
12
+ readonly db: Database.Database;
13
+ readonly scanId: string; // Id of the scan run that the workers' writes are scoped to.
14
+ readonly scannedFileId: string | undefined; // NOTE(review): presumably the scanned_files.id bootstrapped in the decrypt phase — confirm when it can be undefined.
15
+ readonly progress: ScanProgress; // Shared progress sink for per-row ticks.
16
+ readonly fileId: string; // NOTE(review): presumably matches `Chunk.fileId` for every entry in `chunks` — confirm.
17
+ readonly chunks: readonly Chunk[];
18
+ readonly maxChunkWorkers: number; // Upper bound on chunks processed in parallel for this file.
19
+ /** Optional override; defaults to the LLM-backed runChunkWorker. */
20
+ readonly chunkWorkerFn?: ChunkWorkerFn;
21
+ }
22
+ export interface FileWorkerResult { // Summary returned by runFileWorker for one file.
23
+ readonly workerId: string;
24
+ readonly fileId: string;
25
+ readonly ok: number; // NOTE(review): presumably the number of chunks that succeeded — confirm against file-worker.js.
26
+ readonly failed: number; // NOTE(review): presumably the number of chunks that failed — confirm against file-worker.js.
27
+ }
28
+ /**
29
+ * Process every chunk of one file in parallel up to `maxChunkWorkers`.
30
+ * Chunk-worker tools write transactions + unknowns directly to the DB,
31
+ * scoped to `scanId`; per-row ticks fan out via the shared progress sink.
+ *
+ * @param deps Database handle, scan identifiers, progress sink, the file's chunks and the parallelism limit.
+ * @param hooks Scan hooks, passed on to the chunk worker function.
+ * @returns Per-file summary with `ok` and `failed` counts.
32
+ */
33
+ export declare function runFileWorker(deps: FileWorkerDeps, hooks: ScanHooks): Promise<FileWorkerResult>;