@openparachute/vault 0.4.7-rc.2 → 0.4.8-rc.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,189 @@
1
+ /**
2
+ * Transcript-note materialization for the vault↔scribe auto-transcribe path
3
+ * (vault#353, design 2026-05-21 Part 2, design question 3).
4
+ *
5
+ * When the transcription worker resolves an audio attachment, it asks this
6
+ * module to create (or update) a sibling `<attachment-path>.transcript.md`
7
+ * note. The note's frontmatter links back to the original audio attachment
8
+ * via `transcript_of`, lets vault graph queries surface the relation, and
9
+ * captures success / failure state in a single uniform shape.
10
+ *
11
+ * Design Q3's exact shape:
12
+ *
13
+ * ---
14
+ * title: Transcript of <attachment-name>
15
+ * tags: [transcript, capture]
16
+ * created_at: <iso>
17
+ * transcript_of: <attachment-path>
18
+ * transcript_status: complete | failed
19
+ * transcript_provider: <name or unset>
20
+ * transcript_duration_ms: <wall-clock ms>
21
+ * transcript_error: <error-message — failed only>
22
+ * ---
23
+ *
24
+ * <transcript text — empty body when failed>
25
+ *
26
+ * The `transcript_status` values are uniform: `complete` for success and
27
+ * `failed` for any terminal failure (provider missing, scribe 5xx, timeout,
28
+ * etc.). The cause is in `transcript_error`. This matches the design's
29
+ * "same failure substrate" stance — no first-boot-specific branch, just
30
+ * different error strings.
31
+ *
32
+ * ## Retry semantics
33
+ *
34
+ * When the retry endpoint re-runs transcription on an already-failed
35
+ * transcript note, the same note is updated in place — `transcript_of`,
36
+ * `path`, and `id` all stay constant. Callers identify the transcript by
37
+ * note path (`<attachment-path>.transcript.md`) and call `upsertTranscriptNote`
38
+ * to overwrite. The transcript-of relation is materialized as both a
39
+ * frontmatter scalar AND a vault link (via `store.createLink` on create) so graph
40
+ * queries can find it without parsing frontmatter.
41
+ */
42
+
43
+ import type { Store, Note } from "../core/src/types.ts";
44
+
45
+ export type TranscriptStatus = "complete" | "failed";
46
+
47
+ export interface TranscriptNoteInput {
48
+ /**
49
+ * Path of the source audio attachment (relative to the vault assets dir).
50
+ * Used to derive the transcript note's path and the `transcript_of`
51
+ * frontmatter scalar.
52
+ */
53
+ attachmentPath: string;
54
+ /**
55
+ * Attachment row id of the source audio. Stamped onto the transcript note
56
+ * frontmatter (`transcript_attachment_id`) so the retry endpoint can find
57
+ * the original audio in one DB lookup without walking links or paths.
58
+ */
59
+ attachmentId: string;
60
+ /**
61
+ * The note that owns the audio attachment. The transcript-of relation
62
+ * (vault link) is established to this note so graph queries can find the
63
+ * transcript from either side.
64
+ */
65
+ attachmentNoteId: string;
66
+ /** Outcome — `complete` (transcript text available) or `failed`. */
67
+ status: TranscriptStatus;
68
+ /**
69
+ * Transcript body. Expected when `status === "complete"` (a missing value is written as an empty body); ignored otherwise.
70
+ * Failure transcript notes have an empty body — the error string is in
71
+ * frontmatter, not in the note body, so failures don't read like content.
72
+ */
73
+ text?: string;
74
+ /** Error cause for `status === "failed"`; `buildTranscriptNote` records "unknown error" when it is omitted. */
75
+ error?: string;
76
+ /** Scribe-reported provider (e.g. "groq", "whisper"). Optional. */
77
+ provider?: string;
78
+ /** Wall-clock duration of the scribe call, in ms. Optional; `transcript_duration_ms` is left out of the frontmatter when unset. */
79
+ durationMs?: number;
80
+ /** Created-at override (defaults to `new Date()`). */
81
+ createdAt?: Date;
82
+ }
83
+
84
/**
 * Derive the vault path of the transcript note that sits beside an audio
 * attachment: the attachment path with a `.transcript` suffix.
 *
 * Example: `inbox/Voice 2026-05-21 09-13.m4a` → `inbox/Voice 2026-05-21 09-13.m4a.transcript`
 *
 * No `.md` is appended here. Per the vault convention, note paths are stored
 * without the extension and `.md` is only added on export to portable
 * markdown.
 *
 * @param attachmentPath Path of the source audio attachment.
 * @returns The canonical transcript-note path for that attachment.
 */
export function transcriptPathFor(attachmentPath: string): string {
  const suffix = ".transcript";
  return attachmentPath + suffix;
}
96
+
97
/**
 * Build the path, body, metadata and tags of a transcript note from the
 * worker's input.
 *
 * Pure — no Store calls — so unit tests can assert the shape without a DB.
 * Exposed for tests and for any future caller that wants to materialize a
 * transcript note independently of the worker.
 *
 * Metadata always carries `title`, `transcript_of`,
 * `transcript_attachment_id` and `transcript_status`. The remaining keys are
 * conditional:
 * - `transcript_provider` — only when a non-empty provider is given.
 * - `transcript_duration_ms` — only when `durationMs` is a finite number.
 * - `transcript_error` — only for `failed`; falls back to "unknown error"
 *   when the cause is missing or blank.
 *
 * @param input Outcome of one transcription attempt.
 * @returns The note path (via `transcriptPathFor`), the body (`text` for
 *   `complete`, empty for `failed`), the frontmatter metadata, the fixed tag
 *   list, and the ISO-8601 creation timestamp.
 * @throws RangeError if `input.createdAt` is an invalid `Date` (from
 *   `Date.prototype.toISOString`).
 */
export function buildTranscriptNote(input: TranscriptNoteInput): {
  path: string;
  content: string;
  metadata: Record<string, unknown>;
  tags: string[];
  createdAt: string;
} {
  const createdAt = (input.createdAt ?? new Date()).toISOString();

  // `split` always yields at least one element, so `pop()` is never
  // undefined. The case that needs a fallback is an EMPTY last segment
  // (e.g. a trailing "/"), which `??` would let through as a blank title.
  const filename = input.attachmentPath.split("/").pop() || input.attachmentPath;

  const metadata: Record<string, unknown> = {
    title: `Transcript of ${filename}`,
    transcript_of: input.attachmentPath,
    transcript_attachment_id: input.attachmentId,
    transcript_status: input.status,
  };

  if (input.provider) metadata.transcript_provider = input.provider;

  // NaN and ±Infinity are `typeof "number"` but are not durations, and
  // JSON.stringify turns them into null — keep them out of the frontmatter.
  if (typeof input.durationMs === "number" && Number.isFinite(input.durationMs)) {
    metadata.transcript_duration_ms = input.durationMs;
  }

  if (input.status === "failed") {
    // A failed note must always name a cause; a blank string is no cause.
    // Non-blank messages are stored untouched.
    const cause = input.error ?? "";
    metadata.transcript_error = cause.trim() === "" ? "unknown error" : cause;
  }

  // Failures keep an empty body so the error never reads like content.
  const content = input.status === "complete" ? (input.text ?? "") : "";

  return {
    path: transcriptPathFor(input.attachmentPath),
    content,
    metadata,
    tags: ["transcript", "capture"],
    createdAt,
  };
}
134
+
135
/**
 * Create or update the transcript note for an audio attachment. The note
 * lives at the path returned by `buildTranscriptNote`
 * (`<attachmentPath>.transcript`).
 *
 * - If no note exists at that path, creates one with the standard shape,
 *   attempts to add a `transcript_of` link to the attachment-owning note, and
 *   returns the new note.
 * - If a transcript note for the same attachment already exists (retry
 *   path), overwrites its content + metadata in place, re-applies the tags,
 *   and returns the re-read note (or the pre-update note if the re-read
 *   yields nothing). Links are NOT re-created on this path.
 * - If the path is occupied by a note whose `transcript_of` metadata does not
 *   match this attachment, nothing is written and an error is thrown, so an
 *   unrelated note is never overwritten.
 *
 * @param store Vault store used for all reads and writes.
 * @param input Outcome of one transcription attempt.
 * @returns The created or updated transcript note.
 * @throws Error when the transcript path is held by a note that is not the
 *   transcript of `input.attachmentPath`; any Store error also propagates,
 *   except a failure to create the link, which is deliberately ignored.
 */
export async function upsertTranscriptNote(
  store: Store,
  input: TranscriptNoteInput,
): Promise<Note> {
  const built = buildTranscriptNote(input);
  const existing = await store.getNoteByPath(built.path);

  if (!existing) {
    const created = await store.createNote(built.content, {
      path: built.path,
      tags: built.tags,
      metadata: built.metadata,
      created_at: built.createdAt,
    });
    // Materialize the transcript-of relation as a typed link too, so graph
    // queries can surface the transcript without walking frontmatter. The
    // relation name matches the frontmatter scalar.
    try {
      await store.createLink(created.id, input.attachmentNoteId, "transcript_of");
    } catch {
      // Best effort by design: the target note may be gone or the link may
      // be rejected. The transcript is still reachable through its path and
      // the `transcript_of` frontmatter scalar, so this must not fail the
      // whole upsert.
    }
    return created;
  }

  // Only a note this module wrote for the SAME attachment may be overwritten.
  // Without this check any note that happens to sit at the transcript path
  // would have its content and metadata replaced.
  const priorMetadata = (existing.metadata ?? {}) as Record<string, unknown>;
  if (priorMetadata.transcript_of !== input.attachmentPath) {
    throw new Error(
      `transcript path "${built.path}" is occupied by a note that is not the transcript of "${input.attachmentPath}"`,
    );
  }

  await store.updateNote(existing.id, {
    content: built.content,
    metadata: built.metadata,
    // created_at is not part of this patch, so the original creation time
    // survives the retry. NOTE(review): `skipUpdatedAt: false` presumably
    // lets the Store bump updated_at — confirm against Store.updateNote.
    skipUpdatedAt: false,
  });
  // updateNote() takes no tags field here, so tags are re-applied through
  // tagNote. The original author notes tagNote upserts (no duplicates), which
  // makes this safe when the tags are unchanged.
  await store.tagNote(existing.id, built.tags);

  const refreshed = await store.getNote(existing.id);
  return refreshed ?? existing;
}
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Process-singleton holder for the active TranscriptionWorker (vault#353).
3
+ *
4
+ * Mirrors `mirror-registry.ts`: `server.ts` constructs the worker on boot
5
+ * (when scribe is discoverable), and the REST retry endpoint
6
+ * (`/api/notes/:id/retry-transcription`) picks it up here to call `kick()`
7
+ * for an event-driven re-run. Absent the worker (no scribe), the retry
8
+ * endpoint still flips the attachment back to `pending` so the sweep would
9
+ * pick it up — but it'll just sit there until scribe shows up.
10
+ */
11
+
12
+ import type { TranscriptionWorker } from "./transcription-worker.ts";
13
+
14
/**
 * Module-private slot holding the worker most recently registered through
 * `setTranscriptionWorker`; `null` until one is registered or after it is
 * cleared.
 */
const registry: { worker: TranscriptionWorker | null } = { worker: null };

/**
 * Register the active transcription worker, or clear the slot with `null`.
 *
 * @param worker The worker to expose through `getTranscriptionWorker`.
 */
export function setTranscriptionWorker(worker: TranscriptionWorker | null): void {
  registry.worker = worker;
}

/**
 * Look up the currently registered transcription worker.
 *
 * @returns The last worker passed to `setTranscriptionWorker`, or `null` when
 *   none is registered.
 */
export function getTranscriptionWorker(): TranscriptionWorker | null {
  const { worker } = registry;
  return worker;
}
@@ -867,3 +867,253 @@ describe("transcription worker — hook-driven", () => {
867
867
  }
868
868
  });
869
869
  });
870
+
871
+ // ---------------------------------------------------------------------------
872
+ // vault#353 — auto-origin path: materialize <attachment-path>.transcript.md
873
+ // notes on success AND on terminal failure (so the retry endpoint has a
874
+ // surface to act on). The legacy `transcribe_stub`-patching path remains
875
+ // unchanged; these tests pin the new behavior for attachments stamped with
876
+ // `transcribe_origin: "auto"`.
877
+ // ---------------------------------------------------------------------------
878
+
879
// Each test seeds an owner note plus an audio attachment stamped
// `transcribe_origin: "auto"` (the legacy test omits the stamp), drives one
// worker tick against a mocked scribe, and then inspects the sibling
// `<attachment-path>.transcript` note and the attachment row.
describe("transcription worker — auto-origin (vault#353)", () => {
  test("success: materializes <path>.transcript with frontmatter + body", async () => {
    const owner = await store.createNote("# Voice memo\n", { id: "auto-1" });
    seedAudio("memos/auto-1.webm");
    await store.addAttachment(owner.id, "memos/auto-1.webm", "audio/webm", {
      transcribe_status: "pending",
      transcribe_origin: "auto",
    });

    const worker = makeWorker({
      fetchImpl: mkFetchMock([{ text: "this is the spoken text" }]),
    });
    try {
      const processed = await worker.tick();
      expect(processed).toBe(1);
    } finally {
      await worker.stop();
    }

    // The transcript note exists, with the expected shape.
    const transcript = await store.getNoteByPath("memos/auto-1.webm.transcript");
    expect(transcript).not.toBeNull();
    expect(transcript!.content).toBe("this is the spoken text");
    expect(transcript!.tags).toContain("transcript");
    expect((transcript!.metadata as any)?.transcript_status).toBe("complete");
    expect((transcript!.metadata as any)?.transcript_of).toBe("memos/auto-1.webm");
    expect(typeof (transcript!.metadata as any)?.transcript_duration_ms).toBe("number");

    // Source note is NOT touched (no stub patching on auto-origin).
    const sourceNote = await store.getNote("auto-1");
    expect(sourceNote!.content).toBe("# Voice memo\n");

    // Attachment row also reflects success — sweep + retry hit the same row.
    const [att] = await store.getAttachments(owner.id);
    expect(att.metadata?.transcribe_status).toBe("done");
  });

  test("missing_provider 400: terminal failure on first try → failed transcript note", async () => {
    const owner = await store.createNote("# Voice memo\n", { id: "auto-mp" });
    seedAudio("memos/auto-mp.webm");
    await store.addAttachment(owner.id, "memos/auto-mp.webm", "audio/webm", {
      transcribe_status: "pending",
      transcribe_origin: "auto",
    });

    // Custom fetchImpl that returns the structured 400 missing_provider
    // payload (scribe#47 shape).
    const fetchImpl: typeof fetch = (async () => {
      return new Response(
        JSON.stringify({
          error: "no transcription provider configured",
          error_code: "missing_provider",
        }),
        { status: 400, headers: { "content-type": "application/json" } },
      );
    }) as typeof fetch;

    const worker = makeWorker({ fetchImpl, maxAttempts: 5 });
    try {
      await worker.tick();
    } finally {
      await worker.stop();
    }

    const transcript = await store.getNoteByPath("memos/auto-mp.webm.transcript");
    expect(transcript).not.toBeNull();
    expect(transcript!.content).toBe("");
    expect((transcript!.metadata as any)?.transcript_status).toBe("failed");
    expect((transcript!.metadata as any)?.transcript_error).toContain("missing_provider");

    // 4xx is terminal — attempts tracked but status went straight to failed,
    // not parked in pending with backoff.
    const [att] = await store.getAttachments(owner.id);
    expect(att.metadata?.transcribe_status).toBe("failed");
    expect((att.metadata as any)?.transcribe_error_code).toBe("missing_provider");
  });

  test("5xx timeout: retries with backoff (NOT terminal on first failure)", async () => {
    const owner = await store.createNote("# Voice memo\n", { id: "auto-503" });
    seedAudio("memos/auto-503.webm");
    await store.addAttachment(owner.id, "memos/auto-503.webm", "audio/webm", {
      transcribe_status: "pending",
      transcribe_origin: "auto",
    });

    const worker = makeWorker({
      fetchImpl: mkFetchMock([{ error: "upstream timeout", status: 503 }]),
      maxAttempts: 3,
    });
    try {
      await worker.tick();
    } finally {
      await worker.stop();
    }

    // 5xx is retriable — attachment goes back to pending with backoff.
    // The transcript note is NOT materialized yet (the failure isn't terminal
    // and we don't want to surface intermediate failures to the user).
    const [att] = await store.getAttachments(owner.id);
    expect(att.metadata?.transcribe_status).toBe("pending");
    expect(att.metadata?.transcribe_backoff_until).toBeTruthy();

    const transcript = await store.getNoteByPath("memos/auto-503.webm.transcript");
    expect(transcript).toBeNull();
  });

  test("audio gone: terminal failure → failed transcript note materialized", async () => {
    const owner = await store.createNote("# Voice memo\n", { id: "auto-gone" });
    // No seedAudio call — the file is missing.
    await store.addAttachment(owner.id, "memos/auto-gone.webm", "audio/webm", {
      transcribe_status: "pending",
      transcribe_origin: "auto",
    });

    const worker = makeWorker({
      fetchImpl: mkFetchMock([{ text: "should never run" }]),
    });
    try {
      await worker.tick();
    } finally {
      await worker.stop();
    }

    const transcript = await store.getNoteByPath("memos/auto-gone.webm.transcript");
    expect(transcript).not.toBeNull();
    expect((transcript!.metadata as any)?.transcript_status).toBe("failed");
    expect((transcript!.metadata as any)?.transcript_error).toBe("audio file not found");
  });

  test("legacy stub flow unchanged: no transcript note materialized for transcribe_stub-only", async () => {
    await store.createNote(
      "# Voice\n\n_Transcript pending._\n",
      { id: "legacy-1", metadata: { transcribe_stub: true } },
    );
    seedAudio("memos/legacy-1.webm");
    await store.addAttachment("legacy-1", "memos/legacy-1.webm", "audio/webm", {
      transcribe_status: "pending",
      // Legacy path: NO transcribe_origin: "auto".
    });

    const worker = makeWorker({
      fetchImpl: mkFetchMock([{ text: "stub-patched" }]),
    });
    try {
      await worker.tick();
    } finally {
      await worker.stop();
    }

    // The note body was patched in place (legacy behavior).
    const note = await store.getNote("legacy-1");
    expect(note!.content).toBe("# Voice\n\nstub-patched\n");

    // No transcript note was created.
    const transcript = await store.getNoteByPath("memos/legacy-1.webm.transcript");
    expect(transcript).toBeNull();
  });

  test("retry path: failed transcript is overwritten in place on success", async () => {
    const owner = await store.createNote("# Voice memo\n", { id: "auto-retry" });
    seedAudio("memos/auto-retry.webm");
    await store.addAttachment(owner.id, "memos/auto-retry.webm", "audio/webm", {
      transcribe_status: "pending",
      transcribe_origin: "auto",
    });

    // Pass 1: missing_provider failure → failed transcript note materialized.
    const fetch400: typeof fetch = (async () => {
      return new Response(
        JSON.stringify({ error: "missing", error_code: "missing_provider" }),
        { status: 400, headers: { "content-type": "application/json" } },
      );
    }) as typeof fetch;
    const worker1 = makeWorker({ fetchImpl: fetch400 });
    try { await worker1.tick(); } finally { await worker1.stop(); }

    const failed = await store.getNoteByPath("memos/auto-retry.webm.transcript");
    expect(failed).not.toBeNull();
    const failedId = failed!.id;
    expect((failed!.metadata as any)?.transcript_status).toBe("failed");

    // Pass 2: re-enqueue by flipping the attachment back to pending (this is
    // what the retry endpoint does) + give scribe a successful response.
    const [att] = await store.getAttachments(owner.id);
    await store.setAttachmentMetadata(att.id, {
      ...(att.metadata ?? {}),
      transcribe_status: "pending",
      transcribe_origin: "auto",
    });
    const worker2 = makeWorker({
      fetchImpl: mkFetchMock([{ text: "second time's the charm" }]),
    });
    try { await worker2.tick(); } finally { await worker2.stop(); }

    const success = await store.getNoteByPath("memos/auto-retry.webm.transcript");
    expect(success).not.toBeNull();
    // Same note id — updated in place, not a fresh row.
    expect(success!.id).toBe(failedId);
    expect(success!.content).toBe("second time's the charm");
    expect((success!.metadata as any)?.transcript_status).toBe("complete");
    expect((success!.metadata as any)?.transcript_error).toBeUndefined();
  });

  test("concurrent uploads: 5 audio files yield 5 transcript notes (no path collision)", async () => {
    for (let i = 0; i < 5; i++) {
      const note = await store.createNote(`# Voice ${i}\n`, { id: `concurrent-${i}` });
      seedAudio(`memos/concurrent-${i}.webm`);
      await store.addAttachment(note.id, `memos/concurrent-${i}.webm`, "audio/webm", {
        transcribe_status: "pending",
        transcribe_origin: "auto",
      });
    }

    // One distinct transcript body per scribe call. The worker drains FIFO.
    const worker = makeWorker({
      fetchImpl: mkFetchMock([
        { text: "transcript-0" },
        { text: "transcript-1" },
        { text: "transcript-2" },
        { text: "transcript-3" },
        { text: "transcript-4" },
      ]),
    });
    try {
      const processed = await worker.tick();
      expect(processed).toBe(5);
    } finally {
      await worker.stop();
    }

    // 5 transcript notes — one per audio file. The bodies map to FIFO order;
    // we just assert each note exists with `complete` status.
    for (let i = 0; i < 5; i++) {
      const t = await store.getNoteByPath(`memos/concurrent-${i}.webm.transcript`);
      expect(t).not.toBeNull();
      expect((t!.metadata as any)?.transcript_status).toBe("complete");
    }
  });
});