@openparachute/vault 0.3.0-rc.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/src/hooks.ts +111 -3
- package/core/src/store.ts +3 -1
- package/docs/auth-model.md +340 -0
- package/package.json +1 -1
- package/src/bind.test.ts +28 -0
- package/src/bind.ts +19 -0
- package/src/cli.ts +56 -17
- package/src/scopes.test.ts +158 -0
- package/src/scopes.ts +148 -0
- package/src/server.ts +19 -7
- package/src/transcription-worker.test.ts +282 -1
- package/src/transcription-worker.ts +171 -16
|
@@ -4,7 +4,9 @@ import { mkdirSync, rmSync, writeFileSync, existsSync } from "fs";
|
|
|
4
4
|
import { join } from "path";
|
|
5
5
|
import { tmpdir } from "os";
|
|
6
6
|
import { BunStore } from "./vault-store.ts";
|
|
7
|
-
import { startTranscriptionWorker } from "./transcription-worker.ts";
|
|
7
|
+
import { startTranscriptionWorker, registerTranscriptionHook } from "./transcription-worker.ts";
|
|
8
|
+
import { HookRegistry } from "../core/src/hooks.ts";
|
|
9
|
+
import { SqliteStore } from "../core/src/store.ts";
|
|
8
10
|
import type { Store } from "../core/src/types.ts";
|
|
9
11
|
|
|
10
12
|
let db: Database;
|
|
@@ -202,6 +204,101 @@ describe("transcription worker", () => {
|
|
|
202
204
|
expect(att.metadata?.transcribe_error).toContain("boom");
|
|
203
205
|
});
|
|
204
206
|
|
|
207
|
+
test("terminal failure with stub=true → note shows 'Transcription unavailable' and stub is cleared", async () => {
  // Reproduce the stub Lens writes for a voice memo: placeholder body plus
  // the transcribe_stub marker. The attachment starts one attempt short of
  // the limit, so a single failing tick is terminal.
  const noteId = "unavail1";
  const audioPath = "memos/unavail1.webm";

  await store.createNote("# 🎙️ Voice memo\n\n_Transcript pending._\n", {
    id: noteId,
    metadata: { transcribe_stub: true },
  });
  seedAudio(audioPath);
  await store.addAttachment(noteId, audioPath, "audio/webm", {
    transcribe_status: "pending",
    transcribe_attempts: 2,
  });

  const failingScribe = mkFetchMock([{ error: "scribe down hard", status: 500 }]);
  const worker = makeWorker({ fetchImpl: failingScribe, maxAttempts: 3 });
  try {
    await worker.tick();
  } finally {
    await worker.stop();
  }

  // The placeholder line is swapped for the "unavailable" marker and the
  // stub flag is removed from the note metadata.
  const rewritten = await store.getNote(noteId);
  expect(rewritten!.content).toBe("# 🎙️ Voice memo\n\n_Transcription unavailable._\n");
  expect((rewritten!.metadata as any)?.transcribe_stub).toBeUndefined();

  // The attachment itself records the terminal failure and its cause.
  const [failedAttachment] = await store.getAttachments(noteId);
  expect(failedAttachment!.metadata?.transcribe_status).toBe("failed");
  expect(failedAttachment!.metadata?.transcribe_error).toContain("scribe down hard");
});
|
|
238
|
+
|
|
239
|
+
test("audio-not-found with stub=true → note shows 'Transcription unavailable' and stub is cleared", async () => {
  const noteId = "unavail2";

  await store.createNote("# 🎙️ Voice memo\n\n_Transcript pending._\n", {
    id: noteId,
    metadata: { transcribe_stub: true },
  });
  // seedAudio is intentionally NOT called: the attachment points at a file
  // that does not exist on disk.
  await store.addAttachment(noteId, "memos/gone.webm", "audio/webm", {
    transcribe_status: "pending",
  });

  let scribeCalls = 0;
  const countingFetch = (async () => {
    scribeCalls++;
    return new Response("x", { status: 200 });
  }) as typeof fetch;

  const worker = makeWorker({ fetchImpl: countingFetch });
  try {
    await worker.tick();
  } finally {
    await worker.stop();
  }

  // The missing-audio check short-circuits before any network call, so
  // scribe must never have been contacted. The note rewrite is the part
  // this test adds coverage for.
  expect(scribeCalls).toBe(0);

  const rewritten = await store.getNote(noteId);
  expect(rewritten!.content).toBe("# 🎙️ Voice memo\n\n_Transcription unavailable._\n");
  expect((rewritten!.metadata as any)?.transcribe_stub).toBeUndefined();

  const [failedAttachment] = await store.getAttachments(noteId);
  expect(failedAttachment!.metadata?.transcribe_status).toBe("failed");
  expect(failedAttachment!.metadata?.transcribe_error).toContain("audio file not found");
});
|
|
274
|
+
|
|
275
|
+
test("terminal failure with stub=false → note content is NOT touched", async () => {
  // The note carries no transcribe_stub marker (as after a user edit), so
  // the worker has no licence to rewrite the body even when transcription
  // fails terminally.
  const noteId = "unavail3";
  const audioPath = "memos/unavail3.webm";

  await store.createNote("my own words", { id: noteId });
  seedAudio(audioPath);
  await store.addAttachment(noteId, audioPath, "audio/webm", {
    transcribe_status: "pending",
    transcribe_attempts: 2,
  });

  const failingScribe = mkFetchMock([{ error: "boom", status: 500 }]);
  const worker = makeWorker({ fetchImpl: failingScribe, maxAttempts: 3 });
  try {
    await worker.tick();
  } finally {
    await worker.stop();
  }

  // User-authored content survives untouched …
  const untouched = await store.getNote(noteId);
  expect(untouched!.content).toBe("my own words");

  // … while the attachment still records the failure.
  const [failedAttachment] = await store.getAttachments(noteId);
  expect(failedAttachment!.metadata?.transcribe_status).toBe("failed");
});
|
|
301
|
+
|
|
205
302
|
test("FIFO: oldest pending is processed first", async () => {
|
|
206
303
|
await store.createNote("s", { id: "f1", metadata: { transcribe_stub: true } });
|
|
207
304
|
await store.createNote("s", { id: "f2", metadata: { transcribe_stub: true } });
|
|
@@ -581,3 +678,187 @@ describe("store.listAttachmentsByTranscribeStatus", () => {
|
|
|
581
678
|
expect(done[0]!.path).toBe("a.webm");
|
|
582
679
|
});
|
|
583
680
|
});
|
|
681
|
+
|
|
682
|
+
describe("transcription worker — hook-driven", () => {
  // Each test gets a private HookRegistry (and a fresh in-memory DB) so
  // nothing here collides with defaultHookRegistry state or other test files.
  let hooks: HookRegistry;
  let hookedStore: SqliteStore;
  let hookedDb: Database;

  beforeEach(() => {
    hookedDb = new Database(":memory:");
    hooks = new HookRegistry({ concurrency: 4, logger: silentLogger });
    hookedStore = new SqliteStore(hookedDb, { hooks });
  });

  afterEach(() => {
    hookedDb.close();
  });

  /**
   * Build a fake scribe `fetch` that always answers with `{ text }` and
   * counts how many times it was invoked.
   *
   * @param text transcript returned in the JSON response body
   * @param init response init; defaults to a bare 200
   * @returns the fetch stub plus a `calls()` accessor for the invocation count
   */
  function countingScribe(text: string, init: ResponseInit = { status: 200 }) {
    let count = 0;
    const fetchImpl = (async () => {
      count++;
      return new Response(JSON.stringify({ text }), init);
    }) as unknown as typeof fetch;
    return { fetchImpl, calls: () => count };
  }

  /**
   * Start a worker bound to `hookedStore` and register it on `hooks`.
   *
   * `pollIntervalMs` is set far beyond any test window so the timer-driven
   * sweep can never fire on its own: processing only happens through the
   * hook or an explicit `worker.tick()`.
   *
   * Call this AFTER seeding rows that must not trigger the hook — the hook
   * is registered as the last step here.
   */
  function startHookedWorker(fetchImpl: typeof fetch) {
    const worker = startTranscriptionWorker({
      vaultList: () => ["default"],
      getStore: () => hookedStore as unknown as Store,
      scribeUrl: "http://scribe.test",
      resolveAssetsDir: () => assetsRoot,
      pollIntervalMs: 10_000_000,
      fetchImpl,
      logger: silentLogger,
    });
    registerTranscriptionHook(hooks, worker, () => "default");
    return worker;
  }

  /** Stop the worker, then wait for any hook handlers still running. */
  async function shutdown(worker: ReturnType<typeof startTranscriptionWorker>): Promise<void> {
    await worker.stop();
    await hooks.drain();
  }

  test("attachment:created event triggers a cycle before the sweep fires", async () => {
    await hookedStore.createNote("stub", { id: "h1", metadata: { transcribe_stub: true } });
    seedAudio("memos/h1.webm");

    const scribe = countingScribe("hook-path", {
      status: 200,
      headers: { "content-type": "application/json" },
    });
    // The sweep would never fire within the test window — this proves the
    // hook path is what drives processing.
    const worker = startHookedWorker(scribe.fetchImpl);

    try {
      const start = Date.now();
      await hookedStore.addAttachment("h1", "memos/h1.webm", "audio/webm", {
        transcribe_status: "pending",
      });

      // Poll for completion rather than sleep-and-hope: the hook dispatch
      // plus a faked fetch round-trip is fast but not instantaneous.
      const deadline = start + 500;
      while (Date.now() < deadline) {
        const [polled] = await hookedStore.getAttachments("h1");
        if (polled?.metadata?.transcribe_status === "done") break;
        await new Promise((r) => setTimeout(r, 5));
      }
      const elapsed = Date.now() - start;

      expect(scribe.calls()).toBe(1);
      expect(elapsed).toBeLessThan(500);

      const [att] = await hookedStore.getAttachments("h1");
      expect(att!.metadata?.transcribe_status).toBe("done");
      expect(att!.metadata?.transcript).toBe("hook-path");

      const note = await hookedStore.getNote("h1");
      expect(note!.content).toBe("hook-path");
    } finally {
      await shutdown(worker);
    }
  });

  test("sweep still catches a backoff-queued item after its backoff elapses", async () => {
    await hookedStore.createNote("stub", { id: "h2", metadata: { transcribe_stub: true } });
    seedAudio("memos/h2.webm");

    // Seed an attachment already in backoff, but with a backoff window that
    // has already elapsed — the sweep should pick it up on the next tick.
    // The hook is registered below, AFTER this insert, so the dispatch at
    // addAttachment time has no subscribers and the event-driven path is
    // never taken. What drives the completion is `worker.tick()` alone.
    const past = new Date(Date.now() - 1_000).toISOString();
    await hookedStore.addAttachment("h2", "memos/h2.webm", "audio/webm", {
      transcribe_status: "pending",
      transcribe_attempts: 1,
      transcribe_backoff_until: past,
    });

    const scribe = countingScribe("sweep-recovered");
    // Hook is registered but won't fire (no new addAttachment inside this
    // test window). The sweep is what is being exercised.
    const worker = startHookedWorker(scribe.fetchImpl);

    try {
      const processed = await worker.tick();
      expect(processed).toBe(1);
      expect(scribe.calls()).toBe(1);

      const [att] = await hookedStore.getAttachments("h2");
      expect(att!.metadata?.transcribe_status).toBe("done");
      expect(att!.metadata?.transcript).toBe("sweep-recovered");
    } finally {
      await shutdown(worker);
    }
  });

  test("back-compat: pending status set without dispatching a hook is picked up by the sweep", async () => {
    // Simulate a row inserted by something other than the hooked store —
    // e.g., a restart resumes with a pre-existing pending attachment, or a
    // migration/backfill that writes directly. The sweep must still drain
    // it even though no `attachment:created` event was dispatched.
    await hookedStore.createNote("stub", { id: "h3", metadata: { transcribe_stub: true } });
    seedAudio("memos/h3.webm");

    // Insert the attachment directly via raw SQL so no hook dispatches.
    const now = new Date().toISOString();
    hookedDb
      .prepare(
        "INSERT INTO attachments (id, note_id, path, mime_type, metadata, created_at) VALUES (?, ?, ?, ?, ?, ?)",
      )
      .run(
        "att-h3",
        "h3",
        "memos/h3.webm",
        "audio/webm",
        JSON.stringify({ transcribe_status: "pending" }),
        now,
      );

    const scribe = countingScribe("back-compat-sweep");
    const worker = startHookedWorker(scribe.fetchImpl);

    try {
      // No hook fires — row was inserted via raw SQL. Prove the hook is idle.
      await new Promise((r) => setTimeout(r, 30));
      expect(scribe.calls()).toBe(0);

      // Sweep tick drains it.
      const processed = await worker.tick();
      expect(processed).toBe(1);
      expect(scribe.calls()).toBe(1);

      const [att] = await hookedStore.getAttachments("h3");
      expect(att!.metadata?.transcribe_status).toBe("done");
      expect(att!.metadata?.transcript).toBe("back-compat-sweep");
    } finally {
      await shutdown(worker);
    }
  });
});
|
|
@@ -1,13 +1,22 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Event-driven transcription with a safety-net sweep.
|
|
3
3
|
*
|
|
4
|
-
* ##
|
|
4
|
+
* ## Shape (event-driven happy path, timer-driven failure path)
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
6
|
+
* - **Event path (hot):** `POST /api/notes/:id/attachments` with
|
|
7
|
+
* `{transcribe: true}` writes `attachment.metadata.transcribe_status =
|
|
8
|
+
* "pending"` via `store.addAttachment`, which dispatches an
|
|
9
|
+
* `attachment:created` hook. A handler registered via
|
|
10
|
+
* `registerTranscriptionHook` calls `worker.kick()` on the owning vault,
|
|
11
|
+
* so the cycle begins in the microtask after the HTTP response returns —
|
|
12
|
+
* upload latency is not gated on transcription latency.
|
|
13
|
+
* - **Sweep path (safety net):** Every `pollIntervalMs` (default 30s), the
|
|
14
|
+
* worker lists pending attachments across all vaults and runs them. This
|
|
15
|
+
* catches items queued during a server restart, items whose backoff just
|
|
16
|
+
* elapsed, and anything that got orphaned by a dropped hook dispatch.
|
|
17
|
+
*
|
|
18
|
+
* The DB remains the queue — `metadata.transcribe_status = "pending"` is
|
|
19
|
+
* the source of truth; the hook is only a low-latency shortcut, not a second queue.
|
|
11
20
|
*
|
|
12
21
|
* ## What the worker does per pending attachment
|
|
13
22
|
*
|
|
@@ -25,28 +34,52 @@
|
|
|
25
34
|
* metadata is still addressable).
|
|
26
35
|
* 4. On failure:
|
|
27
36
|
* - Up to `maxAttempts` retries with exponential backoff encoded as
|
|
28
|
-
* `transcribe_backoff_until`. Status stays `"pending"`;
|
|
29
|
-
* ones whose backoff hasn't expired.
|
|
37
|
+
* `transcribe_backoff_until`. Status stays `"pending"`; the sweep
|
|
38
|
+
* skips ones whose backoff hasn't expired.
|
|
30
39
|
* - After `maxAttempts`, flip status to `"failed"` with `transcribe_error`.
|
|
31
40
|
*
|
|
32
41
|
* ## Concurrency
|
|
33
42
|
*
|
|
34
|
-
* FIFO
|
|
35
|
-
*
|
|
36
|
-
*
|
|
37
|
-
*
|
|
43
|
+
* FIFO across all vaults. Hook-driven and sweep-driven paths race on the
|
|
44
|
+
* same attachment if an upload arrives just before a sweep runs; an
|
|
45
|
+
* in-memory `inFlight` set dedupes within the process so we don't double-
|
|
46
|
+
* POST to scribe. Cross-process guarantees still live in the DB — a sweep
|
|
47
|
+
* on another process would see `transcribe_status = "pending"` and try
|
|
48
|
+
* again, which scribe and the metadata writes handle idempotently.
|
|
38
49
|
*/
|
|
39
50
|
|
|
40
51
|
import { join, normalize } from "path";
|
|
41
52
|
import { existsSync, readFileSync, unlinkSync } from "fs";
|
|
42
53
|
import type { Store, Attachment } from "../core/src/types.ts";
|
|
54
|
+
import type { HookRegistry } from "../core/src/hooks.ts";
|
|
43
55
|
import { appendContextPart, fetchContextEntries, type ContextPayload } from "./context.ts";
|
|
44
56
|
import type { TriggerIncludeContext } from "./config.ts";
|
|
45
57
|
|
|
46
58
|
/** Matches the placeholder line that Lens's voice-memo stub writes into a note body. */
const TRANSCRIPT_PLACEHOLDER = /_Transcript pending\._/;

/**
 * Text substituted for the placeholder once transcription has failed for
 * good (retries exhausted, or the audio file is missing).
 *
 * This marker used to be written by Lens's scribe client, which has since
 * been removed. Writing it from the worker means a failed upload no longer
 * reads "Transcript pending" indefinitely, whichever client uploaded it.
 */
const TRANSCRIPT_UNAVAILABLE = "_Transcription unavailable._";

/**
 * Fallback sweep interval in milliseconds.
 *
 * The sweep is a safety net, not the hot path: it covers items waiting out
 * a backoff, items that arrived while the server was down, and dropped
 * dispatches. Fresh uploads are handled via the `attachment:created` hook
 * (see `registerTranscriptionHook`).
 *
 * Operators may override it through the `TRANSCRIPTION_SWEEP_MS` env var,
 * which is read inside `startTranscriptionWorker()` rather than at module
 * load so that values from `~/.parachute/vault/.env` apply (ES module
 * import runs before `loadEnvFile()` in server.ts). An explicit
 * `pollIntervalMs` option takes precedence over both.
 */
const DEFAULT_POLL_MS = 30 * 1_000;

/** Fallback for `opts.maxAttempts` when the caller does not supply one. */
const DEFAULT_MAX_ATTEMPTS = 3;

/** Fallback for `opts.timeoutMs` (milliseconds) when the caller does not supply one. */
const DEFAULT_TIMEOUT_MS = 120 * 1_000;
|
|
52
85
|
|
|
@@ -85,6 +118,17 @@ export interface TranscriptionWorker {
|
|
|
85
118
|
stop(): Promise<void>;
|
|
86
119
|
/** Run one poll cycle now. Returns number of attachments processed. */
|
|
87
120
|
tick(): Promise<number>;
|
|
121
|
+
/**
|
|
122
|
+
* Process a single attachment immediately. Called by the
|
|
123
|
+
* `attachment:created` hook to short-circuit the sweep wait.
|
|
124
|
+
*
|
|
125
|
+
* Safe to race with `tick()` — an in-memory `inFlight` guard dedupes
|
|
126
|
+
* same-attachment requests within this process. The handler returns
|
|
127
|
+
* once processing finishes (or is skipped as a dup / backoff / non-
|
|
128
|
+
* pending status). Errors are logged and swallowed so a thrown hook
|
|
129
|
+
* handler never crashes the dispatcher.
|
|
130
|
+
*/
|
|
131
|
+
kick(vault: string, attachment: Attachment): Promise<void>;
|
|
88
132
|
}
|
|
89
133
|
|
|
90
134
|
interface PendingMeta {
|
|
@@ -106,7 +150,12 @@ interface PendingMeta {
|
|
|
106
150
|
export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): TranscriptionWorker {
|
|
107
151
|
const logger = opts.logger ?? console;
|
|
108
152
|
const fetchImpl = opts.fetchImpl ?? fetch;
|
|
109
|
-
|
|
153
|
+
// Precedence: opts.pollIntervalMs > TRANSCRIPTION_SWEEP_MS env > DEFAULT_POLL_MS.
|
|
154
|
+
// Reading env here (not at module scope) means `~/.parachute/vault/.env`
|
|
155
|
+
// values loaded by server.ts still apply, matching how SCRIBE_URL works.
|
|
156
|
+
const envPoll = Number(process.env.TRANSCRIPTION_SWEEP_MS);
|
|
157
|
+
const defaultPollMs = Number.isFinite(envPoll) && envPoll > 0 ? envPoll : DEFAULT_POLL_MS;
|
|
158
|
+
const pollMs = opts.pollIntervalMs ?? defaultPollMs;
|
|
110
159
|
const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
111
160
|
const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
112
161
|
const retentionFor = opts.getAudioRetention ?? (() => "keep" as const);
|
|
@@ -115,9 +164,67 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
|
|
|
115
164
|
let inflight: Promise<void> = Promise.resolve();
|
|
116
165
|
let timer: ReturnType<typeof setTimeout> | null = null;
|
|
117
166
|
|
|
167
|
+
/**
 * IDs of attachments this process is working on right now.
 *
 * `kick()` (event-driven) and the sweep can both reach the same attachment
 * when an upload lands just before a tick begins; this set lets the second
 * arrival back off so the audio is not fetched and POSTed to scribe twice.
 */
const inFlightAttachments = new Set<string>();

/**
 * Process one attachment unless it is already being processed in this
 * process.
 *
 * A duplicate request is dropped silently: the in-progress run persists its
 * outcome to the DB, and the sweep will pick the row up again if it still
 * needs work. The ID is always released afterwards, whether processing
 * succeeded or threw.
 */
async function processOne(vault: string, attachment: Attachment): Promise<void> {
  const alreadyRunning = inFlightAttachments.has(attachment.id);
  if (alreadyRunning) {
    return;
  }

  inFlightAttachments.add(attachment.id);
  try {
    await processOneLocked(vault, attachment);
  } finally {
    inFlightAttachments.delete(attachment.id);
  }
}
|
|
187
|
+
|
|
188
|
+
/**
 * Replace a voice-memo stub's "pending" placeholder with the "unavailable"
 * marker after a terminal transcription failure (retries exhausted, or the
 * audio file is missing). Without this the note would read "Transcript
 * pending" forever.
 *
 * The note is only touched when its metadata has `transcribe_stub === true`.
 * In that case:
 *   - the placeholder is swapped in place when present; otherwise the whole
 *     body becomes the "unavailable" marker;
 *   - the `transcribe_stub` key is dropped from the metadata;
 *   - `skipUpdatedAt` is passed so the note's modification time keeps
 *     reflecting user intent.
 *
 * Best-effort by contract: any failure while reading OR writing the note is
 * logged and swallowed. The attachment failure has already been recorded by
 * the caller, and a note-store error here must not mask it or abort the
 * caller's remaining cleanup.
 *
 * @param store  store that owns the note
 * @param noteId ID of the note the failed attachment belongs to
 */
async function applyFailureMarker(store: Store, noteId: string): Promise<void> {
  try {
    const note = await store.getNote(noteId);
    if (!note) return;

    const noteMeta = (note.metadata as Record<string, unknown> | undefined) ?? {};
    // No stub marker means the body is user-authored; leave it alone.
    if (noteMeta.transcribe_stub !== true) return;

    const body = TRANSCRIPT_PLACEHOLDER.test(note.content)
      ? note.content.replace(TRANSCRIPT_PLACEHOLDER, TRANSCRIPT_UNAVAILABLE)
      : TRANSCRIPT_UNAVAILABLE;
    // Strip the stub flag while preserving every other metadata key.
    const { transcribe_stub: _drop, ...restMeta } = noteMeta;

    await store.updateNote(note.id, {
      content: body,
      metadata: restMeta,
      skipUpdatedAt: true,
    });
  } catch (err) {
    logger.error(`[transcribe] failed to apply failure marker to note ${noteId}:`, err);
  }
}
|
|
218
|
+
|
|
219
|
+
async function processOneLocked(vault: string, attachment: Attachment): Promise<void> {
|
|
119
220
|
const store = opts.getStore(vault);
|
|
120
|
-
|
|
221
|
+
// Re-read metadata — the in-memory `attachment` may be stale (the hook
|
|
222
|
+
// path hands us the row from just after insert; a concurrent completion
|
|
223
|
+
// in another path may have already flipped status). Skip if not pending.
|
|
224
|
+
const fresh = (await store.getAttachment(attachment.id)) ?? attachment;
|
|
225
|
+
const meta: PendingMeta = { ...(fresh.metadata ?? {}) };
|
|
226
|
+
if (meta.transcribe_status !== "pending") return;
|
|
227
|
+
|
|
121
228
|
const attempts = (meta.transcribe_attempts as number | undefined) ?? 0;
|
|
122
229
|
|
|
123
230
|
// Honor backoff — we re-check here in case another tick queued this
|
|
@@ -136,6 +243,7 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
|
|
|
136
243
|
transcribe_status: "failed",
|
|
137
244
|
transcribe_error: "audio file not found",
|
|
138
245
|
});
|
|
246
|
+
await applyFailureMarker(store, attachment.noteId);
|
|
139
247
|
return;
|
|
140
248
|
}
|
|
141
249
|
|
|
@@ -171,6 +279,7 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
|
|
|
171
279
|
transcribe_attempts: nextAttempts,
|
|
172
280
|
transcribe_error: errMsg,
|
|
173
281
|
});
|
|
282
|
+
await applyFailureMarker(store, attachment.noteId);
|
|
174
283
|
// retention=never drops the audio on any terminal state, including
|
|
175
284
|
// failure. The user opted in to "I don't want the audio kept around
|
|
176
285
|
// regardless of outcome" — honor it.
|
|
@@ -293,6 +402,15 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
|
|
|
293
402
|
|
|
294
403
|
schedule();
|
|
295
404
|
|
|
405
|
+
/**
 * Process a single attachment right away instead of waiting for the sweep.
 *
 * Does nothing once the worker has been stopped. Any error from processing
 * is logged and swallowed, so the returned promise does not reject because
 * of a processing failure.
 */
async function kick(vault: string, attachment: Attachment): Promise<void> {
  if (stopped) {
    return;
  }
  await processOne(vault, attachment).catch((err: unknown) => {
    logger.error(`[transcribe] kick error on attachment ${attachment.id}:`, err);
  });
}
|
|
413
|
+
|
|
296
414
|
return {
|
|
297
415
|
async stop() {
|
|
298
416
|
stopped = true;
|
|
@@ -300,9 +418,46 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
|
|
|
300
418
|
await inflight;
|
|
301
419
|
},
|
|
302
420
|
tick,
|
|
421
|
+
kick,
|
|
303
422
|
};
|
|
304
423
|
}
|
|
305
424
|
|
|
425
|
+
/**
 * Subscribe the transcription worker to attachment "created" events — the
 * event-driven fast path that avoids waiting for the next sweep.
 *
 * The handler only runs for attachments whose metadata carries
 * `transcribe_status === "pending"`. It maps the store handle delivered with
 * the event back to a vault name via `resolveVault` and hands the attachment
 * to `worker.kick()`. When no vault can be resolved the problem is logged
 * and the attachment is left for the sweep.
 *
 * @param registry     hook registry to subscribe on
 * @param worker       worker whose `kick()` is invoked
 * @param resolveVault maps a store handle to its vault name, or `undefined`
 *                     when the store is not recognised
 * @param logger       sink for the "could not resolve vault" message;
 *                     defaults to `console`
 * @returns the value returned by `registry.onAttachment` — an unregister
 *          function, so tests can tear down cleanly
 */
export function registerTranscriptionHook(
  registry: HookRegistry,
  worker: TranscriptionWorker,
  resolveVault: (store: Store) => string | undefined,
  logger: { error: (...args: unknown[]) => void } = console,
): () => void {
  return registry.onAttachment({
    name: "transcription-kickoff",
    event: "created",
    when: (att) => {
      const meta = att.metadata as { transcribe_status?: string } | undefined;
      return meta?.transcribe_status === "pending";
    },
    handler: async (attachment, store) => {
      const vault = resolveVault(store);
      if (vault) {
        await worker.kick(vault, attachment);
        return;
      }
      logger.error(
        `[transcribe] could not resolve vault for attachment ${attachment.id}; sweep will pick it up`,
      );
    },
  });
}
|
|
460
|
+
|
|
306
461
|
async function callScribe(args: {
|
|
307
462
|
url: string;
|
|
308
463
|
token?: string;
|