npm - @openparachute/vault - Versions diffs - 0.3.0-rc.1 → 0.3.1 - Mend

@openparachute/vault 0.3.0-rc.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/core/src/hooks.ts +111 -3
package/core/src/store.ts +3 -1
package/docs/auth-model.md +340 -0
package/package.json +1 -1
package/src/bind.test.ts +28 -0
package/src/bind.ts +19 -0
package/src/cli.ts +56 -17
package/src/scopes.test.ts +158 -0
package/src/scopes.ts +148 -0
package/src/server.ts +19 -7
package/src/transcription-worker.test.ts +282 -1
package/src/transcription-worker.ts +171 -16

package/src/transcription-worker.test.ts CHANGED Viewed

@@ -4,7 +4,9 @@ import { mkdirSync, rmSync, writeFileSync, existsSync } from "fs";
 import { join } from "path";
 import { tmpdir } from "os";
 import { BunStore } from "./vault-store.ts";
-import { startTranscriptionWorker } from "./transcription-worker.ts";
+import { startTranscriptionWorker, registerTranscriptionHook } from "./transcription-worker.ts";
+import { HookRegistry } from "../core/src/hooks.ts";
+import { SqliteStore } from "../core/src/store.ts";
 import type { Store } from "../core/src/types.ts";
 let db: Database;
@@ -202,6 +204,101 @@ describe("transcription worker", () => {
     expect(att.metadata?.transcribe_error).toContain("boom");
   });
+  test("terminal failure with stub=true → note shows 'Transcription unavailable' and stub is cleared", async () => {
+    // Mirrors Lens's voice-memo stub shape: note with placeholder body and
+    // transcribe_stub marker, attachment pre-loaded near the retry limit.
+    await store.createNote(
+      "# 🎙️ Voice memo\n\n_Transcript pending._\n",
+      { id: "unavail1", metadata: { transcribe_stub: true } },
+    );
+    seedAudio("memos/unavail1.webm");
+    await store.addAttachment("unavail1", "memos/unavail1.webm", "audio/webm", {
+      transcribe_status: "pending",
+      transcribe_attempts: 2,
+    });
+    const worker = makeWorker({
+      fetchImpl: mkFetchMock([{ error: "scribe down hard", status: 500 }]),
+      maxAttempts: 3,
+    });
+    try {
+      await worker.tick();
+    } finally {
+      await worker.stop();
+    }
+    const note = await store.getNote("unavail1");
+    expect(note!.content).toBe("# 🎙️ Voice memo\n\n_Transcription unavailable._\n");
+    expect((note!.metadata as any)?.transcribe_stub).toBeUndefined();
+    const [att] = await store.getAttachments("unavail1");
+    expect(att!.metadata?.transcribe_status).toBe("failed");
+    expect(att!.metadata?.transcribe_error).toContain("scribe down hard");
+  });
+  test("audio-not-found with stub=true → note shows 'Transcription unavailable' and stub is cleared", async () => {
+    await store.createNote(
+      "# 🎙️ Voice memo\n\n_Transcript pending._\n",
+      { id: "unavail2", metadata: { transcribe_stub: true } },
+    );
+    // No seedAudio — the file is deliberately missing.
+    await store.addAttachment("unavail2", "memos/gone.webm", "audio/webm", {
+      transcribe_status: "pending",
+    });
+    let called = 0;
+    const worker = makeWorker({
+      fetchImpl: (async () => {
+        called++;
+        return new Response("x", { status: 200 });
+      }) as typeof fetch,
+    });
+    try {
+      await worker.tick();
+    } finally {
+      await worker.stop();
+    }
+    // Scribe was never called — audio-missing check short-circuits before
+    // the network call, same as before. What's new is the note rewrite.
+    expect(called).toBe(0);
+    const note = await store.getNote("unavail2");
+    expect(note!.content).toBe("# 🎙️ Voice memo\n\n_Transcription unavailable._\n");
+    expect((note!.metadata as any)?.transcribe_stub).toBeUndefined();
+    const [att] = await store.getAttachments("unavail2");
+    expect(att!.metadata?.transcribe_status).toBe("failed");
+    expect(att!.metadata?.transcribe_error).toContain("audio file not found");
+  });
+  test("terminal failure with stub=false → note content is NOT touched", async () => {
+    // User edited the note after upload, which cleared the stub marker.
+    // Worker must not clobber their edit even though transcription failed.
+    await store.createNote("my own words", { id: "unavail3" });
+    seedAudio("memos/unavail3.webm");
+    await store.addAttachment("unavail3", "memos/unavail3.webm", "audio/webm", {
+      transcribe_status: "pending",
+      transcribe_attempts: 2,
+    });
+    const worker = makeWorker({
+      fetchImpl: mkFetchMock([{ error: "boom", status: 500 }]),
+      maxAttempts: 3,
+    });
+    try {
+      await worker.tick();
+    } finally {
+      await worker.stop();
+    }
+    const note = await store.getNote("unavail3");
+    expect(note!.content).toBe("my own words");
+    const [att] = await store.getAttachments("unavail3");
+    expect(att!.metadata?.transcribe_status).toBe("failed");
+  });
   test("FIFO: oldest pending is processed first", async () => {
     await store.createNote("s", { id: "f1", metadata: { transcribe_stub: true } });
     await store.createNote("s", { id: "f2", metadata: { transcribe_stub: true } });
@@ -581,3 +678,187 @@ describe("store.listAttachmentsByTranscribeStatus", () => {
     expect(done[0]!.path).toBe("a.webm");
   });
 });
+describe("transcription worker — hook-driven", () => {
+  // These tests use a private HookRegistry so they don't collide with
+  // defaultHookRegistry state or other test files.
+  let hooks: HookRegistry;
+  let hookedStore: SqliteStore;
+  let hookedDb: Database;
+  beforeEach(() => {
+    hookedDb = new Database(":memory:");
+    hooks = new HookRegistry({ concurrency: 4, logger: silentLogger });
+    hookedStore = new SqliteStore(hookedDb, { hooks });
+  });
+  afterEach(() => {
+    hookedDb.close();
+  });
+  test("attachment:created event triggers a cycle before the sweep fires", async () => {
+    await hookedStore.createNote("stub", { id: "h1", metadata: { transcribe_stub: true } });
+    seedAudio("memos/h1.webm");
+    let callCount = 0;
+    const fetchImpl = (async () => {
+      callCount++;
+      return new Response(JSON.stringify({ text: "hook-path" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    }) as unknown as typeof fetch;
+    const worker = startTranscriptionWorker({
+      vaultList: () => ["default"],
+      getStore: () => hookedStore as unknown as Store,
+      scribeUrl: "http://scribe.test",
+      resolveAssetsDir: () => assetsRoot,
+      // Sweep would never fire within the test window — we prove the hook
+      // path is what drives processing.
+      pollIntervalMs: 10_000_000,
+      fetchImpl,
+      logger: silentLogger,
+    });
+    registerTranscriptionHook(hooks, worker, () => "default");
+    try {
+      const start = Date.now();
+      await hookedStore.addAttachment("h1", "memos/h1.webm", "audio/webm", {
+        transcribe_status: "pending",
+      });
+      // Poll for completion rather than sleep-and-hope — `queueMicrotask` +
+      // semaphore acquire + a faked fetch round-trip is well under 50ms but
+      // not zero.
+      const deadline = start + 500;
+      while (Date.now() < deadline) {
+        const [att] = await hookedStore.getAttachments("h1");
+        if (att?.metadata?.transcribe_status === "done") break;
+        await new Promise((r) => setTimeout(r, 5));
+      }
+      const elapsed = Date.now() - start;
+      expect(callCount).toBe(1);
+      expect(elapsed).toBeLessThan(500);
+      const [att] = await hookedStore.getAttachments("h1");
+      expect(att!.metadata?.transcribe_status).toBe("done");
+      expect(att!.metadata?.transcript).toBe("hook-path");
+      const note = await hookedStore.getNote("h1");
+      expect(note!.content).toBe("hook-path");
+    } finally {
+      await worker.stop();
+      await hooks.drain();
+    }
+  });
+  test("sweep still catches a backoff-queued item after its backoff elapses", async () => {
+    await hookedStore.createNote("stub", { id: "h2", metadata: { transcribe_stub: true } });
+    seedAudio("memos/h2.webm");
+    // Seed an attachment already in backoff, but with a backoff window that
+    // has already elapsed — the sweep should pick it up on the next tick.
+    // The hook is registered below, AFTER this insert, so the dispatch at
+    // addAttachment time has no subscribers and the event-driven path is
+    // never taken. What drives the completion is `worker.tick()` alone.
+    const past = new Date(Date.now() - 1_000).toISOString();
+    await hookedStore.addAttachment("h2", "memos/h2.webm", "audio/webm", {
+      transcribe_status: "pending",
+      transcribe_attempts: 1,
+      transcribe_backoff_until: past,
+    });
+    let calls = 0;
+    const fetchImpl = (async () => {
+      calls++;
+      return new Response(JSON.stringify({ text: "sweep-recovered" }), { status: 200 });
+    }) as unknown as typeof fetch;
+    const worker = startTranscriptionWorker({
+      vaultList: () => ["default"],
+      getStore: () => hookedStore as unknown as Store,
+      scribeUrl: "http://scribe.test",
+      resolveAssetsDir: () => assetsRoot,
+      pollIntervalMs: 10_000_000,
+      fetchImpl,
+      logger: silentLogger,
+    });
+    // Hook is registered but won't fire (no new addAttachment inside this
+    // test window). The sweep is what we're exercising.
+    registerTranscriptionHook(hooks, worker, () => "default");
+    try {
+      const processed = await worker.tick();
+      expect(processed).toBe(1);
+      expect(calls).toBe(1);
+      const [att] = await hookedStore.getAttachments("h2");
+      expect(att!.metadata?.transcribe_status).toBe("done");
+      expect(att!.metadata?.transcript).toBe("sweep-recovered");
+    } finally {
+      await worker.stop();
+      await hooks.drain();
+    }
+  });
+  test("back-compat: pending status set without dispatching a hook is picked up by the sweep", async () => {
+    // Simulate a row inserted by something other than the hooked store —
+    // e.g., a restart resumes with a pre-existing pending attachment, or a
+    // migration/backfill that writes directly. The sweep must still drain
+    // it even though no `attachment:created` event was dispatched.
+    await hookedStore.createNote("stub", { id: "h3", metadata: { transcribe_stub: true } });
+    seedAudio("memos/h3.webm");
+    // Insert the attachment directly via raw SQL so no hook dispatches.
+    const now = new Date().toISOString();
+    hookedDb
+      .prepare(
+        "INSERT INTO attachments (id, note_id, path, mime_type, metadata, created_at) VALUES (?, ?, ?, ?, ?, ?)",
+      )
+      .run(
+        "att-h3",
+        "h3",
+        "memos/h3.webm",
+        "audio/webm",
+        JSON.stringify({ transcribe_status: "pending" }),
+        now,
+      );
+    let calls = 0;
+    const fetchImpl = (async () => {
+      calls++;
+      return new Response(JSON.stringify({ text: "back-compat-sweep" }), { status: 200 });
+    }) as unknown as typeof fetch;
+    const worker = startTranscriptionWorker({
+      vaultList: () => ["default"],
+      getStore: () => hookedStore as unknown as Store,
+      scribeUrl: "http://scribe.test",
+      resolveAssetsDir: () => assetsRoot,
+      pollIntervalMs: 10_000_000,
+      fetchImpl,
+      logger: silentLogger,
+    });
+    registerTranscriptionHook(hooks, worker, () => "default");
+    try {
+      // No hook fires — row was inserted via raw SQL. Prove the hook is idle.
+      await new Promise((r) => setTimeout(r, 30));
+      expect(calls).toBe(0);
+      // Sweep tick drains it.
+      const processed = await worker.tick();
+      expect(processed).toBe(1);
+      expect(calls).toBe(1);
+      const [att] = await hookedStore.getAttachments("h3");
+      expect(att!.metadata?.transcribe_status).toBe("done");
+      expect(att!.metadata?.transcript).toBe("back-compat-sweep");
+    } finally {
+      await worker.stop();
+      await hooks.drain();
+    }
+  });
+});

package/src/transcription-worker.ts CHANGED Viewed

@@ -1,13 +1,22 @@
 /**
- * Background worker that drains pending transcription requests.
+ * Event-driven transcription with a safety-net sweep.
  *
- * ## How a request enters the queue
+ * ## Shape (event-driven happy path, timer-driven failure path)
  *
- * The caller `POST /api/notes/:id/attachments` with `{transcribe: true}`.
- * The route writes `attachment.metadata.transcribe_status = "pending"` and
- * sets `note.metadata.transcribe_stub = true` as the opt-in to overwrite.
- * The DB is the queue — a server restart resumes the scan without losing
- * requests.
+ * - **Event path (hot):** `POST /api/notes/:id/attachments` with
+ *   `{transcribe: true}` writes `attachment.metadata.transcribe_status =
+ *   "pending"` via `store.addAttachment`, which dispatches an
+ *   `attachment:created` hook. A handler registered via
+ *   `registerTranscriptionHook` calls `worker.kick()` on the owning vault,
+ *   so the cycle begins in the microtask after the HTTP response returns —
+ *   upload latency is not gated on transcription latency.
+ * - **Sweep path (safety net):** Every `pollIntervalMs` (default 30s), the
+ *   worker lists pending attachments across all vaults and runs them. This
+ *   catches items queued during a server restart, items whose backoff just
+ *   elapsed, and anything that got orphaned by a dropped hook dispatch.
+ *
+ * The DB remains the queue — `metadata.transcribe_status = "pending"` is
+ * the source of truth; the hook is only a latency shortcut.
  *
  * ## What the worker does per pending attachment
  *
@@ -25,28 +34,52 @@
  *      metadata is still addressable).
  * 4. On failure:
  *    - Up to `maxAttempts` retries with exponential backoff encoded as
- *      `transcribe_backoff_until`. Status stays `"pending"`; we simply skip
- *      ones whose backoff hasn't expired.
+ *      `transcribe_backoff_until`. Status stays `"pending"`; the sweep
+ *      skips ones whose backoff hasn't expired.
  *    - After `maxAttempts`, flip status to `"failed"` with `transcribe_error`.
  *
  * ## Concurrency
  *
- * FIFO, one at a time, across all vaults. The poll-then-process loop is
- * intentionally simple — transcription is already seconds-long and scribe
- * is not designed for high concurrency. Scaling to multiple in-flight
- * jobs can be added later without changing the wire contract.
+ * FIFO across all vaults. Hook-driven and sweep-driven paths race on the
+ * same attachment if an upload arrives just before a sweep runs; an
+ * in-memory `inFlight` set dedupes within the process so we don't double-
+ * POST to scribe. Cross-process guarantees still live in the DB — a sweep
+ * on another process would see `transcribe_status = "pending"` and try
+ * again, which scribe and the metadata writes handle idempotently.
  */
 import { join, normalize } from "path";
 import { existsSync, readFileSync, unlinkSync } from "fs";
 import type { Store, Attachment } from "../core/src/types.ts";
+import type { HookRegistry } from "../core/src/hooks.ts";
 import { appendContextPart, fetchContextEntries, type ContextPayload } from "./context.ts";
 import type { TriggerIncludeContext } from "./config.ts";
 /** Placeholder pattern written by Lens's voice-memo stub. */
 const TRANSCRIPT_PLACEHOLDER = /_Transcript pending\._/;
-const DEFAULT_POLL_MS = 5_000;
+/**
+ * Body written when transcription reaches a terminal failure (maxAttempts
+ * exhausted, or the audio file is missing). This used to be written by
+ * Lens's now-removed scribe client; owning it here means a failed upload
+ * stops reading "Transcript pending" forever regardless of which client
+ * uploaded the audio.
+ */
+const TRANSCRIPT_UNAVAILABLE = "_Transcription unavailable._";
+/**
+ * Default sweep cadence (ms). The sweep is the safety net for backoff-
+ * queued items, items that arrived while the server was down, or dispatches
+ * that got dropped — not the hot path. Fresh uploads land in single-digit
+ * ms via the `attachment:created` hook (see `registerTranscriptionHook`).
+ *
+ * Operators can override this with the `TRANSCRIPTION_SWEEP_MS` env var
+ * (read at `startTranscriptionWorker()` time, not module load, so values
+ * in `~/.parachute/vault/.env` apply — ES module import happens before
+ * `loadEnvFile()` in server.ts). Per-caller override via the
+ * `pollIntervalMs` opt wins over both.
+ */
+const DEFAULT_POLL_MS = 30_000;
 const DEFAULT_MAX_ATTEMPTS = 3;
 const DEFAULT_TIMEOUT_MS = 120_000;
@@ -85,6 +118,17 @@ export interface TranscriptionWorker {
   stop(): Promise<void>;
   /** Run one poll cycle now. Returns number of attachments processed. */
   tick(): Promise<number>;
+  /**
+   * Process a single attachment immediately. Called by the
+   * `attachment:created` hook to short-circuit the sweep wait.
+   *
+   * Safe to race with `tick()` — an in-memory `inFlight` guard dedupes
+   * same-attachment requests within this process. The promise resolves
+   * once processing finishes (or is skipped: stopped worker, dup, backoff,
+   * non-pending status). Processing errors are logged and swallowed so a
+   * failing kick never crashes the hook dispatcher.
+   */
+  kick(vault: string, attachment: Attachment): Promise<void>;
 }
 interface PendingMeta {
@@ -106,7 +150,12 @@ interface PendingMeta {
 export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): TranscriptionWorker {
   const logger = opts.logger ?? console;
   const fetchImpl = opts.fetchImpl ?? fetch;
-  const pollMs = opts.pollIntervalMs ?? DEFAULT_POLL_MS;
+  // Precedence: opts.pollIntervalMs > TRANSCRIPTION_SWEEP_MS env > DEFAULT_POLL_MS.
+  // Reading env here (not at module scope) means `~/.parachute/vault/.env`
+  // values loaded by server.ts still apply, matching how SCRIBE_URL works.
+  const envPoll = Number(process.env.TRANSCRIPTION_SWEEP_MS);
+  const defaultPollMs = Number.isFinite(envPoll) && envPoll > 0 ? envPoll : DEFAULT_POLL_MS;
+  const pollMs = opts.pollIntervalMs ?? defaultPollMs;
   const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
   const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
   const retentionFor = opts.getAudioRetention ?? (() => "keep" as const);
@@ -115,9 +164,67 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
   let inflight: Promise<void> = Promise.resolve();
   let timer: ReturnType<typeof setTimeout> | null = null;
+  /**
+   * In-process dedupe: holds attachment IDs currently being worked. The
+   * event-driven `kick()` path can race the sweep on the same attachment
+   * when an upload lands moments before a tick starts. Without this guard
+   * both paths would fetch the audio and POST to scribe twice.
+   */
+  const inFlightAttachments = new Set<string>();
   async function processOne(vault: string, attachment: Attachment): Promise<void> {
+    // Dedupe: another path (sweep vs hook kick, or a duplicate dispatch)
+    // is already working this attachment. Drop — its result is durable
+    // in the DB, and the sweep will re-pick anything that truly needs it.
+    if (inFlightAttachments.has(attachment.id)) return;
+    inFlightAttachments.add(attachment.id);
+    try {
+      await processOneLocked(vault, attachment);
+    } finally {
+      inFlightAttachments.delete(attachment.id);
+    }
+  }
+  /**
+   * On a terminal failure (maxAttempts exhausted, or audio file missing),
+   * swap the stub placeholder for the "unavailable" marker — otherwise
+   * Lens's voice memo sits reading "Transcript pending" forever. Mirrors
+   * the success-path note write in shape: only touches the note when
+   * `transcribe_stub === true`, clears the stub marker, uses `skipUpdatedAt`
+   * so the note's modification time still reflects user intent. Errors
+   * from `updateNote` are logged and swallowed so a note-write failure
+   * doesn't mask the attachment failure; a `getNote` error still throws.
+   */
+  async function applyFailureMarker(store: Store, noteId: string): Promise<void> {
+    const note = await store.getNote(noteId);
+    if (!note) return;
+    const noteMeta = (note.metadata as Record<string, unknown> | undefined) ?? {};
+    if (noteMeta.transcribe_stub !== true) return;
+    const body = TRANSCRIPT_PLACEHOLDER.test(note.content)
+      ? note.content.replace(TRANSCRIPT_PLACEHOLDER, TRANSCRIPT_UNAVAILABLE)
+      : TRANSCRIPT_UNAVAILABLE;
+    const { transcribe_stub: _drop, ...restMeta } = noteMeta;
+    try {
+      await store.updateNote(note.id, {
+        content: body,
+        metadata: restMeta,
+        skipUpdatedAt: true,
+      });
+    } catch (err) {
+      logger.error(`[transcribe] failed to apply failure marker to note ${note.id}:`, err);
+    }
+  }
+  async function processOneLocked(vault: string, attachment: Attachment): Promise<void> {
     const store = opts.getStore(vault);
-    const meta: PendingMeta = { ...(attachment.metadata ?? {}) };
+    // Re-read metadata — the in-memory `attachment` may be stale (the hook
+    // path hands us the row from just after insert; a concurrent completion
+    // in another path may have already flipped status). Skip if not pending.
+    const fresh = (await store.getAttachment(attachment.id)) ?? attachment;
+    const meta: PendingMeta = { ...(fresh.metadata ?? {}) };
+    if (meta.transcribe_status !== "pending") return;
     const attempts = (meta.transcribe_attempts as number | undefined) ?? 0;
     // Honor backoff — we re-check here in case another tick queued this
@@ -136,6 +243,7 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
         transcribe_status: "failed",
         transcribe_error: "audio file not found",
       });
+      await applyFailureMarker(store, attachment.noteId);
       return;
     }
@@ -171,6 +279,7 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
           transcribe_attempts: nextAttempts,
           transcribe_error: errMsg,
         });
+        await applyFailureMarker(store, attachment.noteId);
         // retention=never drops the audio on any terminal state, including
         // failure. The user opted in to "I don't want the audio kept around
         // regardless of outcome" — honor it.
@@ -293,6 +402,15 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
   schedule();
+  async function kick(vault: string, attachment: Attachment): Promise<void> {
+    if (stopped) return;
+    try {
+      await processOne(vault, attachment);
+    } catch (err) {
+      logger.error(`[transcribe] kick error on attachment ${attachment.id}:`, err);
+    }
+  }
   return {
     async stop() {
       stopped = true;
@@ -300,9 +418,46 @@ export function startTranscriptionWorker(opts: TranscriptionWorkerOpts): Transcr
       await inflight;
     },
     tick,
+    kick,
   };
 }
+/**
+ * Wire the transcription worker up as an `attachment:created` hook. This
+ * is the event-driven fast path — when a new attachment is inserted with
+ * `transcribe_status = "pending"`, the hook fires within a microtask and
+ * the worker begins processing without waiting for the next sweep.
+ *
+ * `resolveVault(store)` maps the store handle delivered to the hook back
+ * to its vault name (needed so the worker can resolve the assets dir,
+ * retention policy, and context predicates). The registration call
+ * returns an unregister function so tests can tear down cleanly.
+ */
+export function registerTranscriptionHook(
+  registry: HookRegistry,
+  worker: TranscriptionWorker,
+  resolveVault: (store: Store) => string | undefined,
+  logger: { error: (...args: unknown[]) => void } = console,
+): () => void {
+  return registry.onAttachment({
+    name: "transcription-kickoff",
+    event: "created",
+    when: (att) =>
+      (att.metadata as { transcribe_status?: string } | undefined)
+        ?.transcribe_status === "pending",
+    handler: async (attachment, store) => {
+      const vault = resolveVault(store);
+      if (!vault) {
+        logger.error(
+          `[transcribe] could not resolve vault for attachment ${attachment.id}; sweep will pick it up`,
+        );
+        return;
+      }
+      await worker.kick(vault, attachment);
+    },
+  });
+}
 async function callScribe(args: {
   url: string;
   token?: string;