npm - @gmickel/gno - Versions diffs - 0.41.0 → 0.41.1 - Mend

@gmickel/gno 0.41.0 → 0.41.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md +9 -0
package/package.json +1 -1
package/src/cli/commands/embed.ts +216 -8
package/src/embed/batch.ts +154 -3
package/src/store/vector/sqlite-vec.ts +11 -6

package/README.md CHANGED Viewed

@@ -122,6 +122,15 @@ gno collection clear-embeddings my-collection --all
 gno embed my-collection
 ```
+If a re-embed run still reports failures, rerun with:
+```bash
+gno --verbose embed --force
+```
+Recent releases now print sample embedding errors and a concrete retry hint when
+batch recovery cannot fully recover on its own.
 Model guides:
 - [Code Embeddings](./docs/guides/code-embeddings.md)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@gmickel/gno",
-  "version": "0.41.0",
+  "version": "0.41.1",
   "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
   "keywords": [
     "embeddings",

package/src/cli/commands/embed.ts CHANGED Viewed

@@ -71,6 +71,9 @@ export type EmbedResult =
       duration: number;
       model: string;
       searchAvailable: boolean;
+      errorSamples?: string[];
+      suggestion?: string;
+      syncError?: string;
     }
   | { success: false; error: string };
@@ -87,6 +90,30 @@ function formatDuration(seconds: number): string {
   return `${mins}m ${secs.toFixed(0)}s`;
 }
+function formatLlmFailure(
+  error: { message: string; cause?: unknown } | undefined
+): string {
+  if (!error) {
+    return "Unknown embedding failure";
+  }
+  const cause =
+    error.cause &&
+    typeof error.cause === "object" &&
+    "message" in error.cause &&
+    typeof error.cause.message === "string"
+      ? error.cause.message
+      : typeof error.cause === "string"
+        ? error.cause
+        : "";
+  return cause && cause !== error.message
+    ? `${error.message} - ${cause}`
+    : error.message;
+}
+function isDisposedBatchError(message: string): boolean {
+  return message.toLowerCase().includes("object is disposed");
+}
 async function checkVecAvailable(
   db: import("bun:sqlite").Database
 ): Promise<boolean> {
@@ -111,10 +138,20 @@ interface BatchContext {
   showProgress: boolean;
   totalToEmbed: number;
   verbose: boolean;
+  recreateEmbedPort?: () => Promise<
+    { ok: true; value: EmbeddingPort } | { ok: false; error: string }
+  >;
 }
 type BatchResult =
-  | { ok: true; embedded: number; errors: number; duration: number }
+  | {
+      ok: true;
+      embedded: number;
+      errors: number;
+      duration: number;
+      errorSamples: string[];
+      suggestion?: string;
+    }
   | { ok: false; error: string };
 interface Cursor {
@@ -126,8 +163,21 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
   const startTime = Date.now();
   let embedded = 0;
   let errors = 0;
+  const errorSamples: string[] = [];
+  let suggestion: string | undefined;
   let cursor: Cursor | undefined;
+  const pushErrorSamples = (samples: string[]): void => {
+    for (const sample of samples) {
+      if (errorSamples.length >= 5) {
+        break;
+      }
+      if (!errorSamples.includes(sample)) {
+        errorSamples.push(sample);
+      }
+    }
+  };
   while (embedded + errors < ctx.totalToEmbed) {
     // Get next batch using seek pagination (cursor-based)
     const batchResult = ctx.force
@@ -161,6 +211,89 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
       )
     );
     if (!batchEmbedResult.ok) {
+      const formattedError = formatLlmFailure(batchEmbedResult.error);
+      if (ctx.recreateEmbedPort && isDisposedBatchError(formattedError)) {
+        if (ctx.verbose) {
+          process.stderr.write(
+            "\n[embed] Embedding port disposed; recreating model/contexts and retrying batch once\n"
+          );
+        }
+        const recreated = await ctx.recreateEmbedPort();
+        if (recreated.ok) {
+          ctx.embedPort = recreated.value;
+          const retryResult = await embedTextsWithRecovery(
+            ctx.embedPort,
+            batch.map((b) =>
+              formatDocForEmbedding(b.text, b.title ?? undefined, ctx.modelUri)
+            )
+          );
+          if (retryResult.ok) {
+            if (ctx.verbose) {
+              process.stderr.write(
+                "\n[embed] Retry after port reset succeeded\n"
+              );
+            }
+            pushErrorSamples(retryResult.value.failureSamples);
+            suggestion ||= retryResult.value.retrySuggestion;
+            const retryVectors: VectorRow[] = [];
+            for (const [idx, item] of batch.entries()) {
+              const embedding = retryResult.value.vectors[idx];
+              if (!embedding) {
+                errors += 1;
+                continue;
+              }
+              retryVectors.push({
+                mirrorHash: item.mirrorHash,
+                seq: item.seq,
+                model: ctx.modelUri,
+                embedding: new Float32Array(embedding),
+              });
+            }
+            if (retryVectors.length === 0) {
+              if (ctx.verbose) {
+                process.stderr.write(
+                  "\n[embed] No recoverable embeddings in retry batch\n"
+                );
+              }
+              continue;
+            }
+            const retryStoreResult =
+              await ctx.vectorIndex.upsertVectors(retryVectors);
+            if (!retryStoreResult.ok) {
+              if (ctx.verbose) {
+                process.stderr.write(
+                  `\n[embed] Store failed: ${retryStoreResult.error.message}\n`
+                );
+              }
+              pushErrorSamples([retryStoreResult.error.message]);
+              suggestion ??=
+                "Store write failed. Rerun `gno embed` once more; if it repeats, run `gno doctor` and `gno vec sync`.";
+              errors += retryVectors.length;
+              continue;
+            }
+            embedded += retryVectors.length;
+            if (ctx.showProgress) {
+              const embeddedDisplay = Math.min(embedded, ctx.totalToEmbed);
+              const completed = Math.min(embedded + errors, ctx.totalToEmbed);
+              const pct = (completed / ctx.totalToEmbed) * 100;
+              const elapsed = (Date.now() - startTime) / 1000;
+              const rate = embedded / Math.max(elapsed, 0.001);
+              const eta =
+                Math.max(0, ctx.totalToEmbed - completed) /
+                Math.max(rate, 0.001);
+              process.stdout.write(
+                `\rEmbedding: ${embeddedDisplay.toLocaleString()}/${ctx.totalToEmbed.toLocaleString()} (${pct.toFixed(1)}%) | ${rate.toFixed(1)} chunks/s | ETA ${formatDuration(eta)}`
+              );
+            }
+            continue;
+          }
+        }
+      }
       if (ctx.verbose) {
         const err = batchEmbedResult.error;
         const cause = err.cause;
@@ -178,6 +311,9 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
           `\n[embed] Batch failed (${batch.length} chunks: ${titles}${batch.length > 3 ? "..." : ""}): ${err.message}${causeMsg ? ` - ${causeMsg}` : ""}\n`
         );
       }
+      pushErrorSamples([formattedError]);
+      suggestion =
+        "Try rerunning the same command. If failures persist, rerun with `gno --verbose embed --batch-size 1` to isolate failing chunks.";
       errors += batch.length;
       continue;
     }
@@ -191,6 +327,13 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
         `\n[embed] Batch fallback (${batch.length} chunks: ${titles}${batch.length > 3 ? "..." : ""}): ${batchEmbedResult.value.batchError ?? "unknown batch error"}\n`
       );
     }
+    pushErrorSamples(batchEmbedResult.value.failureSamples);
+    suggestion ||= batchEmbedResult.value.retrySuggestion;
+    if (ctx.verbose && batchEmbedResult.value.failureSamples.length > 0) {
+      for (const sample of batchEmbedResult.value.failureSamples) {
+        process.stderr.write(`\n[embed] Sample failure: ${sample}\n`);
+      }
+    }
     const vectors: VectorRow[] = [];
     for (const [idx, item] of batch.entries()) {
@@ -221,6 +364,9 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
           `\n[embed] Store failed: ${storeResult.error.message}\n`
         );
       }
+      pushErrorSamples([storeResult.error.message]);
+      suggestion ??=
+        "Store write failed. Rerun `gno embed` once more; if it repeats, run `gno doctor` and `gno vec sync`.";
       errors += vectors.length;
       continue;
     }
@@ -229,13 +375,15 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
     // Progress output
     if (ctx.showProgress) {
-      const pct = ((embedded + errors) / ctx.totalToEmbed) * 100;
+      const embeddedDisplay = Math.min(embedded, ctx.totalToEmbed);
+      const completed = Math.min(embedded + errors, ctx.totalToEmbed);
+      const pct = (completed / ctx.totalToEmbed) * 100;
       const elapsed = (Date.now() - startTime) / 1000;
       const rate = embedded / Math.max(elapsed, 0.001);
       const eta =
-        (ctx.totalToEmbed - embedded - errors) / Math.max(rate, 0.001);
+        Math.max(0, ctx.totalToEmbed - completed) / Math.max(rate, 0.001);
       process.stdout.write(
-        `\rEmbedding: ${embedded.toLocaleString()}/${ctx.totalToEmbed.toLocaleString()} (${pct.toFixed(1)}%) | ${rate.toFixed(1)} chunks/s | ETA ${formatDuration(eta)}`
+        `\rEmbedding: ${embeddedDisplay.toLocaleString()}/${ctx.totalToEmbed.toLocaleString()} (${pct.toFixed(1)}%) | ${rate.toFixed(1)} chunks/s | ETA ${formatDuration(eta)}`
       );
     }
   }
@@ -249,6 +397,8 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
     embedded,
     errors,
     duration: (Date.now() - startTime) / 1000,
+    errorSamples,
+    suggestion,
   };
 }
@@ -354,6 +504,7 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
         duration: 0,
         model: modelUri,
         searchAvailable: vecAvailable,
+        errorSamples: [],
       };
     }
@@ -366,6 +517,7 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
         duration: 0,
         model: modelUri,
         searchAvailable: vecAvailable,
+        errorSamples: [],
       };
     }
@@ -382,6 +534,27 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
       : undefined;
     const llm = new LlmAdapter(config);
+    const recreateEmbedPort = async () => {
+      if (embedPort) {
+        await embedPort.dispose();
+      }
+      await llm.getManager().dispose(modelUri);
+      const recreated = await llm.createEmbeddingPort(modelUri, {
+        policy,
+        onProgress: downloadProgress
+          ? (progress) => downloadProgress("embed", progress)
+          : undefined,
+      });
+      if (!recreated.ok) {
+        return { ok: false as const, error: recreated.error.message };
+      }
+      const initResult = await recreated.value.init();
+      if (!initResult.ok) {
+        await recreated.value.dispose();
+        return { ok: false as const, error: initResult.error.message };
+      }
+      return { ok: true as const, value: recreated.value };
+    };
     const embedResult = await llm.createEmbeddingPort(modelUri, {
       policy,
       onProgress: downloadProgress
@@ -428,6 +601,7 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
       showProgress: !options.json,
       totalToEmbed,
       verbose: options.verbose ?? false,
+      recreateEmbedPort,
     });
     if (!result.ok) {
@@ -447,10 +621,27 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
           }
         }
         vectorIndex.vecDirty = false;
-      } else if (!options.json) {
-        process.stdout.write(
-          `\n[vec] Sync failed: ${syncResult.error.message}\n`
-        );
+      } else {
+        if (!options.json) {
+          process.stdout.write(
+            `\n[vec] Sync failed: ${syncResult.error.message}\n`
+          );
+        }
+        return {
+          success: true,
+          embedded: result.embedded,
+          errors: result.errors,
+          duration: result.duration,
+          model: modelUri,
+          searchAvailable: vectorIndex.searchAvailable,
+          errorSamples: [
+            ...result.errorSamples,
+            syncResult.error.message,
+          ].slice(0, 5),
+          suggestion:
+            "Vector index sync failed after embedding. Rerun `gno embed` once more. If it repeats, run `gno vec sync`.",
+          syncError: syncResult.error.message,
+        };
       }
     }
@@ -461,6 +652,8 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
       duration: result.duration,
       model: modelUri,
       searchAvailable: vectorIndex.searchAvailable,
+      errorSamples: result.errorSamples,
+      suggestion: result.suggestion,
     };
   } finally {
     if (embedPort) {
@@ -585,6 +778,9 @@ export function formatEmbed(
         duration: result.duration,
         model: result.model,
         searchAvailable: result.searchAvailable,
+        errorSamples: result.errorSamples ?? [],
+        suggestion: result.suggestion,
+        syncError: result.syncError,
       },
       null,
       2
@@ -606,6 +802,14 @@ export function formatEmbed(
   if (result.errors > 0) {
     lines.push(`${result.errors} chunks failed to embed.`);
+    if ((result.errorSamples?.length ?? 0) > 0) {
+      for (const sample of result.errorSamples ?? []) {
+        lines.push(`Sample error: ${sample}`);
+      }
+    }
+    if (result.suggestion) {
+      lines.push(`Hint: ${result.suggestion}`);
+    }
   }
   if (!result.searchAvailable) {
@@ -614,5 +818,9 @@ export function formatEmbed(
     );
   }
+  if (result.syncError) {
+    lines.push(`Vec sync error: ${result.syncError}`);
+  }
   return lines.join("\n");
 }

package/src/embed/batch.ts CHANGED Viewed

@@ -14,8 +14,12 @@ export interface EmbedBatchRecoveryResult {
   batchFailed: boolean;
   batchError?: string;
   fallbackErrors: number;
+  failureSamples: string[];
+  retrySuggestion?: string;
 }
+const MAX_FAILURE_SAMPLES = 5;
 function errorMessage(error: unknown): string {
   if (
     error &&
@@ -28,6 +32,27 @@ function errorMessage(error: unknown): string {
   return String(error);
 }
+function formatFailureMessage(error: {
+  message: string;
+  cause?: unknown;
+}): string {
+  const cause = error.cause ? errorMessage(error.cause) : "";
+  return cause && cause !== error.message
+    ? `${error.message} - ${cause}`
+    : error.message;
+}
+function isDisposedFailure(message: string): boolean {
+  return message.toLowerCase().includes("object is disposed");
+}
+async function resetEmbeddingPort(
+  embedPort: EmbeddingPort
+): Promise<LlmResult<void>> {
+  await embedPort.dispose();
+  return embedPort.init();
+}
 export async function embedTextsWithRecovery(
   embedPort: EmbeddingPort,
   texts: string[]
@@ -39,13 +64,24 @@ export async function embedTextsWithRecovery(
         vectors: [],
         batchFailed: false,
         fallbackErrors: 0,
+        failureSamples: [],
       },
     };
   }
   const profile = getEmbeddingCompatibilityProfile(embedPort.modelUri);
   if (profile.batchEmbeddingTrusted) {
-    const batchResult = await embedPort.embedBatch(texts);
+    let batchResult = await embedPort.embedBatch(texts);
+    if (!batchResult.ok) {
+      const formattedBatchError = formatFailureMessage(batchResult.error);
+      if (isDisposedFailure(formattedBatchError)) {
+        const reset = await resetEmbeddingPort(embedPort);
+        if (!reset.ok) {
+          return reset;
+        }
+        batchResult = await embedPort.embedBatch(texts);
+      }
+    }
     if (batchResult.ok && batchResult.value.length === texts.length) {
       return {
         ok: true,
@@ -53,11 +89,14 @@ export async function embedTextsWithRecovery(
           vectors: batchResult.value,
           batchFailed: false,
           fallbackErrors: 0,
+          failureSamples: [],
         },
       };
     }
-    const recovered = await recoverIndividually(embedPort, texts);
+    const recovered = await recoverWithAdaptiveBatches(embedPort, texts, {
+      rootBatchAlreadyFailed: true,
+    });
     if (!recovered.ok) {
       return recovered;
     }
@@ -68,7 +107,11 @@ export async function embedTextsWithRecovery(
         batchFailed: true,
         batchError: batchResult.ok
           ? `Embedding count mismatch: got ${batchResult.value.length}, expected ${texts.length}`
-          : batchResult.error.message,
+          : formatFailureMessage(batchResult.error),
+        retrySuggestion:
+          recovered.value.fallbackErrors > 0
+            ? "Try rerunning the same command. If failures persist, rerun with `gno --verbose embed --batch-size 1` to isolate failing chunks."
+            : undefined,
       },
     };
   }
@@ -83,10 +126,113 @@ export async function embedTextsWithRecovery(
       ...recovered.value,
       batchFailed: true,
       batchError: "Batch embedding disabled for this compatibility profile",
+      retrySuggestion:
+        recovered.value.fallbackErrors > 0
+          ? "Some chunks still failed individually. Rerun with `gno --verbose embed --batch-size 1` for exact chunk errors."
+          : undefined,
     },
   };
 }
+async function recoverWithAdaptiveBatches(
+  embedPort: EmbeddingPort,
+  texts: string[],
+  options: { rootBatchAlreadyFailed?: boolean } = {}
+): Promise<
+  LlmResult<Omit<EmbedBatchRecoveryResult, "batchFailed" | "batchError">>
+> {
+  try {
+    const vectors: Array<number[] | null> = Array.from(
+      { length: texts.length },
+      () => null
+    );
+    const failureSamples: string[] = [];
+    let fallbackErrors = 0;
+    const recordFailure = (message: string): void => {
+      if (failureSamples.length < MAX_FAILURE_SAMPLES) {
+        failureSamples.push(message);
+      }
+    };
+    const processRange = async (
+      rangeTexts: string[],
+      offset: number,
+      batchAlreadyFailed = false
+    ): Promise<void> => {
+      if (rangeTexts.length === 0) {
+        return;
+      }
+      if (rangeTexts.length === 1) {
+        const result = await embedPort.embed(rangeTexts[0] ?? "");
+        if (result.ok) {
+          vectors[offset] = result.value;
+          return;
+        }
+        fallbackErrors += 1;
+        recordFailure(formatFailureMessage(result.error));
+        return;
+      }
+      let batchResult: Awaited<ReturnType<typeof embedPort.embedBatch>> | null =
+        null;
+      if (!batchAlreadyFailed) {
+        batchResult = await embedPort.embedBatch(rangeTexts);
+      }
+      if (
+        batchResult &&
+        batchResult.ok &&
+        batchResult.value.length === rangeTexts.length
+      ) {
+        for (const [index, vector] of batchResult.value.entries()) {
+          vectors[offset + index] = vector;
+        }
+        return;
+      }
+      const mid = Math.ceil(rangeTexts.length / 2);
+      await processRange(rangeTexts.slice(0, mid), offset);
+      await processRange(rangeTexts.slice(mid), offset + mid);
+    };
+    await processRange(texts, 0, options.rootBatchAlreadyFailed ?? false);
+    if (fallbackErrors === texts.length) {
+      const reinit = await resetEmbeddingPort(embedPort);
+      if (!reinit.ok) {
+        return reinit;
+      }
+      const retry = await recoverIndividually(embedPort, texts);
+      if (!retry.ok) {
+        return retry;
+      }
+      return {
+        ok: true,
+        value: retry.value,
+      };
+    }
+    return {
+      ok: true,
+      value: {
+        vectors,
+        fallbackErrors,
+        failureSamples,
+      },
+    };
+  } catch (error) {
+    return {
+      ok: false,
+      error: inferenceFailedError(
+        embedPort.modelUri,
+        new Error(errorMessage(error))
+      ),
+    };
+  }
+}
 async function recoverIndividually(
   embedPort: EmbeddingPort,
   texts: string[]
@@ -95,6 +241,7 @@ async function recoverIndividually(
 > {
   try {
     const vectors: Array<number[] | null> = [];
+    const failureSamples: string[] = [];
     let fallbackErrors = 0;
     for (const text of texts) {
@@ -104,6 +251,9 @@ async function recoverIndividually(
       } else {
         vectors.push(null);
         fallbackErrors += 1;
+        if (failureSamples.length < MAX_FAILURE_SAMPLES) {
+          failureSamples.push(formatFailureMessage(result.error));
+        }
       }
     }
@@ -112,6 +262,7 @@ async function recoverIndividually(
       value: {
         vectors,
         fallbackErrors,
+        failureSamples,
       },
     };
   } catch (error) {

package/src/store/vector/sqlite-vec.ts CHANGED Viewed

@@ -117,10 +117,12 @@ export async function createVectorIndexPort(
   `);
   // Prepared statements for vec0 table (if available)
-  const upsertVecStmt = searchAvailable
-    ? db.prepare(
-        `INSERT OR REPLACE INTO ${tableName} (chunk_id, embedding) VALUES (?, ?)`
-      )
+  const deleteVecChunkStmt = searchAvailable
+    ? db.prepare(`DELETE FROM ${tableName} WHERE chunk_id = ?`)
+    : null;
+  const insertVecStmt = searchAvailable
+    ? db.prepare(`INSERT INTO ${tableName} (chunk_id, embedding) VALUES (?, ?)`)
     : null;
   const searchStmt = searchAvailable
@@ -175,12 +177,15 @@ export async function createVectorIndexPort(
       }
       // 2. Best-effort update vec0 (graceful degradation)
-      if (upsertVecStmt) {
+      if (deleteVecChunkStmt && insertVecStmt) {
         try {
           db.transaction(() => {
             for (const row of rows) {
               const chunkId = `${row.mirrorHash}:${row.seq}`;
-              upsertVecStmt.run(chunkId, encodeEmbedding(row.embedding));
+              // sqlite-vec vec0 tables do not reliably support OR REPLACE semantics.
+              // Delete first, then insert the fresh vector row.
+              deleteVecChunkStmt.run(chunkId);
+              insertVecStmt.run(chunkId, encodeEmbedding(row.embedding));
             }
           })();
         } catch (e) {