@gmickel/gno 0.40.2 → 0.41.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -87,7 +87,7 @@ gno daemon
87
87
 
88
88
  ## What's New
89
89
 
90
- > Latest release: [v0.39.1](./CHANGELOG.md#0391---2026-04-06)
90
+ > Latest release: [v0.40.2](./CHANGELOG.md#0402---2026-04-06)
91
91
  > Full release history: [CHANGELOG.md](./CHANGELOG.md)
92
92
 
93
93
  - **Retrieval Quality Upgrade**: stronger BM25 lexical handling, code-aware chunking, terminal result hyperlinks, and per-collection model overrides
@@ -108,6 +108,26 @@ gno embed
108
108
  That regenerates embeddings for the new default model. Old vectors are kept
109
109
  until you explicitly clear stale embeddings.
110
110
 
111
+ If the release also changes the embedding formatting/profile behavior for your
112
+ active model, prefer one of these stronger migration paths:
113
+
114
+ ```bash
115
+ gno embed --force
116
+ ```
117
+
118
+ or per collection:
119
+
120
+ ```bash
121
+ gno collection clear-embeddings my-collection --all
122
+ gno embed my-collection
123
+ ```
124
+
125
+ Model guides:
126
+
127
+ - [Code Embeddings](./docs/guides/code-embeddings.md)
128
+ - [Per-Collection Models](./docs/guides/per-collection-models.md)
129
+ - [Bring Your Own Models](./docs/guides/bring-your-own-models.md)
130
+
111
131
  ### Fine-Tuned Model Quick Use
112
132
 
113
133
  ```yaml
@@ -672,22 +692,23 @@ graph TD
672
692
 
673
693
  Models auto-download on first use to `~/.cache/gno/models/`. For deterministic startup, set `GNO_NO_AUTO_DOWNLOAD=1` and use `gno models pull` explicitly. Alternatively, offload to a GPU server on your network using HTTP backends.
674
694
 
675
- | Model | Purpose | Size |
676
- | :------------------- | :------------------------------------ | :----------- |
677
- | Qwen3-Embedding-0.6B | Embeddings (multilingual) | ~640MB |
678
- | Qwen3-Reranker-0.6B | Cross-encoder reranking (32K context) | ~700MB |
679
- | Qwen/SmolLM | Query expansion + AI answers | ~600MB-1.2GB |
695
+ | Model | Purpose | Size |
696
+ | :--------------------- | :------------------------------------ | :----------- |
697
+ | Qwen3-Embedding-0.6B | Embeddings (multilingual) | ~640MB |
698
+ | Qwen3-Reranker-0.6B | Cross-encoder reranking (32K context) | ~700MB |
699
+ | Qwen3 / Qwen2.5 family | Query expansion + AI answers | ~600MB-2.5GB |
680
700
 
681
701
  ### Model Presets
682
702
 
683
- | Preset | Disk | Best For |
684
- | :--------- | :----- | :--------------------------- |
685
- | `slim` | ~1GB | Fast, good quality (default) |
686
- | `balanced` | ~2GB | Slightly larger model |
687
- | `quality` | ~2.5GB | Best answers |
703
+ | Preset | Disk | Best For |
704
+ | :----------- | :----- | :------------------------------------------------------ |
705
+ | `slim-tuned` | ~1GB | Current default, tuned retrieval in a compact footprint |
706
+ | `slim` | ~1GB | Fast, good quality |
707
+ | `balanced` | ~2GB | Slightly larger model |
708
+ | `quality` | ~2.5GB | Best answers |
688
709
 
689
710
  ```bash
690
- gno models use slim
711
+ gno models use slim-tuned
691
712
  gno models pull --all # Optional: pre-download models (auto-downloads on first use)
692
713
  ```
693
714
 
@@ -720,7 +741,7 @@ models:
720
741
  presets:
721
742
  - id: remote-gpu
722
743
  name: Remote GPU Server
723
- embed: "http://192.168.1.100:8081/v1/embeddings#bge-m3"
744
+ embed: "http://192.168.1.100:8081/v1/embeddings#qwen3-embedding-0.6b"
724
745
  rerank: "http://192.168.1.100:8082/v1/completions#reranker"
725
746
  expand: "http://192.168.1.100:8083/v1/chat/completions#gno-expand"
726
747
  gen: "http://192.168.1.100:8083/v1/chat/completions#qwen3-4b"
@@ -730,6 +751,11 @@ Works with llama-server, Ollama, LocalAI, vLLM, or any OpenAI-compatible server.
730
751
 
731
752
  > **Configuration**: [Model Setup](https://gno.sh/docs/CONFIGURATION/)
732
753
 
754
+ Remote/BYOM guides:
755
+
756
+ - [Bring Your Own Models](./docs/guides/bring-your-own-models.md)
757
+ - [Per-Collection Models](./docs/guides/per-collection-models.md)
758
+
733
759
  ---
734
760
 
735
761
  ## Architecture
@@ -801,33 +827,29 @@ If a model turns out to be better specifically for code, the intended user story
801
827
 
802
828
  That lets GNO stay sane by default while still giving power users a clean path to code-specialist retrieval.
803
829
 
804
- Current code-focused recommendation:
830
+ More model docs:
805
831
 
806
- ```yaml
807
- collections:
808
- - name: gno-code
809
- path: /Users/you/work/gno/src
810
- pattern: "**/*.{ts,tsx,js,jsx,go,rs,py,swift,c}"
811
- models:
812
- embed: "hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf"
813
- ```
832
+ - [Code Embeddings](./docs/guides/code-embeddings.md)
833
+ - [Per-Collection Models](./docs/guides/per-collection-models.md)
834
+ - [Bring Your Own Models](./docs/guides/bring-your-own-models.md)
814
835
 
815
- GNO treats that override like any other model URI:
836
+ Current product stance:
816
837
 
817
- - auto-downloads on first use by default
818
- - manual-only if `GNO_NO_AUTO_DOWNLOAD=1`
819
- - offline-safe if the model is already cached
838
+ - `Qwen3-Embedding-0.6B-GGUF` is already the global default embed model
839
+ - you do **not** need a collection override just to get Qwen on code collections
840
+ - use a collection override only when one collection should intentionally diverge from that default
820
841
 
821
- Why this is the current recommendation:
842
+ Why Qwen is the current default:
822
843
 
823
- - matches `bge-m3` on the tiny canonical benchmark
844
+ - matches or exceeds `bge-m3` on the tiny canonical benchmark
824
845
  - significantly beats `bge-m3` on the real GNO `src/serve` code slice
825
846
  - also beats `bge-m3` on a pinned public-OSS code slice
847
+ - also beats `bge-m3` on the multilingual prose/docs benchmark lane
826
848
 
827
- Trade-off:
849
+ Current trade-off:
828
850
 
829
851
  - Qwen is slower to embed than `bge-m3`
830
- - existing users upgrading to the new default may need to run `gno embed` again so vector and hybrid retrieval catch up
852
+ - existing users upgrading or adopting a new embedding formatting profile may need to run `gno embed` again so stored vectors match the current formatter/runtime path
831
853
 
832
854
  ### General Multilingual Embedding Benchmark
833
855
 
@@ -841,8 +863,8 @@ bun run bench:general-embeddings --candidate qwen3-embedding-0.6b --write
841
863
 
842
864
  Current signal on the public multilingual FastAPI-docs fixture:
843
865
 
844
- - `bge-m3`: vector nDCG@10 `0.350`, hybrid nDCG@10 `0.642`
845
- - `Qwen3-Embedding-0.6B-GGUF`: vector nDCG@10 `0.859`, hybrid nDCG@10 `0.947`
866
+ - `bge-m3`: vector nDCG@10 `0.3508`, hybrid nDCG@10 `0.6756`
867
+ - `Qwen3-Embedding-0.6B-GGUF`: vector nDCG@10 `0.9891`, hybrid nDCG@10 `0.9891`
846
868
 
847
869
  Interpretation:
848
870
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@gmickel/gno",
3
- "version": "0.40.2",
3
+ "version": "0.41.0",
4
4
  "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
5
5
  "keywords": [
6
6
  "embeddings",
@@ -17,6 +17,7 @@ import {
17
17
  isInitialized,
18
18
  loadConfig,
19
19
  } from "../../config";
20
+ import { embedTextsWithRecovery } from "../../embed/batch";
20
21
  import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
21
22
  import { resolveDownloadPolicy } from "../../llm/policy";
22
23
  import { resolveModelUri } from "../../llm/registry";
@@ -153,8 +154,11 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
153
154
  }
154
155
 
155
156
  // Embed batch with contextual formatting (title prefix)
156
- const batchEmbedResult = await ctx.embedPort.embedBatch(
157
- batch.map((b) => formatDocForEmbedding(b.text, b.title ?? undefined))
157
+ const batchEmbedResult = await embedTextsWithRecovery(
158
+ ctx.embedPort,
159
+ batch.map((b) =>
160
+ formatDocForEmbedding(b.text, b.title ?? undefined, ctx.modelUri)
161
+ )
158
162
  );
159
163
  if (!batchEmbedResult.ok) {
160
164
  if (ctx.verbose) {
@@ -178,26 +182,38 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
178
182
  continue;
179
183
  }
180
184
 
181
- // Validate batch/embedding count match
182
- const embeddings = batchEmbedResult.value;
183
- if (embeddings.length !== batch.length) {
185
+ if (ctx.verbose && batchEmbedResult.value.batchFailed) {
186
+ const titles = batch
187
+ .slice(0, 3)
188
+ .map((b) => b.title ?? b.mirrorHash.slice(0, 8))
189
+ .join(", ");
190
+ process.stderr.write(
191
+ `\n[embed] Batch fallback (${batch.length} chunks: ${titles}${batch.length > 3 ? "..." : ""}): ${batchEmbedResult.value.batchError ?? "unknown batch error"}\n`
192
+ );
193
+ }
194
+
195
+ const vectors: VectorRow[] = [];
196
+ for (const [idx, item] of batch.entries()) {
197
+ const embedding = batchEmbedResult.value.vectors[idx];
198
+ if (!embedding) {
199
+ errors += 1;
200
+ continue;
201
+ }
202
+ vectors.push({
203
+ mirrorHash: item.mirrorHash,
204
+ seq: item.seq,
205
+ model: ctx.modelUri,
206
+ embedding: new Float32Array(embedding),
207
+ });
208
+ }
209
+
210
+ if (vectors.length === 0) {
184
211
  if (ctx.verbose) {
185
- process.stderr.write(
186
- `\n[embed] Count mismatch: got ${embeddings.length}, expected ${batch.length}\n`
187
- );
212
+ process.stderr.write("\n[embed] No recoverable embeddings in batch\n");
188
213
  }
189
- errors += batch.length;
190
214
  continue;
191
215
  }
192
216
 
193
- // Store vectors (embeddedAt set by DB)
194
- const vectors: VectorRow[] = batch.map((b, idx) => ({
195
- mirrorHash: b.mirrorHash,
196
- seq: b.seq,
197
- model: ctx.modelUri,
198
- embedding: new Float32Array(embeddings[idx] as number[]),
199
- }));
200
-
201
217
  const storeResult = await ctx.vectorIndex.upsertVectors(vectors);
202
218
  if (!storeResult.ok) {
203
219
  if (ctx.verbose) {
@@ -205,11 +221,11 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
205
221
  `\n[embed] Store failed: ${storeResult.error.message}\n`
206
222
  );
207
223
  }
208
- errors += batch.length;
224
+ errors += vectors.length;
209
225
  continue;
210
226
  }
211
227
 
212
- embedded += batch.length;
228
+ embedded += vectors.length;
213
229
 
214
230
  // Progress output
215
231
  if (ctx.showProgress) {
@@ -97,7 +97,7 @@ export async function vsearch(
97
97
  try {
98
98
  // Embed query with contextual formatting (also determines dimensions)
99
99
  const queryEmbedResult = await embedPort.embed(
100
- formatQueryForEmbedding(query)
100
+ formatQueryForEmbedding(query, embedPort.modelUri)
101
101
  );
102
102
  if (!queryEmbedResult.ok) {
103
103
  return { success: false, error: queryEmbedResult.error.message };
@@ -16,6 +16,7 @@ import type {
16
16
 
17
17
  import { formatDocForEmbedding } from "../pipeline/contextual";
18
18
  import { err, ok } from "../store/types";
19
+ import { embedTextsWithRecovery } from "./batch";
19
20
 
20
21
  // ─────────────────────────────────────────────────────────────────────────────
21
22
  // Types
@@ -85,9 +86,14 @@ export async function embedBacklog(
85
86
  }
86
87
 
87
88
  // Embed batch with contextual formatting (title prefix)
88
- const embedResult = await embedPort.embedBatch(
89
+ const embedResult = await embedTextsWithRecovery(
90
+ embedPort,
89
91
  batch.map((b: BacklogItem) =>
90
- formatDocForEmbedding(b.text, b.title ?? undefined)
92
+ formatDocForEmbedding(
93
+ b.text,
94
+ b.title ?? undefined,
95
+ embedPort.modelUri
96
+ )
91
97
  )
92
98
  );
93
99
 
@@ -96,28 +102,29 @@ export async function embedBacklog(
96
102
  continue;
97
103
  }
98
104
 
99
- // Validate batch/embedding count match
100
- const embeddings = embedResult.value;
101
- if (embeddings.length !== batch.length) {
102
- errors += batch.length;
103
- continue;
105
+ const vectors: VectorRow[] = [];
106
+ for (const [idx, item] of batch.entries()) {
107
+ const embedding = embedResult.value.vectors[idx];
108
+ if (!embedding) {
109
+ errors += 1;
110
+ continue;
111
+ }
112
+ vectors.push({
113
+ mirrorHash: item.mirrorHash,
114
+ seq: item.seq,
115
+ model: modelUri,
116
+ embedding: new Float32Array(embedding),
117
+ });
104
118
  }
105
119
 
106
- // Store vectors (embeddedAt set by DB)
107
- const vectors: VectorRow[] = batch.map((b: BacklogItem, idx: number) => ({
108
- mirrorHash: b.mirrorHash,
109
- seq: b.seq,
110
- model: modelUri,
111
- embedding: new Float32Array(embeddings[idx] as number[]),
112
- }));
113
-
114
- const storeResult = await vectorIndex.upsertVectors(vectors);
115
- if (!storeResult.ok) {
116
- errors += batch.length;
117
- continue;
120
+ if (vectors.length > 0) {
121
+ const storeResult = await vectorIndex.upsertVectors(vectors);
122
+ if (!storeResult.ok) {
123
+ errors += vectors.length;
124
+ continue;
125
+ }
126
+ embedded += vectors.length;
118
127
  }
119
-
120
- embedded += batch.length;
121
128
  }
122
129
 
123
130
  // Sync vec index once at end if any vec0 writes failed
@@ -0,0 +1,126 @@
1
+ /**
2
+ * Shared embedding batch helpers.
3
+ *
4
+ * @module src/embed/batch
5
+ */
6
+
7
+ import type { EmbeddingPort, LlmResult } from "../llm/types";
8
+
9
+ import { getEmbeddingCompatibilityProfile } from "../llm/embedding-compatibility";
10
+ import { inferenceFailedError } from "../llm/errors";
11
+
12
/**
 * Outcome of embedding a list of texts when per-item fallback is available.
 */
export interface EmbedBatchRecoveryResult {
  /** One slot per input text, in input order; `null` marks a text with no embedding. */
  vectors: (number[] | null)[];
  /** True when the vectors did not come from an accepted batch call. */
  batchFailed: boolean;
  /** Reason the batch result was not used, when there is one. */
  batchError?: string;
  /** How many per-item embeddings failed during fallback. */
  fallbackErrors: number;
}
18
+
19
/**
 * Extract a human-readable message from an arbitrary thrown value.
 *
 * @param error - Whatever was caught; not assumed to be an `Error`.
 * @returns The value's string `message` property when it has one. Otherwise
 * `String(error)`, except that a plain object (which would stringify to the
 * uninformative "[object Object]") is rendered as JSON when possible.
 */
function errorMessage(error: unknown): string {
  if (
    typeof error === "object" &&
    error !== null &&
    "message" in error &&
    typeof error.message === "string"
  ) {
    return error.message;
  }

  const text = String(error);
  if (text !== "[object Object]") {
    return text;
  }

  // Plain objects without a message: show their payload instead of the
  // default tag. Circular structures cannot be serialized, so keep the tag.
  try {
    return JSON.stringify(error) ?? text;
  } catch {
    return text;
  }
}
30
+
31
/**
 * Embed `texts`, falling back to one-at-a-time embedding when the batch call
 * is not used or does not yield one vector per text.
 *
 * The batch call is attempted only when the model's compatibility profile
 * marks batch embedding as trusted. Its result is accepted only if it is `ok`
 * and has exactly `texts.length` vectors. In every other case (untrusted
 * profile, error result, count mismatch, or a thrown exception) each text is
 * embedded individually and `batchFailed` is set with the reason in
 * `batchError`.
 *
 * @param embedPort - Embedding backend; its `modelUri` selects the profile.
 * @param texts - Already-formatted texts to embed.
 * @returns `ok` with one slot per text (`null` where a per-item embed failed),
 * or the error result produced by the per-item fallback.
 */
export async function embedTextsWithRecovery(
  embedPort: EmbeddingPort,
  texts: string[]
): Promise<LlmResult<EmbedBatchRecoveryResult>> {
  if (texts.length === 0) {
    return {
      ok: true,
      value: { vectors: [], batchFailed: false, fallbackErrors: 0 },
    };
  }

  // Reason reported when the per-item path is taken; overwritten below if the
  // batch call is attempted and rejected.
  let batchError = "Batch embedding disabled for this compatibility profile";

  const profile = getEmbeddingCompatibilityProfile(embedPort.modelUri);
  if (profile.batchEmbeddingTrusted) {
    try {
      const batchResult = await embedPort.embedBatch(texts);
      if (!batchResult.ok) {
        batchError = batchResult.error.message;
      } else if (batchResult.value.length !== texts.length) {
        batchError = `Embedding count mismatch: got ${batchResult.value.length}, expected ${texts.length}`;
      } else {
        return {
          ok: true,
          value: {
            vectors: batchResult.value,
            batchFailed: false,
            fallbackErrors: 0,
          },
        };
      }
    } catch (error) {
      // A throwing batch call is treated like a failed one so the per-item
      // path still gets a chance, matching how per-item throws are guarded.
      batchError = errorMessage(error);
    }
  }

  const recovered = await recoverIndividually(embedPort, texts);
  if (!recovered.ok) {
    return recovered;
  }
  return {
    ok: true,
    value: {
      ...recovered.value,
      batchFailed: true,
      batchError,
    },
  };
}
89
+
90
/**
 * Embed each text with its own `embed` call, in input order.
 *
 * A text whose call returns a non-ok result gets a `null` slot and is counted
 * in `fallbackErrors`. If any call throws, the whole attempt is abandoned and
 * an inference-failed error result is returned instead.
 */
async function recoverIndividually(
  embedPort: EmbeddingPort,
  texts: string[]
): Promise<
  LlmResult<Omit<EmbedBatchRecoveryResult, "batchFailed" | "batchError">>
> {
  try {
    const vectors: Array<number[] | null> = [];
    let fallbackErrors = 0;

    for (const text of texts) {
      const outcome = await embedPort.embed(text);
      if (!outcome.ok) {
        fallbackErrors += 1;
      }
      // Keep positions aligned with `texts` even when an item fails.
      vectors.push(outcome.ok ? outcome.value : null);
    }

    return { ok: true, value: { vectors, fallbackErrors } };
  } catch (error) {
    const wrapped = new Error(errorMessage(error));
    return {
      ok: false,
      error: inferenceFailedError(embedPort.modelUri, wrapped),
    };
  }
}
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Embedding compatibility profiles.
3
+ *
4
+ * Encodes model-specific formatting/runtime hints for embedding models without
5
+ * forcing every caller to special-case URIs inline.
6
+ *
7
+ * @module src/llm/embedding-compatibility
8
+ */
9
+
10
/** How a search query is wrapped before it is embedded. */
export type EmbeddingQueryFormat = "contextual-task" | "qwen-instruct";

/** How a document chunk is wrapped before it is embedded. */
export type EmbeddingDocumentFormat = "title-prefixed" | "raw-text";

/** Formatting and runtime hints for one family of embedding models. */
export interface EmbeddingCompatibilityProfile {
  /** Identifier of the profile. */
  id: string;
  /** Query wrapping used with this model family. */
  queryFormat: EmbeddingQueryFormat;
  /** Document wrapping used with this model family. */
  documentFormat: EmbeddingDocumentFormat;
  /**
   * Whether embedBatch is trusted for this model in GNO's current native path.
   * If false, callers should use per-item embedding until compatibility is
   * better understood.
   */
  batchEmbeddingTrusted: boolean;
  /** Free-form remarks about the profile. */
  notes?: string[];
}

/** Profile for any model URI that no specific profile claims. */
const DEFAULT_PROFILE: EmbeddingCompatibilityProfile = {
  id: "default",
  queryFormat: "contextual-task",
  documentFormat: "title-prefixed",
  batchEmbeddingTrusted: true,
};

/** Profile for Qwen embedding models. */
const QWEN_PROFILE: EmbeddingCompatibilityProfile = {
  id: "qwen-embedding",
  queryFormat: "qwen-instruct",
  documentFormat: "raw-text",
  batchEmbeddingTrusted: true,
  notes: [
    "Uses Qwen-style instruct query formatting.",
    "Documents are embedded as raw text (optionally prefixed with title).",
  ],
};

/** Profile for Jina code-embedding models; batch embedding is not trusted. */
const JINA_PROFILE: EmbeddingCompatibilityProfile = {
  id: "jina-embedding",
  queryFormat: "contextual-task",
  documentFormat: "title-prefixed",
  batchEmbeddingTrusted: false,
  notes: [
    "Current native runtime path has batch-embedding issues on real fixtures.",
    "Prefer per-item embedding fallback until compatibility improves.",
  ],
};
54
+
55
/** Lower-case a model URI for matching; a missing URI becomes the empty string. */
function normalizeModelUri(modelUri?: string): string {
  if (modelUri == null) {
    return "";
  }
  return modelUri.toLowerCase();
}
58
+
59
/** True when every term occurs somewhere in `haystack` (vacuously true for no terms). */
function hasAllTerms(haystack: string, terms: string[]): boolean {
  for (const term of terms) {
    if (!haystack.includes(term)) {
      return false;
    }
  }
  return true;
}
62
+
63
/**
 * Pick the compatibility profile for an embedding model URI.
 *
 * Matching is by case-insensitive substring: a URI containing both "qwen" and
 * "embed" gets the Qwen profile; a URI containing "jina" together with either
 * "embeddings-v4-text-code" or "code-embeddings" gets the Jina profile;
 * anything else, including a missing URI, gets the default profile.
 *
 * @param modelUri - Model URI to classify; may be omitted.
 * @returns The matching shared profile object.
 */
export function getEmbeddingCompatibilityProfile(
  modelUri?: string
): EmbeddingCompatibilityProfile {
  const uri = normalizeModelUri(modelUri);

  if (hasAllTerms(uri, ["qwen", "embed"])) {
    return QWEN_PROFILE;
  }

  // The full names "jina-embeddings-v4-text-code" and "jina-code-embeddings"
  // already contain both terms of their pair, so the pairs cover them.
  const isJinaCodeModel =
    hasAllTerms(uri, ["jina", "embeddings-v4-text-code"]) ||
    hasAllTerms(uri, ["jina", "code-embeddings"]);

  return isJinaCodeModel ? JINA_PROFILE : DEFAULT_PROFILE;
}
@@ -149,7 +149,7 @@ export function handleVsearch(
149
149
  try {
150
150
  // Embed query with contextual formatting
151
151
  const queryEmbedResult = await embedPort.embed(
152
- formatQueryForEmbedding(args.query)
152
+ formatQueryForEmbedding(args.query, embedPort.modelUri)
153
153
  );
154
154
  if (!queryEmbedResult.ok) {
155
155
  throw new Error(queryEmbedResult.error.message);
@@ -10,6 +10,8 @@
10
10
  * @module src/pipeline/contextual
11
11
  */
12
12
 
13
+ import { getEmbeddingCompatibilityProfile } from "../llm/embedding-compatibility";
14
+
13
15
  // Top-level regex for performance
14
16
  const HEADING_REGEX = /^##?\s+(.+)$/m;
15
17
  const SUBHEADING_REGEX = /^##\s+(.+)$/m;
@@ -19,8 +21,16 @@ const EXT_REGEX = /\.\w+$/;
19
21
  * Format document text for embedding.
20
22
  * Prepends title for contextual retrieval.
21
23
  */
22
export function formatDocForEmbedding(
  text: string,
  title?: string,
  modelUri?: string
): string {
  // Trim once; an absent or blank title is treated as "no title".
  const trimmedTitle = title?.trim() ?? "";
  const hasTitle = trimmedTitle !== "";

  const { documentFormat } = getEmbeddingCompatibilityProfile(modelUri);
  if (documentFormat === "raw-text") {
    // Raw-text models get the title as a plain first line, or the text alone.
    return hasTitle ? `${trimmedTitle}\n${text}` : text;
  }

  const labelledTitle = hasTitle ? trimmedTitle : "none";
  return `title: ${labelledTitle} | text: ${text}`;
}
26
36
 
@@ -28,7 +38,14 @@ export function formatDocForEmbedding(text: string, title?: string): string {
28
38
  * Format query for embedding.
29
39
  * Uses task-prefixed format for asymmetric retrieval.
30
40
  */
31
export function formatQueryForEmbedding(
  query: string,
  modelUri?: string
): string {
  // The model's compatibility profile decides which query wrapper is used.
  const { queryFormat } = getEmbeddingCompatibilityProfile(modelUri);
  return queryFormat === "qwen-instruct"
    ? `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`
    : `task: search result | query: ${query}`;
}
34
51
 
@@ -18,6 +18,7 @@ import type {
18
18
  SearchResults,
19
19
  } from "./types";
20
20
 
21
+ import { embedTextsWithRecovery } from "../embed/batch";
21
22
  import { err, ok } from "../store/types";
22
23
  import { createChunkLookup } from "./chunk-lookup";
23
24
  import { formatQueryForEmbedding } from "./contextual";
@@ -213,7 +214,9 @@ async function searchVectorChunks(
213
214
  }
214
215
 
215
216
  // Embed query with contextual formatting
216
- const embedResult = await embedPort.embed(formatQueryForEmbedding(query));
217
+ const embedResult = await embedPort.embed(
218
+ formatQueryForEmbedding(query, embedPort.modelUri)
219
+ );
217
220
  if (!embedResult.ok) {
218
221
  return [];
219
222
  }
@@ -443,17 +446,6 @@ export async function searchHybrid(
443
446
  const vectorStartedAt = performance.now();
444
447
 
445
448
  if (vectorAvailable && vectorIndex && embedPort) {
446
- // Original query (increase limit when post-filters are active).
447
- const vecChunks = await searchVectorChunks(vectorIndex, embedPort, query, {
448
- limit: limit * 2 * retrievalMultiplier,
449
- });
450
-
451
- vecCount = vecChunks.length;
452
- if (vecCount > 0) {
453
- rankedInputs.push(toRankedInput("vector", vecChunks));
454
- }
455
-
456
- // Semantic variants + HyDE (optional; run in parallel and ignore failures)
457
449
  const vectorVariantQueries = [
458
450
  ...(expansion?.vectorQueries?.map((query) => ({
459
451
  source: "vector_variant" as const,
@@ -464,22 +456,72 @@ export async function searchHybrid(
464
456
  : []),
465
457
  ];
466
458
 
467
- if (vectorVariantQueries.length > 0) {
468
- const optionalVectorResults = await Promise.allSettled(
469
- vectorVariantQueries.map((variant) =>
470
- searchVectorChunks(vectorIndex, embedPort, variant.query, {
471
- limit: limit * retrievalMultiplier,
472
- })
459
+ if (vectorVariantQueries.length === 0) {
460
+ const vecChunks = await searchVectorChunks(
461
+ vectorIndex,
462
+ embedPort,
463
+ query,
464
+ {
465
+ limit: limit * 2 * retrievalMultiplier,
466
+ }
467
+ );
468
+
469
+ vecCount = vecChunks.length;
470
+ if (vecCount > 0) {
471
+ rankedInputs.push(toRankedInput("vector", vecChunks));
472
+ }
473
+ } else {
474
+ const batchedQueries = [
475
+ {
476
+ source: "vector" as const,
477
+ query,
478
+ limit: limit * 2 * retrievalMultiplier,
479
+ },
480
+ ...vectorVariantQueries.map((variant) => ({
481
+ ...variant,
482
+ limit: limit * retrievalMultiplier,
483
+ })),
484
+ ];
485
+
486
+ const embedResult = await embedTextsWithRecovery(
487
+ embedPort,
488
+ batchedQueries.map((variant) =>
489
+ formatQueryForEmbedding(variant.query, embedPort.modelUri)
473
490
  )
474
491
  );
475
492
 
476
- for (const [index, settled] of optionalVectorResults.entries()) {
477
- if (settled.status !== "fulfilled" || settled.value.length === 0) {
478
- continue;
493
+ if (!embedResult.ok) {
494
+ counters.fallbackEvents.push("vector_embed_error");
495
+ } else {
496
+ if (embedResult.value.batchFailed) {
497
+ counters.fallbackEvents.push("vector_embed_batch_fallback");
479
498
  }
480
- const variant = vectorVariantQueries[index];
481
- if (variant) {
482
- rankedInputs.push(toRankedInput(variant.source, settled.value));
499
+
500
+ for (const [index, variant] of batchedQueries.entries()) {
501
+ const embedding = embedResult.value.vectors[index];
502
+ if (!embedding || !variant) {
503
+ continue;
504
+ }
505
+
506
+ const searchResult = await vectorIndex.searchNearest(
507
+ new Float32Array(embedding),
508
+ variant.limit
509
+ );
510
+ if (!searchResult.ok || searchResult.value.length === 0) {
511
+ continue;
512
+ }
513
+
514
+ const chunks = searchResult.value.map((item) => ({
515
+ mirrorHash: item.mirrorHash,
516
+ seq: item.seq,
517
+ }));
518
+ if (variant.source === "vector") {
519
+ vecCount = chunks.length;
520
+ }
521
+ if (chunks.length === 0) {
522
+ continue;
523
+ }
524
+ rankedInputs.push(toRankedInput(variant.source, chunks));
483
525
  }
484
526
  }
485
527
  }
@@ -353,7 +353,9 @@ export async function searchVector(
353
353
  }
354
354
 
355
355
  // Embed query with contextual formatting
356
- const embedResult = await embedPort.embed(formatQueryForEmbedding(query));
356
+ const embedResult = await embedPort.embed(
357
+ formatQueryForEmbedding(query, embedPort.modelUri)
358
+ );
357
359
  if (!embedResult.ok) {
358
360
  return err(
359
361
  "QUERY_FAILED",
package/src/sdk/client.ts CHANGED
@@ -401,7 +401,7 @@ class GnoClientImpl implements GnoClient {
401
401
  }
402
402
 
403
403
  const queryEmbedResult = await ports.embedPort.embed(
404
- formatQueryForEmbedding(query)
404
+ formatQueryForEmbedding(query, ports.embedPort.modelUri)
405
405
  );
406
406
  if (!queryEmbedResult.ok) {
407
407
  throw sdkError("MODEL", queryEmbedResult.error.message, {
package/src/sdk/embed.ts CHANGED
@@ -19,6 +19,7 @@ import type {
19
19
  import type { GnoEmbedOptions, GnoEmbedResult } from "./types";
20
20
 
21
21
  import { embedBacklog } from "../embed";
22
+ import { embedTextsWithRecovery } from "../embed/batch";
22
23
  import { resolveModelUri } from "../llm/registry";
23
24
  import { formatDocForEmbedding } from "../pipeline/contextual";
24
25
  import { err, ok } from "../store/types";
@@ -139,29 +140,45 @@ async function forceEmbedAll(
139
140
  cursor = { mirrorHash: lastItem.mirrorHash, seq: lastItem.seq };
140
141
  }
141
142
 
142
- const embedResult = await embedPort.embedBatch(
143
+ const embedResult = await embedTextsWithRecovery(
144
+ embedPort,
143
145
  batch.map((item) =>
144
- formatDocForEmbedding(item.text, item.title ?? undefined)
146
+ formatDocForEmbedding(
147
+ item.text,
148
+ item.title ?? undefined,
149
+ embedPort.modelUri
150
+ )
145
151
  )
146
152
  );
147
- if (!embedResult.ok || embedResult.value.length !== batch.length) {
153
+
154
+ if (!embedResult.ok) {
148
155
  errors += batch.length;
149
156
  continue;
150
157
  }
151
158
 
152
- const vectors: VectorRow[] = batch.map((item, idx) => ({
153
- mirrorHash: item.mirrorHash,
154
- seq: item.seq,
155
- model: modelUri,
156
- embedding: new Float32Array(embedResult.value[idx] as number[]),
157
- }));
158
- const storeResult = await vectorIndex.upsertVectors(vectors);
159
- if (!storeResult.ok) {
160
- errors += batch.length;
161
- continue;
159
+ const vectors: VectorRow[] = [];
160
+ for (const [idx, item] of batch.entries()) {
161
+ const embedding = embedResult.value.vectors[idx];
162
+ if (!embedding) {
163
+ errors += 1;
164
+ continue;
165
+ }
166
+ vectors.push({
167
+ mirrorHash: item.mirrorHash,
168
+ seq: item.seq,
169
+ model: modelUri,
170
+ embedding: new Float32Array(embedding),
171
+ });
162
172
  }
163
173
 
164
- embedded += batch.length;
174
+ if (vectors.length > 0) {
175
+ const storeResult = await vectorIndex.upsertVectors(vectors);
176
+ if (!storeResult.ok) {
177
+ errors += vectors.length;
178
+ continue;
179
+ }
180
+ embedded += vectors.length;
181
+ }
165
182
  }
166
183
 
167
184
  if (vectorIndex.vecDirty) {