@xdarkicex/openclaw-memory-libravdb 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/README.md +46 -0
  2. package/docs/README.md +14 -0
  3. package/docs/architecture-decisions/README.md +6 -0
  4. package/docs/architecture-decisions/adr-001-onnx-over-ollama.md +21 -0
  5. package/docs/architecture-decisions/adr-002-libravdb-over-lancedb.md +19 -0
  6. package/docs/architecture-decisions/adr-003-convex-gating-over-threshold.md +27 -0
  7. package/docs/architecture-decisions/adr-004-sidecar-over-native-ts.md +21 -0
  8. package/docs/architecture.md +188 -0
  9. package/docs/contributing.md +76 -0
  10. package/docs/dependencies.md +38 -0
  11. package/docs/embedding-profiles.md +42 -0
  12. package/docs/gating.md +329 -0
  13. package/docs/implementation.md +381 -0
  14. package/docs/installation.md +272 -0
  15. package/docs/mathematics.md +695 -0
  16. package/docs/models.md +63 -0
  17. package/docs/problem.md +64 -0
  18. package/docs/security.md +86 -0
  19. package/openclaw.plugin.json +84 -0
  20. package/package.json +41 -0
  21. package/scripts/build-sidecar.sh +30 -0
  22. package/scripts/postinstall.js +169 -0
  23. package/scripts/setup.sh +20 -0
  24. package/scripts/setup.ts +505 -0
  25. package/scripts/sidecar-release.d.ts +4 -0
  26. package/scripts/sidecar-release.js +17 -0
  27. package/sidecar/cmd/inspect_onnx/main.go +105 -0
  28. package/sidecar/compact/gate.go +273 -0
  29. package/sidecar/compact/gate_test.go +85 -0
  30. package/sidecar/compact/summarize.go +345 -0
  31. package/sidecar/compact/summarize_test.go +319 -0
  32. package/sidecar/compact/tokens.go +11 -0
  33. package/sidecar/config/config.go +119 -0
  34. package/sidecar/config/config_test.go +75 -0
  35. package/sidecar/embed/engine.go +696 -0
  36. package/sidecar/embed/engine_test.go +349 -0
  37. package/sidecar/embed/matryoshka.go +93 -0
  38. package/sidecar/embed/matryoshka_test.go +150 -0
  39. package/sidecar/embed/onnx_local.go +319 -0
  40. package/sidecar/embed/onnx_local_test.go +159 -0
  41. package/sidecar/embed/profile_contract_test.go +71 -0
  42. package/sidecar/embed/profile_eval_test.go +923 -0
  43. package/sidecar/embed/profiles.go +39 -0
  44. package/sidecar/go.mod +21 -0
  45. package/sidecar/go.sum +30 -0
  46. package/sidecar/health/check.go +33 -0
  47. package/sidecar/health/check_test.go +55 -0
  48. package/sidecar/main.go +151 -0
  49. package/sidecar/model/encoder.go +222 -0
  50. package/sidecar/model/registry.go +262 -0
  51. package/sidecar/model/registry_test.go +102 -0
  52. package/sidecar/model/seq2seq.go +133 -0
  53. package/sidecar/server/rpc.go +343 -0
  54. package/sidecar/server/rpc_test.go +350 -0
  55. package/sidecar/server/transport.go +160 -0
  56. package/sidecar/store/libravdb.go +676 -0
  57. package/sidecar/store/libravdb_test.go +472 -0
  58. package/sidecar/summarize/engine.go +360 -0
  59. package/sidecar/summarize/engine_test.go +148 -0
  60. package/sidecar/summarize/onnx_local.go +494 -0
  61. package/sidecar/summarize/onnx_local_test.go +48 -0
  62. package/sidecar/summarize/profiles.go +52 -0
  63. package/sidecar/summarize/tokenizer.go +13 -0
  64. package/sidecar/summarize/tokenizer_hf.go +76 -0
  65. package/sidecar/summarize/util.go +13 -0
  66. package/src/cli.ts +205 -0
  67. package/src/context-engine.ts +195 -0
  68. package/src/index.ts +27 -0
  69. package/src/memory-provider.ts +24 -0
  70. package/src/openclaw-plugin-sdk.d.ts +53 -0
  71. package/src/plugin-runtime.ts +67 -0
  72. package/src/recall-cache.ts +34 -0
  73. package/src/recall-utils.ts +22 -0
  74. package/src/rpc.ts +84 -0
  75. package/src/scoring.ts +58 -0
  76. package/src/sidecar.ts +506 -0
  77. package/src/tokens.ts +36 -0
  78. package/src/types.ts +146 -0
  79. package/tsconfig.json +20 -0
  80. package/tsconfig.tests.json +12 -0
@@ -0,0 +1,923 @@
1
+ package embed
2
+
3
+ import (
4
+ "context"
5
+ "fmt"
6
+ "os"
7
+ "sort"
8
+ "strings"
9
+ "testing"
10
+ "time"
11
+ )
12
+
// rerankerWindowSize is the candidate-window size assumed for a downstream
// reranking stage; recall@window in the stratified metrics is computed
// against this bound.
const rerankerWindowSize = 8

// evalDocument is one corpus entry: a stable ID plus the raw text to embed.
type evalDocument struct {
	ID   string
	Text string
}

// evalQuery is a retrieval probe: a human-readable name, the query text, and
// the IDs of corpus documents considered relevant for it.
type evalQuery struct {
	Name        string
	Text        string
	RelevantIDs []string
}

// stratifiedEvalCase is a self-contained ranking case: one query, a small
// document set, and the index of the single relevant document in Docs.
type stratifiedEvalCase struct {
	Category    string // one of "lexical", "paraphrase", "cross-domain", "adversarial"
	Query       string
	Docs        []string
	RelevantIdx int
}

// categoryResult accumulates hit counts for one (category, profile) cell of
// the stratified recall table.
type categoryResult struct {
	hits  int
	total int
}

// stratifiedMetrics aggregates ranking quality over N stratified cases.
type stratifiedMetrics struct {
	RecallAt1      float64
	RecallAt3      float64
	RecallAtWindow float64 // recall within rerankerWindowSize
	MRR            float64
	N              int
}

// evalResult summarizes one full corpus/query evaluation run for a profile.
type evalResult struct {
	Name           string
	RecallAt1      float64
	RecallAt3      float64
	MeanReciprocal float64 // reciprocal rank credited only when relevant lands in the top 3 (see runEvaluation)
	AvgDocEmbedMs  float64
	AvgQueryMs     float64
	Failures       []int // indices of queries whose relevant docs missed the top 3
}

// evalProfile pairs a display name with the engine configuration to evaluate.
type evalProfile struct {
	name string
	cfg  Config
}
60
+
// TestEmbeddingProfileAgentMemoryEval runs the full retrieval evaluation
// (evaluationCorpus x evaluationQueries) against every available embedding
// profile and fails when recall@3 drops below the 0.90 success metric (the
// corpus itself records the target: top-3 results relevant >90% of the time).
//
// The test is gated behind LIBRAVDB_RUN_EMBED_EVAL=1 and
// LIBRAVDB_EVAL_ONNX_RUNTIME because it needs a real ONNX runtime on disk.
func TestEmbeddingProfileAgentMemoryEval(t *testing.T) {
	if os.Getenv("LIBRAVDB_RUN_EMBED_EVAL") != "1" {
		t.Skip("set LIBRAVDB_RUN_EMBED_EVAL=1 to run real embedding evaluation")
	}

	runtimePath := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_ONNX_RUNTIME"))
	if runtimePath == "" {
		t.Skip("set LIBRAVDB_EVAL_ONNX_RUNTIME to evaluate real ONNX-backed embedders")
	}

	docs := evaluationCorpus()
	queries := evaluationQueries()

	// Optional narrowing to a single profile and verbose ranking dumps.
	filter := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_PROFILE_FILTER"))
	debugRankings := os.Getenv("LIBRAVDB_EVAL_DEBUG") == "1"

	profiles := evaluationProfiles(runtimePath)

	for _, profile := range profiles {
		if filter != "" && filter != profile.name {
			continue
		}
		t.Run(profile.name, func(t *testing.T) {
			engine := NewWithConfig(profile.cfg)
			if !engine.Ready() {
				t.Fatalf("engine not ready: %s", engine.Reason())
			}
			t.Logf("engine mode=%s family=%s reason=%q dimensions=%d", engine.Mode(), engine.Profile().Family, engine.Reason(), engine.Dimensions())

			result, err := runEvaluation(t, engine, docs, queries, debugRankings)
			if err != nil {
				t.Fatalf("runEvaluation() error = %v", err)
			}

			t.Logf("profile=%s recall@1=%.3f recall@3=%.3f mrr=%.3f avg_doc_embed_ms=%.2f avg_query_ms=%.2f",
				result.Name, result.RecallAt1, result.RecallAt3, result.MeanReciprocal, result.AvgDocEmbedMs, result.AvgQueryMs)

			// Hard gate on the documented success metric.
			if result.RecallAt3 < 0.90 {
				t.Fatalf("recall@3 %.3f fell below success-metric target", result.RecallAt3)
			}
		})
	}
}
104
+
105
+ func TestFailureSetOverlap(t *testing.T) {
106
+ if os.Getenv("LIBRAVDB_RUN_EMBED_EVAL") != "1" {
107
+ t.Skip("set LIBRAVDB_RUN_EMBED_EVAL=1 to run real embedding evaluation")
108
+ }
109
+
110
+ runtimePath := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_ONNX_RUNTIME"))
111
+ if runtimePath == "" {
112
+ t.Skip("set LIBRAVDB_EVAL_ONNX_RUNTIME to evaluate real ONNX-backed embedders")
113
+ }
114
+
115
+ docs := evaluationCorpus()
116
+ queries := evaluationQueries()
117
+ profiles := evaluationProfiles(runtimePath)
118
+ results := make([]evalResult, 0, len(profiles))
119
+
120
+ for _, profile := range profiles {
121
+ engine := NewWithConfig(profile.cfg)
122
+ if !engine.Ready() {
123
+ t.Fatalf("engine %s not ready: %s", profile.name, engine.Reason())
124
+ }
125
+ result, err := runEvaluation(t, engine, docs, queries, false)
126
+ if err != nil {
127
+ t.Fatalf("runEvaluation(%s) error = %v", profile.name, err)
128
+ }
129
+ result.Name = profile.name
130
+ t.Logf("%s failures=%v", profile.name, result.Failures)
131
+ results = append(results, result)
132
+ }
133
+
134
+ for i := 0; i < len(results); i++ {
135
+ for j := i + 1; j < len(results); j++ {
136
+ jaccard := jaccardFailures(results[i].Failures, results[j].Failures)
137
+ t.Logf("%s vs %s jaccard=%.3f", results[i].Name, results[j].Name, jaccard)
138
+ }
139
+ }
140
+ }
141
+
142
+ func TestStratifiedRecall(t *testing.T) {
143
+ if os.Getenv("LIBRAVDB_RUN_EMBED_EVAL") != "1" {
144
+ t.Skip("set LIBRAVDB_RUN_EMBED_EVAL=1 to run real embedding evaluation")
145
+ }
146
+
147
+ runtimePath := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_ONNX_RUNTIME"))
148
+ if runtimePath == "" {
149
+ t.Skip("set LIBRAVDB_EVAL_ONNX_RUNTIME to evaluate real ONNX-backed embedders")
150
+ }
151
+
152
+ profiles := evaluationProfiles(runtimePath)
153
+ results := map[string]map[string]*categoryResult{}
154
+ cases := stratifiedHarnessCases()
155
+ debugRankings := os.Getenv("LIBRAVDB_EVAL_DEBUG") == "1"
156
+
157
+ for _, testCase := range cases {
158
+ if results[testCase.Category] == nil {
159
+ results[testCase.Category] = map[string]*categoryResult{}
160
+ }
161
+ for _, profile := range profiles {
162
+ if results[testCase.Category][profile.name] == nil {
163
+ results[testCase.Category][profile.name] = &categoryResult{}
164
+ }
165
+ entry := results[testCase.Category][profile.name]
166
+ entry.total++
167
+
168
+ engine := NewWithConfig(profile.cfg)
169
+ if !engine.Ready() {
170
+ t.Fatalf("engine %s not ready: %s", profile.name, engine.Reason())
171
+ }
172
+ if recallAtK(t, engine, testCase, 3, debugRankings) {
173
+ entry.hits++
174
+ }
175
+ }
176
+ }
177
+
178
+ t.Log("")
179
+ t.Log("Stratified recall@3 by category:")
180
+ t.Logf("%-16s %-24s %-24s %-24s",
181
+ "Category", "MiniLM bundled", "MiniLM onnx", "Nomic onnx")
182
+ t.Logf("%s", strings.Repeat("-", 92))
183
+
184
+ for _, category := range []string{"lexical", "paraphrase", "cross-domain", "adversarial"} {
185
+ row := results[category]
186
+ if row == nil {
187
+ continue
188
+ }
189
+ t.Logf("%-16s %-24s %-24s %-24s",
190
+ category,
191
+ formatRecall(row["bundled-all-minilm-l6-v2"]),
192
+ formatRecall(row["onnx-local-all-minilm-l6-v2"]),
193
+ formatRecall(row["onnx-local-nomic-embed-text-v1.5"]))
194
+ }
195
+ }
196
+
197
+ func TestStratifiedMetrics(t *testing.T) {
198
+ if os.Getenv("LIBRAVDB_RUN_EMBED_EVAL") != "1" {
199
+ t.Skip("set LIBRAVDB_RUN_EMBED_EVAL=1 to run real embedding evaluation")
200
+ }
201
+
202
+ runtimePath := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_ONNX_RUNTIME"))
203
+ if runtimePath == "" {
204
+ t.Skip("set LIBRAVDB_EVAL_ONNX_RUNTIME to evaluate real ONNX-backed embedders")
205
+ }
206
+
207
+ profiles := evaluationProfiles(runtimePath)
208
+ cases := stratifiedHarnessCases()
209
+ byCategory := map[string][]stratifiedEvalCase{}
210
+ for _, testCase := range cases {
211
+ byCategory[testCase.Category] = append(byCategory[testCase.Category], testCase)
212
+ }
213
+
214
+ t.Log("")
215
+ t.Log("Stratified metrics by category:")
216
+ for _, category := range []string{"lexical", "paraphrase", "cross-domain", "adversarial"} {
217
+ categoryCases := byCategory[category]
218
+ if len(categoryCases) == 0 {
219
+ continue
220
+ }
221
+ t.Logf("category=%s", category)
222
+ for _, profile := range profiles {
223
+ engine := NewWithConfig(profile.cfg)
224
+ if !engine.Ready() {
225
+ t.Fatalf("engine %s not ready: %s", profile.name, engine.Reason())
226
+ }
227
+ metrics, err := computeStratifiedMetrics(engine, categoryCases)
228
+ if err != nil {
229
+ t.Fatalf("computeStratifiedMetrics(%s, %s) error = %v", profile.name, category, err)
230
+ }
231
+ t.Logf(" profile=%s recall@1=%.3f recall@3=%.3f recall@window=%.3f mrr=%.3f n=%d",
232
+ profile.name, metrics.RecallAt1, metrics.RecallAt3, metrics.RecallAtWindow, metrics.MRR, metrics.N)
233
+ }
234
+ }
235
+ }
236
+
// TestAdversarialFailureDiagnosis dumps the full ranking for one known-hard
// adversarial case ("model confidence score") for every profile, labelling
// the relevant document and logging the score margin between it and the best
// distractor. Log-only diagnostic: it fails only on setup errors.
func TestAdversarialFailureDiagnosis(t *testing.T) {
	if os.Getenv("LIBRAVDB_RUN_EMBED_EVAL") != "1" {
		t.Skip("set LIBRAVDB_RUN_EMBED_EVAL=1 to run real embedding evaluation")
	}

	runtimePath := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_ONNX_RUNTIME"))
	if runtimePath == "" {
		t.Skip("set LIBRAVDB_EVAL_ONNX_RUNTIME to evaluate real ONNX-backed embedders")
	}

	// Locate the diagnosis case by category and query text so the harness
	// slice can be reordered without breaking this test.
	cases := stratifiedHarnessCases()
	target := -1
	for i, testCase := range cases {
		if testCase.Category == "adversarial" && testCase.Query == "model confidence score" {
			target = i
			break
		}
	}
	if target < 0 {
		t.Fatalf("failed to locate adversarial diagnosis case")
	}

	testCase := cases[target]
	profiles := evaluationProfiles(runtimePath)
	for _, profile := range profiles {
		t.Run(profile.name, func(t *testing.T) {
			engine := NewWithConfig(profile.cfg)
			if !engine.Ready() {
				t.Fatalf("engine %s not ready: %s", profile.name, engine.Reason())
			}

			queryVec, err := engine.EmbedQuery(context.Background(), testCase.Query)
			if err != nil {
				t.Fatalf("EmbedQuery() error = %v", err)
			}

			// Score every document in the case against the query.
			type scored struct {
				Index int
				Score float64
			}
			scores := make([]scored, 0, len(testCase.Docs))
			for i, doc := range testCase.Docs {
				docVec, err := engine.EmbedDocument(context.Background(), doc)
				if err != nil {
					t.Fatalf("EmbedDocument(%d) error = %v", i, err)
				}
				scores = append(scores, scored{
					Index: i,
					Score: cosineEval(queryVec, docVec),
				})
			}

			// Descending score; ties break on original index so output is
			// deterministic across runs.
			sort.Slice(scores, func(i, j int) bool {
				if scores[i].Score == scores[j].Score {
					return scores[i].Index < scores[j].Index
				}
				return scores[i].Score > scores[j].Score
			})

			t.Logf("query=%q", testCase.Query)
			for rank, score := range scores {
				label := "distractor"
				if score.Index == testCase.RelevantIdx {
					label = "RELEVANT"
				}
				// Truncate long documents so each ranking entry stays on one
				// log line (byte truncation; corpus text here is ASCII).
				preview := testCase.Docs[score.Index]
				if len(preview) > 72 {
					preview = preview[:72]
				}
				t.Logf("rank=%d score=%.4f [%s] idx=%d %q",
					rank+1, score.Score, label, score.Index, preview)
			}

			// Recover the relevant document's score from the sorted slice.
			var relevantScore float64
			foundRelevant := false
			for _, score := range scores {
				if score.Index == testCase.RelevantIdx {
					relevantScore = score.Score
					foundRelevant = true
					break
				}
			}
			if !foundRelevant {
				t.Fatalf("relevant document missing from scored results")
			}

			// If the relevant document already ranks first, measure the
			// margin against the runner-up so it stays meaningful.
			topDistractorScore := scores[0].Score
			if scores[0].Index == testCase.RelevantIdx && len(scores) > 1 {
				topDistractorScore = scores[1].Score
			}
			t.Logf("margin(relevant-top_distractor)=%.4f", relevantScore-topDistractorScore)
		})
	}
}
331
+
// TestCase2TruncationDiagnostic checks whether the relevant document for
// evaluation query index 2 (the long-context note, per evaluationQueries)
// exceeds each profile's nominal context window, logging token counts and
// flagging active truncation. Log-only diagnostic.
func TestCase2TruncationDiagnostic(t *testing.T) {
	if os.Getenv("LIBRAVDB_RUN_EMBED_EVAL") != "1" {
		t.Skip("set LIBRAVDB_RUN_EMBED_EVAL=1 to run real embedding evaluation")
	}

	runtimePath := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_ONNX_RUNTIME"))
	if runtimePath == "" {
		t.Skip("set LIBRAVDB_EVAL_ONNX_RUNTIME to evaluate real ONNX-backed embedders")
	}

	// Resolve the text of the document that query index 2 considers relevant.
	docs := evaluationCorpus()
	relevantID := evaluationQueries()[2].RelevantIDs[0]
	doc := ""
	for _, candidate := range docs {
		if candidate.ID == relevantID {
			doc = candidate.Text
			break
		}
	}
	if doc == "" {
		t.Fatalf("failed to resolve relevant document for query index 2")
	}
	profiles := evaluationProfiles(runtimePath)

	for _, profile := range profiles {
		t.Run(profile.name, func(t *testing.T) {
			engine := NewWithConfig(profile.cfg)
			if !engine.Ready() {
				t.Fatalf("engine %s not ready: %s", profile.name, engine.Reason())
			}

			tokenCount, err := engine.TokenCountDocument(context.Background(), doc)
			if err != nil {
				t.Fatalf("TokenCountDocument() error = %v", err)
			}
			// Effective tokens = raw count clamped to the profile's window
			// (MaxContextTokens <= 0 means no clamp is applied here).
			maxCtx := engine.Profile().MaxContextTokens
			effective := tokenCount
			if maxCtx > 0 && effective > maxCtx {
				effective = maxCtx
			}

			t.Logf("profile=%s token_count=%d max_context_tokens=%d effective_tokens=%d", profile.name, tokenCount, maxCtx, effective)
			if maxCtx > 0 && tokenCount > maxCtx {
				t.Logf("TRUNCATION ACTIVE: %d tokens exceed nominal profile window", tokenCount-maxCtx)
			}
		})
	}
}
380
+
381
+ func BenchmarkEmbeddingProfileAgentMemory(b *testing.B) {
382
+ runtimePath := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_ONNX_RUNTIME"))
383
+ if runtimePath == "" {
384
+ b.Skip("set LIBRAVDB_EVAL_ONNX_RUNTIME for embedding profile benchmarks")
385
+ }
386
+
387
+ benchmarks := evaluationProfiles(runtimePath)
388
+
389
+ queries := evaluationQueries()
390
+ for _, bench := range benchmarks {
391
+ b.Run(bench.name, func(b *testing.B) {
392
+ engine := NewWithConfig(bench.cfg)
393
+ if !engine.Ready() {
394
+ b.Fatalf("engine not ready: %s", engine.Reason())
395
+ }
396
+
397
+ b.ResetTimer()
398
+ for i := 0; i < b.N; i++ {
399
+ query := queries[i%len(queries)]
400
+ if _, err := engine.EmbedQuery(context.Background(), query.Text); err != nil {
401
+ b.Fatalf("EmbedQuery() error = %v", err)
402
+ }
403
+ }
404
+ })
405
+ }
406
+ }
407
+
408
+ func runEvaluation(t *testing.T, engine *Engine, docs []evalDocument, queries []evalQuery, debugRankings bool) (evalResult, error) {
409
+ type docEmbedding struct {
410
+ ID string
411
+ Vector []float32
412
+ }
413
+
414
+ startDocs := time.Now()
415
+ embeddedDocs := make([]docEmbedding, 0, len(docs))
416
+ for _, doc := range docs {
417
+ vec, err := engine.EmbedDocument(context.Background(), doc.Text)
418
+ if err != nil {
419
+ return evalResult{}, fmt.Errorf("embed doc %s: %w", doc.ID, err)
420
+ }
421
+ embeddedDocs = append(embeddedDocs, docEmbedding{ID: doc.ID, Vector: vec})
422
+ }
423
+ docElapsed := time.Since(startDocs)
424
+
425
+ var hitsAt1, hitsAt3 int
426
+ var reciprocalSum float64
427
+ failures := make([]int, 0)
428
+ startQueries := time.Now()
429
+ for queryIndex, query := range queries {
430
+ queryVec, err := engine.EmbedQuery(context.Background(), query.Text)
431
+ if err != nil {
432
+ return evalResult{}, fmt.Errorf("embed query %s: %w", query.Name, err)
433
+ }
434
+
435
+ type scored struct {
436
+ ID string
437
+ Score float64
438
+ }
439
+ scoredDocs := make([]scored, 0, len(embeddedDocs))
440
+ for _, doc := range embeddedDocs {
441
+ scoredDocs = append(scoredDocs, scored{
442
+ ID: doc.ID,
443
+ Score: cosineEval(doc.Vector, queryVec),
444
+ })
445
+ }
446
+ sort.Slice(scoredDocs, func(i, j int) bool {
447
+ if scoredDocs[i].Score == scoredDocs[j].Score {
448
+ return scoredDocs[i].ID < scoredDocs[j].ID
449
+ }
450
+ return scoredDocs[i].Score > scoredDocs[j].Score
451
+ })
452
+
453
+ relevant := make(map[string]struct{}, len(query.RelevantIDs))
454
+ for _, id := range query.RelevantIDs {
455
+ relevant[id] = struct{}{}
456
+ }
457
+
458
+ if _, ok := relevant[scoredDocs[0].ID]; ok {
459
+ hitsAt1++
460
+ }
461
+
462
+ topK := minEval(3, len(scoredDocs))
463
+ foundTop3 := false
464
+ for i := 0; i < topK; i++ {
465
+ if _, ok := relevant[scoredDocs[i].ID]; ok {
466
+ foundTop3 = true
467
+ reciprocalSum += 1.0 / float64(i+1)
468
+ break
469
+ }
470
+ }
471
+ if foundTop3 {
472
+ hitsAt3++
473
+ } else if debugRankings {
474
+ failures = append(failures, queryIndex)
475
+ t.Logf("miss query=%s top3=%s(%.4f), %s(%.4f), %s(%.4f) relevant=%v",
476
+ query.Name,
477
+ scoredDocs[0].ID, scoredDocs[0].Score,
478
+ scoredDocs[1].ID, scoredDocs[1].Score,
479
+ scoredDocs[2].ID, scoredDocs[2].Score,
480
+ query.RelevantIDs)
481
+ } else {
482
+ failures = append(failures, queryIndex)
483
+ }
484
+ }
485
+ queryElapsed := time.Since(startQueries)
486
+
487
+ return evalResult{
488
+ Name: engine.Profile().Family,
489
+ RecallAt1: float64(hitsAt1) / float64(len(queries)),
490
+ RecallAt3: float64(hitsAt3) / float64(len(queries)),
491
+ MeanReciprocal: reciprocalSum / float64(len(queries)),
492
+ AvgDocEmbedMs: float64(docElapsed.Milliseconds()) / float64(len(docs)),
493
+ AvgQueryMs: float64(queryElapsed.Milliseconds()) / float64(len(queries)),
494
+ Failures: failures,
495
+ }, nil
496
+ }
497
+
498
+ func evaluationProfiles(runtimePath string) []evalProfile {
499
+ profiles := []evalProfile{
500
+ {
501
+ name: "bundled-all-minilm-l6-v2",
502
+ cfg: Config{
503
+ Backend: "bundled",
504
+ Profile: "all-minilm-l6-v2",
505
+ RuntimePath: runtimePath,
506
+ Normalize: true,
507
+ },
508
+ },
509
+ }
510
+
511
+ if miniLMDir := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_MINILM_DIR")); miniLMDir != "" {
512
+ profiles = append(profiles, evalProfile{
513
+ name: "onnx-local-all-minilm-l6-v2",
514
+ cfg: Config{
515
+ Backend: "onnx-local",
516
+ Profile: "all-minilm-l6-v2",
517
+ RuntimePath: runtimePath,
518
+ ModelPath: miniLMDir,
519
+ },
520
+ })
521
+ }
522
+
523
+ if nomicDir := strings.TrimSpace(os.Getenv("LIBRAVDB_EVAL_NOMIC_DIR")); nomicDir != "" {
524
+ profiles = append(profiles, evalProfile{
525
+ name: "onnx-local-nomic-embed-text-v1.5",
526
+ cfg: Config{
527
+ Backend: "onnx-local",
528
+ Profile: "nomic-embed-text-v1.5",
529
+ RuntimePath: runtimePath,
530
+ ModelPath: nomicDir,
531
+ },
532
+ })
533
+ }
534
+
535
+ return profiles
536
+ }
537
+
// stratifiedHarnessCases returns small fixed ranking cases grouped into four
// difficulty strata: lexical (direct keyword overlap with the relevant doc),
// paraphrase (same meaning, different wording), cross-domain (an abstract
// question whose answer is phrased technically), and adversarial (distractors
// deliberately share surface terms with the query). Each case has exactly one
// relevant document at RelevantIdx.
func stratifiedHarnessCases() []stratifiedEvalCase {
	return []stratifiedEvalCase{
		// Lexical: query words appear nearly verbatim in the relevant doc.
		{
			Category: "lexical",
			Query:    "user prefers dark theme",
			Docs: []string{
				"User has expressed a preference for dark mode in the UI settings.",
				"The system applies light theme by default on first launch.",
				"Notification preferences are stored separately from display settings.",
				"Accessibility settings allow higher contrast without changing the color theme.",
				"The account profile stores per-device interface customizations.",
			},
			RelevantIdx: 0,
		},
		{
			Category: "lexical",
			Query:    "compaction runs every session end",
			Docs: []string{
				"Memory compaction is triggered at the end of each active session.",
				"Embedding inference runs during document ingestion.",
				"The eviction policy removes stale model sessions after idle timeout.",
				"Session summaries are re-embedded after they are generated.",
				"Background maintenance can compact oversized collections on demand.",
			},
			RelevantIdx: 0,
		},
		// Paraphrase: relevant doc restates the query without shared keywords.
		{
			Category: "paraphrase",
			Query:    "the system is slow when loading for the first time",
			Docs: []string{
				"Initial model warmup requires loading ONNX weights from disk, which introduces a startup penalty before the first inference request.",
				"The first retrieval after process launch pays a one-time startup cost while the runtime and tokenizer are prepared.",
				"The sidecar process communicates with the host over a Unix domain socket.",
				"Session collections expire after the configured TTL elapses.",
				"The downloader verifies model hashes before placing assets into the shared model directory.",
			},
			RelevantIdx: 0,
		},
		{
			Category: "paraphrase",
			Query:    "teaching the model from past mistakes",
			Docs: []string{
				"When a recalled skill produces a failure outcome, its utility rate decreases and the system schedules an automatic rewrite.",
				"Low-quality memories receive steeper decay so future retrieval relies less heavily on them.",
				"The registry stores versioned skill definitions with prerequisite metadata.",
				"Collision strategies are weighted by historical insight quality ratings.",
				"A stale checkpoint can be restored if the active profile becomes corrupted.",
			},
			RelevantIdx: 0,
		},
		// Cross-domain: abstract phrasing must map onto a technical answer.
		{
			Category: "cross-domain",
			Query:    "how does the system decide what to forget",
			Docs: []string{
				"Records that have not been accessed within the idle TTL accumulate a higher eviction priority score, making them candidates for removal from the active model registry.",
				"Less useful models gradually become more disposable as they sit idle and large in memory.",
				"The BM25 index scores documents by term frequency and inverse document frequency.",
				"Cross-encoder reranking refines the initial candidate set using a second model.",
				"Chunked embeddings are averaged across overlapping windows for long documents.",
			},
			RelevantIdx: 0,
		},
		{
			Category: "cross-domain",
			Query:    "experience makes you better at a task over time",
			Docs: []string{
				"The logarithmic frequency term in the eviction formula means that models accessed many times accumulate resistance to eviction that grows slower as usage increases, stabilizing long-term residents.",
				"Repeated use dampens future eviction pressure so long-serving models become harder to dislodge from memory.",
				"The tokenizer contract separates document and query encoding paths.",
				"Hybrid scoring combines cosine similarity with recency and scope signals.",
				"Vector dimensions must match the collection profile or reopen fails closed.",
			},
			RelevantIdx: 0,
		},
		// Adversarial: distractor at index 0 shares the query's exact words;
		// the relevant doc (index 1) answers the intent instead.
		{
			Category: "adversarial",
			Query:    "model confidence score",
			Docs: []string{
				"The model's confidence in its prediction was scored by the evaluator.",
				"The mean log-probability of generated tokens, exponentiated to [0,1], measures how certain the summarizer was about its output.",
				"The scoring pipeline multiplies retrieval rank by a summary-quality term derived from decay metadata.",
				"Registry status reports the currently loaded profile and dimensions.",
				"Model status includes size, last access time, and cumulative use count.",
			},
			RelevantIdx: 1,
		},
		{
			Category: "adversarial",
			Query:    "session memory cleanup",
			Docs: []string{
				"The IT team scheduled a session to discuss memory cleanup procedures for the legacy database.",
				"Compaction clusters raw conversation turns into summarized records and deletes the source turns after summary insertion is confirmed.",
				"Session records are inserted before source deletion so compaction cannot lose data on partial failure.",
				"The embedding model is loaded once and shared across all inference paths.",
				"Old model artifacts are removed from the cache after hash verification fails.",
			},
			RelevantIdx: 1,
		},
	}
}
638
+
// evaluationCorpus returns the ten agent-memory documents used by the main
// retrieval evaluation. IDs are stable so evaluationQueries can reference
// them by name.
//
// NOTE(review): longContextDoc is defined elsewhere in the package; it
// appears to expand the Nomic note into a long document so the evaluation
// exercises long-context handling — confirm against its definition.
func evaluationCorpus() []evalDocument {
	return []evalDocument{
		{
			ID:   "user_pref_terminal",
			Text: "User preference memory: prefers terminal-first workflows, hates hidden GUI state, and wants plain text explanations that are concise but technically serious.",
		},
		{
			ID:   "project_backend_stability",
			Text: "Engineering note: LibraVDB backend was unstable before HNSW and streaming fixes; after allocator, storage, and race work it is now considered credible enough for the OpenClaw memory plugin backbone.",
		},
		{
			ID:   "agent_memory_scope",
			Text: "Product decision: this memory system is for agent memory and session continuity, not whole-codebase embedding. Retrieval quality for user preferences, project state, and ongoing tasks matters more than raw document indexing breadth.",
		},
		{
			// Long document: the decisive detail sits late in the text.
			ID:   "nomic_context_advantage",
			Text: longContextDoc("Nomic profile note: nomic-embed-text-v1.5 supports a substantially larger context window and matryoshka embeddings. That matters when a memory entry is a long planning trace where the decisive detail appears late in the text. The key retained fact is that the user worries MiniLM's shorter effective context may bite semantic recall in agent workflows."),
		},
		{
			ID:   "minilm_efficiency",
			Text: "MiniLM profile note: all-MiniLM-L6-v2 is lighter on system resources, faster to run locally, and may still be superior if the workload is short-turn memory rather than long-document embedding.",
		},
		{
			ID:   "plugin_host_contract",
			Text: "Host integration note: OpenClaw ignores async plugin registration, so the plugin must register synchronously and lazily start the sidecar on first real use.",
		},
		{
			ID:   "security_untrusted_memory",
			Text: "Security note: recalled memories are untrusted historical context only and must never be followed as instructions.",
		},
		{
			ID:   "compaction_goal",
			Text: "Compaction objective: shrink a large session to a smaller semantic summary while preserving core meaning, instead of letting active sessions grow until they become unusable.",
		},
		{
			ID:   "benchmark_metric",
			Text: "Success metric: top-3 recalled messages should be semantically relevant more than ninety percent of the time.",
		},
		{
			ID:   "slabby_decision",
			Text: "Allocator decision: slot-based slabby is now the default HNSW raw-vector backend because live bytes match the in-memory backend and the remaining delta is mostly reserved headroom.",
		},
	}
}
683
+
684
+ func recallAtK(t *testing.T, engine *Engine, testCase stratifiedEvalCase, k int, debug bool) bool {
685
+ t.Helper()
686
+
687
+ scoredDocs, err := scoreStratifiedCase(engine, testCase)
688
+ if err != nil {
689
+ t.Fatalf("scoreStratifiedCase(%s): %v", testCase.Category, err)
690
+ }
691
+ topK := minEval(k, len(scoredDocs))
692
+ for i := 0; i < topK; i++ {
693
+ if scoredDocs[i].Index == testCase.RelevantIdx {
694
+ return true
695
+ }
696
+ }
697
+ if debug {
698
+ parts := make([]string, 0, topK)
699
+ for i := 0; i < topK; i++ {
700
+ parts = append(parts, fmt.Sprintf("%d(%.4f)", scoredDocs[i].Index, scoredDocs[i].Score))
701
+ }
702
+ t.Logf("stratified miss category=%s query=%q top%d=%s relevant=%d",
703
+ testCase.Category, testCase.Query, topK, strings.Join(parts, ", "), testCase.RelevantIdx)
704
+ }
705
+ return false
706
+ }
707
+
708
+ func computeStratifiedMetrics(engine *Engine, cases []stratifiedEvalCase) (stratifiedMetrics, error) {
709
+ if len(cases) == 0 {
710
+ return stratifiedMetrics{}, nil
711
+ }
712
+
713
+ var hitsAt1, hitsAt3, hitsAtWindow int
714
+ var reciprocalSum float64
715
+ for _, testCase := range cases {
716
+ rank, err := rankStratifiedCase(engine, testCase)
717
+ if err != nil {
718
+ return stratifiedMetrics{}, err
719
+ }
720
+ if rank == 1 {
721
+ hitsAt1++
722
+ }
723
+ if rank <= 3 {
724
+ hitsAt3++
725
+ }
726
+ if rank <= rerankerWindowSize {
727
+ hitsAtWindow++
728
+ }
729
+ reciprocalSum += 1.0 / float64(rank)
730
+ }
731
+
732
+ n := float64(len(cases))
733
+ return stratifiedMetrics{
734
+ RecallAt1: float64(hitsAt1) / n,
735
+ RecallAt3: float64(hitsAt3) / n,
736
+ RecallAtWindow: float64(hitsAtWindow) / n,
737
+ MRR: reciprocalSum / n,
738
+ N: len(cases),
739
+ }, nil
740
+ }
741
+
742
+ func rankStratifiedCase(engine *Engine, testCase stratifiedEvalCase) (int, error) {
743
+ scoredDocs, err := scoreStratifiedCase(engine, testCase)
744
+ if err != nil {
745
+ return 0, err
746
+ }
747
+ for rank, score := range scoredDocs {
748
+ if score.Index == testCase.RelevantIdx {
749
+ return rank + 1, nil
750
+ }
751
+ }
752
+ return len(testCase.Docs) + 1, nil
753
+ }
754
+
755
+ func scoreStratifiedCase(engine *Engine, testCase stratifiedEvalCase) ([]struct {
756
+ Index int
757
+ Score float64
758
+ }, error) {
759
+ embeddedDocs := make([][]float32, 0, len(testCase.Docs))
760
+ for _, doc := range testCase.Docs {
761
+ vec, err := engine.EmbedDocument(context.Background(), doc)
762
+ if err != nil {
763
+ return nil, err
764
+ }
765
+ embeddedDocs = append(embeddedDocs, vec)
766
+ }
767
+
768
+ queryVec, err := engine.EmbedQuery(context.Background(), testCase.Query)
769
+ if err != nil {
770
+ return nil, err
771
+ }
772
+
773
+ scoredDocs := make([]struct {
774
+ Index int
775
+ Score float64
776
+ }, 0, len(embeddedDocs))
777
+ for i, docVec := range embeddedDocs {
778
+ scoredDocs = append(scoredDocs, struct {
779
+ Index int
780
+ Score float64
781
+ }{
782
+ Index: i,
783
+ Score: cosineEval(docVec, queryVec),
784
+ })
785
+ }
786
+ sort.Slice(scoredDocs, func(i, j int) bool {
787
+ if scoredDocs[i].Score == scoredDocs[j].Score {
788
+ return scoredDocs[i].Index < scoredDocs[j].Index
789
+ }
790
+ return scoredDocs[i].Score > scoredDocs[j].Score
791
+ })
792
+ return scoredDocs, nil
793
+ }
794
+
795
+ func formatRecall(r *categoryResult) string {
796
+ if r == nil {
797
+ return "n/a"
798
+ }
799
+ hits, total := r.hits, r.total
800
+ if total == 0 {
801
+ return "n/a"
802
+ }
803
+ return fmt.Sprintf("%.3f (%d/%d)", float64(hits)/float64(total), hits, total)
804
+ }
805
+
806
// evaluationQueries returns the fixed set of retrieval probes used by the
// evaluation. Each entry pairs a natural-language query (Text) with the
// memory-document IDs (RelevantIDs) a good retriever should surface for it;
// Name labels the query in reports.
func evaluationQueries() []evalQuery {
	return []evalQuery{
		{
			Name: "user_style_preference",
			Text: "How does the user want the assistant to communicate in the terminal?",
			RelevantIDs: []string{"user_pref_terminal"},
		},
		{
			Name: "memory_scope",
			Text: "Is this memory plugin intended for whole codebase embeddings or for the agent's own memory and continuity?",
			RelevantIDs: []string{"agent_memory_scope"},
		},
		{
			Name: "nomic_long_context",
			Text: "Which note says longer context and matryoshka support may help preserve important details late in a long memory entry?",
			RelevantIDs: []string{"nomic_context_advantage"},
		},
		{
			Name: "minilm_tradeoff",
			Text: "Which memory says MiniLM might still win because it is faster and uses fewer resources for short memory turns?",
			RelevantIDs: []string{"minilm_efficiency"},
		},
		{
			Name: "plugin_sync_register",
			Text: "Why does the OpenClaw plugin register synchronously and start the sidecar lazily?",
			RelevantIDs: []string{"plugin_host_contract"},
		},
		{
			Name: "security_untrusted",
			Text: "What is the rule about recalled memories and instructions?",
			RelevantIDs: []string{"security_untrusted_memory"},
		},
		{
			Name: "compaction",
			Text: "What is the purpose of compaction in this memory system?",
			RelevantIDs: []string{"compaction_goal"},
		},
		{
			Name: "recall_metric",
			Text: "What recall relevance target do we want in the top three retrieved memories?",
			RelevantIDs: []string{"benchmark_metric"},
		},
		{
			Name: "allocator_choice",
			Text: "Which allocator backend became the default for HNSW raw vectors and why?",
			RelevantIDs: []string{"slabby_decision"},
		},
	}
}
855
+
856
// longContextDoc builds a long document by prepending 24 copies of a filler
// sentence to tail, so the information-bearing text sits late in the entry
// (a probe for long-context embedding quality).
func longContextDoc(tail string) string {
	const filler = "Earlier planning chatter about agent loops, retries, sidecar supervision, benchmark setup, memory scope, collection safety, and local model packaging. "
	var b strings.Builder
	b.Grow(24*len(filler) + len(tail))
	for i := 0; i < 24; i++ {
		b.WriteString(filler)
	}
	b.WriteString(tail)
	return b.String()
}
860
+
861
+ func cosineEval(a, b []float32) float64 {
862
+ if len(a) == 0 || len(a) != len(b) {
863
+ return 0
864
+ }
865
+ var dot, normA, normB float64
866
+ for i := range a {
867
+ av := float64(a[i])
868
+ bv := float64(b[i])
869
+ dot += av * bv
870
+ normA += av * av
871
+ normB += bv * bv
872
+ }
873
+ if normA == 0 || normB == 0 {
874
+ return 0
875
+ }
876
+ return dot / (sqrtEval(normA) * sqrtEval(normB))
877
+ }
878
+
879
// sqrtEval computes the square root of v via Newton's method (Heron's
// method), returning 0 for non-positive input.
//
// Bug fix: the previous version ran a fixed 8 iterations from seed x = v,
// which is wildly inaccurate for large inputs (each early iteration only
// roughly halves x, so e.g. sqrtEval(1e10) returned ~3.9e7 instead of 1e5).
// We now iterate until the sequence stops decreasing, which for this
// iteration (monotone decreasing from a seed above the root) signals
// floating-point convergence to within 1 ulp.
func sqrtEval(v float64) float64 {
	if v <= 0 {
		return 0
	}
	// Seed above the true root: for v >= 1, v itself works; for v < 1,
	// sqrt(v) > v, so start at 1 to stay above the root.
	x := v
	if x < 1 {
		x = 1
	}
	for {
		next := 0.5 * (x + v/x)
		// Once the update stops strictly decreasing, we have converged.
		if next >= x {
			return x
		}
		x = next
	}
}
889
+
890
// minEval returns the smaller of two ints.
func minEval(a, b int) int {
	if b < a {
		return b
	}
	return a
}
896
+
897
// jaccardFailures computes the Jaccard similarity |A∩B| / |A∪B| of two
// failure-index lists, after deduplicating each side. Two empty lists are
// defined as perfectly similar (1).
func jaccardFailures(a, b []int) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1
	}

	asSet := func(values []int) map[int]struct{} {
		set := make(map[int]struct{}, len(values))
		for _, v := range values {
			set[v] = struct{}{}
		}
		return set
	}
	setA, setB := asSet(a), asSet(b)

	shared := 0
	for v := range setB {
		if _, ok := setA[v]; ok {
			shared++
		}
	}
	union := len(setA) + len(setB) - shared
	if union == 0 {
		return 1
	}
	return float64(shared) / float64(union)
}