@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/README.md +3 -3
  2. package/bin/cli.js +1 -1
  3. package/bin/commands/config.js +1 -1
  4. package/dist/index.cjs +1 -1
  5. package/dist/index.js +1 -1
  6. package/package.json +2 -2
  7. package/packages/doctor/src/checks/local-memory.js +2 -2
  8. package/packages/memory/README.md +2 -2
  9. package/packages/memory/openclaw-plugin/README.md +2 -2
  10. package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
  11. package/packages/memory/src/server.js +2 -2
  12. package/packages/memory-engine-v2/.env.example +30 -0
  13. package/packages/memory-engine-v2/README.md +125 -0
  14. package/packages/memory-engine-v2/compat/Dockerfile +11 -0
  15. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  16. package/packages/memory-engine-v2/compat/server.py +1047 -0
  17. package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
  18. package/packages/memory-engine-v2/docker-compose.yml +206 -0
  19. package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
  20. package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
  21. package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
  22. package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
  23. package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
  24. package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
  25. package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
  26. package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
  27. package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
  28. package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
  29. package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
  30. package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
  31. package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
  32. package/packages/memory-engine/.env.example +0 -13
  33. package/packages/memory-engine/MIGRATION.md +0 -219
  34. package/packages/memory-engine/README.md +0 -145
  35. package/packages/memory-engine/bench/README.md +0 -99
  36. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
  37. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
  38. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
  39. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
  40. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
  41. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
  42. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
  43. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
  44. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
  45. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
  48. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
  49. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
  50. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
  51. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
  60. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
  61. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
  62. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
  63. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
  66. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
  67. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
  68. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
  69. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
  78. package/packages/memory-engine/compat/Dockerfile +0 -22
  79. package/packages/memory-engine/compat/server.py +0 -1255
  80. package/packages/memory-engine/docker-compose.test.yml +0 -59
  81. package/packages/memory-engine/docker-compose.yml +0 -255
  82. package/packages/memory-engine/engine/README.md +0 -52
  83. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
  84. package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
  85. package/packages/memory-engine/engine/l6-document-store.py +0 -1018
  86. package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
  87. package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -562
  88. package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
  89. package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
  90. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
  91. package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
  92. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
  93. package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
  94. package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
  95. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
  96. package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
  97. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  98. package/packages/memory-engine/pme_memory/__main__.py +0 -129
  99. package/packages/memory-engine/pme_memory/artifacts.py +0 -95
  100. package/packages/memory-engine/pme_memory/embed.py +0 -74
  101. package/packages/memory-engine/pme_memory/health.py +0 -36
  102. package/packages/memory-engine/pme_memory/hygiene.py +0 -159
  103. package/packages/memory-engine/pme_memory/indexer.py +0 -200
  104. package/packages/memory-engine/pme_memory/needs.py +0 -55
  105. package/packages/memory-engine/pme_memory/provenance.py +0 -80
  106. package/packages/memory-engine/pme_memory/scoring.py +0 -168
  107. package/packages/memory-engine/pme_memory/search.py +0 -52
  108. package/packages/memory-engine/pme_memory/store.py +0 -86
  109. package/packages/memory-engine/pme_memory/synthesis.py +0 -114
  110. package/packages/memory-engine/pyproject.toml +0 -65
  111. package/packages/memory-engine/scripts/kg-extractor.py +0 -557
  112. package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
  113. package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
  114. package/packages/memory-engine/tests/e2e_arena.sh +0 -259
  115. package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
  116. package/packages/memory-engine/tests/embed_stub/server.py +0 -80
  117. package/packages/memory-engine/tests/test_aggregate.py +0 -333
  118. package/packages/memory-engine/tests/test_api_contract.sh +0 -57
  119. package/packages/memory-engine/tests/test_arena_safety.py +0 -232
  120. package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
  121. package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
  122. package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
  123. package/packages/memory-engine/tests/test_embed_provider.py +0 -693
  124. package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
  125. package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
  126. package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
  127. package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
@@ -1,542 +0,0 @@
1
- {
2
- "bench": "chat-recall",
3
- "stack": "pentatonic-memory",
4
- "n_tasks": 16,
5
- "n_correct": 2,
6
- "accuracy": 0.125,
7
- "mean_score": 0.15625,
8
- "p50_search_ms": 25.818128502578475,
9
- "p95_search_ms": 31.491656991420314,
10
- "total_tokens_in": 0,
11
- "total_tokens_out": 0,
12
- "total_usd": 0.0,
13
- "by_tag": {
14
- "factoid": {
15
- "n": 6,
16
- "mean_score": 0.16666666666666666,
17
- "accuracy": 0.16666666666666666
18
- },
19
- "owner": {
20
- "n": 4,
21
- "mean_score": 0.25,
22
- "accuracy": 0.25
23
- },
24
- "temporal": {
25
- "n": 4,
26
- "mean_score": 0.0,
27
- "accuracy": 0.0
28
- },
29
- "contradiction": {
30
- "n": 2,
31
- "mean_score": 0.0,
32
- "accuracy": 0.0
33
- },
34
- "status": {
35
- "n": 4,
36
- "mean_score": 0.25,
37
- "accuracy": 0.25
38
- },
39
- "multi-hop": {
40
- "n": 4,
41
- "mean_score": 0.125,
42
- "accuracy": 0.0
43
- }
44
- },
45
- "extra": {
46
- "ingest_ms": 16568.97608199506,
47
- "grading": "substring",
48
- "limit": 3,
49
- "tokens": {
50
- "corpus_tokens": 513,
51
- "query_tokens": 140,
52
- "context_tokens": 49,
53
- "retrieval_tokens": 189,
54
- "naive_tokens": 8348,
55
- "saved_tokens": 8159,
56
- "reduction_pct": 0.9773598466698611,
57
- "mean_retrieval_tokens_per_task": 11.8125,
58
- "tokenizer": "cl100k_base",
59
- "per_task": {
60
- "who-owns-atlas": {
61
- "query": 5,
62
- "context": 0,
63
- "retrieval": 5,
64
- "judge_in": 0,
65
- "judge_out": 0,
66
- "judge_latency_ms": 0.0
67
- },
68
- "who-owns-borealis": {
69
- "query": 7,
70
- "context": 0,
71
- "retrieval": 7,
72
- "judge_in": 0,
73
- "judge_out": 0,
74
- "judge_latency_ms": 0.0
75
- },
76
- "who-owns-cirrus": {
77
- "query": 6,
78
- "context": 0,
79
- "retrieval": 6,
80
- "judge_in": 0,
81
- "judge_out": 0,
82
- "judge_latency_ms": 0.0
83
- },
84
- "who-owns-dune": {
85
- "query": 6,
86
- "context": 15,
87
- "retrieval": 21,
88
- "judge_in": 0,
89
- "judge_out": 0,
90
- "judge_latency_ms": 0.0
91
- },
92
- "current-deadline-atlas": {
93
- "query": 8,
94
- "context": 0,
95
- "retrieval": 8,
96
- "judge_in": 0,
97
- "judge_out": 0,
98
- "judge_latency_ms": 0.0
99
- },
100
- "current-deadline-borealis": {
101
- "query": 10,
102
- "context": 0,
103
- "retrieval": 10,
104
- "judge_in": 0,
105
- "judge_out": 0,
106
- "judge_latency_ms": 0.0
107
- },
108
- "current-deadline-cirrus": {
109
- "query": 9,
110
- "context": 0,
111
- "retrieval": 9,
112
- "judge_in": 0,
113
- "judge_out": 0,
114
- "judge_latency_ms": 0.0
115
- },
116
- "current-deadline-dune": {
117
- "query": 9,
118
- "context": 0,
119
- "retrieval": 9,
120
- "judge_in": 0,
121
- "judge_out": 0,
122
- "judge_latency_ms": 0.0
123
- },
124
- "status-atlas": {
125
- "query": 8,
126
- "context": 0,
127
- "retrieval": 8,
128
- "judge_in": 0,
129
- "judge_out": 0,
130
- "judge_latency_ms": 0.0
131
- },
132
- "status-borealis": {
133
- "query": 10,
134
- "context": 0,
135
- "retrieval": 10,
136
- "judge_in": 0,
137
- "judge_out": 0,
138
- "judge_latency_ms": 0.0
139
- },
140
- "status-cirrus": {
141
- "query": 9,
142
- "context": 0,
143
- "retrieval": 9,
144
- "judge_in": 0,
145
- "judge_out": 0,
146
- "judge_latency_ms": 0.0
147
- },
148
- "status-dune": {
149
- "query": 9,
150
- "context": 9,
151
- "retrieval": 18,
152
- "judge_in": 0,
153
- "judge_out": 0,
154
- "judge_latency_ms": 0.0
155
- },
156
- "multihop-atlas": {
157
- "query": 10,
158
- "context": 0,
159
- "retrieval": 10,
160
- "judge_in": 0,
161
- "judge_out": 0,
162
- "judge_latency_ms": 0.0
163
- },
164
- "multihop-borealis": {
165
- "query": 12,
166
- "context": 0,
167
- "retrieval": 12,
168
- "judge_in": 0,
169
- "judge_out": 0,
170
- "judge_latency_ms": 0.0
171
- },
172
- "multihop-cirrus": {
173
- "query": 11,
174
- "context": 0,
175
- "retrieval": 11,
176
- "judge_in": 0,
177
- "judge_out": 0,
178
- "judge_latency_ms": 0.0
179
- },
180
- "multihop-dune": {
181
- "query": 11,
182
- "context": 25,
183
- "retrieval": 36,
184
- "judge_in": 0,
185
- "judge_out": 0,
186
- "judge_latency_ms": 0.0
187
- }
188
- },
189
- "judge_tokens_in": 0,
190
- "judge_tokens_out": 0,
191
- "judge_calls": 0,
192
- "judge_mean_latency_ms": 0.0
193
- },
194
- "cost_usd": {
195
- "assumed_completion_tokens_per_task": 100,
196
- "rates": {
197
- "input_per_1k": 0.0025,
198
- "output_per_1k": 0.01,
199
- "model": "gpt-4o"
200
- },
201
- "retrieval_usd_in": 0.00047250000000000005,
202
- "retrieval_usd_out": 0.016,
203
- "retrieval_usd_total": 0.0164725,
204
- "naive_usd_total": 0.03687,
205
- "saved_usd": 0.0203975,
206
- "saved_usd_per_1k_tasks": 1.2748437499999998
207
- }
208
- },
209
- "task_results": [
210
- {
211
- "task_id": "who-owns-atlas",
212
- "query": "Who owns project Atlas?",
213
- "answer": "",
214
- "hits": [],
215
- "correct": false,
216
- "score": 0.0,
217
- "grading_notes": "missing 1/1: ['Alice']",
218
- "search_time_ms": 31.491656991420314,
219
- "generation_time_ms": 0.0,
220
- "tokens_in": 0,
221
- "tokens_out": 0,
222
- "retrieval_tokens": 5,
223
- "query_tokens": 5,
224
- "context_tokens": 0,
225
- "judge_tokens_in": 0,
226
- "judge_tokens_out": 0,
227
- "judge_latency_ms": 0.0
228
- },
229
- {
230
- "task_id": "who-owns-borealis",
231
- "query": "Who owns project Borealis?",
232
- "answer": "",
233
- "hits": [],
234
- "correct": false,
235
- "score": 0.0,
236
- "grading_notes": "missing 1/1: ['Clara']",
237
- "search_time_ms": 23.22632700088434,
238
- "generation_time_ms": 0.0,
239
- "tokens_in": 0,
240
- "tokens_out": 0,
241
- "retrieval_tokens": 7,
242
- "query_tokens": 7,
243
- "context_tokens": 0,
244
- "judge_tokens_in": 0,
245
- "judge_tokens_out": 0,
246
- "judge_latency_ms": 0.0
247
- },
248
- {
249
- "task_id": "who-owns-cirrus",
250
- "query": "Who owns project Cirrus?",
251
- "answer": "",
252
- "hits": [],
253
- "correct": false,
254
- "score": 0.0,
255
- "grading_notes": "missing 1/1: ['Diego']",
256
- "search_time_ms": 25.621167005738243,
257
- "generation_time_ms": 0.0,
258
- "tokens_in": 0,
259
- "tokens_out": 0,
260
- "retrieval_tokens": 6,
261
- "query_tokens": 6,
262
- "context_tokens": 0,
263
- "judge_tokens_in": 0,
264
- "judge_tokens_out": 0,
265
- "judge_latency_ms": 0.0
266
- },
267
- {
268
- "task_id": "who-owns-dune",
269
- "query": "Who owns project Dune?",
270
- "answer": "Farid: I'll own project Dune. Kickoff this week.",
271
- "hits": [
272
- {
273
- "text": "Farid: I'll own project Dune. Kickoff this week.",
274
- "score": 0.5120010814403368,
275
- "source": "pentatonic-memory",
276
- "doc_id": "chat-assign-dune"
277
- }
278
- ],
279
- "correct": true,
280
- "score": 1.0,
281
- "grading_notes": "all substrings matched",
282
- "search_time_ms": 26.015089999418706,
283
- "generation_time_ms": 0.0,
284
- "tokens_in": 0,
285
- "tokens_out": 0,
286
- "retrieval_tokens": 21,
287
- "query_tokens": 6,
288
- "context_tokens": 15,
289
- "judge_tokens_in": 0,
290
- "judge_tokens_out": 0,
291
- "judge_latency_ms": 0.0
292
- },
293
- {
294
- "task_id": "current-deadline-atlas",
295
- "query": "What is the current deadline for Atlas?",
296
- "answer": "",
297
- "hits": [],
298
- "correct": false,
299
- "score": 0.0,
300
- "grading_notes": "missing 1/1: ['2026-03-17']",
301
- "search_time_ms": 24.67792498646304,
302
- "generation_time_ms": 0.0,
303
- "tokens_in": 0,
304
- "tokens_out": 0,
305
- "retrieval_tokens": 8,
306
- "query_tokens": 8,
307
- "context_tokens": 0,
308
- "judge_tokens_in": 0,
309
- "judge_tokens_out": 0,
310
- "judge_latency_ms": 0.0
311
- },
312
- {
313
- "task_id": "current-deadline-borealis",
314
- "query": "What is the current deadline for Borealis?",
315
- "answer": "",
316
- "hits": [],
317
- "correct": false,
318
- "score": 0.0,
319
- "grading_notes": "missing 1/1: ['2026-02-28']",
320
- "search_time_ms": 25.36684399819933,
321
- "generation_time_ms": 0.0,
322
- "tokens_in": 0,
323
- "tokens_out": 0,
324
- "retrieval_tokens": 10,
325
- "query_tokens": 10,
326
- "context_tokens": 0,
327
- "judge_tokens_in": 0,
328
- "judge_tokens_out": 0,
329
- "judge_latency_ms": 0.0
330
- },
331
- {
332
- "task_id": "current-deadline-cirrus",
333
- "query": "What is the current deadline for Cirrus?",
334
- "answer": "",
335
- "hits": [],
336
- "correct": false,
337
- "score": 0.0,
338
- "grading_notes": "missing 1/1: ['2026-04-08']",
339
- "search_time_ms": 26.766681025037542,
340
- "generation_time_ms": 0.0,
341
- "tokens_in": 0,
342
- "tokens_out": 0,
343
- "retrieval_tokens": 9,
344
- "query_tokens": 9,
345
- "context_tokens": 0,
346
- "judge_tokens_in": 0,
347
- "judge_tokens_out": 0,
348
- "judge_latency_ms": 0.0
349
- },
350
- {
351
- "task_id": "current-deadline-dune",
352
- "query": "What is the current deadline for Dune?",
353
- "answer": "",
354
- "hits": [],
355
- "correct": false,
356
- "score": 0.0,
357
- "grading_notes": "missing 1/1: ['2026-05-20']",
358
- "search_time_ms": 26.705369004048407,
359
- "generation_time_ms": 0.0,
360
- "tokens_in": 0,
361
- "tokens_out": 0,
362
- "retrieval_tokens": 9,
363
- "query_tokens": 9,
364
- "context_tokens": 0,
365
- "judge_tokens_in": 0,
366
- "judge_tokens_out": 0,
367
- "judge_latency_ms": 0.0
368
- },
369
- {
370
- "task_id": "status-atlas",
371
- "query": "What's the latest status of Atlas?",
372
- "answer": "",
373
- "hits": [],
374
- "correct": false,
375
- "score": 0.0,
376
- "grading_notes": "missing 1/1: ['on track']",
377
- "search_time_ms": 27.433937008026987,
378
- "generation_time_ms": 0.0,
379
- "tokens_in": 0,
380
- "tokens_out": 0,
381
- "retrieval_tokens": 8,
382
- "query_tokens": 8,
383
- "context_tokens": 0,
384
- "judge_tokens_in": 0,
385
- "judge_tokens_out": 0,
386
- "judge_latency_ms": 0.0
387
- },
388
- {
389
- "task_id": "status-borealis",
390
- "query": "What's the latest status of Borealis?",
391
- "answer": "",
392
- "hits": [],
393
- "correct": false,
394
- "score": 0.0,
395
- "grading_notes": "missing 1/1: ['blocked on vendor']",
396
- "search_time_ms": 29.91680899867788,
397
- "generation_time_ms": 0.0,
398
- "tokens_in": 0,
399
- "tokens_out": 0,
400
- "retrieval_tokens": 10,
401
- "query_tokens": 10,
402
- "context_tokens": 0,
403
- "judge_tokens_in": 0,
404
- "judge_tokens_out": 0,
405
- "judge_latency_ms": 0.0
406
- },
407
- {
408
- "task_id": "status-cirrus",
409
- "query": "What's the latest status of Cirrus?",
410
- "answer": "",
411
- "hits": [],
412
- "correct": false,
413
- "score": 0.0,
414
- "grading_notes": "missing 1/1: ['scoping']",
415
- "search_time_ms": 25.178106006933376,
416
- "generation_time_ms": 0.0,
417
- "tokens_in": 0,
418
- "tokens_out": 0,
419
- "retrieval_tokens": 9,
420
- "query_tokens": 9,
421
- "context_tokens": 0,
422
- "judge_tokens_in": 0,
423
- "judge_tokens_out": 0,
424
- "judge_latency_ms": 0.0
425
- },
426
- {
427
- "task_id": "status-dune",
428
- "query": "What's the latest status of Dune?",
429
- "answer": "Farid: Dune status \u2014 launched.",
430
- "hits": [
431
- {
432
- "text": "Farid: Dune status \u2014 launched.",
433
- "score": 0.5271684290425744,
434
- "source": "pentatonic-memory",
435
- "doc_id": "status-dune-m4"
436
- }
437
- ],
438
- "correct": true,
439
- "score": 1.0,
440
- "grading_notes": "all substrings matched",
441
- "search_time_ms": 23.801564006134868,
442
- "generation_time_ms": 0.0,
443
- "tokens_in": 0,
444
- "tokens_out": 0,
445
- "retrieval_tokens": 18,
446
- "query_tokens": 9,
447
- "context_tokens": 9,
448
- "judge_tokens_in": 0,
449
- "judge_tokens_out": 0,
450
- "judge_latency_ms": 0.0
451
- },
452
- {
453
- "task_id": "multihop-atlas",
454
- "query": "Who owns Atlas and what is its current deadline?",
455
- "answer": "",
456
- "hits": [],
457
- "correct": false,
458
- "score": 0.0,
459
- "grading_notes": "missing 2/2: ['Alice', '2026-03-17']",
460
- "search_time_ms": 22.88174699060619,
461
- "generation_time_ms": 0.0,
462
- "tokens_in": 0,
463
- "tokens_out": 0,
464
- "retrieval_tokens": 10,
465
- "query_tokens": 10,
466
- "context_tokens": 0,
467
- "judge_tokens_in": 0,
468
- "judge_tokens_out": 0,
469
- "judge_latency_ms": 0.0
470
- },
471
- {
472
- "task_id": "multihop-borealis",
473
- "query": "Who owns Borealis and what is its current deadline?",
474
- "answer": "",
475
- "hits": [],
476
- "correct": false,
477
- "score": 0.0,
478
- "grading_notes": "missing 2/2: ['Clara', '2026-02-28']",
479
- "search_time_ms": 22.36511799856089,
480
- "generation_time_ms": 0.0,
481
- "tokens_in": 0,
482
- "tokens_out": 0,
483
- "retrieval_tokens": 12,
484
- "query_tokens": 12,
485
- "context_tokens": 0,
486
- "judge_tokens_in": 0,
487
- "judge_tokens_out": 0,
488
- "judge_latency_ms": 0.0
489
- },
490
- {
491
- "task_id": "multihop-cirrus",
492
- "query": "Who owns Cirrus and what is its current deadline?",
493
- "answer": "",
494
- "hits": [],
495
- "correct": false,
496
- "score": 0.0,
497
- "grading_notes": "missing 2/2: ['Diego', '2026-04-08']",
498
- "search_time_ms": 27.58819400332868,
499
- "generation_time_ms": 0.0,
500
- "tokens_in": 0,
501
- "tokens_out": 0,
502
- "retrieval_tokens": 11,
503
- "query_tokens": 11,
504
- "context_tokens": 0,
505
- "judge_tokens_in": 0,
506
- "judge_tokens_out": 0,
507
- "judge_latency_ms": 0.0
508
- },
509
- {
510
- "task_id": "multihop-dune",
511
- "query": "Who owns Dune and what is its current deadline?",
512
- "answer": "Farid: I'll own project Dune. Kickoff this week.\n---\nFarid: Dune status \u2014 launched.",
513
- "hits": [
514
- {
515
- "text": "Farid: I'll own project Dune. Kickoff this week.",
516
- "score": 0.5420836484839977,
517
- "source": "pentatonic-memory",
518
- "doc_id": "chat-assign-dune"
519
- },
520
- {
521
- "text": "Farid: Dune status \u2014 launched.",
522
- "score": 0.5371697829805622,
523
- "source": "pentatonic-memory",
524
- "doc_id": "status-dune-m4"
525
- }
526
- ],
527
- "correct": false,
528
- "score": 0.5,
529
- "grading_notes": "missing 1/2: ['2026-05-20']",
530
- "search_time_ms": 26.42112597823143,
531
- "generation_time_ms": 0.0,
532
- "tokens_in": 0,
533
- "tokens_out": 0,
534
- "retrieval_tokens": 36,
535
- "query_tokens": 11,
536
- "context_tokens": 25,
537
- "judge_tokens_in": 0,
538
- "judge_tokens_out": 0,
539
- "judge_latency_ms": 0.0
540
- }
541
- ]
542
- }