@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,542 @@
1
+ {
2
+ "bench": "chat-recall",
3
+ "stack": "pentatonic-memory",
4
+ "n_tasks": 16,
5
+ "n_correct": 2,
6
+ "accuracy": 0.125,
7
+ "mean_score": 0.15625,
8
+ "p50_search_ms": 25.818128502578475,
9
+ "p95_search_ms": 31.491656991420314,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 6,
16
+ "mean_score": 0.16666666666666666,
17
+ "accuracy": 0.16666666666666666
18
+ },
19
+ "owner": {
20
+ "n": 4,
21
+ "mean_score": 0.25,
22
+ "accuracy": 0.25
23
+ },
24
+ "temporal": {
25
+ "n": 4,
26
+ "mean_score": 0.0,
27
+ "accuracy": 0.0
28
+ },
29
+ "contradiction": {
30
+ "n": 2,
31
+ "mean_score": 0.0,
32
+ "accuracy": 0.0
33
+ },
34
+ "status": {
35
+ "n": 4,
36
+ "mean_score": 0.25,
37
+ "accuracy": 0.25
38
+ },
39
+ "multi-hop": {
40
+ "n": 4,
41
+ "mean_score": 0.125,
42
+ "accuracy": 0.0
43
+ }
44
+ },
45
+ "extra": {
46
+ "ingest_ms": 16568.97608199506,
47
+ "grading": "substring",
48
+ "limit": 3,
49
+ "tokens": {
50
+ "corpus_tokens": 513,
51
+ "query_tokens": 140,
52
+ "context_tokens": 49,
53
+ "retrieval_tokens": 189,
54
+ "naive_tokens": 8348,
55
+ "saved_tokens": 8159,
56
+ "reduction_pct": 0.9773598466698611,
57
+ "mean_retrieval_tokens_per_task": 11.8125,
58
+ "tokenizer": "cl100k_base",
59
+ "per_task": {
60
+ "who-owns-atlas": {
61
+ "query": 5,
62
+ "context": 0,
63
+ "retrieval": 5,
64
+ "judge_in": 0,
65
+ "judge_out": 0,
66
+ "judge_latency_ms": 0.0
67
+ },
68
+ "who-owns-borealis": {
69
+ "query": 7,
70
+ "context": 0,
71
+ "retrieval": 7,
72
+ "judge_in": 0,
73
+ "judge_out": 0,
74
+ "judge_latency_ms": 0.0
75
+ },
76
+ "who-owns-cirrus": {
77
+ "query": 6,
78
+ "context": 0,
79
+ "retrieval": 6,
80
+ "judge_in": 0,
81
+ "judge_out": 0,
82
+ "judge_latency_ms": 0.0
83
+ },
84
+ "who-owns-dune": {
85
+ "query": 6,
86
+ "context": 15,
87
+ "retrieval": 21,
88
+ "judge_in": 0,
89
+ "judge_out": 0,
90
+ "judge_latency_ms": 0.0
91
+ },
92
+ "current-deadline-atlas": {
93
+ "query": 8,
94
+ "context": 0,
95
+ "retrieval": 8,
96
+ "judge_in": 0,
97
+ "judge_out": 0,
98
+ "judge_latency_ms": 0.0
99
+ },
100
+ "current-deadline-borealis": {
101
+ "query": 10,
102
+ "context": 0,
103
+ "retrieval": 10,
104
+ "judge_in": 0,
105
+ "judge_out": 0,
106
+ "judge_latency_ms": 0.0
107
+ },
108
+ "current-deadline-cirrus": {
109
+ "query": 9,
110
+ "context": 0,
111
+ "retrieval": 9,
112
+ "judge_in": 0,
113
+ "judge_out": 0,
114
+ "judge_latency_ms": 0.0
115
+ },
116
+ "current-deadline-dune": {
117
+ "query": 9,
118
+ "context": 0,
119
+ "retrieval": 9,
120
+ "judge_in": 0,
121
+ "judge_out": 0,
122
+ "judge_latency_ms": 0.0
123
+ },
124
+ "status-atlas": {
125
+ "query": 8,
126
+ "context": 0,
127
+ "retrieval": 8,
128
+ "judge_in": 0,
129
+ "judge_out": 0,
130
+ "judge_latency_ms": 0.0
131
+ },
132
+ "status-borealis": {
133
+ "query": 10,
134
+ "context": 0,
135
+ "retrieval": 10,
136
+ "judge_in": 0,
137
+ "judge_out": 0,
138
+ "judge_latency_ms": 0.0
139
+ },
140
+ "status-cirrus": {
141
+ "query": 9,
142
+ "context": 0,
143
+ "retrieval": 9,
144
+ "judge_in": 0,
145
+ "judge_out": 0,
146
+ "judge_latency_ms": 0.0
147
+ },
148
+ "status-dune": {
149
+ "query": 9,
150
+ "context": 9,
151
+ "retrieval": 18,
152
+ "judge_in": 0,
153
+ "judge_out": 0,
154
+ "judge_latency_ms": 0.0
155
+ },
156
+ "multihop-atlas": {
157
+ "query": 10,
158
+ "context": 0,
159
+ "retrieval": 10,
160
+ "judge_in": 0,
161
+ "judge_out": 0,
162
+ "judge_latency_ms": 0.0
163
+ },
164
+ "multihop-borealis": {
165
+ "query": 12,
166
+ "context": 0,
167
+ "retrieval": 12,
168
+ "judge_in": 0,
169
+ "judge_out": 0,
170
+ "judge_latency_ms": 0.0
171
+ },
172
+ "multihop-cirrus": {
173
+ "query": 11,
174
+ "context": 0,
175
+ "retrieval": 11,
176
+ "judge_in": 0,
177
+ "judge_out": 0,
178
+ "judge_latency_ms": 0.0
179
+ },
180
+ "multihop-dune": {
181
+ "query": 11,
182
+ "context": 25,
183
+ "retrieval": 36,
184
+ "judge_in": 0,
185
+ "judge_out": 0,
186
+ "judge_latency_ms": 0.0
187
+ }
188
+ },
189
+ "judge_tokens_in": 0,
190
+ "judge_tokens_out": 0,
191
+ "judge_calls": 0,
192
+ "judge_mean_latency_ms": 0.0
193
+ },
194
+ "cost_usd": {
195
+ "assumed_completion_tokens_per_task": 100,
196
+ "rates": {
197
+ "input_per_1k": 0.0025,
198
+ "output_per_1k": 0.01,
199
+ "model": "gpt-4o"
200
+ },
201
+ "retrieval_usd_in": 0.00047250000000000005,
202
+ "retrieval_usd_out": 0.016,
203
+ "retrieval_usd_total": 0.0164725,
204
+ "naive_usd_total": 0.03687,
205
+ "saved_usd": 0.0203975,
206
+ "saved_usd_per_1k_tasks": 1.2748437499999998
207
+ }
208
+ },
209
+ "task_results": [
210
+ {
211
+ "task_id": "who-owns-atlas",
212
+ "query": "Who owns project Atlas?",
213
+ "answer": "",
214
+ "hits": [],
215
+ "correct": false,
216
+ "score": 0.0,
217
+ "grading_notes": "missing 1/1: ['Alice']",
218
+ "search_time_ms": 31.491656991420314,
219
+ "generation_time_ms": 0.0,
220
+ "tokens_in": 0,
221
+ "tokens_out": 0,
222
+ "retrieval_tokens": 5,
223
+ "query_tokens": 5,
224
+ "context_tokens": 0,
225
+ "judge_tokens_in": 0,
226
+ "judge_tokens_out": 0,
227
+ "judge_latency_ms": 0.0
228
+ },
229
+ {
230
+ "task_id": "who-owns-borealis",
231
+ "query": "Who owns project Borealis?",
232
+ "answer": "",
233
+ "hits": [],
234
+ "correct": false,
235
+ "score": 0.0,
236
+ "grading_notes": "missing 1/1: ['Clara']",
237
+ "search_time_ms": 23.22632700088434,
238
+ "generation_time_ms": 0.0,
239
+ "tokens_in": 0,
240
+ "tokens_out": 0,
241
+ "retrieval_tokens": 7,
242
+ "query_tokens": 7,
243
+ "context_tokens": 0,
244
+ "judge_tokens_in": 0,
245
+ "judge_tokens_out": 0,
246
+ "judge_latency_ms": 0.0
247
+ },
248
+ {
249
+ "task_id": "who-owns-cirrus",
250
+ "query": "Who owns project Cirrus?",
251
+ "answer": "",
252
+ "hits": [],
253
+ "correct": false,
254
+ "score": 0.0,
255
+ "grading_notes": "missing 1/1: ['Diego']",
256
+ "search_time_ms": 25.621167005738243,
257
+ "generation_time_ms": 0.0,
258
+ "tokens_in": 0,
259
+ "tokens_out": 0,
260
+ "retrieval_tokens": 6,
261
+ "query_tokens": 6,
262
+ "context_tokens": 0,
263
+ "judge_tokens_in": 0,
264
+ "judge_tokens_out": 0,
265
+ "judge_latency_ms": 0.0
266
+ },
267
+ {
268
+ "task_id": "who-owns-dune",
269
+ "query": "Who owns project Dune?",
270
+ "answer": "Farid: I'll own project Dune. Kickoff this week.",
271
+ "hits": [
272
+ {
273
+ "text": "Farid: I'll own project Dune. Kickoff this week.",
274
+ "score": 0.5120010814403368,
275
+ "source": "pentatonic-memory",
276
+ "doc_id": "chat-assign-dune"
277
+ }
278
+ ],
279
+ "correct": true,
280
+ "score": 1.0,
281
+ "grading_notes": "all substrings matched",
282
+ "search_time_ms": 26.015089999418706,
283
+ "generation_time_ms": 0.0,
284
+ "tokens_in": 0,
285
+ "tokens_out": 0,
286
+ "retrieval_tokens": 21,
287
+ "query_tokens": 6,
288
+ "context_tokens": 15,
289
+ "judge_tokens_in": 0,
290
+ "judge_tokens_out": 0,
291
+ "judge_latency_ms": 0.0
292
+ },
293
+ {
294
+ "task_id": "current-deadline-atlas",
295
+ "query": "What is the current deadline for Atlas?",
296
+ "answer": "",
297
+ "hits": [],
298
+ "correct": false,
299
+ "score": 0.0,
300
+ "grading_notes": "missing 1/1: ['2026-03-17']",
301
+ "search_time_ms": 24.67792498646304,
302
+ "generation_time_ms": 0.0,
303
+ "tokens_in": 0,
304
+ "tokens_out": 0,
305
+ "retrieval_tokens": 8,
306
+ "query_tokens": 8,
307
+ "context_tokens": 0,
308
+ "judge_tokens_in": 0,
309
+ "judge_tokens_out": 0,
310
+ "judge_latency_ms": 0.0
311
+ },
312
+ {
313
+ "task_id": "current-deadline-borealis",
314
+ "query": "What is the current deadline for Borealis?",
315
+ "answer": "",
316
+ "hits": [],
317
+ "correct": false,
318
+ "score": 0.0,
319
+ "grading_notes": "missing 1/1: ['2026-02-28']",
320
+ "search_time_ms": 25.36684399819933,
321
+ "generation_time_ms": 0.0,
322
+ "tokens_in": 0,
323
+ "tokens_out": 0,
324
+ "retrieval_tokens": 10,
325
+ "query_tokens": 10,
326
+ "context_tokens": 0,
327
+ "judge_tokens_in": 0,
328
+ "judge_tokens_out": 0,
329
+ "judge_latency_ms": 0.0
330
+ },
331
+ {
332
+ "task_id": "current-deadline-cirrus",
333
+ "query": "What is the current deadline for Cirrus?",
334
+ "answer": "",
335
+ "hits": [],
336
+ "correct": false,
337
+ "score": 0.0,
338
+ "grading_notes": "missing 1/1: ['2026-04-08']",
339
+ "search_time_ms": 26.766681025037542,
340
+ "generation_time_ms": 0.0,
341
+ "tokens_in": 0,
342
+ "tokens_out": 0,
343
+ "retrieval_tokens": 9,
344
+ "query_tokens": 9,
345
+ "context_tokens": 0,
346
+ "judge_tokens_in": 0,
347
+ "judge_tokens_out": 0,
348
+ "judge_latency_ms": 0.0
349
+ },
350
+ {
351
+ "task_id": "current-deadline-dune",
352
+ "query": "What is the current deadline for Dune?",
353
+ "answer": "",
354
+ "hits": [],
355
+ "correct": false,
356
+ "score": 0.0,
357
+ "grading_notes": "missing 1/1: ['2026-05-20']",
358
+ "search_time_ms": 26.705369004048407,
359
+ "generation_time_ms": 0.0,
360
+ "tokens_in": 0,
361
+ "tokens_out": 0,
362
+ "retrieval_tokens": 9,
363
+ "query_tokens": 9,
364
+ "context_tokens": 0,
365
+ "judge_tokens_in": 0,
366
+ "judge_tokens_out": 0,
367
+ "judge_latency_ms": 0.0
368
+ },
369
+ {
370
+ "task_id": "status-atlas",
371
+ "query": "What's the latest status of Atlas?",
372
+ "answer": "",
373
+ "hits": [],
374
+ "correct": false,
375
+ "score": 0.0,
376
+ "grading_notes": "missing 1/1: ['on track']",
377
+ "search_time_ms": 27.433937008026987,
378
+ "generation_time_ms": 0.0,
379
+ "tokens_in": 0,
380
+ "tokens_out": 0,
381
+ "retrieval_tokens": 8,
382
+ "query_tokens": 8,
383
+ "context_tokens": 0,
384
+ "judge_tokens_in": 0,
385
+ "judge_tokens_out": 0,
386
+ "judge_latency_ms": 0.0
387
+ },
388
+ {
389
+ "task_id": "status-borealis",
390
+ "query": "What's the latest status of Borealis?",
391
+ "answer": "",
392
+ "hits": [],
393
+ "correct": false,
394
+ "score": 0.0,
395
+ "grading_notes": "missing 1/1: ['blocked on vendor']",
396
+ "search_time_ms": 29.91680899867788,
397
+ "generation_time_ms": 0.0,
398
+ "tokens_in": 0,
399
+ "tokens_out": 0,
400
+ "retrieval_tokens": 10,
401
+ "query_tokens": 10,
402
+ "context_tokens": 0,
403
+ "judge_tokens_in": 0,
404
+ "judge_tokens_out": 0,
405
+ "judge_latency_ms": 0.0
406
+ },
407
+ {
408
+ "task_id": "status-cirrus",
409
+ "query": "What's the latest status of Cirrus?",
410
+ "answer": "",
411
+ "hits": [],
412
+ "correct": false,
413
+ "score": 0.0,
414
+ "grading_notes": "missing 1/1: ['scoping']",
415
+ "search_time_ms": 25.178106006933376,
416
+ "generation_time_ms": 0.0,
417
+ "tokens_in": 0,
418
+ "tokens_out": 0,
419
+ "retrieval_tokens": 9,
420
+ "query_tokens": 9,
421
+ "context_tokens": 0,
422
+ "judge_tokens_in": 0,
423
+ "judge_tokens_out": 0,
424
+ "judge_latency_ms": 0.0
425
+ },
426
+ {
427
+ "task_id": "status-dune",
428
+ "query": "What's the latest status of Dune?",
429
+ "answer": "Farid: Dune status \u2014 launched.",
430
+ "hits": [
431
+ {
432
+ "text": "Farid: Dune status \u2014 launched.",
433
+ "score": 0.5271684290425744,
434
+ "source": "pentatonic-memory",
435
+ "doc_id": "status-dune-m4"
436
+ }
437
+ ],
438
+ "correct": true,
439
+ "score": 1.0,
440
+ "grading_notes": "all substrings matched",
441
+ "search_time_ms": 23.801564006134868,
442
+ "generation_time_ms": 0.0,
443
+ "tokens_in": 0,
444
+ "tokens_out": 0,
445
+ "retrieval_tokens": 18,
446
+ "query_tokens": 9,
447
+ "context_tokens": 9,
448
+ "judge_tokens_in": 0,
449
+ "judge_tokens_out": 0,
450
+ "judge_latency_ms": 0.0
451
+ },
452
+ {
453
+ "task_id": "multihop-atlas",
454
+ "query": "Who owns Atlas and what is its current deadline?",
455
+ "answer": "",
456
+ "hits": [],
457
+ "correct": false,
458
+ "score": 0.0,
459
+ "grading_notes": "missing 2/2: ['Alice', '2026-03-17']",
460
+ "search_time_ms": 22.88174699060619,
461
+ "generation_time_ms": 0.0,
462
+ "tokens_in": 0,
463
+ "tokens_out": 0,
464
+ "retrieval_tokens": 10,
465
+ "query_tokens": 10,
466
+ "context_tokens": 0,
467
+ "judge_tokens_in": 0,
468
+ "judge_tokens_out": 0,
469
+ "judge_latency_ms": 0.0
470
+ },
471
+ {
472
+ "task_id": "multihop-borealis",
473
+ "query": "Who owns Borealis and what is its current deadline?",
474
+ "answer": "",
475
+ "hits": [],
476
+ "correct": false,
477
+ "score": 0.0,
478
+ "grading_notes": "missing 2/2: ['Clara', '2026-02-28']",
479
+ "search_time_ms": 22.36511799856089,
480
+ "generation_time_ms": 0.0,
481
+ "tokens_in": 0,
482
+ "tokens_out": 0,
483
+ "retrieval_tokens": 12,
484
+ "query_tokens": 12,
485
+ "context_tokens": 0,
486
+ "judge_tokens_in": 0,
487
+ "judge_tokens_out": 0,
488
+ "judge_latency_ms": 0.0
489
+ },
490
+ {
491
+ "task_id": "multihop-cirrus",
492
+ "query": "Who owns Cirrus and what is its current deadline?",
493
+ "answer": "",
494
+ "hits": [],
495
+ "correct": false,
496
+ "score": 0.0,
497
+ "grading_notes": "missing 2/2: ['Diego', '2026-04-08']",
498
+ "search_time_ms": 27.58819400332868,
499
+ "generation_time_ms": 0.0,
500
+ "tokens_in": 0,
501
+ "tokens_out": 0,
502
+ "retrieval_tokens": 11,
503
+ "query_tokens": 11,
504
+ "context_tokens": 0,
505
+ "judge_tokens_in": 0,
506
+ "judge_tokens_out": 0,
507
+ "judge_latency_ms": 0.0
508
+ },
509
+ {
510
+ "task_id": "multihop-dune",
511
+ "query": "Who owns Dune and what is its current deadline?",
512
+ "answer": "Farid: I'll own project Dune. Kickoff this week.\n---\nFarid: Dune status \u2014 launched.",
513
+ "hits": [
514
+ {
515
+ "text": "Farid: I'll own project Dune. Kickoff this week.",
516
+ "score": 0.5420836484839977,
517
+ "source": "pentatonic-memory",
518
+ "doc_id": "chat-assign-dune"
519
+ },
520
+ {
521
+ "text": "Farid: Dune status \u2014 launched.",
522
+ "score": 0.5371697829805622,
523
+ "source": "pentatonic-memory",
524
+ "doc_id": "status-dune-m4"
525
+ }
526
+ ],
527
+ "correct": false,
528
+ "score": 0.5,
529
+ "grading_notes": "missing 1/2: ['2026-05-20']",
530
+ "search_time_ms": 26.42112597823143,
531
+ "generation_time_ms": 0.0,
532
+ "tokens_in": 0,
533
+ "tokens_out": 0,
534
+ "retrieval_tokens": 36,
535
+ "query_tokens": 11,
536
+ "context_tokens": 25,
537
+ "judge_tokens_in": 0,
538
+ "judge_tokens_out": 0,
539
+ "judge_latency_ms": 0.0
540
+ }
541
+ ]
542
+ }