@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,819 @@
1
+ {
2
+ "bench": "chat-recall",
3
+ "stack": "pentatonic-memory-latest",
4
+ "n_tasks": 16,
5
+ "n_correct": 0,
6
+ "accuracy": 0.0,
7
+ "mean_score": 0.0,
8
+ "p50_search_ms": 35.576166497776285,
9
+ "p95_search_ms": 37.78688400052488,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 6,
16
+ "mean_score": 0.0,
17
+ "accuracy": 0.0
18
+ },
19
+ "owner": {
20
+ "n": 4,
21
+ "mean_score": 0.0,
22
+ "accuracy": 0.0
23
+ },
24
+ "temporal": {
25
+ "n": 4,
26
+ "mean_score": 0.0,
27
+ "accuracy": 0.0
28
+ },
29
+ "contradiction": {
30
+ "n": 2,
31
+ "mean_score": 0.0,
32
+ "accuracy": 0.0
33
+ },
34
+ "status": {
35
+ "n": 4,
36
+ "mean_score": 0.0,
37
+ "accuracy": 0.0
38
+ },
39
+ "multi-hop": {
40
+ "n": 4,
41
+ "mean_score": 0.0,
42
+ "accuracy": 0.0
43
+ }
44
+ },
45
+ "extra": {
46
+ "ingest_ms": 32138.345434999792,
47
+ "grading": "substring",
48
+ "limit": 3,
49
+ "tokens": {
50
+ "corpus_tokens": 513,
51
+ "query_tokens": 140,
52
+ "context_tokens": 630,
53
+ "retrieval_tokens": 770,
54
+ "naive_tokens": 8348,
55
+ "saved_tokens": 7578,
56
+ "reduction_pct": 0.907762338284619,
57
+ "mean_retrieval_tokens_per_task": 48.125,
58
+ "tokenizer": "cl100k_base",
59
+ "per_task": {
60
+ "who-owns-atlas": {
61
+ "query": 5,
62
+ "context": 34,
63
+ "retrieval": 39,
64
+ "judge_in": 0,
65
+ "judge_out": 0,
66
+ "judge_latency_ms": 0.0
67
+ },
68
+ "who-owns-borealis": {
69
+ "query": 7,
70
+ "context": 34,
71
+ "retrieval": 41,
72
+ "judge_in": 0,
73
+ "judge_out": 0,
74
+ "judge_latency_ms": 0.0
75
+ },
76
+ "who-owns-cirrus": {
77
+ "query": 6,
78
+ "context": 49,
79
+ "retrieval": 55,
80
+ "judge_in": 0,
81
+ "judge_out": 0,
82
+ "judge_latency_ms": 0.0
83
+ },
84
+ "who-owns-dune": {
85
+ "query": 6,
86
+ "context": 31,
87
+ "retrieval": 37,
88
+ "judge_in": 0,
89
+ "judge_out": 0,
90
+ "judge_latency_ms": 0.0
91
+ },
92
+ "current-deadline-atlas": {
93
+ "query": 8,
94
+ "context": 35,
95
+ "retrieval": 43,
96
+ "judge_in": 0,
97
+ "judge_out": 0,
98
+ "judge_latency_ms": 0.0
99
+ },
100
+ "current-deadline-borealis": {
101
+ "query": 10,
102
+ "context": 34,
103
+ "retrieval": 44,
104
+ "judge_in": 0,
105
+ "judge_out": 0,
106
+ "judge_latency_ms": 0.0
107
+ },
108
+ "current-deadline-cirrus": {
109
+ "query": 9,
110
+ "context": 42,
111
+ "retrieval": 51,
112
+ "judge_in": 0,
113
+ "judge_out": 0,
114
+ "judge_latency_ms": 0.0
115
+ },
116
+ "current-deadline-dune": {
117
+ "query": 9,
118
+ "context": 31,
119
+ "retrieval": 40,
120
+ "judge_in": 0,
121
+ "judge_out": 0,
122
+ "judge_latency_ms": 0.0
123
+ },
124
+ "status-atlas": {
125
+ "query": 8,
126
+ "context": 33,
127
+ "retrieval": 41,
128
+ "judge_in": 0,
129
+ "judge_out": 0,
130
+ "judge_latency_ms": 0.0
131
+ },
132
+ "status-borealis": {
133
+ "query": 10,
134
+ "context": 42,
135
+ "retrieval": 52,
136
+ "judge_in": 0,
137
+ "judge_out": 0,
138
+ "judge_latency_ms": 0.0
139
+ },
140
+ "status-cirrus": {
141
+ "query": 9,
142
+ "context": 50,
143
+ "retrieval": 59,
144
+ "judge_in": 0,
145
+ "judge_out": 0,
146
+ "judge_latency_ms": 0.0
147
+ },
148
+ "status-dune": {
149
+ "query": 9,
150
+ "context": 35,
151
+ "retrieval": 44,
152
+ "judge_in": 0,
153
+ "judge_out": 0,
154
+ "judge_latency_ms": 0.0
155
+ },
156
+ "multihop-atlas": {
157
+ "query": 10,
158
+ "context": 42,
159
+ "retrieval": 52,
160
+ "judge_in": 0,
161
+ "judge_out": 0,
162
+ "judge_latency_ms": 0.0
163
+ },
164
+ "multihop-borealis": {
165
+ "query": 12,
166
+ "context": 50,
167
+ "retrieval": 62,
168
+ "judge_in": 0,
169
+ "judge_out": 0,
170
+ "judge_latency_ms": 0.0
171
+ },
172
+ "multihop-cirrus": {
173
+ "query": 11,
174
+ "context": 50,
175
+ "retrieval": 61,
176
+ "judge_in": 0,
177
+ "judge_out": 0,
178
+ "judge_latency_ms": 0.0
179
+ },
180
+ "multihop-dune": {
181
+ "query": 11,
182
+ "context": 38,
183
+ "retrieval": 49,
184
+ "judge_in": 0,
185
+ "judge_out": 0,
186
+ "judge_latency_ms": 0.0
187
+ }
188
+ },
189
+ "judge_tokens_in": 0,
190
+ "judge_tokens_out": 0,
191
+ "judge_calls": 0,
192
+ "judge_mean_latency_ms": 0.0
193
+ },
194
+ "cost_usd": {
195
+ "assumed_completion_tokens_per_task": 100,
196
+ "rates": {
197
+ "input_per_1k": 0.0025,
198
+ "output_per_1k": 0.01,
199
+ "model": "gpt-4o"
200
+ },
201
+ "retrieval_usd_in": 0.001925,
202
+ "retrieval_usd_out": 0.016,
203
+ "retrieval_usd_total": 0.017925,
204
+ "naive_usd_total": 0.03687,
205
+ "saved_usd": 0.018945,
206
+ "saved_usd_per_1k_tasks": 1.1840625
207
+ }
208
+ },
209
+ "task_results": [
210
+ {
211
+ "task_id": "who-owns-atlas",
212
+ "query": "Who owns project Atlas?",
213
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex",
214
+ "hits": [
215
+ {
216
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
217
+ "score": 0.7094495479744956,
218
+ "source": "pentatonic-memory",
219
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
220
+ },
221
+ {
222
+ "text": "billing-svc API keys \u2014 jordan",
223
+ "score": 0.705149493119418,
224
+ "source": "pentatonic-memory",
225
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
226
+ },
227
+ {
228
+ "text": "search-svc API keys \u2014 alex",
229
+ "score": 0.6992094186827446,
230
+ "source": "pentatonic-memory",
231
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
232
+ }
233
+ ],
234
+ "correct": false,
235
+ "score": 0.0,
236
+ "grading_notes": "missing 1/1: ['Alice']",
237
+ "search_time_ms": 29.684740991797298,
238
+ "generation_time_ms": 0.0,
239
+ "tokens_in": 0,
240
+ "tokens_out": 0,
241
+ "retrieval_tokens": 39,
242
+ "query_tokens": 5,
243
+ "context_tokens": 34,
244
+ "judge_tokens_in": 0,
245
+ "judge_tokens_out": 0,
246
+ "judge_latency_ms": 0.0
247
+ },
248
+ {
249
+ "task_id": "who-owns-borealis",
250
+ "query": "Who owns project Borealis?",
251
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nsearch-svc API keys \u2014 alex\n---\nbilling-svc API keys \u2014 jordan",
252
+ "hits": [
253
+ {
254
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
255
+ "score": 0.6940128053794369,
256
+ "source": "pentatonic-memory",
257
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
258
+ },
259
+ {
260
+ "text": "search-svc API keys \u2014 alex",
261
+ "score": 0.6928977121880233,
262
+ "source": "pentatonic-memory",
263
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
264
+ },
265
+ {
266
+ "text": "billing-svc API keys \u2014 jordan",
267
+ "score": 0.6927844693031652,
268
+ "source": "pentatonic-memory",
269
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
270
+ }
271
+ ],
272
+ "correct": false,
273
+ "score": 0.0,
274
+ "grading_notes": "missing 1/1: ['Clara']",
275
+ "search_time_ms": 27.342910005245358,
276
+ "generation_time_ms": 0.0,
277
+ "tokens_in": 0,
278
+ "tokens_out": 0,
279
+ "retrieval_tokens": 41,
280
+ "query_tokens": 7,
281
+ "context_tokens": 34,
282
+ "judge_tokens_in": 0,
283
+ "judge_tokens_out": 0,
284
+ "judge_latency_ms": 0.0
285
+ },
286
+ {
287
+ "task_id": "who-owns-cirrus",
288
+ "query": "Who owns project Cirrus?",
289
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
290
+ "hits": [
291
+ {
292
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
293
+ "score": 0.7071406152219076,
294
+ "source": "pentatonic-memory",
295
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
296
+ },
297
+ {
298
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
299
+ "score": 0.6962335336552081,
300
+ "source": "pentatonic-memory",
301
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
302
+ },
303
+ {
304
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
305
+ "score": 0.6938935982281584,
306
+ "source": "pentatonic-memory",
307
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
308
+ }
309
+ ],
310
+ "correct": false,
311
+ "score": 0.0,
312
+ "grading_notes": "missing 1/1: ['Diego']",
313
+ "search_time_ms": 23.93999500782229,
314
+ "generation_time_ms": 0.0,
315
+ "tokens_in": 0,
316
+ "tokens_out": 0,
317
+ "retrieval_tokens": 55,
318
+ "query_tokens": 6,
319
+ "context_tokens": 49,
320
+ "judge_tokens_in": 0,
321
+ "judge_tokens_out": 0,
322
+ "judge_latency_ms": 0.0
323
+ },
324
+ {
325
+ "task_id": "who-owns-dune",
326
+ "query": "Who owns project Dune?",
327
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling.new_invoice_pdf\n---\nbilling-svc API keys \u2014 jordan",
328
+ "hits": [
329
+ {
330
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
331
+ "score": 0.6957184482713455,
332
+ "source": "pentatonic-memory",
333
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
334
+ },
335
+ {
336
+ "text": "billing.new_invoice_pdf",
337
+ "score": 0.6797076893489076,
338
+ "source": "pentatonic-memory",
339
+ "doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
340
+ },
341
+ {
342
+ "text": "billing-svc API keys \u2014 jordan",
343
+ "score": 0.6776694304617411,
344
+ "source": "pentatonic-memory",
345
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
346
+ }
347
+ ],
348
+ "correct": false,
349
+ "score": 0.0,
350
+ "grading_notes": "missing 1/1: ['Farid']",
351
+ "search_time_ms": 25.87755100103095,
352
+ "generation_time_ms": 0.0,
353
+ "tokens_in": 0,
354
+ "tokens_out": 0,
355
+ "retrieval_tokens": 37,
356
+ "query_tokens": 6,
357
+ "context_tokens": 31,
358
+ "judge_tokens_in": 0,
359
+ "judge_tokens_out": 0,
360
+ "judge_latency_ms": 0.0
361
+ },
362
+ {
363
+ "task_id": "current-deadline-atlas",
364
+ "query": "What is the current deadline for Atlas?",
365
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling-svc API keys \u2014 jordan\n---\nkubectl rollout restart deployment/billing-svc",
366
+ "hits": [
367
+ {
368
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
369
+ "score": 0.7275709883182019,
370
+ "source": "pentatonic-memory",
371
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
372
+ },
373
+ {
374
+ "text": "billing-svc API keys \u2014 jordan",
375
+ "score": 0.7167343487691263,
376
+ "source": "pentatonic-memory",
377
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
378
+ },
379
+ {
380
+ "text": "kubectl rollout restart deployment/billing-svc",
381
+ "score": 0.7163043601685071,
382
+ "source": "pentatonic-memory",
383
+ "doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
384
+ }
385
+ ],
386
+ "correct": false,
387
+ "score": 0.0,
388
+ "grading_notes": "missing 1/1: ['2026-03-17']",
389
+ "search_time_ms": 37.78688400052488,
390
+ "generation_time_ms": 0.0,
391
+ "tokens_in": 0,
392
+ "tokens_out": 0,
393
+ "retrieval_tokens": 43,
394
+ "query_tokens": 8,
395
+ "context_tokens": 35,
396
+ "judge_tokens_in": 0,
397
+ "judge_tokens_out": 0,
398
+ "judge_latency_ms": 0.0
399
+ },
400
+ {
401
+ "task_id": "current-deadline-borealis",
402
+ "query": "What is the current deadline for Borealis?",
403
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex",
404
+ "hits": [
405
+ {
406
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
407
+ "score": 0.7083399225416024,
408
+ "source": "pentatonic-memory",
409
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
410
+ },
411
+ {
412
+ "text": "billing-svc API keys \u2014 jordan",
413
+ "score": 0.6995845993743947,
414
+ "source": "pentatonic-memory",
415
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
416
+ },
417
+ {
418
+ "text": "search-svc API keys \u2014 alex",
419
+ "score": 0.6989382834457598,
420
+ "source": "pentatonic-memory",
421
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
422
+ }
423
+ ],
424
+ "correct": false,
425
+ "score": 0.0,
426
+ "grading_notes": "missing 1/1: ['2026-02-28']",
427
+ "search_time_ms": 37.3624479980208,
428
+ "generation_time_ms": 0.0,
429
+ "tokens_in": 0,
430
+ "tokens_out": 0,
431
+ "retrieval_tokens": 44,
432
+ "query_tokens": 10,
433
+ "context_tokens": 34,
434
+ "judge_tokens_in": 0,
435
+ "judge_tokens_out": 0,
436
+ "judge_latency_ms": 0.0
437
+ },
438
+ {
439
+ "task_id": "current-deadline-cirrus",
440
+ "query": "What is the current deadline for Cirrus?",
441
+ "answer": "The user's latest Cirrus deadline is April 8, 2026.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc",
442
+ "hits": [
443
+ {
444
+ "text": "The user's latest Cirrus deadline is April 8, 2026.",
445
+ "score": 0.7558460678895833,
446
+ "source": "pentatonic-memory",
447
+ "doc_id": "chat-deadline2-cirrus"
448
+ },
449
+ {
450
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
451
+ "score": 0.7260960333461852,
452
+ "source": "pentatonic-memory",
453
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
454
+ },
455
+ {
456
+ "text": "kubectl rollout restart deployment/billing-svc",
457
+ "score": 0.7114582739003221,
458
+ "source": "pentatonic-memory",
459
+ "doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
460
+ }
461
+ ],
462
+ "correct": false,
463
+ "score": 0.0,
464
+ "grading_notes": "missing 1/1: ['2026-04-08']",
465
+ "search_time_ms": 33.95489399554208,
466
+ "generation_time_ms": 0.0,
467
+ "tokens_in": 0,
468
+ "tokens_out": 0,
469
+ "retrieval_tokens": 51,
470
+ "query_tokens": 9,
471
+ "context_tokens": 42,
472
+ "judge_tokens_in": 0,
473
+ "judge_tokens_out": 0,
474
+ "judge_latency_ms": 0.0
475
+ },
476
+ {
477
+ "task_id": "current-deadline-dune",
478
+ "query": "What is the current deadline for Dune?",
479
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling.new_invoice_pdf\n---\nkubectl rollout restart deployment/billing-svc",
480
+ "hits": [
481
+ {
482
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
483
+ "score": 0.7106671210223783,
484
+ "source": "pentatonic-memory",
485
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
486
+ },
487
+ {
488
+ "text": "billing.new_invoice_pdf",
489
+ "score": 0.7030500225794961,
490
+ "source": "pentatonic-memory",
491
+ "doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
492
+ },
493
+ {
494
+ "text": "kubectl rollout restart deployment/billing-svc",
495
+ "score": 0.696777937772653,
496
+ "source": "pentatonic-memory",
497
+ "doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
498
+ }
499
+ ],
500
+ "correct": false,
501
+ "score": 0.0,
502
+ "grading_notes": "missing 1/1: ['2026-05-20']",
503
+ "search_time_ms": 35.3459480102174,
504
+ "generation_time_ms": 0.0,
505
+ "tokens_in": 0,
506
+ "tokens_out": 0,
507
+ "retrieval_tokens": 40,
508
+ "query_tokens": 9,
509
+ "context_tokens": 31,
510
+ "judge_tokens_in": 0,
511
+ "judge_tokens_out": 0,
512
+ "judge_latency_ms": 0.0
513
+ },
514
+ {
515
+ "task_id": "status-atlas",
516
+ "query": "What's the latest status of Atlas?",
517
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nThe Atlas project is currently in development.\n---\nsearch-svc API keys \u2014 alex",
518
+ "hits": [
519
+ {
520
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
521
+ "score": 0.7254432691030256,
522
+ "source": "pentatonic-memory",
523
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
524
+ },
525
+ {
526
+ "text": "The Atlas project is currently in development.",
527
+ "score": 0.7171795355296468,
528
+ "source": "pentatonic-memory",
529
+ "doc_id": "chat-deadline2-atlas"
530
+ },
531
+ {
532
+ "text": "search-svc API keys \u2014 alex",
533
+ "score": 0.7042820502266752,
534
+ "source": "pentatonic-memory",
535
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
536
+ }
537
+ ],
538
+ "correct": false,
539
+ "score": 0.0,
540
+ "grading_notes": "missing 1/1: ['on track']",
541
+ "search_time_ms": 36.89389998908155,
542
+ "generation_time_ms": 0.0,
543
+ "tokens_in": 0,
544
+ "tokens_out": 0,
545
+ "retrieval_tokens": 41,
546
+ "query_tokens": 8,
547
+ "context_tokens": 33,
548
+ "judge_tokens_in": 0,
549
+ "judge_tokens_out": 0,
550
+ "judge_latency_ms": 0.0
551
+ },
552
+ {
553
+ "task_id": "status-borealis",
554
+ "query": "What's the latest status of Borealis?",
555
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nsearch-svc API keys \u2014 alex\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
556
+ "hits": [
557
+ {
558
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
559
+ "score": 0.6994621643226522,
560
+ "source": "pentatonic-memory",
561
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
562
+ },
563
+ {
564
+ "text": "search-svc API keys \u2014 alex",
565
+ "score": 0.6903014011152464,
566
+ "source": "pentatonic-memory",
567
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
568
+ },
569
+ {
570
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
571
+ "score": 0.6872532831436886,
572
+ "source": "pentatonic-memory",
573
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
574
+ }
575
+ ],
576
+ "correct": false,
577
+ "score": 0.0,
578
+ "grading_notes": "missing 1/1: ['blocked on vendor']",
579
+ "search_time_ms": 36.008898983709514,
580
+ "generation_time_ms": 0.0,
581
+ "tokens_in": 0,
582
+ "tokens_out": 0,
583
+ "retrieval_tokens": 52,
584
+ "query_tokens": 10,
585
+ "context_tokens": 42,
586
+ "judge_tokens_in": 0,
587
+ "judge_tokens_out": 0,
588
+ "judge_latency_ms": 0.0
589
+ },
590
+ {
591
+ "task_id": "status-cirrus",
592
+ "query": "What's the latest status of Cirrus?",
593
+ "answer": "The user's latest Cirrus deadline is April 8, 2026.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
594
+ "hits": [
595
+ {
596
+ "text": "The user's latest Cirrus deadline is April 8, 2026.",
597
+ "score": 0.7447236868936458,
598
+ "source": "pentatonic-memory",
599
+ "doc_id": "chat-deadline2-cirrus"
600
+ },
601
+ {
602
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
603
+ "score": 0.7199085720912451,
604
+ "source": "pentatonic-memory",
605
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
606
+ },
607
+ {
608
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
609
+ "score": 0.701595044183101,
610
+ "source": "pentatonic-memory",
611
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
612
+ }
613
+ ],
614
+ "correct": false,
615
+ "score": 0.0,
616
+ "grading_notes": "missing 1/1: ['scoping']",
617
+ "search_time_ms": 35.968545998912305,
618
+ "generation_time_ms": 0.0,
619
+ "tokens_in": 0,
620
+ "tokens_out": 0,
621
+ "retrieval_tokens": 59,
622
+ "query_tokens": 9,
623
+ "context_tokens": 50,
624
+ "judge_tokens_in": 0,
625
+ "judge_tokens_out": 0,
626
+ "judge_latency_ms": 0.0
627
+ },
628
+ {
629
+ "task_id": "status-dune",
630
+ "query": "What's the latest status of Dune?",
631
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling.new_invoice_pdf\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc",
632
+ "hits": [
633
+ {
634
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
635
+ "score": 0.7076256069009901,
636
+ "source": "pentatonic-memory",
637
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
638
+ },
639
+ {
640
+ "text": "billing.new_invoice_pdf",
641
+ "score": 0.6999989250830485,
642
+ "source": "pentatonic-memory",
643
+ "doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
644
+ },
645
+ {
646
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
647
+ "score": 0.6846413593936166,
648
+ "source": "pentatonic-memory",
649
+ "doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
650
+ }
651
+ ],
652
+ "correct": false,
653
+ "score": 0.0,
654
+ "grading_notes": "missing 1/1: ['launched']",
655
+ "search_time_ms": 35.42196500347927,
656
+ "generation_time_ms": 0.0,
657
+ "tokens_in": 0,
658
+ "tokens_out": 0,
659
+ "retrieval_tokens": 44,
660
+ "query_tokens": 9,
661
+ "context_tokens": 35,
662
+ "judge_tokens_in": 0,
663
+ "judge_tokens_out": 0,
664
+ "judge_latency_ms": 0.0
665
+ },
666
+ {
667
+ "task_id": "multihop-atlas",
668
+ "query": "Who owns Atlas and what is its current deadline?",
669
+ "answer": "The Atlas project is currently in development.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nThe user's latest Cirrus deadline is April 8, 2026.",
670
+ "hits": [
671
+ {
672
+ "text": "The Atlas project is currently in development.",
673
+ "score": 0.7401179380365475,
674
+ "source": "pentatonic-memory",
675
+ "doc_id": "chat-deadline2-atlas"
676
+ },
677
+ {
678
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
679
+ "score": 0.7272517964813692,
680
+ "source": "pentatonic-memory",
681
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
682
+ },
683
+ {
684
+ "text": "The user's latest Cirrus deadline is April 8, 2026.",
685
+ "score": 0.7197845468232443,
686
+ "source": "pentatonic-memory",
687
+ "doc_id": "chat-deadline2-cirrus"
688
+ }
689
+ ],
690
+ "correct": false,
691
+ "score": 0.0,
692
+ "grading_notes": "missing 2/2: ['Alice', '2026-03-17']",
693
+ "search_time_ms": 35.946914984378964,
694
+ "generation_time_ms": 0.0,
695
+ "tokens_in": 0,
696
+ "tokens_out": 0,
697
+ "retrieval_tokens": 52,
698
+ "query_tokens": 10,
699
+ "context_tokens": 42,
700
+ "judge_tokens_in": 0,
701
+ "judge_tokens_out": 0,
702
+ "judge_latency_ms": 0.0
703
+ },
704
+ {
705
+ "task_id": "multihop-borealis",
706
+ "query": "Who owns Borealis and what is its current deadline?",
707
+ "answer": "The user's latest Cirrus deadline is April 8, 2026.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
708
+ "hits": [
709
+ {
710
+ "text": "The user's latest Cirrus deadline is April 8, 2026.",
711
+ "score": 0.7312607039203609,
712
+ "source": "pentatonic-memory",
713
+ "doc_id": "chat-deadline2-cirrus"
714
+ },
715
+ {
716
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
717
+ "score": 0.7117345215943829,
718
+ "source": "pentatonic-memory",
719
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
720
+ },
721
+ {
722
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
723
+ "score": 0.7061180599052448,
724
+ "source": "pentatonic-memory",
725
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
726
+ }
727
+ ],
728
+ "correct": false,
729
+ "score": 0.0,
730
+ "grading_notes": "missing 2/2: ['Clara', '2026-02-28']",
731
+ "search_time_ms": 36.55060098390095,
732
+ "generation_time_ms": 0.0,
733
+ "tokens_in": 0,
734
+ "tokens_out": 0,
735
+ "retrieval_tokens": 62,
736
+ "query_tokens": 12,
737
+ "context_tokens": 50,
738
+ "judge_tokens_in": 0,
739
+ "judge_tokens_out": 0,
740
+ "judge_latency_ms": 0.0
741
+ },
742
+ {
743
+ "task_id": "multihop-cirrus",
744
+ "query": "Who owns Cirrus and what is its current deadline?",
745
+ "answer": "The user's latest Cirrus deadline is April 8, 2026.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
746
+ "hits": [
747
+ {
748
+ "text": "The user's latest Cirrus deadline is April 8, 2026.",
749
+ "score": 0.7949980719613424,
750
+ "source": "pentatonic-memory",
751
+ "doc_id": "chat-deadline2-cirrus"
752
+ },
753
+ {
754
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
755
+ "score": 0.7244226449863047,
756
+ "source": "pentatonic-memory",
757
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
758
+ },
759
+ {
760
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
761
+ "score": 0.7119401034180405,
762
+ "source": "pentatonic-memory",
763
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
764
+ }
765
+ ],
766
+ "correct": false,
767
+ "score": 0.0,
768
+ "grading_notes": "missing 2/2: ['Diego', '2026-04-08']",
769
+ "search_time_ms": 35.7303679920733,
770
+ "generation_time_ms": 0.0,
771
+ "tokens_in": 0,
772
+ "tokens_out": 0,
773
+ "retrieval_tokens": 61,
774
+ "query_tokens": 11,
775
+ "context_tokens": 50,
776
+ "judge_tokens_in": 0,
777
+ "judge_tokens_out": 0,
778
+ "judge_latency_ms": 0.0
779
+ },
780
+ {
781
+ "task_id": "multihop-dune",
782
+ "query": "Who owns Dune and what is its current deadline?",
783
+ "answer": "The user's latest Cirrus deadline is April 8, 2026.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling.new_invoice_pdf",
784
+ "hits": [
785
+ {
786
+ "text": "The user's latest Cirrus deadline is April 8, 2026.",
787
+ "score": 0.7311771760384164,
788
+ "source": "pentatonic-memory",
789
+ "doc_id": "chat-deadline2-cirrus"
790
+ },
791
+ {
792
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
793
+ "score": 0.7118296546980413,
794
+ "source": "pentatonic-memory",
795
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
796
+ },
797
+ {
798
+ "text": "billing.new_invoice_pdf",
799
+ "score": 0.6992845290215867,
800
+ "source": "pentatonic-memory",
801
+ "doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
802
+ }
803
+ ],
804
+ "correct": false,
805
+ "score": 0.0,
806
+ "grading_notes": "missing 2/2: ['Farid', '2026-05-20']",
807
+ "search_time_ms": 35.289291001390666,
808
+ "generation_time_ms": 0.0,
809
+ "tokens_in": 0,
810
+ "tokens_out": 0,
811
+ "retrieval_tokens": 49,
812
+ "query_tokens": 11,
813
+ "context_tokens": 38,
814
+ "judge_tokens_in": 0,
815
+ "judge_tokens_out": 0,
816
+ "judge_latency_ms": 0.0
817
+ }
818
+ ]
819
+ }