@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,1115 @@
1
+ {
2
+ "bench": "agent-coding",
3
+ "stack": "pentatonic-memory-latest",
4
+ "n_tasks": 22,
5
+ "n_correct": 2,
6
+ "accuracy": 0.09090909090909091,
7
+ "mean_score": 0.13636363636363635,
8
+ "p50_search_ms": 31.551691005006433,
9
+ "p95_search_ms": 36.34764281014213,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "api-signature": {
15
+ "n": 3,
16
+ "mean_score": 0.0,
17
+ "accuracy": 0.0
18
+ },
19
+ "bug-fix-recall": {
20
+ "n": 3,
21
+ "mean_score": 0.0,
22
+ "accuracy": 0.0
23
+ },
24
+ "deprecation": {
25
+ "n": 3,
26
+ "mean_score": 0.0,
27
+ "accuracy": 0.0
28
+ },
29
+ "config-recall": {
30
+ "n": 3,
31
+ "mean_score": 0.0,
32
+ "accuracy": 0.0
33
+ },
34
+ "pr-rationale": {
35
+ "n": 2,
36
+ "mean_score": 0.0,
37
+ "accuracy": 0.0
38
+ },
39
+ "cross-file-refactor": {
40
+ "n": 2,
41
+ "mean_score": 0.25,
42
+ "accuracy": 0.0
43
+ },
44
+ "stack-trace-match": {
45
+ "n": 1,
46
+ "mean_score": 0.0,
47
+ "accuracy": 0.0
48
+ },
49
+ "temporal": {
50
+ "n": 2,
51
+ "mean_score": 0.5,
52
+ "accuracy": 0.5
53
+ },
54
+ "contradiction": {
55
+ "n": 1,
56
+ "mean_score": 0.0,
57
+ "accuracy": 0.0
58
+ },
59
+ "rubric": {
60
+ "n": 2,
61
+ "mean_score": 0.75,
62
+ "accuracy": 0.5
63
+ }
64
+ },
65
+ "extra": {
66
+ "ingest_ms": 111456.33725798689,
67
+ "grading": "substring",
68
+ "limit": 3,
69
+ "tokens": {
70
+ "corpus_tokens": 5164,
71
+ "query_tokens": 339,
72
+ "context_tokens": 898,
73
+ "retrieval_tokens": 1237,
74
+ "naive_tokens": 113947,
75
+ "saved_tokens": 112710,
76
+ "reduction_pct": 0.9891440757545175,
77
+ "mean_retrieval_tokens_per_task": 56.22727272727273,
78
+ "tokenizer": "cl100k_base",
79
+ "per_task": {
80
+ "api-invoice-signature": {
81
+ "query": 18,
82
+ "context": 32,
83
+ "retrieval": 50,
84
+ "judge_in": 0,
85
+ "judge_out": 0,
86
+ "judge_latency_ms": 0.0
87
+ },
88
+ "api-search-shape": {
89
+ "query": 16,
90
+ "context": 28,
91
+ "retrieval": 44,
92
+ "judge_in": 0,
93
+ "judge_out": 0,
94
+ "judge_latency_ms": 0.0
95
+ },
96
+ "api-invoice-amount-type": {
97
+ "query": 16,
98
+ "context": 26,
99
+ "retrieval": 42,
100
+ "judge_in": 0,
101
+ "judge_out": 0,
102
+ "judge_latency_ms": 0.0
103
+ },
104
+ "bugfix-stripe-rounding": {
105
+ "query": 10,
106
+ "context": 47,
107
+ "retrieval": 57,
108
+ "judge_in": 0,
109
+ "judge_out": 0,
110
+ "judge_latency_ms": 0.0
111
+ },
112
+ "bugfix-webhook-dup": {
113
+ "query": 10,
114
+ "context": 46,
115
+ "retrieval": 56,
116
+ "judge_in": 0,
117
+ "judge_out": 0,
118
+ "judge_latency_ms": 0.0
119
+ },
120
+ "bugfix-rounding-direction": {
121
+ "query": 19,
122
+ "context": 33,
123
+ "retrieval": 52,
124
+ "judge_in": 0,
125
+ "judge_out": 0,
126
+ "judge_latency_ms": 0.0
127
+ },
128
+ "deprecation-v1": {
129
+ "query": 11,
130
+ "context": 35,
131
+ "retrieval": 46,
132
+ "judge_in": 0,
133
+ "judge_out": 0,
134
+ "judge_latency_ms": 0.0
135
+ },
136
+ "deprecation-hs256": {
137
+ "query": 11,
138
+ "context": 24,
139
+ "retrieval": 35,
140
+ "judge_in": 0,
141
+ "judge_out": 0,
142
+ "judge_latency_ms": 0.0
143
+ },
144
+ "deprecation-search-tuple": {
145
+ "query": 13,
146
+ "context": 41,
147
+ "retrieval": 54,
148
+ "judge_in": 0,
149
+ "judge_out": 0,
150
+ "judge_latency_ms": 0.0
151
+ },
152
+ "config-retry-policy-base": {
153
+ "query": 15,
154
+ "context": 30,
155
+ "retrieval": 45,
156
+ "judge_in": 0,
157
+ "judge_out": 0,
158
+ "judge_latency_ms": 0.0
159
+ },
160
+ "config-session-ttl": {
161
+ "query": 14,
162
+ "context": 28,
163
+ "retrieval": 42,
164
+ "judge_in": 0,
165
+ "judge_out": 0,
166
+ "judge_latency_ms": 0.0
167
+ },
168
+ "config-search-hybrid-weight": {
169
+ "query": 11,
170
+ "context": 22,
171
+ "retrieval": 33,
172
+ "judge_in": 0,
173
+ "judge_out": 0,
174
+ "judge_latency_ms": 0.0
175
+ },
176
+ "pr-rationale-retry": {
177
+ "query": 19,
178
+ "context": 45,
179
+ "retrieval": 64,
180
+ "judge_in": 0,
181
+ "judge_out": 0,
182
+ "judge_latency_ms": 0.0
183
+ },
184
+ "pr-rationale-jwt-cleanup": {
185
+ "query": 13,
186
+ "context": 49,
187
+ "retrieval": 62,
188
+ "judge_in": 0,
189
+ "judge_out": 0,
190
+ "judge_latency_ms": 0.0
191
+ },
192
+ "cross-ref-invoicing": {
193
+ "query": 14,
194
+ "context": 46,
195
+ "retrieval": 60,
196
+ "judge_in": 0,
197
+ "judge_out": 0,
198
+ "judge_latency_ms": 0.0
199
+ },
200
+ "cross-ref-ratelimit": {
201
+ "query": 15,
202
+ "context": 44,
203
+ "retrieval": 59,
204
+ "judge_in": 0,
205
+ "judge_out": 0,
206
+ "judge_latency_ms": 0.0
207
+ },
208
+ "stack-trace-webhook": {
209
+ "query": 21,
210
+ "context": 38,
211
+ "retrieval": 59,
212
+ "judge_in": 0,
213
+ "judge_out": 0,
214
+ "judge_latency_ms": 0.0
215
+ },
216
+ "temporal-numpy": {
217
+ "query": 11,
218
+ "context": 30,
219
+ "retrieval": 41,
220
+ "judge_in": 0,
221
+ "judge_out": 0,
222
+ "judge_latency_ms": 0.0
223
+ },
224
+ "temporal-on-call": {
225
+ "query": 20,
226
+ "context": 161,
227
+ "retrieval": 181,
228
+ "judge_in": 0,
229
+ "judge_out": 0,
230
+ "judge_latency_ms": 0.0
231
+ },
232
+ "contradiction-algorithm": {
233
+ "query": 16,
234
+ "context": 30,
235
+ "retrieval": 46,
236
+ "judge_in": 0,
237
+ "judge_out": 0,
238
+ "judge_latency_ms": 0.0
239
+ },
240
+ "rubric-security-advice": {
241
+ "query": 25,
242
+ "context": 22,
243
+ "retrieval": 47,
244
+ "judge_in": 336,
245
+ "judge_out": 38,
246
+ "judge_latency_ms": 799.5739569962025
247
+ },
248
+ "rubric-incident-lesson": {
249
+ "query": 21,
250
+ "context": 41,
251
+ "retrieval": 62,
252
+ "judge_in": 335,
253
+ "judge_out": 31,
254
+ "judge_latency_ms": 736.8484830260277
255
+ }
256
+ },
257
+ "judge_tokens_in": 671,
258
+ "judge_tokens_out": 69,
259
+ "judge_calls": 2,
260
+ "judge_mean_latency_ms": 768.2112200111151
261
+ },
262
+ "cost_usd": {
263
+ "assumed_completion_tokens_per_task": 100,
264
+ "rates": {
265
+ "input_per_1k": 0.0025,
266
+ "output_per_1k": 0.01,
267
+ "model": "gpt-4o"
268
+ },
269
+ "retrieval_usd_in": 0.0030925,
270
+ "retrieval_usd_out": 0.022,
271
+ "retrieval_usd_total": 0.0250925,
272
+ "naive_usd_total": 0.3068675,
273
+ "saved_usd": 0.281775,
274
+ "saved_usd_per_1k_tasks": 12.807954545454546
275
+ }
276
+ },
277
+ "task_results": [
278
+ {
279
+ "task_id": "api-invoice-signature",
280
+ "query": "What's the request body shape for POST /v2/invoices on billing-svc?",
281
+ "answer": "billing-svc creates the invoice\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nemail-svc consumes OrderPaid for the receipt email",
282
+ "hits": [
283
+ {
284
+ "text": "billing-svc creates the invoice",
285
+ "score": 0.6878185110502829,
286
+ "source": "pentatonic-memory",
287
+ "doc_id": "cross-ref-invoicing"
288
+ },
289
+ {
290
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
291
+ "score": 0.6615772089674591,
292
+ "source": "pentatonic-memory",
293
+ "doc_id": "cross-ref-invoicing"
294
+ },
295
+ {
296
+ "text": "email-svc consumes OrderPaid for the receipt email",
297
+ "score": 0.647140561545479,
298
+ "source": "pentatonic-memory",
299
+ "doc_id": "cross-ref-invoicing"
300
+ }
301
+ ],
302
+ "correct": false,
303
+ "score": 0.0,
304
+ "grading_notes": "missing 3/3: ['customer_id', 'amount_cents', 'currency']",
305
+ "search_time_ms": 36.31618601502851,
306
+ "generation_time_ms": 0.0,
307
+ "tokens_in": 0,
308
+ "tokens_out": 0,
309
+ "retrieval_tokens": 50,
310
+ "query_tokens": 18,
311
+ "context_tokens": 32,
312
+ "judge_tokens_in": 0,
313
+ "judge_tokens_out": 0,
314
+ "judge_latency_ms": 0.0
315
+ },
316
+ {
317
+ "task_id": "api-search-shape",
318
+ "query": "What's the response shape of POST /v3/search on search-svc?",
319
+ "answer": "search-svc API keys \u2014 alex\n---\nauth-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
320
+ "hits": [
321
+ {
322
+ "text": "search-svc API keys \u2014 alex",
323
+ "score": 0.7780207670459662,
324
+ "source": "pentatonic-memory",
325
+ "doc_id": "security-rotation"
326
+ },
327
+ {
328
+ "text": "auth-svc",
329
+ "score": 0.767741541304893,
330
+ "source": "pentatonic-memory",
331
+ "doc_id": "pr-61-jwt-cleanup"
332
+ },
333
+ {
334
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
335
+ "score": 0.7564221051422783,
336
+ "source": "pentatonic-memory",
337
+ "doc_id": "cross-ref-invoicing"
338
+ }
339
+ ],
340
+ "correct": false,
341
+ "score": 0.0,
342
+ "grading_notes": "missing 3/3: ['results', 'score', 'payload']",
343
+ "search_time_ms": 29.63896800065413,
344
+ "generation_time_ms": 0.0,
345
+ "tokens_in": 0,
346
+ "tokens_out": 0,
347
+ "retrieval_tokens": 44,
348
+ "query_tokens": 16,
349
+ "context_tokens": 28,
350
+ "judge_tokens_in": 0,
351
+ "judge_tokens_out": 0,
352
+ "judge_latency_ms": 0.0
353
+ },
354
+ {
355
+ "task_id": "api-invoice-amount-type",
356
+ "query": "When calling POST /v2/invoices, must amount be integer or float?",
357
+ "answer": "billing.new_invoice_pdf\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nbilling-svc creates the invoice",
358
+ "hits": [
359
+ {
360
+ "text": "billing.new_invoice_pdf",
361
+ "score": 0.776560219295151,
362
+ "source": "pentatonic-memory",
363
+ "doc_id": "flags-feature"
364
+ },
365
+ {
366
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
367
+ "score": 0.7684587271642919,
368
+ "source": "pentatonic-memory",
369
+ "doc_id": "cross-ref-invoicing"
370
+ },
371
+ {
372
+ "text": "billing-svc creates the invoice",
373
+ "score": 0.7572711000324983,
374
+ "source": "pentatonic-memory",
375
+ "doc_id": "cross-ref-invoicing"
376
+ }
377
+ ],
378
+ "correct": false,
379
+ "score": 0.0,
380
+ "grading_notes": "missing 1/1: ['integer']",
381
+ "search_time_ms": 27.488578023621812,
382
+ "generation_time_ms": 0.0,
383
+ "tokens_in": 0,
384
+ "tokens_out": 0,
385
+ "retrieval_tokens": 42,
386
+ "query_tokens": 16,
387
+ "context_tokens": 26,
388
+ "judge_tokens_in": 0,
389
+ "judge_tokens_out": 0,
390
+ "judge_latency_ms": 0.0
391
+ },
392
+ {
393
+ "task_id": "bugfix-stripe-rounding",
394
+ "query": "Which commit fixed the stripe float-amount bug?",
395
+ "answer": "Stripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
396
+ "hits": [
397
+ {
398
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
399
+ "score": 0.7553062567434936,
400
+ "source": "pentatonic-memory",
401
+ "doc_id": "cross-ref-invoicing"
402
+ },
403
+ {
404
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
405
+ "score": 0.7382924461675726,
406
+ "source": "pentatonic-memory",
407
+ "doc_id": "runbook-billing"
408
+ },
409
+ {
410
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
411
+ "score": 0.7307464450116461,
412
+ "source": "pentatonic-memory",
413
+ "doc_id": "cross-ref-invoicing"
414
+ }
415
+ ],
416
+ "correct": false,
417
+ "score": 0.0,
418
+ "grading_notes": "missing 1/1: ['a1b2c3']",
419
+ "search_time_ms": 30.660513002658263,
420
+ "generation_time_ms": 0.0,
421
+ "tokens_in": 0,
422
+ "tokens_out": 0,
423
+ "retrieval_tokens": 57,
424
+ "query_tokens": 10,
425
+ "context_tokens": 47,
426
+ "judge_tokens_in": 0,
427
+ "judge_tokens_out": 0,
428
+ "judge_latency_ms": 0.0
429
+ },
430
+ {
431
+ "task_id": "bugfix-webhook-dup",
432
+ "query": "How did we fix the duplicate Stripe webhook handling?",
433
+ "answer": "Stripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
434
+ "hits": [
435
+ {
436
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
437
+ "score": 0.7754762809760788,
438
+ "source": "pentatonic-memory",
439
+ "doc_id": "cross-ref-invoicing"
440
+ },
441
+ {
442
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
443
+ "score": 0.7753877760783608,
444
+ "source": "pentatonic-memory",
445
+ "doc_id": "cross-ref-invoicing"
446
+ },
447
+ {
448
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
449
+ "score": 0.728722307968613,
450
+ "source": "pentatonic-memory",
451
+ "doc_id": "cross-ref-invoicing"
452
+ }
453
+ ],
454
+ "correct": false,
455
+ "score": 0.0,
456
+ "grading_notes": "missing 2/2: ['idempotent', 'webhook_events']",
457
+ "search_time_ms": 21.82778500718996,
458
+ "generation_time_ms": 0.0,
459
+ "tokens_in": 0,
460
+ "tokens_out": 0,
461
+ "retrieval_tokens": 56,
462
+ "query_tokens": 10,
463
+ "context_tokens": 46,
464
+ "judge_tokens_in": 0,
465
+ "judge_tokens_out": 0,
466
+ "judge_latency_ms": 0.0
467
+ },
468
+ {
469
+ "task_id": "bugfix-rounding-direction",
470
+ "query": "When converting a float amount to cents, do we round up, down, or to nearest?",
471
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling.new_invoice_pdf\n---\nreporting-svc consumes OrderPaid for financial reports",
472
+ "hits": [
473
+ {
474
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
475
+ "score": 0.7034480342700324,
476
+ "source": "pentatonic-memory",
477
+ "doc_id": "runbook-billing"
478
+ },
479
+ {
480
+ "text": "billing.new_invoice_pdf",
481
+ "score": 0.7029726428794822,
482
+ "source": "pentatonic-memory",
483
+ "doc_id": "flags-feature"
484
+ },
485
+ {
486
+ "text": "reporting-svc consumes OrderPaid for financial reports",
487
+ "score": 0.7008979102984831,
488
+ "source": "pentatonic-memory",
489
+ "doc_id": "cross-ref-invoicing"
490
+ }
491
+ ],
492
+ "correct": false,
493
+ "score": 0.0,
494
+ "grading_notes": "missing 1/1: ['floor']",
495
+ "search_time_ms": 24.506276997271925,
496
+ "generation_time_ms": 0.0,
497
+ "tokens_in": 0,
498
+ "tokens_out": 0,
499
+ "retrieval_tokens": 52,
500
+ "query_tokens": 19,
501
+ "context_tokens": 33,
502
+ "judge_tokens_in": 0,
503
+ "judge_tokens_out": 0,
504
+ "judge_latency_ms": 0.0
505
+ },
506
+ {
507
+ "task_id": "deprecation-v1",
508
+ "query": "Are the /api/v1/ endpoints still supported?",
509
+ "answer": "search-svc API keys \u2014 alex\n---\nbilling-svc API keys \u2014 jordan\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
510
+ "hits": [
511
+ {
512
+ "text": "search-svc API keys \u2014 alex",
513
+ "score": 0.7428322749951227,
514
+ "source": "pentatonic-memory",
515
+ "doc_id": "security-rotation"
516
+ },
517
+ {
518
+ "text": "billing-svc API keys \u2014 jordan",
519
+ "score": 0.7305903326471048,
520
+ "source": "pentatonic-memory",
521
+ "doc_id": "security-rotation"
522
+ },
523
+ {
524
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
525
+ "score": 0.7223923554132379,
526
+ "source": "pentatonic-memory",
527
+ "doc_id": "cross-ref-invoicing"
528
+ }
529
+ ],
530
+ "correct": false,
531
+ "score": 0.0,
532
+ "grading_notes": "missing 1/1: ['410']",
533
+ "search_time_ms": 22.493504016892985,
534
+ "generation_time_ms": 0.0,
535
+ "tokens_in": 0,
536
+ "tokens_out": 0,
537
+ "retrieval_tokens": 46,
538
+ "query_tokens": 11,
539
+ "context_tokens": 35,
540
+ "judge_tokens_in": 0,
541
+ "judge_tokens_out": 0,
542
+ "judge_latency_ms": 0.0
543
+ },
544
+ {
545
+ "task_id": "deprecation-hs256",
546
+ "query": "Can auth-svc still accept HS256 JWTs?",
547
+ "answer": "auth-svc\n---\nsearch-svc API keys \u2014 alex\n---\nreporting-svc consumes OrderPaid for financial reports",
548
+ "hits": [
549
+ {
550
+ "text": "auth-svc",
551
+ "score": 0.826945919903118,
552
+ "source": "pentatonic-memory",
553
+ "doc_id": "pr-61-jwt-cleanup"
554
+ },
555
+ {
556
+ "text": "search-svc API keys \u2014 alex",
557
+ "score": 0.7840599423271489,
558
+ "source": "pentatonic-memory",
559
+ "doc_id": "security-rotation"
560
+ },
561
+ {
562
+ "text": "reporting-svc consumes OrderPaid for financial reports",
563
+ "score": 0.7726857160628934,
564
+ "source": "pentatonic-memory",
565
+ "doc_id": "cross-ref-invoicing"
566
+ }
567
+ ],
568
+ "correct": false,
569
+ "score": 0.0,
570
+ "grading_notes": "missing 1/1: ['RS256']",
571
+ "search_time_ms": 28.16912901471369,
572
+ "generation_time_ms": 0.0,
573
+ "tokens_in": 0,
574
+ "tokens_out": 0,
575
+ "retrieval_tokens": 35,
576
+ "query_tokens": 11,
577
+ "context_tokens": 24,
578
+ "judge_tokens_in": 0,
579
+ "judge_tokens_out": 0,
580
+ "judge_latency_ms": 0.0
581
+ },
582
+ {
583
+ "task_id": "deprecation-search-tuple",
584
+ "query": "What does SearchClient.search() return today \u2014 tuples or dicts?",
585
+ "answer": "search-svc API keys \u2014 alex\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
586
+ "hits": [
587
+ {
588
+ "text": "search-svc API keys \u2014 alex",
589
+ "score": 0.7412246001955328,
590
+ "source": "pentatonic-memory",
591
+ "doc_id": "security-rotation"
592
+ },
593
+ {
594
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
595
+ "score": 0.7362727176052715,
596
+ "source": "pentatonic-memory",
597
+ "doc_id": "cross-ref-invoicing"
598
+ },
599
+ {
600
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
601
+ "score": 0.7289934497759295,
602
+ "source": "pentatonic-memory",
603
+ "doc_id": "cross-ref-invoicing"
604
+ }
605
+ ],
606
+ "correct": false,
607
+ "score": 0.0,
608
+ "grading_notes": "missing 1/1: ['dict']",
609
+ "search_time_ms": 27.236303023528308,
610
+ "generation_time_ms": 0.0,
611
+ "tokens_in": 0,
612
+ "tokens_out": 0,
613
+ "retrieval_tokens": 54,
614
+ "query_tokens": 13,
615
+ "context_tokens": 41,
616
+ "judge_tokens_in": 0,
617
+ "judge_tokens_out": 0,
618
+ "judge_latency_ms": 0.0
619
+ },
620
+ {
621
+ "task_id": "config-retry-policy-base",
622
+ "query": "What's the default stripe retry base delay in ms on billing-svc?",
623
+ "answer": "Stripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nkubectl rollout restart deployment/billing-svc\n---\nbilling-svc creates the invoice",
624
+ "hits": [
625
+ {
626
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
627
+ "score": 0.7889042068524353,
628
+ "source": "pentatonic-memory",
629
+ "doc_id": "cross-ref-invoicing"
630
+ },
631
+ {
632
+ "text": "kubectl rollout restart deployment/billing-svc",
633
+ "score": 0.7811488601598245,
634
+ "source": "pentatonic-memory",
635
+ "doc_id": "incident-2026-03-02-webhook"
636
+ },
637
+ {
638
+ "text": "billing-svc creates the invoice",
639
+ "score": 0.7768972403938293,
640
+ "source": "pentatonic-memory",
641
+ "doc_id": "cross-ref-invoicing"
642
+ }
643
+ ],
644
+ "correct": false,
645
+ "score": 0.0,
646
+ "grading_notes": "missing 1/1: ['200']",
647
+ "search_time_ms": 34.75105101824738,
648
+ "generation_time_ms": 0.0,
649
+ "tokens_in": 0,
650
+ "tokens_out": 0,
651
+ "retrieval_tokens": 45,
652
+ "query_tokens": 15,
653
+ "context_tokens": 30,
654
+ "judge_tokens_in": 0,
655
+ "judge_tokens_out": 0,
656
+ "judge_latency_ms": 0.0
657
+ },
658
+ {
659
+ "task_id": "config-session-ttl",
660
+ "query": "What's the default SESSION_TTL_MINUTES on auth-svc?",
661
+ "answer": "auth-svc\n---\nsearch-svc API keys \u2014 alex\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
662
+ "hits": [
663
+ {
664
+ "text": "auth-svc",
665
+ "score": 0.7963131234620584,
666
+ "source": "pentatonic-memory",
667
+ "doc_id": "pr-61-jwt-cleanup"
668
+ },
669
+ {
670
+ "text": "search-svc API keys \u2014 alex",
671
+ "score": 0.7682508782131788,
672
+ "source": "pentatonic-memory",
673
+ "doc_id": "security-rotation"
674
+ },
675
+ {
676
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
677
+ "score": 0.7602115053548171,
678
+ "source": "pentatonic-memory",
679
+ "doc_id": "cross-ref-invoicing"
680
+ }
681
+ ],
682
+ "correct": false,
683
+ "score": 0.0,
684
+ "grading_notes": "missing 1/1: ['60']",
685
+ "search_time_ms": 33.15263401600532,
686
+ "generation_time_ms": 0.0,
687
+ "tokens_in": 0,
688
+ "tokens_out": 0,
689
+ "retrieval_tokens": 42,
690
+ "query_tokens": 14,
691
+ "context_tokens": 28,
692
+ "judge_tokens_in": 0,
693
+ "judge_tokens_out": 0,
694
+ "judge_latency_ms": 0.0
695
+ },
696
+ {
697
+ "task_id": "config-search-hybrid-weight",
698
+ "query": "What's the default hybrid_weight for search-svc?",
699
+ "answer": "search-svc API keys \u2014 alex\n---\nauth-svc\n---\nbilling-svc API keys \u2014 jordan",
700
+ "hits": [
701
+ {
702
+ "text": "search-svc API keys \u2014 alex",
703
+ "score": 0.7764229479828173,
704
+ "source": "pentatonic-memory",
705
+ "doc_id": "security-rotation"
706
+ },
707
+ {
708
+ "text": "auth-svc",
709
+ "score": 0.7586926474565442,
710
+ "source": "pentatonic-memory",
711
+ "doc_id": "pr-61-jwt-cleanup"
712
+ },
713
+ {
714
+ "text": "billing-svc API keys \u2014 jordan",
715
+ "score": 0.7544626968851674,
716
+ "source": "pentatonic-memory",
717
+ "doc_id": "security-rotation"
718
+ }
719
+ ],
720
+ "correct": false,
721
+ "score": 0.0,
722
+ "grading_notes": "missing 1/1: ['0.6']",
723
+ "search_time_ms": 29.454885981976986,
724
+ "generation_time_ms": 0.0,
725
+ "tokens_in": 0,
726
+ "tokens_out": 0,
727
+ "retrieval_tokens": 33,
728
+ "query_tokens": 11,
729
+ "context_tokens": 22,
730
+ "judge_tokens_in": 0,
731
+ "judge_tokens_out": 0,
732
+ "judge_latency_ms": 0.0
733
+ },
734
+ {
735
+ "task_id": "pr-rationale-retry",
736
+ "query": "Why did PR #47 choose base=200ms for stripe retries rather than 500ms?",
737
+ "answer": "Stripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
738
+ "hits": [
739
+ {
740
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
741
+ "score": 0.7420407080086832,
742
+ "source": "pentatonic-memory",
743
+ "doc_id": "cross-ref-invoicing"
744
+ },
745
+ {
746
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
747
+ "score": 0.7282995286809945,
748
+ "source": "pentatonic-memory",
749
+ "doc_id": "runbook-billing"
750
+ },
751
+ {
752
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
753
+ "score": 0.7255587953749841,
754
+ "source": "pentatonic-memory",
755
+ "doc_id": "cross-ref-invoicing"
756
+ }
757
+ ],
758
+ "correct": false,
759
+ "score": 0.0,
760
+ "grading_notes": "missing 2/2: ['200', 'conservative']",
761
+ "search_time_ms": 34.7500900097657,
762
+ "generation_time_ms": 0.0,
763
+ "tokens_in": 0,
764
+ "tokens_out": 0,
765
+ "retrieval_tokens": 64,
766
+ "query_tokens": 19,
767
+ "context_tokens": 45,
768
+ "judge_tokens_in": 0,
769
+ "judge_tokens_out": 0,
770
+ "judge_latency_ms": 0.0
771
+ },
772
+ {
773
+ "task_id": "pr-rationale-jwt-cleanup",
774
+ "query": "What did PR #61 clean up and why was it safe?",
775
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
776
+ "hits": [
777
+ {
778
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
779
+ "score": 0.7184887286283659,
780
+ "source": "pentatonic-memory",
781
+ "doc_id": "runbook-billing"
782
+ },
783
+ {
784
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
785
+ "score": 0.6965767588074839,
786
+ "source": "pentatonic-memory",
787
+ "doc_id": "cross-ref-invoicing"
788
+ },
789
+ {
790
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
791
+ "score": 0.6962552450171736,
792
+ "source": "pentatonic-memory",
793
+ "doc_id": "cross-ref-invoicing"
794
+ }
795
+ ],
796
+ "correct": false,
797
+ "score": 0.0,
798
+ "grading_notes": "missing 2/2: ['JWT_SECRET', 'RS256']",
799
+ "search_time_ms": 30.669441999634728,
800
+ "generation_time_ms": 0.0,
801
+ "tokens_in": 0,
802
+ "tokens_out": 0,
803
+ "retrieval_tokens": 62,
804
+ "query_tokens": 13,
805
+ "context_tokens": 49,
806
+ "judge_tokens_in": 0,
807
+ "judge_tokens_out": 0,
808
+ "judge_latency_ms": 0.0
809
+ },
810
+ {
811
+ "task_id": "cross-ref-invoicing",
812
+ "query": "After Stripe posts invoice.payment_succeeded, which services consume the event?",
813
+ "answer": "Stripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
814
+ "hits": [
815
+ {
816
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
817
+ "score": 0.8498468291248849,
818
+ "source": "pentatonic-memory",
819
+ "doc_id": "cross-ref-invoicing"
820
+ },
821
+ {
822
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
823
+ "score": 0.7846139216003613,
824
+ "source": "pentatonic-memory",
825
+ "doc_id": "cross-ref-invoicing"
826
+ },
827
+ {
828
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
829
+ "score": 0.7683272603742544,
830
+ "source": "pentatonic-memory",
831
+ "doc_id": "cross-ref-invoicing"
832
+ }
833
+ ],
834
+ "correct": false,
835
+ "score": 0.5,
836
+ "grading_notes": "missing 2/4: ['reporting-svc', 'email-svc']",
837
+ "search_time_ms": 34.23739701975137,
838
+ "generation_time_ms": 0.0,
839
+ "tokens_in": 0,
840
+ "tokens_out": 0,
841
+ "retrieval_tokens": 60,
842
+ "query_tokens": 14,
843
+ "context_tokens": 46,
844
+ "judge_tokens_in": 0,
845
+ "judge_tokens_out": 0,
846
+ "judge_latency_ms": 0.0
847
+ },
848
+ {
849
+ "task_id": "cross-ref-ratelimit",
850
+ "query": "Where does the shared rate-limit lib live and what's the method signature?",
851
+ "answer": "billing-svc API keys \u2014 jordan\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
852
+ "hits": [
853
+ {
854
+ "text": "billing-svc API keys \u2014 jordan",
855
+ "score": 0.7469675209281457,
856
+ "source": "pentatonic-memory",
857
+ "doc_id": "security-rotation"
858
+ },
859
+ {
860
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
861
+ "score": 0.7440755319931266,
862
+ "source": "pentatonic-memory",
863
+ "doc_id": "cross-ref-invoicing"
864
+ },
865
+ {
866
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
867
+ "score": 0.7374871245377808,
868
+ "source": "pentatonic-memory",
869
+ "doc_id": "runbook-billing"
870
+ }
871
+ ],
872
+ "correct": false,
873
+ "score": 0.0,
874
+ "grading_notes": "missing 2/2: ['shared/rate_limit.py', 'allow']",
875
+ "search_time_ms": 32.43394001037814,
876
+ "generation_time_ms": 0.0,
877
+ "tokens_in": 0,
878
+ "tokens_out": 0,
879
+ "retrieval_tokens": 59,
880
+ "query_tokens": 15,
881
+ "context_tokens": 44,
882
+ "judge_tokens_in": 0,
883
+ "judge_tokens_out": 0,
884
+ "judge_latency_ms": 0.0
885
+ },
886
+ {
887
+ "task_id": "stack-trace-webhook",
888
+ "query": "If a SignatureVerificationError happens on billing-svc webhook, what's the most likely cause and fix?",
889
+ "answer": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nbilling-svc creates the invoice",
890
+ "hits": [
891
+ {
892
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
893
+ "score": 0.8142024577628363,
894
+ "source": "pentatonic-memory",
895
+ "doc_id": "cross-ref-invoicing"
896
+ },
897
+ {
898
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
899
+ "score": 0.7981789940404692,
900
+ "source": "pentatonic-memory",
901
+ "doc_id": "cross-ref-invoicing"
902
+ },
903
+ {
904
+ "text": "billing-svc creates the invoice",
905
+ "score": 0.7889293966486027,
906
+ "source": "pentatonic-memory",
907
+ "doc_id": "cross-ref-invoicing"
908
+ }
909
+ ],
910
+ "correct": false,
911
+ "score": 0.0,
912
+ "grading_notes": "missing 2/2: ['STRIPE_WEBHOOK_SECRET', 'restart']",
913
+ "search_time_ms": 34.51146400766447,
914
+ "generation_time_ms": 0.0,
915
+ "tokens_in": 0,
916
+ "tokens_out": 0,
917
+ "retrieval_tokens": 59,
918
+ "query_tokens": 21,
919
+ "context_tokens": 38,
920
+ "judge_tokens_in": 0,
921
+ "judge_tokens_out": 0,
922
+ "judge_latency_ms": 0.0
923
+ },
924
+ {
925
+ "task_id": "temporal-numpy",
926
+ "query": "Why was numpy upgraded to 1.26 recently?",
927
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling.new_invoice_pdf\n---\nsearch-svc API keys \u2014 alex",
928
+ "hits": [
929
+ {
930
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
931
+ "score": 0.7134621175527455,
932
+ "source": "pentatonic-memory",
933
+ "doc_id": "runbook-billing"
934
+ },
935
+ {
936
+ "text": "billing.new_invoice_pdf",
937
+ "score": 0.7098919395743223,
938
+ "source": "pentatonic-memory",
939
+ "doc_id": "flags-feature"
940
+ },
941
+ {
942
+ "text": "search-svc API keys \u2014 alex",
943
+ "score": 0.7063170426417354,
944
+ "source": "pentatonic-memory",
945
+ "doc_id": "security-rotation"
946
+ }
947
+ ],
948
+ "correct": false,
949
+ "score": 0.0,
950
+ "grading_notes": "missing 2/2: ['Python 3.12', 'wheels']",
951
+ "search_time_ms": 27.70480400067754,
952
+ "generation_time_ms": 0.0,
953
+ "tokens_in": 0,
954
+ "tokens_out": 0,
955
+ "retrieval_tokens": 41,
956
+ "query_tokens": 11,
957
+ "context_tokens": 30,
958
+ "judge_tokens_in": 0,
959
+ "judge_tokens_out": 0,
960
+ "judge_latency_ms": 0.0
961
+ },
962
+ {
963
+ "task_id": "temporal-on-call",
964
+ "query": "Who is the primary on-call for the week of April 15\u201321, 2026?",
965
+ "answer": "# On-call rotation April 2026\n\n| Week | Primary | Secondary |\n|------|---------|-----------|\n| Apr 1\u20137 | jordan | priya |\n| Apr 8\u201314 | priya | alex |\n| Apr 15\u201321 | alex | jordan |\n| Apr 22\u201328 | jordan | priya |\n| Apr 29\u201330 | priya | alex |\n\nHandoff: Mondays 10:00 UTC, #on-call channel. Previous week's primary\nruns through the incident log.\n\nEscalation path: primary \u2192 secondary \u2192 engineering manager (kim) \u2192 CTO.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nsearch-svc API keys \u2014 alex",
966
+ "hits": [
967
+ {
968
+ "text": "# On-call rotation April 2026\n\n| Week | Primary | Secondary |\n|------|---------|-----------|\n| Apr 1\u20137 | jordan | priya |\n| Apr 8\u201314 | priya | alex |\n| Apr 15\u201321 | alex | jordan |\n| Apr 22\u201328 | jordan | priya |\n| Apr 29\u201330 | priya | alex |\n\nHandoff: Mondays 10:00 UTC, #on-call channel. Previous week's primary\nruns through the incident log.\n\nEscalation path: primary \u2192 secondary \u2192 engineeri",
969
+ "score": 0.7601899128500287,
970
+ "source": "pentatonic-memory",
971
+ "doc_id": "on-call-rotation"
972
+ },
973
+ {
974
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
975
+ "score": 0.7199911219668085,
976
+ "source": "pentatonic-memory",
977
+ "doc_id": "runbook-billing"
978
+ },
979
+ {
980
+ "text": "search-svc API keys \u2014 alex",
981
+ "score": 0.7148886708511676,
982
+ "source": "pentatonic-memory",
983
+ "doc_id": "security-rotation"
984
+ }
985
+ ],
986
+ "correct": true,
987
+ "score": 1.0,
988
+ "grading_notes": "all substrings matched",
989
+ "search_time_ms": 33.89947398682125,
990
+ "generation_time_ms": 0.0,
991
+ "tokens_in": 0,
992
+ "tokens_out": 0,
993
+ "retrieval_tokens": 181,
994
+ "query_tokens": 20,
995
+ "context_tokens": 161,
996
+ "judge_tokens_in": 0,
997
+ "judge_tokens_out": 0,
998
+ "judge_latency_ms": 0.0
999
+ },
1000
+ {
1001
+ "task_id": "contradiction-algorithm",
1002
+ "query": "Which JWT signing algorithm does auth-svc use \u2014 HS256 or RS256?",
1003
+ "answer": "auth-svc\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nsearch-svc API keys \u2014 alex",
1004
+ "hits": [
1005
+ {
1006
+ "text": "auth-svc",
1007
+ "score": 0.8056572717578909,
1008
+ "source": "pentatonic-memory",
1009
+ "doc_id": "pr-61-jwt-cleanup"
1010
+ },
1011
+ {
1012
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
1013
+ "score": 0.7907839118000173,
1014
+ "source": "pentatonic-memory",
1015
+ "doc_id": "cross-ref-invoicing"
1016
+ },
1017
+ {
1018
+ "text": "search-svc API keys \u2014 alex",
1019
+ "score": 0.7886045701123212,
1020
+ "source": "pentatonic-memory",
1021
+ "doc_id": "security-rotation"
1022
+ }
1023
+ ],
1024
+ "correct": false,
1025
+ "score": 0.0,
1026
+ "grading_notes": "missing 1/1: ['RS256']",
1027
+ "search_time_ms": 35.218623001128435,
1028
+ "generation_time_ms": 0.0,
1029
+ "tokens_in": 0,
1030
+ "tokens_out": 0,
1031
+ "retrieval_tokens": 46,
1032
+ "query_tokens": 16,
1033
+ "context_tokens": 30,
1034
+ "judge_tokens_in": 0,
1035
+ "judge_tokens_out": 0,
1036
+ "judge_latency_ms": 0.0
1037
+ },
1038
+ {
1039
+ "task_id": "rubric-security-advice",
1040
+ "query": "A new engineer asks: can I add HS256 as a fallback for older clients in auth-svc? Advise them.",
1041
+ "answer": "auth-svc\n---\nsearch-svc API keys \u2014 alex\n---\nbilling-svc API keys \u2014 jordan",
1042
+ "hits": [
1043
+ {
1044
+ "text": "auth-svc",
1045
+ "score": 0.7950344046907108,
1046
+ "source": "pentatonic-memory",
1047
+ "doc_id": "pr-61-jwt-cleanup"
1048
+ },
1049
+ {
1050
+ "text": "search-svc API keys \u2014 alex",
1051
+ "score": 0.7763007598717464,
1052
+ "source": "pentatonic-memory",
1053
+ "doc_id": "security-rotation"
1054
+ },
1055
+ {
1056
+ "text": "billing-svc API keys \u2014 jordan",
1057
+ "score": 0.7594627948102031,
1058
+ "source": "pentatonic-memory",
1059
+ "doc_id": "security-rotation"
1060
+ }
1061
+ ],
1062
+ "correct": true,
1063
+ "score": 1.0,
1064
+ "grading_notes": "judge=1.00 \u2014 The answer explicitly advises against adding HS256 and cites that all services only hold the public key since the RS256 ",
1065
+ "search_time_ms": 36.35319400927983,
1066
+ "generation_time_ms": 0.0,
1067
+ "tokens_in": 0,
1068
+ "tokens_out": 0,
1069
+ "retrieval_tokens": 47,
1070
+ "query_tokens": 25,
1071
+ "context_tokens": 22,
1072
+ "judge_tokens_in": 336,
1073
+ "judge_tokens_out": 38,
1074
+ "judge_latency_ms": 799.5739569962025
1075
+ },
1076
+ {
1077
+ "task_id": "rubric-incident-lesson",
1078
+ "query": "We saw SignatureVerificationError in billing-svc logs right after a secret rotation. What's our SOP?",
1079
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nbilling-svc creates the invoice",
1080
+ "hits": [
1081
+ {
1082
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
1083
+ "score": 0.790215175803716,
1084
+ "source": "pentatonic-memory",
1085
+ "doc_id": "runbook-billing"
1086
+ },
1087
+ {
1088
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
1089
+ "score": 0.7788197656964213,
1090
+ "source": "pentatonic-memory",
1091
+ "doc_id": "cross-ref-invoicing"
1092
+ },
1093
+ {
1094
+ "text": "billing-svc creates the invoice",
1095
+ "score": 0.7785700277142291,
1096
+ "source": "pentatonic-memory",
1097
+ "doc_id": "cross-ref-invoicing"
1098
+ }
1099
+ ],
1100
+ "correct": false,
1101
+ "score": 0.5,
1102
+ "grading_notes": "judge=0.50 \u2014 Mentioned pod restart but did not reference that secrets only load at pod startup.",
1103
+ "search_time_ms": 34.223557013319805,
1104
+ "generation_time_ms": 0.0,
1105
+ "tokens_in": 0,
1106
+ "tokens_out": 0,
1107
+ "retrieval_tokens": 62,
1108
+ "query_tokens": 21,
1109
+ "context_tokens": 41,
1110
+ "judge_tokens_in": 335,
1111
+ "judge_tokens_out": 31,
1112
+ "judge_latency_ms": 736.8484830260277
1113
+ }
1114
+ ]
1115
+ }