@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/README.md +3 -3
  2. package/bin/cli.js +1 -1
  3. package/bin/commands/config.js +1 -1
  4. package/dist/index.cjs +1 -1
  5. package/dist/index.js +1 -1
  6. package/package.json +2 -2
  7. package/packages/doctor/src/checks/local-memory.js +2 -2
  8. package/packages/memory/README.md +2 -2
  9. package/packages/memory/openclaw-plugin/README.md +2 -2
  10. package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
  11. package/packages/memory/src/server.js +2 -2
  12. package/packages/memory-engine-v2/.env.example +30 -0
  13. package/packages/memory-engine-v2/README.md +125 -0
  14. package/packages/memory-engine-v2/compat/Dockerfile +11 -0
  15. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  16. package/packages/memory-engine-v2/compat/server.py +1047 -0
  17. package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
  18. package/packages/memory-engine-v2/docker-compose.yml +206 -0
  19. package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
  20. package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
  21. package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
  22. package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
  23. package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
  24. package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
  25. package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
  26. package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
  27. package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
  28. package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
  29. package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
  30. package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
  31. package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
  32. package/packages/memory-engine/.env.example +0 -13
  33. package/packages/memory-engine/MIGRATION.md +0 -219
  34. package/packages/memory-engine/README.md +0 -145
  35. package/packages/memory-engine/bench/README.md +0 -99
  36. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
  37. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
  38. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
  39. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
  40. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
  41. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
  42. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
  43. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
  44. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
  45. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
  48. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
  49. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
  50. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
  51. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
  60. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
  61. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
  62. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
  63. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
  66. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
  67. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
  68. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
  69. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
  78. package/packages/memory-engine/compat/Dockerfile +0 -22
  79. package/packages/memory-engine/compat/server.py +0 -1255
  80. package/packages/memory-engine/docker-compose.test.yml +0 -59
  81. package/packages/memory-engine/docker-compose.yml +0 -255
  82. package/packages/memory-engine/engine/README.md +0 -52
  83. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
  84. package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
  85. package/packages/memory-engine/engine/l6-document-store.py +0 -1018
  86. package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
  87. package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -562
  88. package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
  89. package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
  90. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
  91. package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
  92. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
  93. package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
  94. package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
  95. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
  96. package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
  97. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  98. package/packages/memory-engine/pme_memory/__main__.py +0 -129
  99. package/packages/memory-engine/pme_memory/artifacts.py +0 -95
  100. package/packages/memory-engine/pme_memory/embed.py +0 -74
  101. package/packages/memory-engine/pme_memory/health.py +0 -36
  102. package/packages/memory-engine/pme_memory/hygiene.py +0 -159
  103. package/packages/memory-engine/pme_memory/indexer.py +0 -200
  104. package/packages/memory-engine/pme_memory/needs.py +0 -55
  105. package/packages/memory-engine/pme_memory/provenance.py +0 -80
  106. package/packages/memory-engine/pme_memory/scoring.py +0 -168
  107. package/packages/memory-engine/pme_memory/search.py +0 -52
  108. package/packages/memory-engine/pme_memory/store.py +0 -86
  109. package/packages/memory-engine/pme_memory/synthesis.py +0 -114
  110. package/packages/memory-engine/pyproject.toml +0 -65
  111. package/packages/memory-engine/scripts/kg-extractor.py +0 -557
  112. package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
  113. package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
  114. package/packages/memory-engine/tests/e2e_arena.sh +0 -259
  115. package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
  116. package/packages/memory-engine/tests/embed_stub/server.py +0 -80
  117. package/packages/memory-engine/tests/test_aggregate.py +0 -333
  118. package/packages/memory-engine/tests/test_api_contract.sh +0 -57
  119. package/packages/memory-engine/tests/test_arena_safety.py +0 -232
  120. package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
  121. package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
  122. package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
  123. package/packages/memory-engine/tests/test_embed_provider.py +0 -693
  124. package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
  125. package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
  126. package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
  127. package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
@@ -1,680 +0,0 @@
1
- {
2
- "bench": "customer-support",
3
- "stack": "pentatonic-memory",
4
- "n_tasks": 20,
5
- "n_correct": 5,
6
- "accuracy": 0.25,
7
- "mean_score": 0.25,
8
- "p50_search_ms": 30.443045994616114,
9
- "p95_search_ms": 37.8251028523664,
10
- "total_tokens_in": 0,
11
- "total_tokens_out": 0,
12
- "total_usd": 0.0,
13
- "by_tag": {
14
- "factoid": {
15
- "n": 10,
16
- "mean_score": 0.3,
17
- "accuracy": 0.3
18
- },
19
- "customer": {
20
- "n": 8,
21
- "mean_score": 0.375,
22
- "accuracy": 0.375
23
- },
24
- "multi-doc": {
25
- "n": 6,
26
- "mean_score": 0.16666666666666666,
27
- "accuracy": 0.16666666666666666
28
- },
29
- "rma": {
30
- "n": 3,
31
- "mean_score": 0.3333333333333333,
32
- "accuracy": 0.3333333333333333
33
- },
34
- "policy": {
35
- "n": 5,
36
- "mean_score": 0.2,
37
- "accuracy": 0.2
38
- },
39
- "escalation": {
40
- "n": 4,
41
- "mean_score": 0.25,
42
- "accuracy": 0.25
43
- },
44
- "rubric": {
45
- "n": 3,
46
- "mean_score": 0.3333333333333333,
47
- "accuracy": 0.3333333333333333
48
- },
49
- "multi-hop": {
50
- "n": 1,
51
- "mean_score": 0.0,
52
- "accuracy": 0.0
53
- },
54
- "entity": {
55
- "n": 2,
56
- "mean_score": 0.0,
57
- "accuracy": 0.0
58
- }
59
- },
60
- "extra": {
61
- "ingest_ms": 11268.10247899266,
62
- "grading": "substring",
63
- "limit": 3,
64
- "tokens": {
65
- "corpus_tokens": 1227,
66
- "query_tokens": 283,
67
- "context_tokens": 278,
68
- "retrieval_tokens": 561,
69
- "naive_tokens": 24823,
70
- "saved_tokens": 24262,
71
- "reduction_pct": 0.9773999919429561,
72
- "mean_retrieval_tokens_per_task": 28.05,
73
- "tokenizer": "cl100k_base",
74
- "per_task": {
75
- "order-mina-count": {
76
- "query": 11,
77
- "context": 0,
78
- "retrieval": 11,
79
- "judge_in": 0,
80
- "judge_out": 0,
81
- "judge_latency_ms": 0.0
82
- },
83
- "order-mina-latest": {
84
- "query": 13,
85
- "context": 0,
86
- "retrieval": 13,
87
- "judge_in": 0,
88
- "judge_out": 0,
89
- "judge_latency_ms": 0.0
90
- },
91
- "rma-mina-sleeve-reason": {
92
- "query": 17,
93
- "context": 0,
94
- "retrieval": 17,
95
- "judge_in": 0,
96
- "judge_out": 0,
97
- "judge_latency_ms": 0.0
98
- },
99
- "rma-mina-lid-resolution": {
100
- "query": 11,
101
- "context": 57,
102
- "retrieval": 68,
103
- "judge_in": 0,
104
- "judge_out": 0,
105
- "judge_latency_ms": 0.0
106
- },
107
- "jareth-harness-bar-followup": {
108
- "query": 15,
109
- "context": 0,
110
- "retrieval": 15,
111
- "judge_in": 0,
112
- "judge_out": 0,
113
- "judge_latency_ms": 0.0
114
- },
115
- "jareth-second-order": {
116
- "query": 11,
117
- "context": 0,
118
- "retrieval": 11,
119
- "judge_in": 0,
120
- "judge_out": 0,
121
- "judge_latency_ms": 0.0
122
- },
123
- "priya-custom-status": {
124
- "query": 11,
125
- "context": 0,
126
- "retrieval": 11,
127
- "judge_in": 0,
128
- "judge_out": 0,
129
- "judge_latency_ms": 0.0
130
- },
131
- "priya-goodwill-offered": {
132
- "query": 9,
133
- "context": 0,
134
- "retrieval": 9,
135
- "judge_in": 0,
136
- "judge_out": 0,
137
- "judge_latency_ms": 0.0
138
- },
139
- "policy-custom-return": {
140
- "query": 11,
141
- "context": 0,
142
- "retrieval": 11,
143
- "judge_in": 0,
144
- "judge_out": 0,
145
- "judge_latency_ms": 0.0
146
- },
147
- "policy-40day-return": {
148
- "query": 18,
149
- "context": 46,
150
- "retrieval": 64,
151
- "judge_in": 0,
152
- "judge_out": 0,
153
- "judge_latency_ms": 0.0
154
- },
155
- "escalation-400gbp-full-refund": {
156
- "query": 20,
157
- "context": 0,
158
- "retrieval": 20,
159
- "judge_in": 0,
160
- "judge_out": 0,
161
- "judge_latency_ms": 0.0
162
- },
163
- "escalation-goodwill-20pct-tier": {
164
- "query": 15,
165
- "context": 49,
166
- "retrieval": 64,
167
- "judge_in": 0,
168
- "judge_out": 0,
169
- "judge_latency_ms": 0.0
170
- },
171
- "mina-vip-status": {
172
- "query": 10,
173
- "context": 42,
174
- "retrieval": 52,
175
- "judge_in": 0,
176
- "judge_out": 0,
177
- "judge_latency_ms": 0.0
178
- },
179
- "mina-preferences": {
180
- "query": 11,
181
- "context": 42,
182
- "retrieval": 53,
183
- "judge_in": 0,
184
- "judge_out": 0,
185
- "judge_latency_ms": 0.0
186
- },
187
- "priya-agent-guidance": {
188
- "query": 16,
189
- "context": 0,
190
- "retrieval": 16,
191
- "judge_in": 0,
192
- "judge_out": 0,
193
- "judge_latency_ms": 0.0
194
- },
195
- "rubric-mina-warranty-recommendation": {
196
- "query": 18,
197
- "context": 42,
198
- "retrieval": 60,
199
- "judge_in": 374,
200
- "judge_out": 44,
201
- "judge_latency_ms": 945.222895026207
202
- },
203
- "rubric-jareth-escalation": {
204
- "query": 30,
205
- "context": 0,
206
- "retrieval": 30,
207
- "judge_in": 357,
208
- "judge_out": 43,
209
- "judge_latency_ms": 868.3296079933643
210
- },
211
- "rubric-priya-delayed-custom": {
212
- "query": 16,
213
- "context": 0,
214
- "retrieval": 16,
215
- "judge_in": 310,
216
- "judge_out": 54,
217
- "judge_latency_ms": 939.1167230010033
218
- },
219
- "entity-mina-orders": {
220
- "query": 10,
221
- "context": 0,
222
- "retrieval": 10,
223
- "judge_in": 0,
224
- "judge_out": 0,
225
- "judge_latency_ms": 0.0
226
- },
227
- "entity-all-rmas": {
228
- "query": 10,
229
- "context": 0,
230
- "retrieval": 10,
231
- "judge_in": 0,
232
- "judge_out": 0,
233
- "judge_latency_ms": 0.0
234
- }
235
- },
236
- "judge_tokens_in": 1041,
237
- "judge_tokens_out": 141,
238
- "judge_calls": 3,
239
- "judge_mean_latency_ms": 917.5564086735249
240
- },
241
- "cost_usd": {
242
- "assumed_completion_tokens_per_task": 100,
243
- "rates": {
244
- "input_per_1k": 0.0025,
245
- "output_per_1k": 0.01,
246
- "model": "gpt-4o"
247
- },
248
- "retrieval_usd_in": 0.0014025,
249
- "retrieval_usd_out": 0.02,
250
- "retrieval_usd_total": 0.0214025,
251
- "naive_usd_total": 0.0820575,
252
- "saved_usd": 0.060655,
253
- "saved_usd_per_1k_tasks": 3.03275
254
- }
255
- },
256
- "task_results": [
257
- {
258
- "task_id": "order-mina-count",
259
- "query": "How many orders has Mina Okafor placed?",
260
- "answer": "",
261
- "hits": [],
262
- "correct": false,
263
- "score": 0.0,
264
- "grading_notes": "missing 1/1: ['2']",
265
- "search_time_ms": 34.31391599588096,
266
- "generation_time_ms": 0.0,
267
- "tokens_in": 0,
268
- "tokens_out": 0,
269
- "retrieval_tokens": 11,
270
- "query_tokens": 11,
271
- "context_tokens": 0,
272
- "judge_tokens_in": 0,
273
- "judge_tokens_out": 0,
274
- "judge_latency_ms": 0.0
275
- },
276
- {
277
- "task_id": "order-mina-latest",
278
- "query": "What was in Mina Okafor's most recent order?",
279
- "answer": "",
280
- "hits": [],
281
- "correct": false,
282
- "score": 0.0,
283
- "grading_notes": "missing 2/2: ['Luna', 'Loop']",
284
- "search_time_ms": 29.60854201228358,
285
- "generation_time_ms": 0.0,
286
- "tokens_in": 0,
287
- "tokens_out": 0,
288
- "retrieval_tokens": 13,
289
- "query_tokens": 13,
290
- "context_tokens": 0,
291
- "judge_tokens_in": 0,
292
- "judge_tokens_out": 0,
293
- "judge_latency_ms": 0.0
294
- },
295
- {
296
- "task_id": "rma-mina-sleeve-reason",
297
- "query": "Why did Mina open an RMA on order 2026-0142?",
298
- "answer": "",
299
- "hits": [],
300
- "correct": false,
301
- "score": 0.0,
302
- "grading_notes": "missing 2/2: ['wrong colour', 'Oat']",
303
- "search_time_ms": 32.31730399420485,
304
- "generation_time_ms": 0.0,
305
- "tokens_in": 0,
306
- "tokens_out": 0,
307
- "retrieval_tokens": 17,
308
- "query_tokens": 17,
309
- "context_tokens": 0,
310
- "judge_tokens_in": 0,
311
- "judge_tokens_out": 0,
312
- "judge_latency_ms": 0.0
313
- },
314
- {
315
- "task_id": "rma-mina-lid-resolution",
316
- "query": "How was Mina's Luna bottle lid complaint resolved?",
317
- "answer": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
318
- "hits": [
319
- {
320
- "text": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
321
- "score": 0.5323119854064589,
322
- "source": "pentatonic-memory",
323
- "doc_id": "chat-mina-2026-04-10"
324
- }
325
- ],
326
- "correct": true,
327
- "score": 1.0,
328
- "grading_notes": "all substrings matched",
329
- "search_time_ms": 28.685988974757493,
330
- "generation_time_ms": 0.0,
331
- "tokens_in": 0,
332
- "tokens_out": 0,
333
- "retrieval_tokens": 68,
334
- "query_tokens": 11,
335
- "context_tokens": 57,
336
- "judge_tokens_in": 0,
337
- "judge_tokens_out": 0,
338
- "judge_latency_ms": 0.0
339
- },
340
- {
341
- "task_id": "jareth-harness-bar-followup",
342
- "query": "Did Jareth's kite harness work with his bar out of the box?",
343
- "answer": "",
344
- "hits": [],
345
- "correct": false,
346
- "score": 0.0,
347
- "grading_notes": "missing 2/2: ['Duotone', 'adapter']",
348
- "search_time_ms": 34.421516000293195,
349
- "generation_time_ms": 0.0,
350
- "tokens_in": 0,
351
- "tokens_out": 0,
352
- "retrieval_tokens": 15,
353
- "query_tokens": 15,
354
- "context_tokens": 0,
355
- "judge_tokens_in": 0,
356
- "judge_tokens_out": 0,
357
- "judge_latency_ms": 0.0
358
- },
359
- {
360
- "task_id": "jareth-second-order",
361
- "query": "What did Jareth order after his initial harness purchase?",
362
- "answer": "",
363
- "hits": [],
364
- "correct": false,
365
- "score": 0.0,
366
- "grading_notes": "missing 2/2: ['adapter', 'NMD-ADPT-DC']",
367
- "search_time_ms": 25.77133697923273,
368
- "generation_time_ms": 0.0,
369
- "tokens_in": 0,
370
- "tokens_out": 0,
371
- "retrieval_tokens": 11,
372
- "query_tokens": 11,
373
- "context_tokens": 0,
374
- "judge_tokens_in": 0,
375
- "judge_tokens_out": 0,
376
- "judge_latency_ms": 0.0
377
- },
378
- {
379
- "task_id": "priya-custom-status",
380
- "query": "Can Priya change the text on her custom tote?",
381
- "answer": "",
382
- "hits": [],
383
- "correct": false,
384
- "score": 0.0,
385
- "grading_notes": "missing 2/2: [\"can't change\", 'production']",
386
- "search_time_ms": 27.974719007033855,
387
- "generation_time_ms": 0.0,
388
- "tokens_in": 0,
389
- "tokens_out": 0,
390
- "retrieval_tokens": 11,
391
- "query_tokens": 11,
392
- "context_tokens": 0,
393
- "judge_tokens_in": 0,
394
- "judge_tokens_out": 0,
395
- "judge_latency_ms": 0.0
396
- },
397
- {
398
- "task_id": "priya-goodwill-offered",
399
- "query": "What goodwill credit did Priya get offered?",
400
- "answer": "",
401
- "hits": [],
402
- "correct": false,
403
- "score": 0.0,
404
- "grading_notes": "missing 1/1: ['15%']",
405
- "search_time_ms": 26.681170013034716,
406
- "generation_time_ms": 0.0,
407
- "tokens_in": 0,
408
- "tokens_out": 0,
409
- "retrieval_tokens": 9,
410
- "query_tokens": 9,
411
- "context_tokens": 0,
412
- "judge_tokens_in": 0,
413
- "judge_tokens_out": 0,
414
- "judge_latency_ms": 0.0
415
- },
416
- {
417
- "task_id": "policy-custom-return",
418
- "query": "Can a custom-printed tote be returned for refund?",
419
- "answer": "",
420
- "hits": [],
421
- "correct": false,
422
- "score": 0.0,
423
- "grading_notes": "missing 1/1: ['final sale']",
424
- "search_time_ms": 29.563406016677618,
425
- "generation_time_ms": 0.0,
426
- "tokens_in": 0,
427
- "tokens_out": 0,
428
- "retrieval_tokens": 11,
429
- "query_tokens": 11,
430
- "context_tokens": 0,
431
- "judge_tokens_in": 0,
432
- "judge_tokens_out": 0,
433
- "judge_latency_ms": 0.0
434
- },
435
- {
436
- "task_id": "policy-40day-return",
437
- "query": "If an unused item was delivered 40 days ago, can I get a full refund?",
438
- "answer": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
439
- "hits": [
440
- {
441
- "text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
442
- "score": 0.5349228375664262,
443
- "source": "pentatonic-memory",
444
- "doc_id": "policy-returns-30day"
445
- }
446
- ],
447
- "correct": false,
448
- "score": 0.0,
449
- "grading_notes": "all substrings matched; forbidden substring(s) present: ['full refund']",
450
- "search_time_ms": 34.331914997892454,
451
- "generation_time_ms": 0.0,
452
- "tokens_in": 0,
453
- "tokens_out": 0,
454
- "retrieval_tokens": 64,
455
- "query_tokens": 18,
456
- "context_tokens": 46,
457
- "judge_tokens_in": 0,
458
- "judge_tokens_out": 0,
459
- "judge_latency_ms": 0.0
460
- },
461
- {
462
- "task_id": "escalation-400gbp-full-refund",
463
- "query": "A Tier 1 agent wants to issue a full refund on a \u00a3400 order \u2014 can they?",
464
- "answer": "",
465
- "hits": [],
466
- "correct": false,
467
- "score": 0.0,
468
- "grading_notes": "missing 1/1: ['Tier 3']",
469
- "search_time_ms": 32.81780600082129,
470
- "generation_time_ms": 0.0,
471
- "tokens_in": 0,
472
- "tokens_out": 0,
473
- "retrieval_tokens": 20,
474
- "query_tokens": 20,
475
- "context_tokens": 0,
476
- "judge_tokens_in": 0,
477
- "judge_tokens_out": 0,
478
- "judge_latency_ms": 0.0
479
- },
480
- {
481
- "task_id": "escalation-goodwill-20pct-tier",
482
- "query": "What's the minimum agent tier needed to offer 20% goodwill credit?",
483
- "answer": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
484
- "hits": [
485
- {
486
- "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
487
- "score": 0.5517141571721631,
488
- "source": "pentatonic-memory",
489
- "doc_id": "policy-goodwill-credit"
490
- }
491
- ],
492
- "correct": true,
493
- "score": 1.0,
494
- "grading_notes": "all substrings matched",
495
- "search_time_ms": 33.02935999818146,
496
- "generation_time_ms": 0.0,
497
- "tokens_in": 0,
498
- "tokens_out": 0,
499
- "retrieval_tokens": 64,
500
- "query_tokens": 15,
501
- "context_tokens": 49,
502
- "judge_tokens_in": 0,
503
- "judge_tokens_out": 0,
504
- "judge_latency_ms": 0.0
505
- },
506
- {
507
- "task_id": "mina-vip-status",
508
- "query": "Is Mina Okafor a VIP customer?",
509
- "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
510
- "hits": [
511
- {
512
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
513
- "score": 0.7720720221593629,
514
- "source": "pentatonic-memory",
515
- "doc_id": "customer-note-mina-vip"
516
- }
517
- ],
518
- "correct": true,
519
- "score": 1.0,
520
- "grading_notes": "all substrings matched",
521
- "search_time_ms": 27.029798016883433,
522
- "generation_time_ms": 0.0,
523
- "tokens_in": 0,
524
- "tokens_out": 0,
525
- "retrieval_tokens": 52,
526
- "query_tokens": 10,
527
- "context_tokens": 42,
528
- "judge_tokens_in": 0,
529
- "judge_tokens_out": 0,
530
- "judge_latency_ms": 0.0
531
- },
532
- {
533
- "task_id": "mina-preferences",
534
- "query": "What are Mina Okafor's stated preferences?",
535
- "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
536
- "hits": [
537
- {
538
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
539
- "score": 0.5363281403698282,
540
- "source": "pentatonic-memory",
541
- "doc_id": "customer-note-mina-vip"
542
- }
543
- ],
544
- "correct": true,
545
- "score": 1.0,
546
- "grading_notes": "all substrings matched",
547
- "search_time_ms": 26.91432400024496,
548
- "generation_time_ms": 0.0,
549
- "tokens_in": 0,
550
- "tokens_out": 0,
551
- "retrieval_tokens": 53,
552
- "query_tokens": 11,
553
- "context_tokens": 42,
554
- "judge_tokens_in": 0,
555
- "judge_tokens_out": 0,
556
- "judge_latency_ms": 0.0
557
- },
558
- {
559
- "task_id": "priya-agent-guidance",
560
- "query": "When handling Priya's orders, what should the agent do before production starts?",
561
- "answer": "",
562
- "hits": [],
563
- "correct": false,
564
- "score": 0.0,
565
- "grading_notes": "missing 2/2: ['confirm', 'print text']",
566
- "search_time_ms": 31.542113021714613,
567
- "generation_time_ms": 0.0,
568
- "tokens_in": 0,
569
- "tokens_out": 0,
570
- "retrieval_tokens": 16,
571
- "query_tokens": 16,
572
- "context_tokens": 0,
573
- "judge_tokens_in": 0,
574
- "judge_tokens_out": 0,
575
- "judge_latency_ms": 0.0
576
- },
577
- {
578
- "task_id": "rubric-mina-warranty-recommendation",
579
- "query": "Mina is asking for product recommendations. Based on her history, what should we suggest?",
580
- "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
581
- "hits": [
582
- {
583
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
584
- "score": 0.5268969216421068,
585
- "source": "pentatonic-memory",
586
- "doc_id": "customer-note-mina-vip"
587
- }
588
- ],
589
- "correct": true,
590
- "score": 1.0,
591
- "grading_notes": "judge=1.00 \u2014 The answer references Mina's VIP status and her stated interest in lifetime-warranty items, meeting the criteria for a s",
592
- "search_time_ms": 31.27754997694865,
593
- "generation_time_ms": 0.0,
594
- "tokens_in": 0,
595
- "tokens_out": 0,
596
- "retrieval_tokens": 60,
597
- "query_tokens": 18,
598
- "context_tokens": 42,
599
- "judge_tokens_in": 374,
600
- "judge_tokens_out": 44,
601
- "judge_latency_ms": 945.222895026207
602
- },
603
- {
604
- "task_id": "rubric-jareth-escalation",
605
- "query": "Jareth wants to return his kite harness for a full refund, claiming it 'didn't work' with his bar \u2014 how should we handle this?",
606
- "answer": "",
607
- "hits": [],
608
- "correct": false,
609
- "score": 0.0,
610
- "grading_notes": "judge=0.00 \u2014 The answer fails to provide any context or resolution regarding Jareth's claim, simply processing a refund without addre",
611
- "search_time_ms": 38.004239002475515,
612
- "generation_time_ms": 0.0,
613
- "tokens_in": 0,
614
- "tokens_out": 0,
615
- "retrieval_tokens": 30,
616
- "query_tokens": 30,
617
- "context_tokens": 0,
618
- "judge_tokens_in": 357,
619
- "judge_tokens_out": 43,
620
- "judge_latency_ms": 868.3296079933643
621
- },
622
- {
623
- "task_id": "rubric-priya-delayed-custom",
624
- "query": "Priya is frustrated about her custom tote delay. What's the best response?",
625
- "answer": "",
626
- "hits": [],
627
- "correct": false,
628
- "score": 0.0,
629
- "grading_notes": "judge=0.00 \u2014 The response does not acknowledge that custom items have a 10-12 working day lead time and does not mention goodwill. It",
630
- "search_time_ms": 33.17419299855828,
631
- "generation_time_ms": 0.0,
632
- "tokens_in": 0,
633
- "tokens_out": 0,
634
- "retrieval_tokens": 16,
635
- "query_tokens": 16,
636
- "context_tokens": 0,
637
- "judge_tokens_in": 310,
638
- "judge_tokens_out": 54,
639
- "judge_latency_ms": 939.1167230010033
640
- },
641
- {
642
- "task_id": "entity-mina-orders",
643
- "query": "List Mina Okafor's order IDs.",
644
- "answer": "",
645
- "hits": [],
646
- "correct": false,
647
- "score": 0.0,
648
- "grading_notes": "no expected_substrings set",
649
- "search_time_ms": 27.45368899195455,
650
- "generation_time_ms": 0.0,
651
- "tokens_in": 0,
652
- "tokens_out": 0,
653
- "retrieval_tokens": 10,
654
- "query_tokens": 10,
655
- "context_tokens": 0,
656
- "judge_tokens_in": 0,
657
- "judge_tokens_out": 0,
658
- "judge_latency_ms": 0.0
659
- },
660
- {
661
- "task_id": "entity-all-rmas",
662
- "query": "List all open and closed RMA case IDs.",
663
- "answer": "",
664
- "hits": [],
665
- "correct": false,
666
- "score": 0.0,
667
- "grading_notes": "no expected_substrings set",
668
- "search_time_ms": 28.635605005547404,
669
- "generation_time_ms": 0.0,
670
- "tokens_in": 0,
671
- "tokens_out": 0,
672
- "retrieval_tokens": 10,
673
- "query_tokens": 10,
674
- "context_tokens": 0,
675
- "judge_tokens_in": 0,
676
- "judge_tokens_out": 0,
677
- "judge_latency_ms": 0.0
678
- }
679
- ]
680
- }