@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/README.md +3 -3
  2. package/bin/cli.js +1 -1
  3. package/bin/commands/config.js +1 -1
  4. package/dist/index.cjs +1 -1
  5. package/dist/index.js +1 -1
  6. package/package.json +2 -2
  7. package/packages/doctor/src/checks/local-memory.js +2 -2
  8. package/packages/memory/README.md +2 -2
  9. package/packages/memory/openclaw-plugin/README.md +2 -2
  10. package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
  11. package/packages/memory/src/server.js +2 -2
  12. package/packages/memory-engine-v2/.env.example +30 -0
  13. package/packages/memory-engine-v2/README.md +125 -0
  14. package/packages/memory-engine-v2/compat/Dockerfile +11 -0
  15. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  16. package/packages/memory-engine-v2/compat/server.py +1047 -0
  17. package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
  18. package/packages/memory-engine-v2/docker-compose.yml +206 -0
  19. package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
  20. package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
  21. package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
  22. package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
  23. package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
  24. package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
  25. package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
  26. package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
  27. package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
  28. package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
  29. package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
  30. package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
  31. package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
  32. package/packages/memory-engine/.env.example +0 -13
  33. package/packages/memory-engine/MIGRATION.md +0 -219
  34. package/packages/memory-engine/README.md +0 -145
  35. package/packages/memory-engine/bench/README.md +0 -99
  36. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
  37. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
  38. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
  39. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
  40. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
  41. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
  42. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
  43. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
  44. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
  45. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
  48. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
  49. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
  50. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
  51. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
  60. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
  61. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
  62. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
  63. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
  66. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
  67. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
  68. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
  69. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
  78. package/packages/memory-engine/compat/Dockerfile +0 -22
  79. package/packages/memory-engine/compat/server.py +0 -1255
  80. package/packages/memory-engine/docker-compose.test.yml +0 -59
  81. package/packages/memory-engine/docker-compose.yml +0 -255
  82. package/packages/memory-engine/engine/README.md +0 -52
  83. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
  84. package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
  85. package/packages/memory-engine/engine/l6-document-store.py +0 -1018
  86. package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
  87. package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -562
  88. package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
  89. package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
  90. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
  91. package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
  92. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
  93. package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
  94. package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
  95. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
  96. package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
  97. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  98. package/packages/memory-engine/pme_memory/__main__.py +0 -129
  99. package/packages/memory-engine/pme_memory/artifacts.py +0 -95
  100. package/packages/memory-engine/pme_memory/embed.py +0 -74
  101. package/packages/memory-engine/pme_memory/health.py +0 -36
  102. package/packages/memory-engine/pme_memory/hygiene.py +0 -159
  103. package/packages/memory-engine/pme_memory/indexer.py +0 -200
  104. package/packages/memory-engine/pme_memory/needs.py +0 -55
  105. package/packages/memory-engine/pme_memory/provenance.py +0 -80
  106. package/packages/memory-engine/pme_memory/scoring.py +0 -168
  107. package/packages/memory-engine/pme_memory/search.py +0 -52
  108. package/packages/memory-engine/pme_memory/store.py +0 -86
  109. package/packages/memory-engine/pme_memory/synthesis.py +0 -114
  110. package/packages/memory-engine/pyproject.toml +0 -65
  111. package/packages/memory-engine/scripts/kg-extractor.py +0 -557
  112. package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
  113. package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
  114. package/packages/memory-engine/tests/e2e_arena.sh +0 -259
  115. package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
  116. package/packages/memory-engine/tests/embed_stub/server.py +0 -80
  117. package/packages/memory-engine/tests/test_aggregate.py +0 -333
  118. package/packages/memory-engine/tests/test_api_contract.sh +0 -57
  119. package/packages/memory-engine/tests/test_arena_safety.py +0 -232
  120. package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
  121. package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
  122. package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
  123. package/packages/memory-engine/tests/test_embed_provider.py +0 -693
  124. package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
  125. package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
  126. package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
  127. package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
@@ -1,819 +0,0 @@
1
- {
2
- "bench": "chat-recall",
3
- "stack": "pentatonic-memory",
4
- "n_tasks": 16,
5
- "n_correct": 16,
6
- "accuracy": 1.0,
7
- "mean_score": 1.0,
8
- "p50_search_ms": 448.4323229989968,
9
- "p95_search_ms": 533.4035659907386,
10
- "total_tokens_in": 0,
11
- "total_tokens_out": 0,
12
- "total_usd": 0.0,
13
- "by_tag": {
14
- "factoid": {
15
- "n": 6,
16
- "mean_score": 1.0,
17
- "accuracy": 1.0
18
- },
19
- "owner": {
20
- "n": 4,
21
- "mean_score": 1.0,
22
- "accuracy": 1.0
23
- },
24
- "temporal": {
25
- "n": 4,
26
- "mean_score": 1.0,
27
- "accuracy": 1.0
28
- },
29
- "contradiction": {
30
- "n": 2,
31
- "mean_score": 1.0,
32
- "accuracy": 1.0
33
- },
34
- "status": {
35
- "n": 4,
36
- "mean_score": 1.0,
37
- "accuracy": 1.0
38
- },
39
- "multi-hop": {
40
- "n": 4,
41
- "mean_score": 1.0,
42
- "accuracy": 1.0
43
- }
44
- },
45
- "extra": {
46
- "ingest_ms": 12605.152663018089,
47
- "grading": "substring",
48
- "limit": 3,
49
- "tokens": {
50
- "corpus_tokens": 513,
51
- "query_tokens": 140,
52
- "context_tokens": 846,
53
- "retrieval_tokens": 986,
54
- "naive_tokens": 8348,
55
- "saved_tokens": 7362,
56
- "reduction_pct": 0.8818878773358888,
57
- "mean_retrieval_tokens_per_task": 61.625,
58
- "tokenizer": "cl100k_base",
59
- "per_task": {
60
- "who-owns-atlas": {
61
- "query": 5,
62
- "context": 39,
63
- "retrieval": 44,
64
- "judge_in": 0,
65
- "judge_out": 0,
66
- "judge_latency_ms": 0.0
67
- },
68
- "who-owns-borealis": {
69
- "query": 7,
70
- "context": 46,
71
- "retrieval": 53,
72
- "judge_in": 0,
73
- "judge_out": 0,
74
- "judge_latency_ms": 0.0
75
- },
76
- "who-owns-cirrus": {
77
- "query": 6,
78
- "context": 49,
79
- "retrieval": 55,
80
- "judge_in": 0,
81
- "judge_out": 0,
82
- "judge_latency_ms": 0.0
83
- },
84
- "who-owns-dune": {
85
- "query": 6,
86
- "context": 39,
87
- "retrieval": 45,
88
- "judge_in": 0,
89
- "judge_out": 0,
90
- "judge_latency_ms": 0.0
91
- },
92
- "current-deadline-atlas": {
93
- "query": 8,
94
- "context": 56,
95
- "retrieval": 64,
96
- "judge_in": 0,
97
- "judge_out": 0,
98
- "judge_latency_ms": 0.0
99
- },
100
- "current-deadline-borealis": {
101
- "query": 10,
102
- "context": 67,
103
- "retrieval": 77,
104
- "judge_in": 0,
105
- "judge_out": 0,
106
- "judge_latency_ms": 0.0
107
- },
108
- "current-deadline-cirrus": {
109
- "query": 9,
110
- "context": 62,
111
- "retrieval": 71,
112
- "judge_in": 0,
113
- "judge_out": 0,
114
- "judge_latency_ms": 0.0
115
- },
116
- "current-deadline-dune": {
117
- "query": 9,
118
- "context": 65,
119
- "retrieval": 74,
120
- "judge_in": 0,
121
- "judge_out": 0,
122
- "judge_latency_ms": 0.0
123
- },
124
- "status-atlas": {
125
- "query": 8,
126
- "context": 49,
127
- "retrieval": 57,
128
- "judge_in": 0,
129
- "judge_out": 0,
130
- "judge_latency_ms": 0.0
131
- },
132
- "status-borealis": {
133
- "query": 10,
134
- "context": 53,
135
- "retrieval": 63,
136
- "judge_in": 0,
137
- "judge_out": 0,
138
- "judge_latency_ms": 0.0
139
- },
140
- "status-cirrus": {
141
- "query": 9,
142
- "context": 49,
143
- "retrieval": 58,
144
- "judge_in": 0,
145
- "judge_out": 0,
146
- "judge_latency_ms": 0.0
147
- },
148
- "status-dune": {
149
- "query": 9,
150
- "context": 48,
151
- "retrieval": 57,
152
- "judge_in": 0,
153
- "judge_out": 0,
154
- "judge_latency_ms": 0.0
155
- },
156
- "multihop-atlas": {
157
- "query": 10,
158
- "context": 56,
159
- "retrieval": 66,
160
- "judge_in": 0,
161
- "judge_out": 0,
162
- "judge_latency_ms": 0.0
163
- },
164
- "multihop-borealis": {
165
- "query": 12,
166
- "context": 53,
167
- "retrieval": 65,
168
- "judge_in": 0,
169
- "judge_out": 0,
170
- "judge_latency_ms": 0.0
171
- },
172
- "multihop-cirrus": {
173
- "query": 11,
174
- "context": 67,
175
- "retrieval": 78,
176
- "judge_in": 0,
177
- "judge_out": 0,
178
- "judge_latency_ms": 0.0
179
- },
180
- "multihop-dune": {
181
- "query": 11,
182
- "context": 48,
183
- "retrieval": 59,
184
- "judge_in": 0,
185
- "judge_out": 0,
186
- "judge_latency_ms": 0.0
187
- }
188
- },
189
- "judge_tokens_in": 0,
190
- "judge_tokens_out": 0,
191
- "judge_calls": 0,
192
- "judge_mean_latency_ms": 0.0
193
- },
194
- "cost_usd": {
195
- "assumed_completion_tokens_per_task": 100,
196
- "rates": {
197
- "input_per_1k": 0.0025,
198
- "output_per_1k": 0.01,
199
- "model": "gpt-4o"
200
- },
201
- "retrieval_usd_in": 0.0024649999999999997,
202
- "retrieval_usd_out": 0.016,
203
- "retrieval_usd_total": 0.018465,
204
- "naive_usd_total": 0.03687,
205
- "saved_usd": 0.018405,
206
- "saved_usd_per_1k_tasks": 1.1503125
207
- }
208
- },
209
- "task_results": [
210
- {
211
- "task_id": "who-owns-atlas",
212
- "query": "Who owns project Atlas?",
213
- "answer": "Alice: I'll own project Atlas. Kickoff this week.\n---\nAlice: Atlas status \u2014 on track.\n---\nClara: I'll own project Borealis. Kickoff this week.",
214
- "hits": [
215
- {
216
- "text": "Alice: I'll own project Atlas. Kickoff this week.",
217
- "score": 0.4098,
218
- "source": "pentatonic-memory",
219
- "doc_id": "chat-assign-atlas"
220
- },
221
- {
222
- "text": "Alice: Atlas status \u2014 on track.",
223
- "score": 0.4032,
224
- "source": "pentatonic-memory",
225
- "doc_id": "status-atlas-m1"
226
- },
227
- {
228
- "text": "Clara: I'll own project Borealis. Kickoff this week.",
229
- "score": 0.3968,
230
- "source": "pentatonic-memory",
231
- "doc_id": "chat-assign-borealis"
232
- }
233
- ],
234
- "correct": true,
235
- "score": 1.0,
236
- "grading_notes": "all substrings matched",
237
- "search_time_ms": 448.07494498672895,
238
- "generation_time_ms": 0.0,
239
- "tokens_in": 0,
240
- "tokens_out": 0,
241
- "retrieval_tokens": 44,
242
- "query_tokens": 5,
243
- "context_tokens": 39,
244
- "judge_tokens_in": 0,
245
- "judge_tokens_out": 0,
246
- "judge_latency_ms": 0.0
247
- },
248
- {
249
- "task_id": "who-owns-borealis",
250
- "query": "Who owns project Borealis?",
251
- "answer": "Clara: I'll own project Borealis. Kickoff this week.\n---\nFarid: I'll own project Dune. Kickoff this week.\n---\nAlice: I'll own project Atlas. Kickoff this week.",
252
- "hits": [
253
- {
254
- "text": "Clara: I'll own project Borealis. Kickoff this week.",
255
- "score": 0.4098,
256
- "source": "pentatonic-memory",
257
- "doc_id": "chat-assign-borealis"
258
- },
259
- {
260
- "text": "Farid: I'll own project Dune. Kickoff this week.",
261
- "score": 0.4032,
262
- "source": "pentatonic-memory",
263
- "doc_id": "chat-assign-dune"
264
- },
265
- {
266
- "text": "Alice: I'll own project Atlas. Kickoff this week.",
267
- "score": 0.3968,
268
- "source": "pentatonic-memory",
269
- "doc_id": "chat-assign-atlas"
270
- }
271
- ],
272
- "correct": true,
273
- "score": 1.0,
274
- "grading_notes": "all substrings matched",
275
- "search_time_ms": 467.92808899772353,
276
- "generation_time_ms": 0.0,
277
- "tokens_in": 0,
278
- "tokens_out": 0,
279
- "retrieval_tokens": 53,
280
- "query_tokens": 7,
281
- "context_tokens": 46,
282
- "judge_tokens_in": 0,
283
- "judge_tokens_out": 0,
284
- "judge_latency_ms": 0.0
285
- },
286
- {
287
- "task_id": "who-owns-cirrus",
288
- "query": "Who owns project Cirrus?",
289
- "answer": "Diego: I'll own project Cirrus. Kickoff this week.\n---\nDiego: Cirrus status \u2014 scoping.\n---\nDiego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
290
- "hits": [
291
- {
292
- "text": "Diego: I'll own project Cirrus. Kickoff this week.",
293
- "score": 0.4098,
294
- "source": "pentatonic-memory",
295
- "doc_id": "chat-assign-cirrus"
296
- },
297
- {
298
- "text": "Diego: Cirrus status \u2014 scoping.",
299
- "score": 0.4032,
300
- "source": "pentatonic-memory",
301
- "doc_id": "status-cirrus-m3"
302
- },
303
- {
304
- "text": "Diego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
305
- "score": 0.3968,
306
- "source": "pentatonic-memory",
307
- "doc_id": "chat-deadline1-cirrus"
308
- }
309
- ],
310
- "correct": true,
311
- "score": 1.0,
312
- "grading_notes": "all substrings matched",
313
- "search_time_ms": 432.8923989960458,
314
- "generation_time_ms": 0.0,
315
- "tokens_in": 0,
316
- "tokens_out": 0,
317
- "retrieval_tokens": 55,
318
- "query_tokens": 6,
319
- "context_tokens": 49,
320
- "judge_tokens_in": 0,
321
- "judge_tokens_out": 0,
322
- "judge_latency_ms": 0.0
323
- },
324
- {
325
- "task_id": "who-owns-dune",
326
- "query": "Who owns project Dune?",
327
- "answer": "Farid: I'll own project Dune. Kickoff this week.\n---\nFarid: Dune status \u2014 launched.\n---\nAlice: I'll own project Atlas. Kickoff this week.",
328
- "hits": [
329
- {
330
- "text": "Farid: I'll own project Dune. Kickoff this week.",
331
- "score": 0.4098,
332
- "source": "pentatonic-memory",
333
- "doc_id": "chat-assign-dune"
334
- },
335
- {
336
- "text": "Farid: Dune status \u2014 launched.",
337
- "score": 0.4032,
338
- "source": "pentatonic-memory",
339
- "doc_id": "status-dune-m4"
340
- },
341
- {
342
- "text": "Alice: I'll own project Atlas. Kickoff this week.",
343
- "score": 0.3968,
344
- "source": "pentatonic-memory",
345
- "doc_id": "chat-assign-atlas"
346
- }
347
- ],
348
- "correct": true,
349
- "score": 1.0,
350
- "grading_notes": "all substrings matched",
351
- "search_time_ms": 434.9860879883636,
352
- "generation_time_ms": 0.0,
353
- "tokens_in": 0,
354
- "tokens_out": 0,
355
- "retrieval_tokens": 45,
356
- "query_tokens": 6,
357
- "context_tokens": 39,
358
- "judge_tokens_in": 0,
359
- "judge_tokens_out": 0,
360
- "judge_latency_ms": 0.0
361
- },
362
- {
363
- "task_id": "current-deadline-atlas",
364
- "query": "What is the current deadline for Atlas?",
365
- "answer": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nAlice: Target delivery for Atlas is 2026-03-14. Please pencil it in.\n---\nAlice: Atlas status \u2014 on track.",
366
- "hits": [
367
- {
368
- "text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
369
- "score": 0.4098,
370
- "source": "pentatonic-memory",
371
- "doc_id": "chat-deadline2-atlas"
372
- },
373
- {
374
- "text": "Alice: Target delivery for Atlas is 2026-03-14. Please pencil it in.",
375
- "score": 0.4032,
376
- "source": "pentatonic-memory",
377
- "doc_id": "chat-deadline1-atlas"
378
- },
379
- {
380
- "text": "Alice: Atlas status \u2014 on track.",
381
- "score": 0.3968,
382
- "source": "pentatonic-memory",
383
- "doc_id": "status-atlas-m1"
384
- }
385
- ],
386
- "correct": true,
387
- "score": 1.0,
388
- "grading_notes": "all substrings matched",
389
- "search_time_ms": 435.4068410175387,
390
- "generation_time_ms": 0.0,
391
- "tokens_in": 0,
392
- "tokens_out": 0,
393
- "retrieval_tokens": 64,
394
- "query_tokens": 8,
395
- "context_tokens": 56,
396
- "judge_tokens_in": 0,
397
- "judge_tokens_out": 0,
398
- "judge_latency_ms": 0.0
399
- },
400
- {
401
- "task_id": "current-deadline-borealis",
402
- "query": "What is the current deadline for Borealis?",
403
- "answer": "Clara: Target delivery for Borealis is 2026-02-28. Please pencil it in.\n---\nAlice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nClara: I'll own project Borealis. Kickoff this week.",
404
- "hits": [
405
- {
406
- "text": "Clara: Target delivery for Borealis is 2026-02-28. Please pencil it in.",
407
- "score": 0.4098,
408
- "source": "pentatonic-memory",
409
- "doc_id": "chat-deadline1-borealis"
410
- },
411
- {
412
- "text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
413
- "score": 0.4032,
414
- "source": "pentatonic-memory",
415
- "doc_id": "chat-deadline2-atlas"
416
- },
417
- {
418
- "text": "Clara: I'll own project Borealis. Kickoff this week.",
419
- "score": 0.3968,
420
- "source": "pentatonic-memory",
421
- "doc_id": "chat-assign-borealis"
422
- }
423
- ],
424
- "correct": true,
425
- "score": 1.0,
426
- "grading_notes": "all substrings matched",
427
- "search_time_ms": 448.7897010112647,
428
- "generation_time_ms": 0.0,
429
- "tokens_in": 0,
430
- "tokens_out": 0,
431
- "retrieval_tokens": 77,
432
- "query_tokens": 10,
433
- "context_tokens": 67,
434
- "judge_tokens_in": 0,
435
- "judge_tokens_out": 0,
436
- "judge_latency_ms": 0.0
437
- },
438
- {
439
- "task_id": "current-deadline-cirrus",
440
- "query": "What is the current deadline for Cirrus?",
441
- "answer": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.\n---\nDiego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.\n---\nDiego: Cirrus status \u2014 scoping.",
442
- "hits": [
443
- {
444
- "text": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.",
445
- "score": 0.4098,
446
- "source": "pentatonic-memory",
447
- "doc_id": "chat-deadline2-cirrus"
448
- },
449
- {
450
- "text": "Diego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
451
- "score": 0.4032,
452
- "source": "pentatonic-memory",
453
- "doc_id": "chat-deadline1-cirrus"
454
- },
455
- {
456
- "text": "Diego: Cirrus status \u2014 scoping.",
457
- "score": 0.3968,
458
- "source": "pentatonic-memory",
459
- "doc_id": "status-cirrus-m3"
460
- }
461
- ],
462
- "correct": true,
463
- "score": 1.0,
464
- "grading_notes": "all substrings matched",
465
- "search_time_ms": 522.1439780143555,
466
- "generation_time_ms": 0.0,
467
- "tokens_in": 0,
468
- "tokens_out": 0,
469
- "retrieval_tokens": 71,
470
- "query_tokens": 9,
471
- "context_tokens": 62,
472
- "judge_tokens_in": 0,
473
- "judge_tokens_out": 0,
474
- "judge_latency_ms": 0.0
475
- },
476
- {
477
- "task_id": "current-deadline-dune",
478
- "query": "What is the current deadline for Dune?",
479
- "answer": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.\n---\nAlice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nFarid: I'll own project Dune. Kickoff this week.",
480
- "hits": [
481
- {
482
- "text": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.",
483
- "score": 0.4098,
484
- "source": "pentatonic-memory",
485
- "doc_id": "chat-deadline1-dune"
486
- },
487
- {
488
- "text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
489
- "score": 0.4032,
490
- "source": "pentatonic-memory",
491
- "doc_id": "chat-deadline2-atlas"
492
- },
493
- {
494
- "text": "Farid: I'll own project Dune. Kickoff this week.",
495
- "score": 0.3968,
496
- "source": "pentatonic-memory",
497
- "doc_id": "chat-assign-dune"
498
- }
499
- ],
500
- "correct": true,
501
- "score": 1.0,
502
- "grading_notes": "all substrings matched",
503
- "search_time_ms": 476.2287669873331,
504
- "generation_time_ms": 0.0,
505
- "tokens_in": 0,
506
- "tokens_out": 0,
507
- "retrieval_tokens": 74,
508
- "query_tokens": 9,
509
- "context_tokens": 65,
510
- "judge_tokens_in": 0,
511
- "judge_tokens_out": 0,
512
- "judge_latency_ms": 0.0
513
- },
514
- {
515
- "task_id": "status-atlas",
516
- "query": "What's the latest status of Atlas?",
517
- "answer": "Alice: Atlas status \u2014 on track.\n---\nAlice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nAlice: I'll own project Atlas. Kickoff this week.",
518
- "hits": [
519
- {
520
- "text": "Alice: Atlas status \u2014 on track.",
521
- "score": 0.4098,
522
- "source": "pentatonic-memory",
523
- "doc_id": "status-atlas-m1"
524
- },
525
- {
526
- "text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
527
- "score": 0.4032,
528
- "source": "pentatonic-memory",
529
- "doc_id": "chat-deadline2-atlas"
530
- },
531
- {
532
- "text": "Alice: I'll own project Atlas. Kickoff this week.",
533
- "score": 0.3968,
534
- "source": "pentatonic-memory",
535
- "doc_id": "chat-assign-atlas"
536
- }
537
- ],
538
- "correct": true,
539
- "score": 1.0,
540
- "grading_notes": "all substrings matched",
541
- "search_time_ms": 533.4035659907386,
542
- "generation_time_ms": 0.0,
543
- "tokens_in": 0,
544
- "tokens_out": 0,
545
- "retrieval_tokens": 57,
546
- "query_tokens": 8,
547
- "context_tokens": 49,
548
- "judge_tokens_in": 0,
549
- "judge_tokens_out": 0,
550
- "judge_latency_ms": 0.0
551
- },
552
- {
553
- "task_id": "status-borealis",
554
- "query": "What's the latest status of Borealis?",
555
- "answer": "Clara: Borealis status \u2014 blocked on vendor.\n---\nClara: I'll own project Borealis. Kickoff this week.\n---\nClara: Target delivery for Borealis is 2026-02-28. Please pencil it in.",
556
- "hits": [
557
- {
558
- "text": "Clara: Borealis status \u2014 blocked on vendor.",
559
- "score": 0.4098,
560
- "source": "pentatonic-memory",
561
- "doc_id": "status-borealis-m2"
562
- },
563
- {
564
- "text": "Clara: I'll own project Borealis. Kickoff this week.",
565
- "score": 0.4032,
566
- "source": "pentatonic-memory",
567
- "doc_id": "chat-assign-borealis"
568
- },
569
- {
570
- "text": "Clara: Target delivery for Borealis is 2026-02-28. Please pencil it in.",
571
- "score": 0.3968,
572
- "source": "pentatonic-memory",
573
- "doc_id": "chat-deadline1-borealis"
574
- }
575
- ],
576
- "correct": true,
577
- "score": 1.0,
578
- "grading_notes": "all substrings matched",
579
- "search_time_ms": 455.40841898764484,
580
- "generation_time_ms": 0.0,
581
- "tokens_in": 0,
582
- "tokens_out": 0,
583
- "retrieval_tokens": 63,
584
- "query_tokens": 10,
585
- "context_tokens": 53,
586
- "judge_tokens_in": 0,
587
- "judge_tokens_out": 0,
588
- "judge_latency_ms": 0.0
589
- },
590
- {
591
- "task_id": "status-cirrus",
592
- "query": "What's the latest status of Cirrus?",
593
- "answer": "Diego: Cirrus status \u2014 scoping.\n---\nDiego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.\n---\nDiego: I'll own project Cirrus. Kickoff this week.",
594
- "hits": [
595
- {
596
- "text": "Diego: Cirrus status \u2014 scoping.",
597
- "score": 0.4098,
598
- "source": "pentatonic-memory",
599
- "doc_id": "status-cirrus-m3"
600
- },
601
- {
602
- "text": "Diego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
603
- "score": 0.4032,
604
- "source": "pentatonic-memory",
605
- "doc_id": "chat-deadline1-cirrus"
606
- },
607
- {
608
- "text": "Diego: I'll own project Cirrus. Kickoff this week.",
609
- "score": 0.3968,
610
- "source": "pentatonic-memory",
611
- "doc_id": "chat-assign-cirrus"
612
- }
613
- ],
614
- "correct": true,
615
- "score": 1.0,
616
- "grading_notes": "all substrings matched",
617
- "search_time_ms": 486.57627002103254,
618
- "generation_time_ms": 0.0,
619
- "tokens_in": 0,
620
- "tokens_out": 0,
621
- "retrieval_tokens": 58,
622
- "query_tokens": 9,
623
- "context_tokens": 49,
624
- "judge_tokens_in": 0,
625
- "judge_tokens_out": 0,
626
- "judge_latency_ms": 0.0
627
- },
628
- {
629
- "task_id": "status-dune",
630
- "query": "What's the latest status of Dune?",
631
- "answer": "Farid: Dune status \u2014 launched.\n---\nFarid: Target delivery for Dune is 2026-05-20. Please pencil it in.\n---\nFarid: I'll own project Dune. Kickoff this week.",
632
- "hits": [
633
- {
634
- "text": "Farid: Dune status \u2014 launched.",
635
- "score": 0.4098,
636
- "source": "pentatonic-memory",
637
- "doc_id": "status-dune-m4"
638
- },
639
- {
640
- "text": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.",
641
- "score": 0.4032,
642
- "source": "pentatonic-memory",
643
- "doc_id": "chat-deadline1-dune"
644
- },
645
- {
646
- "text": "Farid: I'll own project Dune. Kickoff this week.",
647
- "score": 0.3968,
648
- "source": "pentatonic-memory",
649
- "doc_id": "chat-assign-dune"
650
- }
651
- ],
652
- "correct": true,
653
- "score": 1.0,
654
- "grading_notes": "all substrings matched",
655
- "search_time_ms": 464.17109901085496,
656
- "generation_time_ms": 0.0,
657
- "tokens_in": 0,
658
- "tokens_out": 0,
659
- "retrieval_tokens": 57,
660
- "query_tokens": 9,
661
- "context_tokens": 48,
662
- "judge_tokens_in": 0,
663
- "judge_tokens_out": 0,
664
- "judge_latency_ms": 0.0
665
- },
666
- {
667
- "task_id": "multihop-atlas",
668
- "query": "Who owns Atlas and what is its current deadline?",
669
- "answer": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nAlice: Atlas status \u2014 on track.\n---\nAlice: Target delivery for Atlas is 2026-03-14. Please pencil it in.",
670
- "hits": [
671
- {
672
- "text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
673
- "score": 0.4098,
674
- "source": "pentatonic-memory",
675
- "doc_id": "chat-deadline2-atlas"
676
- },
677
- {
678
- "text": "Alice: Atlas status \u2014 on track.",
679
- "score": 0.4032,
680
- "source": "pentatonic-memory",
681
- "doc_id": "status-atlas-m1"
682
- },
683
- {
684
- "text": "Alice: Target delivery for Atlas is 2026-03-14. Please pencil it in.",
685
- "score": 0.3968,
686
- "source": "pentatonic-memory",
687
- "doc_id": "chat-deadline1-atlas"
688
- }
689
- ],
690
- "correct": true,
691
- "score": 1.0,
692
- "grading_notes": "all substrings matched",
693
- "search_time_ms": 440.41278600343503,
694
- "generation_time_ms": 0.0,
695
- "tokens_in": 0,
696
- "tokens_out": 0,
697
- "retrieval_tokens": 66,
698
- "query_tokens": 10,
699
- "context_tokens": 56,
700
- "judge_tokens_in": 0,
701
- "judge_tokens_out": 0,
702
- "judge_latency_ms": 0.0
703
- },
704
- {
705
- "task_id": "multihop-borealis",
706
- "query": "Who owns Borealis and what is its current deadline?",
707
- "answer": "Clara: I'll own project Borealis. Kickoff this week.\n---\nClara: Target delivery for Borealis is 2026-02-28. Please pencil it in.\n---\nClara: Borealis status \u2014 blocked on vendor.",
708
- "hits": [
709
- {
710
- "text": "Clara: I'll own project Borealis. Kickoff this week.",
711
- "score": 0.4098,
712
- "source": "pentatonic-memory",
713
- "doc_id": "chat-assign-borealis"
714
- },
715
- {
716
- "text": "Clara: Target delivery for Borealis is 2026-02-28. Please pencil it in.",
717
- "score": 0.4032,
718
- "source": "pentatonic-memory",
719
- "doc_id": "chat-deadline1-borealis"
720
- },
721
- {
722
- "text": "Clara: Borealis status \u2014 blocked on vendor.",
723
- "score": 0.3968,
724
- "source": "pentatonic-memory",
725
- "doc_id": "status-borealis-m2"
726
- }
727
- ],
728
- "correct": true,
729
- "score": 1.0,
730
- "grading_notes": "all substrings matched",
731
- "search_time_ms": 443.88292901567183,
732
- "generation_time_ms": 0.0,
733
- "tokens_in": 0,
734
- "tokens_out": 0,
735
- "retrieval_tokens": 65,
736
- "query_tokens": 12,
737
- "context_tokens": 53,
738
- "judge_tokens_in": 0,
739
- "judge_tokens_out": 0,
740
- "judge_latency_ms": 0.0
741
- },
742
- {
743
- "task_id": "multihop-cirrus",
744
- "query": "Who owns Cirrus and what is its current deadline?",
745
- "answer": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.\n---\nDiego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.\n---\nDiego: I'll own project Cirrus. Kickoff this week.",
746
- "hits": [
747
- {
748
- "text": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.",
749
- "score": 0.4098,
750
- "source": "pentatonic-memory",
751
- "doc_id": "chat-deadline2-cirrus"
752
- },
753
- {
754
- "text": "Diego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
755
- "score": 0.4032,
756
- "source": "pentatonic-memory",
757
- "doc_id": "chat-deadline1-cirrus"
758
- },
759
- {
760
- "text": "Diego: I'll own project Cirrus. Kickoff this week.",
761
- "score": 0.3968,
762
- "source": "pentatonic-memory",
763
- "doc_id": "chat-assign-cirrus"
764
- }
765
- ],
766
- "correct": true,
767
- "score": 1.0,
768
- "grading_notes": "all substrings matched",
769
- "search_time_ms": 440.7094269990921,
770
- "generation_time_ms": 0.0,
771
- "tokens_in": 0,
772
- "tokens_out": 0,
773
- "retrieval_tokens": 78,
774
- "query_tokens": 11,
775
- "context_tokens": 67,
776
- "judge_tokens_in": 0,
777
- "judge_tokens_out": 0,
778
- "judge_latency_ms": 0.0
779
- },
780
- {
781
- "task_id": "multihop-dune",
782
- "query": "Who owns Dune and what is its current deadline?",
783
- "answer": "Farid: I'll own project Dune. Kickoff this week.\n---\nFarid: Target delivery for Dune is 2026-05-20. Please pencil it in.\n---\nFarid: Dune status \u2014 launched.",
784
- "hits": [
785
- {
786
- "text": "Farid: I'll own project Dune. Kickoff this week.",
787
- "score": 0.4098,
788
- "source": "pentatonic-memory",
789
- "doc_id": "chat-assign-dune"
790
- },
791
- {
792
- "text": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.",
793
- "score": 0.4032,
794
- "source": "pentatonic-memory",
795
- "doc_id": "chat-deadline1-dune"
796
- },
797
- {
798
- "text": "Farid: Dune status \u2014 launched.",
799
- "score": 0.3968,
800
- "source": "pentatonic-memory",
801
- "doc_id": "status-dune-m4"
802
- }
803
- ],
804
- "correct": true,
805
- "score": 1.0,
806
- "grading_notes": "all substrings matched",
807
- "search_time_ms": 446.82001500041224,
808
- "generation_time_ms": 0.0,
809
- "tokens_in": 0,
810
- "tokens_out": 0,
811
- "retrieval_tokens": 59,
812
- "query_tokens": 11,
813
- "context_tokens": 48,
814
- "judge_tokens_in": 0,
815
- "judge_tokens_out": 0,
816
- "judge_latency_ms": 0.0
817
- }
818
- ]
819
- }