@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bin/cli.js +1 -1
- package/bin/commands/config.js +1 -1
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +2 -2
- package/packages/doctor/src/checks/local-memory.js +2 -2
- package/packages/memory/README.md +2 -2
- package/packages/memory/openclaw-plugin/README.md +2 -2
- package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
- package/packages/memory/src/server.js +2 -2
- package/packages/memory-engine-v2/.env.example +30 -0
- package/packages/memory-engine-v2/README.md +125 -0
- package/packages/memory-engine-v2/compat/Dockerfile +11 -0
- package/packages/memory-engine-v2/compat/requirements.txt +6 -0
- package/packages/memory-engine-v2/compat/server.py +1047 -0
- package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
- package/packages/memory-engine-v2/docker-compose.yml +206 -0
- package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
- package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
- package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
- package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
- package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
- package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
- package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
- package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
- package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
- package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
- package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
- package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
- package/packages/memory-engine/.env.example +0 -13
- package/packages/memory-engine/MIGRATION.md +0 -219
- package/packages/memory-engine/README.md +0 -145
- package/packages/memory-engine/bench/README.md +0 -99
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
- package/packages/memory-engine/compat/Dockerfile +0 -22
- package/packages/memory-engine/compat/server.py +0 -1255
- package/packages/memory-engine/docker-compose.test.yml +0 -59
- package/packages/memory-engine/docker-compose.yml +0 -255
- package/packages/memory-engine/engine/README.md +0 -52
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
- package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
- package/packages/memory-engine/engine/l6-document-store.py +0 -1018
- package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
- package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -562
- package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
- package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
- package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
- package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
- package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +0 -129
- package/packages/memory-engine/pme_memory/artifacts.py +0 -95
- package/packages/memory-engine/pme_memory/embed.py +0 -74
- package/packages/memory-engine/pme_memory/health.py +0 -36
- package/packages/memory-engine/pme_memory/hygiene.py +0 -159
- package/packages/memory-engine/pme_memory/indexer.py +0 -200
- package/packages/memory-engine/pme_memory/needs.py +0 -55
- package/packages/memory-engine/pme_memory/provenance.py +0 -80
- package/packages/memory-engine/pme_memory/scoring.py +0 -168
- package/packages/memory-engine/pme_memory/search.py +0 -52
- package/packages/memory-engine/pme_memory/store.py +0 -86
- package/packages/memory-engine/pme_memory/synthesis.py +0 -114
- package/packages/memory-engine/pyproject.toml +0 -65
- package/packages/memory-engine/scripts/kg-extractor.py +0 -557
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
- package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
- package/packages/memory-engine/tests/e2e_arena.sh +0 -259
- package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
- package/packages/memory-engine/tests/embed_stub/server.py +0 -80
- package/packages/memory-engine/tests/test_aggregate.py +0 -333
- package/packages/memory-engine/tests/test_api_contract.sh +0 -57
- package/packages/memory-engine/tests/test_arena_safety.py +0 -232
- package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
- package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
- package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
- package/packages/memory-engine/tests/test_embed_provider.py +0 -693
- package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
- package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
- package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
- package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
|
@@ -1,542 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"bench": "chat-recall",
|
|
3
|
-
"stack": "pentatonic-memory",
|
|
4
|
-
"n_tasks": 16,
|
|
5
|
-
"n_correct": 2,
|
|
6
|
-
"accuracy": 0.125,
|
|
7
|
-
"mean_score": 0.15625,
|
|
8
|
-
"p50_search_ms": 25.818128502578475,
|
|
9
|
-
"p95_search_ms": 31.491656991420314,
|
|
10
|
-
"total_tokens_in": 0,
|
|
11
|
-
"total_tokens_out": 0,
|
|
12
|
-
"total_usd": 0.0,
|
|
13
|
-
"by_tag": {
|
|
14
|
-
"factoid": {
|
|
15
|
-
"n": 6,
|
|
16
|
-
"mean_score": 0.16666666666666666,
|
|
17
|
-
"accuracy": 0.16666666666666666
|
|
18
|
-
},
|
|
19
|
-
"owner": {
|
|
20
|
-
"n": 4,
|
|
21
|
-
"mean_score": 0.25,
|
|
22
|
-
"accuracy": 0.25
|
|
23
|
-
},
|
|
24
|
-
"temporal": {
|
|
25
|
-
"n": 4,
|
|
26
|
-
"mean_score": 0.0,
|
|
27
|
-
"accuracy": 0.0
|
|
28
|
-
},
|
|
29
|
-
"contradiction": {
|
|
30
|
-
"n": 2,
|
|
31
|
-
"mean_score": 0.0,
|
|
32
|
-
"accuracy": 0.0
|
|
33
|
-
},
|
|
34
|
-
"status": {
|
|
35
|
-
"n": 4,
|
|
36
|
-
"mean_score": 0.25,
|
|
37
|
-
"accuracy": 0.25
|
|
38
|
-
},
|
|
39
|
-
"multi-hop": {
|
|
40
|
-
"n": 4,
|
|
41
|
-
"mean_score": 0.125,
|
|
42
|
-
"accuracy": 0.0
|
|
43
|
-
}
|
|
44
|
-
},
|
|
45
|
-
"extra": {
|
|
46
|
-
"ingest_ms": 16568.97608199506,
|
|
47
|
-
"grading": "substring",
|
|
48
|
-
"limit": 3,
|
|
49
|
-
"tokens": {
|
|
50
|
-
"corpus_tokens": 513,
|
|
51
|
-
"query_tokens": 140,
|
|
52
|
-
"context_tokens": 49,
|
|
53
|
-
"retrieval_tokens": 189,
|
|
54
|
-
"naive_tokens": 8348,
|
|
55
|
-
"saved_tokens": 8159,
|
|
56
|
-
"reduction_pct": 0.9773598466698611,
|
|
57
|
-
"mean_retrieval_tokens_per_task": 11.8125,
|
|
58
|
-
"tokenizer": "cl100k_base",
|
|
59
|
-
"per_task": {
|
|
60
|
-
"who-owns-atlas": {
|
|
61
|
-
"query": 5,
|
|
62
|
-
"context": 0,
|
|
63
|
-
"retrieval": 5,
|
|
64
|
-
"judge_in": 0,
|
|
65
|
-
"judge_out": 0,
|
|
66
|
-
"judge_latency_ms": 0.0
|
|
67
|
-
},
|
|
68
|
-
"who-owns-borealis": {
|
|
69
|
-
"query": 7,
|
|
70
|
-
"context": 0,
|
|
71
|
-
"retrieval": 7,
|
|
72
|
-
"judge_in": 0,
|
|
73
|
-
"judge_out": 0,
|
|
74
|
-
"judge_latency_ms": 0.0
|
|
75
|
-
},
|
|
76
|
-
"who-owns-cirrus": {
|
|
77
|
-
"query": 6,
|
|
78
|
-
"context": 0,
|
|
79
|
-
"retrieval": 6,
|
|
80
|
-
"judge_in": 0,
|
|
81
|
-
"judge_out": 0,
|
|
82
|
-
"judge_latency_ms": 0.0
|
|
83
|
-
},
|
|
84
|
-
"who-owns-dune": {
|
|
85
|
-
"query": 6,
|
|
86
|
-
"context": 15,
|
|
87
|
-
"retrieval": 21,
|
|
88
|
-
"judge_in": 0,
|
|
89
|
-
"judge_out": 0,
|
|
90
|
-
"judge_latency_ms": 0.0
|
|
91
|
-
},
|
|
92
|
-
"current-deadline-atlas": {
|
|
93
|
-
"query": 8,
|
|
94
|
-
"context": 0,
|
|
95
|
-
"retrieval": 8,
|
|
96
|
-
"judge_in": 0,
|
|
97
|
-
"judge_out": 0,
|
|
98
|
-
"judge_latency_ms": 0.0
|
|
99
|
-
},
|
|
100
|
-
"current-deadline-borealis": {
|
|
101
|
-
"query": 10,
|
|
102
|
-
"context": 0,
|
|
103
|
-
"retrieval": 10,
|
|
104
|
-
"judge_in": 0,
|
|
105
|
-
"judge_out": 0,
|
|
106
|
-
"judge_latency_ms": 0.0
|
|
107
|
-
},
|
|
108
|
-
"current-deadline-cirrus": {
|
|
109
|
-
"query": 9,
|
|
110
|
-
"context": 0,
|
|
111
|
-
"retrieval": 9,
|
|
112
|
-
"judge_in": 0,
|
|
113
|
-
"judge_out": 0,
|
|
114
|
-
"judge_latency_ms": 0.0
|
|
115
|
-
},
|
|
116
|
-
"current-deadline-dune": {
|
|
117
|
-
"query": 9,
|
|
118
|
-
"context": 0,
|
|
119
|
-
"retrieval": 9,
|
|
120
|
-
"judge_in": 0,
|
|
121
|
-
"judge_out": 0,
|
|
122
|
-
"judge_latency_ms": 0.0
|
|
123
|
-
},
|
|
124
|
-
"status-atlas": {
|
|
125
|
-
"query": 8,
|
|
126
|
-
"context": 0,
|
|
127
|
-
"retrieval": 8,
|
|
128
|
-
"judge_in": 0,
|
|
129
|
-
"judge_out": 0,
|
|
130
|
-
"judge_latency_ms": 0.0
|
|
131
|
-
},
|
|
132
|
-
"status-borealis": {
|
|
133
|
-
"query": 10,
|
|
134
|
-
"context": 0,
|
|
135
|
-
"retrieval": 10,
|
|
136
|
-
"judge_in": 0,
|
|
137
|
-
"judge_out": 0,
|
|
138
|
-
"judge_latency_ms": 0.0
|
|
139
|
-
},
|
|
140
|
-
"status-cirrus": {
|
|
141
|
-
"query": 9,
|
|
142
|
-
"context": 0,
|
|
143
|
-
"retrieval": 9,
|
|
144
|
-
"judge_in": 0,
|
|
145
|
-
"judge_out": 0,
|
|
146
|
-
"judge_latency_ms": 0.0
|
|
147
|
-
},
|
|
148
|
-
"status-dune": {
|
|
149
|
-
"query": 9,
|
|
150
|
-
"context": 9,
|
|
151
|
-
"retrieval": 18,
|
|
152
|
-
"judge_in": 0,
|
|
153
|
-
"judge_out": 0,
|
|
154
|
-
"judge_latency_ms": 0.0
|
|
155
|
-
},
|
|
156
|
-
"multihop-atlas": {
|
|
157
|
-
"query": 10,
|
|
158
|
-
"context": 0,
|
|
159
|
-
"retrieval": 10,
|
|
160
|
-
"judge_in": 0,
|
|
161
|
-
"judge_out": 0,
|
|
162
|
-
"judge_latency_ms": 0.0
|
|
163
|
-
},
|
|
164
|
-
"multihop-borealis": {
|
|
165
|
-
"query": 12,
|
|
166
|
-
"context": 0,
|
|
167
|
-
"retrieval": 12,
|
|
168
|
-
"judge_in": 0,
|
|
169
|
-
"judge_out": 0,
|
|
170
|
-
"judge_latency_ms": 0.0
|
|
171
|
-
},
|
|
172
|
-
"multihop-cirrus": {
|
|
173
|
-
"query": 11,
|
|
174
|
-
"context": 0,
|
|
175
|
-
"retrieval": 11,
|
|
176
|
-
"judge_in": 0,
|
|
177
|
-
"judge_out": 0,
|
|
178
|
-
"judge_latency_ms": 0.0
|
|
179
|
-
},
|
|
180
|
-
"multihop-dune": {
|
|
181
|
-
"query": 11,
|
|
182
|
-
"context": 25,
|
|
183
|
-
"retrieval": 36,
|
|
184
|
-
"judge_in": 0,
|
|
185
|
-
"judge_out": 0,
|
|
186
|
-
"judge_latency_ms": 0.0
|
|
187
|
-
}
|
|
188
|
-
},
|
|
189
|
-
"judge_tokens_in": 0,
|
|
190
|
-
"judge_tokens_out": 0,
|
|
191
|
-
"judge_calls": 0,
|
|
192
|
-
"judge_mean_latency_ms": 0.0
|
|
193
|
-
},
|
|
194
|
-
"cost_usd": {
|
|
195
|
-
"assumed_completion_tokens_per_task": 100,
|
|
196
|
-
"rates": {
|
|
197
|
-
"input_per_1k": 0.0025,
|
|
198
|
-
"output_per_1k": 0.01,
|
|
199
|
-
"model": "gpt-4o"
|
|
200
|
-
},
|
|
201
|
-
"retrieval_usd_in": 0.00047250000000000005,
|
|
202
|
-
"retrieval_usd_out": 0.016,
|
|
203
|
-
"retrieval_usd_total": 0.0164725,
|
|
204
|
-
"naive_usd_total": 0.03687,
|
|
205
|
-
"saved_usd": 0.0203975,
|
|
206
|
-
"saved_usd_per_1k_tasks": 1.2748437499999998
|
|
207
|
-
}
|
|
208
|
-
},
|
|
209
|
-
"task_results": [
|
|
210
|
-
{
|
|
211
|
-
"task_id": "who-owns-atlas",
|
|
212
|
-
"query": "Who owns project Atlas?",
|
|
213
|
-
"answer": "",
|
|
214
|
-
"hits": [],
|
|
215
|
-
"correct": false,
|
|
216
|
-
"score": 0.0,
|
|
217
|
-
"grading_notes": "missing 1/1: ['Alice']",
|
|
218
|
-
"search_time_ms": 31.491656991420314,
|
|
219
|
-
"generation_time_ms": 0.0,
|
|
220
|
-
"tokens_in": 0,
|
|
221
|
-
"tokens_out": 0,
|
|
222
|
-
"retrieval_tokens": 5,
|
|
223
|
-
"query_tokens": 5,
|
|
224
|
-
"context_tokens": 0,
|
|
225
|
-
"judge_tokens_in": 0,
|
|
226
|
-
"judge_tokens_out": 0,
|
|
227
|
-
"judge_latency_ms": 0.0
|
|
228
|
-
},
|
|
229
|
-
{
|
|
230
|
-
"task_id": "who-owns-borealis",
|
|
231
|
-
"query": "Who owns project Borealis?",
|
|
232
|
-
"answer": "",
|
|
233
|
-
"hits": [],
|
|
234
|
-
"correct": false,
|
|
235
|
-
"score": 0.0,
|
|
236
|
-
"grading_notes": "missing 1/1: ['Clara']",
|
|
237
|
-
"search_time_ms": 23.22632700088434,
|
|
238
|
-
"generation_time_ms": 0.0,
|
|
239
|
-
"tokens_in": 0,
|
|
240
|
-
"tokens_out": 0,
|
|
241
|
-
"retrieval_tokens": 7,
|
|
242
|
-
"query_tokens": 7,
|
|
243
|
-
"context_tokens": 0,
|
|
244
|
-
"judge_tokens_in": 0,
|
|
245
|
-
"judge_tokens_out": 0,
|
|
246
|
-
"judge_latency_ms": 0.0
|
|
247
|
-
},
|
|
248
|
-
{
|
|
249
|
-
"task_id": "who-owns-cirrus",
|
|
250
|
-
"query": "Who owns project Cirrus?",
|
|
251
|
-
"answer": "",
|
|
252
|
-
"hits": [],
|
|
253
|
-
"correct": false,
|
|
254
|
-
"score": 0.0,
|
|
255
|
-
"grading_notes": "missing 1/1: ['Diego']",
|
|
256
|
-
"search_time_ms": 25.621167005738243,
|
|
257
|
-
"generation_time_ms": 0.0,
|
|
258
|
-
"tokens_in": 0,
|
|
259
|
-
"tokens_out": 0,
|
|
260
|
-
"retrieval_tokens": 6,
|
|
261
|
-
"query_tokens": 6,
|
|
262
|
-
"context_tokens": 0,
|
|
263
|
-
"judge_tokens_in": 0,
|
|
264
|
-
"judge_tokens_out": 0,
|
|
265
|
-
"judge_latency_ms": 0.0
|
|
266
|
-
},
|
|
267
|
-
{
|
|
268
|
-
"task_id": "who-owns-dune",
|
|
269
|
-
"query": "Who owns project Dune?",
|
|
270
|
-
"answer": "Farid: I'll own project Dune. Kickoff this week.",
|
|
271
|
-
"hits": [
|
|
272
|
-
{
|
|
273
|
-
"text": "Farid: I'll own project Dune. Kickoff this week.",
|
|
274
|
-
"score": 0.5120010814403368,
|
|
275
|
-
"source": "pentatonic-memory",
|
|
276
|
-
"doc_id": "chat-assign-dune"
|
|
277
|
-
}
|
|
278
|
-
],
|
|
279
|
-
"correct": true,
|
|
280
|
-
"score": 1.0,
|
|
281
|
-
"grading_notes": "all substrings matched",
|
|
282
|
-
"search_time_ms": 26.015089999418706,
|
|
283
|
-
"generation_time_ms": 0.0,
|
|
284
|
-
"tokens_in": 0,
|
|
285
|
-
"tokens_out": 0,
|
|
286
|
-
"retrieval_tokens": 21,
|
|
287
|
-
"query_tokens": 6,
|
|
288
|
-
"context_tokens": 15,
|
|
289
|
-
"judge_tokens_in": 0,
|
|
290
|
-
"judge_tokens_out": 0,
|
|
291
|
-
"judge_latency_ms": 0.0
|
|
292
|
-
},
|
|
293
|
-
{
|
|
294
|
-
"task_id": "current-deadline-atlas",
|
|
295
|
-
"query": "What is the current deadline for Atlas?",
|
|
296
|
-
"answer": "",
|
|
297
|
-
"hits": [],
|
|
298
|
-
"correct": false,
|
|
299
|
-
"score": 0.0,
|
|
300
|
-
"grading_notes": "missing 1/1: ['2026-03-17']",
|
|
301
|
-
"search_time_ms": 24.67792498646304,
|
|
302
|
-
"generation_time_ms": 0.0,
|
|
303
|
-
"tokens_in": 0,
|
|
304
|
-
"tokens_out": 0,
|
|
305
|
-
"retrieval_tokens": 8,
|
|
306
|
-
"query_tokens": 8,
|
|
307
|
-
"context_tokens": 0,
|
|
308
|
-
"judge_tokens_in": 0,
|
|
309
|
-
"judge_tokens_out": 0,
|
|
310
|
-
"judge_latency_ms": 0.0
|
|
311
|
-
},
|
|
312
|
-
{
|
|
313
|
-
"task_id": "current-deadline-borealis",
|
|
314
|
-
"query": "What is the current deadline for Borealis?",
|
|
315
|
-
"answer": "",
|
|
316
|
-
"hits": [],
|
|
317
|
-
"correct": false,
|
|
318
|
-
"score": 0.0,
|
|
319
|
-
"grading_notes": "missing 1/1: ['2026-02-28']",
|
|
320
|
-
"search_time_ms": 25.36684399819933,
|
|
321
|
-
"generation_time_ms": 0.0,
|
|
322
|
-
"tokens_in": 0,
|
|
323
|
-
"tokens_out": 0,
|
|
324
|
-
"retrieval_tokens": 10,
|
|
325
|
-
"query_tokens": 10,
|
|
326
|
-
"context_tokens": 0,
|
|
327
|
-
"judge_tokens_in": 0,
|
|
328
|
-
"judge_tokens_out": 0,
|
|
329
|
-
"judge_latency_ms": 0.0
|
|
330
|
-
},
|
|
331
|
-
{
|
|
332
|
-
"task_id": "current-deadline-cirrus",
|
|
333
|
-
"query": "What is the current deadline for Cirrus?",
|
|
334
|
-
"answer": "",
|
|
335
|
-
"hits": [],
|
|
336
|
-
"correct": false,
|
|
337
|
-
"score": 0.0,
|
|
338
|
-
"grading_notes": "missing 1/1: ['2026-04-08']",
|
|
339
|
-
"search_time_ms": 26.766681025037542,
|
|
340
|
-
"generation_time_ms": 0.0,
|
|
341
|
-
"tokens_in": 0,
|
|
342
|
-
"tokens_out": 0,
|
|
343
|
-
"retrieval_tokens": 9,
|
|
344
|
-
"query_tokens": 9,
|
|
345
|
-
"context_tokens": 0,
|
|
346
|
-
"judge_tokens_in": 0,
|
|
347
|
-
"judge_tokens_out": 0,
|
|
348
|
-
"judge_latency_ms": 0.0
|
|
349
|
-
},
|
|
350
|
-
{
|
|
351
|
-
"task_id": "current-deadline-dune",
|
|
352
|
-
"query": "What is the current deadline for Dune?",
|
|
353
|
-
"answer": "",
|
|
354
|
-
"hits": [],
|
|
355
|
-
"correct": false,
|
|
356
|
-
"score": 0.0,
|
|
357
|
-
"grading_notes": "missing 1/1: ['2026-05-20']",
|
|
358
|
-
"search_time_ms": 26.705369004048407,
|
|
359
|
-
"generation_time_ms": 0.0,
|
|
360
|
-
"tokens_in": 0,
|
|
361
|
-
"tokens_out": 0,
|
|
362
|
-
"retrieval_tokens": 9,
|
|
363
|
-
"query_tokens": 9,
|
|
364
|
-
"context_tokens": 0,
|
|
365
|
-
"judge_tokens_in": 0,
|
|
366
|
-
"judge_tokens_out": 0,
|
|
367
|
-
"judge_latency_ms": 0.0
|
|
368
|
-
},
|
|
369
|
-
{
|
|
370
|
-
"task_id": "status-atlas",
|
|
371
|
-
"query": "What's the latest status of Atlas?",
|
|
372
|
-
"answer": "",
|
|
373
|
-
"hits": [],
|
|
374
|
-
"correct": false,
|
|
375
|
-
"score": 0.0,
|
|
376
|
-
"grading_notes": "missing 1/1: ['on track']",
|
|
377
|
-
"search_time_ms": 27.433937008026987,
|
|
378
|
-
"generation_time_ms": 0.0,
|
|
379
|
-
"tokens_in": 0,
|
|
380
|
-
"tokens_out": 0,
|
|
381
|
-
"retrieval_tokens": 8,
|
|
382
|
-
"query_tokens": 8,
|
|
383
|
-
"context_tokens": 0,
|
|
384
|
-
"judge_tokens_in": 0,
|
|
385
|
-
"judge_tokens_out": 0,
|
|
386
|
-
"judge_latency_ms": 0.0
|
|
387
|
-
},
|
|
388
|
-
{
|
|
389
|
-
"task_id": "status-borealis",
|
|
390
|
-
"query": "What's the latest status of Borealis?",
|
|
391
|
-
"answer": "",
|
|
392
|
-
"hits": [],
|
|
393
|
-
"correct": false,
|
|
394
|
-
"score": 0.0,
|
|
395
|
-
"grading_notes": "missing 1/1: ['blocked on vendor']",
|
|
396
|
-
"search_time_ms": 29.91680899867788,
|
|
397
|
-
"generation_time_ms": 0.0,
|
|
398
|
-
"tokens_in": 0,
|
|
399
|
-
"tokens_out": 0,
|
|
400
|
-
"retrieval_tokens": 10,
|
|
401
|
-
"query_tokens": 10,
|
|
402
|
-
"context_tokens": 0,
|
|
403
|
-
"judge_tokens_in": 0,
|
|
404
|
-
"judge_tokens_out": 0,
|
|
405
|
-
"judge_latency_ms": 0.0
|
|
406
|
-
},
|
|
407
|
-
{
|
|
408
|
-
"task_id": "status-cirrus",
|
|
409
|
-
"query": "What's the latest status of Cirrus?",
|
|
410
|
-
"answer": "",
|
|
411
|
-
"hits": [],
|
|
412
|
-
"correct": false,
|
|
413
|
-
"score": 0.0,
|
|
414
|
-
"grading_notes": "missing 1/1: ['scoping']",
|
|
415
|
-
"search_time_ms": 25.178106006933376,
|
|
416
|
-
"generation_time_ms": 0.0,
|
|
417
|
-
"tokens_in": 0,
|
|
418
|
-
"tokens_out": 0,
|
|
419
|
-
"retrieval_tokens": 9,
|
|
420
|
-
"query_tokens": 9,
|
|
421
|
-
"context_tokens": 0,
|
|
422
|
-
"judge_tokens_in": 0,
|
|
423
|
-
"judge_tokens_out": 0,
|
|
424
|
-
"judge_latency_ms": 0.0
|
|
425
|
-
},
|
|
426
|
-
{
|
|
427
|
-
"task_id": "status-dune",
|
|
428
|
-
"query": "What's the latest status of Dune?",
|
|
429
|
-
"answer": "Farid: Dune status \u2014 launched.",
|
|
430
|
-
"hits": [
|
|
431
|
-
{
|
|
432
|
-
"text": "Farid: Dune status \u2014 launched.",
|
|
433
|
-
"score": 0.5271684290425744,
|
|
434
|
-
"source": "pentatonic-memory",
|
|
435
|
-
"doc_id": "status-dune-m4"
|
|
436
|
-
}
|
|
437
|
-
],
|
|
438
|
-
"correct": true,
|
|
439
|
-
"score": 1.0,
|
|
440
|
-
"grading_notes": "all substrings matched",
|
|
441
|
-
"search_time_ms": 23.801564006134868,
|
|
442
|
-
"generation_time_ms": 0.0,
|
|
443
|
-
"tokens_in": 0,
|
|
444
|
-
"tokens_out": 0,
|
|
445
|
-
"retrieval_tokens": 18,
|
|
446
|
-
"query_tokens": 9,
|
|
447
|
-
"context_tokens": 9,
|
|
448
|
-
"judge_tokens_in": 0,
|
|
449
|
-
"judge_tokens_out": 0,
|
|
450
|
-
"judge_latency_ms": 0.0
|
|
451
|
-
},
|
|
452
|
-
{
|
|
453
|
-
"task_id": "multihop-atlas",
|
|
454
|
-
"query": "Who owns Atlas and what is its current deadline?",
|
|
455
|
-
"answer": "",
|
|
456
|
-
"hits": [],
|
|
457
|
-
"correct": false,
|
|
458
|
-
"score": 0.0,
|
|
459
|
-
"grading_notes": "missing 2/2: ['Alice', '2026-03-17']",
|
|
460
|
-
"search_time_ms": 22.88174699060619,
|
|
461
|
-
"generation_time_ms": 0.0,
|
|
462
|
-
"tokens_in": 0,
|
|
463
|
-
"tokens_out": 0,
|
|
464
|
-
"retrieval_tokens": 10,
|
|
465
|
-
"query_tokens": 10,
|
|
466
|
-
"context_tokens": 0,
|
|
467
|
-
"judge_tokens_in": 0,
|
|
468
|
-
"judge_tokens_out": 0,
|
|
469
|
-
"judge_latency_ms": 0.0
|
|
470
|
-
},
|
|
471
|
-
{
|
|
472
|
-
"task_id": "multihop-borealis",
|
|
473
|
-
"query": "Who owns Borealis and what is its current deadline?",
|
|
474
|
-
"answer": "",
|
|
475
|
-
"hits": [],
|
|
476
|
-
"correct": false,
|
|
477
|
-
"score": 0.0,
|
|
478
|
-
"grading_notes": "missing 2/2: ['Clara', '2026-02-28']",
|
|
479
|
-
"search_time_ms": 22.36511799856089,
|
|
480
|
-
"generation_time_ms": 0.0,
|
|
481
|
-
"tokens_in": 0,
|
|
482
|
-
"tokens_out": 0,
|
|
483
|
-
"retrieval_tokens": 12,
|
|
484
|
-
"query_tokens": 12,
|
|
485
|
-
"context_tokens": 0,
|
|
486
|
-
"judge_tokens_in": 0,
|
|
487
|
-
"judge_tokens_out": 0,
|
|
488
|
-
"judge_latency_ms": 0.0
|
|
489
|
-
},
|
|
490
|
-
{
|
|
491
|
-
"task_id": "multihop-cirrus",
|
|
492
|
-
"query": "Who owns Cirrus and what is its current deadline?",
|
|
493
|
-
"answer": "",
|
|
494
|
-
"hits": [],
|
|
495
|
-
"correct": false,
|
|
496
|
-
"score": 0.0,
|
|
497
|
-
"grading_notes": "missing 2/2: ['Diego', '2026-04-08']",
|
|
498
|
-
"search_time_ms": 27.58819400332868,
|
|
499
|
-
"generation_time_ms": 0.0,
|
|
500
|
-
"tokens_in": 0,
|
|
501
|
-
"tokens_out": 0,
|
|
502
|
-
"retrieval_tokens": 11,
|
|
503
|
-
"query_tokens": 11,
|
|
504
|
-
"context_tokens": 0,
|
|
505
|
-
"judge_tokens_in": 0,
|
|
506
|
-
"judge_tokens_out": 0,
|
|
507
|
-
"judge_latency_ms": 0.0
|
|
508
|
-
},
|
|
509
|
-
{
|
|
510
|
-
"task_id": "multihop-dune",
|
|
511
|
-
"query": "Who owns Dune and what is its current deadline?",
|
|
512
|
-
"answer": "Farid: I'll own project Dune. Kickoff this week.\n---\nFarid: Dune status \u2014 launched.",
|
|
513
|
-
"hits": [
|
|
514
|
-
{
|
|
515
|
-
"text": "Farid: I'll own project Dune. Kickoff this week.",
|
|
516
|
-
"score": 0.5420836484839977,
|
|
517
|
-
"source": "pentatonic-memory",
|
|
518
|
-
"doc_id": "chat-assign-dune"
|
|
519
|
-
},
|
|
520
|
-
{
|
|
521
|
-
"text": "Farid: Dune status \u2014 launched.",
|
|
522
|
-
"score": 0.5371697829805622,
|
|
523
|
-
"source": "pentatonic-memory",
|
|
524
|
-
"doc_id": "status-dune-m4"
|
|
525
|
-
}
|
|
526
|
-
],
|
|
527
|
-
"correct": false,
|
|
528
|
-
"score": 0.5,
|
|
529
|
-
"grading_notes": "missing 1/2: ['2026-05-20']",
|
|
530
|
-
"search_time_ms": 26.42112597823143,
|
|
531
|
-
"generation_time_ms": 0.0,
|
|
532
|
-
"tokens_in": 0,
|
|
533
|
-
"tokens_out": 0,
|
|
534
|
-
"retrieval_tokens": 36,
|
|
535
|
-
"query_tokens": 11,
|
|
536
|
-
"context_tokens": 25,
|
|
537
|
-
"judge_tokens_in": 0,
|
|
538
|
-
"judge_tokens_out": 0,
|
|
539
|
-
"judge_latency_ms": 0.0
|
|
540
|
-
}
|
|
541
|
-
]
|
|
542
|
-
}
|