observability-toolkit 1.8.0 → 1.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -3
- package/dist/backends/index.d.ts +119 -0
- package/dist/backends/index.d.ts.map +1 -1
- package/dist/backends/index.js +57 -0
- package/dist/backends/index.js.map +1 -1
- package/dist/backends/index.test.d.ts +5 -0
- package/dist/backends/index.test.d.ts.map +1 -0
- package/dist/backends/index.test.js +156 -0
- package/dist/backends/index.test.js.map +1 -0
- package/dist/backends/local-jsonl-boolean-search.test.js +8 -27
- package/dist/backends/local-jsonl-boolean-search.test.js.map +1 -1
- package/dist/backends/local-jsonl-logs.test.d.ts +2 -0
- package/dist/backends/local-jsonl-logs.test.d.ts.map +1 -0
- package/dist/backends/local-jsonl-logs.test.js +603 -0
- package/dist/backends/local-jsonl-logs.test.js.map +1 -0
- package/dist/backends/local-jsonl-traces.test.d.ts +2 -0
- package/dist/backends/local-jsonl-traces.test.d.ts.map +1 -0
- package/dist/backends/local-jsonl-traces.test.js +1723 -0
- package/dist/backends/local-jsonl-traces.test.js.map +1 -0
- package/dist/backends/local-jsonl.d.ts +4 -1
- package/dist/backends/local-jsonl.d.ts.map +1 -1
- package/dist/backends/local-jsonl.js +185 -1
- package/dist/backends/local-jsonl.js.map +1 -1
- package/dist/backends/local-jsonl.test.js +723 -46
- package/dist/backends/local-jsonl.test.js.map +1 -1
- package/dist/backends/signoz-api.d.ts +32 -0
- package/dist/backends/signoz-api.d.ts.map +1 -1
- package/dist/backends/signoz-api.js +231 -33
- package/dist/backends/signoz-api.js.map +1 -1
- package/dist/backends/signoz-api.test.js +410 -63
- package/dist/backends/signoz-api.test.js.map +1 -1
- package/dist/lib/constants.d.ts +59 -0
- package/dist/lib/constants.d.ts.map +1 -1
- package/dist/lib/constants.js +252 -6
- package/dist/lib/constants.js.map +1 -1
- package/dist/lib/constants.test.js +357 -21
- package/dist/lib/constants.test.js.map +1 -1
- package/dist/lib/edge-cases.test.d.ts +11 -0
- package/dist/lib/edge-cases.test.d.ts.map +1 -0
- package/dist/lib/edge-cases.test.js +634 -0
- package/dist/lib/edge-cases.test.js.map +1 -0
- package/dist/lib/error-sanitizer.d.ts +57 -0
- package/dist/lib/error-sanitizer.d.ts.map +1 -0
- package/dist/lib/error-sanitizer.js +207 -0
- package/dist/lib/error-sanitizer.js.map +1 -0
- package/dist/lib/error-sanitizer.test.d.ts +8 -0
- package/dist/lib/error-sanitizer.test.d.ts.map +1 -0
- package/dist/lib/error-sanitizer.test.js +369 -0
- package/dist/lib/error-sanitizer.test.js.map +1 -0
- package/dist/lib/file-utils.d.ts +134 -0
- package/dist/lib/file-utils.d.ts.map +1 -1
- package/dist/lib/file-utils.js +395 -9
- package/dist/lib/file-utils.js.map +1 -1
- package/dist/lib/file-utils.test.js +444 -3
- package/dist/lib/file-utils.test.js.map +1 -1
- package/dist/lib/indexer.d.ts +9 -1
- package/dist/lib/indexer.d.ts.map +1 -1
- package/dist/lib/indexer.js +51 -2
- package/dist/lib/indexer.js.map +1 -1
- package/dist/lib/indexer.test.js +138 -20
- package/dist/lib/indexer.test.js.map +1 -1
- package/dist/lib/input-validator.d.ts +103 -0
- package/dist/lib/input-validator.d.ts.map +1 -0
- package/dist/lib/input-validator.js +250 -0
- package/dist/lib/input-validator.js.map +1 -0
- package/dist/lib/input-validator.test.d.ts +2 -0
- package/dist/lib/input-validator.test.d.ts.map +1 -0
- package/dist/lib/input-validator.test.js +287 -0
- package/dist/lib/input-validator.test.js.map +1 -0
- package/dist/lib/query-sanitizer.d.ts +143 -0
- package/dist/lib/query-sanitizer.d.ts.map +1 -0
- package/dist/lib/query-sanitizer.js +261 -0
- package/dist/lib/query-sanitizer.js.map +1 -0
- package/dist/lib/query-sanitizer.test.d.ts +5 -0
- package/dist/lib/query-sanitizer.test.d.ts.map +1 -0
- package/dist/lib/query-sanitizer.test.js +400 -0
- package/dist/lib/query-sanitizer.test.js.map +1 -0
- package/dist/lib/server-utils.d.ts +80 -0
- package/dist/lib/server-utils.d.ts.map +1 -0
- package/dist/lib/server-utils.js +141 -0
- package/dist/lib/server-utils.js.map +1 -0
- package/dist/lib/shared-schemas.d.ts +59 -0
- package/dist/lib/shared-schemas.d.ts.map +1 -0
- package/dist/lib/shared-schemas.js +58 -0
- package/dist/lib/shared-schemas.js.map +1 -0
- package/dist/lib/shared-schemas.test.d.ts +5 -0
- package/dist/lib/shared-schemas.test.d.ts.map +1 -0
- package/dist/lib/shared-schemas.test.js +106 -0
- package/dist/lib/shared-schemas.test.js.map +1 -0
- package/dist/lib/toon-encoder.d.ts +21 -0
- package/dist/lib/toon-encoder.d.ts.map +1 -0
- package/dist/lib/toon-encoder.js +46 -0
- package/dist/lib/toon-encoder.js.map +1 -0
- package/dist/server.d.ts +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +155 -81
- package/dist/server.js.map +1 -1
- package/dist/server.test.js +363 -0
- package/dist/server.test.js.map +1 -1
- package/dist/test-helpers/env-utils.d.ts +65 -0
- package/dist/test-helpers/env-utils.d.ts.map +1 -0
- package/dist/test-helpers/env-utils.js +94 -0
- package/dist/test-helpers/env-utils.js.map +1 -0
- package/dist/test-helpers/file-utils.d.ts +93 -0
- package/dist/test-helpers/file-utils.d.ts.map +1 -0
- package/dist/test-helpers/file-utils.js +206 -0
- package/dist/test-helpers/file-utils.js.map +1 -0
- package/dist/test-helpers/index.d.ts +10 -0
- package/dist/test-helpers/index.d.ts.map +1 -0
- package/dist/test-helpers/index.js +28 -0
- package/dist/test-helpers/index.js.map +1 -0
- package/dist/test-helpers/mock-backends.d.ts +139 -0
- package/dist/test-helpers/mock-backends.d.ts.map +1 -0
- package/dist/test-helpers/mock-backends.js +227 -0
- package/dist/test-helpers/mock-backends.js.map +1 -0
- package/dist/test-helpers/mock-backends.test.d.ts +5 -0
- package/dist/test-helpers/mock-backends.test.d.ts.map +1 -0
- package/dist/test-helpers/mock-backends.test.js +368 -0
- package/dist/test-helpers/mock-backends.test.js.map +1 -0
- package/dist/test-helpers/schema-validators.d.ts +32 -0
- package/dist/test-helpers/schema-validators.d.ts.map +1 -0
- package/dist/test-helpers/schema-validators.js +125 -0
- package/dist/test-helpers/schema-validators.js.map +1 -0
- package/dist/test-helpers/test-data-builders.d.ts +223 -0
- package/dist/test-helpers/test-data-builders.d.ts.map +1 -0
- package/dist/test-helpers/test-data-builders.js +288 -0
- package/dist/test-helpers/test-data-builders.js.map +1 -0
- package/dist/test-helpers/test-data-builders.test.d.ts +2 -0
- package/dist/test-helpers/test-data-builders.test.d.ts.map +1 -0
- package/dist/test-helpers/test-data-builders.test.js +306 -0
- package/dist/test-helpers/test-data-builders.test.js.map +1 -0
- package/dist/test-helpers/tool-validators.d.ts +28 -0
- package/dist/test-helpers/tool-validators.d.ts.map +1 -0
- package/dist/test-helpers/tool-validators.js +56 -0
- package/dist/test-helpers/tool-validators.js.map +1 -0
- package/dist/tools/context-stats.d.ts +1 -0
- package/dist/tools/context-stats.d.ts.map +1 -1
- package/dist/tools/context-stats.js +9 -5
- package/dist/tools/context-stats.js.map +1 -1
- package/dist/tools/context-stats.test.js +24 -10
- package/dist/tools/context-stats.test.js.map +1 -1
- package/dist/tools/get-trace-url.js +2 -2
- package/dist/tools/get-trace-url.js.map +1 -1
- package/dist/tools/health-check.js +2 -2
- package/dist/tools/health-check.js.map +1 -1
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +1 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/query-evaluations.d.ts +186 -0
- package/dist/tools/query-evaluations.d.ts.map +1 -0
- package/dist/tools/query-evaluations.js +351 -0
- package/dist/tools/query-evaluations.js.map +1 -0
- package/dist/tools/query-evaluations.test.d.ts +5 -0
- package/dist/tools/query-evaluations.test.d.ts.map +1 -0
- package/dist/tools/query-evaluations.test.js +733 -0
- package/dist/tools/query-evaluations.test.js.map +1 -0
- package/dist/tools/query-llm-events.d.ts +24 -18
- package/dist/tools/query-llm-events.d.ts.map +1 -1
- package/dist/tools/query-llm-events.js +103 -60
- package/dist/tools/query-llm-events.js.map +1 -1
- package/dist/tools/query-llm-events.test.js +271 -9
- package/dist/tools/query-llm-events.test.js.map +1 -1
- package/dist/tools/query-logs.d.ts +28 -20
- package/dist/tools/query-logs.d.ts.map +1 -1
- package/dist/tools/query-logs.js +85 -61
- package/dist/tools/query-logs.js.map +1 -1
- package/dist/tools/query-logs.test.js +74 -145
- package/dist/tools/query-logs.test.js.map +1 -1
- package/dist/tools/query-metrics.d.ts +20 -20
- package/dist/tools/query-metrics.d.ts.map +1 -1
- package/dist/tools/query-metrics.js +109 -61
- package/dist/tools/query-metrics.js.map +1 -1
- package/dist/tools/query-metrics.test.js +26 -61
- package/dist/tools/query-metrics.test.js.map +1 -1
- package/dist/tools/query-traces.d.ts +24 -22
- package/dist/tools/query-traces.d.ts.map +1 -1
- package/dist/tools/query-traces.js +95 -70
- package/dist/tools/query-traces.js.map +1 -1
- package/dist/tools/query-traces.test.js +294 -90
- package/dist/tools/query-traces.test.js.map +1 -1
- package/dist/tools/setup-claudeignore.js +7 -7
- package/dist/tools/setup-claudeignore.js.map +1 -1
- package/dist/tools/setup-claudeignore.test.js +4 -25
- package/dist/tools/setup-claudeignore.test.js.map +1 -1
- package/package.json +3 -4
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for query-evaluations tool
|
|
3
|
+
*/
|
|
4
|
+
import { describe, it, beforeEach, afterEach, mock } from 'node:test';
|
|
5
|
+
import assert from 'node:assert';
|
|
6
|
+
import { queryEvaluations, queryEvaluationsSchema, queryEvaluationsTool } from './query-evaluations.js';
|
|
7
|
+
import { LocalJsonlBackend } from '../backends/local-jsonl.js';
|
|
8
|
+
import { validateToolDefinition } from '../test-helpers/tool-validators.js';
|
|
9
|
+
import { validateLimitSchema, validateDateRangeSchema } from '../test-helpers/schema-validators.js';
|
|
10
|
+
describe('queryEvaluations', () => {
|
|
11
|
+
// Use shared schema validators
|
|
12
|
+
validateLimitSchema(queryEvaluationsSchema, 'queryEvaluationsSchema', 50);
|
|
13
|
+
validateDateRangeSchema(queryEvaluationsSchema, 'queryEvaluationsSchema');
|
|
14
|
+
/**
 * Schema-level checks for queryEvaluationsSchema: the default limit,
 * pass-through of every optional filter supplied by the caller, and the
 * upper bound enforced on `limit`.
 */
describe('schema validation', () => {
    it('should accept empty input with defaults', () => {
        const result = queryEvaluationsSchema.parse({});
        assert.strictEqual(result.limit, 50);
    });
    it('should accept all optional parameters', () => {
        const input = {
            evaluationName: 'Relevance',
            scoreMin: 0.5,
            scoreMax: 1.0,
            scoreLabel: 'pass',
            responseId: 'resp-123',
            traceId: 'trace-123',
            sessionId: 'sess-123',
            startDate: '2026-01-28',
            endDate: '2026-01-28',
            limit: 100,
        };
        const result = queryEvaluationsSchema.parse(input);
        assert.strictEqual(result.evaluationName, 'Relevance');
        assert.strictEqual(result.scoreMin, 0.5);
        assert.strictEqual(result.scoreMax, 1.0);
        assert.strictEqual(result.scoreLabel, 'pass');
        // The correlation identifiers and an explicit limit must survive
        // parsing too; previously only four of the supplied fields were checked.
        assert.strictEqual(result.responseId, 'resp-123');
        assert.strictEqual(result.traceId, 'trace-123');
        assert.strictEqual(result.sessionId, 'sess-123');
        assert.strictEqual(result.limit, 100);
    });
    it('should enforce max limit of 1000', () => {
        // The boundary value itself is expected to be valid ...
        assert.strictEqual(queryEvaluationsSchema.parse({ limit: 1000 }).limit, 1000);
        // ... while anything above it must be rejected.
        assert.throws(() => {
            queryEvaluationsSchema.parse({ limit: 1001 });
        });
    });
});
|
|
44
|
+
/**
 * Score-range handling of queryEvaluations (P1-3).
 *
 * The backend method is replaced with a stub resolving to an empty list for
 * every test, so the only thing under test is how the handler treats the
 * scoreMin / scoreMax combination it receives.
 */
describe('score range validation (P1-3)', () => {
    let savedBackendMethod;
    beforeEach(() => {
        savedBackendMethod = LocalJsonlBackend.prototype.queryEvaluations;
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => []);
    });
    afterEach(() => {
        // Put the real implementation back so other suites are unaffected.
        LocalJsonlBackend.prototype.queryEvaluations = savedBackendMethod;
    });
    it('should reject inverted score range (scoreMin > scoreMax)', async () => {
        const invertedRange = { scoreMin: 0.9, scoreMax: 0.1 };
        const expectedMessage = /Invalid score range.*scoreMin.*cannot exceed.*scoreMax/;
        await assert.rejects(async () => queryEvaluations(invertedRange), expectedMessage);
    });
    // Each of these filter combinations must be accepted; with the stubbed
    // (empty) backend response the handler reports a count of zero.
    const acceptedRanges = [
        ['should accept valid score range (scoreMin < scoreMax)', { scoreMin: 0.3, scoreMax: 0.8 }],
        ['should accept equal scoreMin and scoreMax', { scoreMin: 0.5, scoreMax: 0.5 }],
        ['should accept scoreMin without scoreMax', { scoreMin: 0.5 }],
        ['should accept scoreMax without scoreMin', { scoreMax: 0.8 }],
    ];
    for (const [title, filters] of acceptedRanges) {
        it(title, async () => {
            const outcome = await queryEvaluations(filters);
            assert.strictEqual(outcome.count, 0);
        });
    }
});
|
|
73
|
+
/**
 * Exercises the queryEvaluations handler against a stubbed
 * LocalJsonlBackend.prototype.queryEvaluations.
 *
 * Each test installs its own stub returning a fixed list of evaluation
 * records and then checks the shape of the handler result: `count`,
 * `evaluations`, and the `summary` (per-name counts, per-label counts,
 * average score).
 */
describe('handler with mocked backend', () => {
    let originalQueryEvaluations;
    beforeEach(() => {
        // Remember the real method; individual tests overwrite it with a stub.
        originalQueryEvaluations = LocalJsonlBackend.prototype.queryEvaluations;
    });
    afterEach(() => {
        LocalJsonlBackend.prototype.queryEvaluations = originalQueryEvaluations;
    });
    it('should return empty results when no evaluations found', async () => {
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => []);
        const result = await queryEvaluations({});
        assert.strictEqual(result.count, 0);
        assert.deepStrictEqual(result.evaluations, []);
        assert.strictEqual(result.summary.averageScore, undefined);
    });
    it('should summarize evaluations by name and label', async () => {
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
            {
                timestamp: '2026-01-28T10:00:00Z',
                evaluationName: 'Relevance',
                scoreValue: 0.9,
                scoreLabel: 'pass',
            },
            {
                timestamp: '2026-01-28T10:01:00Z',
                evaluationName: 'Relevance',
                scoreValue: 0.8,
                scoreLabel: 'pass',
            },
            {
                timestamp: '2026-01-28T10:02:00Z',
                evaluationName: 'Faithfulness',
                scoreValue: 0.7,
                scoreLabel: 'fail',
            },
        ]);
        const result = await queryEvaluations({});
        assert.strictEqual(result.count, 3);
        assert.strictEqual(result.summary.byEvaluationName['Relevance'], 2);
        assert.strictEqual(result.summary.byEvaluationName['Faithfulness'], 1);
        assert.strictEqual(result.summary.byScoreLabel['pass'], 2);
        assert.strictEqual(result.summary.byScoreLabel['fail'], 1);
    });
    it('should calculate average score correctly', async () => {
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
            {
                timestamp: '2026-01-28T10:00:00Z',
                evaluationName: 'Relevance',
                scoreValue: 0.8,
            },
            {
                timestamp: '2026-01-28T10:01:00Z',
                evaluationName: 'Relevance',
                scoreValue: 0.9,
            },
            {
                timestamp: '2026-01-28T10:02:00Z',
                evaluationName: 'Relevance',
                scoreValue: 1.0,
            },
        ]);
        const result = await queryEvaluations({});
        const average = result.summary.averageScore;
        // Compare with a tolerance: the mean of 0.8, 0.9 and 1.0 is computed in
        // binary floating point, so exact equality with 0.9 is not guaranteed.
        // An undefined average yields NaN here and fails the check as intended.
        assert.ok(Math.abs(average - 0.9) < 0.0001, `Expected ~0.9, got ${average}`);
    });
    it('should handle evaluations without scores', async () => {
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
            {
                timestamp: '2026-01-28T10:00:00Z',
                evaluationName: 'ToolCorrectness',
                scoreLabel: 'pass',
                // No scoreValue
            },
            {
                timestamp: '2026-01-28T10:01:00Z',
                evaluationName: 'ToolCorrectness',
                scoreLabel: 'fail',
                // No scoreValue
            },
        ]);
        const result = await queryEvaluations({});
        assert.strictEqual(result.count, 2);
        assert.strictEqual(result.summary.averageScore, undefined);
        assert.strictEqual(result.summary.byScoreLabel['pass'], 1);
        assert.strictEqual(result.summary.byScoreLabel['fail'], 1);
    });
    it('should include all fields in evaluation response', async () => {
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
            {
                timestamp: '2026-01-28T10:00:00Z',
                evaluationName: 'Relevance',
                scoreValue: 0.92,
                scoreLabel: 'relevant',
                explanation: 'Response directly addresses the query',
                responseId: 'resp-abc123',
                traceId: 'trace-xyz789',
                sessionId: 'sess-def456',
            },
        ]);
        const result = await queryEvaluations({});
        const evaluation = result.evaluations[0];
        assert.strictEqual(evaluation.timestamp, '2026-01-28T10:00:00Z');
        assert.strictEqual(evaluation.evaluationName, 'Relevance');
        assert.strictEqual(evaluation.scoreValue, 0.92);
        assert.strictEqual(evaluation.scoreLabel, 'relevant');
        assert.strictEqual(evaluation.explanation, 'Response directly addresses the query');
        assert.strictEqual(evaluation.responseId, 'resp-abc123');
        assert.strictEqual(evaluation.traceId, 'trace-xyz789');
        assert.strictEqual(evaluation.sessionId, 'sess-def456');
    });
    it('should limit returned evaluations to 20', async () => {
        // 30 records from the backend: `count` reports all of them while the
        // returned `evaluations` list is expected to hold only 20.
        const manyEvaluations = Array.from({ length: 30 }, (_, i) => ({
            timestamp: `2026-01-28T10:${String(i).padStart(2, '0')}:00Z`,
            evaluationName: 'Relevance',
            scoreValue: 0.8 + (i * 0.01),
        }));
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => manyEvaluations);
        const result = await queryEvaluations({});
        assert.strictEqual(result.count, 30);
        assert.strictEqual(result.evaluations.length, 20);
    });
});
|
|
194
|
+
/**
 * Checks that queryEvaluationsSchema accepts each of the evaluation filters
 * (name, score range, label, and the response / trace / session identifiers)
 * and returns the supplied values unchanged.
 *
 * These tests only parse input through the schema; they never call the
 * backend, so no stubbing or restore hooks are needed here.
 */
describe('OTel GenAI compliance', () => {
    it('should accept evaluationName filter', () => {
        const input = { evaluationName: 'Relevance' };
        const result = queryEvaluationsSchema.parse(input);
        assert.strictEqual(result.evaluationName, 'Relevance');
    });
    it('should accept score range filters', () => {
        const input = { scoreMin: 0.5, scoreMax: 0.9 };
        const result = queryEvaluationsSchema.parse(input);
        assert.strictEqual(result.scoreMin, 0.5);
        assert.strictEqual(result.scoreMax, 0.9);
    });
    it('should accept scoreLabel filter', () => {
        const input = { scoreLabel: 'pass' };
        const result = queryEvaluationsSchema.parse(input);
        assert.strictEqual(result.scoreLabel, 'pass');
    });
    it('should accept responseId filter for correlation', () => {
        const input = { responseId: 'resp-12345' };
        const result = queryEvaluationsSchema.parse(input);
        assert.strictEqual(result.responseId, 'resp-12345');
    });
    it('should accept traceId filter for trace correlation', () => {
        const input = { traceId: 'trace-12345' };
        const result = queryEvaluationsSchema.parse(input);
        assert.strictEqual(result.traceId, 'trace-12345');
    });
    it('should accept sessionId filter for session-scoped queries', () => {
        const input = { sessionId: 'session-12345' };
        const result = queryEvaluationsSchema.parse(input);
        assert.strictEqual(result.sessionId, 'session-12345');
    });
});
|
|
234
|
+
// Use shared tool definition validator
|
|
235
|
+
validateToolDefinition(queryEvaluationsTool, 'obs_query_evaluations', queryEvaluationsSchema, queryEvaluations, ['gen_ai.evaluation.result', 'aggregations']);
|
|
236
|
+
/**
 * Schema checks for the `aggregation` and `groupBy` inputs of
 * queryEvaluationsSchema: valid values are returned as given, unknown
 * values make parsing throw.
 */
describe('aggregation schema', () => {
    it('should accept aggregation parameter', () => {
        const parsed = queryEvaluationsSchema.parse({ aggregation: 'avg' });
        assert.strictEqual(parsed.aggregation, 'avg');
    });
    it('should accept all aggregation types', () => {
        // Every supported aggregation keyword must round-trip through the schema.
        ['avg', 'min', 'max', 'count', 'p50', 'p95', 'p99'].forEach((kind) => {
            const parsed = queryEvaluationsSchema.parse({ aggregation: kind });
            assert.strictEqual(parsed.aggregation, kind);
        });
    });
    it('should reject invalid aggregation type', () => {
        const parseInvalid = () => {
            queryEvaluationsSchema.parse({ aggregation: 'invalid' });
        };
        assert.throws(parseInvalid);
    });
    it('should accept groupBy parameter', () => {
        const parsed = queryEvaluationsSchema.parse({ groupBy: ['evaluationName'] });
        assert.deepStrictEqual(parsed.groupBy, ['evaluationName']);
    });
    it('should accept multiple groupBy fields', () => {
        const parsed = queryEvaluationsSchema.parse({ groupBy: ['evaluationName', 'scoreLabel'] });
        assert.deepStrictEqual(parsed.groupBy, ['evaluationName', 'scoreLabel']);
    });
    it('should reject invalid groupBy field', () => {
        const parseInvalid = () => {
            queryEvaluationsSchema.parse({ groupBy: ['invalidField'] });
        };
        assert.throws(parseInvalid);
    });
});
|
|
270
|
+
describe('aggregation handler', () => {
|
|
271
|
+
let originalQueryEvaluations;
|
|
272
|
+
beforeEach(() => {
|
|
273
|
+
originalQueryEvaluations = LocalJsonlBackend.prototype.queryEvaluations;
|
|
274
|
+
});
|
|
275
|
+
afterEach(() => {
|
|
276
|
+
LocalJsonlBackend.prototype.queryEvaluations = originalQueryEvaluations;
|
|
277
|
+
});
|
|
278
|
+
it('should calculate avg aggregation', async () => {
|
|
279
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
280
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8 },
|
|
281
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.9 },
|
|
282
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 1.0 },
|
|
283
|
+
]);
|
|
284
|
+
const result = await queryEvaluations({ aggregation: 'avg' });
|
|
285
|
+
assert.ok(result.summary.aggregations);
|
|
286
|
+
assert.strictEqual(result.summary.aggregations.length, 1);
|
|
287
|
+
assert.strictEqual(result.summary.aggregations[0].value, 0.9);
|
|
288
|
+
assert.strictEqual(result.summary.aggregations[0].count, 3);
|
|
289
|
+
});
|
|
290
|
+
it('should calculate min aggregation', async () => {
|
|
291
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
292
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8 },
|
|
293
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.5 },
|
|
294
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.9 },
|
|
295
|
+
]);
|
|
296
|
+
const result = await queryEvaluations({ aggregation: 'min' });
|
|
297
|
+
assert.ok(result.summary.aggregations);
|
|
298
|
+
assert.strictEqual(result.summary.aggregations[0].value, 0.5);
|
|
299
|
+
});
|
|
300
|
+
it('should calculate max aggregation', async () => {
|
|
301
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
302
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8 },
|
|
303
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.5 },
|
|
304
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.9 },
|
|
305
|
+
]);
|
|
306
|
+
const result = await queryEvaluations({ aggregation: 'max' });
|
|
307
|
+
assert.ok(result.summary.aggregations);
|
|
308
|
+
assert.strictEqual(result.summary.aggregations[0].value, 0.9);
|
|
309
|
+
});
|
|
310
|
+
it('should calculate count aggregation', async () => {
|
|
311
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
312
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8 },
|
|
313
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.5 },
|
|
314
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Faithfulness', scoreLabel: 'pass' },
|
|
315
|
+
]);
|
|
316
|
+
const result = await queryEvaluations({ aggregation: 'count' });
|
|
317
|
+
assert.ok(result.summary.aggregations);
|
|
318
|
+
assert.strictEqual(result.summary.aggregations[0].count, 3);
|
|
319
|
+
assert.strictEqual(result.summary.aggregations[0].value, 3);
|
|
320
|
+
});
|
|
321
|
+
it('should calculate p50 percentile', async () => {
|
|
322
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
323
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.1 },
|
|
324
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.5 },
|
|
325
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.9 },
|
|
326
|
+
]);
|
|
327
|
+
const result = await queryEvaluations({ aggregation: 'p50' });
|
|
328
|
+
assert.ok(result.summary.aggregations);
|
|
329
|
+
assert.strictEqual(result.summary.aggregations[0].value, 0.5);
|
|
330
|
+
});
|
|
331
|
+
it('should calculate p95 percentile with linear interpolation', async () => {
|
|
332
|
+
const values = Array.from({ length: 100 }, (_, i) => ({
|
|
333
|
+
timestamp: '2026-01-28T10:00:00Z',
|
|
334
|
+
evaluationName: 'Relevance',
|
|
335
|
+
scoreValue: (i + 1) / 100,
|
|
336
|
+
}));
|
|
337
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => values);
|
|
338
|
+
const result = await queryEvaluations({ aggregation: 'p95', limit: 1000 });
|
|
339
|
+
assert.ok(result.summary.aggregations);
|
|
340
|
+
// R-7 linear interpolation: rank = 0.95 * 99 = 94.05
|
|
341
|
+
// Interpolate between index 94 (0.95) and index 95 (0.96)
|
|
342
|
+
const value = result.summary.aggregations[0].value;
|
|
343
|
+
assert.ok(value >= 0.95 && value <= 0.96, `Expected p95 between 0.95-0.96, got ${value}`);
|
|
344
|
+
});
|
|
345
|
+
it('should calculate p99 percentile with linear interpolation', async () => {
|
|
346
|
+
const values = Array.from({ length: 100 }, (_, i) => ({
|
|
347
|
+
timestamp: '2026-01-28T10:00:00Z',
|
|
348
|
+
evaluationName: 'Relevance',
|
|
349
|
+
scoreValue: (i + 1) / 100,
|
|
350
|
+
}));
|
|
351
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => values);
|
|
352
|
+
const result = await queryEvaluations({ aggregation: 'p99', limit: 1000 });
|
|
353
|
+
assert.ok(result.summary.aggregations);
|
|
354
|
+
// R-7 linear interpolation: rank = 0.99 * 99 = 98.01
|
|
355
|
+
// Interpolate between index 98 (0.99) and index 99 (1.00)
|
|
356
|
+
const value = result.summary.aggregations[0].value;
|
|
357
|
+
assert.ok(value >= 0.99 && value <= 1.0, `Expected p99 between 0.99-1.0, got ${value}`);
|
|
358
|
+
});
|
|
359
|
+
it('should group by evaluationName', async () => {
|
|
360
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
361
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8 },
|
|
362
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.9 },
|
|
363
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Faithfulness', scoreValue: 0.7 },
|
|
364
|
+
]);
|
|
365
|
+
const result = await queryEvaluations({ aggregation: 'avg', groupBy: ['evaluationName'] });
|
|
366
|
+
assert.ok(result.summary.aggregations);
|
|
367
|
+
assert.strictEqual(result.summary.aggregations.length, 2);
|
|
368
|
+
const relevance = result.summary.aggregations.find(a => a.evaluationName === 'Relevance');
|
|
369
|
+
const faithfulness = result.summary.aggregations.find(a => a.evaluationName === 'Faithfulness');
|
|
370
|
+
assert.ok(relevance);
|
|
371
|
+
assert.ok(Math.abs((relevance.value ?? 0) - 0.85) < 0.0001, `Expected ~0.85, got ${relevance.value}`);
|
|
372
|
+
assert.strictEqual(relevance.count, 2);
|
|
373
|
+
assert.ok(faithfulness);
|
|
374
|
+
assert.strictEqual(faithfulness.value, 0.7);
|
|
375
|
+
assert.strictEqual(faithfulness.count, 1);
|
|
376
|
+
});
|
|
377
|
+
it('should group by scoreLabel', async () => {
|
|
378
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
379
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.9, scoreLabel: 'pass' },
|
|
380
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.8, scoreLabel: 'pass' },
|
|
381
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.3, scoreLabel: 'fail' },
|
|
382
|
+
]);
|
|
383
|
+
const result = await queryEvaluations({ aggregation: 'avg', groupBy: ['scoreLabel'] });
|
|
384
|
+
assert.ok(result.summary.aggregations);
|
|
385
|
+
assert.strictEqual(result.summary.aggregations.length, 2);
|
|
386
|
+
const pass = result.summary.aggregations.find(a => a.scoreLabel === 'pass');
|
|
387
|
+
const fail = result.summary.aggregations.find(a => a.scoreLabel === 'fail');
|
|
388
|
+
assert.ok(pass);
|
|
389
|
+
assert.ok(Math.abs((pass.value ?? 0) - 0.85) < 0.0001, `Expected ~0.85, got ${pass.value}`);
|
|
390
|
+
assert.ok(fail);
|
|
391
|
+
assert.strictEqual(fail.value, 0.3);
|
|
392
|
+
});
|
|
393
|
+
it('should group by multiple fields', async () => {
|
|
394
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
|
|
395
|
+
{ timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.9, scoreLabel: 'pass' },
|
|
396
|
+
{ timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.3, scoreLabel: 'fail' },
|
|
397
|
+
{ timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Faithfulness', scoreValue: 0.8, scoreLabel: 'pass' },
|
|
398
|
+
]);
|
|
399
|
+
const result = await queryEvaluations({ aggregation: 'count', groupBy: ['evaluationName', 'scoreLabel'] });
|
|
400
|
+
assert.ok(result.summary.aggregations);
|
|
401
|
+
assert.strictEqual(result.summary.aggregations.length, 3);
|
|
402
|
+
});
|
|
403
|
+
it('should handle empty results with aggregation', async () => {
|
|
404
|
+
LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => []);
|
|
405
|
+
const result = await queryEvaluations({ aggregation: 'avg' });
|
|
406
|
+
assert.ok(result.summary.aggregations);
|
|
407
|
+
assert.strictEqual(result.summary.aggregations.length, 0);
|
|
408
|
+
});
|
|
409
|
+
it('should handle evaluations without scores in aggregation', async () => {
    // Both stubbed evaluations carry a scoreLabel but no scoreValue.
    const labelOnlyRows = async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'ToolCorrectness', scoreLabel: 'pass' },
        { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'ToolCorrectness', scoreLabel: 'fail' },
    ];
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(labelOnlyRows);
    const outcome = await queryEvaluations({ aggregation: 'avg' });
    const { aggregations } = outcome.summary;
    assert.ok(aggregations);
    const firstGroup = aggregations[0];
    assert.strictEqual(firstGroup.count, 2);
    assert.strictEqual(firstGroup.value, undefined);
});
|
|
419
|
+
it('should not include aggregations when aggregation not requested', async () => {
    const singleRow = async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8 },
    ];
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(singleRow);
    // No aggregation option is passed; the summary must not expose the field.
    const outcome = await queryEvaluations({});
    assert.strictEqual(outcome.summary.aggregations, undefined);
});
|
|
426
|
+
// P1-2: min/max should return undefined, not Infinity, when no scores
|
|
427
|
+
it('should return undefined for min aggregation when no scores (P1-2)', async () => {
    // Two evaluations, neither of which has a scoreValue.
    const labelOnlyRows = async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'ToolCorrectness', scoreLabel: 'pass' },
        { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'ToolCorrectness', scoreLabel: 'fail' },
    ];
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(labelOnlyRows);
    const outcome = await queryEvaluations({ aggregation: 'min' });
    const { aggregations } = outcome.summary;
    assert.ok(aggregations);
    const firstGroup = aggregations[0];
    assert.strictEqual(firstGroup.count, 2);
    assert.strictEqual(firstGroup.value, undefined);
});
|
|
437
|
+
it('should return undefined for max aggregation when no scores (P1-2)', async () => {
    // A single evaluation without a scoreValue.
    const labelOnlyRow = async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'ToolCorrectness', scoreLabel: 'pass' },
    ];
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(labelOnlyRow);
    const outcome = await queryEvaluations({ aggregation: 'max' });
    const { aggregations } = outcome.summary;
    assert.ok(aggregations);
    assert.strictEqual(aggregations[0].value, undefined);
});
|
|
445
|
+
// P1-1: Linear interpolation percentile tests
|
|
446
|
+
it('should calculate p50 with linear interpolation for even-length array (P1-1)', async () => {
    // Four ascending scores, so the median falls between the two middle values.
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.1 },
        { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.2 },
        { timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.3 },
        { timestamp: '2026-01-28T10:03:00Z', evaluationName: 'Relevance', scoreValue: 0.4 },
    ]);
    const result = await queryEvaluations({ aggregation: 'p50' });
    assert.ok(result.summary.aggregations);
    // R-7 method: rank = 0.5 * (4-1) = 1.5, interpolate between index 1 (0.2) and 2 (0.3)
    // value = 0.2 * 0.5 + 0.3 * 0.5 = 0.25
    const p50 = result.summary.aggregations[0].value;
    // The interpolated value is the result of floating-point arithmetic, so it is
    // compared with a tolerance rather than exact equality; this keeps the test
    // independent of the order in which the interpolation terms are evaluated.
    assert.ok(Math.abs((p50 ?? 0) - 0.25) < 0.0001, `Expected ~0.25, got ${p50}`);
});
|
|
459
|
+
it('should handle single value for percentile (P1-1)', async () => {
    // With exactly one score, the requested percentile must equal that score.
    const singleRow = async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.75 },
    ];
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(singleRow);
    const outcome = await queryEvaluations({ aggregation: 'p95' });
    const { aggregations } = outcome.summary;
    assert.ok(aggregations);
    assert.strictEqual(aggregations[0].value, 0.75);
});
|
|
467
|
+
// P1-3: Empty string scoreLabel normalization tests
|
|
468
|
+
it('should treat empty string scoreLabel as missing (P1-3)', async () => {
    // One empty label, one whitespace-only label, and one real label.
    const mixedLabelRows = async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8, scoreLabel: '' },
        { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.9, scoreLabel: ' ' },
        { timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.7, scoreLabel: 'pass' },
    ];
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(mixedLabelRows);
    const outcome = await queryEvaluations({});
    const { byScoreLabel } = outcome.summary;
    // Only the real label may be counted; blank labels must leave no key behind.
    assert.strictEqual(byScoreLabel['pass'], 1);
    assert.strictEqual(byScoreLabel[''], undefined);
    assert.strictEqual(byScoreLabel[' '], undefined);
});
|
|
480
|
+
it('should group empty scoreLabels together (P1-3)', async () => {
    // One evaluation has an empty-string label, the other has no label field.
    const unlabeledRows = async () => [
        { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8, scoreLabel: '' },
        { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.9 },
    ];
    LocalJsonlBackend.prototype.queryEvaluations = mock.fn(unlabeledRows);
    const outcome = await queryEvaluations({ aggregation: 'count', groupBy: ['scoreLabel'] });
    const { aggregations } = outcome.summary;
    assert.ok(aggregations);
    // Both rows are expected to land in a single "missing label" group.
    assert.strictEqual(aggregations.length, 1);
    assert.strictEqual(aggregations[0].count, 2);
});
|
|
491
|
+
});
|
|
492
|
+
describe('score range validation (P2-2)', () => {
    // Holds the real backend method so it can be restored after each test.
    let savedBackendQuery;
    beforeEach(() => {
        savedBackendQuery = LocalJsonlBackend.prototype.queryEvaluations;
        // Every test in this suite runs against a backend that returns no rows.
        LocalJsonlBackend.prototype.queryEvaluations = mock.fn(async () => []);
    });
    afterEach(() => {
        LocalJsonlBackend.prototype.queryEvaluations = savedBackendQuery;
    });
    // Each entry: [test title, handler arguments, pattern the rejection must match].
    const rejectionCases = [
        ['should reject NaN scoreMin (via Zod)', { scoreMin: NaN }, /received.*nan/i],
        ['should reject NaN scoreMax (via Zod)', { scoreMax: NaN }, /received.*nan/i],
        ['should reject Infinity scoreMin', { scoreMin: Infinity }, /Invalid scoreMin.*finite number/],
        ['should reject -Infinity scoreMax', { scoreMax: -Infinity }, /Invalid scoreMax.*finite number/],
    ];
    for (const [title, args, expectedError] of rejectionCases) {
        it(title, async () => {
            await assert.rejects(async () => queryEvaluations(args), expectedError);
        });
    }
    it('should accept negative score ranges', async () => {
        const outcome = await queryEvaluations({ scoreMin: -1.0, scoreMax: 0.5 });
        assert.strictEqual(outcome.count, 0);
    });
});
|
|
518
|
+
describe('evaluator field (Phase 3)', () => {
    // Schema-level checks: these call queryEvaluationsSchema.parse directly.
    describe('schema validation', () => {
        it('should accept evaluator filter', () => {
            const parsed = queryEvaluationsSchema.parse({ evaluator: 'gpt-4-as-judge' });
            assert.strictEqual(parsed.evaluator, 'gpt-4-as-judge');
        });
        it('should accept evaluatorType filter', () => {
            const parsed = queryEvaluationsSchema.parse({ evaluatorType: 'llm' });
            assert.strictEqual(parsed.evaluatorType, 'llm');
        });
        it('should accept all valid evaluatorType values', () => {
            ['llm', 'human', 'rule', 'classifier'].forEach((evaluatorType) => {
                const parsed = queryEvaluationsSchema.parse({ evaluatorType });
                assert.strictEqual(parsed.evaluatorType, evaluatorType);
            });
        });
        it('should reject invalid evaluatorType', () => {
            assert.throws(() => queryEvaluationsSchema.parse({ evaluatorType: 'invalid' }));
        });
        it('should accept evaluator in groupBy', () => {
            const parsed = queryEvaluationsSchema.parse({ groupBy: ['evaluator'] });
            assert.deepStrictEqual(parsed.groupBy, ['evaluator']);
        });
        it('should accept evaluator with other groupBy fields', () => {
            const parsed = queryEvaluationsSchema.parse({ groupBy: ['evaluationName', 'evaluator'] });
            assert.deepStrictEqual(parsed.groupBy, ['evaluationName', 'evaluator']);
        });
    });
    // Handler-level checks: the backend's queryEvaluations is replaced per test.
    describe('handler with mocked backend', () => {
        // Holds the real backend method so it can be restored after each test.
        let savedBackendQuery;
        // Installs `impl` (wrapped in mock.fn) as the backend's queryEvaluations.
        const stubBackend = (impl) => {
            LocalJsonlBackend.prototype.queryEvaluations = mock.fn(impl);
        };
        beforeEach(() => {
            savedBackendQuery = LocalJsonlBackend.prototype.queryEvaluations;
        });
        afterEach(() => {
            LocalJsonlBackend.prototype.queryEvaluations = savedBackendQuery;
        });
        it('should include evaluator and evaluatorType in response', async () => {
            stubBackend(async () => [
                {
                    timestamp: '2026-01-28T10:00:00Z',
                    evaluationName: 'Relevance',
                    scoreValue: 0.92,
                    evaluator: 'gpt-4-as-judge',
                    evaluatorType: 'llm',
                },
            ]);
            const outcome = await queryEvaluations({});
            const firstEvaluation = outcome.evaluations[0];
            assert.strictEqual(firstEvaluation.evaluator, 'gpt-4-as-judge');
            assert.strictEqual(firstEvaluation.evaluatorType, 'llm');
        });
        it('should summarize evaluations by evaluator', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.9, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.8, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.85, evaluator: 'human-reviewer' },
            ]);
            const outcome = await queryEvaluations({});
            assert.strictEqual(outcome.summary.byEvaluator['gpt-4-as-judge'], 2);
            assert.strictEqual(outcome.summary.byEvaluator['human-reviewer'], 1);
        });
        it('should not count evaluations without evaluator in byEvaluator', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.9, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.8 }, // no evaluator
            ]);
            const outcome = await queryEvaluations({});
            assert.strictEqual(outcome.summary.byEvaluator['gpt-4-as-judge'], 1);
            assert.strictEqual(Object.keys(outcome.summary.byEvaluator).length, 1);
        });
        it('should group by evaluator', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.9, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.8, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.7, evaluator: 'human-reviewer' },
            ]);
            const outcome = await queryEvaluations({ aggregation: 'avg', groupBy: ['evaluator'] });
            assert.ok(outcome.summary.aggregations);
            assert.strictEqual(outcome.summary.aggregations.length, 2);
            const groupFor = (name) => outcome.summary.aggregations.find((group) => group.evaluator === name);
            const gpt4 = groupFor('gpt-4-as-judge');
            const human = groupFor('human-reviewer');
            assert.ok(gpt4);
            // Average of 0.9 and 0.8 is a computed float, hence the tolerance.
            assert.ok(Math.abs((gpt4.value ?? 0) - 0.85) < 0.0001, `Expected ~0.85, got ${gpt4.value}`);
            assert.strictEqual(gpt4.count, 2);
            assert.ok(human);
            assert.strictEqual(human.value, 0.7);
            assert.strictEqual(human.count, 1);
        });
        it('should group by evaluationName and evaluator', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.9, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Faithfulness', scoreValue: 0.8, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.7, evaluator: 'human-reviewer' },
            ]);
            const outcome = await queryEvaluations({ aggregation: 'count', groupBy: ['evaluationName', 'evaluator'] });
            assert.ok(outcome.summary.aggregations);
            assert.strictEqual(outcome.summary.aggregations.length, 3);
        });
        it('should handle evaluations without evaluator in groupBy', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.9, evaluator: 'gpt-4-as-judge' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.8 }, // no evaluator
            ]);
            const outcome = await queryEvaluations({ aggregation: 'count', groupBy: ['evaluator'] });
            assert.ok(outcome.summary.aggregations);
            // Expect one group for the named evaluator and one for the row without it.
            assert.strictEqual(outcome.summary.aggregations.length, 2);
        });
        it('should include all evaluatorType values in response', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', evaluatorType: 'llm' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'ToolUse', evaluatorType: 'rule' },
                { timestamp: '2026-01-28T10:02:00Z', evaluationName: 'UserSatisfaction', evaluatorType: 'human' },
                { timestamp: '2026-01-28T10:03:00Z', evaluationName: 'Intent', evaluatorType: 'classifier' },
            ]);
            const outcome = await queryEvaluations({});
            // The values are sorted before comparison so the assertion does not
            // depend on the order in which the handler returns evaluations.
            const seenTypes = outcome.evaluations.map((entry) => entry.evaluatorType).sort();
            assert.deepStrictEqual(seenTypes, ['classifier', 'human', 'llm', 'rule']);
        });
    });
});
|
|
647
|
+
describe('scoreUnit field (Phase 1)', () => {
    describe('handler with mocked backend', () => {
        // Holds the real backend method so it can be restored after each test.
        let savedBackendQuery;
        // Installs `impl` (wrapped in mock.fn) as the backend's queryEvaluations.
        const stubBackend = (impl) => {
            LocalJsonlBackend.prototype.queryEvaluations = mock.fn(impl);
        };
        beforeEach(() => {
            savedBackendQuery = LocalJsonlBackend.prototype.queryEvaluations;
        });
        afterEach(() => {
            LocalJsonlBackend.prototype.queryEvaluations = savedBackendQuery;
        });
        it('should include scoreUnit in response', async () => {
            stubBackend(async () => [
                {
                    timestamp: '2026-01-28T10:00:00Z',
                    evaluationName: 'Relevance',
                    scoreValue: 85,
                    scoreUnit: 'percentage',
                },
            ]);
            const outcome = await queryEvaluations({});
            assert.strictEqual(outcome.evaluations[0].scoreUnit, 'percentage');
        });
        it('should normalize percentage scores for averageScore', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 80, scoreUnit: 'percentage' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 90, scoreUnit: 'percentage' },
            ]);
            const outcome = await queryEvaluations({});
            // (80/100 + 90/100) / 2 = 0.85
            const { averageScore } = outcome.summary;
            assert.ok(Math.abs((averageScore ?? 0) - 0.85) < 0.0001, `Expected ~0.85, got ${averageScore}`);
        });
        it('should not change ratio_0_1 scores for averageScore', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8, scoreUnit: 'ratio_0_1' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.9, scoreUnit: 'ratio_0_1' },
            ]);
            const outcome = await queryEvaluations({});
            // (0.8 + 0.9) / 2 = 0.85
            const { averageScore } = outcome.summary;
            assert.ok(Math.abs((averageScore ?? 0) - 0.85) < 0.0001, `Expected ~0.85, got ${averageScore}`);
        });
        it('should not change scores without unit for averageScore', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 0.8 },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.9 },
            ]);
            const outcome = await queryEvaluations({});
            // (0.8 + 0.9) / 2 = 0.85
            const { averageScore } = outcome.summary;
            assert.ok(Math.abs((averageScore ?? 0) - 0.85) < 0.0001, `Expected ~0.85, got ${averageScore}`);
        });
        it('should handle mixed units for averageScore', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 80, scoreUnit: 'percentage' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 0.9, scoreUnit: 'ratio_0_1' },
                { timestamp: '2026-01-28T10:02:00Z', evaluationName: 'Relevance', scoreValue: 0.85 }, // no unit, defaults to ratio
            ]);
            const outcome = await queryEvaluations({});
            // (0.8 + 0.9 + 0.85) / 3 = 0.85
            const { averageScore } = outcome.summary;
            assert.ok(Math.abs((averageScore ?? 0) - 0.85) < 0.0001, `Expected ~0.85, got ${averageScore}`);
        });
        it('should handle percent as alias for percentage', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 100, scoreUnit: 'percent' },
            ]);
            const outcome = await queryEvaluations({});
            // 100/100 = 1.0
            assert.strictEqual(outcome.summary.averageScore, 1.0);
        });
        it('should handle case-insensitive unit names', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 50, scoreUnit: 'PERCENTAGE' },
                { timestamp: '2026-01-28T10:01:00Z', evaluationName: 'Relevance', scoreValue: 50, scoreUnit: 'Percentage' },
            ]);
            const outcome = await queryEvaluations({});
            // (50/100 + 50/100) / 2 = 0.5
            assert.strictEqual(outcome.summary.averageScore, 0.5);
        });
        it('should not normalize unknown units', async () => {
            stubBackend(async () => [
                { timestamp: '2026-01-28T10:00:00Z', evaluationName: 'Relevance', scoreValue: 5, scoreUnit: 'stars_1_5' },
            ]);
            const outcome = await queryEvaluations({});
            // The unit is not one of the recognised percentage spellings, so the
            // raw score is expected to pass through unchanged.
            assert.strictEqual(outcome.summary.averageScore, 5);
        });
    });
});
|
|
732
|
+
});
|
|
733
|
+
//# sourceMappingURL=query-evaluations.test.js.map
|