@lov3kaizen/agentsea-evaluate 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/dist/annotation/index.d.mts +3 -0
- package/dist/annotation/index.d.ts +3 -0
- package/dist/annotation/index.js +630 -0
- package/dist/annotation/index.mjs +22 -0
- package/dist/chunk-5JRYKRSE.mjs +2791 -0
- package/dist/chunk-EUXXIZK3.mjs +676 -0
- package/dist/chunk-NBMUSATK.mjs +596 -0
- package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
- package/dist/chunk-TUMNJN2S.mjs +416 -0
- package/dist/continuous/index.d.mts +2 -0
- package/dist/continuous/index.d.ts +2 -0
- package/dist/continuous/index.js +707 -0
- package/dist/continuous/index.mjs +16 -0
- package/dist/datasets/index.d.mts +1 -0
- package/dist/datasets/index.d.ts +1 -0
- package/dist/datasets/index.js +456 -0
- package/dist/datasets/index.mjs +14 -0
- package/dist/evaluation/index.d.mts +1 -0
- package/dist/evaluation/index.d.ts +1 -0
- package/dist/evaluation/index.js +2853 -0
- package/dist/evaluation/index.mjs +78 -0
- package/dist/feedback/index.d.mts +2 -0
- package/dist/feedback/index.d.ts +2 -0
- package/dist/feedback/index.js +1158 -0
- package/dist/feedback/index.mjs +40 -0
- package/dist/index-6Pbiq7ny.d.mts +234 -0
- package/dist/index-6Pbiq7ny.d.ts +234 -0
- package/dist/index-BNTycFEA.d.mts +479 -0
- package/dist/index-BNTycFEA.d.ts +479 -0
- package/dist/index-CTYCfWfH.d.mts +543 -0
- package/dist/index-CTYCfWfH.d.ts +543 -0
- package/dist/index-Cq5LwG_3.d.mts +322 -0
- package/dist/index-Cq5LwG_3.d.ts +322 -0
- package/dist/index-bPghFsfP.d.mts +315 -0
- package/dist/index-bPghFsfP.d.ts +315 -0
- package/dist/index.d.mts +81 -0
- package/dist/index.d.ts +81 -0
- package/dist/index.js +5962 -0
- package/dist/index.mjs +429 -0
- package/package.json +102 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
import { EventEmitter } from 'eventemitter3';
|
|
2
|
+
|
|
3
|
+
/**
 * Options accepted by the `ContinuousEval` constructor and by `createContinuousEval`.
 * `pipeline` and `sampleRate` are required; schedule, storage backend, storage path
 * and retention are optional.
 * NOTE(review): the range of `sampleRate` and the format of `schedule` are not
 * visible in this declaration file - confirm against the implementation.
 */
interface ContinuousEvalConfig {
|
|
4
|
+
pipeline: EvaluationPipelineRef;
|
|
5
|
+
sampleRate: number;
|
|
6
|
+
schedule?: string;
|
|
7
|
+
storage?: 'memory' | 'sqlite' | 'postgres';
|
|
8
|
+
storagePath?: string;
|
|
9
|
+
retentionDays?: number;
|
|
10
|
+
}
|
|
11
|
+
/** Contract for an evaluation pipeline: a single async `evaluate` that maps an `EvalInput` to an `EvalOutput`. */
interface EvaluationPipelineRef {
|
|
12
|
+
evaluate(input: EvalInput): Promise<EvalOutput>;
|
|
13
|
+
}
|
|
14
|
+
/**
 * Argument of `EvaluationPipelineRef.evaluate` and `ContinuousEval.evaluate`.
 * `input` and `output` are required strings; `context` and `metadata` are optional.
 */
interface EvalInput {
|
|
15
|
+
input: string;
|
|
16
|
+
output: string;
|
|
17
|
+
context?: string[];
|
|
18
|
+
metadata?: Record<string, unknown>;
|
|
19
|
+
}
|
|
20
|
+
/**
 * Result of one evaluation: numeric scores keyed by string, a pass flag and a duration.
 * NOTE(review): `durationMs` is presumably milliseconds, going by the name - confirm.
 */
interface EvalOutput {
|
|
21
|
+
scores: Record<string, number>;
|
|
22
|
+
passed: boolean;
|
|
23
|
+
durationMs: number;
|
|
24
|
+
}
|
|
25
|
+
/** Status values used by `ContinuousEvalStats.status`, `DashboardUpdate.status` and the `'status:changed'` event. */
type MonitoringStatus = 'stopped' | 'running' | 'paused' | 'error';
|
|
26
|
+
/**
 * Shape returned by `ContinuousEval.getStats()`.
 * NOTE(review): the time base of `startedAt`/`lastEvalAt` and the scale of `passRate`
 * are not visible here - confirm against the implementation.
 */
interface ContinuousEvalStats {
|
|
27
|
+
status: MonitoringStatus;
|
|
28
|
+
startedAt?: number;
|
|
29
|
+
lastEvalAt?: number;
|
|
30
|
+
totalEvaluations: number;
|
|
31
|
+
passRate: number;
|
|
32
|
+
avgScores: Record<string, number>;
|
|
33
|
+
alertsTriggered: number;
|
|
34
|
+
}
|
|
35
|
+
/** Channel kinds allowed in `AlertChannelConfig.type` and `AlertNotification.channel`. */
type AlertChannelType = 'slack' | 'email' | 'webhook' | 'pagerduty';
|
|
36
|
+
/**
 * Settings for one alert channel; only `type` is required.
 * NOTE(review): the type does not say which optional field belongs to which channel
 * kind - verify against the implementation.
 */
interface AlertChannelConfig {
|
|
37
|
+
type: AlertChannelType;
|
|
38
|
+
webhook?: string;
|
|
39
|
+
to?: string[];
|
|
40
|
+
apiKey?: string;
|
|
41
|
+
channel?: string;
|
|
42
|
+
}
|
|
43
|
+
/**
 * Threshold rule for one metric: the metric name, a numeric threshold and a `direction`
 * of 'above' or 'below'. `window`, `minSamples` and `severity` are optional.
 * NOTE(review): the unit of `window` is not visible here - confirm.
 */
interface AlertRule {
|
|
44
|
+
metric: string;
|
|
45
|
+
threshold: number;
|
|
46
|
+
direction: 'above' | 'below';
|
|
47
|
+
window?: number;
|
|
48
|
+
minSamples?: number;
|
|
49
|
+
severity?: 'info' | 'warning' | 'critical';
|
|
50
|
+
}
|
|
51
|
+
/**
 * Argument of the `AlertManager` constructor and of `createAlertManager`.
 * `channels` is required; `rules` (keyed by string), `cooldownMs` and `groupingWindow` are optional.
 */
interface AlertManagerConfig {
|
|
52
|
+
channels: AlertChannelConfig[];
|
|
53
|
+
rules?: Record<string, AlertRule>;
|
|
54
|
+
cooldownMs?: number;
|
|
55
|
+
groupingWindow?: number;
|
|
56
|
+
}
|
|
57
|
+
/**
 * Alert record returned by `AlertManager.check` and `AlertManager.getActiveAlerts`, and
 * passed to `'alert:triggered'` / `'alert:resolved'` listeners.
 * NOTE(review): `triggeredAt`/`resolvedAt` are presumably timestamps - confirm the time base.
 */
interface Alert {
|
|
58
|
+
id: string;
|
|
59
|
+
rule: AlertRule;
|
|
60
|
+
metric: string;
|
|
61
|
+
currentValue: number;
|
|
62
|
+
threshold: number;
|
|
63
|
+
severity: 'info' | 'warning' | 'critical';
|
|
64
|
+
message: string;
|
|
65
|
+
triggeredAt: number;
|
|
66
|
+
resolvedAt?: number;
|
|
67
|
+
acknowledged?: boolean;
|
|
68
|
+
metadata?: Record<string, unknown>;
|
|
69
|
+
}
|
|
70
|
+
/** Payload of the `'notification:sent'` event: the alert id, the channel kind, a `sentAt` number, a success flag and an optional error string. */
interface AlertNotification {
|
|
71
|
+
alertId: string;
|
|
72
|
+
channel: AlertChannelType;
|
|
73
|
+
sentAt: number;
|
|
74
|
+
success: boolean;
|
|
75
|
+
error?: string;
|
|
76
|
+
}
|
|
77
|
+
/**
 * Configuration shape for regression detection: a baseline, a sensitivity level and a
 * window size, with optional `minSamples` and `pValueThreshold`.
 * NOTE(review): nothing else declared in this file consumes this type.
 */
interface RegressionDetectorConfig {
|
|
78
|
+
baseline: BaselineMetrics;
|
|
79
|
+
sensitivity: 'low' | 'medium' | 'high';
|
|
80
|
+
windowSize: number;
|
|
81
|
+
minSamples?: number;
|
|
82
|
+
pValueThreshold?: number;
|
|
83
|
+
}
|
|
84
|
+
/** Baseline data used by `RegressionDetectorConfig.baseline`: one `MetricBaseline` per string key, plus a sample count and a timestamp. */
interface BaselineMetrics {
|
|
85
|
+
metrics: Record<string, MetricBaseline>;
|
|
86
|
+
sampleCount: number;
|
|
87
|
+
timestamp: number;
|
|
88
|
+
}
|
|
89
|
+
/**
 * Summary statistics stored per metric in `BaselineMetrics.metrics`.
 * NOTE(review): `p50`/`p90`/`p95` are presumably percentiles - confirm.
 */
interface MetricBaseline {
|
|
90
|
+
mean: number;
|
|
91
|
+
std: number;
|
|
92
|
+
min: number;
|
|
93
|
+
max: number;
|
|
94
|
+
p50: number;
|
|
95
|
+
p90: number;
|
|
96
|
+
p95: number;
|
|
97
|
+
}
|
|
98
|
+
/** Regression report: a `detected` flag, lists of regressions and improvements, and an `unchanged` string list. Also the type of `DashboardUpdate.regressions`. */
interface RegressionResult {
|
|
99
|
+
detected: boolean;
|
|
100
|
+
regressions: MetricRegression[];
|
|
101
|
+
improvements: MetricImprovement[];
|
|
102
|
+
unchanged: string[];
|
|
103
|
+
}
|
|
104
|
+
/** One entry of `RegressionResult.regressions`: baseline and current value for a metric, the percent change, a p-value and a severity level. */
interface MetricRegression {
|
|
105
|
+
metric: string;
|
|
106
|
+
baselineValue: number;
|
|
107
|
+
currentValue: number;
|
|
108
|
+
changePercent: number;
|
|
109
|
+
pValue: number;
|
|
110
|
+
severity: 'minor' | 'moderate' | 'severe';
|
|
111
|
+
}
|
|
112
|
+
/** One entry of `RegressionResult.improvements`; same fields as `MetricRegression` except that there is no `severity`. */
interface MetricImprovement {
|
|
113
|
+
metric: string;
|
|
114
|
+
baselineValue: number;
|
|
115
|
+
currentValue: number;
|
|
116
|
+
changePercent: number;
|
|
117
|
+
pValue: number;
|
|
118
|
+
}
|
|
119
|
+
/**
 * Argument of the `ABTestRunner` constructor, `createABTestRunner` and `ContinuousEval.createABTest`.
 * NOTE(review): the scale of `trafficSplit` and the unit of `maxDuration` are not
 * visible in this declaration file - confirm against the implementation.
 */
interface ABTestConfig {
|
|
120
|
+
name: string;
|
|
121
|
+
description?: string;
|
|
122
|
+
variants: ABTestVariants;
|
|
123
|
+
trafficSplit: number;
|
|
124
|
+
metrics: string[];
|
|
125
|
+
minSamples: number;
|
|
126
|
+
significanceLevel?: number;
|
|
127
|
+
maxDuration?: number;
|
|
128
|
+
}
|
|
129
|
+
/** The two arms of an A/B test: `control` and `treatment`, each a `VariantConfig`. */
interface ABTestVariants {
|
|
130
|
+
control: VariantConfig;
|
|
131
|
+
treatment: VariantConfig;
|
|
132
|
+
}
|
|
133
|
+
/** Description of one variant; every field is optional. */
interface VariantConfig {
|
|
134
|
+
name?: string;
|
|
135
|
+
model?: string;
|
|
136
|
+
prompt?: string;
|
|
137
|
+
parameters?: Record<string, unknown>;
|
|
138
|
+
}
|
|
139
|
+
/** Lifecycle states used by `ABTest.status` and returned by `ABTestRunner.getStatus()`. */
type ABTestStatus = 'draft' | 'running' | 'paused' | 'completed' | 'cancelled';
|
|
140
|
+
/** Plain-data view of an A/B test: id, name, config, status and optional timestamps and results. Element type of `DashboardUpdate.activeTests`. */
interface ABTest {
|
|
141
|
+
id: string;
|
|
142
|
+
name: string;
|
|
143
|
+
config: ABTestConfig;
|
|
144
|
+
status: ABTestStatus;
|
|
145
|
+
startedAt?: number;
|
|
146
|
+
completedAt?: number;
|
|
147
|
+
results?: ABTestResults;
|
|
148
|
+
}
|
|
149
|
+
/** Aggregate outcome returned by `ABTestRunner.stop()` and `ABTestRunner.getResults()`, and passed to `'test:completed'` listeners. */
interface ABTestResults {
|
|
150
|
+
controlSamples: number;
|
|
151
|
+
treatmentSamples: number;
|
|
152
|
+
metrics: Record<string, ABMetricResult>;
|
|
153
|
+
winner: 'control' | 'treatment' | 'none';
|
|
154
|
+
isSignificant: boolean;
|
|
155
|
+
confidence: number;
|
|
156
|
+
recommendation: string;
|
|
157
|
+
}
|
|
158
|
+
/** Per-metric comparison stored in `ABTestResults.metrics`: a `MetricSummary` for each arm, the difference, a p-value, a significance flag and the winner. */
interface ABMetricResult {
|
|
159
|
+
control: MetricSummary;
|
|
160
|
+
treatment: MetricSummary;
|
|
161
|
+
difference: number;
|
|
162
|
+
differencePercent: number;
|
|
163
|
+
pValue: number;
|
|
164
|
+
isSignificant: boolean;
|
|
165
|
+
winner: 'control' | 'treatment' | 'none';
|
|
166
|
+
}
|
|
167
|
+
/**
 * Summary of one arm of one metric in `ABMetricResult`: mean, `std` (presumably standard
 * deviation - confirm), sample count and a two-number confidence interval.
 */
interface MetricSummary {
|
|
168
|
+
mean: number;
|
|
169
|
+
std: number;
|
|
170
|
+
sampleCount: number;
|
|
171
|
+
confidenceInterval: [number, number];
|
|
172
|
+
}
|
|
173
|
+
/** Payload of the `'sample:assigned'` event: the chosen variant, the test id and an `assignedAt` number. */
interface SampleAssignment {
|
|
174
|
+
variant: 'control' | 'treatment';
|
|
175
|
+
testId: string;
|
|
176
|
+
assignedAt: number;
|
|
177
|
+
}
|
|
178
|
+
/** One element of `TimeSeries.points`: a numeric timestamp and value, with optional metadata. */
interface TimeSeriesPoint {
|
|
179
|
+
timestamp: number;
|
|
180
|
+
value: number;
|
|
181
|
+
metadata?: Record<string, unknown>;
|
|
182
|
+
}
|
|
183
|
+
/** A metric name with its `TimeSeriesPoint` list, plus an optional aggregation kind and numeric interval. */
interface TimeSeries {
|
|
184
|
+
metric: string;
|
|
185
|
+
points: TimeSeriesPoint[];
|
|
186
|
+
aggregation?: 'sum' | 'avg' | 'min' | 'max' | 'count';
|
|
187
|
+
interval?: number;
|
|
188
|
+
}
|
|
189
|
+
/** Dashboard snapshot: timestamp, monitoring status, metric values, recent alerts, active A/B tests and an optional regression report. */
interface DashboardUpdate {
|
|
190
|
+
timestamp: number;
|
|
191
|
+
status: MonitoringStatus;
|
|
192
|
+
metrics: Record<string, number>;
|
|
193
|
+
recentAlerts: Alert[];
|
|
194
|
+
activeTests: ABTest[];
|
|
195
|
+
regressions?: RegressionResult;
|
|
196
|
+
}
|
|
197
|
+
/** Query options naming one metric and a `startTime`/`endTime` pair, with optional `interval` and `aggregation`. */
interface HistoricalQueryOptions {
|
|
198
|
+
metric: string;
|
|
199
|
+
startTime: number;
|
|
200
|
+
endTime: number;
|
|
201
|
+
interval?: 'minute' | 'hour' | 'day' | 'week';
|
|
202
|
+
aggregation?: 'avg' | 'min' | 'max' | 'sum' | 'count';
|
|
203
|
+
}
|
|
204
|
+
/**
 * Scheduling options; every field is optional.
 * NOTE(review): whether `cron` and `interval` are mutually exclusive is not expressed in the type.
 */
interface ScheduleConfig {
|
|
205
|
+
cron?: string;
|
|
206
|
+
interval?: number;
|
|
207
|
+
timezone?: string;
|
|
208
|
+
immediate?: boolean;
|
|
209
|
+
}
|
|
210
|
+
/**
 * Event names usable in `ContinuousEvalEvent.type`.
 * NOTE(review): this union differs from the keys of `ContinuousEvalEvents` - it has
 * alert/regression/test names and lacks 'status:changed' - confirm that is intended.
 */
type ContinuousEvalEventType = 'eval:started' | 'eval:completed' | 'eval:error' | 'alert:triggered' | 'alert:resolved' | 'regression:detected' | 'test:started' | 'test:completed' | 'test:significant';
|
|
211
|
+
/** Generic event envelope: an event type, a free-form data record and a numeric timestamp. */
interface ContinuousEvalEvent {
|
|
212
|
+
type: ContinuousEvalEventType;
|
|
213
|
+
data: Record<string, unknown>;
|
|
214
|
+
timestamp: number;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
/** Listener signatures for the events typed on `AlertManager`. */
interface AlertManagerEvents {
|
|
218
|
+
'alert:triggered': (alert: Alert) => void;
|
|
219
|
+
'alert:resolved': (alert: Alert) => void;
|
|
220
|
+
'notification:sent': (notification: AlertNotification) => void;
|
|
221
|
+
}
|
|
222
|
+
/**
 * Alert manager declaration; extends eventemitter3's `EventEmitter` typed with `AlertManagerEvents`.
 * Public surface: rule management (`addRule`, `removeRule`, `getRules`), `check` (returns an
 * `Alert` or `null`), `resolveAlert`, `acknowledgeAlert`, and accessors for active alerts
 * and the alert count. Private members are declared here without types.
 */
declare class AlertManager extends EventEmitter<AlertManagerEvents> {
|
|
223
|
+
private channels;
|
|
224
|
+
private rules;
|
|
225
|
+
private activeAlerts;
|
|
226
|
+
private cooldownMs;
|
|
227
|
+
private lastAlertTime;
|
|
228
|
+
private alertCount;
|
|
229
|
+
constructor(config: AlertManagerConfig);
|
|
230
|
+
addRule(rule: AlertRule): void;
|
|
231
|
+
removeRule(metric: string): boolean;
|
|
232
|
+
check(metric: string, value: number): Alert | null;
|
|
233
|
+
private triggerAlert;
|
|
234
|
+
resolveAlert(metric: string): void;
|
|
235
|
+
acknowledgeAlert(alertId: string): boolean;
|
|
236
|
+
private sendNotifications;
|
|
237
|
+
private sendToChannel;
|
|
238
|
+
private formatMessage;
|
|
239
|
+
getActiveAlerts(): Alert[];
|
|
240
|
+
getAlertCount(): number;
|
|
241
|
+
getRules(): AlertRule[];
|
|
242
|
+
}
|
|
243
|
+
/** Factory that takes an `AlertManagerConfig` and returns an `AlertManager`. */
declare function createAlertManager(config: AlertManagerConfig): AlertManager;
|
|
244
|
+
|
|
245
|
+
/** Listener signatures for the events typed on `ABTestRunner`. */
interface ABTestEvents {
|
|
246
|
+
'test:started': () => void;
|
|
247
|
+
'test:completed': (results: ABTestResults) => void;
|
|
248
|
+
'test:significant': (metric: string, winner: 'control' | 'treatment') => void;
|
|
249
|
+
'sample:assigned': (assignment: SampleAssignment) => void;
|
|
250
|
+
}
|
|
251
|
+
/**
 * A/B test runner declaration; extends eventemitter3's `EventEmitter` typed with `ABTestEvents`.
 * `start()` returns a promise, whereas `stop()`, `pause()` and `resume()` are synchronous;
 * `stop()` and `getResults()` both return `ABTestResults`.
 */
declare class ABTestRunner extends EventEmitter<ABTestEvents> {
|
|
252
|
+
readonly id: string;
|
|
253
|
+
readonly name: string;
|
|
254
|
+
private config;
|
|
255
|
+
private status;
|
|
256
|
+
private startedAt?;
|
|
257
|
+
private completedAt?;
|
|
258
|
+
private controlSamples;
|
|
259
|
+
private treatmentSamples;
|
|
260
|
+
private sampleCount;
|
|
261
|
+
constructor(config: ABTestConfig);
|
|
262
|
+
start(): Promise<void>;
|
|
263
|
+
stop(): ABTestResults;
|
|
264
|
+
pause(): void;
|
|
265
|
+
resume(): void;
|
|
266
|
+
assignVariant(): 'control' | 'treatment';
|
|
267
|
+
recordSample(variant: 'control' | 'treatment', sampleId: string, scores: Record<string, number>): void;
|
|
268
|
+
getResults(): ABTestResults;
|
|
269
|
+
private calculateMetricResult;
|
|
270
|
+
private getScoresForMetric;
|
|
271
|
+
private calculateSummary;
|
|
272
|
+
private calculatePValue;
|
|
273
|
+
private normalCDF;
|
|
274
|
+
private checkSignificance;
|
|
275
|
+
private generateRecommendation;
|
|
276
|
+
getStatus(): ABTestStatus;
|
|
277
|
+
getConfig(): ABTestConfig;
|
|
278
|
+
getTimestamps(): {
|
|
279
|
+
startedAt?: number;
|
|
280
|
+
completedAt?: number;
|
|
281
|
+
};
|
|
282
|
+
}
|
|
283
|
+
/** Factory that takes an `ABTestConfig` and returns an `ABTestRunner`. */
declare function createABTestRunner(config: ABTestConfig): ABTestRunner;
|
|
284
|
+
|
|
285
|
+
/** Listener signatures for the events typed on `ContinuousEval`. */
interface ContinuousEvalEvents {
|
|
286
|
+
'eval:started': () => void;
|
|
287
|
+
'eval:completed': (result: EvalOutput) => void;
|
|
288
|
+
'eval:error': (error: Error) => void;
|
|
289
|
+
'status:changed': (status: MonitoringStatus) => void;
|
|
290
|
+
}
|
|
291
|
+
/**
 * Continuous-evaluation monitor declaration; extends eventemitter3's `EventEmitter` typed with
 * `ContinuousEvalEvents`. `evaluate` resolves to an `EvalOutput` or `null`.
 * NOTE(review): the `null` case presumably means the input was not evaluated (e.g. skipped
 * by sampling or while not running) - confirm against the implementation.
 */
declare class ContinuousEval extends EventEmitter<ContinuousEvalEvents> {
|
|
292
|
+
private pipeline;
|
|
293
|
+
private sampleRate;
|
|
294
|
+
private status;
|
|
295
|
+
private startedAt?;
|
|
296
|
+
private lastEvalAt?;
|
|
297
|
+
private totalEvaluations;
|
|
298
|
+
private passedCount;
|
|
299
|
+
private scoreHistory;
|
|
300
|
+
private alertManager?;
|
|
301
|
+
private abTests;
|
|
302
|
+
private intervalId?;
|
|
303
|
+
constructor(config: ContinuousEvalConfig);
|
|
304
|
+
setAlerts(alertManager: AlertManager, rules: Record<string, {
|
|
305
|
+
threshold: number;
|
|
306
|
+
direction: 'above' | 'below';
|
|
307
|
+
}>): void;
|
|
308
|
+
start(): void;
|
|
309
|
+
stop(): void;
|
|
310
|
+
pause(): void;
|
|
311
|
+
resume(): void;
|
|
312
|
+
evaluate(input: EvalInput): Promise<EvalOutput | null>;
|
|
313
|
+
getStats(): ContinuousEvalStats;
|
|
314
|
+
createABTest(config: ABTestConfig): ABTestRunner;
|
|
315
|
+
getABTest(id: string): ABTestRunner | undefined;
|
|
316
|
+
getABTests(): ABTestRunner[];
|
|
317
|
+
getScoreHistory(metric: string): number[];
|
|
318
|
+
reset(): void;
|
|
319
|
+
}
|
|
320
|
+
/** Factory that takes a `ContinuousEvalConfig` and returns a `ContinuousEval`. */
declare function createContinuousEval(config: ContinuousEvalConfig): ContinuousEval;
|
|
321
|
+
|
|
322
|
+
export { type AlertChannelType as A, type BaselineMetrics as B, type ContinuousEvalConfig as C, type DashboardUpdate as D, type EvaluationPipelineRef as E, createAlertManager as F, ABTestRunner as G, type HistoricalQueryOptions as H, createABTestRunner as I, type MonitoringStatus as M, type RegressionDetectorConfig as R, type SampleAssignment as S, type TimeSeriesPoint as T, type VariantConfig as V, type EvalInput as a, type EvalOutput as b, type ContinuousEvalStats as c, type AlertChannelConfig as d, type AlertRule as e, type AlertManagerConfig as f, type Alert as g, type AlertNotification as h, type MetricBaseline as i, type RegressionResult as j, type MetricRegression as k, type MetricImprovement as l, type ABTestConfig as m, type ABTestVariants as n, type ABTestStatus as o, type ABTest as p, type ABTestResults as q, type ABMetricResult as r, type MetricSummary as s, type TimeSeries as t, type ScheduleConfig as u, type ContinuousEvalEventType as v, type ContinuousEvalEvent as w, ContinuousEval as x, createContinuousEval as y, AlertManager as z };
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
import { EventEmitter } from 'eventemitter3';
|
|
2
|
+
|
|
3
|
+
/**
 * Options accepted by the `ContinuousEval` constructor and by `createContinuousEval`.
 * `pipeline` and `sampleRate` are required; schedule, storage backend, storage path
 * and retention are optional.
 * NOTE(review): the range of `sampleRate` and the format of `schedule` are not
 * visible in this declaration file - confirm against the implementation.
 */
interface ContinuousEvalConfig {
|
|
4
|
+
pipeline: EvaluationPipelineRef;
|
|
5
|
+
sampleRate: number;
|
|
6
|
+
schedule?: string;
|
|
7
|
+
storage?: 'memory' | 'sqlite' | 'postgres';
|
|
8
|
+
storagePath?: string;
|
|
9
|
+
retentionDays?: number;
|
|
10
|
+
}
|
|
11
|
+
/** Contract for an evaluation pipeline: a single async `evaluate` that maps an `EvalInput` to an `EvalOutput`. */
interface EvaluationPipelineRef {
|
|
12
|
+
evaluate(input: EvalInput): Promise<EvalOutput>;
|
|
13
|
+
}
|
|
14
|
+
/**
 * Argument of `EvaluationPipelineRef.evaluate` and `ContinuousEval.evaluate`.
 * `input` and `output` are required strings; `context` and `metadata` are optional.
 */
interface EvalInput {
|
|
15
|
+
input: string;
|
|
16
|
+
output: string;
|
|
17
|
+
context?: string[];
|
|
18
|
+
metadata?: Record<string, unknown>;
|
|
19
|
+
}
|
|
20
|
+
/**
 * Result of one evaluation: numeric scores keyed by string, a pass flag and a duration.
 * NOTE(review): `durationMs` is presumably milliseconds, going by the name - confirm.
 */
interface EvalOutput {
|
|
21
|
+
scores: Record<string, number>;
|
|
22
|
+
passed: boolean;
|
|
23
|
+
durationMs: number;
|
|
24
|
+
}
|
|
25
|
+
/** Status values used by `ContinuousEvalStats.status`, `DashboardUpdate.status` and the `'status:changed'` event. */
type MonitoringStatus = 'stopped' | 'running' | 'paused' | 'error';
|
|
26
|
+
/**
 * Shape returned by `ContinuousEval.getStats()`.
 * NOTE(review): the time base of `startedAt`/`lastEvalAt` and the scale of `passRate`
 * are not visible here - confirm against the implementation.
 */
interface ContinuousEvalStats {
|
|
27
|
+
status: MonitoringStatus;
|
|
28
|
+
startedAt?: number;
|
|
29
|
+
lastEvalAt?: number;
|
|
30
|
+
totalEvaluations: number;
|
|
31
|
+
passRate: number;
|
|
32
|
+
avgScores: Record<string, number>;
|
|
33
|
+
alertsTriggered: number;
|
|
34
|
+
}
|
|
35
|
+
/** Channel kinds allowed in `AlertChannelConfig.type` and `AlertNotification.channel`. */
type AlertChannelType = 'slack' | 'email' | 'webhook' | 'pagerduty';
|
|
36
|
+
/**
 * Settings for one alert channel; only `type` is required.
 * NOTE(review): the type does not say which optional field belongs to which channel
 * kind - verify against the implementation.
 */
interface AlertChannelConfig {
|
|
37
|
+
type: AlertChannelType;
|
|
38
|
+
webhook?: string;
|
|
39
|
+
to?: string[];
|
|
40
|
+
apiKey?: string;
|
|
41
|
+
channel?: string;
|
|
42
|
+
}
|
|
43
|
+
/**
 * Threshold rule for one metric: the metric name, a numeric threshold and a `direction`
 * of 'above' or 'below'. `window`, `minSamples` and `severity` are optional.
 * NOTE(review): the unit of `window` is not visible here - confirm.
 */
interface AlertRule {
|
|
44
|
+
metric: string;
|
|
45
|
+
threshold: number;
|
|
46
|
+
direction: 'above' | 'below';
|
|
47
|
+
window?: number;
|
|
48
|
+
minSamples?: number;
|
|
49
|
+
severity?: 'info' | 'warning' | 'critical';
|
|
50
|
+
}
|
|
51
|
+
/**
 * Argument of the `AlertManager` constructor and of `createAlertManager`.
 * `channels` is required; `rules` (keyed by string), `cooldownMs` and `groupingWindow` are optional.
 */
interface AlertManagerConfig {
|
|
52
|
+
channels: AlertChannelConfig[];
|
|
53
|
+
rules?: Record<string, AlertRule>;
|
|
54
|
+
cooldownMs?: number;
|
|
55
|
+
groupingWindow?: number;
|
|
56
|
+
}
|
|
57
|
+
/**
 * Alert record returned by `AlertManager.check` and `AlertManager.getActiveAlerts`, and
 * passed to `'alert:triggered'` / `'alert:resolved'` listeners.
 * NOTE(review): `triggeredAt`/`resolvedAt` are presumably timestamps - confirm the time base.
 */
interface Alert {
|
|
58
|
+
id: string;
|
|
59
|
+
rule: AlertRule;
|
|
60
|
+
metric: string;
|
|
61
|
+
currentValue: number;
|
|
62
|
+
threshold: number;
|
|
63
|
+
severity: 'info' | 'warning' | 'critical';
|
|
64
|
+
message: string;
|
|
65
|
+
triggeredAt: number;
|
|
66
|
+
resolvedAt?: number;
|
|
67
|
+
acknowledged?: boolean;
|
|
68
|
+
metadata?: Record<string, unknown>;
|
|
69
|
+
}
|
|
70
|
+
/** Payload of the `'notification:sent'` event: the alert id, the channel kind, a `sentAt` number, a success flag and an optional error string. */
interface AlertNotification {
|
|
71
|
+
alertId: string;
|
|
72
|
+
channel: AlertChannelType;
|
|
73
|
+
sentAt: number;
|
|
74
|
+
success: boolean;
|
|
75
|
+
error?: string;
|
|
76
|
+
}
|
|
77
|
+
/**
 * Configuration shape for regression detection: a baseline, a sensitivity level and a
 * window size, with optional `minSamples` and `pValueThreshold`.
 * NOTE(review): nothing else declared in this file consumes this type.
 */
interface RegressionDetectorConfig {
|
|
78
|
+
baseline: BaselineMetrics;
|
|
79
|
+
sensitivity: 'low' | 'medium' | 'high';
|
|
80
|
+
windowSize: number;
|
|
81
|
+
minSamples?: number;
|
|
82
|
+
pValueThreshold?: number;
|
|
83
|
+
}
|
|
84
|
+
/** Baseline data used by `RegressionDetectorConfig.baseline`: one `MetricBaseline` per string key, plus a sample count and a timestamp. */
interface BaselineMetrics {
|
|
85
|
+
metrics: Record<string, MetricBaseline>;
|
|
86
|
+
sampleCount: number;
|
|
87
|
+
timestamp: number;
|
|
88
|
+
}
|
|
89
|
+
/**
 * Summary statistics stored per metric in `BaselineMetrics.metrics`.
 * NOTE(review): `p50`/`p90`/`p95` are presumably percentiles - confirm.
 */
interface MetricBaseline {
|
|
90
|
+
mean: number;
|
|
91
|
+
std: number;
|
|
92
|
+
min: number;
|
|
93
|
+
max: number;
|
|
94
|
+
p50: number;
|
|
95
|
+
p90: number;
|
|
96
|
+
p95: number;
|
|
97
|
+
}
|
|
98
|
+
/** Regression report: a `detected` flag, lists of regressions and improvements, and an `unchanged` string list. Also the type of `DashboardUpdate.regressions`. */
interface RegressionResult {
|
|
99
|
+
detected: boolean;
|
|
100
|
+
regressions: MetricRegression[];
|
|
101
|
+
improvements: MetricImprovement[];
|
|
102
|
+
unchanged: string[];
|
|
103
|
+
}
|
|
104
|
+
/** One entry of `RegressionResult.regressions`: baseline and current value for a metric, the percent change, a p-value and a severity level. */
interface MetricRegression {
|
|
105
|
+
metric: string;
|
|
106
|
+
baselineValue: number;
|
|
107
|
+
currentValue: number;
|
|
108
|
+
changePercent: number;
|
|
109
|
+
pValue: number;
|
|
110
|
+
severity: 'minor' | 'moderate' | 'severe';
|
|
111
|
+
}
|
|
112
|
+
/** One entry of `RegressionResult.improvements`; same fields as `MetricRegression` except that there is no `severity`. */
interface MetricImprovement {
|
|
113
|
+
metric: string;
|
|
114
|
+
baselineValue: number;
|
|
115
|
+
currentValue: number;
|
|
116
|
+
changePercent: number;
|
|
117
|
+
pValue: number;
|
|
118
|
+
}
|
|
119
|
+
/**
 * Argument of the `ABTestRunner` constructor, `createABTestRunner` and `ContinuousEval.createABTest`.
 * NOTE(review): the scale of `trafficSplit` and the unit of `maxDuration` are not
 * visible in this declaration file - confirm against the implementation.
 */
interface ABTestConfig {
|
|
120
|
+
name: string;
|
|
121
|
+
description?: string;
|
|
122
|
+
variants: ABTestVariants;
|
|
123
|
+
trafficSplit: number;
|
|
124
|
+
metrics: string[];
|
|
125
|
+
minSamples: number;
|
|
126
|
+
significanceLevel?: number;
|
|
127
|
+
maxDuration?: number;
|
|
128
|
+
}
|
|
129
|
+
/** The two arms of an A/B test: `control` and `treatment`, each a `VariantConfig`. */
interface ABTestVariants {
|
|
130
|
+
control: VariantConfig;
|
|
131
|
+
treatment: VariantConfig;
|
|
132
|
+
}
|
|
133
|
+
/** Description of one variant; every field is optional. */
interface VariantConfig {
|
|
134
|
+
name?: string;
|
|
135
|
+
model?: string;
|
|
136
|
+
prompt?: string;
|
|
137
|
+
parameters?: Record<string, unknown>;
|
|
138
|
+
}
|
|
139
|
+
/** Lifecycle states used by `ABTest.status` and returned by `ABTestRunner.getStatus()`. */
type ABTestStatus = 'draft' | 'running' | 'paused' | 'completed' | 'cancelled';
|
|
140
|
+
/** Plain-data view of an A/B test: id, name, config, status and optional timestamps and results. Element type of `DashboardUpdate.activeTests`. */
interface ABTest {
|
|
141
|
+
id: string;
|
|
142
|
+
name: string;
|
|
143
|
+
config: ABTestConfig;
|
|
144
|
+
status: ABTestStatus;
|
|
145
|
+
startedAt?: number;
|
|
146
|
+
completedAt?: number;
|
|
147
|
+
results?: ABTestResults;
|
|
148
|
+
}
|
|
149
|
+
/** Aggregate outcome returned by `ABTestRunner.stop()` and `ABTestRunner.getResults()`, and passed to `'test:completed'` listeners. */
interface ABTestResults {
|
|
150
|
+
controlSamples: number;
|
|
151
|
+
treatmentSamples: number;
|
|
152
|
+
metrics: Record<string, ABMetricResult>;
|
|
153
|
+
winner: 'control' | 'treatment' | 'none';
|
|
154
|
+
isSignificant: boolean;
|
|
155
|
+
confidence: number;
|
|
156
|
+
recommendation: string;
|
|
157
|
+
}
|
|
158
|
+
/** Per-metric comparison stored in `ABTestResults.metrics`: a `MetricSummary` for each arm, the difference, a p-value, a significance flag and the winner. */
interface ABMetricResult {
|
|
159
|
+
control: MetricSummary;
|
|
160
|
+
treatment: MetricSummary;
|
|
161
|
+
difference: number;
|
|
162
|
+
differencePercent: number;
|
|
163
|
+
pValue: number;
|
|
164
|
+
isSignificant: boolean;
|
|
165
|
+
winner: 'control' | 'treatment' | 'none';
|
|
166
|
+
}
|
|
167
|
+
/**
 * Summary of one arm of one metric in `ABMetricResult`: mean, `std` (presumably standard
 * deviation - confirm), sample count and a two-number confidence interval.
 */
interface MetricSummary {
|
|
168
|
+
mean: number;
|
|
169
|
+
std: number;
|
|
170
|
+
sampleCount: number;
|
|
171
|
+
confidenceInterval: [number, number];
|
|
172
|
+
}
|
|
173
|
+
/** Payload of the `'sample:assigned'` event: the chosen variant, the test id and an `assignedAt` number. */
interface SampleAssignment {
|
|
174
|
+
variant: 'control' | 'treatment';
|
|
175
|
+
testId: string;
|
|
176
|
+
assignedAt: number;
|
|
177
|
+
}
|
|
178
|
+
/** One element of `TimeSeries.points`: a numeric timestamp and value, with optional metadata. */
interface TimeSeriesPoint {
|
|
179
|
+
timestamp: number;
|
|
180
|
+
value: number;
|
|
181
|
+
metadata?: Record<string, unknown>;
|
|
182
|
+
}
|
|
183
|
+
/** A metric name with its `TimeSeriesPoint` list, plus an optional aggregation kind and numeric interval. */
interface TimeSeries {
|
|
184
|
+
metric: string;
|
|
185
|
+
points: TimeSeriesPoint[];
|
|
186
|
+
aggregation?: 'sum' | 'avg' | 'min' | 'max' | 'count';
|
|
187
|
+
interval?: number;
|
|
188
|
+
}
|
|
189
|
+
/** Dashboard snapshot: timestamp, monitoring status, metric values, recent alerts, active A/B tests and an optional regression report. */
interface DashboardUpdate {
|
|
190
|
+
timestamp: number;
|
|
191
|
+
status: MonitoringStatus;
|
|
192
|
+
metrics: Record<string, number>;
|
|
193
|
+
recentAlerts: Alert[];
|
|
194
|
+
activeTests: ABTest[];
|
|
195
|
+
regressions?: RegressionResult;
|
|
196
|
+
}
|
|
197
|
+
/** Query options naming one metric and a `startTime`/`endTime` pair, with optional `interval` and `aggregation`. */
interface HistoricalQueryOptions {
|
|
198
|
+
metric: string;
|
|
199
|
+
startTime: number;
|
|
200
|
+
endTime: number;
|
|
201
|
+
interval?: 'minute' | 'hour' | 'day' | 'week';
|
|
202
|
+
aggregation?: 'avg' | 'min' | 'max' | 'sum' | 'count';
|
|
203
|
+
}
|
|
204
|
+
/**
 * Scheduling options; every field is optional.
 * NOTE(review): whether `cron` and `interval` are mutually exclusive is not expressed in the type.
 */
interface ScheduleConfig {
|
|
205
|
+
cron?: string;
|
|
206
|
+
interval?: number;
|
|
207
|
+
timezone?: string;
|
|
208
|
+
immediate?: boolean;
|
|
209
|
+
}
|
|
210
|
+
/**
 * Event names usable in `ContinuousEvalEvent.type`.
 * NOTE(review): this union differs from the keys of `ContinuousEvalEvents` - it has
 * alert/regression/test names and lacks 'status:changed' - confirm that is intended.
 */
type ContinuousEvalEventType = 'eval:started' | 'eval:completed' | 'eval:error' | 'alert:triggered' | 'alert:resolved' | 'regression:detected' | 'test:started' | 'test:completed' | 'test:significant';
|
|
211
|
+
/** Generic event envelope: an event type, a free-form data record and a numeric timestamp. */
interface ContinuousEvalEvent {
|
|
212
|
+
type: ContinuousEvalEventType;
|
|
213
|
+
data: Record<string, unknown>;
|
|
214
|
+
timestamp: number;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
/** Listener signatures for the events typed on `AlertManager`. */
interface AlertManagerEvents {
|
|
218
|
+
'alert:triggered': (alert: Alert) => void;
|
|
219
|
+
'alert:resolved': (alert: Alert) => void;
|
|
220
|
+
'notification:sent': (notification: AlertNotification) => void;
|
|
221
|
+
}
|
|
222
|
+
/**
 * Alert manager declaration; extends eventemitter3's `EventEmitter` typed with `AlertManagerEvents`.
 * Public surface: rule management (`addRule`, `removeRule`, `getRules`), `check` (returns an
 * `Alert` or `null`), `resolveAlert`, `acknowledgeAlert`, and accessors for active alerts
 * and the alert count. Private members are declared here without types.
 */
declare class AlertManager extends EventEmitter<AlertManagerEvents> {
|
|
223
|
+
private channels;
|
|
224
|
+
private rules;
|
|
225
|
+
private activeAlerts;
|
|
226
|
+
private cooldownMs;
|
|
227
|
+
private lastAlertTime;
|
|
228
|
+
private alertCount;
|
|
229
|
+
constructor(config: AlertManagerConfig);
|
|
230
|
+
addRule(rule: AlertRule): void;
|
|
231
|
+
removeRule(metric: string): boolean;
|
|
232
|
+
check(metric: string, value: number): Alert | null;
|
|
233
|
+
private triggerAlert;
|
|
234
|
+
resolveAlert(metric: string): void;
|
|
235
|
+
acknowledgeAlert(alertId: string): boolean;
|
|
236
|
+
private sendNotifications;
|
|
237
|
+
private sendToChannel;
|
|
238
|
+
private formatMessage;
|
|
239
|
+
getActiveAlerts(): Alert[];
|
|
240
|
+
getAlertCount(): number;
|
|
241
|
+
getRules(): AlertRule[];
|
|
242
|
+
}
|
|
243
|
+
/** Factory that takes an `AlertManagerConfig` and returns an `AlertManager`. */
declare function createAlertManager(config: AlertManagerConfig): AlertManager;
|
|
244
|
+
|
|
245
|
+
/** Listener signatures for the events typed on `ABTestRunner`. */
interface ABTestEvents {
|
|
246
|
+
'test:started': () => void;
|
|
247
|
+
'test:completed': (results: ABTestResults) => void;
|
|
248
|
+
'test:significant': (metric: string, winner: 'control' | 'treatment') => void;
|
|
249
|
+
'sample:assigned': (assignment: SampleAssignment) => void;
|
|
250
|
+
}
|
|
251
|
+
/**
 * A/B test runner declaration; extends eventemitter3's `EventEmitter` typed with `ABTestEvents`.
 * `start()` returns a promise, whereas `stop()`, `pause()` and `resume()` are synchronous;
 * `stop()` and `getResults()` both return `ABTestResults`.
 */
declare class ABTestRunner extends EventEmitter<ABTestEvents> {
|
|
252
|
+
readonly id: string;
|
|
253
|
+
readonly name: string;
|
|
254
|
+
private config;
|
|
255
|
+
private status;
|
|
256
|
+
private startedAt?;
|
|
257
|
+
private completedAt?;
|
|
258
|
+
private controlSamples;
|
|
259
|
+
private treatmentSamples;
|
|
260
|
+
private sampleCount;
|
|
261
|
+
constructor(config: ABTestConfig);
|
|
262
|
+
start(): Promise<void>;
|
|
263
|
+
stop(): ABTestResults;
|
|
264
|
+
pause(): void;
|
|
265
|
+
resume(): void;
|
|
266
|
+
assignVariant(): 'control' | 'treatment';
|
|
267
|
+
recordSample(variant: 'control' | 'treatment', sampleId: string, scores: Record<string, number>): void;
|
|
268
|
+
getResults(): ABTestResults;
|
|
269
|
+
private calculateMetricResult;
|
|
270
|
+
private getScoresForMetric;
|
|
271
|
+
private calculateSummary;
|
|
272
|
+
private calculatePValue;
|
|
273
|
+
private normalCDF;
|
|
274
|
+
private checkSignificance;
|
|
275
|
+
private generateRecommendation;
|
|
276
|
+
getStatus(): ABTestStatus;
|
|
277
|
+
getConfig(): ABTestConfig;
|
|
278
|
+
getTimestamps(): {
|
|
279
|
+
startedAt?: number;
|
|
280
|
+
completedAt?: number;
|
|
281
|
+
};
|
|
282
|
+
}
|
|
283
|
+
/** Factory that takes an `ABTestConfig` and returns an `ABTestRunner`. */
declare function createABTestRunner(config: ABTestConfig): ABTestRunner;
|
|
284
|
+
|
|
285
|
+
/** Listener signatures for the events typed on `ContinuousEval`. */
interface ContinuousEvalEvents {
|
|
286
|
+
'eval:started': () => void;
|
|
287
|
+
'eval:completed': (result: EvalOutput) => void;
|
|
288
|
+
'eval:error': (error: Error) => void;
|
|
289
|
+
'status:changed': (status: MonitoringStatus) => void;
|
|
290
|
+
}
|
|
291
|
+
/**
 * Continuous-evaluation monitor declaration; extends eventemitter3's `EventEmitter` typed with
 * `ContinuousEvalEvents`. `evaluate` resolves to an `EvalOutput` or `null`.
 * NOTE(review): the `null` case presumably means the input was not evaluated (e.g. skipped
 * by sampling or while not running) - confirm against the implementation.
 */
declare class ContinuousEval extends EventEmitter<ContinuousEvalEvents> {
|
|
292
|
+
private pipeline;
|
|
293
|
+
private sampleRate;
|
|
294
|
+
private status;
|
|
295
|
+
private startedAt?;
|
|
296
|
+
private lastEvalAt?;
|
|
297
|
+
private totalEvaluations;
|
|
298
|
+
private passedCount;
|
|
299
|
+
private scoreHistory;
|
|
300
|
+
private alertManager?;
|
|
301
|
+
private abTests;
|
|
302
|
+
private intervalId?;
|
|
303
|
+
constructor(config: ContinuousEvalConfig);
|
|
304
|
+
setAlerts(alertManager: AlertManager, rules: Record<string, {
|
|
305
|
+
threshold: number;
|
|
306
|
+
direction: 'above' | 'below';
|
|
307
|
+
}>): void;
|
|
308
|
+
start(): void;
|
|
309
|
+
stop(): void;
|
|
310
|
+
pause(): void;
|
|
311
|
+
resume(): void;
|
|
312
|
+
evaluate(input: EvalInput): Promise<EvalOutput | null>;
|
|
313
|
+
getStats(): ContinuousEvalStats;
|
|
314
|
+
createABTest(config: ABTestConfig): ABTestRunner;
|
|
315
|
+
getABTest(id: string): ABTestRunner | undefined;
|
|
316
|
+
getABTests(): ABTestRunner[];
|
|
317
|
+
getScoreHistory(metric: string): number[];
|
|
318
|
+
reset(): void;
|
|
319
|
+
}
|
|
320
|
+
/** Factory that takes a `ContinuousEvalConfig` and returns a `ContinuousEval`. */
declare function createContinuousEval(config: ContinuousEvalConfig): ContinuousEval;
|
|
321
|
+
|
|
322
|
+
export { type AlertChannelType as A, type BaselineMetrics as B, type ContinuousEvalConfig as C, type DashboardUpdate as D, type EvaluationPipelineRef as E, createAlertManager as F, ABTestRunner as G, type HistoricalQueryOptions as H, createABTestRunner as I, type MonitoringStatus as M, type RegressionDetectorConfig as R, type SampleAssignment as S, type TimeSeriesPoint as T, type VariantConfig as V, type EvalInput as a, type EvalOutput as b, type ContinuousEvalStats as c, type AlertChannelConfig as d, type AlertRule as e, type AlertManagerConfig as f, type Alert as g, type AlertNotification as h, type MetricBaseline as i, type RegressionResult as j, type MetricRegression as k, type MetricImprovement as l, type ABTestConfig as m, type ABTestVariants as n, type ABTestStatus as o, type ABTest as p, type ABTestResults as q, type ABMetricResult as r, type MetricSummary as s, type TimeSeries as t, type ScheduleConfig as u, type ContinuousEvalEventType as v, type ContinuousEvalEvent as w, ContinuousEval as x, createContinuousEval as y, AlertManager as z };
|