@artemiskit/core 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/dist/adapters/types.d.ts +5 -0
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/manifest.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +20 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/pricing.d.ts +2 -1
- package/dist/cost/pricing.d.ts.map +1 -1
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/index.js +468 -205
- package/dist/scenario/schema.d.ts +8 -0
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +66 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +5 -0
- package/src/artifacts/manifest.ts +24 -2
- package/src/artifacts/types.ts +21 -0
- package/src/cost/pricing.ts +242 -65
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/similarity.test.ts +4 -3
- package/src/scenario/schema.ts +4 -0
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +186 -4
- package/src/storage/types.ts +77 -0
- package/dist/events/emitter.d.ts +0 -111
- package/dist/events/emitter.d.ts.map +0 -1
- package/dist/events/index.d.ts +0 -6
- package/dist/events/index.d.ts.map +0 -1
- package/dist/events/types.d.ts +0 -177
- package/dist/events/types.d.ts.map +0 -1
|
@@ -22,6 +22,7 @@ export declare const ProviderConfigSchema: z.ZodOptional<z.ZodObject<{
|
|
|
22
22
|
deploymentName: z.ZodOptional<z.ZodString>;
|
|
23
23
|
apiVersion: z.ZodOptional<z.ZodString>;
|
|
24
24
|
embeddingDeploymentName: z.ZodOptional<z.ZodString>;
|
|
25
|
+
modelFamily: z.ZodOptional<z.ZodString>;
|
|
25
26
|
underlyingProvider: z.ZodOptional<z.ZodEnum<["openai", "azure", "anthropic", "google", "mistral"]>>;
|
|
26
27
|
}, "strip", z.ZodTypeAny, {
|
|
27
28
|
apiKey?: string | undefined;
|
|
@@ -34,6 +35,7 @@ export declare const ProviderConfigSchema: z.ZodOptional<z.ZodObject<{
|
|
|
34
35
|
deploymentName?: string | undefined;
|
|
35
36
|
apiVersion?: string | undefined;
|
|
36
37
|
embeddingDeploymentName?: string | undefined;
|
|
38
|
+
modelFamily?: string | undefined;
|
|
37
39
|
underlyingProvider?: "openai" | "anthropic" | "google" | "mistral" | "azure" | undefined;
|
|
38
40
|
}, {
|
|
39
41
|
apiKey?: string | undefined;
|
|
@@ -46,6 +48,7 @@ export declare const ProviderConfigSchema: z.ZodOptional<z.ZodObject<{
|
|
|
46
48
|
deploymentName?: string | undefined;
|
|
47
49
|
apiVersion?: string | undefined;
|
|
48
50
|
embeddingDeploymentName?: string | undefined;
|
|
51
|
+
modelFamily?: string | undefined;
|
|
49
52
|
underlyingProvider?: "openai" | "anthropic" | "google" | "mistral" | "azure" | undefined;
|
|
50
53
|
}>>;
|
|
51
54
|
/**
|
|
@@ -1137,6 +1140,7 @@ export declare const ScenarioSchema: z.ZodObject<{
|
|
|
1137
1140
|
deploymentName: z.ZodOptional<z.ZodString>;
|
|
1138
1141
|
apiVersion: z.ZodOptional<z.ZodString>;
|
|
1139
1142
|
embeddingDeploymentName: z.ZodOptional<z.ZodString>;
|
|
1143
|
+
modelFamily: z.ZodOptional<z.ZodString>;
|
|
1140
1144
|
underlyingProvider: z.ZodOptional<z.ZodEnum<["openai", "azure", "anthropic", "google", "mistral"]>>;
|
|
1141
1145
|
}, "strip", z.ZodTypeAny, {
|
|
1142
1146
|
apiKey?: string | undefined;
|
|
@@ -1149,6 +1153,7 @@ export declare const ScenarioSchema: z.ZodObject<{
|
|
|
1149
1153
|
deploymentName?: string | undefined;
|
|
1150
1154
|
apiVersion?: string | undefined;
|
|
1151
1155
|
embeddingDeploymentName?: string | undefined;
|
|
1156
|
+
modelFamily?: string | undefined;
|
|
1152
1157
|
underlyingProvider?: "openai" | "anthropic" | "google" | "mistral" | "azure" | undefined;
|
|
1153
1158
|
}, {
|
|
1154
1159
|
apiKey?: string | undefined;
|
|
@@ -1161,6 +1166,7 @@ export declare const ScenarioSchema: z.ZodObject<{
|
|
|
1161
1166
|
deploymentName?: string | undefined;
|
|
1162
1167
|
apiVersion?: string | undefined;
|
|
1163
1168
|
embeddingDeploymentName?: string | undefined;
|
|
1169
|
+
modelFamily?: string | undefined;
|
|
1164
1170
|
underlyingProvider?: "openai" | "anthropic" | "google" | "mistral" | "azure" | undefined;
|
|
1165
1171
|
}>>;
|
|
1166
1172
|
seed: z.ZodOptional<z.ZodNumber>;
|
|
@@ -2000,6 +2006,7 @@ export declare const ScenarioSchema: z.ZodObject<{
|
|
|
2000
2006
|
deploymentName?: string | undefined;
|
|
2001
2007
|
apiVersion?: string | undefined;
|
|
2002
2008
|
embeddingDeploymentName?: string | undefined;
|
|
2009
|
+
modelFamily?: string | undefined;
|
|
2003
2010
|
underlyingProvider?: "openai" | "anthropic" | "google" | "mistral" | "azure" | undefined;
|
|
2004
2011
|
} | undefined;
|
|
2005
2012
|
seed?: number | undefined;
|
|
@@ -2156,6 +2163,7 @@ export declare const ScenarioSchema: z.ZodObject<{
|
|
|
2156
2163
|
deploymentName?: string | undefined;
|
|
2157
2164
|
apiVersion?: string | undefined;
|
|
2158
2165
|
embeddingDeploymentName?: string | undefined;
|
|
2166
|
+
modelFamily?: string | undefined;
|
|
2159
2167
|
underlyingProvider?: "openai" | "anthropic" | "google" | "mistral" | "azure" | undefined;
|
|
2160
2168
|
} | undefined;
|
|
2161
2169
|
seed?: number | undefined;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/scenario/schema.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB;;GAEG;AACH,eAAO,MAAM,cAAc,mIAWzB,CAAC;AAEH;;;;GAIG;AACH,eAAO,MAAM,oBAAoB
|
|
1
|
+
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/scenario/schema.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB;;GAEG;AACH,eAAO,MAAM,cAAc,mIAWzB,CAAC;AAEH;;;;GAIG;AACH,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyBpB,CAAC;AAoFd;;;GAGG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IA7BvB,0GAA0G;;IAE1G,8EAA8E;;IAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QAJvF,0GAA0G;;QAE1G,8EAA8E;;QAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAyBR,CAAC;AAEpF;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;EAG5B,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,eAAe,gFAAuE,CAAC;AAEpG;;;GAGG;AACH,QAAA,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;GAAuC,CAAC;AAEnE;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QArDvB,0GAA0G;;QAE1G,8EAA8E;;QAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YAJvF,0GAA0G;;YAE1G,8EAA8E;;YAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IA8DzF,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAE7D,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAYzB,6CAA6C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;YArF3C,0GAA0G;;YAE1G,8EAA8E;;YAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBAJvF,0GAA0G;;gBAE1G,8EAA8E;;gBAE9E,uFAAuF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QA8DzF,6DAA6D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAiC7D,CAAC;AAEH,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AACtD,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAClE,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAChE,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,uBAAuB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC"}
|
package/dist/storage/local.d.ts
CHANGED
|
@@ -2,9 +2,10 @@
|
|
|
2
2
|
* Local filesystem storage adapter
|
|
3
3
|
*/
|
|
4
4
|
import type { AnyManifest, RedTeamManifest, RunManifest, StressManifest } from '../artifacts/types';
|
|
5
|
-
import type { ComparisonResult, ListOptions, RunListItem
|
|
6
|
-
export declare class LocalStorageAdapter implements
|
|
5
|
+
import type { BaselineMetadata, BaselineStorageAdapter, ComparisonResult, ListOptions, RunListItem } from './types';
|
|
6
|
+
export declare class LocalStorageAdapter implements BaselineStorageAdapter {
|
|
7
7
|
private basePath;
|
|
8
|
+
private baselinesPath;
|
|
8
9
|
constructor(basePath?: string);
|
|
9
10
|
save(manifest: AnyManifest): Promise<string>;
|
|
10
11
|
load(runId: string): Promise<AnyManifest>;
|
|
@@ -16,5 +17,46 @@ export declare class LocalStorageAdapter implements StorageAdapter {
|
|
|
16
17
|
compare(baselineId: string, currentId: string): Promise<ComparisonResult>;
|
|
17
18
|
private listDirectories;
|
|
18
19
|
private listFiles;
|
|
20
|
+
/**
|
|
21
|
+
* Load baselines file
|
|
22
|
+
*/
|
|
23
|
+
private loadBaselinesFile;
|
|
24
|
+
/**
|
|
25
|
+
* Save baselines file
|
|
26
|
+
*/
|
|
27
|
+
private saveBaselinesFile;
|
|
28
|
+
/**
|
|
29
|
+
* Set a baseline for a scenario
|
|
30
|
+
*/
|
|
31
|
+
setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata>;
|
|
32
|
+
/**
|
|
33
|
+
* Get the baseline for a scenario
|
|
34
|
+
*/
|
|
35
|
+
getBaseline(scenario: string): Promise<BaselineMetadata | null>;
|
|
36
|
+
/**
|
|
37
|
+
* Get a baseline by run ID
|
|
38
|
+
*/
|
|
39
|
+
getBaselineByRunId(runId: string): Promise<BaselineMetadata | null>;
|
|
40
|
+
/**
|
|
41
|
+
* List all baselines
|
|
42
|
+
*/
|
|
43
|
+
listBaselines(): Promise<BaselineMetadata[]>;
|
|
44
|
+
/**
|
|
45
|
+
* Remove a baseline by scenario name
|
|
46
|
+
*/
|
|
47
|
+
removeBaseline(scenario: string): Promise<boolean>;
|
|
48
|
+
/**
|
|
49
|
+
* Remove a baseline by run ID
|
|
50
|
+
*/
|
|
51
|
+
removeBaselineByRunId(runId: string): Promise<boolean>;
|
|
52
|
+
/**
|
|
53
|
+
* Compare a run against its baseline (if exists)
|
|
54
|
+
*/
|
|
55
|
+
compareToBaseline(runId: string, regressionThreshold?: number): Promise<{
|
|
56
|
+
baseline: BaselineMetadata;
|
|
57
|
+
comparison: ComparisonResult;
|
|
58
|
+
hasRegression: boolean;
|
|
59
|
+
regressionThreshold: number;
|
|
60
|
+
} | null>;
|
|
19
61
|
}
|
|
20
62
|
//# sourceMappingURL=local.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"local.d.ts","sourceRoot":"","sources":["../../src/storage/local.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACpG,OAAO,KAAK,
|
|
1
|
+
{"version":3,"file":"local.d.ts","sourceRoot":"","sources":["../../src/storage/local.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACpG,OAAO,KAAK,EACV,gBAAgB,EAChB,sBAAsB,EACtB,gBAAgB,EAChB,WAAW,EACX,WAAW,EACZ,MAAM,SAAS,CAAC;AAyDjB,qBAAa,mBAAoB,YAAW,sBAAsB;IAChE,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,aAAa,CAAS;gBAElB,QAAQ,SAAmB;IAKjC,IAAI,CAAC,QAAQ,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC;IAU5C,IAAI,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAczC,OAAO,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAQ5C,WAAW,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAQpD,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,CAAC;IAQlD,IAAI,CAAC,OAAO,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IA0DnD,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAYpC,OAAO,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;YAiBjE,eAAe;YASf,SAAS;IAWvB;;OAEG;YACW,iBAAiB;IAS/B;;OAEG;YACW,iBAAiB;IAM/B;;OAEG;IACG,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;IA4B3F;;OAEG;IACG,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAKrE;;OAEG;IACG,kBAAkB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAMzE;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAOlD;;OAEG;IACG,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAUxD;;OAEG;IACG,qBAAqB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAW5D;;OAEG;IACG,iBAAiB,CACrB,KAAK,EAAE,MAAM,EACb,mBAAmB,SAAO,GACzB,OAAO,CAAC;QACT,QAAQ,EAAE,gBAAgB,CAAC;QAC3B,UAAU,EAAE,gBAAgB,CAAC;QAC7B,aAAa,EAAE,OAAO,CAAC;QACvB,mBAAmB,EAAE,MAAM,CAAC;KAC7B,GAAG,IAAI,CAAC;CAwBV"}
|
package/dist/storage/types.d.ts
CHANGED
|
@@ -12,6 +12,8 @@ export interface RunListItem {
|
|
|
12
12
|
createdAt: string;
|
|
13
13
|
/** Type of manifest (run, redteam, stress) */
|
|
14
14
|
type?: 'run' | 'redteam' | 'stress';
|
|
15
|
+
/** Estimated cost in USD (optional, included when --show-cost is used) */
|
|
16
|
+
estimatedCostUsd?: number;
|
|
15
17
|
}
|
|
16
18
|
/**
|
|
17
19
|
* Comparison result between two runs
|
|
@@ -35,6 +37,8 @@ export interface ListOptions {
|
|
|
35
37
|
offset?: number;
|
|
36
38
|
/** Filter by manifest type */
|
|
37
39
|
type?: 'run' | 'redteam' | 'stress';
|
|
40
|
+
/** Include cost information in results */
|
|
41
|
+
includeCost?: boolean;
|
|
38
42
|
}
|
|
39
43
|
/**
|
|
40
44
|
* Storage adapter interface - implement to create custom storage backends
|
|
@@ -83,4 +87,66 @@ export interface StorageConfig {
|
|
|
83
87
|
bucket?: string;
|
|
84
88
|
basePath?: string;
|
|
85
89
|
}
|
|
90
|
+
/**
|
|
91
|
+
* Baseline metadata for regression comparison
|
|
92
|
+
*/
|
|
93
|
+
export interface BaselineMetadata {
|
|
94
|
+
/** Scenario name or identifier */
|
|
95
|
+
scenario: string;
|
|
96
|
+
/** Run ID of the baseline */
|
|
97
|
+
runId: string;
|
|
98
|
+
/** ISO timestamp when baseline was set */
|
|
99
|
+
createdAt: string;
|
|
100
|
+
/** Key metrics captured at baseline time */
|
|
101
|
+
metrics: {
|
|
102
|
+
successRate: number;
|
|
103
|
+
medianLatencyMs: number;
|
|
104
|
+
totalTokens: number;
|
|
105
|
+
passedCases: number;
|
|
106
|
+
failedCases: number;
|
|
107
|
+
totalCases: number;
|
|
108
|
+
};
|
|
109
|
+
/** Optional description or tag */
|
|
110
|
+
tag?: string;
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Extended storage adapter with baseline support
|
|
114
|
+
*/
|
|
115
|
+
export interface BaselineStorageAdapter extends StorageAdapter {
|
|
116
|
+
/**
|
|
117
|
+
* Set a baseline for a scenario
|
|
118
|
+
*/
|
|
119
|
+
setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata>;
|
|
120
|
+
/**
|
|
121
|
+
* Get the baseline by scenario name
|
|
122
|
+
*/
|
|
123
|
+
getBaseline(scenario: string): Promise<BaselineMetadata | null>;
|
|
124
|
+
/**
|
|
125
|
+
* Get the baseline by run ID
|
|
126
|
+
*/
|
|
127
|
+
getBaselineByRunId(runId: string): Promise<BaselineMetadata | null>;
|
|
128
|
+
/**
|
|
129
|
+
* List all baselines
|
|
130
|
+
*/
|
|
131
|
+
listBaselines(): Promise<BaselineMetadata[]>;
|
|
132
|
+
/**
|
|
133
|
+
* Remove a baseline by scenario name
|
|
134
|
+
*/
|
|
135
|
+
removeBaseline(scenario: string): Promise<boolean>;
|
|
136
|
+
/**
|
|
137
|
+
* Remove a baseline by run ID
|
|
138
|
+
*/
|
|
139
|
+
removeBaselineByRunId(runId: string): Promise<boolean>;
|
|
140
|
+
/**
|
|
141
|
+
* Compare a run against its baseline (if exists)
|
|
142
|
+
* @param runId - The run ID to compare
|
|
143
|
+
* @param regressionThreshold - Threshold for regression detection (0-1), default 0.05
|
|
144
|
+
*/
|
|
145
|
+
compareToBaseline?(runId: string, regressionThreshold?: number): Promise<{
|
|
146
|
+
baseline: BaselineMetadata;
|
|
147
|
+
comparison: ComparisonResult;
|
|
148
|
+
hasRegression: boolean;
|
|
149
|
+
regressionThreshold: number;
|
|
150
|
+
} | null>;
|
|
151
|
+
}
|
|
86
152
|
//# sourceMappingURL=types.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/storage/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AAEpG;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,8CAA8C;IAC9C,IAAI,CAAC,EAAE,KAAK,GAAG,SAAS,GAAG,QAAQ,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/storage/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AAEpG;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,8CAA8C;IAC9C,IAAI,CAAC,EAAE,KAAK,GAAG,SAAS,GAAG,QAAQ,CAAC;IACpC,0EAA0E;IAC1E,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,WAAW,CAAC;IACtB,OAAO,EAAE,WAAW,CAAC;IACrB,KAAK,EAAE;QACL,WAAW,EAAE,MAAM,CAAC;QACpB,OAAO,EAAE,MAAM,CAAC;QAChB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,8BAA8B;IAC9B,IAAI,CAAC,EAAE,KAAK,GAAG,SAAS,GAAG,QAAQ,CAAC;IACpC,0CAA0C;IAC1C,WAAW,CAAC,EAAE,OAAO,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B;;OAEG;IACH,IAAI,CAAC,QAAQ,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAE7C;;OAEG;IACH,IAAI,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAE1C;;OAEG;IACH,OAAO,CAAC,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAE9C;;OAEG;IACH,WAAW,CAAC,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAEtD;;OAEG;IACH,UAAU,CAAC,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAEpD;;OAEG;IACH,IAAI,CAAC,OAAO,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC;IAEpD;;OAEG;IACH,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErC;;OAEG;IACH,OAAO,CAAC,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;CAC5E;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC;IAC3B,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,kCAAkC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,6BAA6B;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,4CAA4C;IAC5C,OAAO,EAAE;QACP,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,CAAC;QACxB,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,kCAAkC;IAClC,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,sBAAuB,SAAQ,cAAc;IAC5D;;OAEG;IACH,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;IAEtF;;OAEG;IACH,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAAC;IAEhE;;OAEG;IACH,kBAAkB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAAC;IAEpE;;OAEG;IACH,aAAa,IAAI,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAAC;IAE7C;;OAEG;IACH,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;IAEnD;;OAEG;IACH,qBAAqB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;IAEvD;;;;OAIG;IACH,iBAAiB,CAAC,CAChB,KAAK,EAAE,MAAM,EACb,mBAAmB,CAAC,EAAE,MAAM,GAC3B,OAAO,CAAC;QACT,QAAQ,EAAE,gBAAgB,CAAC;QAC3B,UAAU,EAAE,gBAAgB,CAAC;QAC7B,aAAa,EAAE,OAAO,CAAC;QACvB,mBAAmB,EAAE,MAAM,CAAC;KAC7B,GAAG,IAAI,CAAC,CAAC;CACX"}
|
package/package.json
CHANGED
package/src/adapters/types.ts
CHANGED
|
@@ -157,6 +157,11 @@ export interface AzureOpenAIAdapterConfig extends BaseAdapterConfig {
|
|
|
157
157
|
apiVersion: string;
|
|
158
158
|
/** Optional separate deployment name for embedding models */
|
|
159
159
|
embeddingDeploymentName?: string;
|
|
160
|
+
/**
|
|
161
|
+
* Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
|
|
162
|
+
* Used to determine which API parameters to use (max_tokens vs max_completion_tokens)
|
|
163
|
+
*/
|
|
164
|
+
modelFamily?: string;
|
|
160
165
|
}
|
|
161
166
|
|
|
162
167
|
/**
|
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
5
|
import { nanoid } from 'nanoid';
|
|
6
|
+
import { estimateCost, getModelPricing } from '../cost/pricing';
|
|
6
7
|
import { getEnvironmentInfo } from '../provenance/environment';
|
|
7
8
|
import { getGitInfo } from '../provenance/git';
|
|
8
9
|
import type {
|
|
9
10
|
CaseResult,
|
|
11
|
+
CostEstimateInfo,
|
|
10
12
|
ManifestRedactionInfo,
|
|
11
13
|
ResolvedConfig,
|
|
12
14
|
RunConfig,
|
|
@@ -40,7 +42,9 @@ export function createRunManifest(options: {
|
|
|
40
42
|
redaction,
|
|
41
43
|
} = options;
|
|
42
44
|
|
|
43
|
-
|
|
45
|
+
// Get model for cost calculation - prefer resolvedConfig, then config
|
|
46
|
+
const modelForCost = resolvedConfig?.model || config.model;
|
|
47
|
+
const metrics = calculateMetrics(cases, modelForCost);
|
|
44
48
|
const git = getGitInfo();
|
|
45
49
|
const environment = getEnvironmentInfo();
|
|
46
50
|
|
|
@@ -69,7 +73,7 @@ export function createRunManifest(options: {
|
|
|
69
73
|
/**
|
|
70
74
|
* Calculate metrics from case results
|
|
71
75
|
*/
|
|
72
|
-
function calculateMetrics(cases: CaseResult[]): RunMetrics {
|
|
76
|
+
function calculateMetrics(cases: CaseResult[], model?: string): RunMetrics {
|
|
73
77
|
const passedCases = cases.filter((c) => c.ok);
|
|
74
78
|
const latencies = cases.map((c) => c.latencyMs).sort((a, b) => a - b);
|
|
75
79
|
|
|
@@ -81,6 +85,23 @@ function calculateMetrics(cases: CaseResult[]): RunMetrics {
|
|
|
81
85
|
const totalPromptTokens = cases.reduce((sum, c) => sum + c.tokens.prompt, 0);
|
|
82
86
|
const totalCompletionTokens = cases.reduce((sum, c) => sum + c.tokens.completion, 0);
|
|
83
87
|
|
|
88
|
+
// Calculate cost if model is provided
|
|
89
|
+
let cost: CostEstimateInfo | undefined;
|
|
90
|
+
if (model && (totalPromptTokens > 0 || totalCompletionTokens > 0)) {
|
|
91
|
+
const costEstimate = estimateCost(totalPromptTokens, totalCompletionTokens, model);
|
|
92
|
+
const pricing = getModelPricing(model);
|
|
93
|
+
cost = {
|
|
94
|
+
total_usd: costEstimate.totalUsd,
|
|
95
|
+
prompt_cost_usd: costEstimate.promptCostUsd,
|
|
96
|
+
completion_cost_usd: costEstimate.completionCostUsd,
|
|
97
|
+
model: costEstimate.model,
|
|
98
|
+
pricing: {
|
|
99
|
+
prompt_per_1k: pricing.promptPer1K,
|
|
100
|
+
completion_per_1k: pricing.completionPer1K,
|
|
101
|
+
},
|
|
102
|
+
};
|
|
103
|
+
}
|
|
104
|
+
|
|
84
105
|
return {
|
|
85
106
|
success_rate: cases.length > 0 ? passedCases.length / cases.length : 0,
|
|
86
107
|
total_cases: cases.length,
|
|
@@ -91,6 +112,7 @@ function calculateMetrics(cases: CaseResult[]): RunMetrics {
|
|
|
91
112
|
total_tokens: totalPromptTokens + totalCompletionTokens,
|
|
92
113
|
total_prompt_tokens: totalPromptTokens,
|
|
93
114
|
total_completion_tokens: totalCompletionTokens,
|
|
115
|
+
cost,
|
|
94
116
|
};
|
|
95
117
|
}
|
|
96
118
|
|
package/src/artifacts/types.ts
CHANGED
|
@@ -67,6 +67,25 @@ export interface CaseResult {
|
|
|
67
67
|
redaction?: CaseRedactionInfo;
|
|
68
68
|
}
|
|
69
69
|
|
|
70
|
+
/**
|
|
71
|
+
* Cost estimation details
|
|
72
|
+
*/
|
|
73
|
+
export interface CostEstimateInfo {
|
|
74
|
+
/** Estimated total cost in USD */
|
|
75
|
+
total_usd: number;
|
|
76
|
+
/** Cost for prompt/input tokens */
|
|
77
|
+
prompt_cost_usd: number;
|
|
78
|
+
/** Cost for completion/output tokens */
|
|
79
|
+
completion_cost_usd: number;
|
|
80
|
+
/** Model used for cost calculation */
|
|
81
|
+
model: string;
|
|
82
|
+
/** Pricing used (per 1K tokens) */
|
|
83
|
+
pricing: {
|
|
84
|
+
prompt_per_1k: number;
|
|
85
|
+
completion_per_1k: number;
|
|
86
|
+
};
|
|
87
|
+
}
|
|
88
|
+
|
|
70
89
|
/**
|
|
71
90
|
* Run metrics
|
|
72
91
|
*/
|
|
@@ -80,6 +99,8 @@ export interface RunMetrics {
|
|
|
80
99
|
total_tokens: number;
|
|
81
100
|
total_prompt_tokens: number;
|
|
82
101
|
total_completion_tokens: number;
|
|
102
|
+
/** Estimated cost information */
|
|
103
|
+
cost?: CostEstimateInfo;
|
|
83
104
|
}
|
|
84
105
|
|
|
85
106
|
/**
|