evalsense 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +678 -0
- package/bin/evalsense.js +3 -0
- package/dist/chunk-5P7LNNO6.js +747 -0
- package/dist/chunk-5P7LNNO6.js.map +1 -0
- package/dist/chunk-BRPM6AB6.js +925 -0
- package/dist/chunk-BRPM6AB6.js.map +1 -0
- package/dist/chunk-HDJID3GC.cjs +779 -0
- package/dist/chunk-HDJID3GC.cjs.map +1 -0
- package/dist/chunk-Y23VHTD3.cjs +942 -0
- package/dist/chunk-Y23VHTD3.cjs.map +1 -0
- package/dist/cli.cjs +65 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +63 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1126 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +604 -0
- package/dist/index.d.ts +604 -0
- package/dist/index.js +1043 -0
- package/dist/index.js.map +1 -0
- package/dist/metrics/index.cjs +275 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.cts +299 -0
- package/dist/metrics/index.d.ts +299 -0
- package/dist/metrics/index.js +191 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/opinionated/index.cjs +24 -0
- package/dist/metrics/opinionated/index.cjs.map +1 -0
- package/dist/metrics/opinionated/index.d.cts +163 -0
- package/dist/metrics/opinionated/index.d.ts +163 -0
- package/dist/metrics/opinionated/index.js +3 -0
- package/dist/metrics/opinionated/index.js.map +1 -0
- package/dist/types-C71p0wzM.d.cts +265 -0
- package/dist/types-C71p0wzM.d.ts +265 -0
- package/package.json +91 -0
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core type definitions for EvalSense
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Container for a loaded dataset: the list of records plus its metadata.
 *
 * @typeParam T - Shape of a single record; defaults to a string-keyed object
 *   whose values are `unknown`.
 */
interface Dataset<T = Record<string, unknown>> {
  /** The dataset's records. */
  records: Array<T>;
  /** Metadata describing this dataset. */
  metadata: DatasetMetadata;
}
|
|
11
|
+
/**
 * Descriptive metadata attached to a loaded dataset.
 */
interface DatasetMetadata {
  /** Where the dataset came from (presumably a path or identifier - confirm against the loader). */
  source: string;
  /** Number of records (presumably matches `records.length` - confirm against the loader). */
  count: number;
  /** When the dataset was loaded. */
  loadedAt: Date;
}
|
|
16
|
+
/**
 * One record in which the model output (`actual`) and the ground truth
 * (`expected`) have been aligned under a shared `id`.
 */
interface AlignedRecord {
  /** Identifier shared by both sides of the pair. */
  id: string;
  /** Model output for this record. */
  actual: Record<string, unknown>;
  /** Ground truth for this record. */
  expected: Record<string, unknown>;
}
|
|
24
|
+
/**
 * A prediction as output by runModel(): an `id` used for alignment plus any
 * number of additional fields.
 */
interface Prediction {
  /** Identifier used for alignment. */
  id: string;
  /** Any further fields carried by the prediction. */
  [key: string]: unknown;
}
|
|
31
|
+
/**
 * JSON Schema shape used for structured LLM outputs.
 *
 * Only `type`, `properties` and `required` are typed explicitly; every other
 * key is accepted through the index signature as `unknown`.
 */
interface JSONSchema {
  type: string;
  properties?: Record<string, unknown>;
  required?: Array<string>;
  [keyword: string]: unknown;
}
|
|
40
|
+
/**
 * Client interface that metric evaluation uses to reach an LLM.
 */
interface LLMClient {
  /**
   * Produces a text completion for the given prompt.
   */
  complete(prompt: string): Promise<string>;
  /**
   * Produces a structured JSON completion for the given prompt and schema.
   * Optional: an implementation may leave this method out.
   *
   * @typeParam T - Type the caller expects the resolved value to have.
   */
  completeStructured?<T>(prompt: string, schema: JSONSchema): Promise<T>;
}
|
|
53
|
+
/**
 * A single result produced by an LLM metric evaluation.
 */
interface MetricOutput {
  id: string;
  metric: string;
  score: number;
  label?: string;
  /** Reasoning/explanation given by the LLM (for LLM-based metrics). */
  reasoning?: string;
  /** Which evaluation mode was used (for LLM-based metrics). */
  evaluationMode?: "batch" | "per-row";
}
|
|
66
|
+
/**
 * Options accepted by a metric function.
 */
interface MetricConfig {
  /** The outputs to evaluate, each paired with an `id`. */
  outputs: { id: string; output: string }[];
  context?: Array<string>;
  query?: Array<string>;
  source?: Array<string>;
  /** Overrides the LLM client; the global client is used when omitted. */
  llmClient?: LLMClient;
  /** Evaluation mode: "per-row" (accurate, expensive) or "batch" (cheaper, potentially less accurate). */
  evaluationMode?: "batch" | "per-row";
  /** Overrides the prompt template. */
  customPrompt?: string;
  /** LLM temperature (default: 0). */
  temperature?: number;
  /** Maximum number of tokens per completion. */
  maxTokens?: number;
  /** Timeout, in milliseconds. */
  timeout?: number;
}
|
|
90
|
+
/**
 * Signature of a metric function: takes a {@link MetricConfig} and resolves
 * to the list of metric outputs.
 */
type MetricFn = (config: MetricConfig) => Promise<Array<MetricOutput>>;
|
|
94
|
+
/**
 * A confusion matrix together with its labels.
 */
interface ConfusionMatrix {
  /** Two-dimensional table of numbers. */
  matrix: Array<number[]>;
  /** Labels for the matrix (presumably `labels[i]` names row/column `i` - confirm). */
  labels: Array<string>;
  total: number;
}
|
|
102
|
+
/**
 * Classification metrics for a single class.
 */
interface ClassMetrics {
  precision: number;
  recall: number;
  f1: number;
  /** Support for the class (presumably the number of true instances - confirm). */
  support: number;
}
|
|
111
|
+
/**
 * Complete classification metrics result: overall accuracy, per-class
 * metrics, macro and weighted averages, and the confusion matrix.
 */
interface ClassificationMetrics {
  accuracy: number;
  /** Metrics for each class, keyed by a string (presumably the class label - confirm). */
  perClass: Record<string, ClassMetrics>;
  macroAvg: { precision: number; recall: number; f1: number };
  weightedAvg: { precision: number; recall: number; f1: number };
  confusionMatrix: ConfusionMatrix;
}
|
|
129
|
+
/**
 * Result of a regression metrics computation.
 *
 * NOTE(review): field names look like the usual abbreviations (mean absolute
 * error, mean squared error, root mean squared error, R-squared) - confirm
 * against the implementation.
 */
interface RegressionMetrics {
  mae: number;
  mse: number;
  rmse: number;
  r2: number;
}
|
|
138
|
+
/**
 * Outcome of evaluating one field across all predictions.
 */
interface FieldMetricResult {
  /** Name of the evaluated field. */
  field: string;
  metrics: ClassificationMetrics;
  binarized: boolean;
  /** Optional threshold (presumably only set when `binarized` is true - confirm). */
  binarizeThreshold?: number;
}
|
|
147
|
+
/**
 * Signature of a test function: no arguments, returning either nothing or a
 * promise that resolves to nothing.
 */
type TestFn = () => void | Promise<void>;
|
|
151
|
+
/**
 * A single eval test: a name and the function to run.
 */
interface EvalTest {
  name: string;
  fn: TestFn;
}
|
|
158
|
+
/**
 * A test suite (a describe block): a named group of tests with optional
 * hook lists.
 *
 * NOTE(review): the hook names follow the familiar beforeAll/afterAll/
 * beforeEach/afterEach convention; when each list runs is decided by the
 * runner, which is not visible here.
 */
interface Suite {
  name: string;
  tests: Array<EvalTest>;
  beforeAll?: Array<TestFn>;
  afterAll?: Array<TestFn>;
  beforeEach?: Array<TestFn>;
  afterEach?: Array<TestFn>;
}
|
|
169
|
+
/**
 * Context of the current test execution.
 */
interface TestContext {
  /** The current suite, or `null` when there is none. */
  currentSuite: null | Suite;
  suites: Array<Suite>;
  results: Array<SuiteResult>;
}
|
|
177
|
+
/**
 * Outcome of one assertion.
 */
interface AssertionResult {
  type: string;
  /** Whether the assertion passed. */
  passed: boolean;
  message: string;
  expected?: unknown;
  actual?: unknown;
  field?: string;
  class?: string;
}
|
|
189
|
+
/**
 * Outcome of one test.
 */
interface TestResult {
  name: string;
  status: "passed" | "failed" | "error" | "skipped";
  assertions: Array<AssertionResult>;
  fieldMetrics: Array<FieldMetricResult>;
  /** Duration of the test (unit not stated here - presumably milliseconds, confirm). */
  duration: number;
  error?: Error;
}
|
|
200
|
+
/**
 * Outcome of one test suite: its test results plus counters and duration.
 *
 * NOTE(review): `passed`/`failed`/`errors`/`skipped` presumably count tests
 * by status - confirm against the runner.
 */
interface SuiteResult {
  name: string;
  tests: Array<TestResult>;
  passed: number;
  failed: number;
  errors: number;
  skipped: number;
  duration: number;
}
|
|
212
|
+
/**
 * Results of an integrity check on a dataset.
 */
interface IntegrityResult {
  valid: boolean;
  totalRecords: number;
  missingIds: Array<string>;
  duplicateIds: Array<string>;
  /** Entries pairing an `id` with a list of field names. */
  missingFields: { id: string; fields: string[] }[];
}
|
|
225
|
+
/**
 * The final evaluation report: version, timestamp, per-suite results, an
 * aggregate summary and, optionally, integrity check results.
 */
interface EvalReport {
  version: string;
  timestamp: string;
  suites: Array<SuiteResult>;
  /** Aggregate counters for the whole report. */
  summary: {
    totalSuites: number;
    totalTests: number;
    passed: number;
    failed: number;
    errors: number;
    skipped: number;
    duration: number;
  };
  integrity?: IntegrityResult;
}
|
|
243
|
+
/**
 * Configuration options for the CLI. Every option is optional.
 */
interface CLIOptions {
  filter?: string;
  output?: string;
  reporter?: "both" | "console" | "json";
  bail?: boolean;
  timeout?: number;
}
|
|
253
|
+
/**
 * Exit codes for CI integration. Each member is a read-only numeric literal.
 */
declare const ExitCodes: {
  readonly SUCCESS: 0;
  readonly ASSERTION_FAILURE: 1;
  readonly INTEGRITY_FAILURE: 2;
  readonly EXECUTION_ERROR: 3;
  readonly CONFIGURATION_ERROR: 4;
};
/** Union of every value declared in {@link ExitCodes}. */
type ExitCode = typeof ExitCodes[keyof typeof ExitCodes];
|
|
264
|
+
|
|
265
|
+
// Export list: every declaration is exported under a single-letter alias.
// ExitCodes is the only value export; all other entries are type-only.
export {
  type AlignedRecord as A,
  type ClassificationMetrics as C,
  type Dataset as D,
  type EvalReport as E,
  type FieldMetricResult as F,
  type IntegrityResult as I,
  type JSONSchema as J,
  type LLMClient as L,
  type MetricFn as M,
  type Prediction as P,
  type RegressionMetrics as R,
  type Suite as S,
  type TestFn as T,
  type MetricConfig as a,
  type MetricOutput as b,
  type AssertionResult as c,
  type ConfusionMatrix as d,
  type CLIOptions as e,
  type ClassMetrics as f,
  type DatasetMetadata as g,
  type EvalTest as h,
  type ExitCode as i,
  ExitCodes as j,
  type SuiteResult as k,
  type TestContext as l,
  type TestResult as m,
};
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core type definitions for EvalSense
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Container for a loaded dataset: the list of records plus its metadata.
 *
 * @typeParam T - Shape of a single record; defaults to a string-keyed object
 *   whose values are `unknown`.
 */
interface Dataset<T = Record<string, unknown>> {
  /** The dataset's records. */
  records: Array<T>;
  /** Metadata describing this dataset. */
  metadata: DatasetMetadata;
}
|
|
11
|
+
/**
 * Descriptive metadata attached to a loaded dataset.
 */
interface DatasetMetadata {
  /** Where the dataset came from (presumably a path or identifier - confirm against the loader). */
  source: string;
  /** Number of records (presumably matches `records.length` - confirm against the loader). */
  count: number;
  /** When the dataset was loaded. */
  loadedAt: Date;
}
|
|
16
|
+
/**
 * One record in which the model output (`actual`) and the ground truth
 * (`expected`) have been aligned under a shared `id`.
 */
interface AlignedRecord {
  /** Identifier shared by both sides of the pair. */
  id: string;
  /** Model output for this record. */
  actual: Record<string, unknown>;
  /** Ground truth for this record. */
  expected: Record<string, unknown>;
}
|
|
24
|
+
/**
 * A prediction as output by runModel(): an `id` used for alignment plus any
 * number of additional fields.
 */
interface Prediction {
  /** Identifier used for alignment. */
  id: string;
  /** Any further fields carried by the prediction. */
  [key: string]: unknown;
}
|
|
31
|
+
/**
 * JSON Schema shape used for structured LLM outputs.
 *
 * Only `type`, `properties` and `required` are typed explicitly; every other
 * key is accepted through the index signature as `unknown`.
 */
interface JSONSchema {
  type: string;
  properties?: Record<string, unknown>;
  required?: Array<string>;
  [keyword: string]: unknown;
}
|
|
40
|
+
/**
 * Client interface that metric evaluation uses to reach an LLM.
 */
interface LLMClient {
  /**
   * Produces a text completion for the given prompt.
   */
  complete(prompt: string): Promise<string>;
  /**
   * Produces a structured JSON completion for the given prompt and schema.
   * Optional: an implementation may leave this method out.
   *
   * @typeParam T - Type the caller expects the resolved value to have.
   */
  completeStructured?<T>(prompt: string, schema: JSONSchema): Promise<T>;
}
|
|
53
|
+
/**
 * A single result produced by an LLM metric evaluation.
 */
interface MetricOutput {
  id: string;
  metric: string;
  score: number;
  label?: string;
  /** Reasoning/explanation given by the LLM (for LLM-based metrics). */
  reasoning?: string;
  /** Which evaluation mode was used (for LLM-based metrics). */
  evaluationMode?: "batch" | "per-row";
}
|
|
66
|
+
/**
 * Options accepted by a metric function.
 */
interface MetricConfig {
  /** The outputs to evaluate, each paired with an `id`. */
  outputs: { id: string; output: string }[];
  context?: Array<string>;
  query?: Array<string>;
  source?: Array<string>;
  /** Overrides the LLM client; the global client is used when omitted. */
  llmClient?: LLMClient;
  /** Evaluation mode: "per-row" (accurate, expensive) or "batch" (cheaper, potentially less accurate). */
  evaluationMode?: "batch" | "per-row";
  /** Overrides the prompt template. */
  customPrompt?: string;
  /** LLM temperature (default: 0). */
  temperature?: number;
  /** Maximum number of tokens per completion. */
  maxTokens?: number;
  /** Timeout, in milliseconds. */
  timeout?: number;
}
|
|
90
|
+
/**
 * Signature of a metric function: takes a {@link MetricConfig} and resolves
 * to the list of metric outputs.
 */
type MetricFn = (config: MetricConfig) => Promise<Array<MetricOutput>>;
|
|
94
|
+
/**
 * A confusion matrix together with its labels.
 */
interface ConfusionMatrix {
  /** Two-dimensional table of numbers. */
  matrix: Array<number[]>;
  /** Labels for the matrix (presumably `labels[i]` names row/column `i` - confirm). */
  labels: Array<string>;
  total: number;
}
|
|
102
|
+
/**
 * Classification metrics for a single class.
 */
interface ClassMetrics {
  precision: number;
  recall: number;
  f1: number;
  /** Support for the class (presumably the number of true instances - confirm). */
  support: number;
}
|
|
111
|
+
/**
 * Complete classification metrics result: overall accuracy, per-class
 * metrics, macro and weighted averages, and the confusion matrix.
 */
interface ClassificationMetrics {
  accuracy: number;
  /** Metrics for each class, keyed by a string (presumably the class label - confirm). */
  perClass: Record<string, ClassMetrics>;
  macroAvg: { precision: number; recall: number; f1: number };
  weightedAvg: { precision: number; recall: number; f1: number };
  confusionMatrix: ConfusionMatrix;
}
|
|
129
|
+
/**
 * Result of a regression metrics computation.
 *
 * NOTE(review): field names look like the usual abbreviations (mean absolute
 * error, mean squared error, root mean squared error, R-squared) - confirm
 * against the implementation.
 */
interface RegressionMetrics {
  mae: number;
  mse: number;
  rmse: number;
  r2: number;
}
|
|
138
|
+
/**
 * Outcome of evaluating one field across all predictions.
 */
interface FieldMetricResult {
  /** Name of the evaluated field. */
  field: string;
  metrics: ClassificationMetrics;
  binarized: boolean;
  /** Optional threshold (presumably only set when `binarized` is true - confirm). */
  binarizeThreshold?: number;
}
|
|
147
|
+
/**
 * Signature of a test function: no arguments, returning either nothing or a
 * promise that resolves to nothing.
 */
type TestFn = () => void | Promise<void>;
|
|
151
|
+
/**
 * A single eval test: a name and the function to run.
 */
interface EvalTest {
  name: string;
  fn: TestFn;
}
|
|
158
|
+
/**
 * A test suite (a describe block): a named group of tests with optional
 * hook lists.
 *
 * NOTE(review): the hook names follow the familiar beforeAll/afterAll/
 * beforeEach/afterEach convention; when each list runs is decided by the
 * runner, which is not visible here.
 */
interface Suite {
  name: string;
  tests: Array<EvalTest>;
  beforeAll?: Array<TestFn>;
  afterAll?: Array<TestFn>;
  beforeEach?: Array<TestFn>;
  afterEach?: Array<TestFn>;
}
|
|
169
|
+
/**
 * Context of the current test execution.
 */
interface TestContext {
  /** The current suite, or `null` when there is none. */
  currentSuite: null | Suite;
  suites: Array<Suite>;
  results: Array<SuiteResult>;
}
|
|
177
|
+
/**
 * Outcome of one assertion.
 */
interface AssertionResult {
  type: string;
  /** Whether the assertion passed. */
  passed: boolean;
  message: string;
  expected?: unknown;
  actual?: unknown;
  field?: string;
  class?: string;
}
|
|
189
|
+
/**
 * Outcome of one test.
 */
interface TestResult {
  name: string;
  status: "passed" | "failed" | "error" | "skipped";
  assertions: Array<AssertionResult>;
  fieldMetrics: Array<FieldMetricResult>;
  /** Duration of the test (unit not stated here - presumably milliseconds, confirm). */
  duration: number;
  error?: Error;
}
|
|
200
|
+
/**
 * Outcome of one test suite: its test results plus counters and duration.
 *
 * NOTE(review): `passed`/`failed`/`errors`/`skipped` presumably count tests
 * by status - confirm against the runner.
 */
interface SuiteResult {
  name: string;
  tests: Array<TestResult>;
  passed: number;
  failed: number;
  errors: number;
  skipped: number;
  duration: number;
}
|
|
212
|
+
/**
 * Results of an integrity check on a dataset.
 */
interface IntegrityResult {
  valid: boolean;
  totalRecords: number;
  missingIds: Array<string>;
  duplicateIds: Array<string>;
  /** Entries pairing an `id` with a list of field names. */
  missingFields: { id: string; fields: string[] }[];
}
|
|
225
|
+
/**
 * The final evaluation report: version, timestamp, per-suite results, an
 * aggregate summary and, optionally, integrity check results.
 */
interface EvalReport {
  version: string;
  timestamp: string;
  suites: Array<SuiteResult>;
  /** Aggregate counters for the whole report. */
  summary: {
    totalSuites: number;
    totalTests: number;
    passed: number;
    failed: number;
    errors: number;
    skipped: number;
    duration: number;
  };
  integrity?: IntegrityResult;
}
|
|
243
|
+
/**
 * Configuration options for the CLI. Every option is optional.
 */
interface CLIOptions {
  filter?: string;
  output?: string;
  reporter?: "both" | "console" | "json";
  bail?: boolean;
  timeout?: number;
}
|
|
253
|
+
/**
 * Exit codes for CI integration. Each member is a read-only numeric literal.
 */
declare const ExitCodes: {
  readonly SUCCESS: 0;
  readonly ASSERTION_FAILURE: 1;
  readonly INTEGRITY_FAILURE: 2;
  readonly EXECUTION_ERROR: 3;
  readonly CONFIGURATION_ERROR: 4;
};
/** Union of every value declared in {@link ExitCodes}. */
type ExitCode = typeof ExitCodes[keyof typeof ExitCodes];
|
|
264
|
+
|
|
265
|
+
// Export list: every declaration is exported under a single-letter alias.
// ExitCodes is the only value export; all other entries are type-only.
export {
  type AlignedRecord as A,
  type ClassificationMetrics as C,
  type Dataset as D,
  type EvalReport as E,
  type FieldMetricResult as F,
  type IntegrityResult as I,
  type JSONSchema as J,
  type LLMClient as L,
  type MetricFn as M,
  type Prediction as P,
  type RegressionMetrics as R,
  type Suite as S,
  type TestFn as T,
  type MetricConfig as a,
  type MetricOutput as b,
  type AssertionResult as c,
  type ConfusionMatrix as d,
  type CLIOptions as e,
  type ClassMetrics as f,
  type DatasetMetadata as g,
  type EvalTest as h,
  type ExitCode as i,
  ExitCodes as j,
  type SuiteResult as k,
  type TestContext as l,
  type TestResult as m,
};
|
package/package.json
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
{
  "name": "evalsense",
  "version": "0.2.0",
  "description": "JS-native LLM evaluation framework with Jest-like API and statistical assertions",
  "type": "module",
  "main": "./dist/index.cjs",
  "module": "./dist/index.js",
  "types": "./dist/index.d.ts",
  "exports": {
    ".": {
      "import": {
        "types": "./dist/index.d.ts",
        "default": "./dist/index.js"
      },
      "require": {
        "types": "./dist/index.d.cts",
        "default": "./dist/index.cjs"
      }
    },
    "./metrics": {
      "import": {
        "types": "./dist/metrics/index.d.ts",
        "default": "./dist/metrics/index.js"
      },
      "require": {
        "types": "./dist/metrics/index.d.cts",
        "default": "./dist/metrics/index.cjs"
      }
    },
    "./metrics/opinionated": {
      "import": {
        "types": "./dist/metrics/opinionated/index.d.ts",
        "default": "./dist/metrics/opinionated/index.js"
      },
      "require": {
        "types": "./dist/metrics/opinionated/index.d.cts",
        "default": "./dist/metrics/opinionated/index.cjs"
      }
    }
  },
  "bin": {
    "evalsense": "./bin/evalsense.js"
  },
  "files": [
    "dist",
    "bin"
  ],
  "scripts": {
    "build": "tsup",
    "dev": "tsup --watch",
    "test": "vitest run",
    "test:watch": "vitest",
    "test:coverage": "vitest run --coverage",
    "lint": "eslint src tests",
    "lint:fix": "eslint src tests --fix",
    "format": "prettier --write .",
    "format:check": "prettier --check .",
    "typecheck": "tsc --noEmit",
    "prepublishOnly": "npm run build"
  },
  "keywords": [
    "llm",
    "evaluation",
    "testing",
    "metrics",
    "machine-learning",
    "ai",
    "statistical-testing"
  ],
  "author": "Mohit Joshi",
  "license": "MIT",
  "dependencies": {
    "commander": "^12.1.0",
    "glob": "^11.0.0",
    "fast-json-stable-stringify": "^2.1.0"
  },
  "devDependencies": {
    "@types/node": "^22.10.0",
    "@typescript-eslint/eslint-plugin": "^8.18.0",
    "@typescript-eslint/parser": "^8.18.0",
    "@vitest/coverage-v8": "^2.1.8",
    "eslint": "^9.17.0",
    "prettier": "^3.4.2",
    "tsup": "^8.3.5",
    "typescript": "^5.7.2",
    "vitest": "^2.1.8"
  },
  "engines": {
    "node": "20 || >=22"
  }
}
|