evalsense 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,265 @@
1
+ /**
2
+ * Core type definitions for EvalSense
3
+ */
4
+ /**
5
+ * A loaded dataset: `records` holds the rows (type T, defaulting to Record<string, unknown>) and `metadata` holds the accompanying DatasetMetadata
6
+ */
7
+ interface Dataset<T = Record<string, unknown>> {
8
+ records: T[];
9
+ metadata: DatasetMetadata;
10
+ }
11
+ /** Metadata attached to a Dataset: `source` (string), `count` (number) and `loadedAt` (Date). NOTE(review): `count` presumably mirrors records.length - confirm. */ interface DatasetMetadata {
12
+ source: string;
13
+ count: number;
14
+ loadedAt: Date;
15
+ }
16
+ /**
17
+ * A record aligned between actual (model output) and expected (ground truth): both sides are untyped field maps that share one string `id`
18
+ */
19
+ interface AlignedRecord {
20
+ id: string;
21
+ actual: Record<string, unknown>;
22
+ expected: Record<string, unknown>;
23
+ }
24
+ /**
25
+ * Output from runModel() - a prediction with a required string `id` for alignment; every other field is allowed and typed `unknown`
26
+ */
27
+ interface Prediction {
28
+ id: string;
29
+ [field: string]: unknown;
30
+ }
31
+ /**
32
+ * JSON Schema for structured LLM outputs: `type` is required, `properties` and `required` are optional, and any further keyword is accepted as `unknown`
33
+ */
34
+ interface JSONSchema {
35
+ type: string;
36
+ properties?: Record<string, unknown>;
37
+ required?: string[];
38
+ [key: string]: unknown;
39
+ }
40
+ /**
41
+ * LLM client interface for metric evaluation: `complete` is mandatory, `completeStructured` is optional
42
+ */
43
+ interface LLMClient {
44
+ /**
45
+ * Generate a text completion from a prompt; resolves to the completion string
46
+ */
47
+ complete(prompt: string): Promise<string>;
48
+ /**
49
+ * Generate a structured JSON completion (optional method, may be undefined). T is chosen by the caller and is not derived from `schema` by the type system
50
+ */
51
+ completeStructured?<T>(prompt: string, schema: JSONSchema): Promise<T>;
52
+ }
53
+ /**
54
+ * Output from an LLM metric evaluation: `id`, `metric` and a numeric `score` are required; `label`, `reasoning` and `evaluationMode` are optional
55
+ */
56
+ interface MetricOutput {
57
+ id: string;
58
+ metric: string;
59
+ score: number;
60
+ label?: string;
61
+ /** LLM's reasoning/explanation (for LLM-based metrics) */
62
+ reasoning?: string;
63
+ /** Evaluation mode used (for LLM-based metrics) */
64
+ evaluationMode?: "per-row" | "batch";
65
+ }
66
+ /**
67
+ * Configuration for a metric function (the argument of MetricFn); only `outputs` is required. NOTE(review): `context`, `query` and `source` are undocumented string arrays - presumably aligned with `outputs`, confirm
68
+ */
69
+ interface MetricConfig {
70
+ outputs: Array<{
71
+ id: string;
72
+ output: string;
73
+ }>;
74
+ context?: string[];
75
+ query?: string[];
76
+ source?: string[];
77
+ /** LLM client override (defaults to global client) */
78
+ llmClient?: LLMClient;
79
+ /** Evaluation mode: per-row (accurate, expensive) or batch (cheaper, potentially less accurate) */
80
+ evaluationMode?: "per-row" | "batch";
81
+ /** Custom prompt template override */
82
+ customPrompt?: string;
83
+ /** LLM temperature (default: 0) */
84
+ temperature?: number;
85
+ /** Max tokens per completion */
86
+ maxTokens?: number;
87
+ /** Timeout in milliseconds */
88
+ timeout?: number;
89
+ }
90
+ /**
91
+ * A metric function that evaluates outputs: takes a MetricConfig and resolves asynchronously to an array of MetricOutput
92
+ */
93
+ type MetricFn = (config: MetricConfig) => Promise<MetricOutput[]>;
94
+ /**
95
+ * Confusion matrix with labels: `matrix` is a 2-D number array, `labels` a string array and `total` a number. NOTE(review): row/column orientation (expected vs. actual) is not stated here - confirm
96
+ */
97
+ interface ConfusionMatrix {
98
+ matrix: number[][];
99
+ labels: string[];
100
+ total: number;
101
+ }
102
+ /**
103
+ * Per-class classification metrics: precision, recall, f1 and support for one class
104
+ */
105
+ interface ClassMetrics {
106
+ precision: number;
107
+ recall: number;
108
+ f1: number;
109
+ support: number;
110
+ }
111
+ /**
112
+ * Full classification metrics result: overall accuracy, per-class metrics in a string-keyed record, macro and weighted averages (precision/recall/f1 each) and the confusion matrix
113
+ */
114
+ interface ClassificationMetrics {
115
+ accuracy: number;
116
+ perClass: Record<string, ClassMetrics>;
117
+ macroAvg: {
118
+ precision: number;
119
+ recall: number;
120
+ f1: number;
121
+ };
122
+ weightedAvg: {
123
+ precision: number;
124
+ recall: number;
125
+ f1: number;
126
+ };
127
+ confusionMatrix: ConfusionMatrix;
128
+ }
129
+ /**
130
+ * Regression metrics result. NOTE(review): `mae`, `mse`, `rmse` and `r2` presumably stand for mean absolute error, mean squared error, root mean squared error and R-squared - confirm
131
+ */
132
+ interface RegressionMetrics {
133
+ mae: number;
134
+ mse: number;
135
+ rmse: number;
136
+ r2: number;
137
+ }
138
+ /**
139
+ * Result of evaluating a single field across all predictions. `metrics` is always ClassificationMetrics; `binarizeThreshold` is optional alongside the `binarized` flag
140
+ */
141
+ interface FieldMetricResult {
142
+ field: string;
143
+ metrics: ClassificationMetrics;
144
+ binarized: boolean;
145
+ binarizeThreshold?: number;
146
+ }
147
+ /**
148
+ * Test function signature: takes no arguments and returns either nothing or a Promise<void>
149
+ */
150
+ type TestFn = () => Promise<void> | void;
151
+ /**
152
+ * An individual eval test: a `name` plus the TestFn stored in `fn`
153
+ */
154
+ interface EvalTest {
155
+ name: string;
156
+ fn: TestFn;
157
+ }
158
+ /**
159
+ * A test suite (describe block): a name, its tests, and optional beforeAll/afterAll/beforeEach/afterEach hook arrays of TestFn
160
+ */
161
+ interface Suite {
162
+ name: string;
163
+ tests: EvalTest[];
164
+ beforeAll?: TestFn[];
165
+ afterAll?: TestFn[];
166
+ beforeEach?: TestFn[];
167
+ afterEach?: TestFn[];
168
+ }
169
+ /**
170
+ * Current test execution context: `currentSuite` (null when there is none), the `suites` list and the `results` list of SuiteResult
171
+ */
172
+ interface TestContext {
173
+ currentSuite: Suite | null;
174
+ suites: Suite[];
175
+ results: SuiteResult[];
176
+ }
177
+ /**
178
+ * Result of a single assertion: `type`, `passed` and `message` are required; `expected`, `actual`, `field` and `class` are optional
179
+ */
180
+ interface AssertionResult {
181
+ type: string;
182
+ passed: boolean;
183
+ message: string;
184
+ expected?: unknown;
185
+ actual?: unknown;
186
+ field?: string;
187
+ class?: string;
188
+ }
189
+ /**
190
+ * Result of a single test: `status` is one of four string literals and `error` is optional. NOTE(review): the unit of `duration` is not stated here - confirm
191
+ */
192
+ interface TestResult {
193
+ name: string;
194
+ status: "passed" | "failed" | "error" | "skipped";
195
+ assertions: AssertionResult[];
196
+ fieldMetrics: FieldMetricResult[];
197
+ duration: number;
198
+ error?: Error;
199
+ }
200
+ /**
201
+ * Result of a test suite: its TestResult list plus numeric `passed`, `failed`, `errors`, `skipped` and `duration` fields
202
+ */
203
+ interface SuiteResult {
204
+ name: string;
205
+ tests: TestResult[];
206
+ passed: number;
207
+ failed: number;
208
+ errors: number;
209
+ skipped: number;
210
+ duration: number;
211
+ }
212
+ /**
213
+ * Integrity check results for a dataset: a `valid` flag, `totalRecords`, `missingIds`, `duplicateIds`, and `missingFields` entries pairing an `id` with its `fields`
214
+ */
215
+ interface IntegrityResult {
216
+ valid: boolean;
217
+ totalRecords: number;
218
+ missingIds: string[];
219
+ duplicateIds: string[];
220
+ missingFields: Array<{
221
+ id: string;
222
+ fields: string[];
223
+ }>;
224
+ }
225
+ /**
226
+ * Final evaluation report: `version` and `timestamp` strings, the SuiteResult list, a numeric `summary` object and an optional IntegrityResult
227
+ */
228
+ interface EvalReport {
229
+ version: string;
230
+ timestamp: string;
231
+ suites: SuiteResult[];
232
+ summary: {
233
+ totalSuites: number;
234
+ totalTests: number;
235
+ passed: number;
236
+ failed: number;
237
+ errors: number;
238
+ skipped: number;
239
+ duration: number;
240
+ };
241
+ integrity?: IntegrityResult;
242
+ }
243
+ /**
244
+ * CLI configuration options; every property is optional and `reporter` is limited to "json", "console" or "both"
245
+ */
246
+ interface CLIOptions {
247
+ filter?: string;
248
+ output?: string;
249
+ reporter?: "json" | "console" | "both";
250
+ bail?: boolean;
251
+ timeout?: number;
252
+ }
253
+ /**
254
+ * Exit codes for CI integration: SUCCESS = 0, ASSERTION_FAILURE = 1, INTEGRITY_FAILURE = 2, EXECUTION_ERROR = 3, CONFIGURATION_ERROR = 4 (all readonly literal types)
255
+ */
256
+ declare const ExitCodes: {
257
+ readonly SUCCESS: 0;
258
+ readonly ASSERTION_FAILURE: 1;
259
+ readonly INTEGRITY_FAILURE: 2;
260
+ readonly EXECUTION_ERROR: 3;
261
+ readonly CONFIGURATION_ERROR: 4;
262
+ };
263
+ /** Union of the value types of ExitCodes, i.e. 0 | 1 | 2 | 3 | 4. */ type ExitCode = (typeof ExitCodes)[keyof typeof ExitCodes];
264
+
265
+ export { type AlignedRecord as A, type ClassificationMetrics as C, type Dataset as D, type EvalReport as E, type FieldMetricResult as F, type IntegrityResult as I, type JSONSchema as J, type LLMClient as L, type MetricFn as M, type Prediction as P, type RegressionMetrics as R, type Suite as S, type TestFn as T, type MetricConfig as a, type MetricOutput as b, type AssertionResult as c, type ConfusionMatrix as d, type CLIOptions as e, type ClassMetrics as f, type DatasetMetadata as g, type EvalTest as h, type ExitCode as i, ExitCodes as j, type SuiteResult as k, type TestContext as l, type TestResult as m };
@@ -0,0 +1,265 @@
1
+ /**
2
+ * Core type definitions for EvalSense
3
+ */
4
+ /**
5
+ * A loaded dataset: `records` holds the rows (type T, defaulting to Record<string, unknown>) and `metadata` holds the accompanying DatasetMetadata
6
+ */
7
+ interface Dataset<T = Record<string, unknown>> {
8
+ records: T[];
9
+ metadata: DatasetMetadata;
10
+ }
11
+ /** Metadata attached to a Dataset: `source` (string), `count` (number) and `loadedAt` (Date). NOTE(review): `count` presumably mirrors records.length - confirm. */ interface DatasetMetadata {
12
+ source: string;
13
+ count: number;
14
+ loadedAt: Date;
15
+ }
16
+ /**
17
+ * A record aligned between actual (model output) and expected (ground truth): both sides are untyped field maps that share one string `id`
18
+ */
19
+ interface AlignedRecord {
20
+ id: string;
21
+ actual: Record<string, unknown>;
22
+ expected: Record<string, unknown>;
23
+ }
24
+ /**
25
+ * Output from runModel() - a prediction with a required string `id` for alignment; every other field is allowed and typed `unknown`
26
+ */
27
+ interface Prediction {
28
+ id: string;
29
+ [field: string]: unknown;
30
+ }
31
+ /**
32
+ * JSON Schema for structured LLM outputs: `type` is required, `properties` and `required` are optional, and any further keyword is accepted as `unknown`
33
+ */
34
+ interface JSONSchema {
35
+ type: string;
36
+ properties?: Record<string, unknown>;
37
+ required?: string[];
38
+ [key: string]: unknown;
39
+ }
40
+ /**
41
+ * LLM client interface for metric evaluation: `complete` is mandatory, `completeStructured` is optional
42
+ */
43
+ interface LLMClient {
44
+ /**
45
+ * Generate a text completion from a prompt; resolves to the completion string
46
+ */
47
+ complete(prompt: string): Promise<string>;
48
+ /**
49
+ * Generate a structured JSON completion (optional method, may be undefined). T is chosen by the caller and is not derived from `schema` by the type system
50
+ */
51
+ completeStructured?<T>(prompt: string, schema: JSONSchema): Promise<T>;
52
+ }
53
+ /**
54
+ * Output from an LLM metric evaluation: `id`, `metric` and a numeric `score` are required; `label`, `reasoning` and `evaluationMode` are optional
55
+ */
56
+ interface MetricOutput {
57
+ id: string;
58
+ metric: string;
59
+ score: number;
60
+ label?: string;
61
+ /** LLM's reasoning/explanation (for LLM-based metrics) */
62
+ reasoning?: string;
63
+ /** Evaluation mode used (for LLM-based metrics) */
64
+ evaluationMode?: "per-row" | "batch";
65
+ }
66
+ /**
67
+ * Configuration for a metric function (the argument of MetricFn); only `outputs` is required. NOTE(review): `context`, `query` and `source` are undocumented string arrays - presumably aligned with `outputs`, confirm
68
+ */
69
+ interface MetricConfig {
70
+ outputs: Array<{
71
+ id: string;
72
+ output: string;
73
+ }>;
74
+ context?: string[];
75
+ query?: string[];
76
+ source?: string[];
77
+ /** LLM client override (defaults to global client) */
78
+ llmClient?: LLMClient;
79
+ /** Evaluation mode: per-row (accurate, expensive) or batch (cheaper, potentially less accurate) */
80
+ evaluationMode?: "per-row" | "batch";
81
+ /** Custom prompt template override */
82
+ customPrompt?: string;
83
+ /** LLM temperature (default: 0) */
84
+ temperature?: number;
85
+ /** Max tokens per completion */
86
+ maxTokens?: number;
87
+ /** Timeout in milliseconds */
88
+ timeout?: number;
89
+ }
90
+ /**
91
+ * A metric function that evaluates outputs: takes a MetricConfig and resolves asynchronously to an array of MetricOutput
92
+ */
93
+ type MetricFn = (config: MetricConfig) => Promise<MetricOutput[]>;
94
+ /**
95
+ * Confusion matrix with labels: `matrix` is a 2-D number array, `labels` a string array and `total` a number. NOTE(review): row/column orientation (expected vs. actual) is not stated here - confirm
96
+ */
97
+ interface ConfusionMatrix {
98
+ matrix: number[][];
99
+ labels: string[];
100
+ total: number;
101
+ }
102
+ /**
103
+ * Per-class classification metrics: precision, recall, f1 and support for one class
104
+ */
105
+ interface ClassMetrics {
106
+ precision: number;
107
+ recall: number;
108
+ f1: number;
109
+ support: number;
110
+ }
111
+ /**
112
+ * Full classification metrics result: overall accuracy, per-class metrics in a string-keyed record, macro and weighted averages (precision/recall/f1 each) and the confusion matrix
113
+ */
114
+ interface ClassificationMetrics {
115
+ accuracy: number;
116
+ perClass: Record<string, ClassMetrics>;
117
+ macroAvg: {
118
+ precision: number;
119
+ recall: number;
120
+ f1: number;
121
+ };
122
+ weightedAvg: {
123
+ precision: number;
124
+ recall: number;
125
+ f1: number;
126
+ };
127
+ confusionMatrix: ConfusionMatrix;
128
+ }
129
+ /**
130
+ * Regression metrics result. NOTE(review): `mae`, `mse`, `rmse` and `r2` presumably stand for mean absolute error, mean squared error, root mean squared error and R-squared - confirm
131
+ */
132
+ interface RegressionMetrics {
133
+ mae: number;
134
+ mse: number;
135
+ rmse: number;
136
+ r2: number;
137
+ }
138
+ /**
139
+ * Result of evaluating a single field across all predictions. `metrics` is always ClassificationMetrics; `binarizeThreshold` is optional alongside the `binarized` flag
140
+ */
141
+ interface FieldMetricResult {
142
+ field: string;
143
+ metrics: ClassificationMetrics;
144
+ binarized: boolean;
145
+ binarizeThreshold?: number;
146
+ }
147
+ /**
148
+ * Test function signature: takes no arguments and returns either nothing or a Promise<void>
149
+ */
150
+ type TestFn = () => Promise<void> | void;
151
+ /**
152
+ * An individual eval test: a `name` plus the TestFn stored in `fn`
153
+ */
154
+ interface EvalTest {
155
+ name: string;
156
+ fn: TestFn;
157
+ }
158
+ /**
159
+ * A test suite (describe block): a name, its tests, and optional beforeAll/afterAll/beforeEach/afterEach hook arrays of TestFn
160
+ */
161
+ interface Suite {
162
+ name: string;
163
+ tests: EvalTest[];
164
+ beforeAll?: TestFn[];
165
+ afterAll?: TestFn[];
166
+ beforeEach?: TestFn[];
167
+ afterEach?: TestFn[];
168
+ }
169
+ /**
170
+ * Current test execution context: `currentSuite` (null when there is none), the `suites` list and the `results` list of SuiteResult
171
+ */
172
+ interface TestContext {
173
+ currentSuite: Suite | null;
174
+ suites: Suite[];
175
+ results: SuiteResult[];
176
+ }
177
+ /**
178
+ * Result of a single assertion: `type`, `passed` and `message` are required; `expected`, `actual`, `field` and `class` are optional
179
+ */
180
+ interface AssertionResult {
181
+ type: string;
182
+ passed: boolean;
183
+ message: string;
184
+ expected?: unknown;
185
+ actual?: unknown;
186
+ field?: string;
187
+ class?: string;
188
+ }
189
+ /**
190
+ * Result of a single test: `status` is one of four string literals and `error` is optional. NOTE(review): the unit of `duration` is not stated here - confirm
191
+ */
192
+ interface TestResult {
193
+ name: string;
194
+ status: "passed" | "failed" | "error" | "skipped";
195
+ assertions: AssertionResult[];
196
+ fieldMetrics: FieldMetricResult[];
197
+ duration: number;
198
+ error?: Error;
199
+ }
200
+ /**
201
+ * Result of a test suite: its TestResult list plus numeric `passed`, `failed`, `errors`, `skipped` and `duration` fields
202
+ */
203
+ interface SuiteResult {
204
+ name: string;
205
+ tests: TestResult[];
206
+ passed: number;
207
+ failed: number;
208
+ errors: number;
209
+ skipped: number;
210
+ duration: number;
211
+ }
212
+ /**
213
+ * Integrity check results for a dataset: a `valid` flag, `totalRecords`, `missingIds`, `duplicateIds`, and `missingFields` entries pairing an `id` with its `fields`
214
+ */
215
+ interface IntegrityResult {
216
+ valid: boolean;
217
+ totalRecords: number;
218
+ missingIds: string[];
219
+ duplicateIds: string[];
220
+ missingFields: Array<{
221
+ id: string;
222
+ fields: string[];
223
+ }>;
224
+ }
225
+ /**
226
+ * Final evaluation report: `version` and `timestamp` strings, the SuiteResult list, a numeric `summary` object and an optional IntegrityResult
227
+ */
228
+ interface EvalReport {
229
+ version: string;
230
+ timestamp: string;
231
+ suites: SuiteResult[];
232
+ summary: {
233
+ totalSuites: number;
234
+ totalTests: number;
235
+ passed: number;
236
+ failed: number;
237
+ errors: number;
238
+ skipped: number;
239
+ duration: number;
240
+ };
241
+ integrity?: IntegrityResult;
242
+ }
243
+ /**
244
+ * CLI configuration options; every property is optional and `reporter` is limited to "json", "console" or "both"
245
+ */
246
+ interface CLIOptions {
247
+ filter?: string;
248
+ output?: string;
249
+ reporter?: "json" | "console" | "both";
250
+ bail?: boolean;
251
+ timeout?: number;
252
+ }
253
+ /**
254
+ * Exit codes for CI integration: SUCCESS = 0, ASSERTION_FAILURE = 1, INTEGRITY_FAILURE = 2, EXECUTION_ERROR = 3, CONFIGURATION_ERROR = 4 (all readonly literal types)
255
+ */
256
+ declare const ExitCodes: {
257
+ readonly SUCCESS: 0;
258
+ readonly ASSERTION_FAILURE: 1;
259
+ readonly INTEGRITY_FAILURE: 2;
260
+ readonly EXECUTION_ERROR: 3;
261
+ readonly CONFIGURATION_ERROR: 4;
262
+ };
263
+ /** Union of the value types of ExitCodes, i.e. 0 | 1 | 2 | 3 | 4. */ type ExitCode = (typeof ExitCodes)[keyof typeof ExitCodes];
264
+
265
+ export { type AlignedRecord as A, type ClassificationMetrics as C, type Dataset as D, type EvalReport as E, type FieldMetricResult as F, type IntegrityResult as I, type JSONSchema as J, type LLMClient as L, type MetricFn as M, type Prediction as P, type RegressionMetrics as R, type Suite as S, type TestFn as T, type MetricConfig as a, type MetricOutput as b, type AssertionResult as c, type ConfusionMatrix as d, type CLIOptions as e, type ClassMetrics as f, type DatasetMetadata as g, type EvalTest as h, type ExitCode as i, ExitCodes as j, type SuiteResult as k, type TestContext as l, type TestResult as m };
package/package.json ADDED
@@ -0,0 +1,91 @@
1
+ {
2
+ "name": "evalsense",
3
+ "version": "0.2.0",
4
+ "description": "JS-native LLM evaluation framework with Jest-like API and statistical assertions",
5
+ "type": "module",
6
+ "main": "./dist/index.cjs",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "import": {
12
+ "types": "./dist/index.d.ts",
13
+ "default": "./dist/index.js"
14
+ },
15
+ "require": {
16
+ "types": "./dist/index.d.cts",
17
+ "default": "./dist/index.cjs"
18
+ }
19
+ },
20
+ "./metrics": {
21
+ "import": {
22
+ "types": "./dist/metrics/index.d.ts",
23
+ "default": "./dist/metrics/index.js"
24
+ },
25
+ "require": {
26
+ "types": "./dist/metrics/index.d.cts",
27
+ "default": "./dist/metrics/index.cjs"
28
+ }
29
+ },
30
+ "./metrics/opinionated": {
31
+ "import": {
32
+ "types": "./dist/metrics/opinionated/index.d.ts",
33
+ "default": "./dist/metrics/opinionated/index.js"
34
+ },
35
+ "require": {
36
+ "types": "./dist/metrics/opinionated/index.d.cts",
37
+ "default": "./dist/metrics/opinionated/index.cjs"
38
+ }
39
+ }
40
+ },
41
+ "bin": {
42
+ "evalsense": "./bin/evalsense.js"
43
+ },
44
+ "files": [
45
+ "dist",
46
+ "bin"
47
+ ],
48
+ "scripts": {
49
+ "build": "tsup",
50
+ "dev": "tsup --watch",
51
+ "test": "vitest run",
52
+ "test:watch": "vitest",
53
+ "test:coverage": "vitest run --coverage",
54
+ "lint": "eslint src tests",
55
+ "lint:fix": "eslint src tests --fix",
56
+ "format": "prettier --write .",
57
+ "format:check": "prettier --check .",
58
+ "typecheck": "tsc --noEmit",
59
+ "prepublishOnly": "npm run build"
60
+ },
61
+ "keywords": [
62
+ "llm",
63
+ "evaluation",
64
+ "testing",
65
+ "metrics",
66
+ "machine-learning",
67
+ "ai",
68
+ "statistical-testing"
69
+ ],
70
+ "author": "Mohit Joshi",
71
+ "license": "MIT",
72
+ "dependencies": {
73
+ "commander": "^12.1.0",
74
+ "glob": "^11.0.0",
75
+ "fast-json-stable-stringify": "^2.1.0"
76
+ },
77
+ "devDependencies": {
78
+ "@types/node": "^22.10.0",
79
+ "@typescript-eslint/eslint-plugin": "^8.18.0",
80
+ "@typescript-eslint/parser": "^8.18.0",
81
+ "@vitest/coverage-v8": "^2.1.8",
82
+ "eslint": "^9.17.0",
83
+ "prettier": "^3.4.2",
84
+ "tsup": "^8.3.5",
85
+ "typescript": "^5.7.2",
86
+ "vitest": "^2.1.8"
87
+ },
88
+ "engines": {
89
+ "node": ">=18"
90
+ }
91
+ }