@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -35,6 +35,7 @@ Create files under your project (for example, `src/evals/`) with these suffixes:
|
|
|
35
35
|
|
|
36
36
|
- `*.dataset.ts`
|
|
37
37
|
- `*.evaluator.ts`
|
|
38
|
+
- `*.run-config.ts`
|
|
38
39
|
- `*.test-case.ts`
|
|
39
40
|
|
|
40
41
|
Optional: create `m4trix-eval.config.ts` at your project root to customize discovery and output paths.
|
|
@@ -47,6 +48,7 @@ export default defineConfig((): ConfigType => ({
|
|
|
47
48
|
rootDir: 'src/evals',
|
|
48
49
|
datasetFilePatterns: ['.dataset.ts'],
|
|
49
50
|
evaluatorFilePatterns: ['.evaluator.ts'],
|
|
51
|
+
runConfigFilePatterns: ['.run-config.ts'],
|
|
50
52
|
testCaseFilePatterns: ['.test-case.ts'],
|
|
51
53
|
excludeDirectories: ['node_modules', 'dist'],
|
|
52
54
|
},
|
|
@@ -129,22 +131,40 @@ export const myTestCase = TestCase.describe({
|
|
|
129
131
|
});
|
|
130
132
|
```
|
|
131
133
|
|
|
132
|
-
### 4)
|
|
134
|
+
### 4) RunConfig (optional)
|
|
133
135
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
+
Group several dataset/evaluator runs under one named config. Each row in `runs` specifies its evaluators with either
|
|
137
|
+
`evaluators: [...]` (the same module instances that discovery loads) or `evaluatorPattern: "..."`
|
|
138
|
+
(matched using the wildcard / regex rules of `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
139
|
+
|
|
140
|
+
The optional **`repetitions`** field on a row (default `1`) runs each matching test case that many times. Every execution in such a repetition group shares the same **`repetitionId`** in the evaluator callback **`meta`**, alongside **`repetitionIndex`** and **`repetitionCount`**. Evaluator **`meta`** also includes **`runConfigName`**: the **`RunConfig`** name (or **`programmatic`**, from **`PROGRAMMATIC_RUN_CONFIG`**, for API/TUI-only **`runDatasetWith`** runs). **`RunConfig`** names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, and `-` only; no spaces); name resolution is **case-insensitive**.
|
|
141
|
+
|
|
142
|
+
```ts
|
|
143
|
+
import { RunConfig } from '@m4trix/evals';
|
|
144
|
+
import { myDataset } from './my.dataset';
|
|
145
|
+
import { myEvaluator } from './my.evaluator';
|
|
146
|
+
|
|
147
|
+
export const nightly = RunConfig.define({
|
|
148
|
+
name: 'nightly',
|
|
149
|
+
runs: [
|
|
150
|
+
{ dataset: myDataset, evaluators: [myEvaluator], repetitions: 3 },
|
|
151
|
+
{ dataset: myDataset, evaluatorPattern: '*smoke*' },
|
|
152
|
+
],
|
|
153
|
+
});
|
|
136
154
|
```
|
|
137
155
|
|
|
138
|
-
|
|
156
|
+
### 5) Run
|
|
139
157
|
|
|
140
158
|
```bash
|
|
141
|
-
eval-agents-simple run --
|
|
159
|
+
eval-agents-simple run --run-config "nightly"
|
|
142
160
|
```
|
|
143
161
|
|
|
162
|
+
Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
|
|
163
|
+
|
|
144
164
|
## CLI Commands
|
|
145
165
|
|
|
146
|
-
- `eval-agents`: interactive CLI
|
|
147
|
-
- `eval-agents-simple run --
|
|
166
|
+
- `eval-agents`: interactive CLI (starts runs with synthetic meta `programmatic` / `Programmatic`)
|
|
167
|
+
- `eval-agents-simple run --run-config "<RunConfig name>"` (repeatable; case-insensitive match); add **`--ci`** to exit with code **1** if any test case fails
|
|
148
168
|
- `eval-agents-simple generate --dataset "<dataset name>"`
|
|
149
169
|
|
|
150
170
|
## Default Discovery and Artifacts
|
|
@@ -153,6 +173,7 @@ By default, the runner uses `process.cwd()` as discovery root and scans for:
|
|
|
153
173
|
|
|
154
174
|
- Datasets: `.dataset.ts`, `.dataset.tsx`, `.dataset.js`, `.dataset.mjs`
|
|
155
175
|
- Evaluators: `.evaluator.ts`, `.evaluator.tsx`, `.evaluator.js`, `.evaluator.mjs`
|
|
176
|
+
- Run configs: `.run-config.ts`, `.run-config.tsx`, `.run-config.js`, `.run-config.mjs`
|
|
156
177
|
- Test cases: `.test-case.ts`, `.test-case.tsx`, `.test-case.js`, `.test-case.mjs`
|
|
157
178
|
|
|
158
179
|
Results are written to `.eval-results`.
|
|
@@ -166,6 +187,7 @@ When present, `m4trix-eval.config.ts` is loaded automatically from `process.cwd(
|
|
|
166
187
|
- Discovery keys:
|
|
167
188
|
- `datasetFilePatterns` (or `datasetSuffixes`)
|
|
168
189
|
- `evaluatorFilePatterns` (or `evaluatorSuffixes`)
|
|
190
|
+
- `runConfigFilePatterns` (or `runConfigSuffixes`)
|
|
169
191
|
- `testCaseFilePatterns` (or `testCaseSuffixes`)
|
|
170
192
|
- `rootDir`, `excludeDirectories`
|
|
171
193
|
|