@m4trix/evals 0.25.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -35,6 +35,7 @@ Create files under your project (for example, `src/evals/`) with these suffixes:
35
35
 
36
36
  - `*.dataset.ts`
37
37
  - `*.evaluator.ts`
38
+ - `*.run-config.ts`
38
39
  - `*.test-case.ts`
39
40
 
40
41
  Optional: create `m4trix-eval.config.ts` at your project root to customize discovery and output paths.
@@ -47,6 +48,7 @@ export default defineConfig((): ConfigType => ({
47
48
  rootDir: 'src/evals',
48
49
  datasetFilePatterns: ['.dataset.ts'],
49
50
  evaluatorFilePatterns: ['.evaluator.ts'],
51
+ runConfigFilePatterns: ['.run-config.ts'],
50
52
  testCaseFilePatterns: ['.test-case.ts'],
51
53
  excludeDirectories: ['node_modules', 'dist'],
52
54
  },
@@ -60,7 +62,8 @@ export default defineConfig((): ConfigType => ({
60
62
  import { Dataset } from '@m4trix/evals';
61
63
 
62
64
  export const myDataset = Dataset.define({
63
- name: 'My Dataset',
65
+ name: 'my-dataset',
66
+ displayName: 'My Dataset',
64
67
  includedTags: ['demo'],
65
68
  });
66
69
  ```
@@ -129,23 +132,41 @@ export const myTestCase = TestCase.describe({
129
132
  });
130
133
  ```
131
134
 
132
- ### 4) Run
135
+ ### 4) RunConfig (optional)
133
136
 
134
- ```bash
135
- eval-agents-simple run --dataset "My Dataset" --evaluator "My Evaluator"
137
+ Group several dataset/evaluator runs under one named config. Each row (an entry in `runs`) is either
138
+ `evaluators: [...]` (the same module instances that discovery loads) or `evaluatorPattern: "..."`
139
+ (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
140
+
141
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, along with **`repetitionIndex`** and **`repetitionCount`**. Evaluator **`meta`** also includes **`datasetName`** (from `Dataset.getDisplayLabel()`, i.e. `displayName ?? name`) and **`runConfigName`**, which is the **`RunConfig`** id (or **`programmatic`**, from **`PROGRAMMATIC_RUN_CONFIG`**, for API/TUI-only **`runDatasetWith`** runs). **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id, and the optional **`displayName`** is free-form text for the UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (only letters, digits, `_`, and `-`; no spaces); name resolution is **case-insensitive**.
142
+
143
+ ```ts
144
+ import { RunConfig } from '@m4trix/evals';
145
+ import { myDataset } from './my.dataset';
146
+ import { myEvaluator } from './my.evaluator';
147
+
148
+ export const nightly = RunConfig.define({
149
+ name: 'nightly',
150
+ runs: [
151
+ { dataset: myDataset, evaluators: [myEvaluator], repetitions: 3 },
152
+ { dataset: myDataset, evaluatorPattern: '*smoke*' },
153
+ ],
154
+ });
136
155
  ```
137
156
 
138
- You can also use patterns:
157
+ ### 5) Run
139
158
 
140
159
  ```bash
141
- eval-agents-simple run --dataset "*My*" --evaluator "*My*"
160
+ eval-agents-simple run --run-config "nightly"
142
161
  ```
143
162
 
163
+ Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
164
+
144
165
  ## CLI Commands
145
166
 
146
- - `eval-agents`: interactive CLI
147
- - `eval-agents-simple run --dataset "<name or pattern>" --evaluator "<name or pattern>"`
148
- - `eval-agents-simple generate --dataset "<dataset name>"`
167
+ - `eval-agents`: interactive CLI (starts runs with synthetic meta `programmatic` / `Programmatic`)
168
+ - `eval-agents-simple run --run-config "<RunConfig name>"` (repeatable; case-insensitive match); add **`--ci`** to exit with code **1** if any test case fails
169
+ - `eval-agents-simple generate --dataset "<dataset id>"` (canonical **`Dataset` `name`**, case-insensitive)
149
170
 
150
171
  ## Default Discovery and Artifacts
151
172
 
@@ -153,6 +174,7 @@ By default, the runner uses `process.cwd()` as discovery root and scans for:
153
174
 
154
175
  - Datasets: `.dataset.ts`, `.dataset.tsx`, `.dataset.js`, `.dataset.mjs`
155
176
  - Evaluators: `.evaluator.ts`, `.evaluator.tsx`, `.evaluator.js`, `.evaluator.mjs`
177
+ - Run configs: `.run-config.ts`, `.run-config.tsx`, `.run-config.js`, `.run-config.mjs`
156
178
  - Test cases: `.test-case.ts`, `.test-case.tsx`, `.test-case.js`, `.test-case.mjs`
157
179
 
158
180
  Results are written to `.eval-results`.
@@ -166,6 +188,7 @@ When present, `m4trix-eval.config.ts` is loaded automatically from `process.cwd(
166
188
  - Discovery keys:
167
189
  - `datasetFilePatterns` (or `datasetSuffixes`)
168
190
  - `evaluatorFilePatterns` (or `evaluatorSuffixes`)
191
+ - `runConfigFilePatterns` (or `runConfigSuffixes`)
169
192
  - `testCaseFilePatterns` (or `testCaseSuffixes`)
170
193
  - `rootDir`, `excludeDirectories`
171
194