@docshield/didactic 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +333 -228
- package/dist/index.cjs +1090 -550
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +134 -65
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +134 -65
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1085 -552
- package/dist/index.mjs.map +1 -1
- package/package.json +20 -3
package/README.md
CHANGED
|
@@ -5,7 +5,9 @@
|
|
|
5
5
|
[](https://www.npmjs.com/package/@docshield/didactic)
|
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
|
7
7
|
|
|
8
|
-
Eval and
|
|
8
|
+
**Eval** your LLM workflows by comparing actual outputs against expected results with smart comparators that handle real-world variations. **Optimize** prompts automatically through iterative self-improvement—the system analyzes its own mistakes and rewrites prompts to boost accuracy.
|
|
9
|
+
|
|
10
|
+
Use it to test extraction- and classification-based AI workflows, monitor regressions, and improve performance.
|
|
9
11
|
|
|
10
12
|
## Installation
|
|
11
13
|
|
|
@@ -18,7 +20,14 @@ Requires Node.js >= 18.0.0
|
|
|
18
20
|
## Quick Start
|
|
19
21
|
|
|
20
22
|
```typescript
|
|
21
|
-
import {
|
|
23
|
+
import {
|
|
24
|
+
didactic,
|
|
25
|
+
within,
|
|
26
|
+
oneOf,
|
|
27
|
+
exact,
|
|
28
|
+
unordered,
|
|
29
|
+
numeric,
|
|
30
|
+
} from '@docshield/didactic';
|
|
22
31
|
|
|
23
32
|
const result = await didactic.eval({
|
|
24
33
|
executor: didactic.endpoint('https://api.example.com/extract'),
|
|
@@ -26,18 +35,63 @@ const result = await didactic.eval({
|
|
|
26
35
|
premium: within({ tolerance: 0.05 }),
|
|
27
36
|
policyType: oneOf(['claims-made', 'occurrence']),
|
|
28
37
|
carrier: exact,
|
|
38
|
+
// Nested comparators for arrays
|
|
39
|
+
coverages: unordered({
|
|
40
|
+
type: exact,
|
|
41
|
+
limit: numeric,
|
|
42
|
+
}),
|
|
29
43
|
},
|
|
30
44
|
testCases: [
|
|
31
45
|
{
|
|
32
46
|
input: { emailId: 'email-123' },
|
|
33
|
-
expected: {
|
|
47
|
+
expected: {
|
|
48
|
+
premium: 12500,
|
|
49
|
+
policyType: 'claims-made',
|
|
50
|
+
carrier: 'Acme Insurance',
|
|
51
|
+
coverages: [
|
|
52
|
+
{ type: 'liability', limit: 1000000 },
|
|
53
|
+
{ type: 'property', limit: 500000 },
|
|
54
|
+
],
|
|
55
|
+
},
|
|
34
56
|
},
|
|
35
57
|
],
|
|
36
58
|
});
|
|
37
59
|
|
|
38
|
-
console.log(
|
|
60
|
+
console.log(
|
|
61
|
+
`${result.passed}/${result.total} passed (${result.accuracy * 100}% field accuracy)`
|
|
62
|
+
);
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Example
|
|
66
|
+
|
|
67
|
+
### Eval - Invoice Parser
|
|
68
|
+
|
|
69
|
+
Real-world invoice extraction using Anthropic's Claude with structured outputs. Tests field accuracy across vendor names, line items, and payment terms.
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Set your API key
|
|
73
|
+
export ANTHROPIC_API_KEY=your_key_here
|
|
74
|
+
|
|
75
|
+
# Run the example
|
|
76
|
+
npm run example:eval:invoice-parser
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Shows how to use `numeric`, `name`, `exact`, `unordered()`, and `llmCompare` comparators for financial data extraction with nested comparator structures.
|
|
80
|
+
|
|
81
|
+
### Optimizer - Expense Categorizer
|
|
82
|
+
|
|
83
|
+
Iteratively feeds eval failures back into an optimization loop to self-improve the prompt and its performance. Runs evals until it reaches the targeted performance or runs out of budget.
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Set your API key
|
|
87
|
+
export ANTHROPIC_API_KEY=your_key_here
|
|
88
|
+
|
|
89
|
+
# Run the example
|
|
90
|
+
npm run example:optimizer:expense-categorizer
|
|
39
91
|
```
|
|
40
92
|
|
|
93
|
+
Shows how to use Didactic to self-heal failures and improve the prompt so it performs better across the test set data.
|
|
94
|
+
|
|
41
95
|
---
|
|
42
96
|
|
|
43
97
|
## Core Concepts
|
|
@@ -45,65 +99,20 @@ console.log(`${result.passed}/${result.total} passed (${result.accuracy * 100}%
|
|
|
45
99
|
Didactic has three core components:
|
|
46
100
|
|
|
47
101
|
1. **[Executors](#executors)** — Abstraction for running your LLM workflow (local function or HTTP endpoint)
|
|
48
|
-
2. **[Comparators](#comparators)** —
|
|
49
|
-
3. **[Optimization](#didacticoptimizeevalconfig-optimizeconfig)** — Iterative prompt improvement loop to hit a target success
|
|
102
|
+
2. **[Comparators](#comparators)** — Nested structure matching your data shape, with per-field comparison logic and `unordered()` for arrays
|
|
103
|
+
3. **[Optimization](#didacticoptimizeevalconfig-optimizeconfig)** — Iterative prompt improvement loop to hit a target success rate
|
|
50
104
|
|
|
51
|
-
**How they work together:** Your executor runs each test case's input through your LLM workflow, returning output that matches your test case's expected output shape. Comparators then evaluate each field of the output against expected values,
|
|
105
|
+
**How they work together:** Your executor runs each test case's input through your LLM workflow, returning output that matches your test case's expected output shape. Comparators then evaluate each field of the output against expected values, using nested structures that mirror your data shape. For arrays, use `unordered()` to match by similarity rather than index position.
|
|
52
106
|
|
|
53
107
|
In optimization mode, these results feed into an LLM that analyzes failures and generates improved system prompts—repeating until your target success rate or iteration/cost limit is reached.
|
|
54
108
|
|
|
55
|
-
|
|
56
109
|
#### Eval Flow
|
|
57
110
|
|
|
58
111
|

|
|
59
112
|
|
|
60
113
|
#### Optimize Flow
|
|
61
114
|
|
|
62
|
-
|
|
63
|
-
flowchart TB
|
|
64
|
-
subgraph Config ["Config"]
|
|
65
|
-
IP[Initial Prompt]
|
|
66
|
-
TARGET[targetSuccessRate]
|
|
67
|
-
LIMITS[maxIterations / maxCost]
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
IP --> EVAL
|
|
71
|
-
|
|
72
|
-
subgraph Loop ["Optimization Loop"]
|
|
73
|
-
EVAL[Run Eval] --> CHECK{Target reached?}
|
|
74
|
-
CHECK -->|Yes| SUCCESS[Return optimized prompt]
|
|
75
|
-
CHECK -->|No| LIMIT{Limits exceeded?}
|
|
76
|
-
LIMIT -->|Yes| BEST[Return best prompt]
|
|
77
|
-
LIMIT -->|No| FAIL[Extract failures]
|
|
78
|
-
FAIL --> PATCH[Generate patches]
|
|
79
|
-
PATCH --> MERGE[Merge patches]
|
|
80
|
-
MERGE --> UPDATE[New Prompt]
|
|
81
|
-
UPDATE --> EVAL
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
TARGET --> CHECK
|
|
85
|
-
LIMITS --> LIMIT
|
|
86
|
-
|
|
87
|
-
SUCCESS --> OUT[OptimizeResult]
|
|
88
|
-
BEST --> OUT
|
|
89
|
-
|
|
90
|
-
linkStyle default stroke:#FFFFFF
|
|
91
|
-
style Config fill:#343434,stroke:#6D88B4,color:#FFFFFF
|
|
92
|
-
style Loop fill:#343434,stroke:#6D88B4,color:#FFFFFF
|
|
93
|
-
style IP fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
94
|
-
style TARGET fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
95
|
-
style LIMITS fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
96
|
-
style EVAL fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
97
|
-
style FAIL fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
98
|
-
style PATCH fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
99
|
-
style MERGE fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
100
|
-
style UPDATE fill:#BFD7FF,stroke:#6D88B4,color:#000B33
|
|
101
|
-
style CHECK fill:#FFEDE0,stroke:#6D88B4,color:#000B33
|
|
102
|
-
style LIMIT fill:#FFEDE0,stroke:#6D88B4,color:#000B33
|
|
103
|
-
style SUCCESS fill:#CDF1E6,stroke:#6D88B4,color:#000B33
|
|
104
|
-
style BEST fill:#CDF1E6,stroke:#6D88B4,color:#000B33
|
|
105
|
-
style OUT fill:#CDF1E6,stroke:#6D88B4,color:#000B33
|
|
106
|
-
```
|
|
115
|
+

|
|
107
116
|
|
|
108
117
|
---
|
|
109
118
|
|
|
@@ -119,18 +128,18 @@ const result = await didactic.eval(config);
|
|
|
119
128
|
|
|
120
129
|
#### EvalConfig
|
|
121
130
|
|
|
122
|
-
| Property
|
|
123
|
-
|
|
124
|
-
| `executor`
|
|
125
|
-
| `testCases`
|
|
126
|
-
| `comparators`
|
|
127
|
-
| `comparatorOverride` | `Comparator<TOutput>`
|
|
128
|
-
| `
|
|
129
|
-
| `
|
|
130
|
-
| `
|
|
131
|
-
| `rateLimitBatch`
|
|
132
|
-
| `rateLimitPause`
|
|
133
|
-
| `optimize`
|
|
131
|
+
| Property | Type | Kind | Required | Default | Description |
|
|
132
|
+
| -------------------- | ----------------------------- | --------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
133
|
+
| `executor` | `Executor<TInput, TOutput>` | Object | **Yes** | — | Function that executes your LLM workflow. Receives input and optional system prompt, returns structured output. |
|
|
134
|
+
| `testCases` | `TestCase<TInput, TOutput>[]` | Array | **Yes** | — | Array of `{ input, expected }` pairs. Each test case runs through the executor and compares output to expected. |
|
|
135
|
+
| `comparators` | `ComparatorsConfig` | Object/Function | No | `exact` | Nested comparator structure matching your data shape. Can be a single comparator function (e.g., `exact`), or a nested object with per-field comparators. Use `unordered()` wrapper for arrays that should match by similarity rather than index. |
|
|
136
|
+
| `comparatorOverride` | `Comparator<TOutput>` | Function | No | — | Custom whole-object comparison function. Use when you need complete control over comparison logic and want to bypass field-level matching. |
|
|
137
|
+
| `llmConfig` | `LLMConfig` | Object | No | — | Default LLM configuration for LLM-based comparators (e.g., `llmCompare`). Provides `apiKey` and optional `provider` so you don't repeat them in each comparator call. |
|
|
138
|
+
| `systemPrompt` | `string` | Primitive | No | — | System prompt passed to the executor. Required if using optimization. |
|
|
139
|
+
| `perTestThreshold` | `number` | Primitive | No | `1.0` | Minimum field pass rate for a test case to pass (0.0–1.0). At default 1.0, all fields must pass. Set to 0.8 to pass if 80% of fields match. |
|
|
140
|
+
| `rateLimitBatch` | `number` | Primitive | No | — | Number of test cases to run concurrently. Use with `rateLimitPause` for rate-limited APIs. |
|
|
141
|
+
| `rateLimitPause` | `number` | Primitive | No | — | Seconds to wait between batches. Pairs with `rateLimitBatch`. |
|
|
142
|
+
| `optimize` | `OptimizeConfig` | Object | No | — | Inline optimization config. When provided, triggers optimization mode instead of single eval. |
|
|
134
143
|
|
|
135
144
|
---
|
|
136
145
|
|
|
@@ -155,21 +164,23 @@ const config = {
|
|
|
155
164
|
storeLogs: true,
|
|
156
165
|
thinking: true,
|
|
157
166
|
},
|
|
158
|
-
}
|
|
167
|
+
};
|
|
159
168
|
```
|
|
160
169
|
|
|
161
170
|
#### OptimizeConfig
|
|
162
171
|
|
|
163
|
-
| Property
|
|
164
|
-
|
|
165
|
-
| `systemPrompt`
|
|
166
|
-
| `targetSuccessRate` | `number`
|
|
167
|
-
| `apiKey`
|
|
168
|
-
| `provider`
|
|
169
|
-
| `maxIterations`
|
|
170
|
-
| `maxCost`
|
|
171
|
-
| `storeLogs`
|
|
172
|
-
| `thinking`
|
|
172
|
+
| Property | Type | Required | Default | Description |
|
|
173
|
+
| ------------------- | ------------------- | -------- | --------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
174
|
+
| `systemPrompt` | `string` | **Yes** | — | Initial system prompt to optimize. This is the starting point that the optimizer will iteratively improve. |
|
|
175
|
+
| `targetSuccessRate` | `number` | **Yes** | — | Target success rate to achieve (0.0–1.0). Optimization stops when this rate is reached. |
|
|
176
|
+
| `apiKey` | `string` | **Yes** | — | API key for the LLM provider used by the optimizer (not your workflow's LLM). |
|
|
177
|
+
| `provider` | `LLMProviders` | **Yes** | — | LLM provider the optimizer uses to analyze failures and generate improved prompts. |
|
|
178
|
+
| `maxIterations` | `number` | No | `5` | Maximum optimization iterations before stopping, even if target not reached. |
|
|
179
|
+
| `maxCost` | `number` | No | — | Maximum cost budget in dollars. Optimization stops if cumulative cost exceeds this. |
|
|
180
|
+
| `storeLogs` | `boolean \| string` | No | — | Save optimization logs. `true` uses default path (`./didactic-logs/optimize_<timestamp>/summary.md`), or provide custom summary path. |
|
|
181
|
+
| `thinking` | `boolean` | No | — | Enable extended thinking mode for deeper analysis (provider must support it). |
|
|
182
|
+
| `patchSystemPrompt` | `string` | No | [`DEFAULT_PATCH_SYSTEM_PROMPT`](src/optimizer/prompts.ts) | Custom system prompt for patch generation. Completely replaces the default prompt that analyzes failures and suggests improvements. |
|
|
183
|
+
| `mergeSystemPrompt` | `string` | No | [`DEFAULT_MERGE_SYSTEM_PROMPT`](src/optimizer/prompts.ts) | Custom system prompt for merging patches. Completely replaces the default prompt that combines multiple patches into a coherent system prompt. |
|
|
173
184
|
|
|
174
185
|
---
|
|
175
186
|
|
|
@@ -178,11 +189,13 @@ const config = {
|
|
|
178
189
|
Executors abstract your LLM workflow from the evaluation harness. Whether your workflow runs locally, calls a remote API, or orchestrates Temporal activities, executors provide a consistent interface: take input + optional system prompt, return expected output.
|
|
179
190
|
|
|
180
191
|
This separation enables:
|
|
192
|
+
|
|
181
193
|
- **Swap execution strategies** — Switch between local/remote without changing tests
|
|
182
194
|
- **Dynamic prompt injection** — System prompts flow through for optimization
|
|
183
195
|
- **Cost tracking** — Aggregate execution costs across test runs
|
|
184
196
|
|
|
185
197
|
didactic provides two built-in executors:
|
|
198
|
+
|
|
186
199
|
- `endpoint` for calling a remote API
|
|
187
200
|
- `fn` for calling a local function
|
|
188
201
|
|
|
@@ -192,7 +205,6 @@ You may want to provide a `mapAdditionalContext` function to extract metadata fr
|
|
|
192
205
|
|
|
193
206
|
Note: If you do not provide a `mapResponse` function, the executor will assume the response from the executor is the output you want to compare against `expected`.
|
|
194
207
|
|
|
195
|
-
|
|
196
208
|
### `endpoint(url, config?)`
|
|
197
209
|
|
|
198
210
|
Create an executor that calls an HTTP endpoint. The executor sends input + systemPrompt as the request body and expects structured JSON back.
|
|
@@ -211,14 +223,14 @@ const executor = endpoint('https://api.example.com/workflow', {
|
|
|
211
223
|
|
|
212
224
|
#### EndpointConfig
|
|
213
225
|
|
|
214
|
-
| Property
|
|
215
|
-
|
|
216
|
-
| `method`
|
|
217
|
-
| `headers`
|
|
218
|
-
| `mapResponse`
|
|
219
|
-
| `mapAdditionalContext` | `(response: any) => unknown` | No
|
|
220
|
-
| `mapCost`
|
|
221
|
-
| `timeout`
|
|
226
|
+
| Property | Type | Required | Default | Description |
|
|
227
|
+
| ---------------------- | ---------------------------- | -------- | -------- | ------------------------------------------------------------------------------------------ |
|
|
228
|
+
| `method` | `'POST' \| 'GET'` | No | `'POST'` | HTTP method for the request. |
|
|
229
|
+
| `headers` | `Record<string, string>` | No | `{}` | Headers to include (auth tokens, content-type overrides, etc). |
|
|
230
|
+
| `mapResponse` | `(response: any) => TOutput` | No | — | Transform the raw response to your expected output shape. Use when your API wraps results. |
|
|
231
|
+
| `mapAdditionalContext` | `(response: any) => unknown` | No | — | Extract metadata (logs, debug info) from response for inspection. |
|
|
232
|
+
| `mapCost` | `(response: any) => number` | No | — | Extract execution cost from response (e.g., token counts in headers). |
|
|
233
|
+
| `timeout` | `number` | No | `30000` | Request timeout in milliseconds. |
|
|
222
234
|
|
|
223
235
|
---
|
|
224
236
|
|
|
@@ -234,19 +246,24 @@ const executor = fn({
|
|
|
234
246
|
return await myLLMCall(input, systemPrompt);
|
|
235
247
|
},
|
|
236
248
|
mapResponse: (result) => result.output,
|
|
237
|
-
mapCost: (result) =>
|
|
238
|
-
|
|
249
|
+
mapCost: (result) =>
|
|
250
|
+
result.usage.input_tokens * 0.000003 +
|
|
251
|
+
result.usage.output_tokens * 0.000015,
|
|
252
|
+
mapAdditionalContext: (result) => ({
|
|
253
|
+
model: result.model,
|
|
254
|
+
finishReason: result.stop_reason,
|
|
255
|
+
}),
|
|
239
256
|
});
|
|
240
257
|
```
|
|
241
258
|
|
|
242
259
|
#### FnConfig
|
|
243
260
|
|
|
244
|
-
| Property
|
|
245
|
-
|
|
246
|
-
| `fn`
|
|
247
|
-
| `mapResponse`
|
|
248
|
-
| `mapAdditionalContext` | `(result: TRaw) => unknown`
|
|
249
|
-
| `mapCost`
|
|
261
|
+
| Property | Type | Required | Default | Description |
|
|
262
|
+
| ---------------------- | --------------------------------------------------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------ |
|
|
263
|
+
| `fn` | `(input: TInput, systemPrompt?: string) => Promise<TRaw>` | **Yes** | — | Async function that executes your workflow. Receives test input and optional system prompt. |
|
|
264
|
+
| `mapResponse` | `(result: TRaw) => TOutput` | No | — | Transform raw result from fn into the expected output shape to compare. Without this, raw result is used directly. |
|
|
265
|
+
| `mapAdditionalContext` | `(result: TRaw) => unknown` | No | — | Map additional context about the run to pass to the optimizer prompt. |
|
|
266
|
+
| `mapCost` | `(result: TRaw) => number` | No | — | Extract cost from the result (if your function tracks it). Used to track the total cost of the runs. |
|
|
250
267
|
|
|
251
268
|
---
|
|
252
269
|
|
|
@@ -276,6 +293,7 @@ const executor = fn({
|
|
|
276
293
|
```
|
|
277
294
|
|
|
278
295
|
Without `mapResponse`:
|
|
296
|
+
|
|
279
297
|
- **endpoint**: uses the raw JSON response as output
|
|
280
298
|
- **fn**: uses the function's return value directly as output
|
|
281
299
|
|
|
@@ -338,56 +356,79 @@ const executor = fn({
|
|
|
338
356
|
|
|
339
357
|
Comparators bridge the gap between messy LLM output and semantic correctness. Rather than requiring exact string matches, comparators handle real-world data variations—currency formatting, date formats, name suffixes, numeric tolerance—while maintaining semantic accuracy.
|
|
340
358
|
|
|
341
|
-
|
|
359
|
+
**Nested structure:** Comparators mirror your data shape. Use objects to define per-field comparators, and `unordered()` to wrap arrays that should match by similarity rather than index position.
|
|
360
|
+
|
|
361
|
+
Each comparator returns a `passed` boolean and a `similarity` score (0.0–1.0). The pass/fail determines test results, while similarity enables Hungarian matching for `unordered()` arrays.
|
|
342
362
|
|
|
343
363
|
### `comparators` vs `comparatorOverride`
|
|
344
364
|
|
|
345
|
-
Use **`comparators`** for standard comparison. It accepts
|
|
365
|
+
Use **`comparators`** for standard comparison. It accepts:
|
|
346
366
|
|
|
347
367
|
**1. A single comparator function** — Applied uniformly across the output:
|
|
348
368
|
|
|
349
369
|
```typescript
|
|
350
|
-
// Clean syntax for primitives
|
|
370
|
+
// Clean syntax for primitives or arrays
|
|
351
371
|
const result = await didactic.eval({
|
|
352
372
|
executor: myNumberExtractor,
|
|
353
|
-
comparators: exact,
|
|
373
|
+
comparators: exact, // Single comparator for root-level output
|
|
354
374
|
testCases: [
|
|
355
375
|
{ input: 'twenty-three', expected: 23 },
|
|
356
376
|
{ input: 'one hundred', expected: 100 },
|
|
357
377
|
],
|
|
358
378
|
});
|
|
359
379
|
|
|
360
|
-
//
|
|
380
|
+
// For unordered arrays, use the unordered() wrapper
|
|
361
381
|
const result = await didactic.eval({
|
|
362
382
|
executor: myListExtractor,
|
|
363
|
-
comparators: exact,
|
|
364
|
-
|
|
365
|
-
testCases: [
|
|
366
|
-
{ input: 'numbers', expected: [1, 2, 3, 4] },
|
|
367
|
-
],
|
|
383
|
+
comparators: unordered(exact), // Match by similarity, not index
|
|
384
|
+
testCases: [{ input: 'numbers', expected: [1, 2, 3, 4] }],
|
|
368
385
|
});
|
|
369
386
|
```
|
|
370
387
|
|
|
371
|
-
**2. A
|
|
388
|
+
**2. A nested object structure** — Mirrors your data shape with per-field comparators:
|
|
372
389
|
|
|
373
390
|
```typescript
|
|
374
391
|
const result = await didactic.eval({
|
|
375
392
|
executor: myExecutor,
|
|
376
393
|
comparators: {
|
|
377
|
-
premium: within({ tolerance: 0.05 }),
|
|
378
|
-
carrier: exact,
|
|
379
|
-
effectiveDate: date,
|
|
394
|
+
premium: within({ tolerance: 0.05 }), // 5% tolerance for numbers
|
|
395
|
+
carrier: exact, // Exact string match
|
|
396
|
+
effectiveDate: date, // Flexible date parsing
|
|
397
|
+
// Use unordered() for arrays that can be in any order
|
|
398
|
+
lineItems: unordered({
|
|
399
|
+
description: name,
|
|
400
|
+
amount: numeric,
|
|
401
|
+
}),
|
|
380
402
|
},
|
|
381
403
|
testCases: [
|
|
382
404
|
{
|
|
383
405
|
input: { emailId: 'email-123' },
|
|
384
|
-
expected: {
|
|
406
|
+
expected: {
|
|
407
|
+
premium: 12500,
|
|
408
|
+
carrier: 'Acme Insurance',
|
|
409
|
+
effectiveDate: '2024-01-15',
|
|
410
|
+
lineItems: [
|
|
411
|
+
{ description: 'Service Fee', amount: 100 },
|
|
412
|
+
{ description: 'Tax', amount: 25 },
|
|
413
|
+
],
|
|
414
|
+
},
|
|
385
415
|
},
|
|
386
416
|
],
|
|
387
417
|
});
|
|
388
418
|
```
|
|
389
419
|
|
|
420
|
+
**3. Optional (defaults to `exact`)** — If omitted, uses `exact` for entire output:
|
|
421
|
+
|
|
422
|
+
```typescript
|
|
423
|
+
// No comparators needed for simple exact matching
|
|
424
|
+
const result = await didactic.eval({
|
|
425
|
+
executor: myExecutor,
|
|
426
|
+
testCases: [{ input: 'hello', expected: 'hello' }],
|
|
427
|
+
});
|
|
428
|
+
```
|
|
429
|
+
|
|
390
430
|
Use **`comparatorOverride`** when you need:
|
|
431
|
+
|
|
391
432
|
- Complete control over comparison logic
|
|
392
433
|
- Custom cross-field validation
|
|
393
434
|
- Whole-object semantic comparison that doesn't map to individual fields
|
|
@@ -409,41 +450,83 @@ const result = await didactic.eval({
|
|
|
409
450
|
|
|
410
451
|
### Built-in Comparators
|
|
411
452
|
|
|
412
|
-
| Comparator
|
|
413
|
-
|
|
414
|
-
| `exact`
|
|
415
|
-
| `within`
|
|
416
|
-
| `oneOf`
|
|
417
|
-
| `contains`
|
|
418
|
-
| `presence`
|
|
419
|
-
| `numeric`
|
|
420
|
-
| `numeric.nullable` | `
|
|
421
|
-
| `date`
|
|
422
|
-
| `name`
|
|
423
|
-
| `
|
|
453
|
+
| Comparator | Usage | Description |
|
|
454
|
+
| ------------------ | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
455
|
+
| `exact` | `exact` | Deep equality with cycle detection. Default when no comparator specified. |
|
|
456
|
+
| `within` | `within({ tolerance, mode? })` | Numeric tolerance. `mode: 'percentage'` (default) or `'absolute'`. |
|
|
457
|
+
| `oneOf` | `oneOf(allowedValues)` | Enum validation. Passes if actual equals expected AND both are in the allowed set. |
|
|
458
|
+
| `contains` | `contains(substring)` | String contains check. Passes if actual includes the substring. |
|
|
459
|
+
| `presence` | `presence` | Existence check. Passes if expected is absent, or if actual has any value when expected does. |
|
|
460
|
+
| `numeric` | `numeric` | Numeric comparison after stripping currency symbols, commas, accounting notation. |
|
|
461
|
+
| `numeric.nullable` | `numeric.nullable` | Same as `numeric`, but treats null/undefined/empty as 0. |
|
|
462
|
+
| `date` | `date` | Date comparison after normalizing formats (ISO, US MM/DD, EU DD/MM, written). |
|
|
463
|
+
| `name` | `name` | Name comparison with case normalization, suffix removal (Inc, LLC), fuzzy matching. |
|
|
464
|
+
| `unordered` | `unordered(comparator)` or `unordered({ fields })` | Wrapper for arrays that should match by similarity (Hungarian algorithm) rather than index. Pass a comparator for primitives or nested config for objects. |
|
|
465
|
+
| `llmCompare` | `llmCompare({ systemPrompt?, apiKey?, provider? })` | LLM-based semantic comparison. Uses `llmConfig` from eval config if `apiKey` not provided. Returns rationale and tracks cost. |
|
|
466
|
+
| `custom` | `custom({ compare })` | User-defined logic. `compare(expected, actual, context?) => boolean`. Context provides access to parent objects for cross-field logic. |
|
|
424
467
|
|
|
425
468
|
### Examples
|
|
426
469
|
|
|
427
470
|
```typescript
|
|
428
|
-
import {
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
471
|
+
import {
|
|
472
|
+
didactic,
|
|
473
|
+
within,
|
|
474
|
+
oneOf,
|
|
475
|
+
exact,
|
|
476
|
+
contains,
|
|
477
|
+
presence,
|
|
478
|
+
numeric,
|
|
479
|
+
date,
|
|
480
|
+
name,
|
|
481
|
+
unordered,
|
|
482
|
+
llmCompare,
|
|
483
|
+
custom,
|
|
484
|
+
LLMProviders,
|
|
485
|
+
} from '@docshield/didactic';
|
|
486
|
+
|
|
487
|
+
const result = await didactic.eval({
|
|
488
|
+
executor: myInvoiceParser,
|
|
489
|
+
testCases: [...],
|
|
490
|
+
// LLM config for all llmCompare calls (no need to repeat apiKey)
|
|
491
|
+
llmConfig: {
|
|
492
|
+
apiKey: process.env.ANTHROPIC_API_KEY,
|
|
493
|
+
provider: LLMProviders.anthropic_claude_haiku,
|
|
494
|
+
},
|
|
495
|
+
comparators: {
|
|
496
|
+
premium: within({ tolerance: 0.05 }), // 5% tolerance
|
|
497
|
+
deductible: within({ tolerance: 100, mode: 'absolute' }), // $100 tolerance
|
|
498
|
+
policyType: oneOf(['claims-made', 'occurrence', 'entity']),
|
|
499
|
+
carrier: exact,
|
|
500
|
+
notes: contains('approved'),
|
|
501
|
+
entityName: name,
|
|
502
|
+
effectiveDate: date,
|
|
503
|
+
amount: numeric,
|
|
504
|
+
optionalField: presence,
|
|
505
|
+
|
|
506
|
+
// Unordered array of objects with nested comparators
|
|
507
|
+
lineItems: unordered({
|
|
508
|
+
description: llmCompare({
|
|
509
|
+
// Uses llmConfig.apiKey from above!
|
|
510
|
+
systemPrompt: 'Compare line item descriptions semantically.',
|
|
511
|
+
}),
|
|
512
|
+
quantity: exact,
|
|
513
|
+
price: numeric,
|
|
514
|
+
}),
|
|
515
|
+
|
|
516
|
+
// LLM-based comparison for flexible semantic matching
|
|
517
|
+
companyName: llmCompare({
|
|
518
|
+
systemPrompt:
|
|
519
|
+
'Compare company names considering abbreviations and legal suffixes.',
|
|
520
|
+
}),
|
|
521
|
+
|
|
522
|
+
customField: custom({
|
|
523
|
+
compare: (expected, actual, context) => {
|
|
524
|
+
// Access sibling fields via context.actualParent
|
|
525
|
+
return actual.toLowerCase() === expected.toLowerCase();
|
|
526
|
+
},
|
|
527
|
+
}),
|
|
528
|
+
},
|
|
529
|
+
});
|
|
447
530
|
```
|
|
448
531
|
|
|
449
532
|
---
|
|
@@ -456,13 +539,13 @@ Supported LLM providers for the optimizer:
|
|
|
456
539
|
import { LLMProviders } from '@docshield/didactic';
|
|
457
540
|
```
|
|
458
541
|
|
|
459
|
-
| Value
|
|
460
|
-
|
|
461
|
-
| `LLMProviders.anthropic_claude_opus`
|
|
542
|
+
| Value | Description |
|
|
543
|
+
| -------------------------------------- | --------------------------------------------- |
|
|
544
|
+
| `LLMProviders.anthropic_claude_opus` | Claude Opus 4.5 — Most capable, highest cost |
|
|
462
545
|
| `LLMProviders.anthropic_claude_sonnet` | Claude Sonnet 4.5 — Balanced performance/cost |
|
|
463
|
-
| `LLMProviders.anthropic_claude_haiku`
|
|
464
|
-
| `LLMProviders.openai_gpt5`
|
|
465
|
-
| `LLMProviders.openai_gpt5_mini`
|
|
546
|
+
| `LLMProviders.anthropic_claude_haiku` | Claude Haiku 4.5 — Fastest, lowest cost |
|
|
547
|
+
| `LLMProviders.openai_gpt5` | GPT-5.2 — OpenAI flagship |
|
|
548
|
+
| `LLMProviders.openai_gpt5_mini` | GPT-5 Mini — OpenAI lightweight |
|
|
466
549
|
|
|
467
550
|
---
|
|
468
551
|
|
|
@@ -472,60 +555,62 @@ import { LLMProviders } from '@docshield/didactic';
|
|
|
472
555
|
|
|
473
556
|
Returned by `didactic.eval()` when no optimization is configured.
|
|
474
557
|
|
|
475
|
-
| Property
|
|
476
|
-
|
|
477
|
-
| `systemPrompt`
|
|
478
|
-
| `testCases`
|
|
479
|
-
| `passed`
|
|
480
|
-
| `total`
|
|
481
|
-
| `successRate`
|
|
482
|
-
| `correctFields`
|
|
483
|
-
| `totalFields`
|
|
484
|
-
| `accuracy`
|
|
485
|
-
| `cost`
|
|
558
|
+
| Property | Type | Description |
|
|
559
|
+
| ---------------- | --------------------- | ----------------------------------------------------------------------------- |
|
|
560
|
+
| `systemPrompt` | `string \| undefined` | System prompt that was used for this eval run. |
|
|
561
|
+
| `testCases` | `TestCaseResult[]` | Detailed results for each test case. Inspect for field-level failure details. |
|
|
562
|
+
| `passed` | `number` | Count of test cases that passed (met `perTestThreshold`). |
|
|
563
|
+
| `total` | `number` | Total number of test cases run. |
|
|
564
|
+
| `successRate` | `number` | Pass rate (0.0–1.0). `passed / total`. |
|
|
565
|
+
| `correctFields` | `number` | Total correct fields across all test cases. |
|
|
566
|
+
| `totalFields` | `number` | Total fields evaluated across all test cases. |
|
|
567
|
+
| `accuracy` | `number` | Field-level accuracy (0.0–1.0). `correctFields / totalFields`. |
|
|
568
|
+
| `cost` | `number` | Total execution cost aggregated from executor results. |
|
|
569
|
+
| `comparatorCost` | `number` | Total cost from LLM-based comparators (e.g., `llmCompare`). |
|
|
486
570
|
|
|
487
571
|
### TestCaseResult
|
|
488
572
|
|
|
489
573
|
Per-test-case detail, accessible via `EvalResult.testCases`.
|
|
490
574
|
|
|
491
|
-
| Property
|
|
492
|
-
|
|
493
|
-
| `input`
|
|
494
|
-
| `expected`
|
|
495
|
-
| `actual`
|
|
496
|
-
| `passed`
|
|
497
|
-
| `fields`
|
|
498
|
-
| `passedFields`
|
|
499
|
-
| `totalFields`
|
|
500
|
-
| `passRate`
|
|
501
|
-
| `cost`
|
|
502
|
-
| `
|
|
503
|
-
| `
|
|
575
|
+
| Property | Type | Description |
|
|
576
|
+
| ------------------- | ----------------------------- | ------------------------------------------------------------------------- |
|
|
577
|
+
| `input` | `TInput` | The input that was passed to the executor. |
|
|
578
|
+
| `expected` | `TOutput` | The expected output from the test case. |
|
|
579
|
+
| `actual` | `TOutput \| undefined` | Actual output returned by executor. Undefined if execution failed. |
|
|
580
|
+
| `passed` | `boolean` | Whether this test case passed (met `perTestThreshold`). |
|
|
581
|
+
| `fields` | `Record<string, FieldResult>` | Per-field comparison results. Key is field path (e.g., `"address.city"`). |
|
|
582
|
+
| `passedFields` | `number` | Count of fields that passed comparison. |
|
|
583
|
+
| `totalFields` | `number` | Total fields compared. |
|
|
584
|
+
| `passRate` | `number` | Field pass rate for this test case (0.0–1.0). |
|
|
585
|
+
| `cost` | `number \| undefined` | Execution cost for this test case, if reported by executor. |
|
|
586
|
+
| `comparatorCost` | `number \| undefined` | Total cost from LLM-based comparators in this test case. |
|
|
587
|
+
| `additionalContext` | `unknown \| undefined` | Extra context extracted by executor (logs, debug info). |
|
|
588
|
+
| `error` | `string \| undefined` | Error message if executor threw an exception. |
|
|
504
589
|
|
|
505
590
|
### OptimizeResult
|
|
506
591
|
|
|
507
592
|
Returned by `didactic.optimize()` or `didactic.eval()` with optimization configured.
|
|
508
593
|
|
|
509
|
-
| Property
|
|
510
|
-
|
|
511
|
-
| `success`
|
|
512
|
-
| `finalPrompt` | `string`
|
|
513
|
-
| `iterations`
|
|
514
|
-
| `totalCost`
|
|
515
|
-
| `logFolder`
|
|
594
|
+
| Property | Type | Description |
|
|
595
|
+
| ------------- | --------------------- | ------------------------------------------------------------------------------------ |
|
|
596
|
+
| `success` | `boolean` | Whether the target success rate was achieved. |
|
|
597
|
+
| `finalPrompt` | `string` | The final optimized system prompt. Use this in production. |
|
|
598
|
+
| `iterations` | `IterationResult[]` | Results from each optimization iteration. Inspect to see how the prompt evolved. |
|
|
599
|
+
| `totalCost` | `number` | Total cost across all iterations (optimizer + executor costs). |
|
|
600
|
+
| `logFolder` | `string \| undefined` | Folder path where optimization logs were written (only when `storeLogs` is enabled). |
|
|
516
601
|
|
|
517
602
|
### IterationResult
|
|
518
603
|
|
|
519
604
|
Per-iteration detail, accessible via `OptimizeResult.iterations`.
|
|
520
605
|
|
|
521
|
-
| Property
|
|
522
|
-
|
|
523
|
-
| `iteration`
|
|
524
|
-
| `systemPrompt` | `string`
|
|
525
|
-
| `passed`
|
|
526
|
-
| `total`
|
|
527
|
-
| `testCases`
|
|
528
|
-
| `cost`
|
|
606
|
+
| Property | Type | Description |
|
|
607
|
+
| -------------- | ------------------ | ---------------------------------------------- |
|
|
608
|
+
| `iteration` | `number` | Iteration number (1-indexed). |
|
|
609
|
+
| `systemPrompt` | `string` | System prompt used for this iteration. |
|
|
610
|
+
| `passed` | `number` | Test cases passed in this iteration. |
|
|
611
|
+
| `total` | `number` | Total test cases in this iteration. |
|
|
612
|
+
| `testCases` | `TestCaseResult[]` | Detailed test case results for this iteration. |
|
|
613
|
+
| `cost` | `number` | Cost for this iteration. |
|
|
529
614
|
|
|
530
615
|
---
|
|
531
616
|
|
|
@@ -535,12 +620,12 @@ When `storeLogs` is enabled in `OptimizeConfig`, four files are written to the l
|
|
|
535
620
|
|
|
536
621
|
**Default path:** `./didactic-logs/optimize_<timestamp>/`
|
|
537
622
|
|
|
538
|
-
| File
|
|
539
|
-
|
|
540
|
-
| `summary.md`
|
|
541
|
-
| `prompts.md`
|
|
542
|
-
| `rawData.json` | Complete iteration data for programmatic analysis
|
|
543
|
-
| `bestRun.json` | Detailed results from the best-performing iteration
|
|
623
|
+
| File | Description |
|
|
624
|
+
| -------------- | ------------------------------------------------------------------------- |
|
|
625
|
+
| `summary.md` | Human-readable report with configuration, metrics, and iteration progress |
|
|
626
|
+
| `prompts.md` | All system prompts used in each iteration |
|
|
627
|
+
| `rawData.json` | Complete iteration data for programmatic analysis |
|
|
628
|
+
| `bestRun.json` | Detailed results from the best-performing iteration |
|
|
544
629
|
|
|
545
630
|
### rawData.json
|
|
546
631
|
|
|
@@ -549,17 +634,17 @@ Contains the complete optimization run data for programmatic analysis:
|
|
|
549
634
|
```typescript
|
|
550
635
|
interface OptimizationReport {
|
|
551
636
|
metadata: {
|
|
552
|
-
timestamp: string;
|
|
553
|
-
model: string;
|
|
554
|
-
provider: string;
|
|
555
|
-
thinking: boolean;
|
|
556
|
-
targetSuccessRate: number;
|
|
557
|
-
maxIterations: number | null;
|
|
558
|
-
maxCost: number | null;
|
|
559
|
-
testCaseCount: number;
|
|
560
|
-
perTestThreshold: number;
|
|
561
|
-
rateLimitBatch?: number;
|
|
562
|
-
rateLimitPause?: number;
|
|
637
|
+
timestamp: string; // ISO timestamp
|
|
638
|
+
model: string; // LLM model used
|
|
639
|
+
provider: string; // Provider (anthropic, openai, etc.)
|
|
640
|
+
thinking: boolean; // Extended thinking enabled
|
|
641
|
+
targetSuccessRate: number; // Target (0.0-1.0)
|
|
642
|
+
maxIterations: number | null; // Max iterations or null
|
|
643
|
+
maxCost: number | null; // Max cost budget or null
|
|
644
|
+
testCaseCount: number; // Number of test cases
|
|
645
|
+
perTestThreshold: number; // Per-test threshold (default 1.0)
|
|
646
|
+
rateLimitBatch?: number; // Batch size for rate limiting
|
|
647
|
+
rateLimitPause?: number; // Pause seconds between batches
|
|
563
648
|
};
|
|
564
649
|
summary: {
|
|
565
650
|
totalIterations: number;
|
|
@@ -567,16 +652,16 @@ interface OptimizationReport {
|
|
|
567
652
|
totalCost: number;
|
|
568
653
|
totalInputTokens: number;
|
|
569
654
|
totalOutputTokens: number;
|
|
570
|
-
startRate: number;
|
|
571
|
-
endRate: number;
|
|
655
|
+
startRate: number; // Success rate at start
|
|
656
|
+
endRate: number; // Success rate at end
|
|
572
657
|
targetMet: boolean;
|
|
573
658
|
};
|
|
574
659
|
best: {
|
|
575
|
-
iteration: number;
|
|
576
|
-
successRate: number;
|
|
577
|
-
passed: number;
|
|
578
|
-
total: number;
|
|
579
|
-
fieldAccuracy: number;
|
|
660
|
+
iteration: number; // Which iteration was best
|
|
661
|
+
successRate: number; // Success rate (0.0-1.0)
|
|
662
|
+
passed: number; // Number of passing tests
|
|
663
|
+
total: number; // Total tests
|
|
664
|
+
fieldAccuracy: number; // Field-level accuracy
|
|
580
665
|
};
|
|
581
666
|
iterations: Array<{
|
|
582
667
|
iteration: number;
|
|
@@ -586,8 +671,8 @@ interface OptimizationReport {
|
|
|
586
671
|
correctFields: number;
|
|
587
672
|
totalFields: number;
|
|
588
673
|
fieldAccuracy: number;
|
|
589
|
-
cost: number;
|
|
590
|
-
cumulativeCost: number;
|
|
674
|
+
cost: number; // Cost for this iteration
|
|
675
|
+
cumulativeCost: number; // Total cost so far
|
|
591
676
|
durationMs: number;
|
|
592
677
|
inputTokens: number;
|
|
593
678
|
outputTokens: number;
|
|
@@ -596,7 +681,10 @@ interface OptimizationReport {
|
|
|
596
681
|
input: unknown;
|
|
597
682
|
expected: unknown;
|
|
598
683
|
actual: unknown;
|
|
599
|
-
fields: Record<
|
|
684
|
+
fields: Record<
|
|
685
|
+
string,
|
|
686
|
+
{ expected: unknown; actual: unknown; passed: boolean }
|
|
687
|
+
>;
|
|
600
688
|
}>;
|
|
601
689
|
}>;
|
|
602
690
|
}
|
|
@@ -609,7 +697,7 @@ Contains detailed results from the best-performing iteration, with test results
|
|
|
609
697
|
```typescript
|
|
610
698
|
interface BestRunReport {
|
|
611
699
|
metadata: {
|
|
612
|
-
iteration: number;
|
|
700
|
+
iteration: number; // Which iteration was best
|
|
613
701
|
model: string;
|
|
614
702
|
provider: string;
|
|
615
703
|
thinking: boolean;
|
|
@@ -619,38 +707,41 @@ interface BestRunReport {
|
|
|
619
707
|
rateLimitPause?: number;
|
|
620
708
|
};
|
|
621
709
|
results: {
|
|
622
|
-
successRate: number;
|
|
623
|
-
passed: number;
|
|
624
|
-
total: number;
|
|
625
|
-
fieldAccuracy: number;
|
|
710
|
+
successRate: number; // Overall success rate
|
|
711
|
+
passed: number; // Passed tests
|
|
712
|
+
total: number; // Total tests
|
|
713
|
+
fieldAccuracy: number; // Field-level accuracy
|
|
626
714
|
correctFields: number;
|
|
627
715
|
totalFields: number;
|
|
628
716
|
};
|
|
629
717
|
cost: {
|
|
630
|
-
iteration: number;
|
|
631
|
-
cumulative: number;
|
|
718
|
+
iteration: number; // Cost for this iteration
|
|
719
|
+
cumulative: number; // Total cumulative cost
|
|
632
720
|
};
|
|
633
721
|
timing: {
|
|
634
722
|
durationMs: number;
|
|
635
723
|
inputTokens: number;
|
|
636
724
|
outputTokens: number;
|
|
637
725
|
};
|
|
638
|
-
failures: Array<{
|
|
726
|
+
failures: Array<{
|
|
727
|
+
// Tests that didn't meet the configured perTestThreshold
|
|
639
728
|
testIndex: number;
|
|
640
729
|
input: unknown;
|
|
641
730
|
expected: unknown;
|
|
642
731
|
actual: unknown;
|
|
643
732
|
failedFields: Record<string, { expected: unknown; actual: unknown }>;
|
|
644
733
|
}>;
|
|
645
|
-
partialFailures: Array<{
|
|
734
|
+
partialFailures: Array<{
|
|
735
|
+
// Tests that passed but have some failing fields
|
|
646
736
|
testIndex: number;
|
|
647
|
-
passRate: number;
|
|
737
|
+
passRate: number; // Percentage of fields passing
|
|
648
738
|
input: unknown;
|
|
649
739
|
expected: unknown;
|
|
650
740
|
actual: unknown;
|
|
651
741
|
failedFields: Record<string, { expected: unknown; actual: unknown }>;
|
|
652
742
|
}>;
|
|
653
|
-
successes: Array<{
|
|
743
|
+
successes: Array<{
|
|
744
|
+
// Tests with 100% field accuracy
|
|
654
745
|
testIndex: number;
|
|
655
746
|
input: unknown;
|
|
656
747
|
expected: unknown;
|
|
@@ -666,10 +757,22 @@ interface BestRunReport {
|
|
|
666
757
|
```typescript
|
|
667
758
|
// Namespace
|
|
668
759
|
import { didactic } from '@docshield/didactic';
|
|
669
|
-
import didactic from '@docshield/didactic';
|
|
760
|
+
import didactic from '@docshield/didactic'; // default export
|
|
670
761
|
|
|
671
762
|
// Comparators
|
|
672
|
-
import {
|
|
763
|
+
import {
|
|
764
|
+
exact,
|
|
765
|
+
within,
|
|
766
|
+
oneOf,
|
|
767
|
+
contains,
|
|
768
|
+
presence,
|
|
769
|
+
numeric,
|
|
770
|
+
date,
|
|
771
|
+
name,
|
|
772
|
+
unordered,
|
|
773
|
+
llmCompare,
|
|
774
|
+
custom,
|
|
775
|
+
} from '@docshield/didactic';
|
|
673
776
|
|
|
674
777
|
// Executors
|
|
675
778
|
import { endpoint, fn } from '@docshield/didactic';
|
|
@@ -679,22 +782,24 @@ import { evaluate, optimize } from '@docshield/didactic';
|
|
|
679
782
|
|
|
680
783
|
// Types
|
|
681
784
|
import type {
|
|
785
|
+
// Creating custom comparators
|
|
682
786
|
Comparator,
|
|
683
|
-
ComparatorMap,
|
|
684
787
|
ComparatorResult,
|
|
685
788
|
ComparatorContext,
|
|
789
|
+
// Creating custom executors
|
|
686
790
|
Executor,
|
|
687
791
|
ExecutorResult,
|
|
792
|
+
// Main API types
|
|
688
793
|
TestCase,
|
|
689
794
|
EvalConfig,
|
|
690
795
|
EvalResult,
|
|
691
|
-
TestCaseResult,
|
|
692
|
-
FieldResult,
|
|
693
796
|
OptimizeConfig,
|
|
694
797
|
OptimizeResult,
|
|
695
|
-
|
|
798
|
+
// Executor configs
|
|
696
799
|
EndpointConfig,
|
|
697
800
|
FnConfig,
|
|
801
|
+
// LLM configuration
|
|
802
|
+
LLMConfig,
|
|
698
803
|
} from '@docshield/didactic';
|
|
699
804
|
|
|
700
805
|
// Enum
|