@docshield/didactic 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +332 -183
- package/dist/index.cjs +1090 -550
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +134 -65
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +134 -65
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1085 -552
- package/dist/index.mjs.map +1 -1
- package/package.json +20 -3
package/README.md
CHANGED
|
@@ -5,7 +5,9 @@
|
|
|
5
5
|
[](https://www.npmjs.com/package/@docshield/didactic)
|
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
|
7
7
|
|
|
8
|
-
Eval and
|
|
8
|
+
**Eval** your LLM workflows by comparing actual outputs against expected results with smart comparators that handle real-world variations. **Optimize** prompts automatically through iterative self-improvement—the system analyzes its own mistakes and rewrites prompts to boost accuracy.
|
|
9
|
+
|
|
10
|
+
Use it to test extraction- and classification-based AI workflows, monitor regressions, and improve performance.
|
|
9
11
|
|
|
10
12
|
## Installation
|
|
11
13
|
|
|
@@ -18,7 +20,14 @@ Requires Node.js >= 18.0.0
|
|
|
18
20
|
## Quick Start
|
|
19
21
|
|
|
20
22
|
```typescript
|
|
21
|
-
import {
|
|
23
|
+
import {
|
|
24
|
+
didactic,
|
|
25
|
+
within,
|
|
26
|
+
oneOf,
|
|
27
|
+
exact,
|
|
28
|
+
unordered,
|
|
29
|
+
numeric,
|
|
30
|
+
} from '@docshield/didactic';
|
|
22
31
|
|
|
23
32
|
const result = await didactic.eval({
|
|
24
33
|
executor: didactic.endpoint('https://api.example.com/extract'),
|
|
@@ -26,18 +35,63 @@ const result = await didactic.eval({
|
|
|
26
35
|
premium: within({ tolerance: 0.05 }),
|
|
27
36
|
policyType: oneOf(['claims-made', 'occurrence']),
|
|
28
37
|
carrier: exact,
|
|
38
|
+
// Nested comparators for arrays
|
|
39
|
+
coverages: unordered({
|
|
40
|
+
type: exact,
|
|
41
|
+
limit: numeric,
|
|
42
|
+
}),
|
|
29
43
|
},
|
|
30
44
|
testCases: [
|
|
31
45
|
{
|
|
32
46
|
input: { emailId: 'email-123' },
|
|
33
|
-
expected: {
|
|
47
|
+
expected: {
|
|
48
|
+
premium: 12500,
|
|
49
|
+
policyType: 'claims-made',
|
|
50
|
+
carrier: 'Acme Insurance',
|
|
51
|
+
coverages: [
|
|
52
|
+
{ type: 'liability', limit: 1000000 },
|
|
53
|
+
{ type: 'property', limit: 500000 },
|
|
54
|
+
],
|
|
55
|
+
},
|
|
34
56
|
},
|
|
35
57
|
],
|
|
36
58
|
});
|
|
37
59
|
|
|
38
|
-
console.log(
|
|
60
|
+
console.log(
|
|
61
|
+
`${result.passed}/${result.total} passed (${result.accuracy * 100}% field accuracy)`
|
|
62
|
+
);
|
|
39
63
|
```
|
|
40
64
|
|
|
65
|
+
## Example
|
|
66
|
+
|
|
67
|
+
### Eval - Invoice Parser
|
|
68
|
+
|
|
69
|
+
Real-world invoice extraction using Anthropic's Claude with structured outputs. Tests field accuracy across vendor names, line items, and payment terms.
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Set your API key
|
|
73
|
+
export ANTHROPIC_API_KEY=your_key_here
|
|
74
|
+
|
|
75
|
+
# Run the example
|
|
76
|
+
npm run example:eval:invoice-parser
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Shows how to use `numeric`, `name`, `exact`, `unordered()`, and `llmCompare` comparators for financial data extraction with nested comparator structures.
|
|
80
|
+
|
|
81
|
+
### Optimizer - Expense Categorizer
|
|
82
|
+
|
|
83
|
+
Iteratively feeds eval failures back into an optimization loop to self-improve the prompt and its performance. Runs evals until it reaches the targeted performance or runs out of budget.
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Set your API key
|
|
87
|
+
export ANTHROPIC_API_KEY=your_key_here
|
|
88
|
+
|
|
89
|
+
# Run the example
|
|
90
|
+
npm run example:optimizer:expense-categorizer
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Shows how to use Didactic to self-heal failures and improve the prompt so it performs better across the test-set data.
|
|
94
|
+
|
|
41
95
|
---
|
|
42
96
|
|
|
43
97
|
## Core Concepts
|
|
@@ -45,14 +99,13 @@ console.log(`${result.passed}/${result.total} passed (${result.accuracy * 100}%
|
|
|
45
99
|
Didactic has three core components:
|
|
46
100
|
|
|
47
101
|
1. **[Executors](#executors)** — Abstraction for running your LLM workflow (local function or HTTP endpoint)
|
|
48
|
-
2. **[Comparators](#comparators)** —
|
|
49
|
-
3. **[Optimization](#didacticoptimizeevalconfig-optimizeconfig)** — Iterative prompt improvement loop to hit a target success
|
|
102
|
+
2. **[Comparators](#comparators)** — Nested structure matching your data shape, with per-field comparison logic and `unordered()` for arrays
|
|
103
|
+
3. **[Optimization](#didacticoptimizeevalconfig-optimizeconfig)** — Iterative prompt improvement loop to hit a target success rate
|
|
50
104
|
|
|
51
|
-
**How they work together:** Your executor runs each test case's input through your LLM workflow, returning output that matches your test case's expected output shape. Comparators then evaluate each field of the output against expected values,
|
|
105
|
+
**How they work together:** Your executor runs each test case's input through your LLM workflow, returning output that matches your test case's expected output shape. Comparators then evaluate each field of the output against expected values, using nested structures that mirror your data shape. For arrays, use `unordered()` to match by similarity rather than index position.
|
|
52
106
|
|
|
53
107
|
In optimization mode, these results feed into an LLM that analyzes failures and generates improved system prompts—repeating until your target success rate or iteration/cost limit is reached.
|
|
54
108
|
|
|
55
|
-
|
|
56
109
|
#### Eval Flow
|
|
57
110
|
|
|
58
111
|

|
|
@@ -75,18 +128,18 @@ const result = await didactic.eval(config);
|
|
|
75
128
|
|
|
76
129
|
#### EvalConfig
|
|
77
130
|
|
|
78
|
-
| Property
|
|
79
|
-
|
|
80
|
-
| `executor`
|
|
81
|
-
| `testCases`
|
|
82
|
-
| `comparators`
|
|
83
|
-
| `comparatorOverride` | `Comparator<TOutput>`
|
|
84
|
-
| `
|
|
85
|
-
| `
|
|
86
|
-
| `
|
|
87
|
-
| `rateLimitBatch`
|
|
88
|
-
| `rateLimitPause`
|
|
89
|
-
| `optimize`
|
|
131
|
+
| Property | Type | Kind | Required | Default | Description |
|
|
132
|
+
| -------------------- | ----------------------------- | --------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
133
|
+
| `executor` | `Executor<TInput, TOutput>` | Object | **Yes** | — | Function that executes your LLM workflow. Receives input and optional system prompt, returns structured output. |
|
|
134
|
+
| `testCases` | `TestCase<TInput, TOutput>[]` | Array | **Yes** | — | Array of `{ input, expected }` pairs. Each test case runs through the executor and compares output to expected. |
|
|
135
|
+
| `comparators` | `ComparatorsConfig` | Object/Function | No | `exact` | Nested comparator structure matching your data shape. Can be a single comparator function (e.g., `exact`), or a nested object with per-field comparators. Use `unordered()` wrapper for arrays that should match by similarity rather than index. |
|
|
136
|
+
| `comparatorOverride` | `Comparator<TOutput>` | Function | No | — | Custom whole-object comparison function. Use when you need complete control over comparison logic and want to bypass field-level matching. |
|
|
137
|
+
| `llmConfig` | `LLMConfig` | Object | No | — | Default LLM configuration for LLM-based comparators (e.g., `llmCompare`). Provides `apiKey` and optional `provider` so you don't repeat them in each comparator call. |
|
|
138
|
+
| `systemPrompt` | `string` | Primitive | No | — | System prompt passed to the executor. Required if using optimization. |
|
|
139
|
+
| `perTestThreshold` | `number` | Primitive | No | `1.0` | Minimum field pass rate for a test case to pass (0.0–1.0). At default 1.0, all fields must pass. Set to 0.8 to pass if 80% of fields match. |
|
|
140
|
+
| `rateLimitBatch` | `number` | Primitive | No | — | Number of test cases to run concurrently. Use with `rateLimitPause` for rate-limited APIs. |
|
|
141
|
+
| `rateLimitPause` | `number` | Primitive | No | — | Seconds to wait between batches. Pairs with `rateLimitBatch`. |
|
|
142
|
+
| `optimize` | `OptimizeConfig` | Object | No | — | Inline optimization config. When provided, triggers optimization mode instead of single eval. |
|
|
90
143
|
|
|
91
144
|
---
|
|
92
145
|
|
|
@@ -111,21 +164,23 @@ const config = {
|
|
|
111
164
|
storeLogs: true,
|
|
112
165
|
thinking: true,
|
|
113
166
|
},
|
|
114
|
-
}
|
|
167
|
+
};
|
|
115
168
|
```
|
|
116
169
|
|
|
117
170
|
#### OptimizeConfig
|
|
118
171
|
|
|
119
|
-
| Property
|
|
120
|
-
|
|
121
|
-
| `systemPrompt`
|
|
122
|
-
| `targetSuccessRate` | `number`
|
|
123
|
-
| `apiKey`
|
|
124
|
-
| `provider`
|
|
125
|
-
| `maxIterations`
|
|
126
|
-
| `maxCost`
|
|
127
|
-
| `storeLogs`
|
|
128
|
-
| `thinking`
|
|
172
|
+
| Property | Type | Required | Default | Description |
|
|
173
|
+
| ------------------- | ------------------- | -------- | --------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
174
|
+
| `systemPrompt` | `string` | **Yes** | — | Initial system prompt to optimize. This is the starting point that the optimizer will iteratively improve. |
|
|
175
|
+
| `targetSuccessRate` | `number` | **Yes** | — | Target success rate to achieve (0.0–1.0). Optimization stops when this rate is reached. |
|
|
176
|
+
| `apiKey` | `string` | **Yes** | — | API key for the LLM provider used by the optimizer (not your workflow's LLM). |
|
|
177
|
+
| `provider` | `LLMProviders` | **Yes** | — | LLM provider the optimizer uses to analyze failures and generate improved prompts. |
|
|
178
|
+
| `maxIterations` | `number` | No | `5` | Maximum optimization iterations before stopping, even if target not reached. |
|
|
179
|
+
| `maxCost` | `number` | No | — | Maximum cost budget in dollars. Optimization stops if cumulative cost exceeds this. |
|
|
180
|
+
| `storeLogs` | `boolean \| string` | No | — | Save optimization logs. `true` uses default path (`./didactic-logs/optimize_<timestamp>/summary.md`), or provide custom summary path. |
|
|
181
|
+
| `thinking` | `boolean` | No | — | Enable extended thinking mode for deeper analysis (provider must support it). |
|
|
182
|
+
| `patchSystemPrompt` | `string` | No | [`DEFAULT_PATCH_SYSTEM_PROMPT`](src/optimizer/prompts.ts) | Custom system prompt for patch generation. Completely replaces the default prompt that analyzes failures and suggests improvements. |
|
|
183
|
+
| `mergeSystemPrompt` | `string` | No | [`DEFAULT_MERGE_SYSTEM_PROMPT`](src/optimizer/prompts.ts) | Custom system prompt for merging patches. Completely replaces the default prompt that combines multiple patches into a coherent system prompt. |
|
|
129
184
|
|
|
130
185
|
---
|
|
131
186
|
|
|
@@ -134,11 +189,13 @@ const config = {
|
|
|
134
189
|
Executors abstract your LLM workflow from the evaluation harness. Whether your workflow runs locally, calls a remote API, or orchestrates Temporal activities, executors provide a consistent interface: take input + optional system prompt, return expected output.
|
|
135
190
|
|
|
136
191
|
This separation enables:
|
|
192
|
+
|
|
137
193
|
- **Swap execution strategies** — Switch between local/remote without changing tests
|
|
138
194
|
- **Dynamic prompt injection** — System prompts flow through for optimization
|
|
139
195
|
- **Cost tracking** — Aggregate execution costs across test runs
|
|
140
196
|
|
|
141
197
|
didactic provides two built-in executors:
|
|
198
|
+
|
|
142
199
|
- `endpoint` for calling a remote API
|
|
143
200
|
- `fn` for calling a local function
|
|
144
201
|
|
|
@@ -148,7 +205,6 @@ You may want to provide a `mapAdditionalContext` function to extract metadata fr
|
|
|
148
205
|
|
|
149
206
|
Note: If you do not provide a `mapResponse` function, the executor will assume the response from the executor is the output you want to compare against `expected`.
|
|
150
207
|
|
|
151
|
-
|
|
152
208
|
### `endpoint(url, config?)`
|
|
153
209
|
|
|
154
210
|
Create an executor that calls an HTTP endpoint. The executor sends input + systemPrompt as the request body and expects structured JSON back.
|
|
@@ -167,14 +223,14 @@ const executor = endpoint('https://api.example.com/workflow', {
|
|
|
167
223
|
|
|
168
224
|
#### EndpointConfig
|
|
169
225
|
|
|
170
|
-
| Property
|
|
171
|
-
|
|
172
|
-
| `method`
|
|
173
|
-
| `headers`
|
|
174
|
-
| `mapResponse`
|
|
175
|
-
| `mapAdditionalContext` | `(response: any) => unknown` | No
|
|
176
|
-
| `mapCost`
|
|
177
|
-
| `timeout`
|
|
226
|
+
| Property | Type | Required | Default | Description |
|
|
227
|
+
| ---------------------- | ---------------------------- | -------- | -------- | ------------------------------------------------------------------------------------------ |
|
|
228
|
+
| `method` | `'POST' \| 'GET'` | No | `'POST'` | HTTP method for the request. |
|
|
229
|
+
| `headers` | `Record<string, string>` | No | `{}` | Headers to include (auth tokens, content-type overrides, etc). |
|
|
230
|
+
| `mapResponse` | `(response: any) => TOutput` | No | — | Transform the raw response to your expected output shape. Use when your API wraps results. |
|
|
231
|
+
| `mapAdditionalContext` | `(response: any) => unknown` | No | — | Extract metadata (logs, debug info) from response for inspection. |
|
|
232
|
+
| `mapCost` | `(response: any) => number` | No | — | Extract execution cost from response (e.g., token counts in headers). |
|
|
233
|
+
| `timeout` | `number` | No | `30000` | Request timeout in milliseconds. |
|
|
178
234
|
|
|
179
235
|
---
|
|
180
236
|
|
|
@@ -190,19 +246,24 @@ const executor = fn({
|
|
|
190
246
|
return await myLLMCall(input, systemPrompt);
|
|
191
247
|
},
|
|
192
248
|
mapResponse: (result) => result.output,
|
|
193
|
-
mapCost: (result) =>
|
|
194
|
-
|
|
249
|
+
mapCost: (result) =>
|
|
250
|
+
result.usage.input_tokens * 0.000003 +
|
|
251
|
+
result.usage.output_tokens * 0.000015,
|
|
252
|
+
mapAdditionalContext: (result) => ({
|
|
253
|
+
model: result.model,
|
|
254
|
+
finishReason: result.stop_reason,
|
|
255
|
+
}),
|
|
195
256
|
});
|
|
196
257
|
```
|
|
197
258
|
|
|
198
259
|
#### FnConfig
|
|
199
260
|
|
|
200
|
-
| Property
|
|
201
|
-
|
|
202
|
-
| `fn`
|
|
203
|
-
| `mapResponse`
|
|
204
|
-
| `mapAdditionalContext` | `(result: TRaw) => unknown`
|
|
205
|
-
| `mapCost`
|
|
261
|
+
| Property | Type | Required | Default | Description |
|
|
262
|
+
| ---------------------- | --------------------------------------------------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------ |
|
|
263
|
+
| `fn` | `(input: TInput, systemPrompt?: string) => Promise<TRaw>` | **Yes** | — | Async function that executes your workflow. Receives test input and optional system prompt. |
|
|
264
|
+
| `mapResponse` | `(result: TRaw) => TOutput` | No | — | Transform raw result from fn into the expected output shape to compare. Without this, raw result is used directly. |
|
|
265
|
+
| `mapAdditionalContext` | `(result: TRaw) => unknown` | No | — | Map additional context about the run to pass to the optimizer prompt. |
|
|
266
|
+
| `mapCost` | `(result: TRaw) => number` | No | — | Extract cost from the result (if your function tracks it). Used to track the total cost of the runs. |
|
|
206
267
|
|
|
207
268
|
---
|
|
208
269
|
|
|
@@ -232,6 +293,7 @@ const executor = fn({
|
|
|
232
293
|
```
|
|
233
294
|
|
|
234
295
|
Without `mapResponse`:
|
|
296
|
+
|
|
235
297
|
- **endpoint**: uses the raw JSON response as output
|
|
236
298
|
- **fn**: uses the function's return value directly as output
|
|
237
299
|
|
|
@@ -294,56 +356,79 @@ const executor = fn({
|
|
|
294
356
|
|
|
295
357
|
Comparators bridge the gap between messy LLM output and semantic correctness. Rather than requiring exact string matches, comparators handle real-world data variations—currency formatting, date formats, name suffixes, numeric tolerance—while maintaining semantic accuracy.
|
|
296
358
|
|
|
297
|
-
|
|
359
|
+
**Nested structure:** Comparators mirror your data shape. Use objects to define per-field comparators, and `unordered()` to wrap arrays that should match by similarity rather than index position.
|
|
360
|
+
|
|
361
|
+
Each comparator returns a `passed` boolean and a `similarity` score (0.0–1.0). The pass/fail determines test results, while similarity enables Hungarian matching for `unordered()` arrays.
|
|
298
362
|
|
|
299
363
|
### `comparators` vs `comparatorOverride`
|
|
300
364
|
|
|
301
|
-
Use **`comparators`** for standard comparison. It accepts
|
|
365
|
+
Use **`comparators`** for standard comparison. It accepts:
|
|
302
366
|
|
|
303
367
|
**1. A single comparator function** — Applied uniformly across the output:
|
|
304
368
|
|
|
305
369
|
```typescript
|
|
306
|
-
// Clean syntax for primitives
|
|
370
|
+
// Clean syntax for primitives or arrays
|
|
307
371
|
const result = await didactic.eval({
|
|
308
372
|
executor: myNumberExtractor,
|
|
309
|
-
comparators: exact,
|
|
373
|
+
comparators: exact, // Single comparator for root-level output
|
|
310
374
|
testCases: [
|
|
311
375
|
{ input: 'twenty-three', expected: 23 },
|
|
312
376
|
{ input: 'one hundred', expected: 100 },
|
|
313
377
|
],
|
|
314
378
|
});
|
|
315
379
|
|
|
316
|
-
//
|
|
380
|
+
// For unordered arrays, use the unordered() wrapper
|
|
317
381
|
const result = await didactic.eval({
|
|
318
382
|
executor: myListExtractor,
|
|
319
|
-
comparators: exact,
|
|
320
|
-
|
|
321
|
-
testCases: [
|
|
322
|
-
{ input: 'numbers', expected: [1, 2, 3, 4] },
|
|
323
|
-
],
|
|
383
|
+
comparators: unordered(exact), // Match by similarity, not index
|
|
384
|
+
testCases: [{ input: 'numbers', expected: [1, 2, 3, 4] }],
|
|
324
385
|
});
|
|
325
386
|
```
|
|
326
387
|
|
|
327
|
-
**2. A
|
|
388
|
+
**2. A nested object structure** — Mirrors your data shape with per-field comparators:
|
|
328
389
|
|
|
329
390
|
```typescript
|
|
330
391
|
const result = await didactic.eval({
|
|
331
392
|
executor: myExecutor,
|
|
332
393
|
comparators: {
|
|
333
|
-
premium: within({ tolerance: 0.05 }),
|
|
334
|
-
carrier: exact,
|
|
335
|
-
effectiveDate: date,
|
|
394
|
+
premium: within({ tolerance: 0.05 }), // 5% tolerance for numbers
|
|
395
|
+
carrier: exact, // Exact string match
|
|
396
|
+
effectiveDate: date, // Flexible date parsing
|
|
397
|
+
// Use unordered() for arrays that can be in any order
|
|
398
|
+
lineItems: unordered({
|
|
399
|
+
description: name,
|
|
400
|
+
amount: numeric,
|
|
401
|
+
}),
|
|
336
402
|
},
|
|
337
403
|
testCases: [
|
|
338
404
|
{
|
|
339
405
|
input: { emailId: 'email-123' },
|
|
340
|
-
expected: {
|
|
406
|
+
expected: {
|
|
407
|
+
premium: 12500,
|
|
408
|
+
carrier: 'Acme Insurance',
|
|
409
|
+
effectiveDate: '2024-01-15',
|
|
410
|
+
lineItems: [
|
|
411
|
+
{ description: 'Service Fee', amount: 100 },
|
|
412
|
+
{ description: 'Tax', amount: 25 },
|
|
413
|
+
],
|
|
414
|
+
},
|
|
341
415
|
},
|
|
342
416
|
],
|
|
343
417
|
});
|
|
344
418
|
```
|
|
345
419
|
|
|
420
|
+
**3. Optional (defaults to `exact`)** — If omitted, uses `exact` for entire output:
|
|
421
|
+
|
|
422
|
+
```typescript
|
|
423
|
+
// No comparators needed for simple exact matching
|
|
424
|
+
const result = await didactic.eval({
|
|
425
|
+
executor: myExecutor,
|
|
426
|
+
testCases: [{ input: 'hello', expected: 'hello' }],
|
|
427
|
+
});
|
|
428
|
+
```
|
|
429
|
+
|
|
346
430
|
Use **`comparatorOverride`** when you need:
|
|
431
|
+
|
|
347
432
|
- Complete control over comparison logic
|
|
348
433
|
- Custom cross-field validation
|
|
349
434
|
- Whole-object semantic comparison that doesn't map to individual fields
|
|
@@ -365,41 +450,83 @@ const result = await didactic.eval({
|
|
|
365
450
|
|
|
366
451
|
### Built-in Comparators
|
|
367
452
|
|
|
368
|
-
| Comparator
|
|
369
|
-
|
|
370
|
-
| `exact`
|
|
371
|
-
| `within`
|
|
372
|
-
| `oneOf`
|
|
373
|
-
| `contains`
|
|
374
|
-
| `presence`
|
|
375
|
-
| `numeric`
|
|
376
|
-
| `numeric.nullable` | `
|
|
377
|
-
| `date`
|
|
378
|
-
| `name`
|
|
379
|
-
| `
|
|
453
|
+
| Comparator | Usage | Description |
|
|
454
|
+
| ------------------ | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
455
|
+
| `exact` | `exact` | Deep equality with cycle detection. Default when no comparator specified. |
|
|
456
|
+
| `within` | `within({ tolerance, mode? })` | Numeric tolerance. `mode: 'percentage'` (default) or `'absolute'`. |
|
|
457
|
+
| `oneOf` | `oneOf(allowedValues)` | Enum validation. Passes if actual equals expected AND both are in the allowed set. |
|
|
458
|
+
| `contains` | `contains(substring)` | String contains check. Passes if actual includes the substring. |
|
|
459
|
+
| `presence` | `presence` | Existence check. Passes if expected is absent, or if actual has any value when expected does. |
|
|
460
|
+
| `numeric` | `numeric` | Numeric comparison after stripping currency symbols, commas, accounting notation. |
|
|
461
|
+
| `numeric.nullable` | `numeric.nullable` | Same as `numeric`, but treats null/undefined/empty as 0. |
|
|
462
|
+
| `date` | `date` | Date comparison after normalizing formats (ISO, US MM/DD, EU DD/MM, written). |
|
|
463
|
+
| `name` | `name` | Name comparison with case normalization, suffix removal (Inc, LLC), fuzzy matching. |
|
|
464
|
+
| `unordered` | `unordered(comparator)` or `unordered({ fields })` | Wrapper for arrays that should match by similarity (Hungarian algorithm) rather than index. Pass a comparator for primitives or nested config for objects. |
|
|
465
|
+
| `llmCompare` | `llmCompare({ systemPrompt?, apiKey?, provider? })` | LLM-based semantic comparison. Uses `llmConfig` from eval config if `apiKey` not provided. Returns rationale and tracks cost. |
|
|
466
|
+
| `custom` | `custom({ compare })` | User-defined logic. `compare(expected, actual, context?) => boolean`. Context provides access to parent objects for cross-field logic. |
|
|
380
467
|
|
|
381
468
|
### Examples
|
|
382
469
|
|
|
383
470
|
```typescript
|
|
384
|
-
import {
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
471
|
+
import {
|
|
472
|
+
didactic,
|
|
473
|
+
within,
|
|
474
|
+
oneOf,
|
|
475
|
+
exact,
|
|
476
|
+
contains,
|
|
477
|
+
presence,
|
|
478
|
+
numeric,
|
|
479
|
+
date,
|
|
480
|
+
name,
|
|
481
|
+
unordered,
|
|
482
|
+
llmCompare,
|
|
483
|
+
custom,
|
|
484
|
+
LLMProviders,
|
|
485
|
+
} from '@docshield/didactic';
|
|
486
|
+
|
|
487
|
+
const result = await didactic.eval({
|
|
488
|
+
executor: myInvoiceParser,
|
|
489
|
+
testCases: [...],
|
|
490
|
+
// LLM config for all llmCompare calls (no need to repeat apiKey)
|
|
491
|
+
llmConfig: {
|
|
492
|
+
apiKey: process.env.ANTHROPIC_API_KEY,
|
|
493
|
+
provider: LLMProviders.anthropic_claude_haiku,
|
|
494
|
+
},
|
|
495
|
+
comparators: {
|
|
496
|
+
premium: within({ tolerance: 0.05 }), // 5% tolerance
|
|
497
|
+
deductible: within({ tolerance: 100, mode: 'absolute' }), // $100 tolerance
|
|
498
|
+
policyType: oneOf(['claims-made', 'occurrence', 'entity']),
|
|
499
|
+
carrier: exact,
|
|
500
|
+
notes: contains('approved'),
|
|
501
|
+
entityName: name,
|
|
502
|
+
effectiveDate: date,
|
|
503
|
+
amount: numeric,
|
|
504
|
+
optionalField: presence,
|
|
505
|
+
|
|
506
|
+
// Unordered array of objects with nested comparators
|
|
507
|
+
lineItems: unordered({
|
|
508
|
+
description: llmCompare({
|
|
509
|
+
// Uses llmConfig.apiKey from above!
|
|
510
|
+
systemPrompt: 'Compare line item descriptions semantically.',
|
|
511
|
+
}),
|
|
512
|
+
quantity: exact,
|
|
513
|
+
price: numeric,
|
|
514
|
+
}),
|
|
515
|
+
|
|
516
|
+
// LLM-based comparison for flexible semantic matching
|
|
517
|
+
companyName: llmCompare({
|
|
518
|
+
systemPrompt:
|
|
519
|
+
'Compare company names considering abbreviations and legal suffixes.',
|
|
520
|
+
}),
|
|
521
|
+
|
|
522
|
+
customField: custom({
|
|
523
|
+
compare: (expected, actual, context) => {
|
|
524
|
+
// Access sibling fields via context.actualParent
|
|
525
|
+
return actual.toLowerCase() === expected.toLowerCase();
|
|
526
|
+
},
|
|
527
|
+
}),
|
|
528
|
+
},
|
|
529
|
+
});
|
|
403
530
|
```
|
|
404
531
|
|
|
405
532
|
---
|
|
@@ -412,13 +539,13 @@ Supported LLM providers for the optimizer:
|
|
|
412
539
|
import { LLMProviders } from '@docshield/didactic';
|
|
413
540
|
```
|
|
414
541
|
|
|
415
|
-
| Value
|
|
416
|
-
|
|
417
|
-
| `LLMProviders.anthropic_claude_opus`
|
|
542
|
+
| Value | Description |
|
|
543
|
+
| -------------------------------------- | --------------------------------------------- |
|
|
544
|
+
| `LLMProviders.anthropic_claude_opus` | Claude Opus 4.5 — Most capable, highest cost |
|
|
418
545
|
| `LLMProviders.anthropic_claude_sonnet` | Claude Sonnet 4.5 — Balanced performance/cost |
|
|
419
|
-
| `LLMProviders.anthropic_claude_haiku`
|
|
420
|
-
| `LLMProviders.openai_gpt5`
|
|
421
|
-
| `LLMProviders.openai_gpt5_mini`
|
|
546
|
+
| `LLMProviders.anthropic_claude_haiku` | Claude Haiku 4.5 — Fastest, lowest cost |
|
|
547
|
+
| `LLMProviders.openai_gpt5` | GPT-5.2 — OpenAI flagship |
|
|
548
|
+
| `LLMProviders.openai_gpt5_mini` | GPT-5 Mini — OpenAI lightweight |
|
|
422
549
|
|
|
423
550
|
---
|
|
424
551
|
|
|
@@ -428,60 +555,62 @@ import { LLMProviders } from '@docshield/didactic';
|
|
|
428
555
|
|
|
429
556
|
Returned by `didactic.eval()` when no optimization is configured.
|
|
430
557
|
|
|
431
|
-
| Property
|
|
432
|
-
|
|
433
|
-
| `systemPrompt`
|
|
434
|
-
| `testCases`
|
|
435
|
-
| `passed`
|
|
436
|
-
| `total`
|
|
437
|
-
| `successRate`
|
|
438
|
-
| `correctFields`
|
|
439
|
-
| `totalFields`
|
|
440
|
-
| `accuracy`
|
|
441
|
-
| `cost`
|
|
558
|
+
| Property | Type | Description |
|
|
559
|
+
| ---------------- | --------------------- | ----------------------------------------------------------------------------- |
|
|
560
|
+
| `systemPrompt` | `string \| undefined` | System prompt that was used for this eval run. |
|
|
561
|
+
| `testCases` | `TestCaseResult[]` | Detailed results for each test case. Inspect for field-level failure details. |
|
|
562
|
+
| `passed` | `number` | Count of test cases that passed (met `perTestThreshold`). |
|
|
563
|
+
| `total` | `number` | Total number of test cases run. |
|
|
564
|
+
| `successRate` | `number` | Pass rate (0.0–1.0). `passed / total`. |
|
|
565
|
+
| `correctFields` | `number` | Total correct fields across all test cases. |
|
|
566
|
+
| `totalFields` | `number` | Total fields evaluated across all test cases. |
|
|
567
|
+
| `accuracy` | `number` | Field-level accuracy (0.0–1.0). `correctFields / totalFields`. |
|
|
568
|
+
| `cost` | `number` | Total execution cost aggregated from executor results. |
|
|
569
|
+
| `comparatorCost` | `number` | Total cost from LLM-based comparators (e.g., `llmCompare`). |
|
|
442
570
|
|
|
443
571
|
### TestCaseResult
|
|
444
572
|
|
|
445
573
|
Per-test-case detail, accessible via `EvalResult.testCases`.
|
|
446
574
|
|
|
447
|
-
| Property
|
|
448
|
-
|
|
449
|
-
| `input`
|
|
450
|
-
| `expected`
|
|
451
|
-
| `actual`
|
|
452
|
-
| `passed`
|
|
453
|
-
| `fields`
|
|
454
|
-
| `passedFields`
|
|
455
|
-
| `totalFields`
|
|
456
|
-
| `passRate`
|
|
457
|
-
| `cost`
|
|
458
|
-
| `
|
|
459
|
-
| `
|
|
575
|
+
| Property | Type | Description |
|
|
576
|
+
| ------------------- | ----------------------------- | ------------------------------------------------------------------------- |
|
|
577
|
+
| `input` | `TInput` | The input that was passed to the executor. |
|
|
578
|
+
| `expected` | `TOutput` | The expected output from the test case. |
|
|
579
|
+
| `actual` | `TOutput \| undefined` | Actual output returned by executor. Undefined if execution failed. |
|
|
580
|
+
| `passed` | `boolean` | Whether this test case passed (met `perTestThreshold`). |
|
|
581
|
+
| `fields` | `Record<string, FieldResult>` | Per-field comparison results. Key is field path (e.g., `"address.city"`). |
|
|
582
|
+
| `passedFields` | `number` | Count of fields that passed comparison. |
|
|
583
|
+
| `totalFields` | `number` | Total fields compared. |
|
|
584
|
+
| `passRate` | `number` | Field pass rate for this test case (0.0–1.0). |
|
|
585
|
+
| `cost` | `number \| undefined` | Execution cost for this test case, if reported by executor. |
|
|
586
|
+
| `comparatorCost` | `number \| undefined` | Total cost from LLM-based comparators in this test case. |
|
|
587
|
+
| `additionalContext` | `unknown \| undefined` | Extra context extracted by executor (logs, debug info). |
|
|
588
|
+
| `error` | `string \| undefined` | Error message if executor threw an exception. |
|
|
460
589
|
|
|
461
590
|
### OptimizeResult
|
|
462
591
|
|
|
463
592
|
Returned by `didactic.optimize()` or `didactic.eval()` with optimization configured.
|
|
464
593
|
|
|
465
|
-
| Property
|
|
466
|
-
|
|
467
|
-
| `success`
|
|
468
|
-
| `finalPrompt` | `string`
|
|
469
|
-
| `iterations`
|
|
470
|
-
| `totalCost`
|
|
471
|
-
| `logFolder`
|
|
594
|
+
| Property | Type | Description |
|
|
595
|
+
| ------------- | --------------------- | ------------------------------------------------------------------------------------ |
|
|
596
|
+
| `success` | `boolean` | Whether the target success rate was achieved. |
|
|
597
|
+
| `finalPrompt` | `string` | The final optimized system prompt. Use this in production. |
|
|
598
|
+
| `iterations` | `IterationResult[]` | Results from each optimization iteration. Inspect to see how the prompt evolved. |
|
|
599
|
+
| `totalCost` | `number` | Total cost across all iterations (optimizer + executor costs). |
|
|
600
|
+
| `logFolder` | `string \| undefined` | Folder path where optimization logs were written (only when `storeLogs` is enabled). |
|
|
472
601
|
|
|
473
602
|
### IterationResult
|
|
474
603
|
|
|
475
604
|
Per-iteration detail, accessible via `OptimizeResult.iterations`.
|
|
476
605
|
|
|
477
|
-
| Property
|
|
478
|
-
|
|
479
|
-
| `iteration`
|
|
480
|
-
| `systemPrompt` | `string`
|
|
481
|
-
| `passed`
|
|
482
|
-
| `total`
|
|
483
|
-
| `testCases`
|
|
484
|
-
| `cost`
|
|
606
|
+
| Property | Type | Description |
|
|
607
|
+
| -------------- | ------------------ | ---------------------------------------------- |
|
|
608
|
+
| `iteration` | `number` | Iteration number (1-indexed). |
|
|
609
|
+
| `systemPrompt` | `string` | System prompt used for this iteration. |
|
|
610
|
+
| `passed` | `number` | Test cases passed in this iteration. |
|
|
611
|
+
| `total` | `number` | Total test cases in this iteration. |
|
|
612
|
+
| `testCases` | `TestCaseResult[]` | Detailed test case results for this iteration. |
|
|
613
|
+
| `cost` | `number` | Cost for this iteration. |
|
|
485
614
|
|
|
486
615
|
---
|
|
487
616
|
|
|
@@ -491,12 +620,12 @@ When `storeLogs` is enabled in `OptimizeConfig`, four files are written to the l
|
|
|
491
620
|
|
|
492
621
|
**Default path:** `./didactic-logs/optimize_<timestamp>/`
|
|
493
622
|
|
|
494
|
-
| File
|
|
495
|
-
|
|
496
|
-
| `summary.md`
|
|
497
|
-
| `prompts.md`
|
|
498
|
-
| `rawData.json` | Complete iteration data for programmatic analysis
|
|
499
|
-
| `bestRun.json` | Detailed results from the best-performing iteration
|
|
623
|
+
| File | Description |
|
|
624
|
+
| -------------- | ------------------------------------------------------------------------- |
|
|
625
|
+
| `summary.md` | Human-readable report with configuration, metrics, and iteration progress |
|
|
626
|
+
| `prompts.md` | All system prompts used in each iteration |
|
|
627
|
+
| `rawData.json` | Complete iteration data for programmatic analysis |
|
|
628
|
+
| `bestRun.json` | Detailed results from the best-performing iteration |
|
|
500
629
|
|
|
501
630
|
### rawData.json
|
|
502
631
|
|
|
@@ -505,17 +634,17 @@ Contains the complete optimization run data for programmatic analysis:
|
|
|
505
634
|
```typescript
|
|
506
635
|
interface OptimizationReport {
|
|
507
636
|
metadata: {
|
|
508
|
-
timestamp: string;
|
|
509
|
-
model: string;
|
|
510
|
-
provider: string;
|
|
511
|
-
thinking: boolean;
|
|
512
|
-
targetSuccessRate: number;
|
|
513
|
-
maxIterations: number | null;
|
|
514
|
-
maxCost: number | null;
|
|
515
|
-
testCaseCount: number;
|
|
516
|
-
perTestThreshold: number;
|
|
517
|
-
rateLimitBatch?: number;
|
|
518
|
-
rateLimitPause?: number;
|
|
637
|
+
timestamp: string; // ISO timestamp
|
|
638
|
+
model: string; // LLM model used
|
|
639
|
+
provider: string; // Provider (anthropic, openai, etc)
|
|
640
|
+
thinking: boolean; // Extended thinking enabled
|
|
641
|
+
targetSuccessRate: number; // Target (0.0-1.0)
|
|
642
|
+
maxIterations: number | null; // Max iterations or null
|
|
643
|
+
maxCost: number | null; // Max cost budget or null
|
|
644
|
+
testCaseCount: number; // Number of test cases
|
|
645
|
+
perTestThreshold: number; // Per-test threshold (default 1.0)
|
|
646
|
+
rateLimitBatch?: number; // Batch size for rate limiting
|
|
647
|
+
rateLimitPause?: number; // Pause seconds between batches
|
|
519
648
|
};
|
|
520
649
|
summary: {
|
|
521
650
|
totalIterations: number;
|
|
@@ -523,16 +652,16 @@ interface OptimizationReport {
|
|
|
523
652
|
totalCost: number;
|
|
524
653
|
totalInputTokens: number;
|
|
525
654
|
totalOutputTokens: number;
|
|
526
|
-
startRate: number;
|
|
527
|
-
endRate: number;
|
|
655
|
+
startRate: number; // Success rate at start
|
|
656
|
+
endRate: number; // Success rate at end
|
|
528
657
|
targetMet: boolean;
|
|
529
658
|
};
|
|
530
659
|
best: {
|
|
531
|
-
iteration: number;
|
|
532
|
-
successRate: number;
|
|
533
|
-
passed: number;
|
|
534
|
-
total: number;
|
|
535
|
-
fieldAccuracy: number;
|
|
660
|
+
iteration: number; // Which iteration was best
|
|
661
|
+
successRate: number; // Success rate (0.0-1.0)
|
|
662
|
+
passed: number; // Number of passing tests
|
|
663
|
+
total: number; // Total tests
|
|
664
|
+
fieldAccuracy: number; // Field-level accuracy
|
|
536
665
|
};
|
|
537
666
|
iterations: Array<{
|
|
538
667
|
iteration: number;
|
|
@@ -542,8 +671,8 @@ interface OptimizationReport {
|
|
|
542
671
|
correctFields: number;
|
|
543
672
|
totalFields: number;
|
|
544
673
|
fieldAccuracy: number;
|
|
545
|
-
cost: number;
|
|
546
|
-
cumulativeCost: number;
|
|
674
|
+
cost: number; // Cost for this iteration
|
|
675
|
+
cumulativeCost: number; // Total cost so far
|
|
547
676
|
durationMs: number;
|
|
548
677
|
inputTokens: number;
|
|
549
678
|
outputTokens: number;
|
|
@@ -552,7 +681,10 @@ interface OptimizationReport {
|
|
|
552
681
|
input: unknown;
|
|
553
682
|
expected: unknown;
|
|
554
683
|
actual: unknown;
|
|
555
|
-
fields: Record<
|
|
684
|
+
fields: Record<
|
|
685
|
+
string,
|
|
686
|
+
{ expected: unknown; actual: unknown; passed: boolean }
|
|
687
|
+
>;
|
|
556
688
|
}>;
|
|
557
689
|
}>;
|
|
558
690
|
}
|
|
@@ -565,7 +697,7 @@ Contains detailed results from the best-performing iteration, with test results
|
|
|
565
697
|
```typescript
|
|
566
698
|
interface BestRunReport {
|
|
567
699
|
metadata: {
|
|
568
|
-
iteration: number;
|
|
700
|
+
iteration: number; // Which iteration was best
|
|
569
701
|
model: string;
|
|
570
702
|
provider: string;
|
|
571
703
|
thinking: boolean;
|
|
@@ -575,38 +707,41 @@ interface BestRunReport {
|
|
|
575
707
|
rateLimitPause?: number;
|
|
576
708
|
};
|
|
577
709
|
results: {
|
|
578
|
-
successRate: number;
|
|
579
|
-
passed: number;
|
|
580
|
-
total: number;
|
|
581
|
-
fieldAccuracy: number;
|
|
710
|
+
successRate: number; // Overall success rate
|
|
711
|
+
passed: number; // Passed tests
|
|
712
|
+
total: number; // Total tests
|
|
713
|
+
fieldAccuracy: number; // Field-level accuracy
|
|
582
714
|
correctFields: number;
|
|
583
715
|
totalFields: number;
|
|
584
716
|
};
|
|
585
717
|
cost: {
|
|
586
|
-
iteration: number;
|
|
587
|
-
cumulative: number;
|
|
718
|
+
iteration: number; // Cost for this iteration
|
|
719
|
+
cumulative: number; // Total cumulative cost
|
|
588
720
|
};
|
|
589
721
|
timing: {
|
|
590
722
|
durationMs: number;
|
|
591
723
|
inputTokens: number;
|
|
592
724
|
outputTokens: number;
|
|
593
725
|
};
|
|
594
|
-
failures: Array<{
|
|
726
|
+
failures: Array<{
|
|
727
|
+
// Tests that didnt meet the configured perTestThreshold
|
|
595
728
|
testIndex: number;
|
|
596
729
|
input: unknown;
|
|
597
730
|
expected: unknown;
|
|
598
731
|
actual: unknown;
|
|
599
732
|
failedFields: Record<string, { expected: unknown; actual: unknown }>;
|
|
600
733
|
}>;
|
|
601
|
-
partialFailures: Array<{
|
|
734
|
+
partialFailures: Array<{
|
|
735
|
+
// Tests that passed but have some failing fields
|
|
602
736
|
testIndex: number;
|
|
603
|
-
passRate: number;
|
|
737
|
+
passRate: number; // Percentage of fields passing
|
|
604
738
|
input: unknown;
|
|
605
739
|
expected: unknown;
|
|
606
740
|
actual: unknown;
|
|
607
741
|
failedFields: Record<string, { expected: unknown; actual: unknown }>;
|
|
608
742
|
}>;
|
|
609
|
-
successes: Array<{
|
|
743
|
+
successes: Array<{
|
|
744
|
+
// Tests with 100% field accuracy
|
|
610
745
|
testIndex: number;
|
|
611
746
|
input: unknown;
|
|
612
747
|
expected: unknown;
|
|
@@ -622,10 +757,22 @@ interface BestRunReport {
|
|
|
622
757
|
```typescript
|
|
623
758
|
// Namespace
|
|
624
759
|
import { didactic } from '@docshield/didactic';
|
|
625
|
-
import didactic from '@docshield/didactic';
|
|
760
|
+
import didactic from '@docshield/didactic'; // default export
|
|
626
761
|
|
|
627
762
|
// Comparators
|
|
628
|
-
import {
|
|
763
|
+
import {
|
|
764
|
+
exact,
|
|
765
|
+
within,
|
|
766
|
+
oneOf,
|
|
767
|
+
contains,
|
|
768
|
+
presence,
|
|
769
|
+
numeric,
|
|
770
|
+
date,
|
|
771
|
+
name,
|
|
772
|
+
unordered,
|
|
773
|
+
llmCompare,
|
|
774
|
+
custom,
|
|
775
|
+
} from '@docshield/didactic';
|
|
629
776
|
|
|
630
777
|
// Executors
|
|
631
778
|
import { endpoint, fn } from '@docshield/didactic';
|
|
@@ -635,22 +782,24 @@ import { evaluate, optimize } from '@docshield/didactic';
|
|
|
635
782
|
|
|
636
783
|
// Types
|
|
637
784
|
import type {
|
|
785
|
+
// Creating custom comparators
|
|
638
786
|
Comparator,
|
|
639
|
-
ComparatorMap,
|
|
640
787
|
ComparatorResult,
|
|
641
788
|
ComparatorContext,
|
|
789
|
+
// Creating custom executors
|
|
642
790
|
Executor,
|
|
643
791
|
ExecutorResult,
|
|
792
|
+
// Main API types
|
|
644
793
|
TestCase,
|
|
645
794
|
EvalConfig,
|
|
646
795
|
EvalResult,
|
|
647
|
-
TestCaseResult,
|
|
648
|
-
FieldResult,
|
|
649
796
|
OptimizeConfig,
|
|
650
797
|
OptimizeResult,
|
|
651
|
-
|
|
798
|
+
// Executor configs
|
|
652
799
|
EndpointConfig,
|
|
653
800
|
FnConfig,
|
|
801
|
+
// LLM configuration
|
|
802
|
+
LLMConfig,
|
|
654
803
|
} from '@docshield/didactic';
|
|
655
804
|
|
|
656
805
|
// Enum
|