llm-testrunner-components 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +305 -237
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-JPMPoOC8.js +7 -0
- package/dist/components/p-JPMPoOC8.js.map +1 -0
- package/dist/esm/index.js +305 -237
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/components.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/LICENSE
CHANGED
package/README.md
CHANGED
|
@@ -1,298 +1,221 @@
|
|
|
1
|
-
# LLM TestRunner
|
|
1
|
+
# LLM TestRunner Components
|
|
2
2
|
|
|
3
|
-
A
|
|
3
|
+
**A ready-made UI for testing your LLM.** Add questions and expected outcomes, run tests one-by-one or in batch, and get pass/fail results using five evaluation strategies—while you keep full control over which LLM you call (OpenAI, Gemini, Claude, or your own).
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
[npm package](https://www.npmjs.com/package/llm-testrunner-components) [License: MIT](https://opensource.org/licenses/MIT)
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
---
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
- **AI Integration**: Can be integrated with any LLM provider
|
|
11
|
-
- **Automated Evaluation**: Built-in evaluation engine that checks responses against expected keywords and source links
|
|
12
|
-
- **Batch Testing**: Run multiple tests sequentially
|
|
13
|
-
- **Real-time Results**: Live evaluation results with pass/fail indicators, including details such as:
|
|
14
|
-
- Number of keywords matched.
|
|
15
|
-
- Presence of source links in the response.
|
|
9
|
+
## Why use this
|
|
16
10
|
|
|
17
|
-
|
|
18
|
-
|
|
11
|
+
- **Test faster** — You get a complete test-runner UI (questions, expected outcomes, run one / run all, pass/fail, response times). No need to build tables, evaluation logic, or import/export from scratch.
|
|
12
|
+
- **Stay in control** — The library never calls an LLM. You handle one event: we send you the prompt, you call your API and pass back the response (or an error). Works with any provider or local model.
|
|
13
|
+
- **Match how you think** — Each expected-outcome field can use a different evaluation: exact keywords, semantic similarity (meaning), ROUGE (word overlap / sequence), or BLEU (n-gram precision). Choose per field.
|
|
14
|
+
- **Fit your stack** — Load test cases from your backend or a JSON file. Optionally persist runs with a Save button that emits the current state so you can store it in Firebase, your API, or anywhere else.
|
|
19
15
|
|
|
20
|
-
|
|
16
|
+
---
|
|
21
17
|
|
|
22
|
-
|
|
18
|
+
## What you get
|
|
23
19
|
|
|
24
|
-
|
|
20
|
+
- **Test case table** — Add, edit, delete test cases. Each test case has a question, configurable expected-outcome fields (single line, paragraph, keyword chips, dropdown), and a per-field evaluation approach (exact, semantic, ROUGE-1, ROUGE-L, BLEU).
|
|
21
|
+
- **Run one or run all** — Run a single test or batch with a configurable delay between API calls (rate limiting).
|
|
22
|
+
- **Live results** — Pass/fail, keyword match count (e.g. X/Y found), and response time per test.
|
|
23
|
+
- **Import / export** — Import a test suite from JSON. Export the current suite as JSON or export run results as CSV.
|
|
24
|
+
- **Optional save** — When enabled, a Save button emits the current test cases so your app can persist them (e.g. to your backend).
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
---
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
- Real-time AI response generation any LLM provider
|
|
30
|
-
- Test case management (add, delete, run individual or all tests)
|
|
31
|
-
- Built-in evaluation engine with keyword and source link matching
|
|
32
|
-
- Error handling and loading states
|
|
33
|
-
- Rate limiting for batch operations
|
|
28
|
+
## Installation
|
|
34
29
|
|
|
35
|
-
|
|
30
|
+
```bash
|
|
31
|
+
npm install llm-testrunner-components
|
|
32
|
+
```
|
|
36
33
|
|
|
37
|
-
|
|
38
|
-
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Get started (React)
|
|
37
|
+
|
|
38
|
+
**Step 1 — Register the custom elements once** (e.g. in your app entry):
|
|
39
|
+
|
|
40
|
+
```tsx
|
|
41
|
+
// e.g. in main.tsx or App.tsx
|
|
42
|
+
import { defineCustomElements } from "llm-testrunner-components/loader";
|
|
43
|
+
|
|
44
|
+
defineCustomElements();
|
|
39
45
|
```
|
|
40
46
|
|
|
41
|
-
|
|
47
|
+
**Step 2 — Use the component and connect your LLM.** The runner fires an `llmRequest` event whenever it needs a response. You call your API, then call either `e.detail.resolve(responseText)` or `e.detail.reject(error)`.
|
|
42
48
|
|
|
43
|
-
|
|
49
|
+
```tsx
|
|
50
|
+
import { useRef } from "react";
|
|
51
|
+
import { LlmTestRunner } from "llm-testrunner-components/react";
|
|
44
52
|
|
|
45
|
-
|
|
53
|
+
function App() {
|
|
54
|
+
const runnerRef = useRef<any>(null);
|
|
46
55
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
</head>
|
|
54
|
-
<body>
|
|
55
|
-
<llm-test-runner id="llm-test-runner" delay-ms="1000"></llm-test-runner>
|
|
56
|
-
</body>
|
|
57
|
-
<script>
|
|
58
|
-
const llmTestRunner = document.getElementById('llm-test-runner');
|
|
59
|
-
// Gemini API
|
|
60
|
-
async function handlellmRequest(event) {
|
|
61
|
-
try {
|
|
62
|
-
const requestBody = {
|
|
63
|
-
contents: [
|
|
64
|
-
{
|
|
65
|
-
parts: [
|
|
66
|
-
{
|
|
67
|
-
text: event.detail.prompt,
|
|
68
|
-
},
|
|
69
|
-
],
|
|
70
|
-
},
|
|
71
|
-
],
|
|
72
|
-
};
|
|
73
|
-
|
|
74
|
-
const response = await fetch(
|
|
75
|
-
`https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=your-gemini-api-key-here`,
|
|
76
|
-
{
|
|
77
|
-
method: 'POST',
|
|
78
|
-
headers: {
|
|
79
|
-
'Content-Type': 'application/json',
|
|
80
|
-
},
|
|
81
|
-
body: JSON.stringify(requestBody),
|
|
82
|
-
},
|
|
83
|
-
);
|
|
84
|
-
|
|
85
|
-
if (!response.ok) {
|
|
86
|
-
const errorData = await response.json().catch(() => ({}));
|
|
87
|
-
throw new Error(
|
|
88
|
-
errorData.error?.message ||
|
|
89
|
-
`HTTP error! status: ${response.status}`,
|
|
90
|
-
);
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
const data = await response.json();
|
|
94
|
-
|
|
95
|
-
if (
|
|
96
|
-
data.candidates &&
|
|
97
|
-
data.candidates[0] &&
|
|
98
|
-
data.candidates[0].content
|
|
99
|
-
) {
|
|
100
|
-
event.detail.resolve(data.candidates[0].content.parts[0].text);
|
|
101
|
-
} else {
|
|
102
|
-
throw new Error('Unexpected response format from Gemini API');
|
|
103
|
-
}
|
|
104
|
-
} catch (err) {
|
|
105
|
-
event.detail.reject(
|
|
106
|
-
err instanceof Error ? err : new Error(String(err)),
|
|
107
|
-
);
|
|
108
|
-
}
|
|
56
|
+
const handleLlmRequest = async (e) => {
|
|
57
|
+
try {
|
|
58
|
+
const response = await yourLLMApi(e.detail.prompt);
|
|
59
|
+
e.detail.resolve(response);
|
|
60
|
+
} catch (err) {
|
|
61
|
+
e.detail.reject(err);
|
|
109
62
|
}
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
const handleSave = async (e) => {
|
|
66
|
+
await yourSaveApi(e.detail);
|
|
67
|
+
await runnerRef.current?.resetSavingState();
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
return (
|
|
71
|
+
<LlmTestRunner
|
|
72
|
+
ref={runnerRef}
|
|
73
|
+
onLlmRequest={handleLlmRequest}
|
|
74
|
+
onSave={handleSave}
|
|
75
|
+
delayMs={500}
|
|
76
|
+
useSave={true}
|
|
77
|
+
/>
|
|
78
|
+
);
|
|
79
|
+
}
|
|
113
80
|
```
|
|
114
81
|
|
|
115
|
-
|
|
82
|
+
That’s enough for a working runner. Replace `yourLLMApi` and `yourSaveApi` with your real calls. If you don’t need persistence, omit `useSave`, `onSave`, and `ref` / `resetSavingState`.
|
|
83
|
+
|
|
84
|
+
---
|
|
116
85
|
|
|
117
|
-
|
|
86
|
+
## Get started (vanilla HTML)
|
|
118
87
|
|
|
119
|
-
|
|
120
|
-
import { LLMTestRunner } from 'llm-testrunner-components';
|
|
88
|
+
Load the loader and define the custom elements, then listen for `llmRequest` and call `resolve` or `reject`.
|
|
121
89
|
|
|
122
|
-
|
|
90
|
+
```html
|
|
91
|
+
<llm-test-runner id="runner" delay-ms="500"></llm-test-runner>
|
|
92
|
+
|
|
93
|
+
<script type="module">
|
|
94
|
+
import { defineCustomElements } from "https://unpkg.com/llm-testrunner-components@1/loader/index.js";
|
|
95
|
+
defineCustomElements();
|
|
96
|
+
|
|
97
|
+
const runner = document.getElementById("runner");
|
|
98
|
+
runner.addEventListener("llmRequest", async (e) => {
|
|
99
|
+
try {
|
|
100
|
+
const response = await yourLLMFetch(e.detail.prompt);
|
|
101
|
+
e.detail.resolve(response);
|
|
102
|
+
} catch (err) {
|
|
103
|
+
e.detail.reject(err);
|
|
104
|
+
}
|
|
105
|
+
});
|
|
106
|
+
</script>
|
|
123
107
|
```
|
|
124
108
|
|
|
125
|
-
|
|
109
|
+
---
|
|
126
110
|
|
|
127
|
-
|
|
111
|
+
## Connect your LLM
|
|
128
112
|
|
|
129
|
-
The
|
|
130
|
-
This helps prevent exceeding **API rate limits** by spacing out requests automatically.
|
|
113
|
+
The library **never** sends requests to an LLM. You do. When a test runs, the component emits an `llmRequest` event with:
|
|
131
114
|
|
|
132
|
-
|
|
115
|
+
- `prompt` — the question text for this test case
|
|
116
|
+
- `resolve(responseText)` — call this with the model’s reply (string)
|
|
117
|
+
- `reject(error)` — call this if the request fails
|
|
133
118
|
|
|
134
|
-
|
|
135
|
-
| --------- | -------- | ----------- | -------------------------------------------------------------------------------------------------------------------- |
|
|
136
|
-
| `delayMs` | `number` | `undefined` | Optional delay (in milliseconds) between consecutive API calls. If not provided, all API calls are made in parallel. |
|
|
119
|
+
How you get the response is up to you: REST, SDK, or local inference. Same pattern for OpenAI, Gemini, Claude, or any other provider.
|
|
137
120
|
|
|
138
|
-
|
|
139
|
-
<llm-test-runner delay-ms="2000"></llm-test-runner>
|
|
140
|
-
```
|
|
121
|
+
---
|
|
141
122
|
|
|
142
|
-
|
|
123
|
+
## Loading and saving test cases
|
|
143
124
|
|
|
144
|
-
|
|
145
|
-
function App() {
|
|
146
|
-
return (
|
|
147
|
-
<div>
|
|
148
|
-
<llm-test-runner delayMs="1000" />
|
|
149
|
-
</div>
|
|
150
|
-
);
|
|
151
|
-
}
|
|
152
|
-
```
|
|
125
|
+
**Loading** — Pass `initialTestCases` with an array of test cases (e.g. from your backend or a file). You can use the full `TestCase` shape or a minimal one: `question` and `expectedOutcome`. The runner will fill in `id` and run state.
|
|
153
126
|
|
|
154
|
-
|
|
127
|
+
**Saving** — Set `useSave={true}` to show the Save button. When the user clicks it, the component emits a `save` event with `{ timestamp, testCases }`. Persist that in your backend (e.g. Firebase or your API). After the save completes, call `runnerRef.current.resetSavingState()` so the button leaves the loading state. If you don’t call it, a failsafe resets it after 10 seconds.
|
|
155
128
|
|
|
156
|
-
|
|
129
|
+
---
|
|
157
130
|
|
|
158
|
-
|
|
159
|
-
- **Source Link Validation**: Checks for presence of expected URLs in responses
|
|
160
|
-
- **Pass/Fail Logic**: Tests pass only when ALL expected items are found
|
|
161
|
-
- **Detailed Results**: Shows which keywords and links were found/missing
|
|
131
|
+
## Evaluation: pick the right approach
|
|
162
132
|
|
|
163
|
-
|
|
133
|
+
Each expected-outcome field can use a different evaluation method. All of them compare the **expected** text for that field to the **actual** LLM response. A test **passes only if every field** passes with its selected method.
|
|
164
134
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
135
|
+
| Approach | What it measures | Good for | Paraphrasing / synonyms | Speed |
|
|
136
|
+
| --------- | ----------------------------- | --------------------------------------------- | ------------------------ | ------------ |
|
|
137
|
+
| **Exact** | Literal keyword in response | Strict wording, facts, templates | No | Fast |
|
|
138
|
+
| **ROUGE-1** | Word overlap (unigram) | Slight paraphrasing, same key words | Moderate | Fast |
|
|
139
|
+
| **ROUGE-L** | Longest common subsequence | Phrasing and word order matter | Moderate–high | Slightly slower |
|
|
140
|
+
| **Semantic** | Meaning (embeddings + cosine) | Different words, same meaning | Yes | First run loads model |
|
|
141
|
+
| **BLEU** | N-gram precision (1–4) | Translation-like or n-gram overlap | Moderate | Fast |
|
|
168
142
|
|
|
169
|
-
|
|
143
|
+
- Set **per expected-outcome field** via the dropdown in the UI, or via each field’s `evaluationParameters.approach` when you pass `initialTestCases`.
|
|
144
|
+
- **ROUGE, BLEU, and Semantic** use a fixed threshold (0.7).
|
|
145
|
+
- **Semantic** uses in-browser embeddings ([Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2)). The first time you use it, the model is downloaded; later runs are faster.
|
|
170
146
|
|
|
171
|
-
|
|
147
|
+
---
|
|
172
148
|
|
|
173
|
-
|
|
174
|
-
npm install llm-testrunner-components
|
|
175
|
-
```
|
|
149
|
+
## Expected outcome fields
|
|
176
150
|
|
|
177
|
-
|
|
151
|
+
Expected outcomes can be more than a single text block. You can define:
|
|
178
152
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
153
|
+
- **text** — Single line
|
|
154
|
+
- **textarea** — Multi-line
|
|
155
|
+
- **chips-input** — List of keywords (each compared in evaluation)
|
|
156
|
+
- **select** — Dropdown (value must be one of the options)
|
|
182
157
|
|
|
183
|
-
|
|
184
|
-
useEffect(() => {
|
|
185
|
-
defineCustomElements();
|
|
186
|
-
}, []);
|
|
158
|
+
When you pass `initialTestCases`, give each test case's `expectedOutcome` as an array of field objects with `type`, `label`, and `value` (and, for `select`, `options`). For **new** test cases, the runner uses `defaultExpectedOutcomeSchema` if you pass it; otherwise it uses a default single textarea.
|
|
187
159
|
|
|
188
|
-
|
|
189
|
-
try {
|
|
190
|
-
console.log('🚀 callGeminiAPI called with prompt:', event.detail.prompt);
|
|
191
|
-
const requestBody = {
|
|
192
|
-
contents: [
|
|
193
|
-
{
|
|
194
|
-
parts: [
|
|
195
|
-
{
|
|
196
|
-
text: event.detail.prompt,
|
|
197
|
-
},
|
|
198
|
-
],
|
|
199
|
-
},
|
|
200
|
-
],
|
|
201
|
-
};
|
|
202
|
-
|
|
203
|
-
const response = await fetch(
|
|
204
|
-
`https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=your-gemini-api-key-here`,
|
|
205
|
-
{
|
|
206
|
-
method: 'POST',
|
|
207
|
-
headers: {
|
|
208
|
-
'Content-Type': 'application/json',
|
|
209
|
-
},
|
|
210
|
-
body: JSON.stringify(requestBody),
|
|
211
|
-
},
|
|
212
|
-
);
|
|
213
|
-
|
|
214
|
-
if (!response.ok) {
|
|
215
|
-
const errorData = await response.json().catch(() => ({}));
|
|
216
|
-
throw new Error(
|
|
217
|
-
errorData.error?.message || `HTTP error! status: ${response.status}`,
|
|
218
|
-
);
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
const data = await response.json();
|
|
222
|
-
|
|
223
|
-
if (data.candidates && data.candidates[0] && data.candidates[0].content) {
|
|
224
|
-
event.detail.resolve(data.candidates[0].content.parts[0].text);
|
|
225
|
-
} else {
|
|
226
|
-
throw new Error('Unexpected response format from Gemini API');
|
|
227
|
-
}
|
|
228
|
-
} catch (err) {
|
|
229
|
-
event.detail.reject(err instanceof Error ? err : new Error(String(err)));
|
|
230
|
-
}
|
|
231
|
-
};
|
|
160
|
+
---
|
|
232
161
|
|
|
233
|
-
|
|
234
|
-
<div>
|
|
235
|
-
<h1>LLM Test Runner</h1>
|
|
236
|
-
<llm-test-runner llmRequest={handlellmRequest}></llm-test-runner>
|
|
237
|
-
</div>
|
|
238
|
-
);
|
|
239
|
-
}
|
|
240
|
-
```
|
|
162
|
+
## API reference
|
|
241
163
|
|
|
242
|
-
###
|
|
164
|
+
### Props
|
|
243
165
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
```
|
|
166
|
+
| Prop | Attribute | Type | Default | Description |
|
|
167
|
+
|------|-----------|------|---------|-------------|
|
|
168
|
+
| `delayMs` | `delay-ms` | `number` | `500` | Delay (ms) between API calls when running all tests (rate limiting). |
|
|
169
|
+
| `useSave` | `use-save` | `boolean` | `false` | Show Save button and emit `save` events. |
|
|
170
|
+
| `initialTestCases` | — | `TestCase[]` | `undefined` | Preload test cases. See [types](#types) below. |
|
|
171
|
+
| `defaultExpectedOutcomeSchema` | — | `ExpectedOutcomeSchema` | built-in | Schema for new test cases (field types and labels). |
|
|
253
172
|
|
|
254
|
-
|
|
173
|
+
### Events
|
|
255
174
|
|
|
256
|
-
|
|
175
|
+
| Event | Payload | Description |
|
|
176
|
+
|-------|---------|-------------|
|
|
177
|
+
| `llmRequest` | `{ prompt, resolve, reject }` | Runner needs an LLM response. Call `resolve(responseText)` or `reject(error)`. |
|
|
178
|
+
| `save` | `{ timestamp, testCases }` | User clicked Save (only when `useSave` is true). Persist then call `resetSavingState()`. |
|
|
257
179
|
|
|
258
|
-
|
|
259
|
-
interface LLMTestRunnerProps {
|
|
260
|
-
apiKey: string; // Required: Your Gemini API key
|
|
261
|
-
}
|
|
262
|
-
```
|
|
180
|
+
### Methods
|
|
263
181
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
interface TestCase {
|
|
268
|
-
id: string;
|
|
269
|
-
question: string;
|
|
270
|
-
expectedOutcome: string;
|
|
271
|
-
output?: string;
|
|
272
|
-
isRunning?: boolean;
|
|
273
|
-
error?: string;
|
|
274
|
-
evaluationResult?: EvaluationResult;
|
|
275
|
-
}
|
|
276
|
-
```
|
|
182
|
+
| Method | Description |
|
|
183
|
+
|--------|-------------|
|
|
184
|
+
| `resetSavingState()` | Call after you finish persisting a save so the Save button leaves loading state. Use a ref in React. |
|
|
277
185
|
|
|
278
|
-
###
|
|
186
|
+
### Types
|
|
279
187
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
188
|
+
Import from `llm-testrunner-components/react/types`:
|
|
189
|
+
|
|
190
|
+
```ts
|
|
191
|
+
import type {
|
|
192
|
+
TestCase,
|
|
193
|
+
LLMRequestPayload,
|
|
194
|
+
SavePayload,
|
|
195
|
+
ExpectedOutcomeSchema,
|
|
196
|
+
ExpectedOutcomeField,
|
|
197
|
+
EvaluationParameters,
|
|
198
|
+
} from "llm-testrunner-components/react/types";
|
|
288
199
|
```
|
|
289
200
|
|
|
290
|
-
|
|
201
|
+
---
|
|
291
202
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
203
|
+
## Import and export
|
|
204
|
+
|
|
205
|
+
- **Import** — Use the UI to load a JSON file. It must be an array of test cases. Invalid or empty files show an error.
|
|
206
|
+
- **Export test suite** — Downloads a JSON file with the current test cases.
|
|
207
|
+
- **Export results** — Downloads a CSV of the latest run (includes evaluation score).
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Contributing
|
|
212
|
+
|
|
213
|
+
We welcome contributions. See [CONTRIBUTING.md](CONTRIBUTING.md) for how to get started (opening issues, pull request workflow, and code of conduct).
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## License
|
|
218
|
+
|
|
219
|
+
The project is licensed under the [MIT License](LICENSE).
|
|
220
|
+
|
|
221
|
+
Third-party licenses are in `node_modules/<package>/`. This project uses [licensee](https://github.com/jslicense/licensee.js) and the [Blue Oak Council](https://blueoakcouncil.org/list) permissive list; only dependencies with a Blue Oak bronze-or-better license (or an exception in [.licensee.json](.licensee.json)) are allowed. Run `npm run license-check` to verify locally.
|