@assay-ai/core 0.1.0-beta → 0.2.1-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -0
- package/package.json +1 -2
package/README.md
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# @assay-ai/core
|
|
2
|
+
|
|
3
|
+
The core evaluation engine for [Assay](https://github.com/assay-ai/assay) -- the TypeScript-native LLM evaluation framework.
|
|
4
|
+
|
|
5
|
+
[](https://www.npmjs.com/package/@assay-ai/core)
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
npm install @assay-ai/core
|
|
12
|
+
# or
|
|
13
|
+
pnpm add @assay-ai/core
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Quick Start
|
|
17
|
+
|
|
18
|
+
```typescript
|
|
19
|
+
import {
|
|
20
|
+
AnswerRelevancyMetric,
|
|
21
|
+
evaluate,
|
|
22
|
+
FaithfulnessMetric,
|
|
23
|
+
HallucinationMetric,
|
|
24
|
+
} from "@assay-ai/core";
|
|
25
|
+
|
|
26
|
+
const results = await evaluate(
|
|
27
|
+
[
|
|
28
|
+
{
|
|
29
|
+
input: "What is the capital of France?",
|
|
30
|
+
actualOutput: "The capital of France is Paris.",
|
|
31
|
+
context: ["France is a country in Europe. Its capital is Paris."],
|
|
32
|
+
},
|
|
33
|
+
],
|
|
34
|
+
[
|
|
35
|
+
new FaithfulnessMetric({ threshold: 0.7 }),
|
|
36
|
+
new AnswerRelevancyMetric({ threshold: 0.7 }),
|
|
37
|
+
new HallucinationMetric({ threshold: 0.3 }),
|
|
38
|
+
],
|
|
39
|
+
);
|
|
40
|
+
|
|
41
|
+
console.log(`Pass rate: ${results.summary.passRate.toFixed(1)}%`);
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Metrics
|
|
45
|
+
|
|
46
|
+
Assay ships with 12 evaluation metrics out of the box:
|
|
47
|
+
|
|
48
|
+
| Metric | Description | Required Fields |
|
|
49
|
+
|--------|-------------|-----------------|
|
|
50
|
+
| `AnswerRelevancyMetric` | Measures how relevant the output is to the input | `input`, `actualOutput` |
|
|
51
|
+
| `FaithfulnessMetric` | Measures whether the output is grounded in context | `input`, `actualOutput`, `retrievalContext` |
|
|
52
|
+
| `HallucinationMetric` | Detects claims not supported by context | `input`, `actualOutput`, `context` |
|
|
53
|
+
| `ContextualPrecisionMetric` | Measures whether relevant context items are ranked higher | `input`, `expectedOutput`, `retrievalContext` |
|
|
54
|
+
| `ContextualRecallMetric` | Measures whether all relevant information is retrieved | `input`, `expectedOutput`, `retrievalContext` |
|
|
55
|
+
| `ContextualRelevancyMetric` | Measures whether retrieved context is relevant | `input`, `actualOutput`, `retrievalContext` |
|
|
56
|
+
| `BiasMetric` | Detects demographic or ideological bias | `input`, `actualOutput` |
|
|
57
|
+
| `ToxicityMetric` | Detects toxic or harmful content | `input`, `actualOutput` |
|
|
58
|
+
| `GEval` | Custom LLM-as-judge with user-defined criteria | `input`, `actualOutput` |
|
|
59
|
+
| `SummarizationMetric` | Evaluates summary quality | `input`, `actualOutput` |
|
|
60
|
+
| `ExactMatchMetric` | Exact string comparison (no LLM needed) | `actualOutput`, `expectedOutput` |
|
|
61
|
+
| `JsonCorrectnessMetric` | Validates JSON structure (no LLM needed) | `actualOutput` |
|
|
62
|
+
|
|
63
|
+
## Configuration
|
|
64
|
+
|
|
65
|
+
### Provider
|
|
66
|
+
|
|
67
|
+
Assay auto-detects your LLM provider from environment variables:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# OpenAI (default)
|
|
71
|
+
export OPENAI_API_KEY="sk-..."
|
|
72
|
+
|
|
73
|
+
# Anthropic
|
|
74
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Metric Options
|
|
78
|
+
|
|
79
|
+
Every metric accepts optional configuration:
|
|
80
|
+
|
|
81
|
+
```typescript
|
|
82
|
+
new FaithfulnessMetric({
|
|
83
|
+
threshold: 0.7, // Minimum score to pass (default: 0.5)
|
|
84
|
+
model: "gpt-4o-mini", // LLM model for evaluation
|
|
85
|
+
verbose: true, // Log detailed reasoning
|
|
86
|
+
});
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Custom Metrics with GEval
|
|
90
|
+
|
|
91
|
+
Define any evaluation criteria in plain English:
|
|
92
|
+
|
|
93
|
+
```typescript
|
|
94
|
+
import { GEval } from "@assay-ai/core";
|
|
95
|
+
|
|
96
|
+
const politeness = new GEval({
|
|
97
|
+
name: "Politeness",
|
|
98
|
+
criteria: "The response should be polite and professional.",
|
|
99
|
+
evaluationSteps: [
|
|
100
|
+
"Check if the response uses polite phrases",
|
|
101
|
+
"Verify the tone is respectful",
|
|
102
|
+
],
|
|
103
|
+
});
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Exports
|
|
107
|
+
|
|
108
|
+
This package exports:
|
|
109
|
+
|
|
110
|
+
- **Metrics**: `AnswerRelevancyMetric`, `FaithfulnessMetric`, `HallucinationMetric`, `ContextualPrecisionMetric`, `ContextualRecallMetric`, `ContextualRelevancyMetric`, `BiasMetric`, `ToxicityMetric`, `GEval`, `SummarizationMetric`, `ExactMatchMetric`, `JsonCorrectnessMetric`
|
|
111
|
+
- **Evaluation**: `evaluate`, `assertEval`
|
|
112
|
+
- **Providers**: `BaseLLMProvider`, `OpenAIProvider`, `AnthropicProvider`, `OllamaProvider`, `resolveProvider`
|
|
113
|
+
- **Utilities**: `parseJson`, `tryParseJson`, `createLimiter`, `ConsoleReporter`
|
|
114
|
+
- **Types**: `LLMTestCase`, `MetricResult`, `MetricConfig`, `EvaluateConfig`, `EvaluateResult`, `EvaluationDataset`
|
|
115
|
+
|
|
116
|
+
## License
|
|
117
|
+
|
|
118
|
+
[MIT](https://github.com/assay-ai/assay/blob/main/LICENSE)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@assay-ai/core",
|
|
3
|
-
"version": "0.1.0-beta",
|
|
3
|
+
"version": "0.2.1-beta",
|
|
4
4
|
"description": "Core evaluation engine for the Assay LLM evaluation framework",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -60,7 +60,6 @@
|
|
|
60
60
|
"build": "tsup",
|
|
61
61
|
"dev": "tsup --watch",
|
|
62
62
|
"typecheck": "tsc --noEmit",
|
|
63
|
-
"test": "vitest run",
|
|
64
63
|
"clean": "rm -rf dist .turbo"
|
|
65
64
|
}
|
|
66
65
|
}
|