openevalkit-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openevalkit-0.1.0/PKG-INFO +218 -0
- openevalkit-0.1.0/README.md +206 -0
- openevalkit-0.1.0/pyproject.toml +26 -0
- openevalkit-0.1.0/src/openevalkit/__init__.py +63 -0
- openevalkit-0.1.0/src/openevalkit/__version__.py +1 -0
- openevalkit-0.1.0/src/openevalkit/cache.py +324 -0
- openevalkit-0.1.0/src/openevalkit/config.py +49 -0
- openevalkit-0.1.0/src/openevalkit/dataset.py +189 -0
- openevalkit-0.1.0/src/openevalkit/errors.py +28 -0
- openevalkit-0.1.0/src/openevalkit/evaluate.py +434 -0
- openevalkit-0.1.0/src/openevalkit/judges/__init__.py +19 -0
- openevalkit-0.1.0/src/openevalkit/judges/base.py +159 -0
- openevalkit-0.1.0/src/openevalkit/judges/ensemble.py +371 -0
- openevalkit-0.1.0/src/openevalkit/judges/llm_config.py +85 -0
- openevalkit-0.1.0/src/openevalkit/judges/llm_judge.py +196 -0
- openevalkit-0.1.0/src/openevalkit/judges/prompt.py +119 -0
- openevalkit-0.1.0/src/openevalkit/judges/rubric.py +129 -0
- openevalkit-0.1.0/src/openevalkit/judgment.py +53 -0
- openevalkit-0.1.0/src/openevalkit/py.typed +0 -0
- openevalkit-0.1.0/src/openevalkit/result.py +62 -0
- openevalkit-0.1.0/src/openevalkit/run.py +30 -0
- openevalkit-0.1.0/src/openevalkit/score.py +25 -0
- openevalkit-0.1.0/src/openevalkit/scorers/__init__.py +18 -0
- openevalkit-0.1.0/src/openevalkit/scorers/base.py +73 -0
- openevalkit-0.1.0/src/openevalkit/scorers/performance.py +148 -0
- openevalkit-0.1.0/src/openevalkit/scorers/reference.py +61 -0
- openevalkit-0.1.0/src/openevalkit/scorers/rule.py +187 -0
- openevalkit-0.1.0/src/openevalkit/utils/__init__.py +0 -0

openevalkit-0.1.0/PKG-INFO
@@ -0,0 +1,218 @@
Metadata-Version: 2.3
Name: openevalkit
Version: 0.1.0
Summary: Open evaluation kit for LLM systems
Keywords: llm,agent,evaluation,judge,nlp,ml,evals,generative
License: MIT
Requires-Dist: litellm>=1.81.9
Requires-Dist: numpy>=2.0.2
Requires-Dist: pandas>=2.3.3
Requires-Python: >=3.9
Description-Content-Type: text/markdown

# OpenEvalKit

**Universal evaluation framework for LLM systems**

[](https://pypi.org/project/openevalkit/)
[](https://pypi.org/project/openevalkit/)

OpenEvalKit is a production-grade framework for evaluating LLM systems with traditional metrics, LLM-as-a-judge, and ensemble evaluation.

## Features

- 📊 **Traditional Scorers** - ExactMatch, Latency, Cost, TokenCount, RegexMatch, JSONValid, ContainsKeywords
- 🤖 **LLM Judges** - Use any LLM (OpenAI, Anthropic, Ollama, 100+ models) to evaluate quality
- 🎯 **Ensemble Judges** - Combine multiple judges for more reliable evaluation
- 💾 **Smart Caching** - Automatic caching with LRU eviction (saves API costs)
- ⚡ **Parallel Execution** - Fast evaluation with configurable concurrency
- 🔧 **Flexible** - Custom scorers, judges, and rubrics

## Installation
```bash
pip install openevalkit
```

## Quick Start

### Loading Datasets
```python
from openevalkit import Dataset

# From JSONL
dataset = Dataset.from_jsonl(
    "data.jsonl",
    input_field="question",
    output_field="answer",
    reference_field="expected"
)

# From CSV
dataset = Dataset.from_csv(
    "data.csv",
    input_col="question",
    output_col="answer",
    reference_col="expected",
    metadata_cols=["user_id"],
    metrics_cols=["latency"]
)

# From list
from openevalkit import Run
dataset = Dataset([
    Run(id="1", input="What is 2+2?", output="4", reference="4"),
    Run(id="2", input="What is 3+3?", output="6", reference="6"),
])
```

### Evaluate with Traditional Scorers
```python
from openevalkit import evaluate
from openevalkit.scorers import ExactMatch, RegexMatch, JSONValid, ContainsKeywords

# Exact match
results = evaluate(dataset, scorers=[ExactMatch()])
print(results.aggregates)
# {'exact_match': 1.0}

# Regex pattern matching
scorer = RegexMatch(pattern=r'\d+')  # Check if output contains numbers
results = evaluate(dataset, scorers=[scorer])

# JSON validation
json_scorer = JSONValid()
results = evaluate(dataset, scorers=[json_scorer])

# Keyword detection
keyword_scorer = ContainsKeywords(keywords=["python", "code"], ignore_case=True)
results = evaluate(dataset, scorers=[keyword_scorer])
```

### Evaluate with LLM Judge
```python
from openevalkit.judges import LLMJudge, LLMConfig, Rubric

# Create dataset
dataset = Dataset([
    {"input": "Explain Python", "output": "Python is a programming language..."},
])

# Create rubric
rubric = Rubric(
    criteria=["helpfulness", "accuracy", "clarity"],
    scale="0-1",
    weights={"helpfulness": 2.0, "accuracy": 3.0, "clarity": 1.0}
)

# Create judge
judge = LLMJudge(
    llm_config=LLMConfig(model="gpt-4o"),
    rubric=rubric
)

# Evaluate
results = evaluate(dataset, judges=[judge])
print(results.aggregates)
# {'llm_judge_gpt-4o_score': 0.85, 'llm_judge_gpt-4o_helpfulness': 0.9, ...}
```

### Ensemble Evaluation (Multiple Judges)
```python
from openevalkit.judges import EnsembleJudge

# Combine multiple judges for more reliable evaluation
ensemble = EnsembleJudge(
    judges=[
        LLMJudge(LLMConfig(model="gpt-4o"), rubric),
        LLMJudge(LLMConfig(model="claude-3-5-sonnet-20241022"), rubric),
        LLMJudge(LLMConfig(model="gpt-4o-mini"), rubric),
    ],
    method="average",  # or "median", "majority_vote", "unanimous"
    n_jobs=3  # Parallel execution
)

results = evaluate(dataset, judges=[ensemble])
```

## Configuration
```python
from openevalkit import EvalConfig

config = EvalConfig(
    concurrency=10,  # Parallel runs
    cache_enabled=True,  # Cache results (saves API costs)
    cache_max_size_mb=500,  # Cache size limit
    timeout=30.0,  # Timeout per run
    seed=42,  # Reproducible results
    verbose=True,  # Show progress
)

results = evaluate(dataset, judges=[judge], config=config)
```

## Built-in Scorers

### String Matching
- **ExactMatch** - Exact string comparison with reference
- **RegexMatch** - Pattern matching with regex
- **ContainsKeywords** - Check for required keywords

### Structure Validation
- **JSONValid** - Validate JSON output

### Performance Metrics
- **Latency** - Response time from run.metrics
- **Cost** - API cost from run.metrics
- **TokenCount** - Token usage (exact or estimated)
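
The performance scorers read values attached to each run rather than computed from the output text. A minimal sketch, assuming that `Latency`, `Cost`, and `TokenCount` are exported from `openevalkit.scorers`, take no required arguments, and look up matching keys in `run.metrics` (the `metrics=` keyword on `Run` is likewise an assumption here, mirroring `metrics_cols` in `Dataset.from_csv`):

```python
from openevalkit import Dataset, Run, evaluate
from openevalkit.scorers import Latency, Cost, TokenCount  # assumed exports

# Runs with performance metrics attached (metrics= is assumed, mirroring metrics_cols)
dataset = Dataset([
    Run(id="1", input="What is 2+2?", output="4",
        metrics={"latency": 0.42, "cost": 0.0003}),
    Run(id="2", input="What is 3+3?", output="6",
        metrics={"latency": 0.57, "cost": 0.0004}),
])

results = evaluate(dataset, scorers=[Latency(), Cost(), TokenCount()])
print(results.aggregates)  # aggregate latency / cost / token usage across runs
```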

## Supported Models

Via [LiteLLM](https://github.com/BerriAI/litellm), supports 100+ models:

- **OpenAI**: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo
- **Anthropic**: claude-3-5-sonnet, claude-3-opus, claude-3-haiku
- **Google**: gemini-pro, gemini-1.5-pro
- **Ollama**: llama3, mistral, phi (local models)
- **Cohere, Replicate, HuggingFace, and more**
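
Judges address models through LiteLLM model strings, so switching providers is usually just a different `model` value. A sketch under that assumption, using LiteLLM's provider-prefixed naming (e.g. `ollama/...` for local models) and expecting the relevant API keys to be set in the environment:

```python
from openevalkit import Dataset, evaluate
from openevalkit.judges import LLMJudge, LLMConfig, Rubric

dataset = Dataset([
    {"input": "Explain Python", "output": "Python is a programming language..."},
])
rubric = Rubric(criteria=["helpfulness"], scale="0-1")

# Hosted models (OPENAI_API_KEY / ANTHROPIC_API_KEY expected in the environment)
openai_judge = LLMJudge(LLMConfig(model="gpt-4o-mini"), rubric)
anthropic_judge = LLMJudge(LLMConfig(model="claude-3-5-sonnet-20241022"), rubric)

# Local model served by Ollama, via LiteLLM's "ollama/<model>" naming
local_judge = LLMJudge(LLMConfig(model="ollama/llama3"), rubric)

results = evaluate(dataset, judges=[openai_judge, anthropic_judge, local_judge])
print(results.aggregates)
```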

## Custom Scorers
```python
from openevalkit.scorers.base import Scorer
from openevalkit import Score

class ContainsWord(Scorer):
    name = "contains_word"
    requires_reference = False

    def __init__(self, word: str):
        self.word = word

    def score(self, run):
        has_word = self.word.lower() in run.output.lower()
        return Score(
            value=1.0 if has_word else 0.0,
            reason=f"Word '{self.word}' {'found' if has_word else 'not found'}"
        )

results = evaluate(dataset, scorers=[ContainsWord("Python")])
```

## Why OpenEvalKit?

- **Production Ready**: Smart caching, parallel execution, error handling
- **Cost Effective**: Cache LLM judgments to avoid redundant API calls
- **Flexible**: Works with any LLM provider via LiteLLM
- **Reliable**: Ensemble judges with configurable aggregation
- **Simple**: Clean API, comprehensive documentation

## Documentation

Coming soon! For now, see examples above and docstrings.

## License

MIT

## Contributing

Contributions welcome! Please open an issue or PR.

openevalkit-0.1.0/README.md
@@ -0,0 +1,206 @@
# OpenEvalKit

**Universal evaluation framework for LLM systems**

[](https://pypi.org/project/openevalkit/)
[](https://pypi.org/project/openevalkit/)

OpenEvalKit is a production-grade framework for evaluating LLM systems with traditional metrics, LLM-as-a-judge, and ensemble evaluation.

## Features

- 📊 **Traditional Scorers** - ExactMatch, Latency, Cost, TokenCount, RegexMatch, JSONValid, ContainsKeywords
- 🤖 **LLM Judges** - Use any LLM (OpenAI, Anthropic, Ollama, 100+ models) to evaluate quality
- 🎯 **Ensemble Judges** - Combine multiple judges for more reliable evaluation
- 💾 **Smart Caching** - Automatic caching with LRU eviction (saves API costs)
- ⚡ **Parallel Execution** - Fast evaluation with configurable concurrency
- 🔧 **Flexible** - Custom scorers, judges, and rubrics

## Installation
```bash
pip install openevalkit
```

## Quick Start

### Loading Datasets
```python
from openevalkit import Dataset

# From JSONL
dataset = Dataset.from_jsonl(
    "data.jsonl",
    input_field="question",
    output_field="answer",
    reference_field="expected"
)

# From CSV
dataset = Dataset.from_csv(
    "data.csv",
    input_col="question",
    output_col="answer",
    reference_col="expected",
    metadata_cols=["user_id"],
    metrics_cols=["latency"]
)

# From list
from openevalkit import Run
dataset = Dataset([
    Run(id="1", input="What is 2+2?", output="4", reference="4"),
    Run(id="2", input="What is 3+3?", output="6", reference="6"),
])
```

### Evaluate with Traditional Scorers
```python
from openevalkit import evaluate
from openevalkit.scorers import ExactMatch, RegexMatch, JSONValid, ContainsKeywords

# Exact match
results = evaluate(dataset, scorers=[ExactMatch()])
print(results.aggregates)
# {'exact_match': 1.0}

# Regex pattern matching
scorer = RegexMatch(pattern=r'\d+')  # Check if output contains numbers
results = evaluate(dataset, scorers=[scorer])

# JSON validation
json_scorer = JSONValid()
results = evaluate(dataset, scorers=[json_scorer])

# Keyword detection
keyword_scorer = ContainsKeywords(keywords=["python", "code"], ignore_case=True)
results = evaluate(dataset, scorers=[keyword_scorer])
```

### Evaluate with LLM Judge
```python
from openevalkit.judges import LLMJudge, LLMConfig, Rubric

# Create dataset
dataset = Dataset([
    {"input": "Explain Python", "output": "Python is a programming language..."},
])

# Create rubric
rubric = Rubric(
    criteria=["helpfulness", "accuracy", "clarity"],
    scale="0-1",
    weights={"helpfulness": 2.0, "accuracy": 3.0, "clarity": 1.0}
)

# Create judge
judge = LLMJudge(
    llm_config=LLMConfig(model="gpt-4o"),
    rubric=rubric
)

# Evaluate
results = evaluate(dataset, judges=[judge])
print(results.aggregates)
# {'llm_judge_gpt-4o_score': 0.85, 'llm_judge_gpt-4o_helpfulness': 0.9, ...}
```

### Ensemble Evaluation (Multiple Judges)
```python
from openevalkit.judges import EnsembleJudge

# Combine multiple judges for more reliable evaluation
ensemble = EnsembleJudge(
    judges=[
        LLMJudge(LLMConfig(model="gpt-4o"), rubric),
        LLMJudge(LLMConfig(model="claude-3-5-sonnet-20241022"), rubric),
        LLMJudge(LLMConfig(model="gpt-4o-mini"), rubric),
    ],
    method="average",  # or "median", "majority_vote", "unanimous"
    n_jobs=3  # Parallel execution
)

results = evaluate(dataset, judges=[ensemble])
```

## Configuration
```python
from openevalkit import EvalConfig

config = EvalConfig(
    concurrency=10,  # Parallel runs
    cache_enabled=True,  # Cache results (saves API costs)
    cache_max_size_mb=500,  # Cache size limit
    timeout=30.0,  # Timeout per run
    seed=42,  # Reproducible results
    verbose=True,  # Show progress
)

results = evaluate(dataset, judges=[judge], config=config)
```

## Built-in Scorers

### String Matching
- **ExactMatch** - Exact string comparison with reference
- **RegexMatch** - Pattern matching with regex
- **ContainsKeywords** - Check for required keywords

### Structure Validation
- **JSONValid** - Validate JSON output

### Performance Metrics
- **Latency** - Response time from run.metrics
- **Cost** - API cost from run.metrics
- **TokenCount** - Token usage (exact or estimated)
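
The performance scorers read values attached to each run rather than computed from the output text. A minimal sketch, assuming that `Latency`, `Cost`, and `TokenCount` are exported from `openevalkit.scorers`, take no required arguments, and look up matching keys in `run.metrics` (the `metrics=` keyword on `Run` is likewise an assumption here, mirroring `metrics_cols` in `Dataset.from_csv`):

```python
from openevalkit import Dataset, Run, evaluate
from openevalkit.scorers import Latency, Cost, TokenCount  # assumed exports

# Runs with performance metrics attached (metrics= is assumed, mirroring metrics_cols)
dataset = Dataset([
    Run(id="1", input="What is 2+2?", output="4",
        metrics={"latency": 0.42, "cost": 0.0003}),
    Run(id="2", input="What is 3+3?", output="6",
        metrics={"latency": 0.57, "cost": 0.0004}),
])

results = evaluate(dataset, scorers=[Latency(), Cost(), TokenCount()])
print(results.aggregates)  # aggregate latency / cost / token usage across runs
```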

## Supported Models

Via [LiteLLM](https://github.com/BerriAI/litellm), supports 100+ models:

- **OpenAI**: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo
- **Anthropic**: claude-3-5-sonnet, claude-3-opus, claude-3-haiku
- **Google**: gemini-pro, gemini-1.5-pro
- **Ollama**: llama3, mistral, phi (local models)
- **Cohere, Replicate, HuggingFace, and more**
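
Judges address models through LiteLLM model strings, so switching providers is usually just a different `model` value. A sketch under that assumption, using LiteLLM's provider-prefixed naming (e.g. `ollama/...` for local models) and expecting the relevant API keys to be set in the environment:

```python
from openevalkit import Dataset, evaluate
from openevalkit.judges import LLMJudge, LLMConfig, Rubric

dataset = Dataset([
    {"input": "Explain Python", "output": "Python is a programming language..."},
])
rubric = Rubric(criteria=["helpfulness"], scale="0-1")

# Hosted models (OPENAI_API_KEY / ANTHROPIC_API_KEY expected in the environment)
openai_judge = LLMJudge(LLMConfig(model="gpt-4o-mini"), rubric)
anthropic_judge = LLMJudge(LLMConfig(model="claude-3-5-sonnet-20241022"), rubric)

# Local model served by Ollama, via LiteLLM's "ollama/<model>" naming
local_judge = LLMJudge(LLMConfig(model="ollama/llama3"), rubric)

results = evaluate(dataset, judges=[openai_judge, anthropic_judge, local_judge])
print(results.aggregates)
```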

## Custom Scorers
```python
from openevalkit.scorers.base import Scorer
from openevalkit import Score

class ContainsWord(Scorer):
    name = "contains_word"
    requires_reference = False

    def __init__(self, word: str):
        self.word = word

    def score(self, run):
        has_word = self.word.lower() in run.output.lower()
        return Score(
            value=1.0 if has_word else 0.0,
            reason=f"Word '{self.word}' {'found' if has_word else 'not found'}"
        )

results = evaluate(dataset, scorers=[ContainsWord("Python")])
```

## Why OpenEvalKit?

- **Production Ready**: Smart caching, parallel execution, error handling
- **Cost Effective**: Cache LLM judgments to avoid redundant API calls
- **Flexible**: Works with any LLM provider via LiteLLM
- **Reliable**: Ensemble judges with configurable aggregation
- **Simple**: Clean API, comprehensive documentation

## Documentation

Coming soon! For now, see examples above and docstrings.

## License

MIT

## Contributing

Contributions welcome! Please open an issue or PR.

openevalkit-0.1.0/pyproject.toml
@@ -0,0 +1,26 @@
[project]
name = "openevalkit"
version = "0.1.0"
description = "Open evaluation kit for LLM systems"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
keywords = ["llm", "agent", "evaluation", "judge", "nlp", "ml", "evals", "generative"]
dependencies = [
    "litellm>=1.81.9",
    "numpy>=2.0.2",
    "pandas>=2.3.3",
]

[build-system]
requires = ["uv_build>=0.10.0,<0.11.0"]
build-backend = "uv_build"

[dependency-groups]
dev = [
    "black>=25.11.0",
    "mypy>=1.19.1",
    "pytest>=8.4.2",
    "pytest-cov>=7.0.0",
    "ruff>=0.15.0",
]

openevalkit-0.1.0/src/openevalkit/__init__.py
@@ -0,0 +1,63 @@
"""
|
|
2
|
+
OpenEvalKit: Universal evaluation framework for LLM systems.
|
|
3
|
+
|
|
4
|
+
Examples:
|
|
5
|
+
>>> from openevalkit import Dataset, evaluate
|
|
6
|
+
>>> from openevalkit.scorers import ExactMatch
|
|
7
|
+
>>> from openevalkit.judges import LLMJudge, LLMConfig, Rubric
|
|
8
|
+
>>>
|
|
9
|
+
>>> # Evaluate with scorers
|
|
10
|
+
>>> dataset = Dataset.from_jsonl("data.jsonl")
|
|
11
|
+
>>> results = evaluate(dataset, scorers=[ExactMatch()])
|
|
12
|
+
>>> print(results.aggregates)
|
|
13
|
+
>>>
|
|
14
|
+
>>> # Evaluate with judges
|
|
15
|
+
>>> rubric = Rubric(criteria=["helpfulness"], scale="0-1")
|
|
16
|
+
>>> judge = LLMJudge(LLMConfig(model="gpt-4o"), rubric)
|
|
17
|
+
>>> results = evaluate(dataset, judges=[judge])
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
# Package version
|
|
21
|
+
from openevalkit.__version__ import __version__
|
|
22
|
+
|
|
23
|
+
# Core classes
|
|
24
|
+
from openevalkit.run import Run
|
|
25
|
+
from openevalkit.dataset import Dataset
|
|
26
|
+
from openevalkit.score import Score
|
|
27
|
+
from openevalkit.judgment import Judgment
|
|
28
|
+
from openevalkit.config import EvalConfig
|
|
29
|
+
|
|
30
|
+
# Main function
from openevalkit.evaluate import evaluate  # the evaluate() function, not the submodule

# Results
from openevalkit.result import EvaluationResult

# Cache (optional - for advanced users)
from openevalkit.cache import Cache

# Make submodules available
from openevalkit import scorers
from openevalkit import judges
from openevalkit import errors

__all__ = [
    # Version
    "__version__",
    # Core
    "Run",
    "Dataset",
    "Score",
    "Judgment",
    "EvalConfig",
    # Functions
    "evaluate",
    # Results
    "EvaluationResult",
    # Cache
    "Cache",
    # Submodules
    "scorers",
    "judges",
    "errors",
]

openevalkit-0.1.0/src/openevalkit/__version__.py
@@ -0,0 +1 @@
__version__ = "0.1.0"