openevalkit-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. openevalkit-0.1.0/PKG-INFO +218 -0
  2. openevalkit-0.1.0/README.md +206 -0
  3. openevalkit-0.1.0/pyproject.toml +26 -0
  4. openevalkit-0.1.0/src/openevalkit/__init__.py +63 -0
  5. openevalkit-0.1.0/src/openevalkit/__version__.py +1 -0
  6. openevalkit-0.1.0/src/openevalkit/cache.py +324 -0
  7. openevalkit-0.1.0/src/openevalkit/config.py +49 -0
  8. openevalkit-0.1.0/src/openevalkit/dataset.py +189 -0
  9. openevalkit-0.1.0/src/openevalkit/errors.py +28 -0
  10. openevalkit-0.1.0/src/openevalkit/evaluate.py +434 -0
  11. openevalkit-0.1.0/src/openevalkit/judges/__init__.py +19 -0
  12. openevalkit-0.1.0/src/openevalkit/judges/base.py +159 -0
  13. openevalkit-0.1.0/src/openevalkit/judges/ensemble.py +371 -0
  14. openevalkit-0.1.0/src/openevalkit/judges/llm_config.py +85 -0
  15. openevalkit-0.1.0/src/openevalkit/judges/llm_judge.py +196 -0
  16. openevalkit-0.1.0/src/openevalkit/judges/prompt.py +119 -0
  17. openevalkit-0.1.0/src/openevalkit/judges/rubric.py +129 -0
  18. openevalkit-0.1.0/src/openevalkit/judgment.py +53 -0
  19. openevalkit-0.1.0/src/openevalkit/py.typed +0 -0
  20. openevalkit-0.1.0/src/openevalkit/result.py +62 -0
  21. openevalkit-0.1.0/src/openevalkit/run.py +30 -0
  22. openevalkit-0.1.0/src/openevalkit/score.py +25 -0
  23. openevalkit-0.1.0/src/openevalkit/scorers/__init__.py +18 -0
  24. openevalkit-0.1.0/src/openevalkit/scorers/base.py +73 -0
  25. openevalkit-0.1.0/src/openevalkit/scorers/performance.py +148 -0
  26. openevalkit-0.1.0/src/openevalkit/scorers/reference.py +61 -0
  27. openevalkit-0.1.0/src/openevalkit/scorers/rule.py +187 -0
  28. openevalkit-0.1.0/src/openevalkit/utils/__init__.py +0 -0
@@ -0,0 +1,218 @@
Metadata-Version: 2.3
Name: openevalkit
Version: 0.1.0
Summary: Open evaluation kit for LLM systems
Keywords: llm,agent,evaluation,judge,nlp,ml,evals,generative
License: MIT
Requires-Dist: litellm>=1.81.9
Requires-Dist: numpy>=2.0.2
Requires-Dist: pandas>=2.3.3
Requires-Python: >=3.9
Description-Content-Type: text/markdown

# OpenEvalKit

**Universal evaluation framework for LLM systems**

[![PyPI](https://img.shields.io/pypi/v/openevalkit.svg)](https://pypi.org/project/openevalkit/)
[![Python](https://img.shields.io/pypi/pyversions/openevalkit.svg)](https://pypi.org/project/openevalkit/)

OpenEvalKit is a production-grade framework for evaluating LLM systems with traditional metrics, LLM-as-a-judge scoring, and ensemble evaluation.

## Features

- 📊 **Traditional Scorers** - ExactMatch, Latency, Cost, TokenCount, RegexMatch, JSONValid, ContainsKeywords
- 🤖 **LLM Judges** - Use any LLM (OpenAI, Anthropic, Ollama, 100+ models) to evaluate quality
- 🎯 **Ensemble Judges** - Combine multiple judges for more reliable evaluation
- 💾 **Smart Caching** - Automatic caching with LRU eviction (saves API costs)
- ⚡ **Parallel Execution** - Fast evaluation with configurable concurrency
- 🔧 **Flexible** - Custom scorers, judges, and rubrics

## Installation

```bash
pip install openevalkit
```

## Quick Start

### Loading Datasets

```python
from openevalkit import Dataset

# From JSONL
dataset = Dataset.from_jsonl(
    "data.jsonl",
    input_field="question",
    output_field="answer",
    reference_field="expected",
)

# From CSV
dataset = Dataset.from_csv(
    "data.csv",
    input_col="question",
    output_col="answer",
    reference_col="expected",
    metadata_cols=["user_id"],
    metrics_cols=["latency"],
)

# From a list of runs
from openevalkit import Run

dataset = Dataset([
    Run(id="1", input="What is 2+2?", output="4", reference="4"),
    Run(id="2", input="What is 3+3?", output="6", reference="6"),
])
```
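
Each line of `data.jsonl` is then expected to be a JSON object keyed by the configured fields, e.g. `{"question": "What is 2+2?", "answer": "4", "expected": "4"}` (a sample line inferred from the field names above).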

### Evaluate with Traditional Scorers

```python
from openevalkit import evaluate
from openevalkit.scorers import ExactMatch, RegexMatch, JSONValid, ContainsKeywords

# Exact match
results = evaluate(dataset, scorers=[ExactMatch()])
print(results.aggregates)
# {'exact_match': 1.0}

# Regex pattern matching
scorer = RegexMatch(pattern=r'\d+')  # check whether the output contains a number
results = evaluate(dataset, scorers=[scorer])

# JSON validation
json_scorer = JSONValid()
results = evaluate(dataset, scorers=[json_scorer])

# Keyword detection
keyword_scorer = ContainsKeywords(keywords=["python", "code"], ignore_case=True)
results = evaluate(dataset, scorers=[keyword_scorer])
```

### Evaluate with an LLM Judge

```python
from openevalkit.judges import LLMJudge, LLMConfig, Rubric

# Create a dataset
dataset = Dataset([
    {"input": "Explain Python", "output": "Python is a programming language..."},
])

# Create a rubric
rubric = Rubric(
    criteria=["helpfulness", "accuracy", "clarity"],
    scale="0-1",
    weights={"helpfulness": 2.0, "accuracy": 3.0, "clarity": 1.0},
)

# Create a judge
judge = LLMJudge(
    llm_config=LLMConfig(model="gpt-4o"),
    rubric=rubric,
)

# Evaluate
results = evaluate(dataset, judges=[judge])
print(results.aggregates)
# {'llm_judge_gpt-4o_score': 0.85, 'llm_judge_gpt-4o_helpfulness': 0.9, ...}
```
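
With the weights above, the overall score is presumably the weighted mean of the per-criterion scores, `overall = Σ(wᵢ · sᵢ) / Σ(wᵢ)`. A quick sanity check against the sample output (the per-criterion values below are illustrative; only the weights come from the rubric):

```python
# Weighted mean over rubric criteria. The criterion scores are made up to
# match the sample aggregate above; only the weights come from the Rubric.
scores  = {"helpfulness": 0.9, "accuracy": 0.8, "clarity": 0.9}
weights = {"helpfulness": 2.0, "accuracy": 3.0, "clarity": 1.0}
overall = sum(weights[c] * scores[c] for c in scores) / sum(weights.values())
print(round(overall, 2))  # (1.8 + 2.4 + 0.9) / 6.0 = 0.85
```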

### Ensemble Evaluation (Multiple Judges)

```python
from openevalkit.judges import EnsembleJudge

# Combine multiple judges for more reliable evaluation
ensemble = EnsembleJudge(
    judges=[
        LLMJudge(LLMConfig(model="gpt-4o"), rubric),
        LLMJudge(LLMConfig(model="claude-3-5-sonnet-20241022"), rubric),
        LLMJudge(LLMConfig(model="gpt-4o-mini"), rubric),
    ],
    method="average",  # or "median", "majority_vote", "unanimous"
    n_jobs=3,          # parallel execution
)

results = evaluate(dataset, judges=[ensemble])
```
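
The `method` argument controls how per-judge scores are combined. A minimal sketch of plausible semantics (the real `EnsembleJudge` may differ; the 0.5 pass threshold for the voting modes is an assumption):

```python
import statistics

def aggregate(scores: list[float], method: str, threshold: float = 0.5) -> float:
    """Combine one score per judge into a single ensemble score."""
    if method == "average":
        return statistics.mean(scores)
    if method == "median":
        return statistics.median(scores)
    if method == "majority_vote":
        # pass (1.0) when more than half of the judges score above the threshold
        return 1.0 if sum(s > threshold for s in scores) > len(scores) / 2 else 0.0
    if method == "unanimous":
        # pass only when every judge scores above the threshold
        return 1.0 if all(s > threshold for s in scores) else 0.0
    raise ValueError(f"unknown aggregation method: {method}")

print(aggregate([0.9, 0.8, 0.6], "median"))  # 0.8
```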

## Configuration

```python
from openevalkit import EvalConfig

config = EvalConfig(
    concurrency=10,         # parallel runs
    cache_enabled=True,     # cache results (saves API costs)
    cache_max_size_mb=500,  # cache size limit
    timeout=30.0,           # timeout per run
    seed=42,                # reproducible results
    verbose=True,           # show progress
)

results = evaluate(dataset, judges=[judge], config=config)
```

## Built-in Scorers

### String Matching

- **ExactMatch** - Exact string comparison with the reference
- **RegexMatch** - Pattern matching with a regex
- **ContainsKeywords** - Check for required keywords

### Structure Validation

- **JSONValid** - Validate that the output is well-formed JSON

### Performance Metrics

- **Latency** - Response time from `run.metrics`
- **Cost** - API cost from `run.metrics`
- **TokenCount** - Token usage (exact or estimated)
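
A minimal usage sketch for the metrics-based scorers, assuming `Run` accepts a `metrics` mapping (mirroring `metrics_cols` in `Dataset.from_csv`) and that these scorers are exported from `openevalkit.scorers`; the metric key names here are assumptions:

```python
from openevalkit import Dataset, Run, evaluate
from openevalkit.scorers import Latency, Cost, TokenCount

dataset = Dataset([
    Run(id="1", input="What is 2+2?", output="4",
        metrics={"latency": 0.42, "cost": 0.0013}),  # assumed key names
])
results = evaluate(dataset, scorers=[Latency(), Cost(), TokenCount()])
print(results.aggregates)
```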

## Supported Models

Via [LiteLLM](https://github.com/BerriAI/litellm), OpenEvalKit supports 100+ models:

- **OpenAI**: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo
- **Anthropic**: claude-3-5-sonnet, claude-3-opus, claude-3-haiku
- **Google**: gemini-pro, gemini-1.5-pro
- **Ollama**: llama3, mistral, phi (local models)
- **Cohere, Replicate, HuggingFace, and more**
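
Model strings appear to be passed straight through to LiteLLM, so non-default providers would use LiteLLM's provider-prefixed names. A sketch (prefixes follow LiteLLM's conventions; `rubric` is the one defined earlier):

```python
from openevalkit.judges import LLMJudge, LLMConfig

judges = [
    LLMJudge(LLMConfig(model="gpt-4o"), rubric),                      # OpenAI
    LLMJudge(LLMConfig(model="anthropic/claude-3-5-sonnet-20241022"), rubric),
    LLMJudge(LLMConfig(model="gemini/gemini-1.5-pro"), rubric),       # Google
    LLMJudge(LLMConfig(model="ollama/llama3"), rubric),               # local Ollama
]
```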

## Custom Scorers

```python
from openevalkit import Score
from openevalkit.scorers.base import Scorer


class ContainsWord(Scorer):
    name = "contains_word"
    requires_reference = False

    def __init__(self, word: str):
        self.word = word

    def score(self, run):
        has_word = self.word.lower() in run.output.lower()
        return Score(
            value=1.0 if has_word else 0.0,
            reason=f"Word '{self.word}' {'found' if has_word else 'not found'}",
        )


results = evaluate(dataset, scorers=[ContainsWord("Python")])
```

## Why OpenEvalKit?

- **Production-ready**: Smart caching, parallel execution, error handling
- **Cost-effective**: Cache LLM judgments to avoid redundant API calls
- **Flexible**: Works with any LLM provider via LiteLLM
- **Reliable**: Ensemble judges with configurable aggregation
- **Simple**: Clean API and comprehensive documentation

## Documentation

Full documentation is coming soon. For now, see the examples above and the module docstrings.

## License

MIT

## Contributing

Contributions are welcome! Please open an issue or PR.
@@ -0,0 +1,26 @@
[project]
name = "openevalkit"
version = "0.1.0"
description = "Open evaluation kit for LLM systems"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
keywords = ["llm", "agent", "evaluation", "judge", "nlp", "ml", "evals", "generative"]
dependencies = [
    "litellm>=1.81.9",
    "numpy>=2.0.2",
    "pandas>=2.3.3",
]

[build-system]
requires = ["uv_build>=0.10.0,<0.11.0"]
build-backend = "uv_build"

[dependency-groups]
dev = [
    "black>=25.11.0",
    "mypy>=1.19.1",
    "pytest>=8.4.2",
    "pytest-cov>=7.0.0",
    "ruff>=0.15.0",
]
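
With this configuration, `uv build` produces the sdist and wheel through uv's native `uv_build` backend, and `uv sync` installs the PEP 735 `dev` dependency group by default.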
@@ -0,0 +1,63 @@
"""
OpenEvalKit: Universal evaluation framework for LLM systems.

Examples:
    >>> from openevalkit import Dataset, evaluate
    >>> from openevalkit.scorers import ExactMatch
    >>> from openevalkit.judges import LLMJudge, LLMConfig, Rubric
    >>>
    >>> # Evaluate with scorers
    >>> dataset = Dataset.from_jsonl("data.jsonl")
    >>> results = evaluate(dataset, scorers=[ExactMatch()])
    >>> print(results.aggregates)
    >>>
    >>> # Evaluate with judges
    >>> rubric = Rubric(criteria=["helpfulness"], scale="0-1")
    >>> judge = LLMJudge(LLMConfig(model="gpt-4o"), rubric)
    >>> results = evaluate(dataset, judges=[judge])
"""

# Package version
from openevalkit.__version__ import __version__

# Core classes
from openevalkit.run import Run
from openevalkit.dataset import Dataset
from openevalkit.score import Score
from openevalkit.judgment import Judgment
from openevalkit.config import EvalConfig

# Main function (import the callable, not the submodule of the same name)
from openevalkit.evaluate import evaluate

# Results
from openevalkit.result import EvaluationResult

# Cache (optional - for advanced users)
from openevalkit.cache import Cache

# Make submodules available
from openevalkit import scorers
from openevalkit import judges
from openevalkit import errors

__all__ = [
    # Version
    "__version__",
    # Core
    "Run",
    "Dataset",
    "Score",
    "Judgment",
    "EvalConfig",
    # Functions
    "evaluate",
    # Results
    "EvaluationResult",
    # Cache
    "Cache",
    # Submodules
    "scorers",
    "judges",
    "errors",
]
@@ -0,0 +1 @@
__version__ = "0.1.0"