hellmholtz 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hellmholtz-0.3.0/LICENSE +21 -0
- hellmholtz-0.3.0/PKG-INFO +505 -0
- hellmholtz-0.3.0/README.md +462 -0
- hellmholtz-0.3.0/pyproject.toml +178 -0
- hellmholtz-0.3.0/src/hellmholtz/__init__.py +1 -0
- hellmholtz-0.3.0/src/hellmholtz/benchmark/__init__.py +22 -0
- hellmholtz-0.3.0/src/hellmholtz/benchmark/evaluator.py +94 -0
- hellmholtz-0.3.0/src/hellmholtz/benchmark/prompts.json +49 -0
- hellmholtz-0.3.0/src/hellmholtz/benchmark/prompts.py +203 -0
- hellmholtz-0.3.0/src/hellmholtz/benchmark/prompts.txt +6 -0
- hellmholtz-0.3.0/src/hellmholtz/benchmark/runner.py +438 -0
- hellmholtz-0.3.0/src/hellmholtz/cli/__init__.py +47 -0
- hellmholtz-0.3.0/src/hellmholtz/cli/benchmark.py +274 -0
- hellmholtz-0.3.0/src/hellmholtz/cli/chat.py +37 -0
- hellmholtz-0.3.0/src/hellmholtz/cli/common.py +277 -0
- hellmholtz-0.3.0/src/hellmholtz/cli/integrations.py +79 -0
- hellmholtz-0.3.0/src/hellmholtz/cli/models.py +116 -0
- hellmholtz-0.3.0/src/hellmholtz/client.py +148 -0
- hellmholtz-0.3.0/src/hellmholtz/core/__init__.py +5 -0
- hellmholtz-0.3.0/src/hellmholtz/core/config.py +47 -0
- hellmholtz-0.3.0/src/hellmholtz/core/prompts.py +224 -0
- hellmholtz-0.3.0/src/hellmholtz/evaluation_analysis.py +925 -0
- hellmholtz-0.3.0/src/hellmholtz/export.py +81 -0
- hellmholtz-0.3.0/src/hellmholtz/integrations/__init__.py +1 -0
- hellmholtz-0.3.0/src/hellmholtz/integrations/litellm.py +30 -0
- hellmholtz-0.3.0/src/hellmholtz/integrations/lm_eval.py +67 -0
- hellmholtz-0.3.0/src/hellmholtz/monitoring.py +556 -0
- hellmholtz-0.3.0/src/hellmholtz/providers/__init__.py +1 -0
- hellmholtz-0.3.0/src/hellmholtz/providers/blablador.py +121 -0
- hellmholtz-0.3.0/src/hellmholtz/providers/blablador_config.py +702 -0
- hellmholtz-0.3.0/src/hellmholtz/providers/blablador_provider.py +166 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/__init__.py +33 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/chart.py +304 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/html.py +304 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/markdown.py +39 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/stats.py +267 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/templates/detailed.html +302 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/templates/simple.html +186 -0
- hellmholtz-0.3.0/src/hellmholtz/reporting/utils.py +59 -0
hellmholtz-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Jonas Heinicke
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hellmholtz
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: A comprehensive Python package for unified LLM access, benchmarking, evaluation, and reporting
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: llm,benchmark,aisuite,openai,anthropic,google,ollama
|
|
8
|
+
Author: jhe24
|
|
9
|
+
Author-email: jhe24@example.com
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Provides-Extra: eval
|
|
23
|
+
Provides-Extra: proxy
|
|
24
|
+
Provides-Extra: reporting
|
|
25
|
+
Requires-Dist: aisuite[all] (>=0.1.6,<0.2.0)
|
|
26
|
+
Requires-Dist: jinja2 (>=3.1.0,<4.0.0)
|
|
27
|
+
Requires-Dist: litellm ; extra == "proxy"
|
|
28
|
+
Requires-Dist: lm-eval ; extra == "eval"
|
|
29
|
+
Requires-Dist: matplotlib ; extra == "reporting"
|
|
30
|
+
Requires-Dist: pydantic (>=2.10.3,<3.0.0)
|
|
31
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
|
32
|
+
Requires-Dist: requests (>=2.32.0,<3.0.0)
|
|
33
|
+
Requires-Dist: scipy ; extra == "reporting"
|
|
34
|
+
Requires-Dist: seaborn ; extra == "reporting"
|
|
35
|
+
Requires-Dist: typer (>=0.15.1,<0.16.0)
|
|
36
|
+
Project-URL: Changelog, https://github.com/JonasHeinickeBio/HeLLMholtz/releases
|
|
37
|
+
Project-URL: Documentation, https://github.com/JonasHeinickeBio/HeLLMholtz#readme
|
|
38
|
+
Project-URL: Homepage, https://github.com/JonasHeinickeBio/HeLLMholtz
|
|
39
|
+
Project-URL: Issues, https://github.com/JonasHeinickeBio/HeLLMholtz/issues
|
|
40
|
+
Project-URL: Repository, https://github.com/JonasHeinickeBio/HeLLMholtz.git
|
|
41
|
+
Description-Content-Type: text/markdown
|
|
42
|
+
|
|
43
|
+
# HeLLMholtz LLM Suite
|
|
44
|
+
|
|
45
|
+
[](https://www.python.org/downloads/)
|
|
46
|
+
[](https://pypi.org/project/hellmholtz/)
|
|
47
|
+
[](LICENSE)
|
|
48
|
+
[](https://github.com/astral-sh/ruff)
|
|
49
|
+
[](https://github.com/JonasHeinickeBio/HeLLMholtz/actions)
|
|
50
|
+
|
|
51
|
+
A comprehensive Python package for unified LLM access, benchmarking, evaluation, and reporting. Built on top of `aisuite` with specialized support for Helmholtz Blablador models.
|
|
52
|
+
|
|
53
|
+
## Features
|
|
54
|
+
|
|
55
|
+
- **Unified Client**: Single interface for OpenAI, Google, Anthropic, Ollama, and Helmholtz Blablador models
|
|
56
|
+
- **Centralized Configuration**: Environment-based configuration for all your projects
|
|
57
|
+
- **Advanced Benchmarking**: Compare model performance across temperatures, replications, and prompt categories
|
|
58
|
+
- **LLM-as-a-Judge Evaluation**: Automated evaluation with comprehensive statistical analysis
|
|
59
|
+
- **Interactive Reports**: HTML reports with Chart.js visualizations and Markdown summaries
|
|
60
|
+
- **Flexible Prompt System**: Support for both simple text files and structured JSON prompt collections
|
|
61
|
+
- **Model Monitoring**: Track Blablador model availability and configuration consistency
|
|
62
|
+
- **LM Evaluation Harness**: Integration with EleutherAI's comprehensive evaluation suite
|
|
63
|
+
- **LiteLLM Proxy**: Built-in proxy server for model routing and load balancing
|
|
64
|
+
- **Throughput Testing**: Performance benchmarking for high-throughput scenarios
|
|
65
|
+
- **Model Discovery**: Dynamic model listing and availability checking (19+ BLABLADOR models currently available)
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
### Basic Installation
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install hellmholtz
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Development Installation
|
|
76
|
+
|
|
77
|
+
For development with all optional dependencies:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
git clone https://github.com/JonasHeinickeBio/HeLLMholtz.git
|
|
81
|
+
cd HeLLMholtz
|
|
82
|
+
pip install -e ".[eval,proxy]"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Poetry Installation
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
poetry install --with eval,proxy
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Configuration
|
|
92
|
+
|
|
93
|
+
1. Copy the example environment file:
|
|
94
|
+
```bash
|
|
95
|
+
cp .env.example .env
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
2. Configure your API keys in `.env`:
|
|
99
|
+
```bash
|
|
100
|
+
# OpenAI
|
|
101
|
+
OPENAI_API_KEY=your_openai_key
|
|
102
|
+
|
|
103
|
+
# Anthropic
|
|
104
|
+
ANTHROPIC_API_KEY=your_anthropic_key
|
|
105
|
+
|
|
106
|
+
# Google
|
|
107
|
+
GOOGLE_API_KEY=your_google_key
|
|
108
|
+
|
|
109
|
+
# Helmholtz Blablador
|
|
110
|
+
BLABLADOR_API_KEY=your_blablador_key
|
|
111
|
+
BLABLADOR_API_BASE=https://your-blablador-instance.com
|
|
112
|
+
|
|
113
|
+
# Optional: Default models
|
|
114
|
+
AISUITE_DEFAULT_MODELS='{"openai": "gpt-4o", "anthropic": "claude-3-haiku"}'
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Usage
|
|
118
|
+
|
|
119
|
+
### Python API
|
|
120
|
+
|
|
121
|
+
#### Basic Chat Interface
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from hellmholtz.client import chat
|
|
125
|
+
|
|
126
|
+
# Simple chat
|
|
127
|
+
response = chat("openai:gpt-4o", "Hello, how are you?")
|
|
128
|
+
print(response)
|
|
129
|
+
|
|
130
|
+
# With conversation history
|
|
131
|
+
messages = [
|
|
132
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
|
133
|
+
{"role": "user", "content": "Explain quantum computing in simple terms."}
|
|
134
|
+
]
|
|
135
|
+
response = chat("anthropic:claude-3-sonnet", messages)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
#### Benchmarking
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from hellmholtz.benchmark import run_benchmarks
|
|
142
|
+
from hellmholtz.core.prompts import load_prompts
|
|
143
|
+
|
|
144
|
+
# Load prompts from JSON file
|
|
145
|
+
prompts = load_prompts("prompts.json", category="reasoning")
|
|
146
|
+
|
|
147
|
+
# Run benchmarks
|
|
148
|
+
results = run_benchmarks(
|
|
149
|
+
models=["openai:gpt-4o", "anthropic:claude-3-haiku", "blablador:gpt-4o"],
|
|
150
|
+
prompts=prompts,
|
|
151
|
+
temperatures=[0.1, 0.7, 1.0],
|
|
152
|
+
replications=3
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
# Analyze results
|
|
156
|
+
from hellmholtz.evaluation_analysis import EvaluationAnalyzer
|
|
157
|
+
analyzer = EvaluationAnalyzer()
|
|
158
|
+
analysis = analyzer.analyze_evaluation_results("results/benchmark_latest.json")
|
|
159
|
+
analyzer.print_analysis_summary(analysis)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Command Line Interface
|
|
163
|
+
|
|
164
|
+
HeLLMholtz provides a comprehensive CLI for all operations:
|
|
165
|
+
|
|
166
|
+
#### Chat Interface
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
# Simple chat
|
|
170
|
+
hellm chat --model openai:gpt-4o "Explain the theory of relativity"
|
|
171
|
+
|
|
172
|
+
# Interactive mode
|
|
173
|
+
hellm chat --model anthropic:claude-3-sonnet --interactive
|
|
174
|
+
|
|
175
|
+
# With system prompt
|
|
176
|
+
hellm chat --model blablador:gpt-4o --system "You are a coding assistant" "Write a Python function to calculate fibonacci numbers"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
#### Benchmarking
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
# Basic benchmark
|
|
183
|
+
hellm bench --models openai:gpt-4o,anthropic:claude-3-haiku --prompts-file prompts.txt
|
|
184
|
+
|
|
185
|
+
# Advanced benchmark with evaluation
|
|
186
|
+
hellm bench \
|
|
187
|
+
--models openai:gpt-4o,blablador:gpt-4o \
|
|
188
|
+
--prompts-file prompts.json \
|
|
189
|
+
--prompts-category reasoning \
|
|
190
|
+
--temperatures 0.1,0.7,1.0 \
|
|
191
|
+
--replications 3 \
|
|
192
|
+
--evaluate-with openai:gpt-4o \
|
|
193
|
+
--results-dir results/
|
|
194
|
+
|
|
195
|
+
# Throughput testing
|
|
196
|
+
hellm bench-throughput \
|
|
197
|
+
--model openai:gpt-4o \
|
|
198
|
+
--requests 100 \
|
|
199
|
+
--concurrency 10 \
|
|
200
|
+
--prompt "Write a short story about AI"
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
#### Evaluation and Analysis
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
# Analyze benchmark results
|
|
207
|
+
hellm analyze results/benchmark_latest.json --html-report analysis_report.html
|
|
208
|
+
|
|
209
|
+
# Generate reports
|
|
210
|
+
hellm report --results-file results/benchmark_latest.json --output report.md
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
#### Model Management
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
# List available Blablador models
|
|
217
|
+
hellm models
|
|
218
|
+
|
|
219
|
+
# Monitor model availability and test accessibility
|
|
220
|
+
hellm monitor --test-accessibility
|
|
221
|
+
|
|
222
|
+
# Check model configuration consistency
|
|
223
|
+
hellm monitor --check-config
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
#### Weekly Automated Benchmarking
|
|
227
|
+
|
|
228
|
+
The repository includes a GitHub Actions workflow that automatically runs benchmarks weekly and updates reports:
|
|
229
|
+
|
|
230
|
+
- **Scheduled**: Runs every Sunday at 00:00 UTC
|
|
231
|
+
- **Model Discovery**: Automatically fetches latest Blablador models
|
|
232
|
+
- **Performance Charts**: Generates visual charts comparing model performance
|
|
233
|
+
- **Multiple Formats**: Creates HTML, Markdown, and PNG chart reports
|
|
234
|
+
- **Auto-commit**: Updates reports in the repository for public viewing
|
|
235
|
+
|
|
236
|
+
To enable automated benchmarking:
|
|
237
|
+
|
|
238
|
+
1. Set repository secrets for API keys:
|
|
239
|
+
- `BLABLADOR_API_KEY`: Your Blablador API key
|
|
240
|
+
- `BLABLADOR_API_BASE`: Blablador API base URL (optional)
|
|
241
|
+
|
|
242
|
+
2. The workflow will automatically:
|
|
243
|
+
- Run benchmarks on selected models
|
|
244
|
+
- Generate performance reports
|
|
245
|
+
- Create visual charts
|
|
246
|
+
- Commit updated reports to the repository
|
|
247
|
+
|
|
248
|
+
Reports are available in the `reports/` directory and include:
|
|
249
|
+
- `weekly_benchmark_report.html`: Interactive HTML report
|
|
250
|
+
- `weekly_benchmark_report.md`: Markdown summary
|
|
251
|
+
- `weekly_benchmark_chart.png`: Performance visualization
|
|
252
|
+
|
|
253
|
+
#### Advanced Features
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
# Run LM Evaluation Harness
|
|
257
|
+
hellm lm-eval \
|
|
258
|
+
--model openai:gpt-4o \
|
|
259
|
+
--tasks hellaswag,winogrande \
|
|
260
|
+
--limit 100
|
|
261
|
+
|
|
262
|
+
# Start LiteLLM proxy server
|
|
263
|
+
hellm proxy \
|
|
264
|
+
--config litellm_config.yaml \
|
|
265
|
+
--port 8000
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
## Project Structure
|
|
269
|
+
|
|
270
|
+
```
|
|
271
|
+
hellmholtz/
|
|
272
|
+
├── cli.py # Command-line interface
|
|
273
|
+
├── client.py # Unified LLM client
|
|
274
|
+
├── monitoring.py # Model availability monitoring
|
|
275
|
+
├── evaluation_analysis.py # Statistical analysis and reporting
|
|
276
|
+
├── export.py # Result export utilities
|
|
277
|
+
├── core/
|
|
278
|
+
│ ├── config.py # Configuration management
|
|
279
|
+
│ └── prompts.py # Prompt loading and validation
|
|
280
|
+
├── benchmark/
|
|
281
|
+
│ ├── runner.py # Benchmark execution
|
|
282
|
+
│ ├── evaluator.py # LLM-as-a-Judge evaluation
|
|
283
|
+
│ └── prompts.py # Benchmark-specific prompts
|
|
284
|
+
├── providers/
|
|
285
|
+
│ ├── blablador_provider.py # Custom Blablador provider
|
|
286
|
+
│ ├── blablador_config.py # Blablador model configuration
|
|
287
|
+
│ ├── blablador.py # Blablador utilities
|
|
288
|
+
│ └── __init__.py
|
|
289
|
+
├── reporting/
|
|
290
|
+
│ ├── html.py # HTML report generation
|
|
291
|
+
│ ├── markdown.py # Markdown report generation
|
|
292
|
+
│ ├── stats.py # Statistical calculations
|
|
293
|
+
│ ├── utils.py # Reporting utilities
|
|
294
|
+
│ └── templates/ # HTML templates
|
|
295
|
+
└── integrations/
|
|
296
|
+
├── lm_eval.py # LM Evaluation Harness integration
|
|
297
|
+
└── litellm.py # LiteLLM proxy integration
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## Prompt System
|
|
301
|
+
|
|
302
|
+
HeLLMholtz supports two prompt formats:
|
|
303
|
+
|
|
304
|
+
### Simple Text Format (`prompts.txt`)
|
|
305
|
+
|
|
306
|
+
```
|
|
307
|
+
What is the capital of France?
|
|
308
|
+
Explain quantum computing in simple terms.
|
|
309
|
+
Write a Python function to reverse a string.
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Structured JSON Format (`prompts.json`)
|
|
313
|
+
|
|
314
|
+
```json
|
|
315
|
+
[
|
|
316
|
+
{
|
|
317
|
+
"id": "capital-france",
|
|
318
|
+
"category": "knowledge",
|
|
319
|
+
"description": "Test basic geographical knowledge",
|
|
320
|
+
"messages": [
|
|
321
|
+
{
|
|
322
|
+
"role": "user",
|
|
323
|
+
"content": "What is the capital of France?"
|
|
324
|
+
}
|
|
325
|
+
],
|
|
326
|
+
"expected_output": "Paris"
|
|
327
|
+
},
|
|
328
|
+
{
|
|
329
|
+
"id": "quantum-explanation",
|
|
330
|
+
"category": "reasoning",
|
|
331
|
+
"description": "Test ability to explain complex concepts simply",
|
|
332
|
+
"messages": [
|
|
333
|
+
{
|
|
334
|
+
"role": "user",
|
|
335
|
+
"content": "Explain quantum computing in simple terms."
|
|
336
|
+
}
|
|
337
|
+
]
|
|
338
|
+
}
|
|
339
|
+
]
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## Evaluation System
|
|
343
|
+
|
|
344
|
+
The LLM-as-a-Judge evaluation system provides:
|
|
345
|
+
|
|
346
|
+
- **Automated Scoring**: AI-powered evaluation of response quality
|
|
347
|
+
- **Statistical Analysis**: Comprehensive metrics and distributions
|
|
348
|
+
- **Model Rankings**: Performance comparisons across all dimensions
|
|
349
|
+
- **Interactive Reports**: Web-based visualizations of results
|
|
350
|
+
- **Detailed Critiques**: Specific feedback for each response
|
|
351
|
+
|
|
352
|
+
### Example Analysis Output
|
|
353
|
+
|
|
354
|
+
```
|
|
355
|
+
[Monitor] EVALUATION ANALYSIS RESULTS
|
|
356
|
+
══════════════════════════════════════════════════════════════
|
|
357
|
+
|
|
358
|
+
OVERVIEW
|
|
359
|
+
• Total Evaluations: 150
|
|
360
|
+
• Models Tested: 3
|
|
361
|
+
• Prompts Tested: 5
|
|
362
|
+
• Success Rate: 94.7%
|
|
363
|
+
|
|
364
|
+
🏆 MODEL RANKINGS
|
|
365
|
+
1. openai:gpt-4o - Avg Score: 8.7/10 (±0.8)
|
|
366
|
+
2. anthropic:claude-3-opus - Avg Score: 8.4/10 (±0.9)
|
|
367
|
+
3. blablador:gpt-4o - Avg Score: 7.9/10 (±1.1)
|
|
368
|
+
|
|
369
|
+
DETAILED METRICS
|
|
370
|
+
• Response Quality: 8.3/10 average
|
|
371
|
+
• Relevance: 8.6/10 average
|
|
372
|
+
• Accuracy: 9.1/10 average
|
|
373
|
+
• Creativity: 7.8/10 average
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
## Latest Benchmark Results
|
|
377
|
+
|
|
378
|
+
Recent benchmarking results from the automated weekly workflow testing BLABLADOR models:
|
|
379
|
+
|
|
380
|
+
### Model Performance Overview
|
|
381
|
+
|
|
382
|
+
| Model | Success Rate | Avg Latency | Avg Rating (1-10) | Rating Std Dev |
|
|
383
|
+
|-------|-------------|-------------|-------------------|----------------|
|
|
384
|
+
| GPT-OSS-120b | 100.0% | 5.35s | 8.5 | ±2.38 |
|
|
385
|
+
| Ministral-3-14B-Instruct-2512 | 100.0% | 9.55s | 7.5 | ±3.70 |
|
|
386
|
+
|
|
387
|
+
**Overall Statistics:**
|
|
388
|
+
- **Total Evaluations**: 8 across 4 different prompts
|
|
389
|
+
- **Models Tested**: 2 BLABLADOR models
|
|
390
|
+
- **Overall Success Rate**: 100.0%
|
|
391
|
+
- **Average Rating**: 8.0/10
|
|
392
|
+
- **Average Latency**: 7.45s
|
|
393
|
+
|
|
394
|
+
### Key Findings
|
|
395
|
+
- **Top Performer**: GPT-OSS-120b with highest rating (8.5/10) and fastest response time (5.35s)
|
|
396
|
+
- **Most Consistent**: GPT-OSS-120b, with the lower rating variation (±2.38)
|
|
397
|
+
- **Performance Gap**: 1.0 point difference between the best- and worst-performing models
|
|
398
|
+
- **Model Availability**: Both tested models are fully operational with 100% success rates
|
|
399
|
+
|
|
400
|
+
### Evaluation Details
|
|
401
|
+
- **Prompt Categories**: Reasoning, coding, and creative writing tasks
|
|
402
|
+
- **Temperature Testing**: Multiple temperature settings (0.1, 0.7, 1.0) for response variation
|
|
403
|
+
- **LLM-as-a-Judge**: Automated evaluation with detailed critiques and statistical analysis
|
|
404
|
+
- **Rating Distribution**: GPT-OSS-120b received mostly 9-10 ratings, Ministral-3-14B showed more variation
|
|
405
|
+
|
|
406
|
+
### Reports and Visualizations
|
|
407
|
+
|
|
408
|
+
- [Interactive HTML Report](reports/evaluation_analysis.html) - Comprehensive evaluation analysis with charts
|
|
409
|
+
- [Markdown Summary](reports/benchmark_report_comprehensive.md) - Detailed performance metrics
|
|
410
|
+
- [Performance Chart](reports/benchmark_chart_comprehensive.png) - Visual model comparison
|
|
411
|
+
- [Basic Report](reports/benchmark_report.md) - Simple performance overview
|
|
412
|
+
|
|
413
|
+
Reports are automatically updated and include LLM-as-a-Judge evaluation with detailed statistical analysis and model rankings.
|
|
414
|
+
|
|
415
|
+
## Development
|
|
416
|
+
|
|
417
|
+
### Setup Development Environment
|
|
418
|
+
|
|
419
|
+
```bash
|
|
420
|
+
# Clone repository
|
|
421
|
+
git clone https://github.com/JonasHeinickeBio/HeLLMholtz.git
|
|
422
|
+
cd HeLLMholtz
|
|
423
|
+
|
|
424
|
+
# Install with development dependencies
|
|
425
|
+
poetry install --with dev
|
|
426
|
+
|
|
427
|
+
# Install pre-commit hooks
|
|
428
|
+
poetry run pre-commit install
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
### Running Tests
|
|
432
|
+
|
|
433
|
+
```bash
|
|
434
|
+
# Run all tests
|
|
435
|
+
poetry run pytest
|
|
436
|
+
|
|
437
|
+
# Run with coverage
|
|
438
|
+
poetry run pytest --cov=hellmholtz --cov-report=html
|
|
439
|
+
|
|
440
|
+
# Run specific test categories
|
|
441
|
+
poetry run pytest -m "slow" # Slow integration tests
|
|
442
|
+
poetry run pytest -m "network" # Tests requiring network access
|
|
443
|
+
poetry run pytest -m "model" # Tests using actual models
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
### Code Quality
|
|
447
|
+
|
|
448
|
+
```bash
|
|
449
|
+
# Lint code
|
|
450
|
+
poetry run ruff check .
|
|
451
|
+
|
|
452
|
+
# Format code
|
|
453
|
+
poetry run ruff format .
|
|
454
|
+
|
|
455
|
+
# Type checking
|
|
456
|
+
poetry run mypy src/
|
|
457
|
+
|
|
458
|
+
# Security scanning
|
|
459
|
+
poetry run bandit -r src/
|
|
460
|
+
```
|
|
461
|
+
|
|
462
|
+
### Building Documentation
|
|
463
|
+
|
|
464
|
+
```bash
|
|
465
|
+
# Generate API documentation
|
|
466
|
+
poetry run sphinx-build docs/ docs/_build/
|
|
467
|
+
|
|
468
|
+
# Serve documentation locally
|
|
469
|
+
poetry run sphinx-serve docs/_build/
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
## Contributing
|
|
473
|
+
|
|
474
|
+
We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details.
|
|
475
|
+
|
|
476
|
+
1. Fork the repository
|
|
477
|
+
2. Create a feature branch: `git checkout -b feature/amazing-feature`
|
|
478
|
+
3. Make your changes and add tests
|
|
479
|
+
4. Run the full test suite: `poetry run pytest`
|
|
480
|
+
5. Ensure code quality: `poetry run ruff check . && poetry run mypy src/`
|
|
481
|
+
6. Commit your changes: `git commit -m 'Add amazing feature'`
|
|
482
|
+
7. Push to the branch: `git push origin feature/amazing-feature`
|
|
483
|
+
8. Open a Pull Request
|
|
484
|
+
|
|
485
|
+
## License
|
|
486
|
+
|
|
487
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
488
|
+
|
|
489
|
+
## Acknowledgments
|
|
490
|
+
|
|
491
|
+
- Built on top of [aisuite](https://github.com/andrewyng/aisuite) for unified LLM access
|
|
492
|
+
- LLM evaluation powered by [EleutherAI's LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)
|
|
493
|
+
- Proxy functionality via [LiteLLM](https://github.com/BerriAI/litellm)
|
|
494
|
+
- Special thanks to the Helmholtz Association for Blablador model access
|
|
495
|
+
|
|
496
|
+
## Support
|
|
497
|
+
|
|
498
|
+
- Documentation: https://hellmholtz.readthedocs.io/
|
|
499
|
+
- Issue Tracker: https://github.com/JonasHeinickeBio/HeLLMholtz/issues
|
|
500
|
+
- Discussions: https://github.com/JonasHeinickeBio/HeLLMholtz/discussions
|
|
501
|
+
|
|
502
|
+
---
|
|
503
|
+
|
|
504
|
+
<p align="center">Made with love for the scientific computing community</p>
|
|
505
|
+
|