evalforge 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalforge-0.2.0/.env.example +27 -0
- evalforge-0.2.0/.gitignore +18 -0
- evalforge-0.2.0/CHANGELOG.md +19 -0
- evalforge-0.2.0/LICENSE +21 -0
- evalforge-0.2.0/PKG-INFO +256 -0
- evalforge-0.2.0/PUBLISHING.md +113 -0
- evalforge-0.2.0/README.md +180 -0
- evalforge-0.2.0/evalforge/__init__.py +4 -0
- evalforge-0.2.0/evalforge/adapters/__init__.py +16 -0
- evalforge-0.2.0/evalforge/adapters/base.py +32 -0
- evalforge-0.2.0/evalforge/adapters/command_adapter.py +51 -0
- evalforge-0.2.0/evalforge/adapters/http_adapter.py +41 -0
- evalforge-0.2.0/evalforge/adapters/python_adapter.py +52 -0
- evalforge-0.2.0/evalforge/cost.py +278 -0
- evalforge-0.2.0/evalforge/generators/__init__.py +1 -0
- evalforge-0.2.0/evalforge/generators/auto_generate.py +212 -0
- evalforge-0.2.0/evalforge/generators/case_templates.py +94 -0
- evalforge-0.2.0/evalforge/generators/prompt_analyzer.py +94 -0
- evalforge-0.2.0/evalforge/history.py +109 -0
- evalforge-0.2.0/evalforge/main.py +802 -0
- evalforge-0.2.0/evalforge/providers.py +479 -0
- evalforge-0.2.0/evalforge/reporters/__init__.py +26 -0
- evalforge-0.2.0/evalforge/reporters/console.py +408 -0
- evalforge-0.2.0/evalforge/reporters/html_reporter.py +128 -0
- evalforge-0.2.0/evalforge/reporters/junit.py +62 -0
- evalforge-0.2.0/evalforge/reporters/markdown_reporter.py +54 -0
- evalforge-0.2.0/evalforge/runner.py +365 -0
- evalforge-0.2.0/evalforge/schema.py +264 -0
- evalforge-0.2.0/evalforge/scorers/__init__.py +58 -0
- evalforge-0.2.0/evalforge/scorers/bleu_rouge.py +92 -0
- evalforge-0.2.0/evalforge/scorers/contains.py +73 -0
- evalforge-0.2.0/evalforge/scorers/cosine.py +63 -0
- evalforge-0.2.0/evalforge/scorers/exact.py +42 -0
- evalforge-0.2.0/evalforge/scorers/hallucination.py +148 -0
- evalforge-0.2.0/evalforge/scorers/llm_judge.py +118 -0
- evalforge-0.2.0/evalforge/scorers/task_completion.py +83 -0
- evalforge-0.2.0/evalforge/scorers/trajectory.py +89 -0
- evalforge-0.2.0/evalforge/security.py +159 -0
- evalforge-0.2.0/evalforge/utils.py +133 -0
- evalforge-0.2.0/evalforge_build_guide.md +3434 -0
- evalforge-0.2.0/pyproject.toml +98 -0
- evalforge-0.2.0/suite.yaml +358 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Copy this file to .env and fill in your values
|
|
2
|
+
# Never commit .env to git
|
|
3
|
+
|
|
4
|
+
# ── Required (at least one) ─────────────────────────
|
|
5
|
+
OPENAI_API_KEY=sk-your-openai-key-here
|
|
6
|
+
ANTHROPIC_API_KEY=sk-ant-your-anthropic-key-here
|
|
7
|
+
|
|
8
|
+
# ── Google Gemini ───────────────────────────────────
|
|
9
|
+
GOOGLE_API_KEY=AIza-your-google-api-key-here
|
|
10
|
+
|
|
11
|
+
# ── AWS Bedrock ─────────────────────────────────────
|
|
12
|
+
# AWS_ACCESS_KEY_ID=your-access-key
|
|
13
|
+
# AWS_SECRET_ACCESS_KEY=your-secret-key
|
|
14
|
+
# AWS_DEFAULT_REGION=us-east-1
|
|
15
|
+
|
|
16
|
+
# ── Ollama (local) ──────────────────────────────────
|
|
17
|
+
# OLLAMA_BASE_URL=http://localhost:11434
|
|
18
|
+
|
|
19
|
+
# ── OpenRouter ──────────────────────────────────────
|
|
20
|
+
# OPENROUTER_API_KEY=sk-or-your-key-here
|
|
21
|
+
|
|
22
|
+
# ── LiteLLM Proxy ──────────────────────────────────
|
|
23
|
+
# LITELLM_API_KEY=your-litellm-key
|
|
24
|
+
|
|
25
|
+
# ── EvalForge Settings ──────────────────────────────
|
|
26
|
+
EVALFORGE_DEFAULT_MODEL=gpt-4o-mini
|
|
27
|
+
EVALFORGE_HISTORY_DIR=~/.evalforge/history
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - Unreleased
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Initial release
|
|
7
|
+
- YAML-based eval suite format
|
|
8
|
+
- Scorers: exact, contains, regex, cosine, BLEU, ROUGE, LLM-judge
|
|
9
|
+
- Multi-model parallel evaluation (OpenAI, Anthropic, Gemini, Bedrock, Ollama)
|
|
10
|
+
- Multi-provider routing via providers.py
|
|
11
|
+
- Run history and comparison
|
|
12
|
+
- HTML, Markdown, JUnit reporters
|
|
13
|
+
- Auto test case generation from production logs
|
|
14
|
+
- Cost tracking with 31-model pricing table
|
|
15
|
+
- Session cost breakdown (eval vs overhead)
|
|
16
|
+
- Budget alerts and cumulative tracking
|
|
17
|
+
- Security hardening: API key masking, YAML sanitization, XSS prevention
|
|
18
|
+
- CI/CD integration with exit codes
|
|
19
|
+
- npm wrapper package for Node.js users
|
evalforge-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 EvalForge Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
evalforge-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evalforge
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: The pytest of LLMs — build, version, and run LLM eval suites from the terminal
|
|
5
|
+
Project-URL: Homepage, https://github.com/yourusername/evalforge
|
|
6
|
+
Project-URL: Documentation, https://github.com/yourusername/evalforge#readme
|
|
7
|
+
Project-URL: Issues, https://github.com/yourusername/evalforge/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/yourusername/evalforge/blob/main/CHANGELOG.md
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2025 EvalForge Contributors
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Keywords: ai,cli,data-science,evaluation,llm,mlops,prompt-testing,testing
|
|
32
|
+
Classifier: Development Status :: 3 - Alpha
|
|
33
|
+
Classifier: Environment :: Console
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
39
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
40
|
+
Classifier: Topic :: Software Development :: Testing
|
|
41
|
+
Requires-Python: >=3.11
|
|
42
|
+
Requires-Dist: anthropic>=0.25
|
|
43
|
+
Requires-Dist: httpx>=0.27
|
|
44
|
+
Requires-Dist: jinja2>=3.1
|
|
45
|
+
Requires-Dist: nltk>=3.8
|
|
46
|
+
Requires-Dist: openai>=1.0
|
|
47
|
+
Requires-Dist: pydantic>=2
|
|
48
|
+
Requires-Dist: python-dotenv>=1.0
|
|
49
|
+
Requires-Dist: pyyaml>=6.0
|
|
50
|
+
Requires-Dist: rich>=13
|
|
51
|
+
Requires-Dist: rouge-score>=0.1.2
|
|
52
|
+
Requires-Dist: scikit-learn>=1.4
|
|
53
|
+
Requires-Dist: sentence-transformers>=3.0
|
|
54
|
+
Requires-Dist: tenacity>=8.2
|
|
55
|
+
Requires-Dist: tiktoken>=0.7
|
|
56
|
+
Requires-Dist: typer>=0.12
|
|
57
|
+
Provides-Extra: all
|
|
58
|
+
Requires-Dist: boto3>=1.34; extra == 'all'
|
|
59
|
+
Requires-Dist: google-generativeai>=0.5; extra == 'all'
|
|
60
|
+
Requires-Dist: litellm>=1.40; extra == 'all'
|
|
61
|
+
Provides-Extra: bedrock
|
|
62
|
+
Requires-Dist: boto3>=1.34; extra == 'bedrock'
|
|
63
|
+
Provides-Extra: dev
|
|
64
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
65
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
66
|
+
Requires-Dist: pip-audit>=2.7; extra == 'dev'
|
|
67
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
68
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
69
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
70
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
71
|
+
Provides-Extra: gemini
|
|
72
|
+
Requires-Dist: google-generativeai>=0.5; extra == 'gemini'
|
|
73
|
+
Provides-Extra: litellm
|
|
74
|
+
Requires-Dist: litellm>=1.40; extra == 'litellm'
|
|
75
|
+
Description-Content-Type: text/markdown
|
|
76
|
+
|
|
77
|
+
# 🔨 EvalForge
|
|
78
|
+
|
|
79
|
+
**The ultimate evaluation platform for LLMs and Autonomous AI Agents.**
|
|
80
|
+
|
|
81
|
+
EvalForge simplifies the process of testing, evaluating, and iterating on Large Language Models and AI Agents. Whether you're benchmarking a simple LLM prompt block or evaluating a complex, multi-tool Kubernetes Agent using LangChain/OpenAI SDK, EvalForge provides deterministic scoring, conversational edge-case simulation, and detailed cost breakdowns directly in your terminal.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## 🚀 Quick Start
|
|
86
|
+
|
|
87
|
+
EvalForge can be installed easily via pip or npm (using the wrapper).
|
|
88
|
+
|
|
89
|
+
**Python (PyPI)**
|
|
90
|
+
```bash
|
|
91
|
+
pip install evalforge
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Node / JavaScript (NPM Wrapper)**
|
|
95
|
+
```bash
|
|
96
|
+
npm install -g evalforge
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Initialize a Project
|
|
100
|
+
To scaffold a basic LLM or Agent project, run:
|
|
101
|
+
```bash
|
|
102
|
+
evalforge init
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## 🧠 Model Evals (Classic LLM Mode)
|
|
108
|
+
|
|
109
|
+
Test standard Large Language Models for prompt coherence, factual extraction, and instruction adherence.
|
|
110
|
+
|
|
111
|
+
1. **Define a Suite**: Create an `llm_suite.yaml`:
|
|
112
|
+
```yaml
|
|
113
|
+
name: "Summarization Task"
|
|
114
|
+
description: "Tests a model's summarization capabilities"
|
|
115
|
+
type: llm
|
|
116
|
+
model: gpt-4o-mini
|
|
117
|
+
cases:
|
|
118
|
+
- id: "long_text_summary"
|
|
119
|
+
messages:
|
|
120
|
+
- role: "user"
|
|
121
|
+
content: "Summarize this long text: [text ...]"
|
|
122
|
+
scorers:
|
|
123
|
+
- type: "llm_judge"
|
|
124
|
+
threshold: 0.8
|
|
125
|
+
rubric: "Did the model summarize the text accurately in under 5 sentences?"
|
|
126
|
+
- type: "contains"
|
|
127
|
+
value: "summary"
|
|
128
|
+
threshold: 1.0
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
2. **Run the Suite**:
|
|
132
|
+
```bash
|
|
133
|
+
evalforge run llm_suite.yaml
|
|
134
|
+
```
|
|
135
|
+
You can easily override the model to benchmark across providers (OpenAI, Anthropic, Gemini, Ollama):
|
|
136
|
+
```bash
|
|
137
|
+
evalforge run llm_suite.yaml -m claude-3-haiku-20240307
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## 🤖 Agent Evals (Autonomous Agent Mode)
|
|
143
|
+
|
|
144
|
+
EvalForge v0.2 introduces native support for Autonomous Agents. It dynamically generates edge-cases from your System Prompt and interfaces with any Python codebase (LangChain, LangGraph, standard OpenAI functions).
|
|
145
|
+
|
|
146
|
+
### 1. Build an Agent Suite
|
|
147
|
+
Define an agent suite specifying the `system_prompt` and pointing EvalForge to your application code.
|
|
148
|
+
|
|
149
|
+
```yaml
|
|
150
|
+
name: "Kubernetes Expert Agent"
|
|
151
|
+
description: "Tests a web-browsing agent's ability to document K8s topics."
|
|
152
|
+
version: 1.0.0
|
|
153
|
+
type: agent
|
|
154
|
+
parallel: 1
|
|
155
|
+
model: gpt-4o
|
|
156
|
+
agent:
|
|
157
|
+
adapter: python
|
|
158
|
+
config:
|
|
159
|
+
module: "k8s_bot.main"
|
|
160
|
+
function: "run_agent"
|
|
161
|
+
system_prompt: |
|
|
162
|
+
You are a Kubernetes Expert Agent. You have access to a web browsing tool to retrieve documentation.
|
|
163
|
+
Do not generate documentation yourself; strictly use the web tool to search for information.
|
|
164
|
+
Never modify or delete infrastructure without explicit user confirmation.
|
|
165
|
+
auto_generate:
|
|
166
|
+
enabled: true
|
|
167
|
+
num_cases: 6
|
|
168
|
+
cases: []
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### 2. Auto-Generate Test Cases
|
|
172
|
+
EvalForge comes with an internal **LLM Validation Persona**. Using the `auto_generate` config block, EvalForge reads your system prompt and generates robust **Happy Paths**, **Conversational Edge Cases**, and **Boundary Limits**, saving them directly into your `suite.yaml`.
|
|
173
|
+
|
|
174
|
+
Run evaluation with generation:
|
|
175
|
+
```bash
|
|
176
|
+
evalforge run --regenerate suite.yaml
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### 3. Reviewing The Agent Results
|
|
180
|
+
EvalForge will display an elegant UI in the terminal tracking your agent's hallucination score (via NLI), tool invocations, and semantic understanding.
|
|
181
|
+
|
|
182
|
+
***Example Output Console:***
|
|
183
|
+
```text
|
|
184
|
+
Category Breakdown
|
|
185
|
+
|
|
186
|
+
Category Passed Pass Rate Progress
|
|
187
|
+
─────────────────────────────────────────────────────────────────────────
|
|
188
|
+
Create 0/1 0% ░░░░░░░░░░░░░░░
|
|
189
|
+
Edge 0/1 0% ░░░░░░░░░░░░░░░
|
|
190
|
+
Infrastructure 0/1 0% ░░░░░░░░░░░░░░░
|
|
191
|
+
Kubernetes 0/1 0% ░░░░░░░░░░░░░░░
|
|
192
|
+
Node 0/1 0% ░░░░░░░░░░░░░░░
|
|
193
|
+
Pods 0/1 0% ░░░░░░░░░░░░░░░
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
❌ Failed Cases (6)
|
|
197
|
+
|
|
198
|
+
list_pods_happy_path (gpt-4o)
|
|
199
|
+
→ llm_judge score=0.75: The response provides a relevant command to retrieve the desired information but...
|
|
200
|
+
→ hallucination score=0.29
|
|
201
|
+
Output: I currently don’t have the capability to directly interact with or retrieve live data from your Kubernetes cluster. Howe…
|
|
202
|
+
|
|
203
|
+
check_node_health_happy_path (gpt-4o)
|
|
204
|
+
→ llm_judge score=0.00: The response fails to provide any node names, health conditions, or warnings, ...
|
|
205
|
+
→ hallucination score=0.39
|
|
206
|
+
Output: I don't have the ability to check the health status of your nodes directly. However, I can guide you on how to do it usi…
|
|
207
|
+
|
|
208
|
+
╭─ 💰 Cost Breakdown ─────────────────────╮
|
|
209
|
+
│ Eval Cost: $0.00 per 100k runs (0%) │
|
|
210
|
+
│ Overhead: $0.00039 (100%) │
|
|
211
|
+
│ Total: $0.00039 │
|
|
212
|
+
╰─────────────────────────────────────────╯
|
|
213
|
+
|
|
214
|
+
╭───────────────────────────────────────────────────────────────────────╮
|
|
215
|
+
│ ❌ SUITE FAILED — 0.0% pass rate • 6 of 6 failed • run_d08fd895 │
|
|
216
|
+
╰───────────────────────────────────────────────────────────────────────╯
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## 🧪 Evaluators & Scorers
|
|
222
|
+
|
|
223
|
+
EvalForge packages multiple deterministic and probabilistic scorers designed for accuracy tracking:
|
|
224
|
+
|
|
225
|
+
- `exact` & `contains`: Basic deterministic string matching.
|
|
226
|
+
- `regex`: Extracts pattern matching.
|
|
227
|
+
- `cosine`: Compares embeddings for similarity semantic checks.
|
|
228
|
+
- `bleu` / `rouge`: Machine-translation N-Gram comparisons using NLTK mechanisms.
|
|
229
|
+
- `llm_judge`: Asks an LLM to evaluate the string output based on a strictly provided `rubric` parameter.
|
|
230
|
+
- `hallucination`: Leverages multi-lingual Cross-Encoders and huggingface NLI models (e.g., `deberta-v3-small`) to verify if the model output hallucinates information not present in the reference documents.
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## 🔧 Extending the Adapter
|
|
235
|
+
|
|
236
|
+
By default, the `PythonAdapter` executes any `def func(input_text: str, context: dict) -> AgentResponse:` compatible endpoint. You can integrate `OpenAI SDK`, `LangChainAgentRunners`, or `LangGraph` graphs simply by exposing their invoker to EvalForge!
|
|
237
|
+
|
|
238
|
+
1. Ensure your module imports and initializes properly. Address `sys.path` concerns.
|
|
239
|
+
2. Structure your return payload strictly around the `AgentResponse` parameters (e.g. `output`, `steps`, `tool_calls`, `token_usage`).
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## 🛠 Commands
|
|
244
|
+
|
|
245
|
+
| Command | Description |
|
|
246
|
+
|---------|-------------|
|
|
247
|
+
| `evalforge --version` | Print the current EvalForge Version |
|
|
248
|
+
| `evalforge init` | Bootstrap a scaffold suite structure |
|
|
249
|
+
| `evalforge run <suite.yaml>` | Run a defined eval suite against connected code |
|
|
250
|
+
| `evalforge run --regenerate` | Instruct the LLM generation persona to create new cases |
|
|
251
|
+
| `evalforge cost [run_id]` | Fetch historical evaluation usage costs |
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## Licensing
|
|
256
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# Publishing EvalForge
|
|
2
|
+
|
|
3
|
+
This guide details the steps to publish EvalForge to both **PyPI** (Python Package Index) and **NPM** (Node Package Manager) to make it accessible to developers across different ecosystems.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 🐍 1. Publishing to PyPI
|
|
8
|
+
|
|
9
|
+
Since EvalForge is built in Python, PyPI is its native distribution channel. We use `poetry` (or standard `build` and `twine`) for publishing. Since we have a `pyproject.toml`, we can use standard build tools.
|
|
10
|
+
|
|
11
|
+
### Prerequisites
|
|
12
|
+
1. Ensure you have a registered account on [PyPI](https://pypi.org).
|
|
13
|
+
2. Install build and deployment tools:
|
|
14
|
+
```bash
|
|
15
|
+
pip install build twine
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
### Step-by-step Guide
|
|
19
|
+
|
|
20
|
+
1. **Update Version**: Ensure the version in `pyproject.toml` and `evalforge/__init__.py` is updated (e.g., `0.2.0`).
|
|
21
|
+
2. **Build the Package**:
|
|
22
|
+
Run the following command in the root directory (where `pyproject.toml` is located):
|
|
23
|
+
```bash
|
|
24
|
+
python -m build
|
|
25
|
+
```
|
|
26
|
+
This will generate the distribution archives in a `dist/` folder (`.tar.gz` and `.whl` files).
|
|
27
|
+
3. **Check the Build**:
|
|
28
|
+
Verify the built package doesn't have formatting errors:
|
|
29
|
+
```bash
|
|
30
|
+
twine check dist/*
|
|
31
|
+
```
|
|
32
|
+
4. **Publish to PyPI**:
|
|
33
|
+
Upload the exact build to PyPI:
|
|
34
|
+
```bash
|
|
35
|
+
twine upload dist/*
|
|
36
|
+
```
|
|
37
|
+
*You will be prompted for your username (usually `__token__`) and your PyPI API token.*
|
|
38
|
+
|
|
39
|
+
Once completed, users can install via:
|
|
40
|
+
```bash
|
|
41
|
+
pip install evalforge
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## 📦 2. Publishing to NPM
|
|
47
|
+
|
|
48
|
+
To make EvalForge available to the Node/JS ecosystem (so JS developers can use `npx evalforge`), we publish an NPM wrapper package that invokes the Python CLI.
|
|
49
|
+
|
|
50
|
+
### Prerequisites
|
|
51
|
+
1. An account on [npmjs.com](https://npmjs.com).
|
|
52
|
+
2. Node.js and `npm` installed.
|
|
53
|
+
|
|
54
|
+
### Step-by-step Guide
|
|
55
|
+
|
|
56
|
+
1. **Create the NPM Wrapper Directory**:
|
|
57
|
+
Create a folder named `npm` within your repo (or at the root if you prefer to mix Python/Node) and initialize it:
|
|
58
|
+
```bash
|
|
59
|
+
mkdir npm_package && cd npm_package
|
|
60
|
+
npm init -y
|
|
61
|
+
```
|
|
62
|
+
2. **Configure `package.json`**:
|
|
63
|
+
Edit `package.json` to configure the binary script.
|
|
64
|
+
```json
|
|
65
|
+
{
|
|
66
|
+
"name": "evalforge",
|
|
67
|
+
"version": "0.2.0",
|
|
68
|
+
"description": "EvalForge: The ultimate AI Agent Evaluation Platform",
|
|
69
|
+
"bin": {
|
|
70
|
+
"evalforge": "bin/evalforge.js"
|
|
71
|
+
},
|
|
72
|
+
"scripts": {},
|
|
73
|
+
"author": "Your Name",
|
|
74
|
+
"license": "MIT"
|
|
75
|
+
}
|
|
76
|
+
```
|
|
77
|
+
3. **Create the Wrapper Script**:
|
|
78
|
+
Create a directory `bin` and a file `bin/evalforge.js`:
|
|
79
|
+
```javascript
|
|
80
|
+
#!/usr/bin/env node
|
|
81
|
+
const { spawnSync } = require('child_process');
|
|
82
|
+
|
|
83
|
+
// Check if Python and the evalforge PyPI package is installed
|
|
84
|
+
const check = spawnSync('python3', ['-m', 'evalforge', '--version']);
|
|
85
|
+
|
|
86
|
+
if (check.error || check.status !== 0) {
|
|
87
|
+
console.error("EvalForge requires Python. Please install EvalForge via pip first:");
|
|
88
|
+
console.error(" pip install evalforge");
|
|
89
|
+
process.exit(1);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// Pass execution to the Python CLI
|
|
93
|
+
const args = process.argv.slice(2);
|
|
94
|
+
const result = spawnSync('python3', ['-m', 'evalforge'].concat(args), { stdio: 'inherit' });
|
|
95
|
+
process.exit(result.status);
|
|
96
|
+
```
|
|
97
|
+
Make the script executable:
|
|
98
|
+
```bash
|
|
99
|
+
chmod +x bin/evalforge.js
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
4. **Publish to NPM**:
|
|
103
|
+
Log in and publish:
|
|
104
|
+
```bash
|
|
105
|
+
npm login
|
|
106
|
+
npm publish
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Once published, frontend and full-stack developers can use EvalForge directly in their pipelines via:
|
|
110
|
+
```bash
|
|
111
|
+
npx evalforge init
|
|
112
|
+
npx evalforge run
|
|
113
|
+
```
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# 🔨 EvalForge
|
|
2
|
+
|
|
3
|
+
**The ultimate evaluation platform for LLMs and Autonomous AI Agents.**
|
|
4
|
+
|
|
5
|
+
EvalForge simplifies the process of testing, evaluating, and iterating on Large Language Models and AI Agents. Whether you're benchmarking a simple LLM prompt block or evaluating a complex, multi-tool Kubernetes Agent using LangChain/OpenAI SDK, EvalForge provides deterministic scoring, conversational edge-case simulation, and detailed cost breakdowns directly in your terminal.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 🚀 Quick Start
|
|
10
|
+
|
|
11
|
+
EvalForge can be installed easily via pip or npm (using the wrapper).
|
|
12
|
+
|
|
13
|
+
**Python (PyPI)**
|
|
14
|
+
```bash
|
|
15
|
+
pip install evalforge
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
**Node / JavaScript (NPM Wrapper)**
|
|
19
|
+
```bash
|
|
20
|
+
npm install -g evalforge
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Initialize a Project
|
|
24
|
+
To scaffold a basic LLM or Agent project, run:
|
|
25
|
+
```bash
|
|
26
|
+
evalforge init
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 🧠 Model Evals (Classic LLM Mode)
|
|
32
|
+
|
|
33
|
+
Test standard Large Language Models for prompt coherence, factual extraction, and instruction adherence.
|
|
34
|
+
|
|
35
|
+
1. **Define a Suite**: Create an `llm_suite.yaml`:
|
|
36
|
+
```yaml
|
|
37
|
+
name: "Summarization Task"
|
|
38
|
+
description: "Tests a model's summarization capabilities"
|
|
39
|
+
type: llm
|
|
40
|
+
model: gpt-4o-mini
|
|
41
|
+
cases:
|
|
42
|
+
- id: "long_text_summary"
|
|
43
|
+
messages:
|
|
44
|
+
- role: "user"
|
|
45
|
+
content: "Summarize this long text: [text ...]"
|
|
46
|
+
scorers:
|
|
47
|
+
- type: "llm_judge"
|
|
48
|
+
threshold: 0.8
|
|
49
|
+
rubric: "Did the model summarize the text accurately in under 5 sentences?"
|
|
50
|
+
- type: "contains"
|
|
51
|
+
value: "summary"
|
|
52
|
+
threshold: 1.0
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
2. **Run the Suite**:
|
|
56
|
+
```bash
|
|
57
|
+
evalforge run llm_suite.yaml
|
|
58
|
+
```
|
|
59
|
+
You can easily override the model to benchmark across providers (OpenAI, Anthropic, Gemini, Ollama):
|
|
60
|
+
```bash
|
|
61
|
+
evalforge run llm_suite.yaml -m claude-3-haiku-20240307
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 🤖 Agent Evals (Autonomous Agent Mode)
|
|
67
|
+
|
|
68
|
+
EvalForge v0.2 introduces native support for Autonomous Agents. It dynamically generates edge-cases from your System Prompt and interfaces with any Python codebase (LangChain, LangGraph, standard OpenAI functions).
|
|
69
|
+
|
|
70
|
+
### 1. Build an Agent Suite
|
|
71
|
+
Define an agent suite specifying the `system_prompt` and pointing EvalForge to your application code.
|
|
72
|
+
|
|
73
|
+
```yaml
|
|
74
|
+
name: "Kubernetes Expert Agent"
|
|
75
|
+
description: "Tests a web-browsing agent's ability to document K8s topics."
|
|
76
|
+
version: 1.0.0
|
|
77
|
+
type: agent
|
|
78
|
+
parallel: 1
|
|
79
|
+
model: gpt-4o
|
|
80
|
+
agent:
|
|
81
|
+
adapter: python
|
|
82
|
+
config:
|
|
83
|
+
module: "k8s_bot.main"
|
|
84
|
+
function: "run_agent"
|
|
85
|
+
system_prompt: |
|
|
86
|
+
You are a Kubernetes Expert Agent. You have access to a web browsing tool to retrieve documentation.
|
|
87
|
+
Do not generate documentation yourself; strictly use the web tool to search for information.
|
|
88
|
+
Never modify or delete infrastructure without explicit user confirmation.
|
|
89
|
+
auto_generate:
|
|
90
|
+
enabled: true
|
|
91
|
+
num_cases: 6
|
|
92
|
+
cases: []
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 2. Auto-Generate Test Cases
|
|
96
|
+
EvalForge comes with an internal **LLM Validation Persona**. Using the `auto_generate` config block, EvalForge reads your system prompt and generates robust **Happy Paths**, **Conversational Edge Cases**, and **Boundary Limits**, saving them directly into your `suite.yaml`.
|
|
97
|
+
|
|
98
|
+
Run evaluation with generation:
|
|
99
|
+
```bash
|
|
100
|
+
evalforge run --regenerate suite.yaml
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### 3. Reviewing The Agent Results
|
|
104
|
+
EvalForge will display an elegant UI in the terminal tracking your agent's hallucination score (via NLI), tool invocations, and semantic understanding.
|
|
105
|
+
|
|
106
|
+
***Example Output Console:***
|
|
107
|
+
```text
|
|
108
|
+
Category Breakdown
|
|
109
|
+
|
|
110
|
+
Category Passed Pass Rate Progress
|
|
111
|
+
─────────────────────────────────────────────────────────────────────────
|
|
112
|
+
Create 0/1 0% ░░░░░░░░░░░░░░░
|
|
113
|
+
Edge 0/1 0% ░░░░░░░░░░░░░░░
|
|
114
|
+
Infrastructure 0/1 0% ░░░░░░░░░░░░░░░
|
|
115
|
+
Kubernetes 0/1 0% ░░░░░░░░░░░░░░░
|
|
116
|
+
Node 0/1 0% ░░░░░░░░░░░░░░░
|
|
117
|
+
Pods 0/1 0% ░░░░░░░░░░░░░░░
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
❌ Failed Cases (6)
|
|
121
|
+
|
|
122
|
+
list_pods_happy_path (gpt-4o)
|
|
123
|
+
→ llm_judge score=0.75: The response provides a relevant command to retrieve the desired information but...
|
|
124
|
+
→ hallucination score=0.29
|
|
125
|
+
Output: I currently don’t have the capability to directly interact with or retrieve live data from your Kubernetes cluster. Howe…
|
|
126
|
+
|
|
127
|
+
check_node_health_happy_path (gpt-4o)
|
|
128
|
+
→ llm_judge score=0.00: The response fails to provide any node names, health conditions, or warnings, ...
|
|
129
|
+
→ hallucination score=0.39
|
|
130
|
+
Output: I don't have the ability to check the health status of your nodes directly. However, I can guide you on how to do it usi…
|
|
131
|
+
|
|
132
|
+
╭─ 💰 Cost Breakdown ─────────────────────╮
|
|
133
|
+
│ Eval Cost: $0.00 per 100k runs (0%) │
|
|
134
|
+
│ Overhead: $0.00039 (100%) │
|
|
135
|
+
│ Total: $0.00039 │
|
|
136
|
+
╰─────────────────────────────────────────╯
|
|
137
|
+
|
|
138
|
+
╭───────────────────────────────────────────────────────────────────────╮
|
|
139
|
+
│ ❌ SUITE FAILED — 0.0% pass rate • 6 of 6 failed • run_d08fd895 │
|
|
140
|
+
╰───────────────────────────────────────────────────────────────────────╯
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 🧪 Evaluators & Scorers
|
|
146
|
+
|
|
147
|
+
EvalForge packages multiple deterministic and probabilistic scorers designed for accuracy tracking:
|
|
148
|
+
|
|
149
|
+
- `exact` & `contains`: Basic deterministic string matching.
|
|
150
|
+
- `regex`: Extracts pattern matching.
|
|
151
|
+
- `cosine`: Compares embeddings for similarity semantic checks.
|
|
152
|
+
- `bleu` / `rouge`: Machine-translation N-Gram comparisons using NLTK mechanisms.
|
|
153
|
+
- `llm_judge`: Asks an LLM to evaluate the string output based on a strictly provided `rubric` parameter.
|
|
154
|
+
- `hallucination`: Leverages multi-lingual Cross-Encoders and huggingface NLI models (e.g., `deberta-v3-small`) to verify if the model output hallucinates information not present in the reference documents.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## 🔧 Extending the Adapter
|
|
159
|
+
|
|
160
|
+
By default, the `PythonAdapter` executes any `def func(input_text: str, context: dict) -> AgentResponse:` compatible endpoint. You can integrate `OpenAI SDK`, `LangChainAgentRunners`, or `LangGraph` graphs simply by exposing their invoker to EvalForge!
|
|
161
|
+
|
|
162
|
+
1. Ensure your module imports and initializes properly. Address `sys.path` concerns.
|
|
163
|
+
2. Structure your return payload strictly around the `AgentResponse` parameters (e.g. `output`, `steps`, `tool_calls`, `token_usage`).
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## 🛠 Commands
|
|
168
|
+
|
|
169
|
+
| Command | Description |
|
|
170
|
+
|---------|-------------|
|
|
171
|
+
| `evalforge --version` | Print the current EvalForge Version |
|
|
172
|
+
| `evalforge init` | Bootstrap a scaffold suite structure |
|
|
173
|
+
| `evalforge run <suite.yaml>` | Run a defined eval suite against connected code |
|
|
174
|
+
| `evalforge run --regenerate` | Instruct the LLM generation persona to create new cases |
|
|
175
|
+
| `evalforge cost [run_id]` | Fetch historical evaluation usage costs |
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Licensing
|
|
180
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from evalforge.schema import AgentConfig
from evalforge.adapters.base import BaseAdapter


def get_adapter(config: AgentConfig) -> BaseAdapter:
    """Return the agent adapter instance selected by ``config.adapter``.

    Adapters are imported lazily inside each branch so that optional
    dependencies of unused adapters are never loaded.

    Args:
        config: Agent configuration whose ``adapter`` field names the
            backend ("python", "http", or "command").

    Returns:
        A constructed adapter wrapping ``config``.

    Raises:
        ValueError: If ``config.adapter`` is not a recognized adapter type.
    """
    kind = config.adapter
    if kind == "python":
        from evalforge.adapters.python_adapter import PythonAdapter

        return PythonAdapter(config)
    if kind == "http":
        from evalforge.adapters.http_adapter import HttpAdapter

        return HttpAdapter(config)
    if kind == "command":
        from evalforge.adapters.command_adapter import CommandAdapter

        return CommandAdapter(config)
    raise ValueError(f"Unknown adapter type: {config.adapter}")
|