evalforge 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. evalforge-0.2.0/.env.example +27 -0
  2. evalforge-0.2.0/.gitignore +18 -0
  3. evalforge-0.2.0/CHANGELOG.md +19 -0
  4. evalforge-0.2.0/LICENSE +21 -0
  5. evalforge-0.2.0/PKG-INFO +256 -0
  6. evalforge-0.2.0/PUBLISHING.md +113 -0
  7. evalforge-0.2.0/README.md +180 -0
  8. evalforge-0.2.0/evalforge/__init__.py +4 -0
  9. evalforge-0.2.0/evalforge/adapters/__init__.py +16 -0
  10. evalforge-0.2.0/evalforge/adapters/base.py +32 -0
  11. evalforge-0.2.0/evalforge/adapters/command_adapter.py +51 -0
  12. evalforge-0.2.0/evalforge/adapters/http_adapter.py +41 -0
  13. evalforge-0.2.0/evalforge/adapters/python_adapter.py +52 -0
  14. evalforge-0.2.0/evalforge/cost.py +278 -0
  15. evalforge-0.2.0/evalforge/generators/__init__.py +1 -0
  16. evalforge-0.2.0/evalforge/generators/auto_generate.py +212 -0
  17. evalforge-0.2.0/evalforge/generators/case_templates.py +94 -0
  18. evalforge-0.2.0/evalforge/generators/prompt_analyzer.py +94 -0
  19. evalforge-0.2.0/evalforge/history.py +109 -0
  20. evalforge-0.2.0/evalforge/main.py +802 -0
  21. evalforge-0.2.0/evalforge/providers.py +479 -0
  22. evalforge-0.2.0/evalforge/reporters/__init__.py +26 -0
  23. evalforge-0.2.0/evalforge/reporters/console.py +408 -0
  24. evalforge-0.2.0/evalforge/reporters/html_reporter.py +128 -0
  25. evalforge-0.2.0/evalforge/reporters/junit.py +62 -0
  26. evalforge-0.2.0/evalforge/reporters/markdown_reporter.py +54 -0
  27. evalforge-0.2.0/evalforge/runner.py +365 -0
  28. evalforge-0.2.0/evalforge/schema.py +264 -0
  29. evalforge-0.2.0/evalforge/scorers/__init__.py +58 -0
  30. evalforge-0.2.0/evalforge/scorers/bleu_rouge.py +92 -0
  31. evalforge-0.2.0/evalforge/scorers/contains.py +73 -0
  32. evalforge-0.2.0/evalforge/scorers/cosine.py +63 -0
  33. evalforge-0.2.0/evalforge/scorers/exact.py +42 -0
  34. evalforge-0.2.0/evalforge/scorers/hallucination.py +148 -0
  35. evalforge-0.2.0/evalforge/scorers/llm_judge.py +118 -0
  36. evalforge-0.2.0/evalforge/scorers/task_completion.py +83 -0
  37. evalforge-0.2.0/evalforge/scorers/trajectory.py +89 -0
  38. evalforge-0.2.0/evalforge/security.py +159 -0
  39. evalforge-0.2.0/evalforge/utils.py +133 -0
  40. evalforge-0.2.0/evalforge_build_guide.md +3434 -0
  41. evalforge-0.2.0/pyproject.toml +98 -0
  42. evalforge-0.2.0/suite.yaml +358 -0
@@ -0,0 +1,27 @@
1
+ # Copy this file to .env and fill in your values
2
+ # Never commit .env to git
3
+
4
+ # ── Required (at least one) ─────────────────────────
5
+ OPENAI_API_KEY=sk-your-openai-key-here
6
+ ANTHROPIC_API_KEY=sk-ant-your-anthropic-key-here
7
+
8
+ # ── Google Gemini ───────────────────────────────────
9
+ GOOGLE_API_KEY=AIza-your-google-api-key-here
10
+
11
+ # ── AWS Bedrock ─────────────────────────────────────
12
+ # AWS_ACCESS_KEY_ID=your-access-key
13
+ # AWS_SECRET_ACCESS_KEY=your-secret-key
14
+ # AWS_DEFAULT_REGION=us-east-1
15
+
16
+ # ── Ollama (local) ──────────────────────────────────
17
+ # OLLAMA_BASE_URL=http://localhost:11434
18
+
19
+ # ── OpenRouter ──────────────────────────────────────
20
+ # OPENROUTER_API_KEY=sk-or-your-key-here
21
+
22
+ # ── LiteLLM Proxy ──────────────────────────────────
23
+ # LITELLM_API_KEY=your-litellm-key
24
+
25
+ # ── EvalForge Settings ──────────────────────────────
26
+ EVALFORGE_DEFAULT_MODEL=gpt-4o-mini
27
+ EVALFORGE_HISTORY_DIR=~/.evalforge/history
@@ -0,0 +1,18 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ .env
6
+ .env.*
7
+ !.env.example
8
+ dist/
9
+ build/
10
+ *.egg-info/
11
+ .pytest_cache/
12
+ .mypy_cache/
13
+ .ruff_cache/
14
+ ~/.evalforge/
15
+ *.html
16
+ htmlcov/
17
+ .coverage
18
+ node_modules/
@@ -0,0 +1,19 @@
1
+ # Changelog
2
+
3
+ ## [0.2.0] - Unreleased
4
+
5
+ ### Added
6
+ - Initial release
7
+ - YAML-based eval suite format
8
+ - Scorers: exact, contains, regex, cosine, BLEU, ROUGE, LLM-judge
9
+ - Multi-model parallel evaluation (OpenAI, Anthropic, Gemini, Bedrock, Ollama)
10
+ - Multi-provider routing via providers.py
11
+ - Run history and comparison
12
+ - HTML, Markdown, JUnit reporters
13
+ - Auto test case generation from production logs
14
+ - Cost tracking with 31-model pricing table
15
+ - Session cost breakdown (eval vs overhead)
16
+ - Budget alerts and cumulative tracking
17
+ - Security hardening: API key masking, YAML sanitization, XSS prevention
18
+ - CI/CD integration with exit codes
19
+ - npm wrapper package for Node.js users
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 EvalForge Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalforge
3
+ Version: 0.2.0
4
+ Summary: The pytest of LLMs — build, version, and run LLM eval suites from the terminal
5
+ Project-URL: Homepage, https://github.com/yourusername/evalforge
6
+ Project-URL: Documentation, https://github.com/yourusername/evalforge#readme
7
+ Project-URL: Issues, https://github.com/yourusername/evalforge/issues
8
+ Project-URL: Changelog, https://github.com/yourusername/evalforge/blob/main/CHANGELOG.md
9
+ License: MIT License
10
+
11
+ Copyright (c) 2025 EvalForge Contributors
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: ai,cli,data-science,evaluation,llm,mlops,prompt-testing,testing
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Environment :: Console
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Intended Audience :: Science/Research
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
40
+ Classifier: Topic :: Software Development :: Testing
41
+ Requires-Python: >=3.11
42
+ Requires-Dist: anthropic>=0.25
43
+ Requires-Dist: httpx>=0.27
44
+ Requires-Dist: jinja2>=3.1
45
+ Requires-Dist: nltk>=3.8
46
+ Requires-Dist: openai>=1.0
47
+ Requires-Dist: pydantic>=2
48
+ Requires-Dist: python-dotenv>=1.0
49
+ Requires-Dist: pyyaml>=6.0
50
+ Requires-Dist: rich>=13
51
+ Requires-Dist: rouge-score>=0.1.2
52
+ Requires-Dist: scikit-learn>=1.4
53
+ Requires-Dist: sentence-transformers>=3.0
54
+ Requires-Dist: tenacity>=8.2
55
+ Requires-Dist: tiktoken>=0.7
56
+ Requires-Dist: typer>=0.12
57
+ Provides-Extra: all
58
+ Requires-Dist: boto3>=1.34; extra == 'all'
59
+ Requires-Dist: google-generativeai>=0.5; extra == 'all'
60
+ Requires-Dist: litellm>=1.40; extra == 'all'
61
+ Provides-Extra: bedrock
62
+ Requires-Dist: boto3>=1.34; extra == 'bedrock'
63
+ Provides-Extra: dev
64
+ Requires-Dist: build>=1.2; extra == 'dev'
65
+ Requires-Dist: mypy>=1.10; extra == 'dev'
66
+ Requires-Dist: pip-audit>=2.7; extra == 'dev'
67
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
68
+ Requires-Dist: pytest>=8; extra == 'dev'
69
+ Requires-Dist: ruff>=0.4; extra == 'dev'
70
+ Requires-Dist: twine>=5.0; extra == 'dev'
71
+ Provides-Extra: gemini
72
+ Requires-Dist: google-generativeai>=0.5; extra == 'gemini'
73
+ Provides-Extra: litellm
74
+ Requires-Dist: litellm>=1.40; extra == 'litellm'
75
+ Description-Content-Type: text/markdown
76
+
77
+ # 🔨 EvalForge
78
+
79
+ **The ultimate evaluation platform for LLMs and Autonomous AI Agents.**
80
+
81
+ EvalForge simplifies the process of testing, evaluating, and iterating on Large Language Models and AI Agents. Whether you're benchmarking a simple LLM prompt block or evaluating a complex, multi-tool Kubernetes Agent using LangChain/OpenAI SDK, EvalForge provides deterministic scoring, conversational edge-case simulation, and detailed cost breakdowns directly in your terminal.
82
+
83
+ ---
84
+
85
+ ## 🚀 Quick Start
86
+
87
+ EvalForge can be installed easily via pip or npm (using the wrapper).
88
+
89
+ **Python (PyPI)**
90
+ ```bash
91
+ pip install evalforge
92
+ ```
93
+
94
+ **Node / JavaScript (NPM Wrapper)**
95
+ ```bash
96
+ npm install -g evalforge
97
+ ```
98
+
99
+ ### Initialize a Project
100
+ To scaffold a basic LLM or Agent project, run:
101
+ ```bash
102
+ evalforge init
103
+ ```
104
+
105
+ ---
106
+
107
+ ## 🧠 Model Evals (Classic LLM Mode)
108
+
109
+ Test standard Large Language Models for prompt coherence, factual extraction, and instruction adherence.
110
+
111
+ 1. **Define a Suite**: Create an `llm_suite.yaml`:
112
+ ```yaml
113
+ name: "Summarization Task"
114
+ description: "Tests a model's summarization capabilities"
115
+ type: llm
116
+ model: gpt-4o-mini
117
+ cases:
118
+ - id: "long_text_summary"
119
+ messages:
120
+ - role: "user"
121
+ content: "Summarize this long text: [text ...]"
122
+ scorers:
123
+ - type: "llm_judge"
124
+ threshold: 0.8
125
+ rubric: "Did the model summarize the text accurately in under 5 sentences?"
126
+ - type: "contains"
127
+ value: "summary"
128
+ threshold: 1.0
129
+ ```
130
+
131
+ 2. **Run the Suite**:
132
+ ```bash
133
+ evalforge run llm_suite.yaml
134
+ ```
135
+ You can easily override the model to benchmark across providers (OpenAI, Anthropic, Gemini, Ollama):
136
+ ```bash
137
+ evalforge run llm_suite.yaml -m claude-3-haiku-20240307
138
+ ```
139
+
140
+ ---
141
+
142
+ ## 🤖 Agent Evals (Autonomous Agent Mode)
143
+
144
+ EvalForge v0.2 introduces native support for Autonomous Agents. It dynamically generates edge-cases from your System Prompt and interfaces with any Python codebase (LangChain, LangGraph, standard OpenAI functions).
145
+
146
+ ### 1. Build an Agent Suite
147
+ Define an agent suite specifying the `system_prompt` and pointing EvalForge to your application code.
148
+
149
+ ```yaml
150
+ name: "Kubernetes Expert Agent"
151
+ description: "Tests a web-browsing agent's ability to document K8s topics."
152
+ version: 1.0.0
153
+ type: agent
154
+ parallel: 1
155
+ model: gpt-4o
156
+ agent:
157
+ adapter: python
158
+ config:
159
+ module: "k8s_bot.main"
160
+ function: "run_agent"
161
+ system_prompt: |
162
+ You are a Kubernetes Expert Agent. You have access to a web browsing tool to retrieve documentation.
163
+ Do not generate documentation yourself; strictly use the web tool to search for information.
164
+ Never modify or delete infrastructure without explicit user confirmation.
165
+ auto_generate:
166
+ enabled: true
167
+ num_cases: 6
168
+ cases: []
169
+ ```
170
+
171
+ ### 2. Auto-Generate Test Cases
172
+ EvalForge comes with an internal **LLM Validation Persona**. Using the `auto_generate` config block, EvalForge reads your system prompt and generates robust **Happy Paths**, **Conversational Edge Cases**, and **Boundary Limits**, saving them directly into your `suite.yaml`.
173
+
174
+ Run evaluation with generation:
175
+ ```bash
176
+ evalforge run --regenerate suite.yaml
177
+ ```
178
+
179
+ ### 3. Reviewing The Agent Results
180
+ EvalForge will display an elegant UI in the terminal tracking your agent's hallucination score (via NLI), tool invocations, and semantic understanding.
181
+
182
+ ***Example Output Console:***
183
+ ```text
184
+ Category Breakdown
185
+
186
+ Category Passed Pass Rate Progress
187
+ ─────────────────────────────────────────────────────────────────────────
188
+ Create 0/1 0% ░░░░░░░░░░░░░░░
189
+ Edge 0/1 0% ░░░░░░░░░░░░░░░
190
+ Infrastructure 0/1 0% ░░░░░░░░░░░░░░░
191
+ Kubernetes 0/1 0% ░░░░░░░░░░░░░░░
192
+ Node 0/1 0% ░░░░░░░░░░░░░░░
193
+ Pods 0/1 0% ░░░░░░░░░░░░░░░
194
+
195
+
196
+ ❌ Failed Cases (6)
197
+
198
+ list_pods_happy_path (gpt-4o)
199
+ → llm_judge score=0.75: The response provides a relevant command to retrieve the desired information but...
200
+ → hallucination score=0.29
201
+ Output: I currently don’t have the capability to directly interact with or retrieve live data from your Kubernetes cluster. Howe…
202
+
203
+ check_node_health_happy_path (gpt-4o)
204
+ → llm_judge score=0.00: The response fails to provide any node names, health conditions, or warnings, ...
205
+ → hallucination score=0.39
206
+ Output: I don't have the ability to check the health status of your nodes directly. However, I can guide you on how to do it usi…
207
+
208
+ ╭─ 💰 Cost Breakdown ─────────────────────╮
209
+ │ Eval Cost: $0.00 per 100k runs (0%) │
210
+ │ Overhead: $0.00039 (100%) │
211
+ │ Total: $0.00039 │
212
+ ╰─────────────────────────────────────────╯
213
+
214
+ ╭───────────────────────────────────────────────────────────────────────╮
215
+ │ ❌ SUITE FAILED — 0.0% pass rate • 6 of 6 failed • run_d08fd895 │
216
+ ╰───────────────────────────────────────────────────────────────────────╯
217
+ ```
218
+
219
+ ---
220
+
221
+ ## 🧪 Evaluators & Scorers
222
+
223
+ EvalForge packages multiple deterministic and probabilistic scorers designed for accuracy tracking:
224
+
225
+ - `exact` & `contains`: Basic deterministic string matching.
226
+ - `regex`: Regular-expression pattern matching and extraction.
227
+ - `cosine`: Compares embeddings for semantic similarity checks.
228
+ - `bleu` / `rouge`: Machine-translation N-Gram comparisons using NLTK mechanisms.
229
+ - `llm_judge`: Asks an LLM to evaluate the string output based on a strictly provided `rubric` parameter.
230
+ - `hallucination`: Leverages multi-lingual Cross-Encoders and huggingface NLI models (e.g., `deberta-v3-small`) to verify if the model output hallucinates information not present in the reference documents.
231
+
232
+ ---
233
+
234
+ ## 🔧 Extending the Adapter
235
+
236
+ By default, the `PythonAdapter` executes any `def func(input_text: str, context: dict) -> AgentResponse:` compatible endpoint. You can integrate `OpenAI SDK`, `LangChainAgentRunners`, or `LangGraph` graphs simply by exposing their invoker to EvalForge!
237
+
238
+ 1. Ensure your module imports and initializes properly. Address `sys.path` concerns.
239
+ 2. Structure your return payload strictly around the `AgentResponse` parameters (e.g. `output`, `steps`, `tool_calls`, `token_usage`).
240
+
241
+ ---
242
+
243
+ ## 🛠 Commands
244
+
245
+ | Command | Description |
246
+ |---------|-------------|
247
+ | `evalforge --version` | Print the current EvalForge Version |
248
+ | `evalforge init` | Bootstrap a scaffold suite structure |
249
+ | `evalforge run <suite.yaml>` | Run a defined eval suite against connected code |
250
+ | `evalforge run --regenerate` | Instruct the LLM generation persona to create new cases |
251
+ | `evalforge cost [run_id]` | Fetch historical evaluation usage costs |
252
+
253
+ ---
254
+
255
+ ## Licensing
256
+ This project is licensed under the MIT License.
@@ -0,0 +1,113 @@
1
+ # Publishing EvalForge
2
+
3
+ This guide details the steps to publish EvalForge to both **PyPI** (Python Package Index) and **NPM** (Node Package Manager) to make it accessible to developers across different ecosystems.
4
+
5
+ ---
6
+
7
+ ## 🐍 1. Publishing to PyPI
8
+
9
+ Since EvalForge is built in Python, PyPI is its native distribution channel. We use `poetry` (or standard `build` and `twine`) for publishing. Since we have a `pyproject.toml`, we can use standard build tools.
10
+
11
+ ### Prerequisites
12
+ 1. Ensure you have a registered account on [PyPI](https://pypi.org).
13
+ 2. Install build and deployment tools:
14
+ ```bash
15
+ pip install build twine
16
+ ```
17
+
18
+ ### Step-by-step Guide
19
+
20
+ 1. **Update Version**: Ensure the version in `pyproject.toml` and `evalforge/__init__.py` is updated (e.g., `0.2.0`).
21
+ 2. **Build the Package**:
22
+ Run the following command in the root directory (where `pyproject.toml` is located):
23
+ ```bash
24
+ python -m build
25
+ ```
26
+ This will generate the distribution archives in a `dist/` folder (`.tar.gz` and `.whl` files).
27
+ 3. **Check the Build**:
28
+ Verify the built package doesn't have formatting errors:
29
+ ```bash
30
+ twine check dist/*
31
+ ```
32
+ 4. **Publish to PyPI**:
33
+ Upload the exact build to PyPI:
34
+ ```bash
35
+ twine upload dist/*
36
+ ```
37
+ *You will be prompted for your username (usually `__token__`) and your PyPI API token.*
38
+
39
+ Once completed, users can install via:
40
+ ```bash
41
+ pip install evalforge
42
+ ```
43
+
44
+ ---
45
+
46
+ ## 📦 2. Publishing to NPM
47
+
48
+ To make EvalForge available to the Node/JS ecosystem (so JS developers can use `npx evalforge`), we publish an NPM wrapper package that invokes the Python CLI.
49
+
50
+ ### Prerequisites
51
+ 1. An account on [npmjs.com](https://npmjs.com).
52
+ 2. Node.js and `npm` installed.
53
+
54
+ ### Step-by-step Guide
55
+
56
+ 1. **Create the NPM Wrapper Directory**:
57
+ Create a folder named `npm_package` within your repo (or at the root if you prefer to mix Python/Node) and initialize it:
58
+ ```bash
59
+ mkdir npm_package && cd npm_package
60
+ npm init -y
61
+ ```
62
+ 2. **Configure `package.json`**:
63
+ Edit `package.json` to configure the binary script.
64
+ ```json
65
+ {
66
+ "name": "evalforge",
67
+ "version": "0.2.0",
68
+ "description": "EvalForge: The ultimate AI Agent Evaluation Platform",
69
+ "bin": {
70
+ "evalforge": "bin/evalforge.js"
71
+ },
72
+ "scripts": {},
73
+ "author": "Your Name",
74
+ "license": "MIT"
75
+ }
76
+ ```
77
+ 3. **Create the Wrapper Script**:
78
+ Create a directory `bin` and a file `bin/evalforge.js`:
79
+ ```javascript
80
+ #!/usr/bin/env node
81
+ const { spawnSync } = require('child_process');
82
+
83
+ // Check if Python and the evalforge PyPI package is installed
84
+ const check = spawnSync('python3', ['-m', 'evalforge', '--version']);
85
+
86
+ if (check.error || check.status !== 0) {
87
+ console.error("EvalForge requires Python. Please install EvalForge via pip first:");
88
+ console.error(" pip install evalforge");
89
+ process.exit(1);
90
+ }
91
+
92
+ // Pass execution to the Python CLI
93
+ const args = process.argv.slice(2);
94
+ const result = spawnSync('python3', ['-m', 'evalforge'].concat(args), { stdio: 'inherit' });
95
+ process.exit(result.status);
96
+ ```
97
+ Make the script executable:
98
+ ```bash
99
+ chmod +x bin/evalforge.js
100
+ ```
101
+
102
+ 4. **Publish to NPM**:
103
+ Log in and publish:
104
+ ```bash
105
+ npm login
106
+ npm publish
107
+ ```
108
+
109
+ Once published, frontend and full-stack developers can use EvalForge directly in their pipelines via:
110
+ ```bash
111
+ npx evalforge init
112
+ npx evalforge run
113
+ ```
@@ -0,0 +1,180 @@
1
+ # 🔨 EvalForge
2
+
3
+ **The ultimate evaluation platform for LLMs and Autonomous AI Agents.**
4
+
5
+ EvalForge simplifies the process of testing, evaluating, and iterating on Large Language Models and AI Agents. Whether you're benchmarking a simple LLM prompt block or evaluating a complex, multi-tool Kubernetes Agent using LangChain/OpenAI SDK, EvalForge provides deterministic scoring, conversational edge-case simulation, and detailed cost breakdowns directly in your terminal.
6
+
7
+ ---
8
+
9
+ ## 🚀 Quick Start
10
+
11
+ EvalForge can be installed easily via pip or npm (using the wrapper).
12
+
13
+ **Python (PyPI)**
14
+ ```bash
15
+ pip install evalforge
16
+ ```
17
+
18
+ **Node / JavaScript (NPM Wrapper)**
19
+ ```bash
20
+ npm install -g evalforge
21
+ ```
22
+
23
+ ### Initialize a Project
24
+ To scaffold a basic LLM or Agent project, run:
25
+ ```bash
26
+ evalforge init
27
+ ```
28
+
29
+ ---
30
+
31
+ ## 🧠 Model Evals (Classic LLM Mode)
32
+
33
+ Test standard Large Language Models for prompt coherence, factual extraction, and instruction adherence.
34
+
35
+ 1. **Define a Suite**: Create an `llm_suite.yaml`:
36
+ ```yaml
37
+ name: "Summarization Task"
38
+ description: "Tests a model's summarization capabilities"
39
+ type: llm
40
+ model: gpt-4o-mini
41
+ cases:
42
+ - id: "long_text_summary"
43
+ messages:
44
+ - role: "user"
45
+ content: "Summarize this long text: [text ...]"
46
+ scorers:
47
+ - type: "llm_judge"
48
+ threshold: 0.8
49
+ rubric: "Did the model summarize the text accurately in under 5 sentences?"
50
+ - type: "contains"
51
+ value: "summary"
52
+ threshold: 1.0
53
+ ```
54
+
55
+ 2. **Run the Suite**:
56
+ ```bash
57
+ evalforge run llm_suite.yaml
58
+ ```
59
+ You can easily override the model to benchmark across providers (OpenAI, Anthropic, Gemini, Ollama):
60
+ ```bash
61
+ evalforge run llm_suite.yaml -m claude-3-haiku-20240307
62
+ ```
63
+
64
+ ---
65
+
66
+ ## 🤖 Agent Evals (Autonomous Agent Mode)
67
+
68
+ EvalForge v0.2 introduces native support for Autonomous Agents. It dynamically generates edge-cases from your System Prompt and interfaces with any Python codebase (LangChain, LangGraph, standard OpenAI functions).
69
+
70
+ ### 1. Build an Agent Suite
71
+ Define an agent suite specifying the `system_prompt` and pointing EvalForge to your application code.
72
+
73
+ ```yaml
74
+ name: "Kubernetes Expert Agent"
75
+ description: "Tests a web-browsing agent's ability to document K8s topics."
76
+ version: 1.0.0
77
+ type: agent
78
+ parallel: 1
79
+ model: gpt-4o
80
+ agent:
81
+ adapter: python
82
+ config:
83
+ module: "k8s_bot.main"
84
+ function: "run_agent"
85
+ system_prompt: |
86
+ You are a Kubernetes Expert Agent. You have access to a web browsing tool to retrieve documentation.
87
+ Do not generate documentation yourself; strictly use the web tool to search for information.
88
+ Never modify or delete infrastructure without explicit user confirmation.
89
+ auto_generate:
90
+ enabled: true
91
+ num_cases: 6
92
+ cases: []
93
+ ```
94
+
95
+ ### 2. Auto-Generate Test Cases
96
+ EvalForge comes with an internal **LLM Validation Persona**. Using the `auto_generate` config block, EvalForge reads your system prompt and generates robust **Happy Paths**, **Conversational Edge Cases**, and **Boundary Limits**, saving them directly into your `suite.yaml`.
97
+
98
+ Run evaluation with generation:
99
+ ```bash
100
+ evalforge run --regenerate suite.yaml
101
+ ```
102
+
103
+ ### 3. Reviewing The Agent Results
104
+ EvalForge will display an elegant UI in the terminal tracking your agent's hallucination score (via NLI), tool invocations, and semantic understanding.
105
+
106
+ ***Example Output Console:***
107
+ ```text
108
+ Category Breakdown
109
+
110
+ Category Passed Pass Rate Progress
111
+ ─────────────────────────────────────────────────────────────────────────
112
+ Create 0/1 0% ░░░░░░░░░░░░░░░
113
+ Edge 0/1 0% ░░░░░░░░░░░░░░░
114
+ Infrastructure 0/1 0% ░░░░░░░░░░░░░░░
115
+ Kubernetes 0/1 0% ░░░░░░░░░░░░░░░
116
+ Node 0/1 0% ░░░░░░░░░░░░░░░
117
+ Pods 0/1 0% ░░░░░░░░░░░░░░░
118
+
119
+
120
+ ❌ Failed Cases (6)
121
+
122
+ list_pods_happy_path (gpt-4o)
123
+ → llm_judge score=0.75: The response provides a relevant command to retrieve the desired information but...
124
+ → hallucination score=0.29
125
+ Output: I currently don’t have the capability to directly interact with or retrieve live data from your Kubernetes cluster. Howe…
126
+
127
+ check_node_health_happy_path (gpt-4o)
128
+ → llm_judge score=0.00: The response fails to provide any node names, health conditions, or warnings, ...
129
+ → hallucination score=0.39
130
+ Output: I don't have the ability to check the health status of your nodes directly. However, I can guide you on how to do it usi…
131
+
132
+ ╭─ 💰 Cost Breakdown ─────────────────────╮
133
+ │ Eval Cost: $0.00 per 100k runs (0%) │
134
+ │ Overhead: $0.00039 (100%) │
135
+ │ Total: $0.00039 │
136
+ ╰─────────────────────────────────────────╯
137
+
138
+ ╭───────────────────────────────────────────────────────────────────────╮
139
+ │ ❌ SUITE FAILED — 0.0% pass rate • 6 of 6 failed • run_d08fd895 │
140
+ ╰───────────────────────────────────────────────────────────────────────╯
141
+ ```
142
+
143
+ ---
144
+
145
+ ## 🧪 Evaluators & Scorers
146
+
147
+ EvalForge packages multiple deterministic and probabilistic scorers designed for accuracy tracking:
148
+
149
+ - `exact` & `contains`: Basic deterministic string matching.
150
+ - `regex`: Regular-expression pattern matching and extraction.
151
+ - `cosine`: Compares embeddings for semantic similarity checks.
152
+ - `bleu` / `rouge`: Machine-translation N-Gram comparisons using NLTK mechanisms.
153
+ - `llm_judge`: Asks an LLM to evaluate the string output based on a strictly provided `rubric` parameter.
154
+ - `hallucination`: Leverages multi-lingual Cross-Encoders and huggingface NLI models (e.g., `deberta-v3-small`) to verify if the model output hallucinates information not present in the reference documents.
155
+
156
+ ---
157
+
158
+ ## 🔧 Extending the Adapter
159
+
160
+ By default, the `PythonAdapter` executes any `def func(input_text: str, context: dict) -> AgentResponse:` compatible endpoint. You can integrate `OpenAI SDK`, `LangChainAgentRunners`, or `LangGraph` graphs simply by exposing their invoker to EvalForge!
161
+
162
+ 1. Ensure your module imports and initializes properly. Address `sys.path` concerns.
163
+ 2. Structure your return payload strictly around the `AgentResponse` parameters (e.g. `output`, `steps`, `tool_calls`, `token_usage`).
164
+
165
+ ---
166
+
167
+ ## 🛠 Commands
168
+
169
+ | Command | Description |
170
+ |---------|-------------|
171
+ | `evalforge --version` | Print the current EvalForge Version |
172
+ | `evalforge init` | Bootstrap a scaffold suite structure |
173
+ | `evalforge run <suite.yaml>` | Run a defined eval suite against connected code |
174
+ | `evalforge run --regenerate` | Instruct the LLM generation persona to create new cases |
175
+ | `evalforge cost [run_id]` | Fetch historical evaluation usage costs |
176
+
177
+ ---
178
+
179
+ ## Licensing
180
+ This project is licensed under the MIT License.
@@ -0,0 +1,4 @@
1
"""evalforge — The pytest of LLMs."""

# NOTE: keep __version__ in sync with the `version` field in pyproject.toml —
# PUBLISHING.md requires both to be bumped together before a release build.
__version__ = "0.2.0"
# Package author attribution, matching the LICENSE copyright holder.
__author__ = "EvalForge Contributors"
@@ -0,0 +1,16 @@
1
+ from evalforge.schema import AgentConfig
2
+ from evalforge.adapters.base import BaseAdapter
3
+
4
def get_adapter(config: AgentConfig) -> BaseAdapter:
    """Return the adapter instance that matches ``config.adapter``.

    Each adapter class is imported lazily inside its branch so that the
    dependencies of unused backends are never loaded.

    Raises:
        ValueError: if ``config.adapter`` is not a known adapter type.
    """
    kind = config.adapter
    if kind == "command":
        from evalforge.adapters.command_adapter import CommandAdapter
        return CommandAdapter(config)
    if kind == "http":
        from evalforge.adapters.http_adapter import HttpAdapter
        return HttpAdapter(config)
    if kind == "python":
        from evalforge.adapters.python_adapter import PythonAdapter
        return PythonAdapter(config)
    raise ValueError(f"Unknown adapter type: {config.adapter}")