substrai-evalforge 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- substrai_evalforge-0.1.0/LICENSE +21 -0
- substrai_evalforge-0.1.0/PKG-INFO +129 -0
- substrai_evalforge-0.1.0/README.md +105 -0
- substrai_evalforge-0.1.0/pyproject.toml +40 -0
- substrai_evalforge-0.1.0/setup.cfg +4 -0
- substrai_evalforge-0.1.0/src/evalforge/__init__.py +31 -0
- substrai_evalforge-0.1.0/src/evalforge/cli/__init__.py +2 -0
- substrai_evalforge-0.1.0/src/evalforge/cli/main.py +182 -0
- substrai_evalforge-0.1.0/src/evalforge/core/__init__.py +4 -0
- substrai_evalforge-0.1.0/src/evalforge/core/config.py +253 -0
- substrai_evalforge-0.1.0/src/evalforge/core/pipeline.py +186 -0
- substrai_evalforge-0.1.0/src/evalforge/core/result.py +125 -0
- substrai_evalforge-0.1.0/src/evalforge/drift/__init__.py +0 -0
- substrai_evalforge-0.1.0/src/evalforge/generators/__init__.py +0 -0
- substrai_evalforge-0.1.0/src/evalforge/metrics/__init__.py +7 -0
- substrai_evalforge-0.1.0/src/evalforge/metrics/base.py +62 -0
- substrai_evalforge-0.1.0/src/evalforge/metrics/classification.py +59 -0
- substrai_evalforge-0.1.0/src/evalforge/metrics/rag.py +176 -0
- substrai_evalforge-0.1.0/src/evalforge/metrics/registry.py +85 -0
- substrai_evalforge-0.1.0/src/evalforge/metrics/safety.py +108 -0
- substrai_evalforge-0.1.0/src/evalforge/metrics/text.py +217 -0
- substrai_evalforge-0.1.0/src/evalforge/pipeline/__init__.py +0 -0
- substrai_evalforge-0.1.0/src/evalforge/plugins/__init__.py +0 -0
- substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/PKG-INFO +129 -0
- substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/SOURCES.txt +30 -0
- substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/dependency_links.txt +1 -0
- substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/entry_points.txt +2 -0
- substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/requires.txt +8 -0
- substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/top_level.txt +1 -0
- substrai_evalforge-0.1.0/tests/test_config.py +93 -0
- substrai_evalforge-0.1.0/tests/test_metrics.py +181 -0
- substrai_evalforge-0.1.0/tests/test_pipeline.py +98 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Gaurav Kumar Sinha (Substrai AI)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: substrai-evalforge
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automated LLM evaluation pipeline generator
|
|
5
|
+
Author-email: Gaurav Kumar Sinha <gaurav@substrai.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/substrai/evalforge
|
|
8
|
+
Project-URL: Repository, https://github.com/substrai/evalforge
|
|
9
|
+
Keywords: llm,evaluation,testing,mlops,genai,rag,metrics,pipeline,serverless,aws-lambda
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: pyyaml>=6.0
|
|
18
|
+
Provides-Extra: aws
|
|
19
|
+
Requires-Dist: boto3>=1.28.0; extra == "aws"
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# EvalForge
|
|
26
|
+
|
|
27
|
+
**Automated LLM evaluation pipeline generator.**
|
|
28
|
+
|
|
29
|
+
> Built by [SubstrAI](https://github.com/substrai) — Open-source GenAI frameworks for serverless infrastructure.
|
|
30
|
+
|
|
31
|
+
[PyPI](https://pypi.org/project/substrai-evalforge/)
|
|
32
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
33
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
34
|
+
|
|
35
|
+
## The Problem
|
|
36
|
+
|
|
37
|
+
Every team deploying LLMs builds evaluation pipelines from scratch. RAGAS and DeepEval are libraries — they don't generate infrastructure, schedule runs, detect drift, or route to human reviewers.
|
|
38
|
+
|
|
39
|
+
## The Solution
|
|
40
|
+
|
|
41
|
+
Describe your use case → EvalForge generates the complete evaluation pipeline:
|
|
42
|
+
|
|
43
|
+
```yaml
|
|
44
|
+
# evalforge.yaml
|
|
45
|
+
use_case:
|
|
46
|
+
type: rag
|
|
47
|
+
description: "Customer support chatbot"
|
|
48
|
+
model:
|
|
49
|
+
provider: bedrock
|
|
50
|
+
model_id: anthropic.claude-3-haiku-20240307-v1:0
|
|
51
|
+
|
|
52
|
+
evaluation:
|
|
53
|
+
metrics: auto # auto-selects: faithfulness, relevancy, precision, recall, toxicity
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
evalforge run
|
|
58
|
+
# Faithfulness: 0.91 ✓ (threshold: 0.85)
|
|
59
|
+
# Answer Relevancy: 0.87 ✓ (threshold: 0.80)
|
|
60
|
+
# Context Precision: 0.78 ✓ (threshold: 0.75)
|
|
61
|
+
# Toxicity: 0.02 ✓ (threshold: 0.05)
|
|
62
|
+
# Overall: PASS (4/4 metrics passing)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Features
|
|
66
|
+
|
|
67
|
+
- **Use-case-driven metric selection** — describe your app, get optimal metrics
|
|
68
|
+
- **6 use case types** — RAG, summarization, classification, generation, chat, code
|
|
69
|
+
- **16+ built-in metrics** — faithfulness, ROUGE, BLEU, toxicity, injection resistance, F1
|
|
70
|
+
- **Synthetic test data generation** — adversarial, edge cases, domain-specific
|
|
71
|
+
- **Drift detection** — alerts when quality degrades over time
|
|
72
|
+
- **Human-in-the-loop** — route uncertain evaluations to reviewers
|
|
73
|
+
- **Scheduled pipelines** — daily/weekly automated evaluation runs
|
|
74
|
+
- **Benchmark registry** — compare against published benchmarks
|
|
75
|
+
- **One-command deploy** — Step Functions + Lambda infrastructure
|
|
76
|
+
|
|
77
|
+
## Installation
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install substrai-evalforge
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Quick Start
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Scaffold project
|
|
87
|
+
evalforge init my-eval --use-case rag
|
|
88
|
+
|
|
89
|
+
# Run evaluation
|
|
90
|
+
cd my-eval
|
|
91
|
+
evalforge run
|
|
92
|
+
|
|
93
|
+
# List available metrics
|
|
94
|
+
evalforge metrics --use-case rag
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Python SDK
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from evalforge import EvalPipeline
|
|
101
|
+
|
|
102
|
+
# Quick start for any use case
|
|
103
|
+
pipeline = EvalPipeline.for_use_case("rag")
|
|
104
|
+
results = pipeline.run()
|
|
105
|
+
print(results.summary())
|
|
106
|
+
print(f"All passing: {results.all_passing}")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Supported Use Cases & Auto-Selected Metrics
|
|
110
|
+
|
|
111
|
+
| Use Case | Auto-Selected Metrics |
|
|
112
|
+
|---|---|
|
|
113
|
+
| **rag** | faithfulness, answer_relevancy, context_precision, context_recall, toxicity |
|
|
114
|
+
| **summarization** | rouge_l, bleu, coherence, conciseness, fluency |
|
|
115
|
+
| **classification** | accuracy, precision, recall, f1_score |
|
|
116
|
+
| **generation** | fluency, coherence, toxicity, bias_detection |
|
|
117
|
+
| **chat** | coherence, toxicity, injection_resistance, fluency |
|
|
118
|
+
| **code** | accuracy, coherence |
|
|
119
|
+
|
|
120
|
+
## License
|
|
121
|
+
|
|
122
|
+
MIT — see [LICENSE](LICENSE)
|
|
123
|
+
|
|
124
|
+
## Author
|
|
125
|
+
|
|
126
|
+
**Gaurav Kumar Sinha** — Founder, [SubstrAI](https://github.com/substrai)
|
|
127
|
+
|
|
128
|
+
- Email: gaurav@substrai.dev
|
|
129
|
+
- GitHub: [@substrai](https://github.com/substrai)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# EvalForge
|
|
2
|
+
|
|
3
|
+
**Automated LLM evaluation pipeline generator.**
|
|
4
|
+
|
|
5
|
+
> Built by [SubstrAI](https://github.com/substrai) — Open-source GenAI frameworks for serverless infrastructure.
|
|
6
|
+
|
|
7
|
+
[PyPI](https://pypi.org/project/substrai-evalforge/)
|
|
8
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
9
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
10
|
+
|
|
11
|
+
## The Problem
|
|
12
|
+
|
|
13
|
+
Every team deploying LLMs builds evaluation pipelines from scratch. RAGAS and DeepEval are libraries — they don't generate infrastructure, schedule runs, detect drift, or route to human reviewers.
|
|
14
|
+
|
|
15
|
+
## The Solution
|
|
16
|
+
|
|
17
|
+
Describe your use case → EvalForge generates the complete evaluation pipeline:
|
|
18
|
+
|
|
19
|
+
```yaml
|
|
20
|
+
# evalforge.yaml
|
|
21
|
+
use_case:
|
|
22
|
+
type: rag
|
|
23
|
+
description: "Customer support chatbot"
|
|
24
|
+
model:
|
|
25
|
+
provider: bedrock
|
|
26
|
+
model_id: anthropic.claude-3-haiku-20240307-v1:0
|
|
27
|
+
|
|
28
|
+
evaluation:
|
|
29
|
+
metrics: auto # auto-selects: faithfulness, relevancy, precision, recall, toxicity
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
evalforge run
|
|
34
|
+
# Faithfulness: 0.91 ✓ (threshold: 0.85)
|
|
35
|
+
# Answer Relevancy: 0.87 ✓ (threshold: 0.80)
|
|
36
|
+
# Context Precision: 0.78 ✓ (threshold: 0.75)
|
|
37
|
+
# Toxicity: 0.02 ✓ (threshold: 0.05)
|
|
38
|
+
# Overall: PASS (4/4 metrics passing)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- **Use-case-driven metric selection** — describe your app, get optimal metrics
|
|
44
|
+
- **6 use case types** — RAG, summarization, classification, generation, chat, code
|
|
45
|
+
- **16+ built-in metrics** — faithfulness, ROUGE, BLEU, toxicity, injection resistance, F1
|
|
46
|
+
- **Synthetic test data generation** — adversarial, edge cases, domain-specific
|
|
47
|
+
- **Drift detection** — alerts when quality degrades over time
|
|
48
|
+
- **Human-in-the-loop** — route uncertain evaluations to reviewers
|
|
49
|
+
- **Scheduled pipelines** — daily/weekly automated evaluation runs
|
|
50
|
+
- **Benchmark registry** — compare against published benchmarks
|
|
51
|
+
- **One-command deploy** — Step Functions + Lambda infrastructure
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install substrai-evalforge
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Quick Start
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Scaffold project
|
|
63
|
+
evalforge init my-eval --use-case rag
|
|
64
|
+
|
|
65
|
+
# Run evaluation
|
|
66
|
+
cd my-eval
|
|
67
|
+
evalforge run
|
|
68
|
+
|
|
69
|
+
# List available metrics
|
|
70
|
+
evalforge metrics --use-case rag
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Python SDK
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from evalforge import EvalPipeline
|
|
77
|
+
|
|
78
|
+
# Quick start for any use case
|
|
79
|
+
pipeline = EvalPipeline.for_use_case("rag")
|
|
80
|
+
results = pipeline.run()
|
|
81
|
+
print(results.summary())
|
|
82
|
+
print(f"All passing: {results.all_passing}")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Supported Use Cases & Auto-Selected Metrics
|
|
86
|
+
|
|
87
|
+
| Use Case | Auto-Selected Metrics |
|
|
88
|
+
|---|---|
|
|
89
|
+
| **rag** | faithfulness, answer_relevancy, context_precision, context_recall, toxicity |
|
|
90
|
+
| **summarization** | rouge_l, bleu, coherence, conciseness, fluency |
|
|
91
|
+
| **classification** | accuracy, precision, recall, f1_score |
|
|
92
|
+
| **generation** | fluency, coherence, toxicity, bias_detection |
|
|
93
|
+
| **chat** | coherence, toxicity, injection_resistance, fluency |
|
|
94
|
+
| **code** | accuracy, coherence |
|
|
95
|
+
|
|
96
|
+
## License
|
|
97
|
+
|
|
98
|
+
MIT — see [LICENSE](LICENSE)
|
|
99
|
+
|
|
100
|
+
## Author
|
|
101
|
+
|
|
102
|
+
**Gaurav Kumar Sinha** — Founder, [SubstrAI](https://github.com/substrai)
|
|
103
|
+
|
|
104
|
+
- Email: gaurav@substrai.dev
|
|
105
|
+
- GitHub: [@substrai](https://github.com/substrai)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "substrai-evalforge"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Automated LLM evaluation pipeline generator"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Gaurav Kumar Sinha", email = "gaurav@substrai.dev"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["llm", "evaluation", "testing", "mlops", "genai", "rag", "metrics", "pipeline", "serverless", "aws-lambda"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dependencies = ["pyyaml>=6.0"]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
aws = ["boto3>=1.28.0"]
|
|
26
|
+
dev = ["pytest>=7.0", "pytest-cov>=4.0"]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
evalforge = "evalforge.cli.main:main"
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/substrai/evalforge"
|
|
33
|
+
Repository = "https://github.com/substrai/evalforge"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
where = ["src"]
|
|
37
|
+
|
|
38
|
+
[tool.pytest.ini_options]
|
|
39
|
+
testpaths = ["tests"]
|
|
40
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EvalForge - Automated LLM Evaluation Pipeline Generator
|
|
3
|
+
|
|
4
|
+
Describe your GenAI use case in a config file, and EvalForge generates
|
|
5
|
+
the complete evaluation infrastructure: metrics selection, test data,
|
|
6
|
+
scheduled pipelines, drift detection, and human-in-the-loop review.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from evalforge import EvalPipeline, EvalConfig, EvalResult
|
|
10
|
+
|
|
11
|
+
pipeline = EvalPipeline.from_config("evalforge.yaml")
|
|
12
|
+
results = pipeline.run()
|
|
13
|
+
print(results.summary())
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
|
|
18
|
+
from evalforge.core.config import EvalConfig, UseCaseType
|
|
19
|
+
from evalforge.core.pipeline import EvalPipeline
|
|
20
|
+
from evalforge.core.result import EvalResult, MetricScore
|
|
21
|
+
from evalforge.metrics.registry import MetricRegistry, get_metrics_for_use_case
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"EvalConfig",
|
|
25
|
+
"EvalPipeline",
|
|
26
|
+
"EvalResult",
|
|
27
|
+
"MetricScore",
|
|
28
|
+
"UseCaseType",
|
|
29
|
+
"MetricRegistry",
|
|
30
|
+
"get_metrics_for_use_case",
|
|
31
|
+
]
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""EvalForge CLI - command-line interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from evalforge.core.config import EvalConfig, UseCaseType
|
|
10
|
+
from evalforge.core.pipeline import EvalPipeline
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def cmd_init(args):
    """Initialize a new EvalForge project.

    Creates the project directory with ``metrics``, ``data/golden``,
    ``data/synthetic``, ``judges``, ``reports`` and ``tests`` sub-directories,
    writes ``evalforge.yaml``, ``README.md`` and empty ``__init__.py`` files
    for ``metrics`` and ``judges``, then prints a summary and next steps.

    Args:
        args: Parsed CLI namespace. ``args.name`` is the project directory
            (``"my-evaluation"`` when empty) and ``args.use_case`` is the
            use-case type written into the config (``"rag"`` when empty).

    Raises:
        SystemExit: With status 1 when the target directory already exists.
    """
    # Local import: only needed here, to quote the project name safely.
    import json

    project_name = args.name or "my-evaluation"
    use_case = args.use_case or "rag"
    project_dir = Path(project_name)

    if project_dir.exists():
        print(f"Error: Directory '{project_name}' already exists")
        sys.exit(1)

    # Create structure
    subdirs = (
        project_dir / "metrics",
        project_dir / "data" / "golden",
        project_dir / "data" / "synthetic",
        project_dir / "judges",
        project_dir / "reports",
        project_dir / "tests",
    )
    for subdir in subdirs:
        subdir.mkdir(parents=True, exist_ok=True)

    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # names containing quotes or backslashes (e.g. Windows paths) cannot
    # corrupt the generated config. Plain names render exactly as before.
    quoted_name = json.dumps(project_name, ensure_ascii=False)

    # Create evalforge.yaml
    # NOTE(review): the nested-key indentation below is reconstructed from the
    # blank-line grouping of the template — confirm against the schema that
    # EvalConfig.from_file expects (in particular that ``model`` belongs
    # under ``use_case``).
    config = f"""project:
  name: {quoted_name}
  version: "1.0.0"

use_case:
  type: {use_case}
  description: "Evaluation pipeline for {use_case} application"
  model:
    provider: bedrock
    model_id: anthropic.claude-3-haiku-20240307-v1:0
    region: us-east-1

evaluation:
  metrics: auto # auto-selects based on use_case.type
  thresholds: {{}} # uses defaults for {use_case}

test_data:
  source: synthetic
  count: 100
  categories: [simple, complex, adversarial, edge_cases]

schedule:
  frequency: daily
  time: "02:00"
"""
    # Explicit UTF-8 so the output does not depend on the platform locale.
    (project_dir / "evalforge.yaml").write_text(config, encoding="utf-8")
    (project_dir / "metrics" / "__init__.py").write_text("", encoding="utf-8")
    (project_dir / "judges" / "__init__.py").write_text("", encoding="utf-8")

    # README: only list sub-commands that this CLI actually registers
    # (init, validate, run, metrics).
    readme = f"""# {project_name}

LLM evaluation pipeline managed by [EvalForge](https://github.com/substrai/evalforge).

## Quick Start

```bash
evalforge validate
evalforge run
evalforge metrics --use-case {use_case}
```
"""
    (project_dir / "README.md").write_text(readme, encoding="utf-8")

    print(f"✓ Created EvalForge project: {project_name}/")
    print(f" Use case: {use_case}")
    print(" ├── evalforge.yaml")
    print(" ├── metrics/")
    print(" ├── data/golden/")
    print(" ├── data/synthetic/")
    print(" └── reports/")
    print("\nNext steps:")
    print(f" cd {project_name}")
    print(" evalforge validate")
    print(" evalforge run")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def cmd_validate(args):
    """Load the EvalForge config file and report whether it is valid.

    Args:
        args: Parsed CLI namespace; ``args.config`` is the config path and
            defaults to ``evalforge.yaml`` when empty.

    Raises:
        SystemExit: With status 1 when the file does not exist or when
            loading it raises ``ValueError`` or ``FileNotFoundError``.
    """
    target = Path(args.config) if args.config else Path("evalforge.yaml")

    if target.exists():
        try:
            loaded = EvalConfig.from_file(target)
            print("✓ Configuration valid")
            print(loaded.summary())
        except (ValueError, FileNotFoundError) as err:
            print(f"✗ Configuration error: {err}")
            sys.exit(1)
        return

    # Missing file: report it and signal failure to the shell.
    print(f"Error: Config not found: {target}")
    sys.exit(1)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def cmd_run(args):
    """Run the evaluation pipeline and print its summary.

    The pipeline is built from the config file when that file exists. When
    no ``--config`` was given and the default ``evalforge.yaml`` is absent,
    a pipeline for ``args.use_case`` (``"rag"`` when empty) is used instead.

    Args:
        args: Parsed CLI namespace with ``config``, ``use_case`` and
            ``metrics`` (a comma-separated string of metric names, or empty).

    Raises:
        SystemExit: With status 1 when an explicitly requested config file
            does not exist, or when not all results are passing.
    """
    config_path = Path(args.config or "evalforge.yaml")

    if config_path.exists():
        pipeline = EvalPipeline.from_config(config_path)
    elif args.config:
        # The user named a specific file; silently evaluating a default
        # use case instead would report results for the wrong pipeline.
        print(f"Error: Config not found: {config_path}")
        sys.exit(1)
    else:
        pipeline = EvalPipeline.for_use_case(args.use_case or "rag")

    # Tolerate "a, b," style input: trim whitespace and drop empty entries.
    # An effectively empty list means "no filter", same as omitting --metrics.
    metrics_filter = None
    if args.metrics:
        names = [name.strip() for name in args.metrics.split(",")]
        metrics_filter = [name for name in names if name] or None

    results = pipeline.run(metrics=metrics_filter)

    print(results.summary())

    # Non-zero exit status lets CI jobs fail on a failing evaluation.
    if not results.all_passing:
        sys.exit(1)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def cmd_metrics(args):
    """Print the metrics known to the registry.

    With ``args.use_case`` set, prints the name and description of each
    metric the registry returns for that use case. Otherwise prints every
    registered metric with its category and description.

    Args:
        args: Parsed CLI namespace; only ``args.use_case`` is read.
    """
    from evalforge.metrics.registry import MetricRegistry

    catalog = MetricRegistry()
    selected_use_case = args.use_case

    if not selected_use_case:
        print("All available metrics:")
        for metric_name in catalog.list_metrics():
            entry = catalog.get(metric_name)
            print(f" • {metric_name} [{entry.category}] — {entry.description}")
        return

    matching = catalog.get_metrics_for(selected_use_case)
    print(f"Metrics for '{selected_use_case}':")
    for entry in matching:
        print(f" • {entry.name} — {entry.description}")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def main(argv=None):
    """Main CLI entry point.

    Builds the ``evalforge`` argument parser with the ``init``, ``validate``,
    ``run`` and ``metrics`` sub-commands and dispatches to the matching
    ``cmd_*`` handler. Prints the help text when no sub-command is given.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default, used by the console-script entry point) makes argparse
            read ``sys.argv[1:]``. Passing a list allows programmatic calls.

    Raises:
        SystemExit: Raised by argparse (status 2) on invalid arguments, or
            by a handler that exits.
    """
    parser = argparse.ArgumentParser(
        prog="evalforge",
        description="EvalForge - Automated LLM Evaluation Pipeline Generator",
    )
    subparsers = parser.add_subparsers(dest="command")

    # init
    init_p = subparsers.add_parser("init", help="Initialize a new project")
    init_p.add_argument("name", nargs="?", default=None)
    init_p.add_argument(
        "--use-case",
        default="rag",
        choices=["rag", "summarization", "classification", "generation", "chat", "code"],
    )

    # validate
    val_p = subparsers.add_parser("validate", help="Validate configuration")
    val_p.add_argument("--config", default=None)

    # run
    run_p = subparsers.add_parser("run", help="Run evaluation pipeline")
    run_p.add_argument("--config", default=None)
    run_p.add_argument("--metrics", default=None, help="Comma-separated metrics to run")
    run_p.add_argument("--use-case", default=None)

    # metrics
    met_p = subparsers.add_parser("metrics", help="List available metrics")
    met_p.add_argument("--use-case", default=None)

    args = parser.parse_args(argv)
    commands = {"init": cmd_init, "validate": cmd_validate, "run": cmd_run, "metrics": cmd_metrics}

    # args.command is None when no sub-command was supplied.
    if args.command in commands:
        commands[args.command](args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
|