substrai-evalforge 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. substrai_evalforge-0.1.0/LICENSE +21 -0
  2. substrai_evalforge-0.1.0/PKG-INFO +129 -0
  3. substrai_evalforge-0.1.0/README.md +105 -0
  4. substrai_evalforge-0.1.0/pyproject.toml +40 -0
  5. substrai_evalforge-0.1.0/setup.cfg +4 -0
  6. substrai_evalforge-0.1.0/src/evalforge/__init__.py +31 -0
  7. substrai_evalforge-0.1.0/src/evalforge/cli/__init__.py +2 -0
  8. substrai_evalforge-0.1.0/src/evalforge/cli/main.py +182 -0
  9. substrai_evalforge-0.1.0/src/evalforge/core/__init__.py +4 -0
  10. substrai_evalforge-0.1.0/src/evalforge/core/config.py +253 -0
  11. substrai_evalforge-0.1.0/src/evalforge/core/pipeline.py +186 -0
  12. substrai_evalforge-0.1.0/src/evalforge/core/result.py +125 -0
  13. substrai_evalforge-0.1.0/src/evalforge/drift/__init__.py +0 -0
  14. substrai_evalforge-0.1.0/src/evalforge/generators/__init__.py +0 -0
  15. substrai_evalforge-0.1.0/src/evalforge/metrics/__init__.py +7 -0
  16. substrai_evalforge-0.1.0/src/evalforge/metrics/base.py +62 -0
  17. substrai_evalforge-0.1.0/src/evalforge/metrics/classification.py +59 -0
  18. substrai_evalforge-0.1.0/src/evalforge/metrics/rag.py +176 -0
  19. substrai_evalforge-0.1.0/src/evalforge/metrics/registry.py +85 -0
  20. substrai_evalforge-0.1.0/src/evalforge/metrics/safety.py +108 -0
  21. substrai_evalforge-0.1.0/src/evalforge/metrics/text.py +217 -0
  22. substrai_evalforge-0.1.0/src/evalforge/pipeline/__init__.py +0 -0
  23. substrai_evalforge-0.1.0/src/evalforge/plugins/__init__.py +0 -0
  24. substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/PKG-INFO +129 -0
  25. substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/SOURCES.txt +30 -0
  26. substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/dependency_links.txt +1 -0
  27. substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/entry_points.txt +2 -0
  28. substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/requires.txt +8 -0
  29. substrai_evalforge-0.1.0/src/substrai_evalforge.egg-info/top_level.txt +1 -0
  30. substrai_evalforge-0.1.0/tests/test_config.py +93 -0
  31. substrai_evalforge-0.1.0/tests/test_metrics.py +181 -0
  32. substrai_evalforge-0.1.0/tests/test_pipeline.py +98 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Gaurav Kumar Sinha (Substrai AI)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,129 @@
1
+ Metadata-Version: 2.4
2
+ Name: substrai-evalforge
3
+ Version: 0.1.0
4
+ Summary: Automated LLM evaluation pipeline generator
5
+ Author-email: Gaurav Kumar Sinha <gaurav@substrai.dev>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/substrai/evalforge
8
+ Project-URL: Repository, https://github.com/substrai/evalforge
9
+ Keywords: llm,evaluation,testing,mlops,genai,rag,metrics,pipeline,serverless,aws-lambda
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: pyyaml>=6.0
18
+ Provides-Extra: aws
19
+ Requires-Dist: boto3>=1.28.0; extra == "aws"
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest>=7.0; extra == "dev"
22
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
23
+ Dynamic: license-file
24
+
25
+ # EvalForge
26
+
27
+ **Automated LLM evaluation pipeline generator.**
28
+
29
+ > Built by [SubstrAI](https://github.com/substrai) — Open-source GenAI frameworks for serverless infrastructure.
30
+
31
+ [![PyPI version](https://badge.fury.io/py/substrai-evalforge.svg)](https://pypi.org/project/substrai-evalforge/)
32
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
34
+
35
+ ## The Problem
36
+
37
+ Every team deploying LLMs builds evaluation pipelines from scratch. RAGAS and DeepEval are libraries — they don't generate infrastructure, schedule runs, detect drift, or route to human reviewers.
38
+
39
+ ## The Solution
40
+
41
+ Describe your use case → EvalForge generates the complete evaluation pipeline:
42
+
43
+ ```yaml
44
+ # evalforge.yaml
45
+ use_case:
46
+ type: rag
47
+ description: "Customer support chatbot"
48
+ model:
49
+ provider: bedrock
50
+ model_id: anthropic.claude-3-haiku-20240307-v1:0
51
+
52
+ evaluation:
53
+ metrics: auto # auto-selects: faithfulness, relevancy, precision, recall, toxicity
54
+ ```
55
+
56
+ ```bash
57
+ evalforge run
58
+ # Faithfulness: 0.91 ✓ (threshold: 0.85)
59
+ # Answer Relevancy: 0.87 ✓ (threshold: 0.80)
60
+ # Context Precision: 0.78 ✓ (threshold: 0.75)
61
+ # Toxicity: 0.02 ✓ (threshold: 0.05)
62
+ # Overall: PASS (4/4 metrics passing)
63
+ ```
64
+
65
+ ## Features
66
+
67
+ - **Use-case-driven metric selection** — describe your app, get optimal metrics
68
+ - **6 use case types** — RAG, summarization, classification, generation, chat, code
69
+ - **16+ built-in metrics** — faithfulness, ROUGE, BLEU, toxicity, injection resistance, F1
70
+ - **Synthetic test data generation** — adversarial, edge cases, domain-specific
71
+ - **Drift detection** — alerts when quality degrades over time
72
+ - **Human-in-the-loop** — route uncertain evaluations to reviewers
73
+ - **Scheduled pipelines** — daily/weekly automated evaluation runs
74
+ - **Benchmark registry** — compare against published benchmarks
75
+ - **One-command deploy** — Step Functions + Lambda infrastructure
76
+
77
+ ## Installation
78
+
79
+ ```bash
80
+ pip install substrai-evalforge
81
+ ```
82
+
83
+ ## Quick Start
84
+
85
+ ```bash
86
+ # Scaffold project
87
+ evalforge init my-eval --use-case rag
88
+
89
+ # Run evaluation
90
+ cd my-eval
91
+ evalforge run
92
+
93
+ # List available metrics
94
+ evalforge metrics --use-case rag
95
+ ```
96
+
97
+ ## Python SDK
98
+
99
+ ```python
100
+ from evalforge import EvalPipeline
101
+
102
+ # Quick start for any use case
103
+ pipeline = EvalPipeline.for_use_case("rag")
104
+ results = pipeline.run()
105
+ print(results.summary())
106
+ print(f"All passing: {results.all_passing}")
107
+ ```
108
+
109
+ ## Supported Use Cases & Auto-Selected Metrics
110
+
111
+ | Use Case | Auto-Selected Metrics |
112
+ |---|---|
113
+ | **rag** | faithfulness, answer_relevancy, context_precision, context_recall, toxicity |
114
+ | **summarization** | rouge_l, bleu, coherence, conciseness, fluency |
115
+ | **classification** | accuracy, precision, recall, f1_score |
116
+ | **generation** | fluency, coherence, toxicity, bias_detection |
117
+ | **chat** | coherence, toxicity, injection_resistance, fluency |
118
+ | **code** | accuracy, coherence |
119
+
120
+ ## License
121
+
122
+ MIT — see [LICENSE](LICENSE)
123
+
124
+ ## Author
125
+
126
+ **Gaurav Kumar Sinha** — Founder, [SubstrAI](https://github.com/substrai)
127
+
128
+ - Email: gaurav@substrai.dev
129
+ - GitHub: [@substrai](https://github.com/substrai)
@@ -0,0 +1,105 @@
1
+ # EvalForge
2
+
3
+ **Automated LLM evaluation pipeline generator.**
4
+
5
+ > Built by [SubstrAI](https://github.com/substrai) — Open-source GenAI frameworks for serverless infrastructure.
6
+
7
+ [![PyPI version](https://badge.fury.io/py/substrai-evalforge.svg)](https://pypi.org/project/substrai-evalforge/)
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
10
+
11
+ ## The Problem
12
+
13
+ Every team deploying LLMs builds evaluation pipelines from scratch. RAGAS and DeepEval are libraries — they don't generate infrastructure, schedule runs, detect drift, or route to human reviewers.
14
+
15
+ ## The Solution
16
+
17
+ Describe your use case → EvalForge generates the complete evaluation pipeline:
18
+
19
+ ```yaml
20
+ # evalforge.yaml
21
+ use_case:
22
+ type: rag
23
+ description: "Customer support chatbot"
24
+ model:
25
+ provider: bedrock
26
+ model_id: anthropic.claude-3-haiku-20240307-v1:0
27
+
28
+ evaluation:
29
+ metrics: auto # auto-selects: faithfulness, relevancy, precision, recall, toxicity
30
+ ```
31
+
32
+ ```bash
33
+ evalforge run
34
+ # Faithfulness: 0.91 ✓ (threshold: 0.85)
35
+ # Answer Relevancy: 0.87 ✓ (threshold: 0.80)
36
+ # Context Precision: 0.78 ✓ (threshold: 0.75)
37
+ # Toxicity: 0.02 ✓ (threshold: 0.05)
38
+ # Overall: PASS (4/4 metrics passing)
39
+ ```
40
+
41
+ ## Features
42
+
43
+ - **Use-case-driven metric selection** — describe your app, get optimal metrics
44
+ - **6 use case types** — RAG, summarization, classification, generation, chat, code
45
+ - **16+ built-in metrics** — faithfulness, ROUGE, BLEU, toxicity, injection resistance, F1
46
+ - **Synthetic test data generation** — adversarial, edge cases, domain-specific
47
+ - **Drift detection** — alerts when quality degrades over time
48
+ - **Human-in-the-loop** — route uncertain evaluations to reviewers
49
+ - **Scheduled pipelines** — daily/weekly automated evaluation runs
50
+ - **Benchmark registry** — compare against published benchmarks
51
+ - **One-command deploy** — Step Functions + Lambda infrastructure
52
+
53
+ ## Installation
54
+
55
+ ```bash
56
+ pip install substrai-evalforge
57
+ ```
58
+
59
+ ## Quick Start
60
+
61
+ ```bash
62
+ # Scaffold project
63
+ evalforge init my-eval --use-case rag
64
+
65
+ # Run evaluation
66
+ cd my-eval
67
+ evalforge run
68
+
69
+ # List available metrics
70
+ evalforge metrics --use-case rag
71
+ ```
72
+
73
+ ## Python SDK
74
+
75
+ ```python
76
+ from evalforge import EvalPipeline
77
+
78
+ # Quick start for any use case
79
+ pipeline = EvalPipeline.for_use_case("rag")
80
+ results = pipeline.run()
81
+ print(results.summary())
82
+ print(f"All passing: {results.all_passing}")
83
+ ```
84
+
85
+ ## Supported Use Cases & Auto-Selected Metrics
86
+
87
+ | Use Case | Auto-Selected Metrics |
88
+ |---|---|
89
+ | **rag** | faithfulness, answer_relevancy, context_precision, context_recall, toxicity |
90
+ | **summarization** | rouge_l, bleu, coherence, conciseness, fluency |
91
+ | **classification** | accuracy, precision, recall, f1_score |
92
+ | **generation** | fluency, coherence, toxicity, bias_detection |
93
+ | **chat** | coherence, toxicity, injection_resistance, fluency |
94
+ | **code** | accuracy, coherence |
95
+
96
+ ## License
97
+
98
+ MIT — see [LICENSE](LICENSE)
99
+
100
+ ## Author
101
+
102
+ **Gaurav Kumar Sinha** — Founder, [SubstrAI](https://github.com/substrai)
103
+
104
+ - Email: gaurav@substrai.dev
105
+ - GitHub: [@substrai](https://github.com/substrai)
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "substrai-evalforge"
7
+ version = "0.1.0"
8
+ description = "Automated LLM evaluation pipeline generator"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ {name = "Gaurav Kumar Sinha", email = "gaurav@substrai.dev"},
14
+ ]
15
+ keywords = ["llm", "evaluation", "testing", "mlops", "genai", "rag", "metrics", "pipeline", "serverless", "aws-lambda"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
21
+ ]
22
+ dependencies = ["pyyaml>=6.0"]
23
+
24
+ [project.optional-dependencies]
25
+ aws = ["boto3>=1.28.0"]
26
+ dev = ["pytest>=7.0", "pytest-cov>=4.0"]
27
+
28
+ [project.scripts]
29
+ evalforge = "evalforge.cli.main:main"
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/substrai/evalforge"
33
+ Repository = "https://github.com/substrai/evalforge"
34
+
35
+ [tool.setuptools.packages.find]
36
+ where = ["src"]
37
+
38
+ [tool.pytest.ini_options]
39
+ testpaths = ["tests"]
40
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,31 @@
1
+ """
2
+ EvalForge - Automated LLM Evaluation Pipeline Generator
3
+
4
+ Describe your GenAI use case in a config file, and EvalForge generates
5
+ the complete evaluation infrastructure: metrics selection, test data,
6
+ scheduled pipelines, drift detection, and human-in-the-loop review.
7
+
8
+ Usage:
9
+ from evalforge import EvalPipeline, EvalConfig, EvalResult
10
+
11
+ pipeline = EvalPipeline.from_config("evalforge.yaml")
12
+ results = pipeline.run()
13
+ print(results.summary())
14
+ """
15
+
16
+ __version__ = "0.1.0"
17
+
18
+ from evalforge.core.config import EvalConfig, UseCaseType
19
+ from evalforge.core.pipeline import EvalPipeline
20
+ from evalforge.core.result import EvalResult, MetricScore
21
+ from evalforge.metrics.registry import MetricRegistry, get_metrics_for_use_case
22
+
23
+ __all__ = [
24
+ "EvalConfig",
25
+ "EvalPipeline",
26
+ "EvalResult",
27
+ "MetricScore",
28
+ "UseCaseType",
29
+ "MetricRegistry",
30
+ "get_metrics_for_use_case",
31
+ ]
@@ -0,0 +1,2 @@
1
+ """EvalForge CLI."""
2
+ from evalforge.cli.main import main
@@ -0,0 +1,182 @@
1
+ """EvalForge CLI - command-line interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from evalforge.core.config import EvalConfig, UseCaseType
10
+ from evalforge.core.pipeline import EvalPipeline
11
+
12
+
13
def cmd_init(args):
    """Scaffold a new EvalForge project directory.

    Creates ``<name>/`` with its standard sub-directories, a starter
    ``evalforge.yaml``, empty ``__init__.py`` files for the ``metrics`` and
    ``judges`` packages and a README, then prints a summary with next steps.

    Args:
        args: Parsed CLI namespace. ``args.name`` falls back to
            ``"my-evaluation"`` and ``args.use_case`` to ``"rag"`` when empty.

    Raises:
        SystemExit: With status 1 when the target directory already exists.
    """
    import json  # local import: only needed to quote the project name

    project_name = args.name or "my-evaluation"
    use_case = args.use_case or "rag"
    project_dir = Path(project_name)

    if project_dir.exists():
        print(f"Error: Directory '{project_name}' already exists")
        sys.exit(1)

    # Create structure
    dirs = [
        project_dir / "metrics",
        project_dir / "data" / "golden",
        project_dir / "data" / "synthetic",
        project_dir / "judges",
        project_dir / "reports",
        project_dir / "tests",
    ]
    for d in dirs:
        d.mkdir(parents=True, exist_ok=True)

    # A JSON string literal is also a valid YAML double-quoted scalar, so this
    # keeps the file parseable when the name contains quotes or backslashes.
    # For ordinary names the output is the same `"name"` as plain quoting.
    quoted_name = json.dumps(project_name, ensure_ascii=False)

    # Create evalforge.yaml
    # NOTE(review): the nesting below (in particular `model` under `use_case`)
    # was reconstructed from a listing with collapsed whitespace - confirm it
    # against the schema EvalConfig expects.
    config = f"""project:
  name: {quoted_name}
  version: "1.0.0"

use_case:
  type: {use_case}
  description: "Evaluation pipeline for {use_case} application"
  model:
    provider: bedrock
    model_id: anthropic.claude-3-haiku-20240307-v1:0
    region: us-east-1

evaluation:
  metrics: auto # auto-selects based on use_case.type
  thresholds: {{}} # uses defaults for {use_case}

test_data:
  source: synthetic
  count: 100
  categories: [simple, complex, adversarial, edge_cases]

schedule:
  frequency: daily
  time: "02:00"
"""
    # Explicit UTF-8 so the generated files do not depend on the locale's
    # default encoding (matters for non-ASCII project names).
    (project_dir / "evalforge.yaml").write_text(config, encoding="utf-8")
    (project_dir / "metrics" / "__init__.py").write_text("", encoding="utf-8")
    (project_dir / "judges" / "__init__.py").write_text("", encoding="utf-8")

    # README: list only subcommands the CLI actually registers.
    readme = f"""# {project_name}

LLM evaluation pipeline managed by [EvalForge](https://github.com/substrai/evalforge).

## Quick Start

```bash
evalforge validate
evalforge run
```
"""
    (project_dir / "README.md").write_text(readme, encoding="utf-8")

    print(f"✓ Created EvalForge project: {project_name}/")
    print(f" Use case: {use_case}")
    print(" ├── evalforge.yaml")
    print(" ├── metrics/")
    print(" ├── data/golden/")
    print(" ├── data/synthetic/")
    print(" └── reports/")
    print("\nNext steps:")
    print(f" cd {project_name}")
    print(" evalforge validate")
    print(" evalforge run")
92
+
93
def cmd_validate(args):
    """Check that an EvalForge config file exists and loads.

    Reads the path from ``args.config`` (``evalforge.yaml`` when unset), loads
    it through ``EvalConfig.from_file`` and prints the config summary.

    Raises:
        SystemExit: With status 1 when the file is missing, or when loading or
            summarising raises ``ValueError`` / ``FileNotFoundError``.
    """
    target = Path(args.config if args.config else "evalforge.yaml")

    # Guard clause: report a missing file before attempting to parse it.
    if not target.exists():
        print(f"Error: Config not found: {target}")
        sys.exit(1)

    try:
        loaded = EvalConfig.from_file(target)
        print("✓ Configuration valid")
        print(loaded.summary())
    except (ValueError, FileNotFoundError) as err:
        print(f"✗ Configuration error: {err}")
        sys.exit(1)
107
+
108
+
109
def cmd_run(args):
    """Run the evaluation pipeline and print its summary.

    The pipeline is built from the config file when it exists. When no
    ``--config`` was given and the default ``evalforge.yaml`` is absent, a
    pipeline for ``args.use_case`` (default ``"rag"``) is used instead.

    Args:
        args: Parsed CLI namespace; reads ``config``, ``use_case`` and
            ``metrics`` (a comma-separated list of metric names, or None).

    Raises:
        SystemExit: With status 1 when an explicitly requested config file
            does not exist, or when ``results.all_passing`` is false.
    """
    config_path = Path(args.config or "evalforge.yaml")

    if config_path.exists():
        pipeline = EvalPipeline.from_config(config_path)
    elif args.config:
        # An explicit --config that is missing is a user error; falling back
        # silently would evaluate a different pipeline than the one requested.
        print(f"Error: Config not found: {config_path}")
        sys.exit(1)
    else:
        use_case = args.use_case or "rag"
        pipeline = EvalPipeline.for_use_case(use_case)

    metrics_filter = None
    if args.metrics:
        # Tolerate "a, b" and trailing commas: strip whitespace, drop blanks.
        metrics_filter = [
            name.strip() for name in args.metrics.split(",") if name.strip()
        ]

    results = pipeline.run(metrics=metrics_filter)

    print(results.summary())

    if not results.all_passing:
        sys.exit(1)
126
+
127
+
128
def cmd_metrics(args):
    """Print the metrics known to the registry.

    With ``args.use_case`` set, prints the name and description of each metric
    returned by ``MetricRegistry.get_metrics_for``. Otherwise prints every
    name from ``list_metrics()`` with its category and description.
    """
    from evalforge.metrics.registry import MetricRegistry

    catalog = MetricRegistry()

    if not args.use_case:
        print("All available metrics:")
        for metric_name in catalog.list_metrics():
            entry = catalog.get(metric_name)
            print(f" • {metric_name} [{entry.category}] — {entry.description}")
        return

    # Look the metrics up before printing the header, so a failing lookup
    # produces no partial output.
    selected = catalog.get_metrics_for(args.use_case)
    print(f"Metrics for '{args.use_case}':")
    for entry in selected:
        print(f" • {entry.name} — {entry.description}")
143
+
144
+
145
def main(argv=None):
    """Main CLI entry point.

    Builds the ``evalforge`` argument parser, parses the command line and
    dispatches to the matching ``cmd_*`` handler. Prints the help text when no
    subcommand is given.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Defaults to None, which keeps the console-script
            behaviour and lets callers and tests drive the CLI directly.
    """
    parser = argparse.ArgumentParser(
        prog="evalforge",
        description="EvalForge - Automated LLM Evaluation Pipeline Generator",
    )
    subparsers = parser.add_subparsers(dest="command")

    # init
    init_p = subparsers.add_parser("init", help="Initialize a new project")
    init_p.add_argument("name", nargs="?", default=None)
    init_p.add_argument(
        "--use-case",
        default="rag",
        choices=["rag", "summarization", "classification", "generation", "chat", "code"],
    )

    # validate
    val_p = subparsers.add_parser("validate", help="Validate configuration")
    val_p.add_argument("--config", default=None)

    # run
    run_p = subparsers.add_parser("run", help="Run evaluation pipeline")
    run_p.add_argument("--config", default=None)
    run_p.add_argument("--metrics", default=None, help="Comma-separated metrics to run")
    run_p.add_argument("--use-case", default=None)

    # metrics
    met_p = subparsers.add_parser("metrics", help="List available metrics")
    met_p.add_argument("--use-case", default=None)

    # parse_args(None) reads sys.argv[1:], so the default call is unchanged.
    args = parser.parse_args(argv)
    commands = {
        "init": cmd_init,
        "validate": cmd_validate,
        "run": cmd_run,
        "metrics": cmd_metrics,
    }

    handler = commands.get(args.command)
    if handler is None:
        # No subcommand supplied (args.command is None): show usage.
        parser.print_help()
        return
    handler(args)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
@@ -0,0 +1,4 @@
1
+ """Core modules for EvalForge."""
2
+ from evalforge.core.config import EvalConfig, UseCaseType
3
+ from evalforge.core.pipeline import EvalPipeline
4
+ from evalforge.core.result import EvalResult, MetricScore