evalforge 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalforge-0.1.0/.gitignore +42 -0
- evalforge-0.1.0/PKG-INFO +20 -0
- evalforge-0.1.0/README.md +9 -0
- evalforge-0.1.0/evalforge/.gitkeep +0 -0
- evalforge-0.1.0/evalforge/__init__.py +3 -0
- evalforge-0.1.0/evalforge/client.py +122 -0
- evalforge-0.1.0/pyproject.toml +21 -0
- evalforge-0.1.0/tests/.gitkeep +0 -0
- evalforge-0.1.0/tests/test_client.py +49 -0
- evalforge-0.1.0/uv.lock +78 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Generated by Cargo
|
|
2
|
+
# will have compiled files and executables
|
|
3
|
+
debug
|
|
4
|
+
target
|
|
5
|
+
|
|
6
|
+
# These are backup files generated by rustfmt
|
|
7
|
+
**/*.rs.bk
|
|
8
|
+
|
|
9
|
+
# MSVC Windows builds of rustc generate these, which store debugging information
|
|
10
|
+
*.pdb
|
|
11
|
+
|
|
12
|
+
# Generated by cargo mutants
|
|
13
|
+
# Contains mutation testing data
|
|
14
|
+
**/mutants.out*/
|
|
15
|
+
|
|
16
|
+
# RustRover
|
|
17
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
18
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
19
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
20
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
21
|
+
#.idea/
|
|
22
|
+
|
|
23
|
+
# Python
|
|
24
|
+
__pycache__/
|
|
25
|
+
*.py[cod]
|
|
26
|
+
*.egg-info/
|
|
27
|
+
dist/
|
|
28
|
+
build/
|
|
29
|
+
.venv/
|
|
30
|
+
.env
|
|
31
|
+
*.egg
|
|
32
|
+
|
|
33
|
+
# macOS
|
|
34
|
+
.DS_Store
|
|
35
|
+
|
|
36
|
+
# IDE
|
|
37
|
+
.vscode/
|
|
38
|
+
.idea/
|
|
39
|
+
*.swp
|
|
40
|
+
|
|
41
|
+
# EvalForge specific
|
|
42
|
+
*.trace.json.bak
|
evalforge-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evalforge
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Framework-agnostic LLM agent evaluation harness
|
|
5
|
+
Project-URL: Homepage, https://github.com/YOUR_USERNAME/evalforge
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# EvalForge Python SDK
|
|
13
|
+
|
|
14
|
+
pip install evalforge
|
|
15
|
+
|
|
16
|
+
## Quick Start
|
|
17
|
+
import evalforge
|
|
18
|
+
result = evalforge.run("trace.json", metrics=["faithfulness"])
|
|
19
|
+
print(result.passed)
|
|
20
|
+
print(result.metrics[0].score)
|
|
File without changes
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import subprocess
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class MetricResult:
    """Outcome of a single metric evaluation, parsed from the CLI output."""

    # Metric name as reported by the binary (e.g. "faithfulness").
    metric: str
    # Numeric score parsed from the CLI output line — presumably in [0, 1],
    # TODO confirm range against the binary's scoring contract.
    score: float
    # True when the CLI printed PASS for this metric.
    passed: bool
    # Free-text explanation taken from the "Reason:" line after the score.
    reason: str
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class EvalResult:
    """Aggregate result of one evaluation run of a trace file."""

    # Trace identifier parsed from the "Trace ID:" line of the CLI output.
    trace_id: str
    # Agent framework name parsed from the "Framework:" line.
    framework: str
    # Per-metric results in the order they appeared in the CLI output.
    metrics: list[MetricResult] = field(default_factory=list)
    # True only when the process exited 0 and every parsed metric passed.
    passed: bool = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _find_binary() -> str:
    """Locate the ``evalforge`` CLI binary.

    Search order:
      1. The ``EVALFORGE_BIN`` environment variable, if it points at an
         existing file (explicit override).
      2. ``target/debug/evalforge`` in this file's directory or any of up to
         5 parent directories (covers a cargo workspace checkout).
      3. The system ``PATH``.

    Returns:
        The binary path as a string.

    Raises:
        RuntimeError: if no binary is found by any of the three strategies.
    """
    import shutil  # kept function-local; only needed for the PATH fallback

    # 1. Explicit override via environment variable.
    env_bin = os.environ.get("EVALFORGE_BIN")
    if env_bin and Path(env_bin).is_file():
        return env_bin

    # 2. Walk up from client.py looking for target/debug/evalforge (up to 5 levels).
    current = Path(__file__).resolve().parent
    for _ in range(5):
        candidate = current / "target" / "debug" / "evalforge"
        if candidate.exists():
            return str(candidate)
        current = current.parent

    # 3. System PATH.
    path_bin = shutil.which("evalforge")
    if path_bin:
        return path_bin

    raise RuntimeError(
        "EvalForge binary not found. "
        f"Searched up to 5 parent dirs from {Path(__file__).resolve()}. "
        "Set EVALFORGE_BIN env var to the binary path, or run 'cargo build' first."
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def run(
    trace: str,
    metrics: list[str],
    threshold: float = 0.7,
    mock: bool = False,
    api_key: str | None = None,
) -> EvalResult:
    """Evaluate a trace file by invoking the ``evalforge`` CLI binary.

    Args:
        trace: Path to the trace JSON file to evaluate.
        metrics: Metric names to score; joined with commas for the CLI.
        threshold: Pass/fail cutoff forwarded to the CLI as ``--threshold``.
        mock: When True, passes ``--mock`` so no live API call is made.
        api_key: Optional API key; exported as ``ANTHROPIC_API_KEY`` in the
            subprocess environment when given.

    Returns:
        An :class:`EvalResult` holding the parsed trace summary and the
        per-metric scores.

    Raises:
        RuntimeError: if the binary cannot be found, or exits with a code
            other than 0 or 1 (see comment below on exit-code semantics).
    """
    binary = _find_binary()
    cmd = _build_command(binary, trace, metrics, threshold, mock)

    env = os.environ.copy()
    if api_key is not None:
        env["ANTHROPIC_API_KEY"] = api_key

    proc = subprocess.run(cmd, capture_output=True, text=True, env=env)

    # Exit code 0 and 1 are both "evaluation completed" (1 presumably means
    # some metric failed — see overall_passed below); anything else is an
    # operational error worth surfacing with full output.
    if proc.returncode not in (0, 1):
        raise RuntimeError(
            f"evalforge exited with code {proc.returncode}.\n"
            f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
        )

    output = proc.stdout

    metric_results = _parse_metric_results(output)

    overall_passed = proc.returncode == 0 and all(r.passed for r in metric_results)

    return EvalResult(
        trace_id=_extract_field(output, "Trace ID"),
        framework=_extract_field(output, "Framework"),
        metrics=metric_results,
        passed=overall_passed,
    )


def _build_command(
    binary: str,
    trace: str,
    metrics: list[str],
    threshold: float,
    mock: bool,
) -> list[str]:
    """Assemble the argv list for one ``evalforge run`` invocation."""
    cmd = [
        binary,
        "run",
        "--trace", trace,
        "--metrics", ",".join(metrics),
        "--threshold", str(threshold),
    ]
    if mock:
        cmd.append("--mock")
    return cmd


def _parse_metric_results(output: str) -> list[MetricResult]:
    """Parse scoring result lines and the Reason: line that follows each.

    Expected output shape, e.g.::

        faithfulness        0.91    PASS
          Reason: Mock score — skipping live API call
    """
    pattern = re.compile(
        r"^(\w+)\s+([\d.]+)\s+(PASS|FAIL)\s*\nReason:\s*(.+)$",
        re.MULTILINE,
    )
    return [
        MetricResult(
            metric=m.group(1),
            score=float(m.group(2)),
            passed=m.group(3) == "PASS",
            reason=m.group(4).strip(),
        )
        for m in pattern.finditer(output)
    ]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _extract_field(output: str, label: str) -> str:
    """Return the value after ``<label>:`` on its own line, or "" if absent."""
    match = re.search(rf"^{re.escape(label)}:\s+(.+)$", output, re.MULTILINE)
    if match is None:
        return ""
    return match.group(1).strip()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "evalforge"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Framework-agnostic LLM agent evaluation harness"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
dependencies = []
|
|
13
|
+
|
|
14
|
+
[project.urls]
|
|
15
|
+
Homepage = "https://github.com/YOUR_USERNAME/evalforge"
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
dev = ["pytest>=7.0"]
|
|
19
|
+
|
|
20
|
+
[tool.hatch.build.targets.wheel]
|
|
21
|
+
packages = ["evalforge"]
|
|
File without changes
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import evalforge
|
|
4
|
+
from evalforge.client import EvalResult, MetricResult
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Fixture paths are anchored to this file's location so the tests pass no
# matter which directory pytest is launched from.
_WORKSPACE_ROOT = Path(__file__).parent.parent.parent.parent
_FIXTURES = _WORKSPACE_ROOT / "tests" / "fixtures"
SAMPLE_TRACE = str(_FIXTURES / "sample_trace.json")
SIMPLE_TRACE = str(_FIXTURES / "simple_trace.json")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_mock_run():
    """A mock evaluation of the sample trace passes with the canned score."""
    outcome = evalforge.run(SAMPLE_TRACE, metrics=["faithfulness"], mock=True)
    assert outcome.passed is True
    assert len(outcome.metrics) == 1
    assert outcome.metrics[0].score == 0.91
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_simple_trace_mock():
    """The simple trace also passes under --mock with the canned score."""
    outcome = evalforge.run(SIMPLE_TRACE, metrics=["faithfulness"], mock=True)
    assert outcome.passed is True
    assert len(outcome.metrics) == 1
    assert outcome.metrics[0].score == 0.91
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_threshold_boundary():
    """A metric scoring exactly at the 0.7 threshold counts as passing."""
    boundary = MetricResult(
        metric="faithfulness",
        score=0.7,
        passed=True,
        reason="at threshold",
    )
    assert boundary.passed is True
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_metric_result_fields():
    """Dataclass fields round-trip through construction unchanged."""
    metric_result = MetricResult(
        metric="faithfulness",
        score=0.85,
        passed=True,
        reason="looks good",
    )
    assert metric_result.metric == "faithfulness"
    assert metric_result.score == 0.85
    assert metric_result.passed is True
    assert metric_result.reason == "looks good"

    eval_result = EvalResult(
        trace_id="trace-001",
        framework="langchain",
        metrics=[metric_result],
        passed=True,
    )
    assert eval_result.trace_id == "trace-001"
    assert eval_result.framework == "langchain"
    assert eval_result.passed is True
    assert eval_result.metrics[0].metric == "faithfulness"
|
evalforge-0.1.0/uv.lock
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
version = 1
|
|
2
|
+
revision = 3
|
|
3
|
+
requires-python = ">=3.11"
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "colorama"
|
|
7
|
+
version = "0.4.6"
|
|
8
|
+
source = { registry = "https://pypi.org/simple" }
|
|
9
|
+
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
|
|
10
|
+
wheels = [
|
|
11
|
+
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[[package]]
|
|
15
|
+
name = "evalforge"
|
|
16
|
+
version = "0.1.0"
|
|
17
|
+
source = { editable = "." }
|
|
18
|
+
|
|
19
|
+
[package.optional-dependencies]
|
|
20
|
+
dev = [
|
|
21
|
+
{ name = "pytest" },
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[package.metadata]
|
|
25
|
+
requires-dist = [{ name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" }]
|
|
26
|
+
provides-extras = ["dev"]
|
|
27
|
+
|
|
28
|
+
[[package]]
|
|
29
|
+
name = "iniconfig"
|
|
30
|
+
version = "2.3.0"
|
|
31
|
+
source = { registry = "https://pypi.org/simple" }
|
|
32
|
+
sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
|
|
33
|
+
wheels = [
|
|
34
|
+
{ url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[[package]]
|
|
38
|
+
name = "packaging"
|
|
39
|
+
version = "26.0"
|
|
40
|
+
source = { registry = "https://pypi.org/simple" }
|
|
41
|
+
sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" }
|
|
42
|
+
wheels = [
|
|
43
|
+
{ url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[[package]]
|
|
47
|
+
name = "pluggy"
|
|
48
|
+
version = "1.6.0"
|
|
49
|
+
source = { registry = "https://pypi.org/simple" }
|
|
50
|
+
sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
|
|
51
|
+
wheels = [
|
|
52
|
+
{ url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[[package]]
|
|
56
|
+
name = "pygments"
|
|
57
|
+
version = "2.20.0"
|
|
58
|
+
source = { registry = "https://pypi.org/simple" }
|
|
59
|
+
sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
|
|
60
|
+
wheels = [
|
|
61
|
+
{ url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
[[package]]
|
|
65
|
+
name = "pytest"
|
|
66
|
+
version = "9.0.2"
|
|
67
|
+
source = { registry = "https://pypi.org/simple" }
|
|
68
|
+
dependencies = [
|
|
69
|
+
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
|
70
|
+
{ name = "iniconfig" },
|
|
71
|
+
{ name = "packaging" },
|
|
72
|
+
{ name = "pluggy" },
|
|
73
|
+
{ name = "pygments" },
|
|
74
|
+
]
|
|
75
|
+
sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" }
|
|
76
|
+
wheels = [
|
|
77
|
+
{ url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
|
|
78
|
+
]
|