evalforge 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ # Generated by Cargo
2
+ # will have compiled files and executables
3
+ debug
4
+ target
5
+
6
+ # These are backup files generated by rustfmt
7
+ **/*.rs.bk
8
+
9
+ # MSVC Windows builds of rustc generate these, which store debugging information
10
+ *.pdb
11
+
12
+ # Generated by cargo mutants
13
+ # Contains mutation testing data
14
+ **/mutants.out*/
15
+
16
+ # RustRover
17
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
18
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
19
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
20
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
21
+ #.idea/
22
+
23
+ # Python
24
+ __pycache__/
25
+ *.py[cod]
26
+ *.egg-info/
27
+ dist/
28
+ build/
29
+ .venv/
30
+ .env
31
+ *.egg
32
+
33
+ # macOS
34
+ .DS_Store
35
+
36
+ # IDE
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+
41
+ # EvalForge specific
42
+ *.trace.json.bak
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalforge
3
+ Version: 0.1.0
4
+ Summary: Framework-agnostic LLM agent evaluation harness
5
+ Project-URL: Homepage, https://github.com/YOUR_USERNAME/evalforge
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Provides-Extra: dev
9
+ Requires-Dist: pytest>=7.0; extra == 'dev'
10
+ Description-Content-Type: text/markdown
11
+
12
+ # EvalForge Python SDK
13
+
14
+ pip install evalforge
15
+
16
+ ## Quick Start
17
+ import evalforge
18
+ result = evalforge.run("trace.json", metrics=["faithfulness"])
19
+ print(result.passed)
20
+ print(result.metrics[0].score)
@@ -0,0 +1,9 @@
1
+ # EvalForge Python SDK
2
+
3
```bash
pip install evalforge
```
4
+
5
+ ## Quick Start
6
```python
import evalforge

result = evalforge.run("trace.json", metrics=["faithfulness"])
print(result.passed)
print(result.metrics[0].score)
```
File without changes
@@ -0,0 +1,3 @@
1
+ from .client import run, EvalResult, MetricResult
2
+
3
+ __all__ = ["run", "EvalResult", "MetricResult"]
@@ -0,0 +1,122 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ import subprocess
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+
9
+
10
+ @dataclass
11
+ class MetricResult:
12
+ metric: str
13
+ score: float
14
+ passed: bool
15
+ reason: str
16
+
17
+
18
+ @dataclass
19
+ class EvalResult:
20
+ trace_id: str
21
+ framework: str
22
+ metrics: list[MetricResult] = field(default_factory=list)
23
+ passed: bool = False
24
+
25
+
26
+ def _find_binary() -> str:
27
+ # 1. Explicit override via environment variable
28
+ env_bin = os.environ.get("EVALFORGE_BIN")
29
+ if env_bin and Path(env_bin).is_file():
30
+ return env_bin
31
+
32
+ # 2. Walk up from client.py looking for target/debug/evalforge (up to 5 levels)
33
+ current = Path(__file__).resolve().parent
34
+ for _ in range(5):
35
+ candidate = current / "target" / "debug" / "evalforge"
36
+ if candidate.exists():
37
+ return str(candidate)
38
+ current = current.parent
39
+
40
+ # 3. System PATH
41
+ import shutil
42
+ path_bin = shutil.which("evalforge")
43
+ if path_bin:
44
+ return path_bin
45
+
46
+ raise RuntimeError(
47
+ f"EvalForge binary not found. "
48
+ f"Searched up to 5 parent dirs from {Path(__file__).resolve()}. "
49
+ f"Set EVALFORGE_BIN env var to the binary path, or run 'cargo build' first."
50
+ )
51
+
52
+
53
+ def run(
54
+ trace: str,
55
+ metrics: list[str],
56
+ threshold: float = 0.7,
57
+ mock: bool = False,
58
+ api_key: str | None = None,
59
+ ) -> EvalResult:
60
+ binary = _find_binary()
61
+
62
+ cmd = [
63
+ binary,
64
+ "run",
65
+ "--trace", trace,
66
+ "--metrics", ",".join(metrics),
67
+ "--threshold", str(threshold),
68
+ ]
69
+ if mock:
70
+ cmd.append("--mock")
71
+
72
+ env = os.environ.copy()
73
+ if api_key is not None:
74
+ env["ANTHROPIC_API_KEY"] = api_key
75
+
76
+ proc = subprocess.run(cmd, capture_output=True, text=True, env=env)
77
+
78
+ if proc.returncode not in (0, 1):
79
+ raise RuntimeError(
80
+ f"evalforge exited with code {proc.returncode}.\n"
81
+ f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
82
+ )
83
+
84
+ output = proc.stdout
85
+
86
+ # Parse trace summary fields
87
+ trace_id = _extract_field(output, "Trace ID")
88
+ framework = _extract_field(output, "Framework")
89
+
90
+ # Parse scoring result lines and the Reason: line that follows each.
91
+ # e.g.:
92
+ # faithfulness 0.91 PASS
93
+ # Reason: Mock score — skipping live API call
94
+ metric_results: list[MetricResult] = []
95
+ pattern = re.compile(
96
+ r"^(\w+)\s+([\d.]+)\s+(PASS|FAIL)\s*\nReason:\s*(.+)$",
97
+ re.MULTILINE,
98
+ )
99
+ for m in pattern.finditer(output):
100
+ metric_results.append(
101
+ MetricResult(
102
+ metric=m.group(1),
103
+ score=float(m.group(2)),
104
+ passed=m.group(3) == "PASS",
105
+ reason=m.group(4).strip(),
106
+ )
107
+ )
108
+
109
+ overall_passed = proc.returncode == 0 and all(r.passed for r in metric_results)
110
+
111
+ return EvalResult(
112
+ trace_id=trace_id,
113
+ framework=framework,
114
+ metrics=metric_results,
115
+ passed=overall_passed,
116
+ )
117
+
118
+
119
+ def _extract_field(output: str, label: str) -> str:
120
+ pattern = re.compile(rf"^{re.escape(label)}:\s+(.+)$", re.MULTILINE)
121
+ m = pattern.search(output)
122
+ return m.group(1).strip() if m else ""
@@ -0,0 +1,21 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "evalforge"
7
+ version = "0.1.0"
8
+ description = "Framework-agnostic LLM agent evaluation harness"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ dependencies = []
13
+
14
+ [project.urls]
15
+ Homepage = "https://github.com/YOUR_USERNAME/evalforge"
16
+
17
+ [project.optional-dependencies]
18
+ dev = ["pytest>=7.0"]
19
+
20
+ [tool.hatch.build.targets.wheel]
21
+ packages = ["evalforge"]
File without changes
@@ -0,0 +1,49 @@
1
+ from pathlib import Path
2
+
3
+ import evalforge
4
+ from evalforge.client import EvalResult, MetricResult
5
+
6
+
7
# Fixture locations are derived from this file's own path, so the tests behave
# the same no matter which working directory pytest is started from.
_WORKSPACE_ROOT = Path(__file__).parent.parent.parent.parent
_FIXTURES_DIR = _WORKSPACE_ROOT / "tests/fixtures"
SAMPLE_TRACE = str(_FIXTURES_DIR / "sample_trace.json")
SIMPLE_TRACE = str(_FIXTURES_DIR / "simple_trace.json")
12
+
13
+
14
def test_mock_run():
    """A mocked faithfulness run on the sample trace passes with one 0.91 score."""
    outcome = evalforge.run(SAMPLE_TRACE, metrics=["faithfulness"], mock=True)
    parsed = outcome.metrics

    assert outcome.passed is True
    assert len(parsed) == 1
    assert parsed[0].score == 0.91
19
+
20
+
21
def test_simple_trace_mock():
    """A mocked faithfulness run on the simple trace passes with one 0.91 score."""
    outcome = evalforge.run(SIMPLE_TRACE, metrics=["faithfulness"], mock=True)
    parsed = outcome.metrics

    assert outcome.passed is True
    assert len(parsed) == 1
    assert parsed[0].score == 0.91
26
+
27
+
28
def test_threshold_boundary():
    """A MetricResult built with ``passed=True`` at score 0.7 reports passed.

    NOTE(review): this only checks that the dataclass stores the flag; it does
    not run the CLI, so the actual threshold comparison is not exercised here.
    """
    at_threshold = MetricResult(
        metric="faithfulness",
        score=0.7,
        passed=True,
        reason="at threshold",
    )
    assert at_threshold.passed is True
31
+
32
+
33
def test_metric_result_fields():
    """MetricResult and EvalResult expose the values they were constructed with."""
    metric_result = MetricResult(
        metric="faithfulness",
        score=0.85,
        passed=True,
        reason="looks good",
    )
    assert metric_result.metric == "faithfulness"
    assert metric_result.score == 0.85
    assert metric_result.passed is True
    assert metric_result.reason == "looks good"

    eval_result = EvalResult(
        trace_id="trace-001",
        framework="langchain",
        metrics=[metric_result],
        passed=True,
    )
    assert eval_result.trace_id == "trace-001"
    assert eval_result.framework == "langchain"
    assert eval_result.passed is True
    assert eval_result.metrics[0].metric == "faithfulness"
@@ -0,0 +1,78 @@
1
+ version = 1
2
+ revision = 3
3
+ requires-python = ">=3.11"
4
+
5
+ [[package]]
6
+ name = "colorama"
7
+ version = "0.4.6"
8
+ source = { registry = "https://pypi.org/simple" }
9
+ sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
10
+ wheels = [
11
+ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
12
+ ]
13
+
14
+ [[package]]
15
+ name = "evalforge"
16
+ version = "0.1.0"
17
+ source = { editable = "." }
18
+
19
+ [package.optional-dependencies]
20
+ dev = [
21
+ { name = "pytest" },
22
+ ]
23
+
24
+ [package.metadata]
25
+ requires-dist = [{ name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" }]
26
+ provides-extras = ["dev"]
27
+
28
+ [[package]]
29
+ name = "iniconfig"
30
+ version = "2.3.0"
31
+ source = { registry = "https://pypi.org/simple" }
32
+ sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
33
+ wheels = [
34
+ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
35
+ ]
36
+
37
+ [[package]]
38
+ name = "packaging"
39
+ version = "26.0"
40
+ source = { registry = "https://pypi.org/simple" }
41
+ sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" }
42
+ wheels = [
43
+ { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
44
+ ]
45
+
46
+ [[package]]
47
+ name = "pluggy"
48
+ version = "1.6.0"
49
+ source = { registry = "https://pypi.org/simple" }
50
+ sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
51
+ wheels = [
52
+ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
53
+ ]
54
+
55
+ [[package]]
56
+ name = "pygments"
57
+ version = "2.20.0"
58
+ source = { registry = "https://pypi.org/simple" }
59
+ sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
60
+ wheels = [
61
+ { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
62
+ ]
63
+
64
+ [[package]]
65
+ name = "pytest"
66
+ version = "9.0.2"
67
+ source = { registry = "https://pypi.org/simple" }
68
+ dependencies = [
69
+ { name = "colorama", marker = "sys_platform == 'win32'" },
70
+ { name = "iniconfig" },
71
+ { name = "packaging" },
72
+ { name = "pluggy" },
73
+ { name = "pygments" },
74
+ ]
75
+ sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" }
76
+ wheels = [
77
+ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
78
+ ]