fusionkit-evals 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusionkit_evals-0.1.1/PKG-INFO +9 -0
- fusionkit_evals-0.1.1/pyproject.toml +18 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/__init__.py +119 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/benchmark.py +68 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/dirty_dozen.py +100 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/fusion_bench.py +1489 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/pareto.py +95 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/public_smoke.py +122 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/schema.py +24 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/scorers.py +13 -0
- fusionkit_evals-0.1.1/src/fusionkit_evals/tiny.py +316 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: fusionkit-evals
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Evaluation and Pareto analysis tools for fusionkit.
|
|
5
|
+
Requires-Dist: fusionkit-core==0.1.1
|
|
6
|
+
Requires-Dist: pydantic>=2.12.5
|
|
7
|
+
Requires-Dist: pandas>=2.3.3 ; extra == 'evals'
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Provides-Extra: evals
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fusionkit-evals"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = "Evaluation and Pareto analysis tools for fusionkit."
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fusionkit-core==0.1.1",
|
|
8
|
+
"pydantic>=2.12.5",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[project.optional-dependencies]
|
|
12
|
+
evals = [
|
|
13
|
+
"pandas>=2.3.3",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[build-system]
|
|
17
|
+
requires = ["uv_build>=0.11.21,<0.12.0"]
|
|
18
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from fusionkit_evals.benchmark import BenchmarkRunner
|
|
2
|
+
from fusionkit_evals.dirty_dozen import (
|
|
3
|
+
DIRTY_DOZEN_REPOS,
|
|
4
|
+
DIRTY_DOZEN_ROOT,
|
|
5
|
+
DIRTY_DOZEN_TASK_COUNT,
|
|
6
|
+
DirtyDozenRepo,
|
|
7
|
+
assert_dirty_dozen_manifest,
|
|
8
|
+
load_dirty_dozen_tasks,
|
|
9
|
+
)
|
|
10
|
+
from fusionkit_evals.fusion_bench import (
|
|
11
|
+
FUSION_BENCH_DISCLAIMER,
|
|
12
|
+
CommandHandoffKitExecutor,
|
|
13
|
+
FusionBenchAggregateMetrics,
|
|
14
|
+
FusionBenchAttemptRow,
|
|
15
|
+
FusionBenchFailure,
|
|
16
|
+
FusionBenchFailureCorrelation,
|
|
17
|
+
FusionBenchParetoPoint,
|
|
18
|
+
FusionBenchReport,
|
|
19
|
+
FusionBenchReproducibilityMetadata,
|
|
20
|
+
FusionBenchRunner,
|
|
21
|
+
FusionBenchTask,
|
|
22
|
+
FusionBenchTaskMetrics,
|
|
23
|
+
HandoffKitExecutor,
|
|
24
|
+
HandoffKitExecutorError,
|
|
25
|
+
HandoffKitExecutorUnavailable,
|
|
26
|
+
build_fusion_bench_report,
|
|
27
|
+
format_fusion_bench_html_report,
|
|
28
|
+
format_fusion_bench_markdown_report,
|
|
29
|
+
join_handoffkit_records,
|
|
30
|
+
load_benchmark_tasks,
|
|
31
|
+
load_fusion_bench_jsonl,
|
|
32
|
+
parse_handoffkit_records,
|
|
33
|
+
score_fusion_bench_row,
|
|
34
|
+
write_fusion_bench_html_report,
|
|
35
|
+
write_fusion_bench_jsonl,
|
|
36
|
+
write_fusion_bench_markdown_report,
|
|
37
|
+
write_fusion_bench_report_jsonl,
|
|
38
|
+
)
|
|
39
|
+
from fusionkit_evals.pareto import ParetoPoint, find_pareto_front, format_pareto_markdown
|
|
40
|
+
from fusionkit_evals.public_smoke import (
|
|
41
|
+
PUBLIC_SMOKE_DISCLAIMER,
|
|
42
|
+
PUBLIC_SMOKE_FIXTURE_ROOT,
|
|
43
|
+
PUBLIC_SMOKE_SUITE_INFO,
|
|
44
|
+
PUBLIC_SMOKE_SUITES,
|
|
45
|
+
PublicSmokeSuite,
|
|
46
|
+
PublicSmokeSuiteInfo,
|
|
47
|
+
assert_public_smoke_matrix,
|
|
48
|
+
load_public_smoke_tasks,
|
|
49
|
+
)
|
|
50
|
+
from fusionkit_evals.schema import EvalResult, EvalSample
|
|
51
|
+
from fusionkit_evals.scorers import contains_expected, exact_match
|
|
52
|
+
from fusionkit_evals.tiny import (
|
|
53
|
+
TinyBenchmarkResult,
|
|
54
|
+
TinyBenchmarkTask,
|
|
55
|
+
format_tiny_benchmark_report,
|
|
56
|
+
load_tiny_tasks,
|
|
57
|
+
run_tiny_benchmark,
|
|
58
|
+
write_tiny_benchmark_report,
|
|
59
|
+
write_tiny_jsonl,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# Public API of the package: every name re-exported by the imports above,
# kept in case-sensitive (ASCII) alphabetical order so additions are easy to
# place and diffs stay small.
__all__ = [
    "BenchmarkRunner",
    "CommandHandoffKitExecutor",
    "DIRTY_DOZEN_REPOS",
    "DIRTY_DOZEN_ROOT",
    "DIRTY_DOZEN_TASK_COUNT",
    "DirtyDozenRepo",
    "EvalResult",
    "EvalSample",
    "FUSION_BENCH_DISCLAIMER",
    "FusionBenchAggregateMetrics",
    "FusionBenchAttemptRow",
    "FusionBenchFailure",
    "FusionBenchFailureCorrelation",
    "FusionBenchParetoPoint",
    "FusionBenchReport",
    "FusionBenchReproducibilityMetadata",
    "FusionBenchRunner",
    "FusionBenchTask",
    "FusionBenchTaskMetrics",
    "HandoffKitExecutor",
    "HandoffKitExecutorError",
    "HandoffKitExecutorUnavailable",
    "PUBLIC_SMOKE_DISCLAIMER",
    "PUBLIC_SMOKE_FIXTURE_ROOT",
    "PUBLIC_SMOKE_SUITES",
    "PUBLIC_SMOKE_SUITE_INFO",
    "ParetoPoint",
    "PublicSmokeSuite",
    "PublicSmokeSuiteInfo",
    "TinyBenchmarkResult",
    "TinyBenchmarkTask",
    "assert_dirty_dozen_manifest",
    "assert_public_smoke_matrix",
    "build_fusion_bench_report",
    "contains_expected",
    "exact_match",
    "find_pareto_front",
    "format_fusion_bench_html_report",
    "format_fusion_bench_markdown_report",
    "format_pareto_markdown",
    "format_tiny_benchmark_report",
    "join_handoffkit_records",
    "load_benchmark_tasks",
    "load_dirty_dozen_tasks",
    "load_fusion_bench_jsonl",
    "load_public_smoke_tasks",
    "load_tiny_tasks",
    "parse_handoffkit_records",
    "run_tiny_benchmark",
    "score_fusion_bench_row",
    "write_fusion_bench_html_report",
    "write_fusion_bench_jsonl",
    "write_fusion_bench_markdown_report",
    "write_fusion_bench_report_jsonl",
    "write_tiny_benchmark_report",
    "write_tiny_jsonl",
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable, Iterable
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from fusionkit_core.config import FusionMode
|
|
9
|
+
from fusionkit_core.fusion import FusionEngine
|
|
10
|
+
from fusionkit_core.types import ChatMessage
|
|
11
|
+
|
|
12
|
+
from fusionkit_evals.schema import EvalResult, EvalSample
|
|
13
|
+
from fusionkit_evals.scorers import contains_expected
|
|
14
|
+
|
|
15
|
+
# Signature shared by scoring callables: called with the produced output text
# and the sample's expected text (which may be None); returns a float score
# or None.
Scorer = Callable[[str, str | None], float | None]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BenchmarkRunner:
    """Feed evaluation samples to a fusion engine and score what comes back."""

    def __init__(self, engine: FusionEngine, scorer: Scorer = contains_expected) -> None:
        """Remember the engine to query and the scorer applied to each output."""
        self.scorer = scorer
        self.engine = engine

    async def run_samples(
        self,
        samples: Iterable[EvalSample],
        config_id: str,
        mode: FusionMode,
    ) -> list[EvalResult]:
        """Run every sample sequentially and return one result per sample.

        Each sample's prompt is sent as a single user message. The recorded
        latency covers only the engine call (message construction included),
        not the scoring step that follows it.

        Args:
            samples: Samples to evaluate, consumed in iteration order.
            config_id: Identifier copied verbatim onto every result.
            mode: Fusion mode passed to the engine and stored on every result.

        Returns:
            Results in the same order as ``samples``.
        """
        collected: list[EvalResult] = []
        for sample in samples:
            t0 = time.perf_counter()
            reply = await self.engine.run(
                [ChatMessage(role="user", content=sample.prompt)],
                mode=mode,
            )
            elapsed = time.perf_counter() - t0

            answer = reply.content
            record = EvalResult(
                sample_id=sample.id,
                config_id=config_id,
                mode=mode,
                output=answer,
                score=self.scorer(answer, sample.expected),
                latency_s=elapsed,
                # Keep what the engine reports it actually did alongside the
                # requested mode stored above.
                metadata={"fusion_mode": reply.mode, "route": reply.route},
            )
            collected.append(record)
        return collected
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def load_jsonl_samples(path: str | Path) -> list[EvalSample]:
    """Read a UTF-8 JSONL file into ``EvalSample`` objects.

    Lines that are empty or contain only whitespace are skipped; every other
    line is validated as one JSON-encoded sample.
    """
    with Path(path).open(encoding="utf-8") as stream:
        return [
            EvalSample.model_validate_json(raw)
            for raw in stream
            if raw.strip()
        ]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def write_jsonl_results(path: str | Path, results: Iterable[EvalResult]) -> None:
    """Write ``results`` to ``path`` as JSONL, one JSON object per line.

    Missing parent directories are created, and an existing file is
    overwritten. Each result is serialized via ``model_dump(mode="json")``.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as sink:
        for item in results:
            payload = item.model_dump(mode="json")
            sink.write(json.dumps(payload) + "\n")
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from fusionkit_evals.fusion_bench import FusionBenchTask, load_benchmark_tasks
|
|
9
|
+
|
|
10
|
+
# Default location of the dirty-dozen benchmark fixtures: the
# "benchmarks/dirty-dozen" directory two levels above this module's directory.
# NOTE(review): this presumably matches a source checkout laid out as
# <project>/src/<package>/; when the package is installed elsewhere the
# directory may not exist — confirm and pass `root` explicitly if so.
DIRTY_DOZEN_ROOT = Path(__file__).resolve().parents[2] / "benchmarks" / "dirty-dozen"
# Exact number of tasks assert_dirty_dozen_manifest requires.
DIRTY_DOZEN_TASK_COUNT = 12
# Closed set of source-repo names accepted for dirty-dozen tasks.
DirtyDozenRepo = Literal["fusionkit", "handoffkit", "cursorkit", "mlx-lm"]
# Runtime counterpart of DirtyDozenRepo, used for membership checks.
DIRTY_DOZEN_REPOS: tuple[DirtyDozenRepo, ...] = (
    "fusionkit",
    "handoffkit",
    "cursorkit",
    "mlx-lm",
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_dirty_dozen_tasks(
    root: str | Path = DIRTY_DOZEN_ROOT,
    repos: Iterable[DirtyDozenRepo] | None = None,
) -> list[FusionBenchTask]:
    """Load dirty-dozen tasks from ``root``, optionally filtered by source repo.

    Args:
        root: Directory handed to ``load_benchmark_tasks``.
        repos: Source-repo names to keep. ``None`` keeps every loaded task.

    Returns:
        The loaded tasks, restricted to those whose ``record.source_repo`` is
        in ``repos`` when a filter is given.

    Raises:
        ValueError: If ``repos`` names a repo outside ``DIRTY_DOZEN_REPOS``.
    """
    all_tasks = load_benchmark_tasks(root)
    if repos is not None:
        wanted = set(repos)
        unrecognized = wanted.difference(DIRTY_DOZEN_REPOS)
        if unrecognized:
            raise ValueError(f"Unknown dirty-dozen repos: {sorted(unrecognized)}")
        return [entry for entry in all_tasks if entry.record.source_repo in wanted]
    return all_tasks
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def assert_dirty_dozen_manifest(tasks: Iterable[FusionBenchTask]) -> None:
    """Validate that ``tasks`` form a well-formed dirty-dozen manifest.

    Checks run in the order below and the first failure raises:

    1. There are exactly ``DIRTY_DOZEN_TASK_COUNT`` tasks.
    2. Task IDs are unique.
    3. Every repo in ``DIRTY_DOZEN_REPOS`` is represented.
    4. Every represented known repo has at least two tasks.
    5. No task comes from a repo outside ``DIRTY_DOZEN_REPOS``.
    6. Both the ``"model_fusion"`` and ``"harness_coding"`` kinds appear.
    7. Each task individually satisfies ``_assert_task_policy``.

    Args:
        tasks: Tasks to validate; the iterable is consumed once.

    Raises:
        ValueError: On the first violated rule, with a message naming it.
    """
    task_list = list(tasks)
    if len(task_list) != DIRTY_DOZEN_TASK_COUNT:
        # Interpolate the constant so the message cannot drift from the check.
        raise ValueError(
            f"Expected {DIRTY_DOZEN_TASK_COUNT} dirty-dozen tasks, got {len(task_list)}"
        )

    task_ids = [task.record.task_id for task in task_list]
    duplicate_ids = sorted(
        task_id for task_id, count in Counter(task_ids).items() if count > 1
    )
    if duplicate_ids:
        raise ValueError(f"Dirty-dozen task IDs must be unique: {duplicate_ids}")

    repo_counts = Counter(task.record.source_repo for task in task_list)
    # A Counter returns 0 for absent keys, so missing repos are detected
    # without adding them to the mapping.
    missing_repos = [repo for repo in DIRTY_DOZEN_REPOS if repo_counts[repo] == 0]
    underrepresented = {
        repo: count
        for repo, count in sorted(repo_counts.items())
        if repo in DIRTY_DOZEN_REPOS and count < 2
    }
    unexpected_repos = sorted(set(repo_counts) - set(DIRTY_DOZEN_REPOS))
    if missing_repos:
        raise ValueError(f"Missing dirty-dozen source repos: {missing_repos}")
    if underrepresented:
        raise ValueError(f"Expected at least two tasks per repo, got {underrepresented}")
    if unexpected_repos:
        raise ValueError(f"Unexpected dirty-dozen source repos: {unexpected_repos}")

    task_kinds = {task.record.task_kind for task in task_list}
    if not {"model_fusion", "harness_coding"}.issubset(task_kinds):
        raise ValueError(f"Dirty-dozen must include both task kinds, got {task_kinds}")

    for task in task_list:
        _assert_task_policy(task)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _assert_task_policy(task: FusionBenchTask) -> None:
    """Check one task against the dirty-dozen per-task rules.

    Rules are evaluated in order and the first violation raises:
    category equals source repo; expected evidence is present; contamination
    notes are present and contain the text "solution"; prompt and setup
    hashes start with "sha256:"; allowed tools are listed; and the scorer
    params set ``dirty_dozen`` to exactly ``True``.

    Raises:
        ValueError: Describing the first rule the task violates.
    """
    entry = task.record

    if task.category != entry.source_repo:
        raise ValueError(
            f"Dirty-dozen category {task.category!r} must match source repo "
            f"{entry.source_repo!r} for {entry.task_id}"
        )

    if not entry.expected_evidence:
        raise ValueError(f"Dirty-dozen task must include expected evidence: {entry.task_id}")

    notes = entry.contamination_notes
    if not notes:
        raise ValueError(f"Dirty-dozen task must include contamination notes: {entry.task_id}")
    if "solution" not in notes:
        raise ValueError(
            f"Dirty-dozen contamination notes must state no solution is included: "
            f"{entry.task_id}"
        )

    prompt_digest = entry.prompt_hash
    if not prompt_digest.startswith("sha256:"):
        raise ValueError(f"Dirty-dozen task must include prompt hash: {entry.task_id}")

    setup_digest = entry.setup_hash
    if not setup_digest.startswith("sha256:"):
        raise ValueError(f"Dirty-dozen task must include setup hash: {entry.task_id}")

    if not entry.allowed_tools:
        raise ValueError(f"Dirty-dozen task must include allowed tools: {entry.task_id}")

    # Absent or empty params count the same as a missing flag; the flag itself
    # must be the literal True, not merely truthy.
    scorer_params = entry.scorer.params
    if not scorer_params or scorer_params.get("dirty_dozen") is not True:
        raise ValueError(f"Dirty-dozen scorer params must set dirty_dozen: {entry.task_id}")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Names exported by `from fusionkit_evals.dirty_dozen import *`; the private
# helper _assert_task_policy is intentionally not listed.
__all__ = [
    "DIRTY_DOZEN_REPOS",
    "DIRTY_DOZEN_ROOT",
    "DIRTY_DOZEN_TASK_COUNT",
    "DirtyDozenRepo",
    "assert_dirty_dozen_manifest",
    "load_dirty_dozen_tasks",
]
|