fusionkit-evals 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.3
2
+ Name: fusionkit-evals
3
+ Version: 0.1.1
4
+ Summary: Evaluation and Pareto analysis tools for fusionkit.
5
+ Requires-Dist: fusionkit-core==0.1.1
6
+ Requires-Dist: pydantic>=2.12.5
7
+ Requires-Dist: pandas>=2.3.3 ; extra == 'evals'
8
+ Requires-Python: >=3.11
9
+ Provides-Extra: evals
@@ -0,0 +1,18 @@
1
+ [project]
2
+ name = "fusionkit-evals"
3
+ version = "0.1.1"
4
+ description = "Evaluation and Pareto analysis tools for fusionkit."
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fusionkit-core==0.1.1",
8
+ "pydantic>=2.12.5",
9
+ ]
10
+
11
+ [project.optional-dependencies]
12
+ evals = [
13
+ "pandas>=2.3.3",
14
+ ]
15
+
16
+ [build-system]
17
+ requires = ["uv_build>=0.11.21,<0.12.0"]
18
+ build-backend = "uv_build"
@@ -0,0 +1,119 @@
1
+ from fusionkit_evals.benchmark import BenchmarkRunner
2
+ from fusionkit_evals.dirty_dozen import (
3
+ DIRTY_DOZEN_REPOS,
4
+ DIRTY_DOZEN_ROOT,
5
+ DIRTY_DOZEN_TASK_COUNT,
6
+ DirtyDozenRepo,
7
+ assert_dirty_dozen_manifest,
8
+ load_dirty_dozen_tasks,
9
+ )
10
+ from fusionkit_evals.fusion_bench import (
11
+ FUSION_BENCH_DISCLAIMER,
12
+ CommandHandoffKitExecutor,
13
+ FusionBenchAggregateMetrics,
14
+ FusionBenchAttemptRow,
15
+ FusionBenchFailure,
16
+ FusionBenchFailureCorrelation,
17
+ FusionBenchParetoPoint,
18
+ FusionBenchReport,
19
+ FusionBenchReproducibilityMetadata,
20
+ FusionBenchRunner,
21
+ FusionBenchTask,
22
+ FusionBenchTaskMetrics,
23
+ HandoffKitExecutor,
24
+ HandoffKitExecutorError,
25
+ HandoffKitExecutorUnavailable,
26
+ build_fusion_bench_report,
27
+ format_fusion_bench_html_report,
28
+ format_fusion_bench_markdown_report,
29
+ join_handoffkit_records,
30
+ load_benchmark_tasks,
31
+ load_fusion_bench_jsonl,
32
+ parse_handoffkit_records,
33
+ score_fusion_bench_row,
34
+ write_fusion_bench_html_report,
35
+ write_fusion_bench_jsonl,
36
+ write_fusion_bench_markdown_report,
37
+ write_fusion_bench_report_jsonl,
38
+ )
39
+ from fusionkit_evals.pareto import ParetoPoint, find_pareto_front, format_pareto_markdown
40
+ from fusionkit_evals.public_smoke import (
41
+ PUBLIC_SMOKE_DISCLAIMER,
42
+ PUBLIC_SMOKE_FIXTURE_ROOT,
43
+ PUBLIC_SMOKE_SUITE_INFO,
44
+ PUBLIC_SMOKE_SUITES,
45
+ PublicSmokeSuite,
46
+ PublicSmokeSuiteInfo,
47
+ assert_public_smoke_matrix,
48
+ load_public_smoke_tasks,
49
+ )
50
+ from fusionkit_evals.schema import EvalResult, EvalSample
51
+ from fusionkit_evals.scorers import contains_expected, exact_match
52
+ from fusionkit_evals.tiny import (
53
+ TinyBenchmarkResult,
54
+ TinyBenchmarkTask,
55
+ format_tiny_benchmark_report,
56
+ load_tiny_tasks,
57
+ run_tiny_benchmark,
58
+ write_tiny_benchmark_report,
59
+ write_tiny_jsonl,
60
+ )
61
+
62
# Public API of the package, kept in plain lexicographic (ASCII) order so that
# additions have one obvious place to go.
__all__ = [
    "BenchmarkRunner",
    "CommandHandoffKitExecutor",
    "DIRTY_DOZEN_REPOS",
    "DIRTY_DOZEN_ROOT",
    "DIRTY_DOZEN_TASK_COUNT",
    "DirtyDozenRepo",
    "EvalResult",
    "EvalSample",
    "FUSION_BENCH_DISCLAIMER",
    "FusionBenchAggregateMetrics",
    "FusionBenchAttemptRow",
    "FusionBenchFailure",
    "FusionBenchFailureCorrelation",
    "FusionBenchParetoPoint",
    "FusionBenchReport",
    "FusionBenchReproducibilityMetadata",
    "FusionBenchRunner",
    "FusionBenchTask",
    "FusionBenchTaskMetrics",
    "HandoffKitExecutor",
    "HandoffKitExecutorError",
    "HandoffKitExecutorUnavailable",
    "PUBLIC_SMOKE_DISCLAIMER",
    "PUBLIC_SMOKE_FIXTURE_ROOT",
    "PUBLIC_SMOKE_SUITES",
    "PUBLIC_SMOKE_SUITE_INFO",
    "ParetoPoint",
    "PublicSmokeSuite",
    "PublicSmokeSuiteInfo",
    "TinyBenchmarkResult",
    "TinyBenchmarkTask",
    "assert_dirty_dozen_manifest",
    "assert_public_smoke_matrix",
    "build_fusion_bench_report",
    "contains_expected",
    "exact_match",
    "find_pareto_front",
    "format_fusion_bench_html_report",
    "format_fusion_bench_markdown_report",
    "format_pareto_markdown",
    "format_tiny_benchmark_report",
    "join_handoffkit_records",
    "load_benchmark_tasks",
    "load_dirty_dozen_tasks",
    "load_fusion_bench_jsonl",
    "load_public_smoke_tasks",
    "load_tiny_tasks",
    "parse_handoffkit_records",
    "run_tiny_benchmark",
    "score_fusion_bench_row",
    "write_fusion_bench_html_report",
    "write_fusion_bench_jsonl",
    "write_fusion_bench_markdown_report",
    "write_fusion_bench_report_jsonl",
    "write_tiny_benchmark_report",
    "write_tiny_jsonl",
]
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from collections.abc import Callable, Iterable
6
+ from pathlib import Path
7
+
8
+ from fusionkit_core.config import FusionMode
9
+ from fusionkit_core.fusion import FusionEngine
10
+ from fusionkit_core.types import ChatMessage
11
+
12
+ from fusionkit_evals.schema import EvalResult, EvalSample
13
+ from fusionkit_evals.scorers import contains_expected
14
+
15
+ Scorer = Callable[[str, str | None], float | None]
16
+
17
+
18
class BenchmarkRunner:
    """Run evaluation samples through a fusion engine and score each output.

    The constructor stores ``engine`` and ``scorer`` unchanged. The engine's
    ``run`` coroutine is awaited once per sample, and the scorer is called with
    the engine output and the sample's expected value.
    """

    def __init__(self, engine: FusionEngine, scorer: Scorer = contains_expected) -> None:
        self.engine, self.scorer = engine, scorer

    async def run_samples(
        self,
        samples: Iterable[EvalSample],
        config_id: str,
        mode: FusionMode,
    ) -> list[EvalResult]:
        """Evaluate ``samples`` sequentially, in iteration order.

        Args:
            samples: Samples to run; each one is sent as a single user message.
            config_id: Identifier copied onto every produced result.
            mode: Fusion mode passed to the engine and copied onto every result.

        Returns:
            One ``EvalResult`` per sample, in the order the samples were given.
        """
        # Samples are awaited one after another; nothing runs concurrently.
        return [await self._run_one(sample, config_id, mode) for sample in samples]

    async def _run_one(
        self,
        sample: EvalSample,
        config_id: str,
        mode: FusionMode,
    ) -> EvalResult:
        """Run one sample through the engine, time the call, and score the output."""
        # The timer starts before the message is built, so latency covers
        # message construction plus the awaited engine call.
        t0 = time.perf_counter()
        messages = [ChatMessage(role="user", content=sample.prompt)]
        outcome = await self.engine.run(messages, mode=mode)
        elapsed = time.perf_counter() - t0

        # Scoring happens after the timer stops, so scorer cost is not measured.
        score = self.scorer(outcome.content, sample.expected)
        return EvalResult(
            sample_id=sample.id,
            config_id=config_id,
            mode=mode,
            output=outcome.content,
            score=score,
            latency_s=elapsed,
            metadata={"fusion_mode": outcome.mode, "route": outcome.route},
        )
52
+
53
+
54
def load_jsonl_samples(path: str | Path) -> list[EvalSample]:
    """Read a UTF-8 JSONL file into a list of ``EvalSample`` objects.

    Lines that are empty or whitespace-only are skipped; every other line is
    validated with ``EvalSample.model_validate_json``. Results keep file order.
    """
    with Path(path).open(encoding="utf-8") as stream:
        return [EvalSample.model_validate_json(row) for row in stream if row.strip()]
61
+
62
+
63
def write_jsonl_results(path: str | Path, results: Iterable[EvalResult]) -> None:
    """Write ``results`` to ``path`` as UTF-8 JSONL, one JSON object per line.

    Missing parent directories are created first, and an existing file at
    ``path`` is overwritten. Each result is serialized from
    ``model_dump(mode="json")`` with ``json.dumps``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(item.model_dump(mode="json")) + "\n" for item in results
        )
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from collections.abc import Iterable
5
+ from pathlib import Path
6
+ from typing import Literal
7
+
8
+ from fusionkit_evals.fusion_bench import FusionBenchTask, load_benchmark_tasks
9
+
10
# Default fixture location: ``benchmarks/dirty-dozen`` under the directory two
# levels above the one that contains this module file.
# NOTE(review): this resolves outside the package directory — confirm the
# fixtures are actually present at that location for installed distributions.
DIRTY_DOZEN_ROOT = Path(__file__).resolve().parents[2].joinpath("benchmarks", "dirty-dozen")

# Exact number of tasks a complete dirty-dozen manifest must contain.
DIRTY_DOZEN_TASK_COUNT = 12

# Closed set of source-repository names a dirty-dozen task may come from.
DirtyDozenRepo = Literal[
    "fusionkit",
    "handoffkit",
    "cursorkit",
    "mlx-lm",
]

# Runtime counterpart of ``DirtyDozenRepo``, in the same order.
DIRTY_DOZEN_REPOS: tuple[DirtyDozenRepo, ...] = ("fusionkit", "handoffkit", "cursorkit", "mlx-lm")
19
+
20
+
21
def load_dirty_dozen_tasks(
    root: str | Path = DIRTY_DOZEN_ROOT,
    repos: Iterable[DirtyDozenRepo] | None = None,
) -> list[FusionBenchTask]:
    """Load the dirty-dozen tasks from ``root``, optionally filtered by repo.

    Args:
        root: Location handed to ``load_benchmark_tasks``.
        repos: Source repositories to keep. ``None`` returns every loaded task.

    Returns:
        All loaded tasks, or only those whose ``record.source_repo`` is in
        ``repos``, in the order they were loaded.

    Raises:
        ValueError: If ``repos`` names a repo that is not in
            ``DIRTY_DOZEN_REPOS``. Tasks are loaded before this check runs.
    """
    all_tasks = load_benchmark_tasks(root)
    if repos is not None:
        wanted = set(repos)
        if unrecognized := wanted - set(DIRTY_DOZEN_REPOS):
            raise ValueError(f"Unknown dirty-dozen repos: {sorted(unrecognized)}")
        return [item for item in all_tasks if item.record.source_repo in wanted]
    return all_tasks
33
+
34
+
35
def assert_dirty_dozen_manifest(tasks: Iterable[FusionBenchTask]) -> None:
    """Validate that ``tasks`` forms a complete dirty-dozen manifest.

    Checks run in this order, and the first failure raises:

    1. The task count equals ``DIRTY_DOZEN_TASK_COUNT``.
    2. Task IDs are unique.
    3. Every repo in ``DIRTY_DOZEN_REPOS`` appears, each with at least two
       tasks, and no other source repo appears.
    4. Both the ``model_fusion`` and ``harness_coding`` task kinds are present.
    5. Every task satisfies ``_assert_task_policy``.

    Args:
        tasks: Tasks to validate; consumed once into a list.

    Raises:
        ValueError: On the first failed check, with a message naming it.
    """
    task_list = list(tasks)
    if len(task_list) != DIRTY_DOZEN_TASK_COUNT:
        # Report the constant itself so the message always matches the check.
        raise ValueError(
            f"Expected {DIRTY_DOZEN_TASK_COUNT} dirty-dozen tasks, got {len(task_list)}"
        )

    task_ids = [task.record.task_id for task in task_list]
    duplicate_ids = sorted(
        task_id for task_id, count in Counter(task_ids).items() if count > 1
    )
    if duplicate_ids:
        raise ValueError(f"Dirty-dozen task IDs must be unique: {duplicate_ids}")

    repo_counts = Counter(task.record.source_repo for task in task_list)
    # Counter returns 0 for absent keys, so a missing repo reads as a zero count.
    missing_repos = [repo for repo in DIRTY_DOZEN_REPOS if repo_counts[repo] == 0]
    underrepresented = {
        repo: count
        for repo, count in sorted(repo_counts.items())
        if repo in DIRTY_DOZEN_REPOS and count < 2
    }
    unexpected_repos = sorted(set(repo_counts) - set(DIRTY_DOZEN_REPOS))
    if missing_repos:
        raise ValueError(f"Missing dirty-dozen source repos: {missing_repos}")
    if underrepresented:
        raise ValueError(f"Expected at least two tasks per repo, got {underrepresented}")
    if unexpected_repos:
        raise ValueError(f"Unexpected dirty-dozen source repos: {unexpected_repos}")

    task_kinds = {task.record.task_kind for task in task_list}
    if not {"model_fusion", "harness_coding"}.issubset(task_kinds):
        raise ValueError(f"Dirty-dozen must include both task kinds, got {task_kinds}")

    for task in task_list:
        _assert_task_policy(task)
64
+
65
+
66
def _assert_task_policy(task: FusionBenchTask) -> None:
    """Check one task against the dirty-dozen per-task policy.

    Conditions are tested in a fixed order and only the first violation is
    reported: category matches source repo, expected evidence present,
    contamination notes present and containing ``"solution"``, prompt and setup
    hashes prefixed with ``"sha256:"``, allowed tools present, and scorer
    params setting ``dirty_dozen`` to ``True``.

    Raises:
        ValueError: Describing the first violated condition and the task ID.
    """
    rec = task.record
    problem: str | None = None

    # An if/elif chain keeps evaluation lazy: later conditions (and their
    # attribute accesses) are only reached when all earlier ones pass.
    if rec.source_repo != task.category:
        problem = (
            f"Dirty-dozen category {task.category!r} must match source repo "
            f"{rec.source_repo!r} for {rec.task_id}"
        )
    elif not rec.expected_evidence:
        problem = f"Dirty-dozen task must include expected evidence: {rec.task_id}"
    elif not rec.contamination_notes:
        problem = f"Dirty-dozen task must include contamination notes: {rec.task_id}"
    elif "solution" not in rec.contamination_notes:
        problem = (
            f"Dirty-dozen contamination notes must state no solution is included: "
            f"{rec.task_id}"
        )
    elif not rec.prompt_hash.startswith("sha256:"):
        problem = f"Dirty-dozen task must include prompt hash: {rec.task_id}"
    elif not rec.setup_hash.startswith("sha256:"):
        problem = f"Dirty-dozen task must include setup hash: {rec.task_id}"
    elif not rec.allowed_tools:
        problem = f"Dirty-dozen task must include allowed tools: {rec.task_id}"
    elif (rec.scorer.params or {}).get("dirty_dozen") is not True:
        # Missing or empty params are treated the same as an unset flag.
        problem = f"Dirty-dozen scorer params must set dirty_dozen: {rec.task_id}"

    if problem is not None:
        raise ValueError(problem)
91
+
92
+
93
+ __all__ = [
94
+ "DIRTY_DOZEN_REPOS",
95
+ "DIRTY_DOZEN_ROOT",
96
+ "DIRTY_DOZEN_TASK_COUNT",
97
+ "DirtyDozenRepo",
98
+ "assert_dirty_dozen_manifest",
99
+ "load_dirty_dozen_tasks",
100
+ ]