holistic-math-harness 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holistic_math_harness-0.0.0/LICENSE +27 -0
- holistic_math_harness-0.0.0/PKG-INFO +24 -0
- holistic_math_harness-0.0.0/holistic_math_harness/__init__.py +19 -0
- holistic_math_harness-0.0.0/holistic_math_harness/evaluator.py +62 -0
- holistic_math_harness-0.0.0/holistic_math_harness/evaluator_cross_model_diagnostic_synthesis.py +321 -0
- holistic_math_harness-0.0.0/holistic_math_harness/evaluator_error_taxonomy_induction.py +260 -0
- holistic_math_harness-0.0.0/holistic_math_harness/evaluator_structured_signal_extraction.py +166 -0
- holistic_math_harness-0.0.0/holistic_math_harness/holistic_harness.py +369 -0
- holistic_math_harness-0.0.0/holistic_math_harness/initialize_benchmark.py +174 -0
- holistic_math_harness-0.0.0/holistic_math_harness/prompt_variation.py +117 -0
- holistic_math_harness-0.0.0/holistic_math_harness/utils.py +135 -0
- holistic_math_harness-0.0.0/holistic_math_harness.egg-info/PKG-INFO +24 -0
- holistic_math_harness-0.0.0/holistic_math_harness.egg-info/SOURCES.txt +25 -0
- holistic_math_harness-0.0.0/holistic_math_harness.egg-info/dependency_links.txt +1 -0
- holistic_math_harness-0.0.0/holistic_math_harness.egg-info/requires.txt +14 -0
- holistic_math_harness-0.0.0/holistic_math_harness.egg-info/top_level.txt +2 -0
- holistic_math_harness-0.0.0/pyproject.toml +41 -0
- holistic_math_harness-0.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
---
|
|
3
|
+
|
|
4
|
+
# 📄 **LICENSE (MIT License)**
|
|
5
|
+
|
|
6
|
+
```text id="license001"
|
|
7
|
+
MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 CIOL
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: holistic-math-harness
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation.
|
|
5
|
+
Author-email: Wahid Faisal <wahiddhrubo@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pandas
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Requires-Dist: alive-progress
|
|
13
|
+
Requires-Dist: gitpython
|
|
14
|
+
Requires-Dist: huggingface-hub
|
|
15
|
+
Requires-Dist: openai
|
|
16
|
+
Requires-Dist: scipy
|
|
17
|
+
Requires-Dist: scikit-learn
|
|
18
|
+
Requires-Dist: torch
|
|
19
|
+
Requires-Dist: transformers
|
|
20
|
+
Requires-Dist: umap-learn
|
|
21
|
+
Requires-Dist: pillow
|
|
22
|
+
Requires-Dist: cohere
|
|
23
|
+
Requires-Dist: hdbscan
|
|
24
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .evaluator import HarnessEvaluator,structured_signal_extraction,cross_model_diagnostic_synthesis
|
|
2
|
+
from .holistic_harness import HarnessRunner
|
|
3
|
+
from .initialize_benchmark import process_benchmark, BenchmarkStats
|
|
4
|
+
from .prompt_variation import PromptVariationGenerator
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# NOTE(review): this literal disagrees with the distribution metadata shipped
# in the same release (PKG-INFO declares "Version: 0.0.0") — confirm which of
# the two is authoritative and bring them in line.
__version__ = "0.1.5"
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"HarnessEvaluator",
|
|
12
|
+
"cross_model_diagnostic_synthesis",
|
|
13
|
+
"structured_signal_extraction",
|
|
14
|
+
"HarnessRunner",
|
|
15
|
+
"process_benchmark",
|
|
16
|
+
"PromptVariationGenerator",
|
|
17
|
+
"BenchmarkStats"
|
|
18
|
+
]
|
|
19
|
+
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# ================= STANDARD LIB =================
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# ================= DATA =================
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from .evaluator_cross_model_diagnostic_synthesis import cross_model_diagnostic_synthesis
|
|
11
|
+
from .evaluator_error_taxonomy_induction import error_taxonomy_induction
|
|
12
|
+
from .evaluator_structured_signal_extraction import structured_signal_extraction
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def HarnessEvaluator(results_path, custom_error_taxonomies=None, max_concurrent=10, model_name="gemma-4-31b-it"):
    """Run the evaluation pipeline over every model folder in ``results_path``.

    For each immediate sub-directory, the function runs structured signal
    extraction on ``agent_response.csv``, then error taxonomy induction on
    ``agent_scores.csv``, and prints how many rows still lack an
    ``accuracy_score``. After all folders are processed, every
    ``agent_scores.csv`` found recursively under ``results_path`` is passed to
    the cross-model synthesis step.

    Args:
        results_path: Directory containing one sub-directory per model.
        custom_error_taxonomies: Optional mapping of error code to description.
            When falsy (``None`` or empty) the built-in E1–E7 taxonomy is used.
        max_concurrent: Forwarded to ``structured_signal_extraction``.
        model_name: Forwarded to the extraction and taxonomy-induction steps.

    Returns:
        None.
    """
    results_path = Path(results_path)

    error_taxonomies = custom_error_taxonomies if custom_error_taxonomies else {
        "E1":"perceptual failure — model misidentified or failed to detect a spatial element in the image",
        "E2":"grounding failure — model identified elements but failed to anchor them in the correct spatial frame of reference",
        "E3":"spatial transformation failure — model understood the spatial arrangement but failed to correctly apply a rotation, reflection, or perspective change)",
        "E4":"quantitative spatial failure — failure on counting, distance estimation, or measurement",
        "E5":"multi-step reasoning failure — model solved one spatial step correctly but failed on a subsequent step requiring the first as input",
        "E6":"language-spatial mapping failure — model misunderstood the spatial language in the question, e.g., confused 'left of X' with 'right of X'",
        "E7":"format/refusal failure — model did not produce a valid answer regardless of spatial competence",
    }

    # iterdir() yields entries in arbitrary order; sort so that processing and
    # the printed progress are reproducible across runs and platforms.
    for model_folder in sorted(results_path.iterdir()):
        if not model_folder.is_dir():
            continue

        response_csv_path = model_folder / "agent_response.csv"
        score_csv_path = model_folder / "agent_scores.csv"

        # Scores from an earlier run, if any, are handed to the extraction step.
        prev_df = pd.read_csv(score_csv_path) if score_csv_path.exists() else None

        structured_signal_extraction(
            response_csv_path,
            max_concurrent=max_concurrent,
            model_name=model_name,
            prev_df=prev_df,
        )
        error_taxonomy_induction(
            score_csv_path,
            error_taxonomies=error_taxonomies,
            model_name=model_name,
        )

        # Re-read the score file: the two steps above may have (re)written it.
        if score_csv_path.exists():
            df = pd.read_csv(score_csv_path)
            if "accuracy_score" in df.columns:
                print("Numbers of Failed Tasks: ", int(df["accuracy_score"].isna().sum()))

    cross_model_diagnostic_synthesis(results_path.rglob("agent_scores.csv"), results_path)
|
|
62
|
+
|
holistic_math_harness-0.0.0/holistic_math_harness/evaluator_cross_model_diagnostic_synthesis.py
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
# ================= STANDARD LIB =================
|
|
2
|
+
import json
|
|
3
|
+
from itertools import combinations
|
|
4
|
+
|
|
5
|
+
# ================= DATA =================
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
# ================= ML / STATS =================
|
|
10
|
+
from scipy.stats import chi2
|
|
11
|
+
from sklearn.preprocessing import MultiLabelBinarizer
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# ================= BOOTSTRAP CI =================
|
|
15
|
+
def bootstrap_ci(scores_lst, n_bootstrap=1000, alpha=0.05):
    """Estimate the mean of ``scores_lst`` with a percentile bootstrap interval.

    Args:
        scores_lst: Array-like of numeric scores. ``None`` and NaN entries are
            discarded before resampling.
        n_bootstrap: Number of bootstrap resamples to draw.
        alpha: Two-sided significance level; the interval runs from the
            ``alpha / 2`` to the ``1 - alpha / 2`` percentile of the
            resampled means.

    Returns:
        dict with float values under ``"mean"`` (mean of the resampled means),
        ``"ci_lower"``, ``"ci_upper"`` and ``"std"`` (standard deviation of the
        resampled means). All four are NaN when no valid score remains.
    """
    # Coerce to float first: object-dtype input (e.g. a list containing None)
    # would otherwise make np.isnan raise TypeError.
    scores = np.asarray(scores_lst, dtype=float)
    scores = scores[~np.isnan(scores)]

    n = len(scores)
    if n == 0:
        return {"mean": np.nan, "ci_lower": np.nan, "ci_upper": np.nan, "std": np.nan}

    # One resample (with replacement, same size as the data) per iteration.
    boot_means = np.array(
        [np.mean(scores[np.random.randint(0, n, n)]) for _ in range(n_bootstrap)]
    )

    return {
        "mean": float(np.mean(boot_means)),
        "ci_lower": float(np.percentile(boot_means, 100 * alpha / 2)),
        "ci_upper": float(np.percentile(boot_means, 100 * (1 - alpha / 2))),
        "std": float(np.std(boot_means)),
    }
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ================= COHEN'S D =================
|
|
41
|
+
def cohens_d(x, y):
    """Return Cohen's d (pooled-standard-deviation effect size) of ``x`` vs ``y``.

    Args:
        x: Array-like of numeric scores for the first group.
        y: Array-like of numeric scores for the second group.
            ``None`` and NaN entries are discarded from each group independently.

    Returns:
        float: ``(mean(x) - mean(y)) / pooled_std``. NaN when either group has
        fewer than two valid values, and ``0.0`` when the pooled standard
        deviation is zero (both groups constant).
    """
    # Coerce to float so None / object-dtype input becomes NaN instead of
    # making np.isnan raise TypeError.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = len(x), len(y)
    if nx < 2 or ny < 2:
        # A sample variance needs at least two observations per group.
        return np.nan

    var_x = np.var(x, ddof=1)
    var_y = np.var(y, ddof=1)
    pooled = np.sqrt(((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2))

    if pooled == 0:
        return 0.0
    return float((np.mean(x) - np.mean(y)) / pooled)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ================= MCNEMAR TEST =================
|
|
58
|
+
def mcnemar_test(x, y):
    """Run McNemar's test with continuity correction on paired binary outcomes.

    Args:
        x: Array-like of 0/1 outcomes for the first model.
        y: Array-like of 0/1 outcomes for the second model, aligned with ``x``.
            Pairs where either side is ``None``/NaN are ignored.

    Returns:
        tuple[float, float]: ``(statistic, p_value)`` where the statistic is
        ``(|b - c| - 1)**2 / (b + c)``, ``b`` counts pairs with ``x == 1`` and
        ``y == 0``, and ``c`` counts pairs with ``x == 0`` and ``y == 1``.
        Returns ``(0.0, 1.0)`` when there are no discordant pairs.
    """
    # Coerce to float so None / object-dtype input becomes NaN instead of
    # making np.isnan raise TypeError.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    valid = ~np.isnan(x) & ~np.isnan(y)
    x, y = x[valid], y[valid]

    # Discordant pairs: only these carry information about a difference.
    b = int(np.sum((x == 1) & (y == 0)))
    c = int(np.sum((x == 0) & (y == 1)))

    if b + c == 0:
        return 0.0, 1.0

    stat = (abs(b - c) - 1) ** 2 / (b + c)
    # Use the survival function rather than 1 - cdf: the subtraction loses all
    # precision in the upper tail and reports exactly 0.0 for strong effects.
    p = chi2.sf(stat, df=1)

    return float(stat), float(p)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ================= FULL PAIRWISE =================
|
|
77
|
+
def full_pairwise_evaluation(model_dfs, score_col="accuracy_score"):
    """Compare every pair of models on ``score_col`` using rows matched by ``id``.

    Args:
        model_dfs: Mapping of model name to a DataFrame that contains an
            ``id`` column and ``score_col``.
        score_col: Name of the column to compare.

    Returns:
        pandas.DataFrame with one row per *ordered* pair of models and the
        columns ``model_1``, ``model_2``, ``mcnemar_stat``, ``mcnemar_p``,
        ``cohens_d``, ``bonferroni_alpha`` and ``significant``. Each unordered
        pair appears twice; the mirrored row has the model names swapped and
        the sign of ``cohens_d`` flipped. The frame is empty (but keeps these
        columns) when fewer than two models are supplied.

    Raises:
        KeyError: if a frame lacks ``id`` or ``score_col``.
    """
    columns = [
        "model_1",
        "model_2",
        "mcnemar_stat",
        "mcnemar_p",
        "cohens_d",
        "bonferroni_alpha",
        "significant",
    ]

    pairs = list(combinations(model_dfs, 2))
    n_pairs = len(pairs)
    # Bonferroni correction over the number of unordered comparisons.
    bonf_alpha = 0.05 / n_pairs if n_pairs > 0 else 0.05

    records = []
    for m1, m2 in pairs:
        # Merge only the columns that are needed: this guarantees the
        # suffixed score columns exist and avoids duplicating every other
        # column of both frames.
        left = model_dfs[m1][["id", score_col]]
        right = model_dfs[m2][["id", score_col]]
        merged = left.merge(right, on="id", suffixes=("_1", "_2"))

        x = merged[f"{score_col}_1"].to_numpy()
        y = merged[f"{score_col}_2"].to_numpy()

        mcn_stat, mcn_p = mcnemar_test(x, y)
        d = cohens_d(x, y)

        records.append({
            "model_1": m1,
            "model_2": m2,
            "mcnemar_stat": mcn_stat,
            "mcnemar_p": mcn_p,
            "cohens_d": d,
            "bonferroni_alpha": bonf_alpha,
            "significant": bool(mcn_p < bonf_alpha),
        })

    forward = pd.DataFrame(records, columns=columns)
    if forward.empty:
        return forward

    # Mirror each comparison so it can be looked up from either model's side.
    # McNemar's statistic is symmetric; only the effect-size sign flips.
    mirrored = forward.assign(
        model_1=forward["model_2"],
        model_2=forward["model_1"],
        cohens_d=-forward["cohens_d"],
    )
    return pd.concat([forward, mirrored], ignore_index=True)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# ================= MAIN PIPELINE =================
|
|
125
|
+
def cross_model_diagnostic_synthesis(csv_paths, save_dir):
    """Aggregate per-model score files and export cross-model statistics.

    Every CSV in ``csv_paths`` that has an ``accuracy_score`` column is loaded;
    rows whose ``error_type`` is missing are dropped and the remaining labels
    are one-hot encoded. Per-model summary statistics are computed, and models
    that share an error type are compared pairwise on that error type's
    indicator column. Results are written to ``model_evaluations.csv`` (long
    format) and ``model_evaluations.json`` (nested) inside ``save_dir``.

    Args:
        csv_paths: Iterable of paths to score CSVs. The name of each file's
            parent directory is used as the model name.
        save_dir: Directory that receives the two output files; created if it
            does not exist.

    Returns:
        None.

    Raises:
        KeyError: if a loaded score file has no ``error_type`` column.
    """
    # Function-scope import: pathlib is not imported at module level here.
    from pathlib import Path

    save_dir = Path(save_dir)
    score_col = "accuracy_score"
    sub_score_cols = (
        "refusal_score",
        "off_topic_score",
        "spatial_elements_score",
        "spatial_relations_score",
        "spatial_reasoning_score",
    )

    model_dfs = {}
    model_evals = {}
    error_types = set()

    # ================= LOAD + PREPROCESS =================
    for csv_path in csv_paths:
        csv_path = Path(csv_path)
        df = pd.read_csv(csv_path)

        if score_col not in df.columns:
            continue

        # Path.name is separator-agnostic, unlike splitting the string on "/".
        model_name = csv_path.parent.name

        df = df[df["error_type"].notna()].copy()
        # MultiLabelBinarizer expects an iterable of labels per row.
        df["error_type"] = df["error_type"].apply(
            lambda label: [label] if isinstance(label, str) else []
        )

        mlb = MultiLabelBinarizer()
        dummies = pd.DataFrame(
            mlb.fit_transform(df["error_type"]),
            columns=mlb.classes_,
            index=df.index,
        )
        labels = list(mlb.classes_)
        error_types.update(labels)

        df = pd.concat([df, dummies], axis=1)

        scores_clean = pd.to_numeric(df[score_col], errors="coerce").dropna()
        overall = float(scores_clean.mean()) if len(scores_clean) > 0 else float("nan")

        evals = {
            "bootstrap_ci": bootstrap_ci(scores_clean, n_bootstrap=500),
            "average_score": float(np.round(overall, 5)),
            "error_types": labels,
            "overall_score": overall,
        }
        # Each sub-score is guarded by its own column, not by the accuracy column.
        for col in sub_score_cols:
            evals[col] = _numeric_mean(df, col)
        evals["Cohen_d"] = {}
        evals["Bonferroni"] = {}
        evals["Mcnemar"] = {}

        model_evals[model_name] = evals
        model_dfs[model_name] = df

    # ================= PER ERROR TYPE =================
    # Sorted so the exported files have a reproducible ordering.
    for error_type in sorted(error_types):
        selected = {
            name: frame
            for name, frame in model_dfs.items()
            if error_type in model_evals[name]["error_types"]
        }
        if len(selected) <= 1:
            continue

        pairwise = full_pairwise_evaluation(selected, score_col=error_type)

        for name in selected:
            for group in ("Cohen_d", "Bonferroni", "Mcnemar"):
                model_evals[name][group].setdefault(error_type, [])

        # Each row is filed under the model in its "model_1" column.
        for row in pairwise.to_dict("records"):
            evals = model_evals[row["model_1"]]
            other = row["model_2"]
            evals["Cohen_d"][error_type].append({
                "to": other,
                "cohens_d": row["cohens_d"],
            })
            evals["Bonferroni"][error_type].append({
                "to": other,
                "bonferroni_alpha": row["bonferroni_alpha"],
            })
            evals["Mcnemar"][error_type].append({
                "to": other,
                "mcnemar_stat": row["mcnemar_stat"],
                "mcnemar_p": row["mcnemar_p"],
            })

    # ================= EXPORT =================
    save_dir.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(_export_rows(model_evals)).to_csv(
        save_dir / "model_evaluations.csv", index=False
    )
    # Context manager so the handle is flushed and closed deterministically.
    with open(save_dir / "model_evaluations.json", "w", encoding="utf-8") as fh:
        json.dump(model_evals, fh, indent=4)


def _numeric_mean(frame, column):
    """Return the mean of ``column`` coerced to numeric, or NaN if unavailable."""
    if column not in frame.columns:
        return float("nan")
    values = pd.to_numeric(frame[column], errors="coerce").dropna()
    return float(values.mean()) if len(values) > 0 else float("nan")


def _export_rows(model_evals):
    """Flatten the nested per-model results into long-format CSV rows."""
    overall_metrics = (
        ("Score", "average_score"),
        ("Refusal Score", "refusal_score"),
        ("Off Topic Score", "off_topic_score"),
        ("Spatial Elements Score", "spatial_elements_score"),
        ("Spatial Relations Score", "spatial_relations_score"),
        ("Spatial Reasoning Score", "spatial_reasoning_score"),
    )
    # (key in model_evals, metric name written to the CSV and read from each entry)
    pairwise_metrics = (
        ("Cohen_d", "cohens_d"),
        ("Bonferroni", "bonferroni_alpha"),
        ("Mcnemar", "mcnemar_stat"),
    )

    rows = []
    for model, data in model_evals.items():
        for label, key in overall_metrics:
            rows.append({
                "model": model,
                "to": None,
                "metric": label,
                "error_type": "Overall",
                "value": data.get(key, 0),
            })

        bc = data.get("bootstrap_ci", {})
        if bc:
            rows.append({
                "model": model,
                "to": None,
                "metric": "bootstrap_mean",
                "error_type": "Overall",
                "value": bc.get("mean"),
            })

        for group, metric in pairwise_metrics:
            for error_type, entries in data.get(group, {}).items():
                for entry in entries:
                    rows.append({
                        "model": model,
                        "to": entry["to"],
                        "metric": metric,
                        "error_type": error_type,
                        "value": entry[metric],
                    })
    return rows
|