holistic-math-harness 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+
2
+ ---
3
+
4
+ # 📄 **LICENSE (MIT License)**
5
+
6
+ ```text id="license001"
7
+ MIT License
8
+
9
+ Copyright (c) 2026 CIOL
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.4
2
+ Name: holistic-math-harness
3
+ Version: 0.0.0
4
+ Summary: A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation.
5
+ Author-email: Wahid Faisal <wahiddhrubo@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: alive-progress
13
+ Requires-Dist: gitpython
14
+ Requires-Dist: huggingface-hub
15
+ Requires-Dist: openai
16
+ Requires-Dist: scipy
17
+ Requires-Dist: scikit-learn
18
+ Requires-Dist: torch
19
+ Requires-Dist: transformers
20
+ Requires-Dist: umap-learn
21
+ Requires-Dist: pillow
22
+ Requires-Dist: cohere
23
+ Requires-Dist: hdbscan
24
+ Dynamic: license-file
@@ -0,0 +1,19 @@
1
+ from .evaluator import HarnessEvaluator,structured_signal_extraction,cross_model_diagnostic_synthesis
2
+ from .holistic_harness import HarnessRunner
3
+ from .initialize_benchmark import process_benchmark, BenchmarkStats
4
+ from .prompt_variation import PromptVariationGenerator
5
+
6
+
7
+
8
# NOTE(review): the distribution metadata shipped alongside this module declares
# "Version: 0.0.0", which disagrees with the value below — confirm which one is
# authoritative and keep the two in sync.
__version__ = "0.1.5"

# Names re-exported as the package's public API; each one is imported at the
# top of this module from the corresponding sub-module.
__all__ = [
"HarnessEvaluator",
"cross_model_diagnostic_synthesis",
"structured_signal_extraction",
"HarnessRunner",
"process_benchmark",
"PromptVariationGenerator",
"BenchmarkStats"
]
19
+
@@ -0,0 +1,62 @@
1
+ # ================= STANDARD LIB =================
2
+ import os
3
+ from pathlib import Path
4
+
5
+ # ================= DATA =================
6
+ import pandas as pd
7
+
8
+
9
+
10
+ from .evaluator_cross_model_diagnostic_synthesis import cross_model_diagnostic_synthesis
11
+ from .evaluator_error_taxonomy_induction import error_taxonomy_induction
12
+ from .evaluator_structured_signal_extraction import structured_signal_extraction
13
+
14
+
15
+
16
+
17
+
18
def HarnessEvaluator(
    results_path,
    custom_error_taxonomies=None,
    max_concurrent=10,
    model_name="gemma-4-31b-it",
):
    """Evaluate every model folder under *results_path*, then compare models.

    For each sub-directory of ``results_path`` this runs
    ``structured_signal_extraction`` on its ``agent_response.csv`` and
    ``error_taxonomy_induction`` on its ``agent_scores.csv``, then prints how
    many rows of the score file have no ``accuracy_score``.  Once all folders
    are processed, every ``agent_scores.csv`` found below ``results_path`` is
    handed to ``cross_model_diagnostic_synthesis``.

    Args:
        results_path: Directory containing one sub-folder per model.
        custom_error_taxonomies: Optional mapping of error code to
            description.  When falsy, the built-in E1-E7 taxonomy is used.
        max_concurrent: Forwarded to ``structured_signal_extraction``.
        model_name: Forwarded to both per-model evaluation steps.

    Returns:
        None.
    """
    root = Path(results_path)

    # Fall back to the default taxonomy when the caller supplies none.
    taxonomies = custom_error_taxonomies or {
        "E1":"perceptual failure — model misidentified or failed to detect a spatial element in the image",
        "E2":"grounding failure — model identified elements but failed to anchor them in the correct spatial frame of reference",
        "E3":"spatial transformation failure — model understood the spatial arrangement but failed to correctly apply a rotation, reflection, or perspective change)",
        "E4":"quantitative spatial failure — failure on counting, distance estimation, or measurement",
        "E5":"multi-step reasoning failure — model solved one spatial step correctly but failed on a subsequent step requiring the first as input",
        "E6":"language-spatial mapping failure — model misunderstood the spatial language in the question, e.g., confused 'left of X' with 'right of X'",
        "E7":"format/refusal failure — model did not produce a valid answer regardless of spatial competence",
    }

    for entry in root.iterdir():
        if not entry.is_dir():
            continue

        responses_file = entry / "agent_response.csv"
        scores_file = entry / "agent_scores.csv"

        # Scores from an earlier run (if any) are passed along as prev_df.
        earlier_scores = (
            pd.read_csv(scores_file) if os.path.exists(scores_file) else None
        )

        structured_signal_extraction(
            responses_file,
            max_concurrent=max_concurrent,
            model_name=model_name,
            prev_df=earlier_scores,
        )
        error_taxonomy_induction(
            scores_file,
            error_taxonomies=taxonomies,
            model_name=model_name,
        )

        if not scores_file.exists():
            continue
        scored = pd.read_csv(scores_file)
        if "accuracy_score" in scored.columns:
            unscored = scored["accuracy_score"].isna()
            print("Numbers of Failed Tasks: ", int(unscored.sum()))

    # Cross-model comparison runs once, after every folder has been scored.
    cross_model_diagnostic_synthesis(root.rglob("agent_scores.csv"), root)
62
+
@@ -0,0 +1,321 @@
1
+ # ================= STANDARD LIB =================
2
+ import json
3
+ from itertools import combinations
4
+
5
+ # ================= DATA =================
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ # ================= ML / STATS =================
10
+ from scipy.stats import chi2
11
+ from sklearn.preprocessing import MultiLabelBinarizer
12
+
13
+
14
+ # ================= BOOTSTRAP CI =================
15
def bootstrap_ci(scores_lst, n_bootstrap=1000, alpha=0.05, random_state=None):
    """Estimate the mean of *scores_lst* with a percentile-bootstrap interval.

    Args:
        scores_lst: Sequence of numeric scores.  ``None`` and NaN entries are
            discarded before resampling.
        n_bootstrap: Number of resamples drawn with replacement, each the
            same size as the cleaned input.
        alpha: Two-sided level; the interval runs from the ``alpha / 2`` to
            the ``1 - alpha / 2`` percentile of the resampled means.
        random_state: ``None`` draws from NumPy's global random state (the
            historical behaviour, so ``np.random.seed`` is still honoured);
            an ``int`` seeds a private ``RandomState`` for reproducible
            output; an existing ``np.random.RandomState`` is used as-is.

    Returns:
        dict with float values under ``mean`` (mean of the bootstrap means),
        ``ci_lower``, ``ci_upper`` and ``std`` (standard deviation of the
        bootstrap means).  All four are NaN when no valid score remains.
    """
    # Coerce to float so that None becomes NaN and object-dtype input does
    # not make np.isnan raise TypeError.
    scores = np.asarray(scores_lst, dtype=float)
    scores = scores[~np.isnan(scores)]

    n = scores.size
    if n == 0:
        return {"mean": np.nan, "ci_lower": np.nan, "ci_upper": np.nan, "std": np.nan}

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    boot_means = np.array(
        [scores[rng.randint(0, n, n)].mean() for _ in range(n_bootstrap)]
    )

    return {
        "mean": float(np.mean(boot_means)),
        "ci_lower": float(np.percentile(boot_means, 100 * alpha / 2)),
        "ci_upper": float(np.percentile(boot_means, 100 * (1 - alpha / 2))),
        "std": float(np.std(boot_means)),
    }
38
+
39
+
40
+ # ================= COHEN'S D =================
41
def cohens_d(x, y):
    """Return Cohen's d for two samples using the pooled standard deviation.

    Args:
        x: First sample; ``None`` and NaN entries are ignored.
        y: Second sample; ``None`` and NaN entries are ignored.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a Python float.  NaN when
        either cleaned sample has fewer than two values, and ``0.0`` when the
        pooled standard deviation is zero (whatever the means are).
    """
    # Coerce to float so that None becomes NaN and object-dtype input does
    # not make np.isnan raise TypeError.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        return float("nan")

    var_x = np.var(x, ddof=1)
    var_y = np.var(y, ddof=1)
    pooled = np.sqrt(((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2))

    if pooled == 0:
        return 0.0
    return float((np.mean(x) - np.mean(y)) / pooled)
55
+
56
+
57
+ # ================= MCNEMAR TEST =================
58
def mcnemar_test(x, y):
    """Run a continuity-corrected McNemar test on paired 0/1 outcomes.

    Args:
        x: Outcomes of the first condition, aligned element-wise with *y*.
        y: Outcomes of the second condition.

    Pairs in which either value is ``None``/NaN are dropped.  With ``b`` the
    number of pairs where ``x == 1`` and ``y == 0`` and ``c`` the number where
    ``x == 0`` and ``y == 1``, the statistic is ``(|b - c| - 1)**2 / (b + c)``
    and is referred to a chi-square distribution with one degree of freedom.

    Returns:
        ``(statistic, p_value)`` as Python floats; ``(0.0, 1.0)`` when there
        are no discordant pairs.
    """
    # Coerce to float so that None becomes NaN and object-dtype input does
    # not make np.isnan raise TypeError.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    valid = ~np.isnan(x) & ~np.isnan(y)
    x, y = x[valid], y[valid]

    b = int(np.sum((x == 1) & (y == 0)))
    c = int(np.sum((x == 0) & (y == 1)))

    if b + c == 0:
        return 0.0, 1.0

    stat = (abs(b - c) - 1) ** 2 / (b + c)
    # The survival function keeps precision in the far tail, where
    # 1 - cdf would round to exactly 0.0.
    p = chi2.sf(stat, df=1)

    return float(stat), float(p)
74
+
75
+
76
+ # ================= FULL PAIRWISE =================
77
def full_pairwise_evaluation(model_dfs, score_col="accuracy_score"):
    """Compare every pair of models on *score_col* with McNemar and Cohen's d.

    Args:
        model_dfs: Mapping of model name to a DataFrame that has an ``id``
            column and a *score_col* column.
        score_col: Name of the column to compare.

    Returns:
        DataFrame with one row per ordered pair of models and the columns
        ``model_1``, ``model_2``, ``mcnemar_stat``, ``mcnemar_p``,
        ``cohens_d``, ``bonferroni_alpha`` and ``significant``.  Each
        unordered pair appears twice; the mirrored row swaps the model names
        and negates ``cohens_d``.  The frame is empty (no columns) when fewer
        than two models are supplied.
    """
    pairs = list(combinations(model_dfs, 2))

    # Bonferroni: split the 0.05 level across the number of unordered pairs.
    n_pairs = len(pairs)
    bonf_alpha = 0.05 / n_pairs if n_pairs > 0 else 0.05

    records = []
    for m1, m2 in pairs:
        # Merge only the key and the score column: the suffixes then always
        # apply to the score, and unrelated shared columns are not duplicated.
        left = model_dfs[m1][["id", score_col]]
        right = model_dfs[m2][["id", score_col]]
        # NOTE(review): an inner merge on "id" yields a cross product for ids
        # that repeat within a frame — confirm ids are unique per model.
        merged = left.merge(right, on="id", suffixes=("_1", "_2"))

        x = merged[f"{score_col}_1"].values
        y = merged[f"{score_col}_2"].values

        mcn_stat, mcn_p = mcnemar_test(x, y)
        effect = cohens_d(x, y)

        records.append({
            "model_1": m1,
            "model_2": m2,
            "mcnemar_stat": mcn_stat,
            "mcnemar_p": mcn_p,
            "cohens_d": effect,
            "bonferroni_alpha": bonf_alpha,
            "significant": bool(mcn_p < bonf_alpha),
        })

    df = pd.DataFrame(records)
    if df.empty:
        return df

    # Add the mirrored rows so each model can be looked up as model_1.
    mirrored = df.rename(columns={"model_1": "model_2", "model_2": "model_1"})
    mirrored = mirrored[df.columns].copy()
    mirrored["cohens_d"] = -mirrored["cohens_d"]
    return pd.concat([df, mirrored], ignore_index=True)
122
+
123
+
124
+ # ================= MAIN PIPELINE =================
125
+ def cross_model_diagnostic_synthesis(csv_paths, save_dir):
126
+ model_dfs = {}
127
+ model_evals = {}
128
+ error_types = set()
129
+
130
+ score_col = "accuracy_score"
131
+
132
+ # ================= LOAD + PREPROCESS =================
133
+ for csv_path in csv_paths:
134
+ df = pd.read_csv(csv_path)
135
+
136
+ if score_col not in df.columns:
137
+ continue
138
+
139
+ model_name = str(csv_path.parent).split("/")[-1]
140
+
141
+ df = df[df["error_type"].notna()].copy()
142
+ df["error_type"] = df["error_type"].apply(
143
+ lambda x: [x] if isinstance(x, str) else []
144
+ )
145
+
146
+ error_types.update(df["error_type"].explode().dropna().unique())
147
+
148
+ mlb = MultiLabelBinarizer()
149
+ dummies = pd.DataFrame(
150
+ mlb.fit_transform(df["error_type"]),
151
+ columns=mlb.classes_,
152
+ index=df.index
153
+ )
154
+
155
+ df = pd.concat([df, dummies], axis=1)
156
+
157
+ # clean scores
158
+ scores_clean = pd.to_numeric(df[score_col], errors="coerce").dropna()
159
+ refusal_score=pd.to_numeric(df["refusal_score"], errors="coerce").dropna()
160
+ off_topic_score=pd.to_numeric(df["off_topic_score"], errors="coerce").dropna()
161
+ spatial_elements_score=pd.to_numeric(df["spatial_elements_score"], errors="coerce").dropna()
162
+ spatial_relations_score=pd.to_numeric(df["spatial_relations_score"], errors="coerce").dropna()
163
+ spatial_reasoning_score=pd.to_numeric(df["spatial_reasoning_score"], errors="coerce").dropna()
164
+
165
+ model_evals[model_name] = {
166
+ "bootstrap_ci": bootstrap_ci(scores_clean, n_bootstrap=500),
167
+ "average_score": df[score_col].mean().round(5),
168
+ "error_types": list(mlb.classes_),
169
+ "overall_score": float(scores_clean.mean()) if len(scores_clean) > 0 else np.nan,
170
+ "refusal_score": float(refusal_score.mean()) if len(scores_clean) > 0 else np.nan,
171
+ "off_topic_score": float(off_topic_score.mean()) if len(scores_clean) > 0 else np.nan,
172
+ "spatial_elements_score": float(spatial_elements_score.mean()) if len(scores_clean) > 0 else np.nan,
173
+ "spatial_relations_score": float(spatial_relations_score.mean()) if len(scores_clean) > 0 else np.nan,
174
+ "spatial_reasoning_score": float(spatial_reasoning_score.mean()) if len(scores_clean) > 0 else np.nan,
175
+ }
176
+
177
+ model_dfs[model_name] = df
178
+
179
+ error_types = list(error_types)
180
+
181
+ # initialize
182
+ for model in model_evals:
183
+ model_evals[model]["Cohen_d"] = {}
184
+ model_evals[model]["Bonferroni"] = {}
185
+ model_evals[model]["Mcnemar"] = {}
186
+
187
+ # ================= PER ERROR TYPE =================
188
+ for error_type in error_types:
189
+ selected_models = [
190
+ m for m, v in model_evals.items()
191
+ if error_type in v["error_types"]
192
+ ]
193
+
194
+ selected_model_dfs = {
195
+ k: v for k, v in model_dfs.items() if k in selected_models
196
+ }
197
+
198
+ if len(selected_model_dfs) <= 1:
199
+ continue
200
+
201
+ eval_res_df = full_pairwise_evaluation(
202
+ selected_model_dfs, score_col=error_type
203
+ )
204
+
205
+ for model in selected_models:
206
+ model_evals[model].setdefault("Cohen_d", {}).setdefault(error_type, [])
207
+ model_evals[model].setdefault("Bonferroni", {}).setdefault(error_type, [])
208
+ model_evals[model].setdefault("Mcnemar", {}).setdefault(error_type, [])
209
+
210
+ for _, row in eval_res_df.iterrows():
211
+ if row["model_1"] == model:
212
+ model_evals[model]["Cohen_d"][error_type].append({
213
+ "to": row["model_2"],
214
+ "cohens_d": row["cohens_d"]
215
+ })
216
+
217
+ model_evals[model]["Bonferroni"][error_type].append({
218
+ "to": row["model_2"],
219
+ "bonferroni_alpha": row["bonferroni_alpha"]
220
+ })
221
+
222
+ model_evals[model]["Mcnemar"][error_type].append({
223
+ "to": row["model_2"],
224
+ "mcnemar_stat": row["mcnemar_stat"],
225
+ "mcnemar_p": row["mcnemar_p"]
226
+ })
227
+
228
+ # ================= EXPORT =================
229
+ rows = []
230
+
231
+ for model, data in model_evals.items():
232
+
233
+ rows.append({
234
+ "model": model,
235
+ "to": None,
236
+ "metric": "Score",
237
+ "error_type": "Overall",
238
+ "value": data.get("average_score", 0)
239
+ })
240
+ rows.append({
241
+ "model": model,
242
+ "to": None,
243
+ "metric": "Refusal Score",
244
+ "error_type": "Overall",
245
+ "value": data.get("refusal_score", 0)
246
+ })
247
+ rows.append({
248
+ "model": model,
249
+ "to": None,
250
+ "metric": "Off Topic Score",
251
+ "error_type": "Overall",
252
+ "value": data.get("off_topic_score", 0)
253
+ })
254
+ rows.append({
255
+ "model": model,
256
+ "to": None,
257
+ "metric": "Spatial Elements Score",
258
+ "error_type": "Overall",
259
+ "value": data.get("spatial_elements_score", 0)
260
+ })
261
+ rows.append({
262
+ "model": model,
263
+ "to": None,
264
+ "metric": "Spatial Relations Score",
265
+ "error_type": "Overall",
266
+ "value": data.get("spatial_relations_score", 0)
267
+ })
268
+ rows.append({
269
+ "model": model,
270
+ "to": None,
271
+ "metric": "Spatial Reasoning Score",
272
+ "error_type": "Overall",
273
+ "value": data.get("spatial_reasoning_score", 0)
274
+ })
275
+ # bootstrap
276
+ bc = data.get("bootstrap_ci", {})
277
+ if bc:
278
+ rows.append({
279
+ "model": model,
280
+ "to": None,
281
+ "metric": "bootstrap_mean",
282
+ "error_type": "Overall",
283
+ "value": bc.get("mean")
284
+ })
285
+
286
+ # Cohen d
287
+ for e, lst in data.get("Cohen_d", {}).items():
288
+ for v in lst:
289
+ rows.append({
290
+ "model": model,
291
+ "to": v["to"],
292
+ "metric": "cohens_d",
293
+ "error_type": e,
294
+ "value": v["cohens_d"]
295
+ })
296
+
297
+ # Bonferroni
298
+ for e, lst in data.get("Bonferroni", {}).items():
299
+ for v in lst:
300
+ rows.append({
301
+ "model": model,
302
+ "to": v["to"],
303
+ "metric": "bonferroni_alpha",
304
+ "error_type": e,
305
+ "value": v["bonferroni_alpha"]
306
+ })
307
+
308
+ # McNemar
309
+ for e, lst in data.get("Mcnemar", {}).items():
310
+ for v in lst:
311
+ rows.append({
312
+ "model": model,
313
+ "to": v["to"],
314
+ "metric": "mcnemar_stat",
315
+ "error_type": e,
316
+ "value": v["mcnemar_stat"]
317
+ })
318
+
319
+
320
+ pd.DataFrame(rows).to_csv(save_dir / "model_evaluations.csv", index=False)
321
+ json.dump(model_evals, open(save_dir / "model_evaluations.json", "w"), indent=4)