hal-spatial-harness 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hal_spatial_harness-0.1.0/LICENSE +27 -0
- hal_spatial_harness-0.1.0/PKG-INFO +16 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness.egg-info/PKG-INFO +16 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness.egg-info/SOURCES.txt +13 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness.egg-info/dependency_links.txt +1 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness.egg-info/requires.txt +6 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness.egg-info/top_level.txt +1 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness_package/__init__.py +17 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness_package/evaluator.py +245 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness_package/hal_harness.py +230 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness_package/initialize_benchmark.py +143 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness_package/prompt_variation.py +117 -0
- hal_spatial_harness-0.1.0/hal_spatial_harness_package/utils.py +61 -0
- hal_spatial_harness-0.1.0/pyproject.toml +23 -0
- hal_spatial_harness-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
---
|
|
3
|
+
|
|
4
|
+
# 📄 **LICENSE (MIT License)**
|
|
5
|
+
|
|
6
|
+
```text id="license001"
|
|
7
|
+
MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 CIOL
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hal-spatial-harness
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation.
|
|
5
|
+
Author-email: Wahid Faisal <wahiddhrubo@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pandas
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Requires-Dist: alive-progress
|
|
13
|
+
Requires-Dist: gitpython
|
|
14
|
+
Requires-Dist: huggingface-hub
|
|
15
|
+
Requires-Dist: openai
|
|
16
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hal-spatial-harness
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation.
|
|
5
|
+
Author-email: Wahid Faisal <wahiddhrubo@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pandas
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Requires-Dist: alive-progress
|
|
13
|
+
Requires-Dist: gitpython
|
|
14
|
+
Requires-Dist: huggingface-hub
|
|
15
|
+
Requires-Dist: openai
|
|
16
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
pyproject.toml
|
|
3
|
+
hal_spatial_harness.egg-info/PKG-INFO
|
|
4
|
+
hal_spatial_harness.egg-info/SOURCES.txt
|
|
5
|
+
hal_spatial_harness.egg-info/dependency_links.txt
|
|
6
|
+
hal_spatial_harness.egg-info/requires.txt
|
|
7
|
+
hal_spatial_harness.egg-info/top_level.txt
|
|
8
|
+
hal_spatial_harness_package/__init__.py
|
|
9
|
+
hal_spatial_harness_package/evaluator.py
|
|
10
|
+
hal_spatial_harness_package/hal_harness.py
|
|
11
|
+
hal_spatial_harness_package/initialize_benchmark.py
|
|
12
|
+
hal_spatial_harness_package/prompt_variation.py
|
|
13
|
+
hal_spatial_harness_package/utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hal_spatial_harness_package
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .evaluator import evaluation_result
from .hal_harness import HarnessRunner
from .initialize_benchmark import initialize_benchmark, BenchmarkStats
from .prompt_variation import PromptVariationGenerator


# Package version; keep in sync with pyproject.toml ([project].version).
__version__ = "0.1.0"

# Public API re-exported at package level.
__all__ = [
    "evaluation_result",
    "HarnessRunner",
    "initialize_benchmark",
    "PromptVariationGenerator",
    "BenchmarkStats"
]
|
|
17
|
+
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
from utils import get_client, encode_image
|
|
2
|
+
import os
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
# 3rd-party packages
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from alive_progress import alive_bar
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def compute_metrics(df: pd.DataFrame):
    """Return (precision, recall, f1) as percentages rounded to 2 decimals.

    A row with score == 1 counts as a true positive and score == 0 as a
    false positive.  False negatives are taken equal to false positives —
    the binary-correctness convention used throughout this module (every
    row has exactly one prediction).
    """
    true_pos = int((df["score"] == 1).sum())
    false_pos = int((df["score"] == 0).sum())
    false_neg = false_pos  # FN mirrors FP by convention

    p_denom = true_pos + false_pos
    r_denom = true_pos + false_neg
    precision = true_pos / p_denom if p_denom > 0 else 0
    recall = true_pos / r_denom if r_denom > 0 else 0

    pr_sum = precision + recall
    f1 = (2 * precision * recall) / pr_sum if pr_sum > 0 else 0

    return round(precision * 100, 2), round(recall * 100, 2), round(f1 * 100, 2)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def llm_judge_binary(task, model_name="gpt-4o-mini-2024-07-18"):
    """Ask an LLM judge whether ``task['response']`` answers ``task['question']``.

    The judge sees the question, the model's answer, the choices, the gold
    answer, and every question image, and is instructed to reply with a bare
    digit.  Returns 1 (correct), 0 (incorrect), or 0.5 when the output
    contains neither digit.
    """
    client = get_client()

    prompt = f"""
You are a strict evaluator.

Return ONLY:
1 → if the model answer is correct
0 → if the model answer is incorrect

No explanation.

Question:
{task['question']}

Model Answer:
{task['response']}

Choices:
{task['choices']}

Correct Answer:
{task['answer']}


"""
    images_paths = [p for p in task["ques_image_path_lst"]]

    # Text prompt first, then each question image attached as a base64 data URL.
    content = [{"type": "text", "text": prompt}]

    content += [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"}
        }
        for path in images_paths
    ]

    # temperature=0 and a tiny token budget keep the judge deterministic and terse.
    response = client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=[{"role": "user", "content": content}],
        max_tokens=5,
    )

    output = response.choices[0].message.content.strip()

    # safe parsing
    # NOTE(review): "1" is checked before "0", so an output such as "10" or
    # "0.1" is scored as correct — confirm this precedence is intended.
    if "1" in output:
        return 1
    elif "0" in output:
        return 0
    else:
        return 0.5  # fallback if model misbehaves
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def evaluator(
    dataset: Dict,
    evaluate_mcq_by_judge_llm: bool,
    mcq_match_full_answer: bool,
    var: bool = False,
    eval_path: str = "",
    max_workers: int = 4
) -> Dict:
    """
    Parallel evaluation of dataset with optional LLM scoring.

    Args:
        dataset: id -> task dict.  Each task must carry "response"; MCQ tasks
            scored by matching also need "answer" / "correct_option", and
            LLM-judged tasks need the fields consumed by llm_judge_binary.
        evaluate_mcq_by_judge_llm: route MCQ tasks through the LLM judge
            instead of exact matching.
        mcq_match_full_answer: for exact matching, test the full answer text
            as a substring of the response rather than the option letter.
        var: dataset keys are "<id>-NN" prompt variations; per-variation
            scores are collected under the base id and then averaged.
        eval_path: optional CSV path used both to resume (ids already present
            are skipped and preloaded) and to persist the final result.
        max_workers: thread-pool size for parallel scoring.

    Returns:
        id -> task dict augmented with "score" (and, when var=True, parallel
        "prompt"/"response"/"scores" lists per variation).
    """
    evaluation = {}

    # Exclude already-evaluated items
    if eval_path and os.path.exists(eval_path):

        exclude_ids = set(pd.read_csv(eval_path)["id"].to_list())
        dataset = {k: v for k, v in dataset.items() if k not in exclude_ids}
        evaluation=pd.read_csv(eval_path).set_index('id').to_dict(orient='index')

    def evaluate_item(d_id, data):
        """Evaluate a single dataset item and return result tuple"""
        # MCQ scoring
        if data.get("question_type") == "MCQ" and not evaluate_mcq_by_judge_llm:
            if mcq_match_full_answer:
                score = 1 if data["answer"] in data["response"] else 0
            else:
                score = 1 if data["correct_option"] == data["response"] else 0
        else:
            score = llm_judge_binary(task=data)

        # NOTE(review): this concatenates the first two dash-separated parts
        # ("abc-00" -> "abc00"); if base ids themselves contain dashes the
        # variation suffix is not stripped as intended — confirm id format.
        key_id = "".join(d_id.split("-")[:2]) if var else d_id
        return key_id, data, score


    # Run in parallel; results merge on the main thread, so the shared
    # `evaluation` dict is only mutated from one thread.
    with alive_bar(len(dataset), bar='blocks', spinner='dots', title='Running Evaluation', force_tty=True) as bar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_id = {executor.submit(evaluate_item, d_id, data): d_id for d_id, data in dataset.items()}

            for future in as_completed(future_to_id):
                key_id, data, score = future.result()

                # Merge results safely
                if var and key_id in evaluation:
                    evaluation[key_id]["prompt"].append(data["task_description"])
                    evaluation[key_id]["response"].append(data["response"])
                    evaluation[key_id]["scores"].append(score)
                else:
                    evaluation[key_id] = {**data}
                    if var:
                        evaluation[key_id]["prompt"] = [data["task_description"]]
                        evaluation[key_id]["response"] = [data["response"]]
                        evaluation[key_id]["scores"] = [score]
                    else:
                        evaluation[key_id]["score"] = score

                bar()  # update progress

    # Compute averaged scores for var=True
    if var:
        for i in evaluation:
            evaluation[i]["score"] = sum(evaluation[i]["scores"]) / len(evaluation[i]["scores"])

    # Save final CSV once
    if eval_path:
        pd.DataFrame.from_dict(evaluation, orient="index").rename_axis("id").reset_index().to_csv(eval_path, index=False)

    return evaluation
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def get_evaluation_result(df):
    """Compute (precision, recall, f1) percentages over a results dataframe.

    Functionally identical to :func:`compute_metrics`; kept as a separate
    entry point for backward compatibility.
    """
    tp = int((df["score"] == 1).sum())
    fp = int((df["score"] == 0).sum())

    # If every row has a prediction, FN ~ FP for binary correctness
    fn = fp

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    combined = precision + recall
    f1 = 2 * precision * recall / combined if combined > 0 else 0

    return round(precision * 100, 2), round(recall * 100, 2), round(f1 * 100, 2)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def evaluation_result(eval_path):
    """Print a hierarchical score report (category / sub-category / task).

    Reads the evaluation CSV at ``eval_path``, aggregates mean score, row
    count, and precision/recall/F1 at each level, sorts worst-first, and
    prints the report followed by overall metrics.

    Fixes two defects of the previous version:
    - the three per-level frames all carried columns named
      ``precision``/``recall``/``f1``, so after the merges pandas suffixed
      the task- and sub-category columns to ``_x``/``_y`` and every printed
      P/R/F1 silently came from the *category*-level frame; metric columns
      are now prefixed per level so each line prints its own metrics.
    - ``compute_metrics`` was called three times per group; it is now
      computed once.
    """
    df = pd.read_csv(eval_path)

    def _level_stats(group, prefix):
        """One aggregation row for a group, with level-prefixed column names."""
        precision, recall, f1 = compute_metrics(group)
        return pd.Series({
            f"{prefix}_score": group["score"].mean() * 100,
            f"{prefix}_count": len(group),
            f"{prefix}_precision": precision,
            f"{prefix}_recall": recall,
            f"{prefix}_f1": f1,
        })

    # ---------------- Aggregations ----------------
    cat_df = (
        df.groupby("category")
        .apply(lambda g: _level_stats(g, "cat"))
        .reset_index()
    )

    sub_df = (
        df.groupby(["category", "sub_category"])
        .apply(lambda g: _level_stats(g, "subcat"))
        .reset_index()
    )

    task_df = (
        df.groupby(["category", "sub_category", "task"])
        .apply(lambda g: _level_stats(g, "task"))
        .reset_index()
    )

    # Merge: one row per task, annotated with its sub-category and category
    # aggregates (column names are disjoint by construction).
    merged = (
        task_df
        .merge(sub_df, on=["category", "sub_category"])
        .merge(cat_df, on="category")
    )

    # Sort worst → best
    merged = merged.sort_values(
        by=["cat_score", "subcat_score", "task_score"],
        ascending=[True, True, True]
    )

    # ---------------- Pretty Print ----------------
    for cat, cat_group in merged.groupby("category", sort=False):
        row = cat_group.iloc[0]
        print(f"\n📊 Category: {cat} | Score: {row['cat_score']:.2f}% | Count: {row['cat_count']}")
        print(f"Precision: {row['cat_precision']}% | Recall: {row['cat_recall']}% | F1: {row['cat_f1']}%")
        print("-" * 90)

        for sub, sub_group in cat_group.groupby("sub_category", sort=False):
            row = sub_group.iloc[0]
            print(f"🔹 Sub-category: {sub} | Score: {row['subcat_score']:.2f}% | Count: {row['subcat_count']}")
            print(f"Precision: {row['subcat_precision']}% | Recall: {row['subcat_recall']}% | F1: {row['subcat_f1']}%")

            for _, r in sub_group.iterrows():
                print(
                    f"▪ Task: {r['task']:<25} "
                    f"| Score: {r['task_score']:>6.2f}% "
                    f"| P: {r['task_precision']:>5}% "
                    f"| R: {r['task_recall']:>5}% "
                    f"| F1: {r['task_f1']:>5}% "
                    f"| Count: {r['task_count']}"
                )

    # ---------------- Overall ----------------
    p, r, f1 = compute_metrics(df)
    print("\nOverall Score:", round(df["score"].mean()*100, 2), "%")
    print(f"Overall Precision: {p}% | Recall: {r}% | F1: {f1}%")
|
|
245
|
+
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# =========================
|
|
2
|
+
# Standard library
|
|
3
|
+
# =========================
|
|
4
|
+
import os
|
|
5
|
+
import logging
|
|
6
|
+
import ast
|
|
7
|
+
import shutil
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
|
+
|
|
11
|
+
# =========================
|
|
12
|
+
# 3rd-party packages
|
|
13
|
+
# =========================
|
|
14
|
+
import pandas as pd
|
|
15
|
+
from alive_progress import alive_bar
|
|
16
|
+
|
|
17
|
+
# =========================
|
|
18
|
+
# Local modules
|
|
19
|
+
# =========================
|
|
20
|
+
from initialize_benchmark import initialize_benchmark, process_benchmark, prompt_generator
|
|
21
|
+
from prompt_variation import PromptVariationGenerator, generate_var_bench
|
|
22
|
+
from evaluator import evaluation_result, evaluator
|
|
23
|
+
from utils import encode_image, get_client, verify_run_name
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def agent(inputs: dict[str, dict], model_name: str, **kwargs):
    """Default agent template: answer one benchmark task with a multimodal chat model.

    Args:
        inputs: single-entry mapping of task_id -> task dict; the task must
            carry "task_description" and "ques_image_path_lst".
        model_name: chat model identifier passed to the OpenAI-compatible client.

    Returns:
        The same ``inputs`` mapping with ``inputs[task_id]["response"]`` set to
        the model's stripped answer, or "" when the API call fails.
    """
    client = get_client()

    task_id, task = list(inputs.items())[0]

    images_paths = [p for p in task["ques_image_path_lst"]]

    # build content list: task prompt first, then every image as a base64 data URL
    content = [{"type": "text", "text": task["task_description"]}]
    content += [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"}
        }
        for path in images_paths
    ]

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": content
                }
            ],
            temperature=0.1,
            max_tokens=1000,
        )
        inputs[task_id]["response"] = response.choices[0].message.content.strip()
    except Exception as e:
        # Record the failure instead of crashing the worker: previously a
        # failed API call left ``response`` as None and the subsequent
        # ``response.choices`` access raised AttributeError, killing the run.
        print(f"Error Answering task {task_id}: {e}")
        inputs[task_id]["response"] = ""

    return inputs
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def run_agent_function(inputs: dict[str, dict], agent_function , use_template: bool =False, model_name: str = "" , **kwargs):
    """Invoke ``agent_function`` on ``inputs``.

    Template agents receive the ``model_name`` keyword; custom agents are
    called with the inputs mapping only.  Returns whatever the agent returns.
    """
    if not use_template:
        return agent_function(inputs)
    return agent_function(inputs, model_name=model_name)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def HarnessRunner(
    use_template=True,
    custom_function=None,
    template_name="default_agent",
    dataset_dir=None,
    clean_dataset_dir=True,
    template_model="gpt-4o-mini-2024-07-18",
    run_id = "test",
    max_concurrent = 5,
    continue_run = False,
    max_tasks = 0,
    prompt_sensitivity = False,
    num_variations = 3,
    variation_strength = "mild",
    results_dir = "results",
    task_ids = [],
    categories = [],
    sub_categories = [],
    openai_api_key = None,
    open_router_api_key = None,
    evaluation_needed=True,
    evaluate_mcq_by_judge_llm=False,
    mcq_match_full_answer=False,
):
    """End-to-end harness: download/filter the benchmark, run the agent in
    parallel, optionally evaluate, and clean up.

    Flow: resolve the agent function (template or custom) -> prepare
    results/<run_id>/ paths -> download the dataset if missing -> load and
    filter annotations -> optionally expand prompts into variations ->
    run the agent over the dataset with a thread pool (checkpointing the
    response CSV after every task) -> optionally evaluate and print a
    report -> optionally delete the dataset directory.

    NOTE(review): task_ids / categories / sub_categories use mutable list
    defaults; safe only as long as callers never mutate them in place.
    NOTE(review): the response CSV is rewritten in full after every completed
    task — this is the resume checkpoint, at O(n) cost per task.
    """

    # Registry of built-in agent templates selectable via ``template_name``.
    function_template = {
        "default_agent": agent,
    }

    # Propagate explicitly passed keys into the environment so get_client()
    # (called later, possibly from worker threads) can find them.
    if openai_api_key:
        os.environ["OPENAI_API_KEY"]=openai_api_key

    if open_router_api_key:
        os.environ["OPEN_ROUTER_API_KEY"]=open_router_api_key


    if (not use_template) and (not custom_function):
        raise ValueError(
            "Select use_template or provide custom_function "
        )

    if use_template:
        agent_function= function_template[template_name]
    else:
        agent_function= custom_function

    # Dataset location defaults to a throwaway directory unless the caller
    # points at an existing download via dataset_dir.
    temp_dir="./temp_dir"
    verify_run_name(run_id)
    Path(f"{results_dir}/{run_id}").mkdir(parents=True, exist_ok=True)
    response_path=f"{results_dir}/{run_id}/agent_response.csv"
    eval_path=f"{results_dir}/{run_id}/evaluations.csv"

    # Refuse to clobber a finished/ongoing run unless resuming.
    if os.path.exists(response_path) and (not continue_run):
        raise ValueError("Run Already Exist. Pass continue_run if you want to Continue Run.")

    if dataset_dir:
        temp_dir=dataset_dir
    if not os.path.exists(temp_dir) :
        initialize_benchmark(temp_dir)

    # Annotations: one row per task; image paths are stored as stringified
    # Python lists and rewritten to absolute paths under the dataset dir.
    df=pd.read_csv(f"{temp_dir}/annotations.csv").drop_duplicates(subset=["id"])
    df["ques_image_path_lst"] = df["ques_image_path_lst"].apply(ast.literal_eval)
    df["ques_image_path_lst"] = df["ques_image_path_lst"].apply(
        lambda paths: [os.path.join(temp_dir, "question_images", p) for p in paths]
    )

    dataset = process_benchmark(
        df,
        categories,
        sub_categories,
        task_ids,
        max_tasks,
        continue_run=continue_run,
        eval_path=response_path,
    )

    eval_results = {}

    # When resuming, preload prior responses so the checkpoint CSV stays complete.
    if continue_run:
        eval_results=pd.read_csv(response_path).set_index('id').to_dict(orient='index')

    var_dataset=None

    # Prompt-sensitivity mode: expand each task into N paraphrased variants.
    if prompt_sensitivity:
        generator = PromptVariationGenerator(num_variations=num_variations, strength=variation_strength)
        prompt_map = generator.generate_variations_for_dataset(dataset)
        var_dataset=generate_var_bench(dataset, prompt_map)

    active_dataset = var_dataset if var_dataset else dataset

    def process_item(key, item):
        """Generate prompt and run agent for a single item"""
        value = prompt_generator(item)
        res = run_agent_function(
            inputs={key: value},
            agent_function=agent_function,
            use_template=use_template,
            model_name=template_model,
        )
        return key, res[key]


    # Run in parallel with progress bar; results and the CSV checkpoint are
    # written only on the main thread.
    with alive_bar(len(active_dataset), bar='blocks', spinner='dots', title='Running Agent', force_tty=True) as bar:
        with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
            futures = {executor.submit(process_item, k, v): k for k, v in active_dataset.items()}

            for future in as_completed(futures):
                key, result = future.result()
                eval_results[key] = result
                pd.DataFrame.from_dict(eval_results, orient="index").rename_axis("id").reset_index().to_csv(response_path, index=False)
                bar()  # update progress

    if evaluation_needed:
        # Run evaluator on results
        evaluation = evaluator(
            dataset=eval_results,
            evaluate_mcq_by_judge_llm=evaluate_mcq_by_judge_llm,
            mcq_match_full_answer=mcq_match_full_answer,
            var= True if var_dataset else False,
            eval_path=eval_path,
        )
        evaluation_result(eval_path)

    # Delete the dataset directory (including a caller-supplied dataset_dir!)
    # unless asked to keep it.
    if clean_dataset_dir:
        shutil.rmtree(temp_dir)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
# 3rd-party packages
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from alive_progress import alive_bar
|
|
7
|
+
from huggingface_hub import snapshot_download
|
|
8
|
+
from zipfile import ZipFile
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BenchmarkStats:
    """Quick value-count summaries over the benchmark annotations CSV."""

    def __init__(self, annotations_path="./annotations.csv"):
        # The CSV path was previously hard-coded to the working directory;
        # it is now a parameter (default preserves the old behavior) so
        # stats can be computed for any downloaded dataset directory.
        self.df = pd.read_csv(annotations_path)

    def _print_stats(self, column, title):
        """Print a markdown table of value counts for ``column`` (ascending)."""
        pd.set_option("display.max_rows", None)

        df_counts = (
            self.df[column]
            .value_counts()
            .sort_values()
            .rename_axis(title)
            .reset_index(name="Count")
        )

        print(df_counts.to_markdown(index=False))  # clean table

    def get_dataset_stats(self):
        """Counts per source dataset."""
        self._print_stats("dataset", "Dataset")

    def get_category_stats(self):
        """Counts per category."""
        self._print_stats("category", "Category")

    def get_sub_category_stats(self):
        """Counts per sub-category."""
        self._print_stats("sub_category", "Sub Category")

    def get_task_stats(self):
        """Counts per task."""
        self._print_stats("task", "Task")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def initialize_benchmark(local_path):
    """Download the hal-spatial dataset into ``local_path`` and unpack its images.

    Requires the HF_TOKEN environment variable (raises KeyError when unset).
    Downloads the Hugging Face dataset snapshot, extracts question_images.zip
    into a question_images/ folder, then deletes the archive.
    """
    token = os.environ["HF_TOKEN"]  # hard requirement; raises KeyError if missing
    repo_id="Wahiddhrubo/hal-spatial"


    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",  # "dataset" or "model"
        local_dir=local_path,  # your path
        token=token,
        allow_patterns=None,  # optional: download only some files
        ignore_patterns=None,  # optional: ignore some files
    )


    zip_path = Path(local_path) / "question_images.zip"
    extract_to = Path(local_path) / "question_images"

    # Extract member-by-member so the progress bar tracks individual files.
    with ZipFile(zip_path, 'r') as z:
        members = z.infolist()

        with alive_bar(len(members), title="Extracting files") as bar:
            for m in members:
                z.extract(m, extract_to)
                bar()

    # Remove the archive once extracted to save disk space.
    zip_path.unlink()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def prompt_generator(row):
    """Build the agent prompt for one benchmark row.

    Args:
        row: task dict with at least "category", "question_type", "question",
            and "choices".

    Returns:
        A copy of ``row`` with "task_description" set to the assembled prompt.
        MCQ tasks are instructed to answer with the option letter only.
    """
    prompt = f"""You are a vision-language reasoning agent.
Understand the intent of the question based on the category {row['category']}. Focus on key visual or logical cues such as shading, color consistency, spatial relations, or context depending on the category.

Carefully read and compare all choices, paying attention to small differences and ignoring formatting issues. Use relative reasoning when needed (e.g., darker vs lighter, same vs different).

Choose the {'Answer' if row['question_type'] != 'MCQ' else 'OPTION LETTER ONLY'} that best matches the question’s intent based on sound reasoning, not assumptions.

JUST GIVE THE ANSWER NO NEED OF REASONING.

Question:
{row['question']}
"""

    raw_choices = row["choices"]
    # Guard against pandas NaN: float('nan') is truthy, so the previous bare
    # truthiness test leaked "Choices: nan" into prompts for rows without
    # choices.  NaN is the only value unequal to itself.
    has_choices = raw_choices is not None and raw_choices == raw_choices and raw_choices != ""
    choices= f"""Choices: {raw_choices} """ if has_choices else ""


    return {**row, "task_description": prompt + choices}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def process_benchmark(
    df,
    categories,
    sub_categories,
    task_ids,
    max_tasks,
    continue_run=False,
    eval_path="",
):
    """Filter the annotations dataframe and convert it to an id-keyed dict.

    Args:
        df: annotations dataframe with "id", "category", "sub_category",
            "task" columns.
        categories / sub_categories / task_ids: optional whitelists; empty
            lists mean no filtering on that axis.
        max_tasks: when > 0, keep only the first ``max_tasks`` tasks.
        continue_run: skip ids already present in the CSV at ``eval_path``.
        eval_path: response CSV from a previous run (used only when resuming).

    Returns:
        id -> row dict for every surviving task.
    """
    # The annotations columns are singular ("category" / "sub_category") —
    # every other module (evaluator groupbys, prompt_generator) uses the
    # singular names; the previous plural lookups raised KeyError whenever
    # a filter list was supplied.
    if len(categories):
        df = df[df["category"].isin(categories)]

    if len(task_ids):
        df = df[df["task"].isin(task_ids)]

    if len(sub_categories):
        df = df[df["sub_category"].isin(sub_categories)]

    # When resuming, drop ids already answered in the previous run's CSV.
    ids = []
    if continue_run:
        if os.path.exists(eval_path):
            ids = pd.read_csv(eval_path)["id"].to_list()

    df = df[~df["id"].isin(ids)]
    dataset = df.set_index('id').to_dict(orient='index')

    if max_tasks and max_tasks > 0 and max_tasks < len(dataset):
        print(f"Limiting to the first {max_tasks} tasks as requested")
        keep_task_ids = list(dataset.keys())[: max_tasks]
        dataset = {task_id: dataset[task_id] for task_id in keep_task_ids}

    return dataset
|
|
143
|
+
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
from utils import get_client
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ---------------- Config ----------------

# Instructions per strength
# Paraphrase instructions keyed by variation strength.  Every level must
# preserve the prompt's full meaning and constraints; only the surface form
# is varied, progressively more aggressively.
STRENGTH_INSTRUCTIONS = {
    "mild": (
        "Generate small, surface-level variations of the prompt. "
        "Use synonyms, minor changes in formality, active/passive voice changes. "
        "Keep all information, meaning, and structure intact. "
        "Output should be fluent and similar in style to the original."
    ),
    "medium": (
        "Generate variations that restructure sentences and reorder information. "
        "Combine or split sentences, slightly change phrasing, but preserve all meaning. "
        "Output should still be clear and professional, resembling the original style."
    ),
    "strong": (
        "Generate majorly different variations that sound like different people wrote them. "
        "Use conversational phrasing, varied sentence structures, and style differences. "
        "Preserve all information and constraints exactly. Output should maintain readability."
    ),
    "naturalistic": (
        "Generate realistic user-style variations as if typed by a real person. "
        "Include casual phrasing, abbreviations, typos, and informal chat patterns. "
        "Keep all meaning and requirements intact. Output should still resemble the original prompt in content."
    ),
}

# Sampling temperature per strength: more aggressive rewrites sample hotter.
STRENGTH_TEMPERATURE = {
    "mild": 0.7,
    "medium": 0.8,
    "strong": 0.9,
    "naturalistic": 0.9,
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class PromptVariationGenerator:
    """Generate paraphrased variations of task prompts via an LLM.

    ``strength`` ("mild" / "medium" / "strong" / "naturalistic") selects both
    the paraphrase instruction and the sampling temperature; unknown values
    fall back to "mild" behavior.
    """

    def __init__(self, model_name="gpt-4o-mini-2024-07-18", num_variations=2, strength="medium"):
        self.model_name = model_name
        self.num_variations = num_variations
        self.strength = strength.lower()
        self.client = get_client()

    def generate_variations_for_task(self, task_id: str, prompt: str) -> List[Dict]:
        """Return the original prompt plus up to ``num_variations`` paraphrases.

        Each entry carries "question", "prompt_variation_id", and
        "prompt_variation_strength"; index 0 is always the unmodified prompt.
        On API failure only the original prompt is returned.
        """
        instruction = STRENGTH_INSTRUCTIONS.get(self.strength, STRENGTH_INSTRUCTIONS["mild"])
        temperature = STRENGTH_TEMPERATURE.get(self.strength, 0.7)

        system_prompt = f"You are an expert at paraphrasing prompts while preserving meaning.\nInstruction: {instruction}"
        user_prompt = f"Original prompt:\n{prompt}\nGenerate {self.num_variations} variations, one per line. Do NOT number them."

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=temperature,
                max_tokens=1000,
            )

            # One variation per output line; drop blank/too-short lines that
            # are unlikely to be real paraphrases.
            lines = response.choices[0].message.content.strip().split("\n")
            variations = [line.strip() for line in lines if len(line.strip()) > 10]

        except Exception as e:
            print(f"Error generating variations for task {task_id}: {e}")
            variations = []

        all_variations = [prompt] + variations[:self.num_variations]

        return [
            {"question": v, "prompt_variation_id": i, "prompt_variation_strength": self.strength}
            for i, v in enumerate(all_variations)
        ]

    def generate_variations_for_dataset(self, dataset: Dict[str, Dict], prompt_field: str = "question") -> Dict[str, List[Dict]]:
        """Map task_id -> variation list for every task in ``dataset``."""
        result = {}
        for task_id, task_data in dataset.items():
            if prompt_field not in task_data:
                # NOTE(review): this branch runs only when the field is absent,
                # so .get(...) below always yields "" — confirm the intended
                # fallback is an empty question.
                result[task_id] = [{
                    "question": task_data.get(prompt_field, ""),
                    "prompt_variation_id": 0,
                    "prompt_variation_strength": self.strength
                }]
                continue
            prompt = task_data[prompt_field]
            result[task_id] = self.generate_variations_for_task(task_id, prompt)
        return result
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def generate_var_bench( dataset, prompt_map):
    """Expand each task into one entry per prompt variation.

    Keys become ``"<task_id>-NN"`` (zero-padded variation index); each entry
    is a copy of the base task with "question" replaced by the variation text.
    """
    expanded = {}
    for task_id, base_task in dataset.items():
        variations = prompt_map[task_id]
        for var_idx, variation in enumerate(variations):
            var_key = f"{task_id}-{str(var_idx).zfill(2)}"
            expanded[var_key] = {**base_task, "question": variation["question"]}

    return expanded
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from openai import OpenAI
|
|
3
|
+
import base64
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
class InvalidPathNameError(ValueError):
    """Raised by verify_run_name when a run name is not a valid file/folder name."""
    pass
|
|
11
|
+
|
|
12
|
+
def verify_run_name(name: str) -> str:
    """
    Verify that the given name is a valid filename/folder name.

    Rules:
    - Cannot contain forbidden characters: \\ / : * ? " < > |
    - Cannot be empty or only whitespace
    - Cannot be a reserved Windows name (CON, PRN, AUX, NUL, COM1..9, LPT1..9)

    Returns the stripped valid name if valid.
    Raises InvalidPathNameError if invalid.
    """
    if not name or not name.strip():
        raise InvalidPathNameError("Path name cannot be empty or whitespace.")

    cleaned = name.strip()

    # Forbidden characters
    if re.search(r'[<>:"/\\|?*]', cleaned):
        raise InvalidPathNameError(
            f"Path name '{cleaned}' contains forbidden characters: \\ / : * ? \" < > |"
        )

    # Reserved Windows names (case-insensitive)
    reserved_names = {"CON", "PRN", "AUX", "NUL"}
    reserved_names |= {f"COM{n}" for n in range(1, 10)}
    reserved_names |= {f"LPT{n}" for n in range(1, 10)}
    if cleaned.upper() in reserved_names:
        raise InvalidPathNameError(f"Path name '{cleaned}' is a reserved system name.")

    return cleaned
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def encode_image(path):
    """Return the file at ``path`` encoded as a UTF-8 base64 string."""
    with open(path, "rb") as image_file:
        raw = image_file.read()
    return base64.b64encode(raw).decode("utf-8")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_client():
    """Build an OpenAI-compatible client from environment variables.

    OPEN_ROUTER_API_KEY takes precedence (requests are routed through
    openrouter.ai); otherwise OPENAI_API_KEY is used directly.

    Raises:
        ValueError: when neither environment variable is set.
    """
    router_key = os.environ.get("OPEN_ROUTER_API_KEY")
    openai_key = os.environ.get("OPENAI_API_KEY")

    if router_key:
        return OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=router_key
        )
    if openai_key:
        return OpenAI(api_key=openai_key)
    raise ValueError("No API key found. Set OPENAI_API_KEY or OPEN_ROUTER_API_KEY.")
|
|
61
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hal-spatial-harness" # ⚠️ must be unique on PyPI
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name="Wahid Faisal", email="wahiddhrubo@gmail.com" }
|
|
13
|
+
]
|
|
14
|
+
license = { text = "MIT" }
|
|
15
|
+
|
|
16
|
+
dependencies = [
|
|
17
|
+
"pandas",
|
|
18
|
+
"numpy",
|
|
19
|
+
"alive-progress",
|
|
20
|
+
"gitpython",
|
|
21
|
+
"huggingface-hub",
|
|
22
|
+
"openai"
|
|
23
|
+
]
|