hal-spatial-harness 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+
2
+ ---
3
+
4
+ # 📄 **LICENSE (MIT License)**
5
+
6
+ ```text id="license001"
7
+ MIT License
8
+
9
+ Copyright (c) 2026 CIOL
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: hal-spatial-harness
3
+ Version: 0.1.0
4
+ Summary: A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation.
5
+ Author-email: Wahid Faisal <wahiddhrubo@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: alive-progress
13
+ Requires-Dist: gitpython
14
+ Requires-Dist: huggingface-hub
15
+ Requires-Dist: openai
16
+ Dynamic: license-file
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: hal-spatial-harness
3
+ Version: 0.1.0
4
+ Summary: A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation.
5
+ Author-email: Wahid Faisal <wahiddhrubo@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: alive-progress
13
+ Requires-Dist: gitpython
14
+ Requires-Dist: huggingface-hub
15
+ Requires-Dist: openai
16
+ Dynamic: license-file
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ pyproject.toml
3
+ hal_spatial_harness.egg-info/PKG-INFO
4
+ hal_spatial_harness.egg-info/SOURCES.txt
5
+ hal_spatial_harness.egg-info/dependency_links.txt
6
+ hal_spatial_harness.egg-info/requires.txt
7
+ hal_spatial_harness.egg-info/top_level.txt
8
+ hal_spatial_harness_package/__init__.py
9
+ hal_spatial_harness_package/evaluator.py
10
+ hal_spatial_harness_package/hal_harness.py
11
+ hal_spatial_harness_package/initialize_benchmark.py
12
+ hal_spatial_harness_package/prompt_variation.py
13
+ hal_spatial_harness_package/utils.py
@@ -0,0 +1,6 @@
1
+ pandas
2
+ numpy
3
+ alive-progress
4
+ gitpython
5
+ huggingface-hub
6
+ openai
@@ -0,0 +1 @@
1
+ hal_spatial_harness_package
@@ -0,0 +1,17 @@
1
# Public package surface for hal_spatial_harness_package.
#
# NOTE(review): sibling modules import each other with absolute names
# (e.g. evaluator.py does `from utils import get_client`); once installed as
# a package those lookups may fail even though the relative imports below
# succeed — confirm packaging/import style before release.
from .evaluator import evaluation_result
from .hal_harness import HarnessRunner
from .initialize_benchmark import initialize_benchmark, BenchmarkStats
from .prompt_variation import PromptVariationGenerator



# Package version; keep in sync with pyproject.toml.
__version__ = "0.1.0"

# Names exported by `from hal_spatial_harness_package import *`.
__all__ = [
    "evaluation_result",
    "HarnessRunner",
    "initialize_benchmark",
    "PromptVariationGenerator",
    "BenchmarkStats"
]
17
+
@@ -0,0 +1,245 @@
1
+ from utils import get_client, encode_image
2
+ import os
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from typing import Dict
5
+
6
+ # 3rd-party packages
7
+ import pandas as pd
8
+ from alive_progress import alive_bar
9
+
10
+
11
def compute_metrics(df: pd.DataFrame):
    """Compute (precision, recall, f1) as percentages for a scored frame.

    Expects a ``score`` column where 1 marks a correct answer and 0 an
    incorrect one; each metric is rounded to two decimals.

    NOTE(review): false negatives are taken equal to false positives (one
    prediction per item), so all three metrics coincide by construction —
    confirm this is the intended definition.
    """
    true_pos = (df["score"] == 1).sum()
    false_pos = (df["score"] == 0).sum()
    false_neg = false_pos  # FN ~ FP under the one-prediction-per-item assumption

    predicted = true_pos + false_pos
    actual = true_pos + false_neg

    precision = true_pos / predicted if predicted > 0 else 0
    recall = true_pos / actual if actual > 0 else 0
    pr_sum = precision + recall
    f1 = (2 * precision * recall) / pr_sum if pr_sum > 0 else 0

    return round(precision * 100, 2), round(recall * 100, 2), round(f1 * 100, 2)
19
+
20
+
21
+
22
+
23
+
24
def llm_judge_binary(task, model_name="gpt-4o-mini-2024-07-18"):
    """Score one task with an LLM judge: 1 (correct), 0 (incorrect), 0.5 (unparseable).

    Sends the question, the model's answer, the choices and the gold answer,
    plus every question image (base64-inlined as data URLs), to the judge
    model and parses a single-token verdict from the reply.

    Parameters
    ----------
    task : dict
        Needs keys ``question``, ``response``, ``choices``, ``answer`` and
        ``ques_image_path_lst`` (local image file paths).
    model_name : str
        Judge model identifier passed to the chat-completions API.
    """
    client = get_client()

    # Judge prompt: instructs the model to emit a bare 1/0 verdict.
    prompt = f"""
    You are a strict evaluator.

    Return ONLY:
    1 → if the model answer is correct
    0 → if the model answer is incorrect

    No explanation.

    Question:
    {task['question']}

    Model Answer:
    {task['response']}

    Choices:
    {task['choices']}

    Correct Answer:
    {task['answer']}


    """
    images_paths = [p for p in task["ques_image_path_lst"]]

    content = [{"type": "text", "text": prompt}]

    # Attach every question image as an inline base64 data URL.
    content += [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"}
        }
        for path in images_paths
    ]

    # temperature=0 and a tiny token budget keep the verdict deterministic.
    response = client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=[{"role": "user", "content": content}],
        max_tokens=5,
    )

    output = response.choices[0].message.content.strip()

    # safe parsing
    # NOTE(review): "1" is matched before "0", so a reply containing both
    # (e.g. "10") counts as correct — likely fine given max_tokens=5, but
    # worth confirming.
    if "1" in output:
        return 1
    elif "0" in output:
        return 0
    else:
        return 0.5  # fallback if model misbehaves
78
+
79
+
80
def evaluator(
    dataset: Dict,
    evaluate_mcq_by_judge_llm: bool,
    mcq_match_full_answer: bool,
    var: bool = False,
    eval_path: str = "",
    max_workers: int = 4
) -> Dict:
    """
    Parallel evaluation of dataset with optional LLM scoring.

    Parameters
    ----------
    dataset : Dict
        Mapping of task id -> task record (question, response, answer, ...).
    evaluate_mcq_by_judge_llm : bool
        When True, MCQ items are scored by the LLM judge instead of string
        matching.
    mcq_match_full_answer : bool
        For string-matched MCQs: match the full answer text inside the
        response rather than comparing the option letter exactly.
    var : bool
        True when the dataset holds prompt variations; variant ids such as
        "<id>-<nn>" are collapsed onto a shared key and scores averaged.
    eval_path : str
        CSV path used both to resume (skip already-evaluated ids) and to
        save the final results.
    max_workers : int
        Thread-pool size for concurrent scoring.
    """
    evaluation = {}

    # Exclude already-evaluated items (resume support): previously saved
    # rows seed the result dict and their ids are dropped from the work set.
    if eval_path and os.path.exists(eval_path):

        exclude_ids = set(pd.read_csv(eval_path)["id"].to_list())
        dataset = {k: v for k, v in dataset.items() if k not in exclude_ids}
        evaluation=pd.read_csv(eval_path).set_index('id').to_dict(orient='index')

    def evaluate_item(d_id, data):
        """Evaluate a single dataset item and return (key_id, data, score)."""
        # MCQ scoring via string match unless an LLM judge was requested.
        if data.get("question_type") == "MCQ" and not evaluate_mcq_by_judge_llm:
            if mcq_match_full_answer:
                score = 1 if data["answer"] in data["response"] else 0
            else:
                score = 1 if data["correct_option"] == data["response"] else 0
        else:
            score = llm_judge_binary(task=data)

        # For variation runs the "-<nn>" suffix is folded into the key.
        # NOTE(review): "".join(d_id.split("-")[:2]) only collapses variants
        # onto one key when the base id itself contains a hyphen (e.g.
        # "cat-123-00" -> "cat123"); a hyphen-free base id like "abc" yields
        # distinct keys "abc00"/"abc01" and averaging never happens — confirm
        # the id format used by generate_var_bench.
        key_id = "".join(d_id.split("-")[:2]) if var else d_id
        return key_id, data, score


    # Run in parallel
    with alive_bar(len(dataset), bar='blocks', spinner='dots', title='Running Evaluation', force_tty=True) as bar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_id = {executor.submit(evaluate_item, d_id, data): d_id for d_id, data in dataset.items()}

            for future in as_completed(future_to_id):
                key_id, data, score = future.result()

                # Merge results safely: variants of one task accumulate lists
                # of prompts/responses/scores under the shared key.
                if var and key_id in evaluation:
                    evaluation[key_id]["prompt"].append(data["task_description"])
                    evaluation[key_id]["response"].append(data["response"])
                    evaluation[key_id]["scores"].append(score)
                else:
                    evaluation[key_id] = {**data}
                    if var:
                        evaluation[key_id]["prompt"] = [data["task_description"]]
                        evaluation[key_id]["response"] = [data["response"]]
                        evaluation[key_id]["scores"] = [score]
                    else:
                        evaluation[key_id]["score"] = score

                bar()  # update progress

    # Compute averaged scores for var=True
    if var:
        for i in evaluation:
            evaluation[i]["score"] = sum(evaluation[i]["scores"]) / len(evaluation[i]["scores"])

    # Save final CSV once
    if eval_path:
        pd.DataFrame.from_dict(evaluation, orient="index").rename_axis("id").reset_index().to_csv(eval_path, index=False)

    return evaluation
149
+
150
+
151
def get_evaluation_result(df):
    """Return (precision, recall, f1) percentages for a scored DataFrame.

    Mirrors ``compute_metrics``: with exactly one prediction per row the
    false-negative count is taken equal to the false-positive count, so the
    three metrics collapse to one value.
    """
    hits = int((df["score"] == 1).sum())
    misses = int((df["score"] == 0).sum())

    # If every row has a prediction, FN ~ FP for binary correctness.
    total = hits + misses
    precision = hits / total if total else 0
    recall = hits / total if total else 0

    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    return round(precision * 100, 2), round(recall * 100, 2), round(f1 * 100, 2)
163
+
164
+
165
+
166
def evaluation_result(eval_path):
    """Print a hierarchical score report from an evaluation CSV.

    Aggregates mean score, item count and precision/recall/F1 at category,
    sub-category and task level, prints them worst-first, then an overall
    summary.

    Parameters
    ----------
    eval_path : str
        CSV produced by ``evaluator`` with ``category``, ``sub_category``,
        ``task`` and ``score`` columns.
    """
    df = pd.read_csv(eval_path)

    # ---------------- Aggregations ----------------
    # One row per category: mean score (%), item count, shared P/R/F1.
    cat_df = (
        df.groupby("category")
        .apply(lambda x: pd.Series({
            "cat_score": x["score"].mean()*100,
            "cat_count": len(x),
            "precision": compute_metrics(x)[0],
            "recall": compute_metrics(x)[1],
            "f1": compute_metrics(x)[2],
        }))
        .reset_index()
    )

    # Same aggregation one level down: (category, sub_category).
    sub_df = (
        df.groupby(["category", "sub_category"])
        .apply(lambda x: pd.Series({
            "subcat_score": x["score"].mean()*100,
            "subcat_count": len(x),
            "precision": compute_metrics(x)[0],
            "recall": compute_metrics(x)[1],
            "f1": compute_metrics(x)[2],
        }))
        .reset_index()
    )

    # Finest grain: (category, sub_category, task).
    task_df = (
        df.groupby(["category", "sub_category", "task"])
        .apply(lambda x: pd.Series({
            "task_score": x["score"].mean()*100,
            "task_count": len(x),
            "precision": compute_metrics(x)[0],
            "recall": compute_metrics(x)[1],
            "f1": compute_metrics(x)[2],
        }))
        .reset_index()
    )

    # Merge
    # NOTE(review): `precision`/`recall`/`f1` collide across the three
    # frames; pandas renames the task- and sub-level columns to *_x/*_y and
    # only the category-level values keep the bare names, so the per-task
    # and per-sub prints below actually show category P/R/F1 — confirm and,
    # if unintended, disambiguate with merge(..., suffixes=...).
    merged = (
        task_df
        .merge(sub_df, on=["category", "sub_category"])
        .merge(cat_df, on="category")
    )

    # Sort worst → best
    merged = merged.sort_values(
        by=["cat_score", "subcat_score", "task_score"],
        ascending=[True, True, True]
    )

    # ---------------- Pretty Print ----------------
    for cat, cat_group in merged.groupby("category", sort=False):
        row = cat_group.iloc[0]
        print(f"\n📊 Category: {cat} | Score: {row['cat_score']:.2f}% | Count: {row['cat_count']}")
        print(f"   Precision: {row['precision']}% | Recall: {row['recall']}% | F1: {row['f1']}%")
        print("-" * 90)

        for sub, sub_group in cat_group.groupby("sub_category", sort=False):
            row = sub_group.iloc[0]
            print(f"  🔹 Sub-category: {sub} | Score: {row['subcat_score']:.2f}% | Count: {row['subcat_count']}")
            print(f"     Precision: {row['precision']}% | Recall: {row['recall']}% | F1: {row['f1']}%")

            for _, r in sub_group.iterrows():
                print(
                    f"     ▪ Task: {r['task']:<25} "
                    f"| Score: {r['task_score']:>6.2f}% "
                    f"| P: {r['precision']:>5}% "
                    f"| R: {r['recall']:>5}% "
                    f"| F1: {r['f1']:>5}% "
                    f"| Count: {r['task_count']}"
                )

    # ---------------- Overall ----------------
    p, r, f1 = compute_metrics(df)
    print("\nOverall Score:", round(df["score"].mean()*100, 2), "%")
    print(f"Overall Precision: {p}% | Recall: {r}% | F1: {f1}%")
245
+
@@ -0,0 +1,230 @@
1
+ # =========================
2
+ # Standard library
3
+ # =========================
4
+ import os
5
+ import logging
6
+ import ast
7
+ import shutil
8
+ from pathlib import Path
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+
11
+ # =========================
12
+ # 3rd-party packages
13
+ # =========================
14
+ import pandas as pd
15
+ from alive_progress import alive_bar
16
+
17
+ # =========================
18
+ # Local modules
19
+ # =========================
20
+ from initialize_benchmark import initialize_benchmark, process_benchmark, prompt_generator
21
+ from prompt_variation import PromptVariationGenerator, generate_var_bench
22
+ from evaluator import evaluation_result, evaluator
23
+ from utils import encode_image, get_client, verify_run_name
24
+
25
+
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+
32
+
33
+
34
def agent(inputs: dict[str, dict], model_name: str, **kwargs):
    """Default template agent: answer one benchmark task with a chat model.

    Parameters
    ----------
    inputs : dict[str, dict]
        Single-entry mapping of task id -> task record; the record must
        carry ``task_description`` (the rendered prompt) and
        ``ques_image_path_lst`` (local image paths attached to the request
        as base64 data URLs).
    model_name : str
        Chat-completions model identifier.

    Returns
    -------
    dict[str, dict]
        The same mapping with ``response`` filled in on the task record
        (empty string when the API call fails).
    """
    client = get_client()

    task_id, task = list(inputs.items())[0]

    images_paths = [p for p in task["ques_image_path_lst"]]

    # build content list
    content = [{"type": "text", "text": task["task_description"]}]

    # add all images
    content += [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"}
        }
        for path in images_paths
    ]

    # BUG FIX: a failed API call used to leave `response` as None and the
    # unconditional `response.choices[0]...` access afterwards raised
    # AttributeError, crashing the worker thread.  The answer now defaults
    # to "" so a failure is recorded and the run continues.
    answer = ""

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": content
                }
            ],
            temperature=0.1,
            max_tokens=1000,
        )
        answer = response.choices[0].message.content.strip()

    except Exception as e:
        print(f"Error Answering task {task_id}: {e}")

    inputs[task_id]["response"] = answer

    return inputs
77
+
78
+
79
def run_agent_function(inputs: dict[str, dict], agent_function, use_template: bool = False, model_name: str = "", **kwargs):
    """Invoke *agent_function* on *inputs* and return its result.

    Template agents receive the model name as a keyword argument; custom
    agents are called with the inputs alone.
    """
    if not use_template:
        return agent_function(inputs)
    return agent_function(inputs, model_name=model_name)
86
+
87
+
88
+
89
+
90
def HarnessRunner(
    use_template=True,
    custom_function=None,
    template_name="default_agent",
    dataset_dir=None,
    clean_dataset_dir=True,
    template_model="gpt-4o-mini-2024-07-18",
    run_id = "test",
    max_concurrent = 5,
    continue_run = False,
    max_tasks = 0,
    prompt_sensitivity = False,
    num_variations = 3,
    variation_strength = "mild",
    results_dir = "results",
    task_ids = [],
    categories = [],
    sub_categories = [],
    openai_api_key = None,
    open_router_api_key = None,
    evaluation_needed=True,
    evaluate_mcq_by_judge_llm=False,
    mcq_match_full_answer=False,
    ):
    """Run the full benchmark pipeline: fetch data, query the agent in
    parallel, checkpoint responses, and (optionally) evaluate them.

    Parameters
    ----------
    use_template : bool
        Use a built-in agent from the ``function_template`` registry.
    custom_function : callable | None
        User agent taking ``{task_id: task_dict}``; required when
        ``use_template`` is False.
    template_name : str
        Key into the built-in agent registry.
    dataset_dir : str | None
        Existing dataset directory; when None the benchmark is downloaded
        into ``./temp_dir``.
    clean_dataset_dir : bool
        Delete the dataset directory when the run finishes.
    template_model : str
        Model name forwarded to template agents.
    run_id : str
        Run folder name under *results_dir*; validated by ``verify_run_name``.
    max_concurrent : int
        Thread-pool size for agent calls.
    continue_run : bool
        Resume a previous run instead of refusing to overwrite it.
    max_tasks : int
        Cap on the number of tasks (0 = no cap).
    prompt_sensitivity : bool
        Expand each task into prompt variations and average their scores.
    num_variations, variation_strength
        Prompt-variation settings (see ``PromptVariationGenerator``).
    task_ids, categories, sub_categories : list
        Optional task filters.
        NOTE(review): mutable list defaults are shared across calls; they
        are only read here, but callers should still pass fresh lists.
    openai_api_key, open_router_api_key : str | None
        When given, exported to the environment so ``get_client`` (called
        inside worker threads) can pick them up.
    evaluation_needed, evaluate_mcq_by_judge_llm, mcq_match_full_answer
        Evaluation toggles forwarded to ``evaluator``.
    """

    # Registry of built-in template agents.
    function_template = {
        "default_agent": agent,
    }

    if openai_api_key:
        os.environ["OPENAI_API_KEY"]=openai_api_key

    if open_router_api_key:
        os.environ["OPEN_ROUTER_API_KEY"]=open_router_api_key



    if (not use_template) and (not custom_function):
        raise ValueError(
            "Select use_template or provide custom_function "
        )


    if use_template:
        agent_function= function_template[template_name]
    else:
        agent_function= custom_function

    # Layout: <results_dir>/<run_id>/{agent_response.csv, evaluations.csv}
    temp_dir="./temp_dir"
    verify_run_name(run_id)
    Path(f"{results_dir}/{run_id}").mkdir(parents=True, exist_ok=True)
    response_path=f"{results_dir}/{run_id}/agent_response.csv"
    eval_path=f"{results_dir}/{run_id}/evaluations.csv"

    # Refuse to clobber an existing run unless explicitly resuming.
    if os.path.exists(response_path) and (not continue_run):
        raise ValueError("Run Already Exist. Pass continue_run if you want to Continue Run.")

    # Download the benchmark only when no local copy exists.
    if dataset_dir:
        temp_dir=dataset_dir
    if not os.path.exists(temp_dir) :
        initialize_benchmark(temp_dir)



    df=pd.read_csv(f"{temp_dir}/annotations.csv").drop_duplicates(subset=["id"])

    # Image lists are stored stringified; turn them back into real lists and
    # rebase each relative path onto the extracted question_images folder.
    df["ques_image_path_lst"] = df["ques_image_path_lst"].apply(ast.literal_eval)

    df["ques_image_path_lst"] = df["ques_image_path_lst"].apply(
        lambda paths: [os.path.join(temp_dir, "question_images", p) for p in paths]
    )

    dataset = process_benchmark(
        df,
        categories,
        sub_categories,
        task_ids,
        max_tasks,
        continue_run=continue_run,
        eval_path=response_path,
    )

    eval_results = {}

    # Seed with previously saved responses when resuming.
    if continue_run:
        eval_results=pd.read_csv(response_path).set_index('id').to_dict(orient='index')

    var_dataset=None


    if prompt_sensitivity:

        # Expand every task into its prompt variants before querying.
        generator = PromptVariationGenerator(num_variations=num_variations, strength=variation_strength)
        prompt_map = generator.generate_variations_for_dataset(dataset)
        var_dataset=generate_var_bench(dataset, prompt_map)

    active_dataset = var_dataset if var_dataset else dataset

    def process_item(key, item):
        """Generate prompt and run agent for a single item"""
        value = prompt_generator(item)
        res = run_agent_function(
            inputs={key: value},
            agent_function=agent_function,
            use_template=use_template,
            model_name=template_model,
        )
        return key, res[key]


    # Run in parallel with progress bar; the full response CSV is rewritten
    # after every completed task so an interrupted run can resume.
    with alive_bar(len(active_dataset), bar='blocks', spinner='dots', title='Running Agent', force_tty=True) as bar:
        with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
            futures = {executor.submit(process_item, k, v): k for k, v in active_dataset.items()}

            for future in as_completed(futures):
                key, result = future.result()
                eval_results[key] = result
                pd.DataFrame.from_dict(eval_results, orient="index").rename_axis("id").reset_index().to_csv(response_path, index=False)
                bar()  # update progress

    if evaluation_needed:
        # Run evaluator on results
        evaluation = evaluator(
            dataset=eval_results,
            evaluate_mcq_by_judge_llm=evaluate_mcq_by_judge_llm,
            mcq_match_full_answer=mcq_match_full_answer,
            var= True if var_dataset else False,
            eval_path=eval_path,
        )
        evaluation_result(eval_path)

    if clean_dataset_dir:
        shutil.rmtree(temp_dir)
227
+
228
+
229
+
230
+
@@ -0,0 +1,143 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ # 3rd-party packages
5
+ import pandas as pd
6
+ from alive_progress import alive_bar
7
+ from huggingface_hub import snapshot_download
8
+ from zipfile import ZipFile
9
+
10
+
11
+
12
class BenchmarkStats:
    """Convenience viewer for benchmark composition statistics.

    Reads ``./annotations.csv`` from the current working directory on
    construction and prints per-column value counts as markdown tables.
    """

    def __init__(self):
        # Annotation table; one row per benchmark item.
        self.df = pd.read_csv("./annotations.csv")

    def _print_stats(self, column, title):
        """Print a markdown table of value counts for *column*, labelled *title*."""
        pd.set_option("display.max_rows", None)

        counts = self.df[column].value_counts().sort_values()
        table = counts.rename_axis(title).reset_index(name="Count")

        print(table.to_markdown(index=False))  # clean table

    def get_dataset_stats(self):
        """Item counts per source dataset."""
        self._print_stats("dataset", "Dataset")

    def get_category_stats(self):
        """Item counts per category."""
        self._print_stats("category", "Category")

    def get_sub_category_stats(self):
        """Item counts per sub-category."""
        self._print_stats("sub_category", "Sub Category")

    def get_task_stats(self):
        """Item counts per task."""
        self._print_stats("task", "Task")
44
+
45
+
46
+
47
+
48
def initialize_benchmark(local_path):
    """Download the benchmark dataset snapshot and unpack its images.

    Fetches the Hugging Face dataset repo into *local_path* (requires the
    ``HF_TOKEN`` environment variable), extracts ``question_images.zip``
    into ``<local_path>/question_images`` with a progress bar, then deletes
    the archive.

    Raises
    ------
    KeyError
        If ``HF_TOKEN`` is not set in the environment.
    """
    token = os.environ["HF_TOKEN"]
    repo_id="Wahiddhrubo/hal-spatial"


    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",     # "dataset" or "model"
        local_dir=local_path,    # your path
        token=token,
        allow_patterns=None,     # optional: download only some files
        ignore_patterns=None,    # optional: ignore some files
    )


    zip_path = Path(local_path) / "question_images.zip"
    extract_to = Path(local_path) / "question_images"

    with ZipFile(zip_path, 'r') as z:
        members = z.infolist()

        # Extract one member at a time so the progress bar can tick.
        with alive_bar(len(members), title="Extracting files") as bar:
            for m in members:
                z.extract(m, extract_to)
                bar()

    # Remove the archive once everything is unpacked.
    zip_path.unlink()
75
+
76
+
77
def prompt_generator(row):
    """Build the agent prompt for one benchmark row.

    Returns a copy of *row* with an added ``task_description`` key holding
    the fully rendered prompt (question plus, when present, the choices).

    BUG FIX: rows loaded through pandas carry ``float('nan')`` for empty
    ``choices`` cells, and NaN is truthy in Python, so the old
    ``if row["choices"]`` check appended a literal "Choices: nan" section.
    NaN is now treated like an empty value.
    """
    # print(row)

    prompt = f"""You are a vision-language reasoning agent.
Understand the intent of the question based on the category {row['category']}. Focus on key visual or logical cues such as shading, color consistency, spatial relations, or context depending on the category.

Carefully read and compare all choices, paying attention to small differences and ignoring formatting issues. Use relative reasoning when needed (e.g., darker vs lighter, same vs different).

Choose the {'Answer' if row['question_type'] != 'MCQ' else 'OPTION LETTER ONLY'} that best matches the question’s intent based on sound reasoning, not assumptions.

JUST GIVE THE ANSWER NO NEED OF REASONING.

Question:
{row['question']}
"""

    raw_choices = row["choices"]
    # NaN != NaN, so the second clause filters pandas' missing-value float.
    has_choices = bool(raw_choices) and raw_choices == raw_choices
    choices = f"""Choices: {raw_choices} """ if has_choices else ""

    return {**row, "task_description": prompt + choices}
99
+
100
+
101
+
102
+
103
+
104
def process_benchmark(
    df,
    categories,
    sub_categories,
    task_ids,
    max_tasks,
    continue_run=False,
    eval_path="",
):
    """Filter the annotation frame down to the tasks a run should execute.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with at least ``id``, ``category``,
        ``sub_category`` and ``task`` columns.
    categories, sub_categories, task_ids : list
        Optional whitelists; empty lists mean "keep everything".
    max_tasks : int
        If > 0, cap the number of returned tasks (first N kept).
    continue_run : bool
        When True, drop ids already present in the CSV at *eval_path*.
    eval_path : str
        Response CSV from a previous (partial) run.

    Returns
    -------
    dict
        Mapping of task id -> row dict.
    """
    # BUG FIX: these filters previously indexed non-existent "categories" /
    # "sub_categories" columns (the annotation table uses the singular names
    # that evaluation_result and BenchmarkStats group by), raising KeyError
    # whenever a filter list was supplied.
    if len(categories):
        df=df[df["category"].isin(categories)]

    if len(task_ids):
        df=df[df["task"].isin(task_ids)]

    if len(sub_categories):
        df=df[df["sub_category"].isin(sub_categories)]

    # Ids already answered in a previous run are skipped when resuming.
    ids=[]
    if continue_run:
        if os.path.exists(eval_path):
            ids=pd.read_csv(eval_path)["id"].to_list()

    df=df[~df["id"].isin(ids)]
    dataset=df.set_index('id').to_dict(orient='index')

    if max_tasks and max_tasks > 0 and max_tasks < len(dataset):
        print(f"Limiting to the first {max_tasks} tasks as requested")
        keep_task_ids = list(dataset.keys())[: max_tasks]
        dataset = {task_id: dataset[task_id] for task_id in keep_task_ids}


    return dataset
143
+
@@ -0,0 +1,117 @@
1
+
2
+ from typing import Dict, List
3
+ from utils import get_client
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
# ---------------- Config ----------------

# Instructions per strength: system-prompt text that steers how far a
# paraphrase may drift from the original prompt.  Keys are the accepted
# values of PromptVariationGenerator(strength=...); unknown values fall
# back to "mild" at lookup time.
STRENGTH_INSTRUCTIONS = {
    "mild": (
        "Generate small, surface-level variations of the prompt. "
        "Use synonyms, minor changes in formality, active/passive voice changes. "
        "Keep all information, meaning, and structure intact. "
        "Output should be fluent and similar in style to the original."
    ),
    "medium": (
        "Generate variations that restructure sentences and reorder information. "
        "Combine or split sentences, slightly change phrasing, but preserve all meaning. "
        "Output should still be clear and professional, resembling the original style."
    ),
    "strong": (
        "Generate majorly different variations that sound like different people wrote them. "
        "Use conversational phrasing, varied sentence structures, and style differences. "
        "Preserve all information and constraints exactly. Output should maintain readability."
    ),
    "naturalistic": (
        "Generate realistic user-style variations as if typed by a real person. "
        "Include casual phrasing, abbreviations, typos, and informal chat patterns. "
        "Keep all meaning and requirements intact. Output should still resemble the original prompt in content."
    ),
}

# Sampling temperature per strength: stronger rewrites sample hotter.
STRENGTH_TEMPERATURE = {
    "mild": 0.7,
    "medium": 0.8,
    "strong": 0.9,
    "naturalistic": 0.9,
}
44
+
45
+
46
class PromptVariationGenerator:
    """Generate paraphrased variants of task prompts via a chat model.

    The variation style ("mild" ... "naturalistic") selects both the
    paraphrasing instruction and the sampling temperature from the module
    config tables.
    """

    def __init__(self, model_name="gpt-4o-mini-2024-07-18", num_variations=2, strength="medium"):
        # Model used for paraphrasing.
        self.model_name = model_name
        # Number of *additional* variants requested per prompt.
        self.num_variations = num_variations
        # Normalised strength key; unknown values fall back to "mild" later.
        self.strength = strength.lower()
        self.client = get_client()

    def generate_variations_for_task(self, task_id: str, prompt: str) -> List[Dict]:
        """Return the original prompt plus up to ``num_variations`` paraphrases.

        Each entry is a dict with ``question``, ``prompt_variation_id``
        (0 is always the untouched original) and
        ``prompt_variation_strength``.  On API failure only the original
        prompt is returned.
        """
        instruction = STRENGTH_INSTRUCTIONS.get(self.strength, STRENGTH_INSTRUCTIONS["mild"])
        temperature = STRENGTH_TEMPERATURE.get(self.strength, 0.7)

        system_prompt = f"You are an expert at paraphrasing prompts while preserving meaning.\nInstruction: {instruction}"
        user_prompt = f"Original prompt:\n{prompt}\nGenerate {self.num_variations} variations, one per line. Do NOT number them."

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=temperature,
                max_tokens=1000,
            )

            lines = response.choices[0].message.content.strip().split("\n")
            # Drop blank/trivial lines the model may emit between variants.
            variations = [line.strip() for line in lines if len(line.strip()) > 10]

        except Exception as e:
            print(f"Error generating variations for task {task_id}: {e}")
            variations = []

        # Variant 0 is the original; extras are capped at num_variations.
        all_variations = [prompt] + variations[:self.num_variations]

        return [
            {"question": v, "prompt_variation_id": i, "prompt_variation_strength": self.strength}
            for i, v in enumerate(all_variations)
        ]

    def generate_variations_for_dataset(self, dataset: Dict[str, Dict], prompt_field: str = "question") -> Dict[str, List[Dict]]:
        """Map every task id to its list of prompt variants.

        Tasks missing *prompt_field* get a single empty-question entry so
        downstream expansion never KeyErrors.
        """
        result = {}
        for task_id, task_data in dataset.items():
            if prompt_field not in task_data:
                result[task_id] = [{
                    "question": task_data.get(prompt_field, ""),
                    "prompt_variation_id": 0,
                    "prompt_variation_strength": self.strength
                }]
                continue
            prompt = task_data[prompt_field]
            result[task_id] = self.generate_variations_for_task(task_id, prompt)
        return result
98
+
99
+
100
+
101
def generate_var_bench(dataset, prompt_map):
    """Expand a dataset so each prompt variant becomes its own task.

    For every task id, each entry in ``prompt_map[id]`` produces a new key
    ``"<id>-<nn>"`` (two-digit variant index) whose record is a copy of the
    original task with ``question`` replaced by the variant text.
    """
    expanded = {}
    for task_id in dataset:
        for variant_idx, variant in enumerate(prompt_map[task_id]):
            variant_key = f"{task_id}-{variant_idx:02d}"
            expanded[variant_key] = {
                **dataset[task_id],
                "question": variant["question"],
            }
    return expanded
112
+
113
+
114
+
115
+
116
+
117
+
@@ -0,0 +1,61 @@
1
+ import os
2
+ from openai import OpenAI
3
+ import base64
4
+
5
+ import re
6
+ import os
7
+
8
class InvalidPathNameError(ValueError):
    """Custom exception for invalid path names.

    Raised by ``verify_run_name`` when a run name cannot be used as a
    file or directory name.  Subclasses ValueError so callers catching
    ValueError keep working.
    """
    pass
11
+
12
def verify_run_name(name: str) -> str:
    """Validate *name* as a safe filename/folder name and return it stripped.

    Rules:
    - Cannot be empty or only whitespace.
    - Cannot contain forbidden characters: \\ / : * ? " < > |
    - Cannot be a reserved Windows name (CON, PRN, AUX, NUL, COM1..9, LPT1..9).

    Raises InvalidPathNameError when any rule is violated.
    """
    if not name or not name.strip():
        raise InvalidPathNameError("Path name cannot be empty or whitespace.")

    cleaned = name.strip()

    # Characters Windows (and sanity) forbid in path components.
    if re.search(r'[<>:"/\\|?*]', cleaned):
        raise InvalidPathNameError(
            f"Path name '{cleaned}' contains forbidden characters: \\ / : * ? \" < > |"
        )

    # Reserved Windows device names (case-insensitive).
    reserved_names = {"CON", "PRN", "AUX", "NUL"}
    reserved_names.update(f"COM{i}" for i in range(1, 10))
    reserved_names.update(f"LPT{i}" for i in range(1, 10))
    if cleaned.upper() in reserved_names:
        raise InvalidPathNameError(f"Path name '{cleaned}' is a reserved system name.")

    return cleaned
42
+
43
+
44
def encode_image(path):
    """Read the file at *path* and return its contents base64-encoded as text."""
    with open(path, "rb") as handle:
        raw = handle.read()
    return base64.b64encode(raw).decode("utf-8")
47
+
48
+
49
def get_client():
    """Build an OpenAI-compatible chat client from environment variables.

    Prefers OpenRouter: when ``OPEN_ROUTER_API_KEY`` is set the client is
    pointed at the OpenRouter endpoint even if ``OPENAI_API_KEY`` is also
    present; otherwise a plain OpenAI client is returned.

    Raises
    ------
    ValueError
        If neither API key environment variable is set.
    """
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
    OPEN_ROUTER_API_KEY = os.environ.get("OPEN_ROUTER_API_KEY")
    if OPEN_ROUTER_API_KEY:
        return OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=OPEN_ROUTER_API_KEY
        )
    elif OPENAI_API_KEY:
        return OpenAI(api_key=OPENAI_API_KEY)
    else:
        raise ValueError("No API key found. Set OPENAI_API_KEY or OPEN_ROUTER_API_KEY.")
61
+
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "hal-spatial-harness" # ⚠️ must be unique on PyPI
7
+ version = "0.1.0"
8
+ description = "A lightweight framework for benchmarking multimodal AI agents with parallel execution, prompt variation, and automated evaluation."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ authors = [
12
+ { name="Wahid Faisal", email="wahiddhrubo@gmail.com" }
13
+ ]
14
+ license = { text = "MIT" }
15
+
16
+ dependencies = [
17
+ "pandas",
18
+ "numpy",
19
+ "alive-progress",
20
+ "gitpython",
21
+ "huggingface-hub",
22
+ "openai"
23
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+