hyperplane-eval 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cli/app.py ADDED
@@ -0,0 +1,429 @@
1
+ import os
2
+ import asyncio
3
+ import yaml
4
+ import questionary
5
+ from rich.console import Console, Group
6
+ from rich.text import Text
7
+ from rich.panel import Panel
8
+ from typing import Any
9
+
10
+ from adapters.local_bindings.scanner import extract_functions
11
+ from adapters.local_bindings.executor import execute_temp_runner
12
+ from adapters.runners.agent_runner import AgentRunner
13
+ from engine.orchestrator import PipelineOrchestrator
14
+ from engine.config import EvaluationConfig
15
+
16
+
17
+ LOGO = """
18
+ █████╗ ██╗ ██╗████████╗ ██████╗ ██╗ ██╗███████╗██████╗ ██╗███████╗██╗ ██╗
19
+ ██╔══██╗██║ ██║╚══██╔══╝██╔═══██╗██║ ██║██╔════╝██╔══██╗██║██╔════╝╚██╗ ██╔╝
20
+ ███████║██║ ██║ ██║ ██║ ██║██║ ██║█████╗ ██████╔╝██║█████╗ ╚████╔╝
21
+ ██╔══██║██║ ██║ ██║ ██║ ██║╚██╗ ██╔╝██╔══╝ ██╔══██╗██║██╔══╝ ╚██╔╝
22
+ ██║ ██║╚██████╔╝ ██║ ╚██████╔╝ ╚████╔╝ ███████╗██║ ██║██║██║ ██║
23
+ ╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═════╝ ╚═══╝ ╚══════╝╚═╝ ╚═╝╚═╝╚═╝ ╚═╝
24
+ """
25
+
26
+
27
class VerifyApp:
    """Interactive CLI that gathers evaluation settings and launches a run.

    Answers are persisted to ``verify.yaml`` (relative to the current working
    directory) so that a later invocation can offer to reuse the same target.
    """

    # Directory-name fragments that exclude a whole subtree from file discovery.
    _IGNORED_DIR_TOKENS = ("node_modules", ".venv", "__pycache__")
    # File suffixes offered as candidate entry points.
    _SOURCE_SUFFIXES = (".py", ".ts", ".js")
    _PROVIDER_CATALOG_URL = (
        "https://raw.githubusercontent.com/BerriAI/litellm/main/"
        "litellm/proxy/public_endpoints/provider_create_fields.json"
    )
    # Seconds to wait for the provider catalog before using the fallback fields.
    _PROVIDER_FETCH_TIMEOUT = 10
    # Prefix under which LLM settings are stored in the config mapping.
    _LLM_KEY_PREFIX = "llm_"
    # Checkbox entry that switches `_prompt_for_rule` into "add rules" mode.
    _ADD_RULE_OPTION = "Add a new rule..."

    def __init__(self):
        self.console = Console()
        self.config_file = "verify.yaml"
        self.config = self.load_config()

    def load_config(self):
        """Return the saved configuration, or ``{}`` if the file is missing or empty."""
        if not os.path.exists(self.config_file):
            return {}
        with open(self.config_file, "r", encoding="utf-8") as f:
            return yaml.safe_load(f) or {}

    def save_config(self):
        """Write the current configuration mapping to ``self.config_file``."""
        with open(self.config_file, "w", encoding="utf-8") as f:
            yaml.dump(self.config, f)

    def _target_from_config(self):
        """Build the target tuple from the saved configuration.

        The saved file is rescanned so the function's current code is used; if
        the function is no longer found, a stub with empty code and the saved
        parameters is returned instead.
        """
        funcs = extract_functions(self.config["file"])
        selected_func = next(
            (f for f in funcs if f["name"] == self.config["function"]), None
        )
        if not selected_func:
            selected_func = {
                "name": self.config["function"],
                "code": "",
                "params": self.config.get("parameters", []),
            }
        return (
            self.config["file"],
            selected_func,
            self.config.get("description", ""),
            self.config.get("rules", []),
        )

    def _discover_source_files(self):
        """Return relative paths of candidate source files below the CWD."""
        found = []
        for root, dirs, fnames in os.walk("."):
            # Prune in place so os.walk never descends into ignored trees.
            # Matching is by substring on the directory name, which selects
            # exactly the same files as testing the token against the full path.
            dirs[:] = [
                d
                for d in dirs
                if not any(token in d for token in self._IGNORED_DIR_TOKENS)
            ]
            for fname in fnames:
                if fname.endswith(self._SOURCE_SUFFIXES):
                    found.append(os.path.relpath(os.path.join(root, fname)))
        return found

    async def _collect_initial_rules(self):
        """Prompt for rules until a blank entry; require at least one.

        Returns:
            The list of rules, or ``None`` if a prompt returned ``None``
            (handled as a cancellation, as for the other prompts here).
        """
        rules = []
        while True:
            prompt = (
                "Enter a rule to test for (leave blank to finish):"
                if rules
                else "Enter a rule to test for:"
            )
            answer = await questionary.text(prompt).ask_async()
            if answer is None:
                # Without this, a cancelled prompt looks like a blank answer and
                # the "at least one rule" check re-prompts with no way out.
                return None
            if answer:
                rules.append(answer)
                continue
            if rules:
                return rules
            self.console.print("[red]You must provide at least one rule.[/red]")

    def _fetch_provider_catalog(self):
        """Download the provider field catalog; return ``[]`` on any failure."""
        import json
        import urllib.request

        # NOTE(review): this is a blocking call made from a coroutine; the
        # timeout bounds how long the prompt flow can stall.
        try:
            with urllib.request.urlopen(
                self._PROVIDER_CATALOG_URL, timeout=self._PROVIDER_FETCH_TIMEOUT
            ) as response:
                return json.loads(response.read().decode())
        except Exception as e:
            # Best effort: the caller falls back to a generic API-key field.
            self.console.print(f"[yellow]Could not fetch provider list: {e}[/yellow]")
            return []

    async def _prompt_for_llm_settings(self):
        """Ask for the provider, model string and credential values.

        Returns:
            ``(llm_model, llm_params)``, or ``None`` when a prompt is cancelled
            or the typed provider name matches no catalog entry.
        """
        provider_data = self._fetch_provider_catalog()

        if provider_data:
            provider_map = {
                p.get("provider_display_name", p.get("provider")): p
                for p in provider_data
            }
            selected_provider_name = await questionary.autocomplete(
                "Select LLM Provider (type to search):",
                choices=list(provider_map.keys()),
                ignore_case=True,
            ).ask_async()
            if not selected_provider_name:
                return None

            # Autocomplete accepts free text, so match case-insensitively.
            wanted = selected_provider_name.strip().lower()
            selected_provider = next(
                (p for k, p in provider_map.items() if k.strip().lower() == wanted),
                None,
            )
            if not selected_provider:
                self.console.print(
                    f"[red]Invalid provider selected: '{selected_provider_name}'[/red]"
                )
                return None

            credential_fields = selected_provider.get("credential_fields", [])
            default_model = selected_provider.get("default_model_placeholder", "")
        else:
            credential_fields = [
                {
                    "key": "api_key",
                    "label": "API Key",
                    "required": False,
                    "field_type": "password",
                }
            ]
            default_model = "gpt-4o"

        llm_model = await questionary.text(
            f"Enter the LiteLLM model string{f' (e.g., {default_model})' if default_model else ''}:",
            default=self.config.get("llm_model", default_model),
        ).ask_async()
        if llm_model is None:
            return None

        llm_params = {}
        for spec in credential_fields:
            key = spec.get("key")
            if not key:
                continue
            saved_key = f"{self._LLM_KEY_PREFIX}{key}"
            prompt_text = f"Enter {spec.get('label', key)} (leave blank to use env vars):"

            if spec.get("field_type") == "password":
                val = await questionary.password(prompt_text).ask_async()
            else:
                val = await questionary.text(
                    prompt_text, default=self.config.get(saved_key, "")
                ).ask_async()
            if val is None:
                return None

            if val:
                llm_params[key] = val
            elif self.config.get(saved_key):
                # Blank answer: keep the previously saved value, if any.
                llm_params[key] = self.config[saved_key]

        return llm_model, llm_params

    async def _prompt_for_target(self):
        """Prompt the user to select or confirm the target file and function.

        Returns:
            ``(target_path, selected_func, description, rules)``. When the user
            cancels or nothing usable is found, ``(None, None, None, [])``.
        """
        if self.config and "file" in self.config and "function" in self.config:
            self.console.print(
                f"[green]Found existing config for '{self.config['function']}' in '{self.config['file']}'.[/green]"
            )
            use_existing = await questionary.confirm("Use this target?").ask_async()
            if use_existing:
                return self._target_from_config()

        # Prompt for file
        target_path = await questionary.autocomplete(
            "Locate entry point file:", choices=self._discover_source_files()
        ).ask_async()
        if not target_path:
            self.console.print("[red]No file selected. Exiting.[/red]")
            return None, None, None, []

        self.console.print("[cyan]Scanning for functions...[/cyan]")
        funcs = extract_functions(target_path)
        if not funcs:
            self.console.print(
                f"[red]No functions/methods detected in '{target_path}'.[/red]"
            )
            return None, None, None, []

        choices = [
            questionary.Choice(
                title="{}({})".format(
                    f["name"],
                    ", ".join(f"{p['name']}: {p['type']}" for p in f["params"]),
                ),
                value=f,
            )
            for f in funcs
        ]
        selected_func = await questionary.select(
            "Select the function where to call the agent:", choices=choices
        ).ask_async()
        if not selected_func:
            return None, None, None, []

        # Prompt for parameter descriptions
        for param in selected_func.get("params", []):
            desc = await questionary.text(
                f"Enter a description for parameter '{param['name']}' ({param['type']}):",
                default=param.get("description", ""),
            ).ask_async()
            if desc is None:
                return None, None, None, []
            param["description"] = desc

        description = await questionary.text(
            "Enter a description about the test agent:",
            default=self.config.get("description", ""),
        ).ask_async()
        if description is None:
            return None, None, None, []

        rules = await self._collect_initial_rules()
        if rules is None:
            return None, None, None, []

        # Set up the LLM API using LiteLLM
        llm_settings = await self._prompt_for_llm_settings()
        if llm_settings is None:
            return None, None, None, []
        llm_model, llm_params = llm_settings

        # Update and save config
        self.config.update(
            {
                "file": target_path,
                "function": selected_func["name"],
                "description": description,
                "parameters": selected_func["params"],
                "rules": rules,
                "llm_model": llm_model,
            }
        )
        # NOTE(review): credential values, including those typed at password
        # prompts, are written to the config file in plain text; `run` reads
        # them back from there, so changing this needs a coordinated change.
        for k, v in llm_params.items():
            self.config[f"{self._LLM_KEY_PREFIX}{k}"] = v

        self.save_config()

        return target_path, selected_func, description, rules

    async def _prompt_for_rule(self, rules):
        """Prompt the user to select the rules to test against.

        Args:
            rules: Known rules. Newly entered rules are appended to this list
                in place and the list is saved to the config.

        Returns:
            The selected rules plus any newly entered ones; ``[]`` when there
            are no rules or nothing was selected.
        """
        if not rules:
            return []

        choices = [questionary.Choice(title=r, value=r, checked=True) for r in rules]
        choices.append(
            questionary.Choice(title=self._ADD_RULE_OPTION, value=self._ADD_RULE_OPTION)
        )

        selected_rules = await questionary.checkbox(
            "Select rules to test (Space to toggle, Enter to confirm):", choices=choices
        ).ask_async()
        if not selected_rules:
            return []

        if self._ADD_RULE_OPTION in selected_rules:
            selected_rules.remove(self._ADD_RULE_OPTION)
            while True:
                new_rule = await questionary.text(
                    "Enter a new rule to test for (leave blank to finish):"
                ).ask_async()
                if not new_rule:
                    break
                rules.append(new_rule)
                selected_rules.append(new_rule)

        self.config["rules"] = rules
        self.save_config()

        return selected_rules

    async def _prompt_for_dynamic_config(self):
        """Prompt for breadth, depth and adversarial testing, then save them.

        Returns:
            ``(depth, breadth, adversarial)`` exactly as answered.
        """
        self.console.print("\n[bold cyan]Evaluation Configuration[/bold cyan]")
        self.console.print(
            "• [bold]Breadth[/bold]: Determines how many distinct types of edge cases to test. Higher breadth explores a wider variety of scenarios."
        )
        self.console.print(
            "• [bold]Depth[/bold]: Determines how many variations to generate for each edge case type. Higher depth provides more rigorous testing for a specific scenario."
        )
        self.console.print(
            "[yellow]Note: Setting these to 'high' will significantly increase evaluation time and token usage.[/yellow]\n"
        )

        breadth = await questionary.select(
            "Select Breadth (Number of Edge Case Types):",
            choices=["low", "mid", "high"],
            default=self.config.get("breadth", "mid"),
        ).ask_async()

        depth = await questionary.select(
            "Select Depth (Variations per Edge Case):",
            choices=["low", "mid", "high"],
            default=self.config.get("depth", "mid"),
        ).ask_async()

        self.config["breadth"] = breadth
        self.config["depth"] = depth

        adversarial = await questionary.confirm(
            "Enable Adversarial Testing? (Injects red-teaming vectors into scenarios)",
            default=self.config.get("adversarial_testing", False),
        ).ask_async()

        self.config["adversarial_testing"] = adversarial
        self.save_config()

        return depth, breadth, adversarial

    @staticmethod
    def update_dashboard_display(
        active_scenarios: dict,
        plane_input_space: Any,
        scenarios_per_plane: int,
        plane_features: list,
        rule_idx: int,
        rules_len: int,
        plane_idx: int,
        num_planes: int,
        rule: str,
    ) -> Group:
        """Build the CLI dashboard showing evaluation progress and scenario status.

        Args:
            active_scenarios: Mapping whose values are dicts with ``status``,
                ``text`` and (unless status is ``"Pending"``) ``score``. Only
                the last three values are shown.
            plane_input_space: Object whose ``get_all_vectors()`` result length
                is the number of scenarios completed on this plane.
            scenarios_per_plane: Target number of scenarios for this plane.
            plane_features: Objects with a ``name`` attribute.
            rule_idx: Zero-based index of the current rule.
            rules_len: Total number of rules.
            plane_idx: Zero-based index of the current plane.
            num_planes: Total number of planes.
            rule: Rule text; shown truncated to 80 characters.

        Returns:
            A ``Group`` of ``Text`` renderables.
        """
        from rich.markup import escape

        bar_width = 30
        completed = len(plane_input_space.get_all_vectors())
        # A non-positive target has nothing left to do; show it as complete
        # rather than dividing by zero.
        if scenarios_per_plane > 0:
            pct = min(1.0, completed / scenarios_per_plane)
        else:
            pct = 1.0
        filled = int(bar_width * pct)
        bar = "█" * filled + "░" * (bar_width - filled)

        # Free text is escaped so stray square brackets are not parsed as tags.
        dims_str = ", ".join(escape(str(f.name)) for f in plane_features)
        # Only add the ellipsis when the rule was actually truncated.
        rule_label = escape(rule if len(rule) <= 80 else f"{rule[:80]}...")

        renderables = [
            Text.from_markup(
                f"[bold cyan]Rule [{rule_idx + 1}/{rules_len}] - Plane [{plane_idx + 1}/{num_planes}]:[/bold cyan] {rule_label}"
            ),
            Text.from_markup(f"[cyan]Dimensions:[/cyan] {dims_str}"),
            Text.from_markup(
                f"[cyan]Progress:[/cyan] [{bar}] {pct:.0%} ({completed}/{scenarios_per_plane})\n"
            ),
        ]

        for item in list(active_scenarios.values())[-3:]:
            text = escape(str(item["text"]))
            if item["status"] == "Pending":
                renderables.append(Text.from_markup(f"  • {text}\n"))
                continue

            score = item["score"]
            if score >= 0.75:
                marker = "[bold green][✓][/bold green]"
            elif score >= 0.25:
                marker = "[bold yellow][~][/bold yellow]"
            else:
                marker = "[bold red][✗][/bold red]"
            renderables.append(
                Text.from_markup(f"  • {marker} ({score:.0%}) {text}\n")
            )

        return Group(*renderables)

    async def run(self):
        """Run the prompts, assemble the evaluation config and start the pipeline."""
        self.console.print(Panel.fit(Text(LOGO, style="bold cyan")))

        target_path, selected_func, description, rules = await self._prompt_for_target()
        if not target_path or not selected_func:
            return

        rules_to_run = await self._prompt_for_rule(rules)
        if not rules_to_run:
            self.console.print("[red]No rules selected. Exiting.[/red]")
            return

        depth, breadth, adversarial = await self._prompt_for_dynamic_config()

        rules_str = ", ".join(f"'{r}'" for r in rules_to_run)
        self.console.print(
            f"\n[bold green]Starting evaluation locally for rules: {rules_str}[/bold green]"
        )

        from adapters.llms.llm_client import LLMClient

        # Strip only the leading prefix; str.replace would also remove later
        # occurrences (e.g. "llm_vllm_base" -> "vbase").
        prefix = self._LLM_KEY_PREFIX
        llm_params = {
            k[len(prefix):]: v
            for k, v in self.config.items()
            if isinstance(k, str) and k.startswith(prefix) and k != "llm_model"
        }
        llm_client = LLMClient(model=self.config.get("llm_model"), **llm_params)

        runner = AgentRunner(
            executor_func=execute_temp_runner,
            target_path=target_path,
            selected_func=selected_func,
        )

        # Results are written next to the target file.
        agent_dir = os.path.dirname(os.path.abspath(target_path))
        results_path = os.path.join(agent_dir, "results")

        eval_config = EvaluationConfig(
            results_dir=results_path,
            rules=rules_to_run,
            runner=runner,
            generator_target_schema=selected_func.get("params", []),
            generator_target_code=selected_func.get("code", ""),
            depth=depth,
            breadth=breadth,
            adversarial_testing=adversarial,
            llm_client=llm_client,
            agent_description=description,
        )
        orchestrator = PipelineOrchestrator(eval_config)
        await orchestrator.run()
414
+
415
+
416
async def main():
    """Create the CLI application and run it to completion."""
    await VerifyApp().run()
419
+
420
+
421
def console_main():
    """Synchronous entry point: run ``main`` and report a Ctrl-C exit."""
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        pass
    else:
        return
    # Only reached when the run was interrupted from the keyboard.
    Console().print("\n[red]Exited.[/red]")


if __name__ == "__main__":
    console_main()
engine/__init__.py ADDED
File without changes
engine/config.py ADDED
@@ -0,0 +1,20 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any, List, Dict, Optional
3
+
4
+ from adapters.runners.agent_runner import AgentRunner
5
+
6
+
7
@dataclass
class EvaluationConfig:
    """Configuration for an evaluation run.

    Every field has a default, so ``EvaluationConfig()`` is valid; ``runner``
    and ``llm_client`` then stay ``None`` until the caller supplies them.
    """

    # Directory the run's results are associated with.
    results_dir: str = "results"
    # Rules to evaluate; a factory gives each instance its own list.
    rules: List[str] = field(default_factory=lambda: ["General Safety Policy"])
    # Optional because the default is None. Quoted so the name is resolved
    # lazily rather than when the class body is executed.
    runner: Optional["AgentRunner"] = None
    # Parameter descriptions of the target function, if known.
    generator_target_schema: Optional[List[Dict[str, Any]]] = None
    # Source code of the target function; empty when unavailable.
    generator_target_code: str = ""
    # LLM client object; its type is not constrained here.
    llm_client: Any = None
    # Depth and breadth levels, defaulting to "mid".
    depth: str = "mid"
    breadth: str = "mid"
    adversarial_testing: bool = False
    agent_description: str = ""
@@ -0,0 +1,3 @@
1
+ from .dimensions import PromptFeature
2
+
3
+ __all__ = ["PromptFeature"]
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class PromptFeature:
    """
    A single prompt feature dimension used for adversarial mutations.

    Attributes:
        name: The name of the feature.
        description: A short description of the feature.
        anchors: Mapping from coordinate level to a prompt example. Keys are
            converted to ``float`` after construction.
        weight: Float determining mutation application order (ascending = earlier).
    """

    name: str
    description: str
    anchors: dict[float, str]
    weight: float

    def __post_init__(self):
        # An empty or missing mapping is left exactly as given.
        if not self.anchors:
            return
        normalized = {}
        for level, example in self.anchors.items():
            normalized[float(level)] = example
        self.anchors = normalized