hyperplane-eval 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. hyperplane_eval-0.1.5/MANIFEST.in +6 -0
  2. hyperplane_eval-0.1.5/PKG-INFO +88 -0
  3. hyperplane_eval-0.1.5/README.md +54 -0
  4. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/llms/llm_client.py +4 -4
  5. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/runners/agent_runner.py +4 -8
  6. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/cli/app.py +116 -111
  7. hyperplane_eval-0.1.5/hyperplane_eval/engine/config.py +20 -0
  8. hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features/conversational_features.json +184 -0
  9. {hyperplane_eval-0.1.3/engine/stages → hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space}/input_space.py +44 -14
  10. hyperplane_eval-0.1.3/engine/stages/creator.py → hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space/input_space_factory.py +52 -42
  11. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/orchestrator.py +85 -82
  12. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/plane_evaluator.py +25 -27
  13. hyperplane_eval-0.1.5/hyperplane_eval/engine/stages/__init__.py +0 -0
  14. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/stages/evaluator.py +4 -5
  15. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/stages/generator.py +8 -24
  16. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/stages/navigator.py +30 -33
  17. hyperplane_eval-0.1.5/hyperplane_eval/prompts/__init__.py +0 -0
  18. hyperplane_eval-0.1.5/hyperplane_eval/prompts/adapters/llm/schema_prompt.txt +2 -0
  19. hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting/__init__.py +0 -0
  20. hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting/dimension_mitigation.txt +17 -0
  21. hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting/vulnerability_patch.txt +7 -0
  22. hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/__init__.py +0 -0
  23. hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/creator/__init__.py +0 -0
  24. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/anchors_sys.txt +1 -1
  25. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/brainstorm_sys.txt +2 -1
  26. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/refine_sys.txt +1 -1
  27. hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/evaluator/__init__.py +0 -0
  28. hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/generator/__init__.py +0 -0
  29. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/copyeditor_sys.txt +7 -1
  30. hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/generator/seed_sys.txt +9 -0
  31. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/seed_user.txt +1 -1
  32. hyperplane_eval-0.1.5/hyperplane_eval/reporting/__init__.py +0 -0
  33. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/reporting/analyser.py +408 -424
  34. hyperplane_eval-0.1.5/hyperplane_eval/reporting/templates/__init__.py +0 -0
  35. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/PKG-INFO +88 -0
  36. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/SOURCES.txt +71 -0
  37. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/entry_points.txt +2 -0
  38. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/top_level.txt +1 -0
  39. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5}/setup.py +2 -2
  40. hyperplane_eval-0.1.3/MANIFEST.in +0 -6
  41. hyperplane_eval-0.1.3/PKG-INFO +0 -143
  42. hyperplane_eval-0.1.3/README.md +0 -109
  43. hyperplane_eval-0.1.3/engine/config.py +0 -20
  44. hyperplane_eval-0.1.3/hyperplane_eval.egg-info/PKG-INFO +0 -143
  45. hyperplane_eval-0.1.3/hyperplane_eval.egg-info/SOURCES.txt +0 -58
  46. hyperplane_eval-0.1.3/hyperplane_eval.egg-info/entry_points.txt +0 -2
  47. hyperplane_eval-0.1.3/hyperplane_eval.egg-info/top_level.txt +0 -4
  48. hyperplane_eval-0.1.3/prompts/stages/generator/seed_sys.txt +0 -5
  49. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5}/LICENSE +0 -0
  50. {hyperplane_eval-0.1.3/adapters/llms → hyperplane_eval-0.1.5/hyperplane_eval}/__init__.py +0 -0
  51. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/__init__.py +0 -0
  52. {hyperplane_eval-0.1.3/adapters/local_bindings → hyperplane_eval-0.1.5/hyperplane_eval/adapters/llms}/__init__.py +0 -0
  53. {hyperplane_eval-0.1.3/adapters/runners → hyperplane_eval-0.1.5/hyperplane_eval/adapters/local_bindings}/__init__.py +0 -0
  54. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/local_bindings/executor.py +0 -0
  55. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/local_bindings/scanner.py +0 -0
  56. {hyperplane_eval-0.1.3/engine → hyperplane_eval-0.1.5/hyperplane_eval/adapters/runners}/__init__.py +0 -0
  57. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/cli/__init__.py +0 -0
  58. {hyperplane_eval-0.1.3/engine/stages → hyperplane_eval-0.1.5/hyperplane_eval/engine}/__init__.py +0 -0
  59. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/__init__.py +0 -0
  60. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/dimensions.py +0 -0
  61. /hyperplane_eval-0.1.3/engine/domain/predefined_features.json → /hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features/adversarial_features.json +0 -0
  62. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/__init__.py +0 -0
  63. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/base.py +0 -0
  64. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/evaluated.py +0 -0
  65. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/executed.py +0 -0
  66. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/synthesized.py +0 -0
  67. {hyperplane_eval-0.1.3/reporting → hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space}/__init__.py +0 -0
  68. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/prompt_loader.py +0 -0
  69. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/anchors_user.txt +0 -0
  70. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/brainstorm_user.txt +0 -0
  71. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/refine_user.txt +0 -0
  72. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/evaluator/judge.txt +0 -0
  73. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/continue_sys.txt +0 -0
  74. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/continue_user.txt +0 -0
  75. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/copyeditor_user.txt +0 -0
  76. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/eval_checks_sys.txt +0 -0
  77. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/eval_checks_user.txt +0 -0
  78. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5/hyperplane_eval}/reporting/templates/report_template.html +0 -0
  79. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5}/hyperplane_eval.egg-info/dependency_links.txt +0 -0
  80. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5}/hyperplane_eval.egg-info/requires.txt +0 -0
  81. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5}/requirements.txt +0 -0
  82. {hyperplane_eval-0.1.3 → hyperplane_eval-0.1.5}/setup.cfg +0 -0
@@ -0,0 +1,6 @@
1
+ include requirements.txt
2
+ include README.md
3
+ include LICENSE
4
+ recursive-include hyperplane_eval/prompts *.txt
5
+ recursive-include hyperplane_eval/engine/domain *.json
6
+ recursive-include hyperplane_eval/reporting/templates *.html
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: hyperplane-eval
3
+ Version: 0.1.5
4
+ Summary: A modular framework for evaluating and verifying agentic LLM outputs.
5
+ Author: Marten Panchev
6
+ Author-email: marten@aquithm.com
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: pydantic>=2.0.0
14
+ Requires-Dist: numpy>=1.24.0
15
+ Requires-Dist: scipy>=1.10.0
16
+ Requires-Dist: litellm>=1.0.0
17
+ Requires-Dist: aiohttp>=3.9.0
18
+ Requires-Dist: pandas>=2.0.0
19
+ Requires-Dist: scikit-learn>=1.2.0
20
+ Requires-Dist: openai>=1.0.0
21
+ Requires-Dist: pyngrok>=7.1.0
22
+ Requires-Dist: rich>=13.0.0
23
+ Requires-Dist: questionary>=2.0.0
24
+ Requires-Dist: PyYAML>=6.0.0
25
+ Dynamic: author
26
+ Dynamic: author-email
27
+ Dynamic: classifier
28
+ Dynamic: description
29
+ Dynamic: description-content-type
30
+ Dynamic: license-file
31
+ Dynamic: requires-dist
32
+ Dynamic: requires-python
33
+ Dynamic: summary
34
+
35
+ # Hyperplane Eval
36
+
37
+ Hyperplane Eval is a Python-based testing framework that helps you figure out exactly when and where your AI agents break. Instead of writing manual test cases, you give Hyperplane a target function and a set of rules, and it systematically generates edge cases to map out your agent's "Safe Polytope" — the operational volume where your agent is reliable.
38
+
39
+ ## 🚀 How It Works: Breadth-First Evaluation
40
+
41
+ Testing an AI agent is hard because the potential input space is infinite. Hyperplane solves this by breaking down inputs into "dimensions" of complexity (e.g., Urgency, Ambiguity, Formatting).
42
+
43
+ Instead of randomly guessing inputs, Hyperplane uses a **breadth-first evaluation** approach:
44
+ 1. **Dimension Extraction:** It automatically extracts relevant dimensions based on the rules you want to test.
45
+ 2. **Grid Generation:** It generates an evenly spread set of test scenarios across these dimensions (using low-discrepancy Sobol sequences for near-uniform coverage of the space).
46
+ 3. **Input Synthesis:** It uses a strong LLM to generate realistic user inputs that match those specific dimension coordinates.
47
+ 4. **Evaluation:** It executes your local agent code with the generated inputs, and evaluates the output against your rules using a Chain-of-Thought (CoT) judge.
48
+
49
+ By doing this breadth-first scan across multiple dimensions simultaneously, Hyperplane creates a mathematical map of your agent's reliability and calculates its "Reliability Coverage" as a clear, comparable percentage.
50
+
51
+ ## 🚦 CLI Integration
52
+
53
+ Hyperplane is incredibly easy to use. You don't need to write any complex evaluation scripts or boilerplate code; everything is handled through an interactive CLI.
54
+
55
+ ### Setup & Installation
56
+
57
+ Install the framework via pip:
58
+
59
+ ```bash
60
+ pip install hyperplane-eval
61
+ ```
62
+
63
+ ### Running the CLI
64
+
65
+ Run the interactive CLI directly in your terminal from inside your project directory:
66
+
67
+ ```bash
68
+ hyperplane
69
+ ```
70
+
71
+ The wizard will immediately guide you through the evaluation setup:
72
+ 1. **Target Selection:** It will automatically scan your local Python files and let you pick the function that acts as your agent's entry point.
73
+ 2. **Rule Definition:** You define the rules your agent must follow in plain English (e.g., "Never offer a refund over $50").
74
+ 3. **Configuration:** You configure the depth (how many points to test) and breadth (how many dimensions to extract).
75
+ 4. **Execution:** The framework will spin up workers, generate the test space, execute your local code, and render a real-time terminal dashboard.
76
+
77
+ Once complete, Hyperplane generates an interactive HTML report showing exactly which dimensions cause your agent to fail, allowing you to easily identify blind spots in your system prompts.
78
+
79
+ ## 🛠 Technology Stack
80
+ - **Language:** Python 3.10+
81
+ - **Data Modeling:** `pydantic`
82
+ - **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
83
+ - **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
84
+
85
+ ## 📄 License
86
+
87
+ This project is licensed under the Apache License, Version 2.0.
88
+ See the [LICENSE](LICENSE) file for more information.
@@ -0,0 +1,54 @@
1
+ # Hyperplane Eval
2
+
3
+ Hyperplane Eval is a Python-based testing framework that helps you figure out exactly when and where your AI agents break. Instead of writing manual test cases, you give Hyperplane a target function and a set of rules, and it systematically generates edge cases to map out your agent's "Safe Polytope" — the operational volume where your agent is reliable.
4
+
5
+ ## 🚀 How It Works: Breadth-First Evaluation
6
+
7
+ Testing an AI agent is hard because the potential input space is infinite. Hyperplane solves this by breaking down inputs into "dimensions" of complexity (e.g., Urgency, Ambiguity, Formatting).
8
+
9
+ Instead of randomly guessing inputs, Hyperplane uses a **breadth-first evaluation** approach:
10
+ 1. **Dimension Extraction:** It automatically extracts relevant dimensions based on the rules you want to test.
11
+ 2. **Grid Generation:** It generates an evenly spread set of test scenarios across these dimensions (using low-discrepancy Sobol sequences for near-uniform coverage of the space).
12
+ 3. **Input Synthesis:** It uses a strong LLM to generate realistic user inputs that match those specific dimension coordinates.
13
+ 4. **Evaluation:** It executes your local agent code with the generated inputs, and evaluates the output against your rules using a Chain-of-Thought (CoT) judge.
14
+
15
+ By doing this breadth-first scan across multiple dimensions simultaneously, Hyperplane creates a mathematical map of your agent's reliability and calculates its "Reliability Coverage" as a clear, comparable percentage.
16
+
17
+ ## 🚦 CLI Integration
18
+
19
+ Hyperplane is incredibly easy to use. You don't need to write any complex evaluation scripts or boilerplate code; everything is handled through an interactive CLI.
20
+
21
+ ### Setup & Installation
22
+
23
+ Install the framework via pip:
24
+
25
+ ```bash
26
+ pip install hyperplane-eval
27
+ ```
28
+
29
+ ### Running the CLI
30
+
31
+ Run the interactive CLI directly in your terminal from inside your project directory:
32
+
33
+ ```bash
34
+ hyperplane
35
+ ```
36
+
37
+ The wizard will immediately guide you through the evaluation setup:
38
+ 1. **Target Selection:** It will automatically scan your local Python files and let you pick the function that acts as your agent's entry point.
39
+ 2. **Rule Definition:** You define the rules your agent must follow in plain English (e.g., "Never offer a refund over $50").
40
+ 3. **Configuration:** You configure the depth (how many points to test) and breadth (how many dimensions to extract).
41
+ 4. **Execution:** The framework will spin up workers, generate the test space, execute your local code, and render a real-time terminal dashboard.
42
+
43
+ Once complete, Hyperplane generates an interactive HTML report showing exactly which dimensions cause your agent to fail, allowing you to easily identify blind spots in your system prompts.
44
+
45
+ ## 🛠 Technology Stack
46
+ - **Language:** Python 3.10+
47
+ - **Data Modeling:** `pydantic`
48
+ - **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
49
+ - **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
50
+
51
+ ## 📄 License
52
+
53
+ This project is licensed under the Apache License, Version 2.0.
54
+ See the [LICENSE](LICENSE) file for more information.
@@ -4,6 +4,7 @@ import re
4
4
  import asyncio
5
5
  from typing import Any, Dict
6
6
  from litellm import acompletion
7
+ from hyperplane_eval.engine.prompt_loader import load_prompt
7
8
 
8
9
 
9
10
  class LLMClient:
@@ -39,8 +40,8 @@ class LLMClient:
39
40
  response_schema: Dict[str, Any],
40
41
  temperature: float,
41
42
  ) -> str:
42
- if response_schema:
43
- prompt += f"\n\nYOU MUST RETURN A JSON OBJECT WITH THE EXACT FOLLOWING SCHEMA:\n{json.dumps(response_schema, indent=2)}"
43
+ schema_str = json.dumps(response_schema, indent=2)
44
+ prompt += "\n\n" + load_prompt("adapters/llm/schema_prompt", schema=schema_str)
44
45
 
45
46
  kwargs = {
46
47
  "model": self.model, # Force using the user-selected model
@@ -49,8 +50,7 @@ class LLMClient:
49
50
  **self.llm_kwargs,
50
51
  }
51
52
 
52
- if response_schema:
53
- kwargs["response_format"] = {"type": "json_object"}
53
+ kwargs["response_format"] = {"type": "json_object"}
54
54
 
55
55
  async with self._semaphore:
56
56
  try:
@@ -9,15 +9,15 @@ class AgentRunner:
9
9
 
10
10
  def __init__(
11
11
  self,
12
- executor_func: Callable = None,
13
- target_path: str = "",
14
- selected_func: dict = None,
12
+ executor_func: Callable,
13
+ target_path: str,
14
+ selected_func: dict,
15
15
  ):
16
16
  self.executor_func = executor_func
17
17
  self.target_path = target_path
18
18
  self.selected_func = selected_func
19
19
 
20
- async def _call_target_agent(self, messages: List[Dict[str, str]]) -> str:
20
+ async def call_target_agent(self, messages: List[Dict[str, str]]) -> str:
21
21
  """Dispatches a multi-turn request to the agent under evaluation."""
22
22
  if not messages:
23
23
  return ""
@@ -75,7 +75,3 @@ class AgentRunner:
75
75
  return f"Error: {str(e)}"
76
76
  else:
77
77
  return ""
78
-
79
- async def close(self):
80
- """No-op close method to satisfy framework expectation."""
81
- pass
@@ -7,7 +7,11 @@ from rich.text import Text
7
7
  from rich.panel import Panel
8
8
  from typing import Any
9
9
 
10
-
10
+ from hyperplane_eval.adapters.llms.llm_client import LLMClient
11
+ from hyperplane_eval.adapters.runners.agent_runner import AgentRunner
12
+ from hyperplane_eval.adapters.local_bindings.executor import execute_temp_runner
13
+ from hyperplane_eval.engine.config import EvaluationConfig
14
+ from hyperplane_eval.engine.orchestrator import PipelineOrchestrator
11
15
 
12
16
 
13
17
  LOGO = """
@@ -36,6 +40,106 @@ class VerifyApp:
36
40
  with open(self.config_file, "w") as f:
37
41
  yaml.dump(self.config, f)
38
42
 
43
+ @staticmethod
44
+ def update_dashboard_display(
45
+ active_scenarios: dict,
46
+ plane_input_space: Any,
47
+ scenarios_per_plane: int,
48
+ plane_features: list,
49
+ rule_idx: int,
50
+ rules_len: int,
51
+ plane_idx: int,
52
+ num_planes: int,
53
+ rule: str,
54
+ ) -> Group:
55
+ """Generates the CLI dashboard showing evaluation progress and scenario status."""
56
+ pct = min(1.0, len(plane_input_space.get_all_vectors()) / scenarios_per_plane)
57
+ bar = "█" * int(30 * pct) + "░" * (30 - int(30 * pct))
58
+ dims_str = ", ".join(f.name for f in plane_features)
59
+
60
+ renderables = []
61
+ renderables.append(
62
+ Text.from_markup(
63
+ f"[bold cyan]Rule [{rule_idx + 1}/{rules_len}] - Plane [{plane_idx + 1}/{num_planes}]:[/bold cyan] {rule[:80]}..."
64
+ )
65
+ )
66
+ renderables.append(Text.from_markup(f"[cyan]Dimensions:[/cyan] {dims_str}"))
67
+ renderables.append(
68
+ Text.from_markup(
69
+ f"[cyan]Progress:[/cyan] [{bar}] {pct:.0%} ({len(plane_input_space.get_all_vectors())}/{scenarios_per_plane})\n"
70
+ )
71
+ )
72
+
73
+ for item in list(active_scenarios.values())[-3:]:
74
+ if item["status"] == "Pending":
75
+ renderables.append(Text.from_markup(f" • {item['text']}\n"))
76
+ else:
77
+ score = item["score"]
78
+ if score >= 0.75:
79
+ marker = "[bold green][✓][/bold green]"
80
+ elif score >= 0.25:
81
+ marker = "[bold yellow][~][/bold yellow]"
82
+ else:
83
+ marker = "[bold red][✗][/bold red]"
84
+
85
+ renderables.append(
86
+ Text.from_markup(f" • {marker} ({score:.0%}) {item['text']}\n")
87
+ )
88
+
89
+ return Group(*renderables)
90
+
91
+ async def run(self):
92
+ self.console.print(Panel.fit(Text(LOGO, style="bold cyan")))
93
+
94
+ target_path, selected_func, description, rules = await self._prompt_for_target()
95
+ if not target_path or not selected_func:
96
+ return
97
+
98
+ rules_to_run = await self._prompt_for_rule(rules)
99
+ if not rules_to_run:
100
+ self.console.print("[red]No rules selected. Exiting.[/red]")
101
+ return
102
+
103
+ (
104
+ depth,
105
+ breadth,
106
+ adversarial,
107
+ conversational,
108
+ ) = await self._prompt_for_dynamic_config()
109
+
110
+ rules_str = ", ".join(f"'{r}'" for r in rules_to_run)
111
+ self.console.print(
112
+ f"\n[bold green]Starting evaluation locally for rules: {rules_str}[/bold green]"
113
+ )
114
+
115
+ llm_params = {
116
+ k.replace("llm_", ""): v
117
+ for k, v in self.config.items()
118
+ if k.startswith("llm_") and k != "llm_model"
119
+ }
120
+ llm_client = LLMClient(model=self.config.get("llm_model"), **llm_params)
121
+
122
+ runner = AgentRunner(
123
+ executor_func=execute_temp_runner,
124
+ target_path=target_path,
125
+ selected_func=selected_func,
126
+ )
127
+
128
+ eval_config = EvaluationConfig(
129
+ rules=rules_to_run,
130
+ runner=runner,
131
+ generator_target_schema=selected_func.get("params", []),
132
+ generator_target_code=selected_func.get("code", ""),
133
+ depth=depth,
134
+ breadth=breadth,
135
+ adversarial_testing=adversarial,
136
+ conversational_testing=conversational,
137
+ llm_client=llm_client,
138
+ agent_description=description,
139
+ )
140
+ orchestrator = PipelineOrchestrator(eval_config)
141
+ await orchestrator.run()
142
+
39
143
  async def _prompt_for_target(self):
40
144
  """Prompts the user to select or confirm the target file and function."""
41
145
  if self.config and "file" in self.config and "function" in self.config:
@@ -44,7 +148,8 @@ class VerifyApp:
44
148
  )
45
149
  use_existing = await questionary.confirm("Use this target?").ask_async()
46
150
  if use_existing:
47
- from adapters.local_bindings.scanner import extract_functions
151
+ from hyperplane_eval.adapters.local_bindings.scanner import extract_functions
152
+
48
153
  funcs = extract_functions(self.config["file"])
49
154
  selected_func = next(
50
155
  (f for f in funcs if f["name"] == self.config["function"]), None
@@ -82,7 +187,8 @@ class VerifyApp:
82
187
  return None, None, None, []
83
188
 
84
189
  self.console.print("[cyan]Scanning for functions...[/cyan]")
85
- from adapters.local_bindings.scanner import extract_functions
190
+ from hyperplane_eval.adapters.local_bindings.scanner import extract_functions
191
+
86
192
  funcs = extract_functions(target_path)
87
193
  if not funcs:
88
194
  self.console.print(
@@ -304,117 +410,16 @@ class VerifyApp:
304
410
  ).ask_async()
305
411
 
306
412
  self.config["adversarial_testing"] = adversarial
307
- self.save_config()
308
-
309
- return depth, breadth, adversarial
310
413
 
311
- @staticmethod
312
- def update_dashboard_display(
313
- active_scenarios: dict,
314
- plane_input_space: Any,
315
- scenarios_per_plane: int,
316
- plane_features: list,
317
- rule_idx: int,
318
- rules_len: int,
319
- plane_idx: int,
320
- num_planes: int,
321
- rule: str,
322
- ) -> Group:
323
- """Generates the CLI dashboard showing evaluation progress and scenario status."""
324
- pct = min(1.0, len(plane_input_space.get_all_vectors()) / scenarios_per_plane)
325
- bar = "█" * int(30 * pct) + "░" * (30 - int(30 * pct))
326
- dims_str = ", ".join(f.name for f in plane_features)
327
-
328
- renderables = []
329
- renderables.append(
330
- Text.from_markup(
331
- f"[bold cyan]Rule [{rule_idx + 1}/{rules_len}] - Plane [{plane_idx + 1}/{num_planes}]:[/bold cyan] {rule[:80]}..."
332
- )
333
- )
334
- renderables.append(Text.from_markup(f"[cyan]Dimensions:[/cyan] {dims_str}"))
335
- renderables.append(
336
- Text.from_markup(
337
- f"[cyan]Progress:[/cyan] [{bar}] {pct:.0%} ({len(plane_input_space.get_all_vectors())}/{scenarios_per_plane})\n"
338
- )
339
- )
340
-
341
- for item in list(active_scenarios.values())[-3:]:
342
- if item["status"] == "Pending":
343
- renderables.append(Text.from_markup(f" • {item['text']}\n"))
344
- else:
345
- score = item["score"]
346
- if score >= 0.75:
347
- marker = "[bold green][✓][/bold green]"
348
- elif score >= 0.25:
349
- marker = "[bold yellow][~][/bold yellow]"
350
- else:
351
- marker = "[bold red][✗][/bold red]"
352
-
353
- renderables.append(
354
- Text.from_markup(f" • {marker} ({score:.0%}) {item['text']}\n")
355
- )
356
-
357
- return Group(*renderables)
358
-
359
- async def run(self):
360
- self.console.print(Panel.fit(Text(LOGO, style="bold cyan")))
361
-
362
- target_path, selected_func, description, rules = await self._prompt_for_target()
363
- if not target_path or not selected_func:
364
- return
365
-
366
- rules_to_run = await self._prompt_for_rule(rules)
367
- if not rules_to_run:
368
- self.console.print("[red]No rules selected. Exiting.[/red]")
369
- return
370
-
371
- depth, breadth, adversarial = await self._prompt_for_dynamic_config()
372
-
373
- rules_str = ", ".join(f"'{r}'" for r in rules_to_run)
374
- self.console.print(
375
- f"\n[bold green]Starting evaluation locally for rules: {rules_str}[/bold green]"
376
- )
377
-
378
- from adapters.llms.llm_client import LLMClient
379
-
380
- llm_params = {
381
- k.replace("llm_", ""): v
382
- for k, v in self.config.items()
383
- if k.startswith("llm_") and k != "llm_model"
384
- }
385
- llm_client = LLMClient(model=self.config.get("llm_model"), **llm_params)
386
-
387
- from adapters.runners.agent_runner import AgentRunner
388
- from adapters.local_bindings.executor import execute_temp_runner
389
-
390
- runner = AgentRunner(
391
- executor_func=execute_temp_runner,
392
- target_path=target_path,
393
- selected_func=selected_func,
394
- )
395
-
396
- import os
397
-
398
- agent_dir = os.path.dirname(os.path.abspath(target_path))
399
- results_path = os.path.join(agent_dir, "results")
414
+ conversational = await questionary.confirm(
415
+ "Enable Conversational Testing? (Injects natural conversational artifacts like dictation errors, multi-tasking, etc.)",
416
+ default=self.config.get("conversational_testing", False),
417
+ ).ask_async()
400
418
 
401
- from engine.config import EvaluationConfig
402
- from engine.orchestrator import PipelineOrchestrator
419
+ self.config["conversational_testing"] = conversational
420
+ self.save_config()
403
421
 
404
- eval_config = EvaluationConfig(
405
- results_dir=results_path,
406
- rules=rules_to_run,
407
- runner=runner,
408
- generator_target_schema=selected_func.get("params", []),
409
- generator_target_code=selected_func.get("code", ""),
410
- depth=depth,
411
- breadth=breadth,
412
- adversarial_testing=adversarial,
413
- llm_client=llm_client,
414
- agent_description=description,
415
- )
416
- orchestrator = PipelineOrchestrator(eval_config)
417
- await orchestrator.run()
422
+ return depth, breadth, adversarial, conversational
418
423
 
419
424
 
420
425
  async def main():
@@ -0,0 +1,20 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, List, Dict
3
+
4
+ from hyperplane_eval.adapters.runners.agent_runner import AgentRunner
5
+
6
+
7
+ @dataclass
8
+ class EvaluationConfig:
9
+ """Configuration for an evaluation run."""
10
+
11
+ rules: List[str]
12
+ runner: AgentRunner
13
+ generator_target_schema: List[Dict[str, Any]]
14
+ generator_target_code: str
15
+ llm_client: Any
16
+ depth: str
17
+ breadth: str
18
+ adversarial_testing: bool
19
+ conversational_testing: bool
20
+ agent_description: str