hyperplane-eval 0.1.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hyperplane_eval-0.1.5/MANIFEST.in +6 -0
- hyperplane_eval-0.1.5/PKG-INFO +88 -0
- hyperplane_eval-0.1.5/README.md +54 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/llms/llm_client.py +4 -4
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/runners/agent_runner.py +4 -8
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/cli/app.py +116 -111
- hyperplane_eval-0.1.5/hyperplane_eval/engine/config.py +20 -0
- hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features/conversational_features.json +184 -0
- {hyperplane_eval-0.1.4/engine/stages → hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space}/input_space.py +44 -14
- hyperplane_eval-0.1.4/engine/stages/creator.py → hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space/input_space_factory.py +52 -42
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/orchestrator.py +85 -82
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/plane_evaluator.py +25 -27
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/stages/evaluator.py +4 -5
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/stages/generator.py +8 -24
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/stages/navigator.py +30 -33
- hyperplane_eval-0.1.5/hyperplane_eval/prompts/adapters/llm/schema_prompt.txt +2 -0
- hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting/dimension_mitigation.txt +17 -0
- hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting/vulnerability_patch.txt +7 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/anchors_sys.txt +1 -1
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/brainstorm_sys.txt +2 -1
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/refine_sys.txt +1 -1
- hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/generator/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/copyeditor_sys.txt +7 -1
- hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/generator/seed_sys.txt +9 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/seed_user.txt +1 -1
- hyperplane_eval-0.1.5/hyperplane_eval/reporting/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/reporting/analyser.py +408 -424
- hyperplane_eval-0.1.5/hyperplane_eval/reporting/templates/__init__.py +0 -0
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/PKG-INFO +88 -0
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/SOURCES.txt +71 -0
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/entry_points.txt +2 -0
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/top_level.txt +1 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5}/setup.py +2 -2
- hyperplane_eval-0.1.4/MANIFEST.in +0 -6
- hyperplane_eval-0.1.4/PKG-INFO +0 -143
- hyperplane_eval-0.1.4/README.md +0 -109
- hyperplane_eval-0.1.4/engine/config.py +0 -20
- hyperplane_eval-0.1.4/hyperplane_eval.egg-info/PKG-INFO +0 -143
- hyperplane_eval-0.1.4/hyperplane_eval.egg-info/SOURCES.txt +0 -64
- hyperplane_eval-0.1.4/hyperplane_eval.egg-info/entry_points.txt +0 -2
- hyperplane_eval-0.1.4/hyperplane_eval.egg-info/top_level.txt +0 -5
- hyperplane_eval-0.1.4/prompts/stages/generator/seed_sys.txt +0 -5
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5}/LICENSE +0 -0
- {hyperplane_eval-0.1.4/adapters/llms → hyperplane_eval-0.1.5/hyperplane_eval}/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/__init__.py +0 -0
- {hyperplane_eval-0.1.4/adapters/local_bindings → hyperplane_eval-0.1.5/hyperplane_eval/adapters/llms}/__init__.py +0 -0
- {hyperplane_eval-0.1.4/adapters/runners → hyperplane_eval-0.1.5/hyperplane_eval/adapters/local_bindings}/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/local_bindings/executor.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/local_bindings/scanner.py +0 -0
- {hyperplane_eval-0.1.4/engine → hyperplane_eval-0.1.5/hyperplane_eval/adapters/runners}/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/cli/__init__.py +0 -0
- {hyperplane_eval-0.1.4/engine/stages → hyperplane_eval-0.1.5/hyperplane_eval/engine}/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/dimensions.py +0 -0
- /hyperplane_eval-0.1.4/engine/domain/predefined_features.json → /hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features/adversarial_features.json +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/base.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/evaluated.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/executed.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/domain/vectors/synthesized.py +0 -0
- {hyperplane_eval-0.1.4/prompts → hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space}/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/engine/prompt_loader.py +0 -0
- {hyperplane_eval-0.1.4/prompts → hyperplane_eval-0.1.5/hyperplane_eval/engine}/stages/__init__.py +0 -0
- {hyperplane_eval-0.1.4/prompts/stages/creator → hyperplane_eval-0.1.5/hyperplane_eval/prompts}/__init__.py +0 -0
- {hyperplane_eval-0.1.4/prompts/stages/evaluator → hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting}/__init__.py +0 -0
- {hyperplane_eval-0.1.4/prompts/stages/generator → hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages}/__init__.py +0 -0
- {hyperplane_eval-0.1.4/reporting → hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/creator}/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/anchors_user.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/brainstorm_user.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/creator/refine_user.txt +0 -0
- {hyperplane_eval-0.1.4/reporting/templates → hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/evaluator}/__init__.py +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/evaluator/judge.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/continue_sys.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/continue_user.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/copyeditor_user.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/eval_checks_sys.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/prompts/stages/generator/eval_checks_user.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/reporting/templates/report_template.html +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5}/hyperplane_eval.egg-info/dependency_links.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5}/hyperplane_eval.egg-info/requires.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5}/requirements.txt +0 -0
- {hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5}/setup.cfg +0 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hyperplane-eval
|
|
3
|
+
Version: 0.1.5
|
|
4
|
+
Summary: A modular framework for evaluating and verifying agentic LLM outputs.
|
|
5
|
+
Author: Marten Panchev
|
|
6
|
+
Author-email: marten@aquithm.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: pydantic>=2.0.0
|
|
14
|
+
Requires-Dist: numpy>=1.24.0
|
|
15
|
+
Requires-Dist: scipy>=1.10.0
|
|
16
|
+
Requires-Dist: litellm>=1.0.0
|
|
17
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
18
|
+
Requires-Dist: pandas>=2.0.0
|
|
19
|
+
Requires-Dist: scikit-learn>=1.2.0
|
|
20
|
+
Requires-Dist: openai>=1.0.0
|
|
21
|
+
Requires-Dist: pyngrok>=7.1.0
|
|
22
|
+
Requires-Dist: rich>=13.0.0
|
|
23
|
+
Requires-Dist: questionary>=2.0.0
|
|
24
|
+
Requires-Dist: PyYAML>=6.0.0
|
|
25
|
+
Dynamic: author
|
|
26
|
+
Dynamic: author-email
|
|
27
|
+
Dynamic: classifier
|
|
28
|
+
Dynamic: description
|
|
29
|
+
Dynamic: description-content-type
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
Dynamic: requires-dist
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
Dynamic: summary
|
|
34
|
+
|
|
35
|
+
# Hyperplane Eval
|
|
36
|
+
|
|
37
|
+
Hyperplane Eval is a Python-based testing framework that helps you figure out exactly when and where your AI agents break. Instead of writing manual test cases, you give Hyperplane a target function and a set of rules, and it systematically generates edge cases to map out your agent's "Safe Polytope" — the operational volume where your agent is reliable.
|
|
38
|
+
|
|
39
|
+
## 🚀 How It Works: Breadth-First Evaluation
|
|
40
|
+
|
|
41
|
+
Testing an AI agent is hard because the potential input space is infinite. Hyperplane solves this by breaking down inputs into "dimensions" of complexity (e.g., Urgency, Ambiguity, Formatting).
|
|
42
|
+
|
|
43
|
+
Instead of randomly guessing inputs, Hyperplane uses a **breadth-first evaluation** approach:
|
|
44
|
+
1. **Dimension Extraction:** It automatically extracts relevant dimensions based on the rules you want to test.
|
|
45
|
+
2. **Grid Generation:** It generates a set of test scenarios spread evenly across these dimensions (using Sobol sequences, which give low-discrepancy, near-uniform coverage of the space).
|
|
46
|
+
3. **Input Synthesis:** It uses a strong LLM to generate realistic user inputs that match those specific dimension coordinates.
|
|
47
|
+
4. **Evaluation:** It executes your local agent code with the generated inputs, and evaluates the output against your rules using a Chain-of-Thought (CoT) judge.
|
|
48
|
+
|
|
49
|
+
By doing this breadth-first scan across multiple dimensions simultaneously, Hyperplane creates a mathematical map of your agent's reliability and calculates its "Reliability Coverage" as a clear, comparable percentage.
|
|
50
|
+
|
|
51
|
+
## 🚦 CLI Integration
|
|
52
|
+
|
|
53
|
+
Hyperplane is incredibly easy to use. You don't need to write any complex evaluation scripts or boilerplate code; everything is handled through an interactive CLI.
|
|
54
|
+
|
|
55
|
+
### Setup & Installation
|
|
56
|
+
|
|
57
|
+
Install the framework via pip:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install hyperplane-eval
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Running the CLI
|
|
64
|
+
|
|
65
|
+
Run the interactive CLI directly in your terminal from inside your project directory:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
hyperplane
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
The wizard will immediately guide you through the evaluation setup:
|
|
72
|
+
1. **Target Selection:** It will automatically scan your local Python files and let you pick the function that acts as your agent's entry point.
|
|
73
|
+
2. **Rule Definition:** You define the rules your agent must follow in plain English (e.g., "Never offer a refund over $50").
|
|
74
|
+
3. **Configuration:** You configure the depth (how many points to test) and breadth (how many dimensions to extract).
|
|
75
|
+
4. **Execution:** The framework will spin up workers, generate the test space, execute your local code, and render a real-time terminal dashboard.
|
|
76
|
+
|
|
77
|
+
Once complete, Hyperplane generates an interactive HTML report showing exactly which dimensions cause your agent to fail, allowing you to easily identify blind spots in your system prompts.
|
|
78
|
+
|
|
79
|
+
## 🛠 Technology Stack
|
|
80
|
+
- **Language:** Python 3.10+
|
|
81
|
+
- **Data Modeling:** `pydantic`
|
|
82
|
+
- **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
|
|
83
|
+
- **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
|
|
84
|
+
|
|
85
|
+
## 📄 License
|
|
86
|
+
|
|
87
|
+
This project is licensed under the Apache License, Version 2.0.
|
|
88
|
+
See the [LICENSE](LICENSE) file for more information.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Hyperplane Eval
|
|
2
|
+
|
|
3
|
+
Hyperplane Eval is a Python-based testing framework that helps you figure out exactly when and where your AI agents break. Instead of writing manual test cases, you give Hyperplane a target function and a set of rules, and it systematically generates edge cases to map out your agent's "Safe Polytope" — the operational volume where your agent is reliable.
|
|
4
|
+
|
|
5
|
+
## 🚀 How It Works: Breadth-First Evaluation
|
|
6
|
+
|
|
7
|
+
Testing an AI agent is hard because the potential input space is infinite. Hyperplane solves this by breaking down inputs into "dimensions" of complexity (e.g., Urgency, Ambiguity, Formatting).
|
|
8
|
+
|
|
9
|
+
Instead of randomly guessing inputs, Hyperplane uses a **breadth-first evaluation** approach:
|
|
10
|
+
1. **Dimension Extraction:** It automatically extracts relevant dimensions based on the rules you want to test.
|
|
11
|
+
2. **Grid Generation:** It generates a set of test scenarios spread evenly across these dimensions (using Sobol sequences, which give low-discrepancy, near-uniform coverage of the space).
|
|
12
|
+
3. **Input Synthesis:** It uses a strong LLM to generate realistic user inputs that match those specific dimension coordinates.
|
|
13
|
+
4. **Evaluation:** It executes your local agent code with the generated inputs, and evaluates the output against your rules using a Chain-of-Thought (CoT) judge.
|
|
14
|
+
|
|
15
|
+
By doing this breadth-first scan across multiple dimensions simultaneously, Hyperplane creates a mathematical map of your agent's reliability and calculates its "Reliability Coverage" as a clear, comparable percentage.
|
|
16
|
+
|
|
17
|
+
## 🚦 CLI Integration
|
|
18
|
+
|
|
19
|
+
Hyperplane is incredibly easy to use. You don't need to write any complex evaluation scripts or boilerplate code; everything is handled through an interactive CLI.
|
|
20
|
+
|
|
21
|
+
### Setup & Installation
|
|
22
|
+
|
|
23
|
+
Install the framework via pip:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install hyperplane-eval
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Running the CLI
|
|
30
|
+
|
|
31
|
+
Run the interactive CLI directly in your terminal from inside your project directory:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
hyperplane
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
The wizard will immediately guide you through the evaluation setup:
|
|
38
|
+
1. **Target Selection:** It will automatically scan your local Python files and let you pick the function that acts as your agent's entry point.
|
|
39
|
+
2. **Rule Definition:** You define the rules your agent must follow in plain English (e.g., "Never offer a refund over $50").
|
|
40
|
+
3. **Configuration:** You configure the depth (how many points to test) and breadth (how many dimensions to extract).
|
|
41
|
+
4. **Execution:** The framework will spin up workers, generate the test space, execute your local code, and render a real-time terminal dashboard.
|
|
42
|
+
|
|
43
|
+
Once complete, Hyperplane generates an interactive HTML report showing exactly which dimensions cause your agent to fail, allowing you to easily identify blind spots in your system prompts.
|
|
44
|
+
|
|
45
|
+
## 🛠 Technology Stack
|
|
46
|
+
- **Language:** Python 3.10+
|
|
47
|
+
- **Data Modeling:** `pydantic`
|
|
48
|
+
- **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
|
|
49
|
+
- **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
|
|
50
|
+
|
|
51
|
+
## 📄 License
|
|
52
|
+
|
|
53
|
+
This project is licensed under the Apache License, Version 2.0.
|
|
54
|
+
See the [LICENSE](LICENSE) file for more information.
|
|
@@ -4,6 +4,7 @@ import re
|
|
|
4
4
|
import asyncio
|
|
5
5
|
from typing import Any, Dict
|
|
6
6
|
from litellm import acompletion
|
|
7
|
+
from hyperplane_eval.engine.prompt_loader import load_prompt
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class LLMClient:
|
|
@@ -39,8 +40,8 @@ class LLMClient:
|
|
|
39
40
|
response_schema: Dict[str, Any],
|
|
40
41
|
temperature: float,
|
|
41
42
|
) -> str:
|
|
42
|
-
|
|
43
|
-
|
|
43
|
+
schema_str = json.dumps(response_schema, indent=2)
|
|
44
|
+
prompt += "\n\n" + load_prompt("adapters/llm/schema_prompt", schema=schema_str)
|
|
44
45
|
|
|
45
46
|
kwargs = {
|
|
46
47
|
"model": self.model, # Force using the user-selected model
|
|
@@ -49,8 +50,7 @@ class LLMClient:
|
|
|
49
50
|
**self.llm_kwargs,
|
|
50
51
|
}
|
|
51
52
|
|
|
52
|
-
|
|
53
|
-
kwargs["response_format"] = {"type": "json_object"}
|
|
53
|
+
kwargs["response_format"] = {"type": "json_object"}
|
|
54
54
|
|
|
55
55
|
async with self._semaphore:
|
|
56
56
|
try:
|
{hyperplane_eval-0.1.4 → hyperplane_eval-0.1.5/hyperplane_eval}/adapters/runners/agent_runner.py
RENAMED
|
@@ -9,15 +9,15 @@ class AgentRunner:
|
|
|
9
9
|
|
|
10
10
|
def __init__(
|
|
11
11
|
self,
|
|
12
|
-
executor_func: Callable
|
|
13
|
-
target_path: str
|
|
14
|
-
selected_func: dict
|
|
12
|
+
executor_func: Callable,
|
|
13
|
+
target_path: str,
|
|
14
|
+
selected_func: dict,
|
|
15
15
|
):
|
|
16
16
|
self.executor_func = executor_func
|
|
17
17
|
self.target_path = target_path
|
|
18
18
|
self.selected_func = selected_func
|
|
19
19
|
|
|
20
|
-
async def
|
|
20
|
+
async def call_target_agent(self, messages: List[Dict[str, str]]) -> str:
|
|
21
21
|
"""Dispatches a multi-turn request to the agent under evaluation."""
|
|
22
22
|
if not messages:
|
|
23
23
|
return ""
|
|
@@ -75,7 +75,3 @@ class AgentRunner:
|
|
|
75
75
|
return f"Error: {str(e)}"
|
|
76
76
|
else:
|
|
77
77
|
return ""
|
|
78
|
-
|
|
79
|
-
async def close(self):
|
|
80
|
-
"""No-op close method to satisfy framework expectation."""
|
|
81
|
-
pass
|
|
@@ -7,7 +7,11 @@ from rich.text import Text
|
|
|
7
7
|
from rich.panel import Panel
|
|
8
8
|
from typing import Any
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
from hyperplane_eval.adapters.llms.llm_client import LLMClient
|
|
11
|
+
from hyperplane_eval.adapters.runners.agent_runner import AgentRunner
|
|
12
|
+
from hyperplane_eval.adapters.local_bindings.executor import execute_temp_runner
|
|
13
|
+
from hyperplane_eval.engine.config import EvaluationConfig
|
|
14
|
+
from hyperplane_eval.engine.orchestrator import PipelineOrchestrator
|
|
11
15
|
|
|
12
16
|
|
|
13
17
|
LOGO = """
|
|
@@ -36,6 +40,106 @@ class VerifyApp:
|
|
|
36
40
|
with open(self.config_file, "w") as f:
|
|
37
41
|
yaml.dump(self.config, f)
|
|
38
42
|
|
|
43
|
+
@staticmethod
|
|
44
|
+
def update_dashboard_display(
|
|
45
|
+
active_scenarios: dict,
|
|
46
|
+
plane_input_space: Any,
|
|
47
|
+
scenarios_per_plane: int,
|
|
48
|
+
plane_features: list,
|
|
49
|
+
rule_idx: int,
|
|
50
|
+
rules_len: int,
|
|
51
|
+
plane_idx: int,
|
|
52
|
+
num_planes: int,
|
|
53
|
+
rule: str,
|
|
54
|
+
) -> Group:
|
|
55
|
+
"""Generates the CLI dashboard showing evaluation progress and scenario status."""
|
|
56
|
+
pct = min(1.0, len(plane_input_space.get_all_vectors()) / scenarios_per_plane)
|
|
57
|
+
bar = "█" * int(30 * pct) + "░" * (30 - int(30 * pct))
|
|
58
|
+
dims_str = ", ".join(f.name for f in plane_features)
|
|
59
|
+
|
|
60
|
+
renderables = []
|
|
61
|
+
renderables.append(
|
|
62
|
+
Text.from_markup(
|
|
63
|
+
f"[bold cyan]Rule [{rule_idx + 1}/{rules_len}] - Plane [{plane_idx + 1}/{num_planes}]:[/bold cyan] {rule[:80]}..."
|
|
64
|
+
)
|
|
65
|
+
)
|
|
66
|
+
renderables.append(Text.from_markup(f"[cyan]Dimensions:[/cyan] {dims_str}"))
|
|
67
|
+
renderables.append(
|
|
68
|
+
Text.from_markup(
|
|
69
|
+
f"[cyan]Progress:[/cyan] [{bar}] {pct:.0%} ({len(plane_input_space.get_all_vectors())}/{scenarios_per_plane})\n"
|
|
70
|
+
)
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
for item in list(active_scenarios.values())[-3:]:
|
|
74
|
+
if item["status"] == "Pending":
|
|
75
|
+
renderables.append(Text.from_markup(f" • {item['text']}\n"))
|
|
76
|
+
else:
|
|
77
|
+
score = item["score"]
|
|
78
|
+
if score >= 0.75:
|
|
79
|
+
marker = "[bold green][✓][/bold green]"
|
|
80
|
+
elif score >= 0.25:
|
|
81
|
+
marker = "[bold yellow][~][/bold yellow]"
|
|
82
|
+
else:
|
|
83
|
+
marker = "[bold red][✗][/bold red]"
|
|
84
|
+
|
|
85
|
+
renderables.append(
|
|
86
|
+
Text.from_markup(f" • {marker} ({score:.0%}) {item['text']}\n")
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
return Group(*renderables)
|
|
90
|
+
|
|
91
|
+
async def run(self):
|
|
92
|
+
self.console.print(Panel.fit(Text(LOGO, style="bold cyan")))
|
|
93
|
+
|
|
94
|
+
target_path, selected_func, description, rules = await self._prompt_for_target()
|
|
95
|
+
if not target_path or not selected_func:
|
|
96
|
+
return
|
|
97
|
+
|
|
98
|
+
rules_to_run = await self._prompt_for_rule(rules)
|
|
99
|
+
if not rules_to_run:
|
|
100
|
+
self.console.print("[red]No rules selected. Exiting.[/red]")
|
|
101
|
+
return
|
|
102
|
+
|
|
103
|
+
(
|
|
104
|
+
depth,
|
|
105
|
+
breadth,
|
|
106
|
+
adversarial,
|
|
107
|
+
conversational,
|
|
108
|
+
) = await self._prompt_for_dynamic_config()
|
|
109
|
+
|
|
110
|
+
rules_str = ", ".join(f"'{r}'" for r in rules_to_run)
|
|
111
|
+
self.console.print(
|
|
112
|
+
f"\n[bold green]Starting evaluation locally for rules: {rules_str}[/bold green]"
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
llm_params = {
|
|
116
|
+
k.replace("llm_", ""): v
|
|
117
|
+
for k, v in self.config.items()
|
|
118
|
+
if k.startswith("llm_") and k != "llm_model"
|
|
119
|
+
}
|
|
120
|
+
llm_client = LLMClient(model=self.config.get("llm_model"), **llm_params)
|
|
121
|
+
|
|
122
|
+
runner = AgentRunner(
|
|
123
|
+
executor_func=execute_temp_runner,
|
|
124
|
+
target_path=target_path,
|
|
125
|
+
selected_func=selected_func,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
eval_config = EvaluationConfig(
|
|
129
|
+
rules=rules_to_run,
|
|
130
|
+
runner=runner,
|
|
131
|
+
generator_target_schema=selected_func.get("params", []),
|
|
132
|
+
generator_target_code=selected_func.get("code", ""),
|
|
133
|
+
depth=depth,
|
|
134
|
+
breadth=breadth,
|
|
135
|
+
adversarial_testing=adversarial,
|
|
136
|
+
conversational_testing=conversational,
|
|
137
|
+
llm_client=llm_client,
|
|
138
|
+
agent_description=description,
|
|
139
|
+
)
|
|
140
|
+
orchestrator = PipelineOrchestrator(eval_config)
|
|
141
|
+
await orchestrator.run()
|
|
142
|
+
|
|
39
143
|
async def _prompt_for_target(self):
|
|
40
144
|
"""Prompts the user to select or confirm the target file and function."""
|
|
41
145
|
if self.config and "file" in self.config and "function" in self.config:
|
|
@@ -44,7 +148,8 @@ class VerifyApp:
|
|
|
44
148
|
)
|
|
45
149
|
use_existing = await questionary.confirm("Use this target?").ask_async()
|
|
46
150
|
if use_existing:
|
|
47
|
-
from adapters.local_bindings.scanner import extract_functions
|
|
151
|
+
from hyperplane_eval.adapters.local_bindings.scanner import extract_functions
|
|
152
|
+
|
|
48
153
|
funcs = extract_functions(self.config["file"])
|
|
49
154
|
selected_func = next(
|
|
50
155
|
(f for f in funcs if f["name"] == self.config["function"]), None
|
|
@@ -82,7 +187,8 @@ class VerifyApp:
|
|
|
82
187
|
return None, None, None, []
|
|
83
188
|
|
|
84
189
|
self.console.print("[cyan]Scanning for functions...[/cyan]")
|
|
85
|
-
from adapters.local_bindings.scanner import extract_functions
|
|
190
|
+
from hyperplane_eval.adapters.local_bindings.scanner import extract_functions
|
|
191
|
+
|
|
86
192
|
funcs = extract_functions(target_path)
|
|
87
193
|
if not funcs:
|
|
88
194
|
self.console.print(
|
|
@@ -304,117 +410,16 @@ class VerifyApp:
|
|
|
304
410
|
).ask_async()
|
|
305
411
|
|
|
306
412
|
self.config["adversarial_testing"] = adversarial
|
|
307
|
-
self.save_config()
|
|
308
|
-
|
|
309
|
-
return depth, breadth, adversarial
|
|
310
413
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
scenarios_per_plane: int,
|
|
316
|
-
plane_features: list,
|
|
317
|
-
rule_idx: int,
|
|
318
|
-
rules_len: int,
|
|
319
|
-
plane_idx: int,
|
|
320
|
-
num_planes: int,
|
|
321
|
-
rule: str,
|
|
322
|
-
) -> Group:
|
|
323
|
-
"""Generates the CLI dashboard showing evaluation progress and scenario status."""
|
|
324
|
-
pct = min(1.0, len(plane_input_space.get_all_vectors()) / scenarios_per_plane)
|
|
325
|
-
bar = "█" * int(30 * pct) + "░" * (30 - int(30 * pct))
|
|
326
|
-
dims_str = ", ".join(f.name for f in plane_features)
|
|
327
|
-
|
|
328
|
-
renderables = []
|
|
329
|
-
renderables.append(
|
|
330
|
-
Text.from_markup(
|
|
331
|
-
f"[bold cyan]Rule [{rule_idx + 1}/{rules_len}] - Plane [{plane_idx + 1}/{num_planes}]:[/bold cyan] {rule[:80]}..."
|
|
332
|
-
)
|
|
333
|
-
)
|
|
334
|
-
renderables.append(Text.from_markup(f"[cyan]Dimensions:[/cyan] {dims_str}"))
|
|
335
|
-
renderables.append(
|
|
336
|
-
Text.from_markup(
|
|
337
|
-
f"[cyan]Progress:[/cyan] [{bar}] {pct:.0%} ({len(plane_input_space.get_all_vectors())}/{scenarios_per_plane})\n"
|
|
338
|
-
)
|
|
339
|
-
)
|
|
340
|
-
|
|
341
|
-
for item in list(active_scenarios.values())[-3:]:
|
|
342
|
-
if item["status"] == "Pending":
|
|
343
|
-
renderables.append(Text.from_markup(f" • {item['text']}\n"))
|
|
344
|
-
else:
|
|
345
|
-
score = item["score"]
|
|
346
|
-
if score >= 0.75:
|
|
347
|
-
marker = "[bold green][✓][/bold green]"
|
|
348
|
-
elif score >= 0.25:
|
|
349
|
-
marker = "[bold yellow][~][/bold yellow]"
|
|
350
|
-
else:
|
|
351
|
-
marker = "[bold red][✗][/bold red]"
|
|
352
|
-
|
|
353
|
-
renderables.append(
|
|
354
|
-
Text.from_markup(f" • {marker} ({score:.0%}) {item['text']}\n")
|
|
355
|
-
)
|
|
356
|
-
|
|
357
|
-
return Group(*renderables)
|
|
358
|
-
|
|
359
|
-
async def run(self):
|
|
360
|
-
self.console.print(Panel.fit(Text(LOGO, style="bold cyan")))
|
|
361
|
-
|
|
362
|
-
target_path, selected_func, description, rules = await self._prompt_for_target()
|
|
363
|
-
if not target_path or not selected_func:
|
|
364
|
-
return
|
|
365
|
-
|
|
366
|
-
rules_to_run = await self._prompt_for_rule(rules)
|
|
367
|
-
if not rules_to_run:
|
|
368
|
-
self.console.print("[red]No rules selected. Exiting.[/red]")
|
|
369
|
-
return
|
|
370
|
-
|
|
371
|
-
depth, breadth, adversarial = await self._prompt_for_dynamic_config()
|
|
372
|
-
|
|
373
|
-
rules_str = ", ".join(f"'{r}'" for r in rules_to_run)
|
|
374
|
-
self.console.print(
|
|
375
|
-
f"\n[bold green]Starting evaluation locally for rules: {rules_str}[/bold green]"
|
|
376
|
-
)
|
|
377
|
-
|
|
378
|
-
from adapters.llms.llm_client import LLMClient
|
|
379
|
-
|
|
380
|
-
llm_params = {
|
|
381
|
-
k.replace("llm_", ""): v
|
|
382
|
-
for k, v in self.config.items()
|
|
383
|
-
if k.startswith("llm_") and k != "llm_model"
|
|
384
|
-
}
|
|
385
|
-
llm_client = LLMClient(model=self.config.get("llm_model"), **llm_params)
|
|
386
|
-
|
|
387
|
-
from adapters.runners.agent_runner import AgentRunner
|
|
388
|
-
from adapters.local_bindings.executor import execute_temp_runner
|
|
389
|
-
|
|
390
|
-
runner = AgentRunner(
|
|
391
|
-
executor_func=execute_temp_runner,
|
|
392
|
-
target_path=target_path,
|
|
393
|
-
selected_func=selected_func,
|
|
394
|
-
)
|
|
395
|
-
|
|
396
|
-
import os
|
|
397
|
-
|
|
398
|
-
agent_dir = os.path.dirname(os.path.abspath(target_path))
|
|
399
|
-
results_path = os.path.join(agent_dir, "results")
|
|
414
|
+
conversational = await questionary.confirm(
|
|
415
|
+
"Enable Conversational Testing? (Injects natural conversational artifacts like dictation errors, multi-tasking, etc.)",
|
|
416
|
+
default=self.config.get("conversational_testing", False),
|
|
417
|
+
).ask_async()
|
|
400
418
|
|
|
401
|
-
|
|
402
|
-
|
|
419
|
+
self.config["conversational_testing"] = conversational
|
|
420
|
+
self.save_config()
|
|
403
421
|
|
|
404
|
-
|
|
405
|
-
results_dir=results_path,
|
|
406
|
-
rules=rules_to_run,
|
|
407
|
-
runner=runner,
|
|
408
|
-
generator_target_schema=selected_func.get("params", []),
|
|
409
|
-
generator_target_code=selected_func.get("code", ""),
|
|
410
|
-
depth=depth,
|
|
411
|
-
breadth=breadth,
|
|
412
|
-
adversarial_testing=adversarial,
|
|
413
|
-
llm_client=llm_client,
|
|
414
|
-
agent_description=description,
|
|
415
|
-
)
|
|
416
|
-
orchestrator = PipelineOrchestrator(eval_config)
|
|
417
|
-
await orchestrator.run()
|
|
422
|
+
return depth, breadth, adversarial, conversational
|
|
418
423
|
|
|
419
424
|
|
|
420
425
|
async def main():
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, List, Dict
|
|
3
|
+
|
|
4
|
+
from hyperplane_eval.adapters.runners.agent_runner import AgentRunner
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
|
|
8
|
+
class EvaluationConfig:
|
|
9
|
+
"""Configuration for an evaluation run."""
|
|
10
|
+
|
|
11
|
+
rules: List[str]
|
|
12
|
+
runner: AgentRunner
|
|
13
|
+
generator_target_schema: List[Dict[str, Any]]
|
|
14
|
+
generator_target_code: str
|
|
15
|
+
llm_client: Any
|
|
16
|
+
depth: str
|
|
17
|
+
breadth: str
|
|
18
|
+
adversarial_testing: bool
|
|
19
|
+
conversational_testing: bool
|
|
20
|
+
agent_description: str
|