hyperplane-eval 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hyperplane_eval-0.1.7/MANIFEST.in +6 -0
- hyperplane_eval-0.1.7/PKG-INFO +149 -0
- hyperplane_eval-0.1.7/README.md +115 -0
- hyperplane_eval-0.1.7/hyperplane/__init__.py +3 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/cli/app.py +7 -7
- {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/llms/llm_client.py +2 -2
- {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/local_bindings/executor.py +3 -3
- hyperplane_eval-0.1.7/hyperplane/evaluator.py +171 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/config.py +1 -1
- hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions/__init__.py +3 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/input_space/input_space.py +2 -2
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/input_space/input_space_factory.py +8 -5
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/orchestrator.py +7 -7
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/plane_evaluator.py +12 -9
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane/framework}/reporting/analyser.py +3 -3
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/stages/evaluator.py +3 -3
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/stages/generator.py +3 -3
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/stages/navigator.py +2 -2
- hyperplane_eval-0.1.7/hyperplane_eval.egg-info/PKG-INFO +149 -0
- hyperplane_eval-0.1.7/hyperplane_eval.egg-info/SOURCES.txt +71 -0
- hyperplane_eval-0.1.7/hyperplane_eval.egg-info/entry_points.txt +2 -0
- hyperplane_eval-0.1.7/hyperplane_eval.egg-info/top_level.txt +1 -0
- {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/setup.py +3 -3
- hyperplane_eval-0.1.5/MANIFEST.in +0 -6
- hyperplane_eval-0.1.5/PKG-INFO +0 -88
- hyperplane_eval-0.1.5/README.md +0 -54
- hyperplane_eval-0.1.5/hyperplane_eval/adapters/__init__.py +0 -1
- hyperplane_eval-0.1.5/hyperplane_eval/reporting/templates/__init__.py +0 -0
- hyperplane_eval-0.1.5/hyperplane_eval/reporting/templates/report_template.html +0 -988
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/PKG-INFO +0 -88
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/SOURCES.txt +0 -71
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/entry_points.txt +0 -2
- hyperplane_eval-0.1.5/hyperplane_eval.egg-info/top_level.txt +0 -1
- {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/LICENSE +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/cli/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane/cli/llms}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/adapters/llms → hyperplane_eval-0.1.7/hyperplane/cli/local_bindings}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/local_bindings/scanner.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/adapters/local_bindings → hyperplane_eval-0.1.7/hyperplane/cli/runners}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/runners/agent_runner.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/adapters/runners → hyperplane_eval-0.1.7/hyperplane/framework}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features → hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions}/adversarial_features.json +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features → hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions}/conversational_features.json +0 -0
- /hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/dimensions.py → /hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions/prompt_feature.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/base.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/evaluated.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/executed.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/synthesized.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework/input_space}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space → hyperplane_eval-0.1.7/hyperplane/framework/reporting}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine/stages → hyperplane_eval-0.1.7/hyperplane/framework/reporting/templates}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/prompts → hyperplane_eval-0.1.7/hyperplane/framework/stages}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting → hyperplane_eval-0.1.7/hyperplane/prompts}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/prompts/adapters/llm → hyperplane_eval-0.1.7/hyperplane/prompts/llms}/schema_prompt.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/prompts}/prompt_loader.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages → hyperplane_eval-0.1.7/hyperplane/prompts/reporting}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/reporting/dimension_mitigation.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/reporting/vulnerability_patch.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/creator → hyperplane_eval-0.1.7/hyperplane/prompts/stages}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/evaluator → hyperplane_eval-0.1.7/hyperplane/prompts/stages/creator}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/anchors_sys.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/anchors_user.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/brainstorm_sys.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/brainstorm_user.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/refine_sys.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/refine_user.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/generator → hyperplane_eval-0.1.7/hyperplane/prompts/stages/evaluator}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/evaluator/judge.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval/reporting → hyperplane_eval-0.1.7/hyperplane/prompts/stages/generator}/__init__.py +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/continue_sys.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/continue_user.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/copyeditor_sys.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/copyeditor_user.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/eval_checks_sys.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/eval_checks_user.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/seed_sys.txt +0 -0
- {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/seed_user.txt +0 -0
- {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/hyperplane_eval.egg-info/dependency_links.txt +0 -0
- {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/hyperplane_eval.egg-info/requires.txt +0 -0
- {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/requirements.txt +0 -0
- {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/setup.cfg +0 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hyperplane-eval
|
|
3
|
+
Version: 0.1.7
|
|
4
|
+
Summary: Local tool to evaluate AI agents and find their weak points.
|
|
5
|
+
Author: Marten Panchev
|
|
6
|
+
Author-email: marten@aquithm.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: pydantic>=2.0.0
|
|
14
|
+
Requires-Dist: numpy>=1.24.0
|
|
15
|
+
Requires-Dist: scipy>=1.10.0
|
|
16
|
+
Requires-Dist: litellm>=1.0.0
|
|
17
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
18
|
+
Requires-Dist: pandas>=2.0.0
|
|
19
|
+
Requires-Dist: scikit-learn>=1.2.0
|
|
20
|
+
Requires-Dist: openai>=1.0.0
|
|
21
|
+
Requires-Dist: pyngrok>=7.1.0
|
|
22
|
+
Requires-Dist: rich>=13.0.0
|
|
23
|
+
Requires-Dist: questionary>=2.0.0
|
|
24
|
+
Requires-Dist: PyYAML>=6.0.0
|
|
25
|
+
Dynamic: author
|
|
26
|
+
Dynamic: author-email
|
|
27
|
+
Dynamic: classifier
|
|
28
|
+
Dynamic: description
|
|
29
|
+
Dynamic: description-content-type
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
Dynamic: requires-dist
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
Dynamic: summary
|
|
34
|
+
|
|
35
|
+
# Hyperplane Eval
|
|
36
|
+
|
|
37
|
+
Hyperplane Eval helps teams discover behavioral failures before deployment.
|
|
38
|
+
|
|
39
|
+
It works as a CLI and programmatic tool for evaluating AI agents against business, security, and ethical requirements using intelligent test generation rather than manually written test cases.
|
|
40
|
+
|
|
41
|
+
## Why Hyperplane?
|
|
42
|
+
|
|
43
|
+
Most AI teams evaluate agents with a few dozen manually written prompts.
|
|
44
|
+
|
|
45
|
+
The problem is that manually written tests only cover a tiny fraction of the agent's behavioral space.
|
|
46
|
+
|
|
47
|
+
Hyperplane automatically explores that space, generating thousands of semantically diverse inputs to uncover failures before users do.
|
|
48
|
+
|
|
49
|
+
## Features
|
|
50
|
+
|
|
51
|
+
- Generate thousands of semantically diverse test cases
|
|
52
|
+
- Evaluate business, security, and ethical requirements of your agent
|
|
53
|
+
- Automatically find edge cases and breaking points
|
|
54
|
+
- Local-first CLI workflow
|
|
55
|
+
- Framework-agnostic agent integration
|
|
56
|
+
- Detailed evaluation reports. [See example here](https://n0tsu5.github.io/results-board/)
|
|
57
|
+
|
|
58
|
+
## CLI Integration
|
|
59
|
+
|
|
60
|
+
Hyperplane operates entirely locally through a terminal-based orchestration wizard.
|
|
61
|
+
|
|
62
|
+
### Setup & Installation
|
|
63
|
+
|
|
64
|
+
Install the framework via pip:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install hyperplane-eval
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Running the CLI
|
|
71
|
+
|
|
72
|
+
Run the interactive CLI directly in your terminal from inside your project directory:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
hyperplane
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
The CLI wizard will guide you through:
|
|
79
|
+
1. **Target Binding:** Automatically finding your agent's code.
|
|
80
|
+
2. **Constraint Definition:** Setting natural language rules for your agent.
|
|
81
|
+
3. **Configuration:** Setting the scale and depth of the evaluation.
|
|
82
|
+
4. **Execution:** Running the evaluation with a live dashboard.
|
|
83
|
+
|
|
84
|
+
Hyperplane outputs a structured dataset and an HTML report identifying the specific types and characteristics of inputs that cause the agent to fail, helping you quickly isolate prompt engineering or logic regressions.
|
|
85
|
+
|
|
86
|
+

|
|
87
|
+
|
|
88
|
+
## Programmatic Integration
|
|
89
|
+
|
|
90
|
+
You can also integrate Hyperplane Eval directly into your Python codebase or CI/CD testing suites using the `Evaluator` API:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from hyperplane import Evaluator
|
|
94
|
+
from litellm import Router
|
|
95
|
+
|
|
96
|
+
# 1. Initialize your litellm Router
|
|
97
|
+
router = Router(
|
|
98
|
+
model_list=[{
|
|
99
|
+
"model_name": "gpt-4o",
|
|
100
|
+
"litellm_params": {"model": "gpt-4o"}
|
|
101
|
+
}]
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# 2. Define your target agent
|
|
105
|
+
def my_agent(prompt: str) -> str:
|
|
106
|
+
return "I am a safe agent."
|
|
107
|
+
|
|
108
|
+
# 3. Initialize the Evaluator
|
|
109
|
+
evaluator = Evaluator(
|
|
110
|
+
agent_desc="A helpful AI assistant",
|
|
111
|
+
param_desc={"prompt": "The user input prompt"},
|
|
112
|
+
target_callable=my_agent,
|
|
113
|
+
llm_client=router
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# 4. Add constraints and run
|
|
117
|
+
evaluator.run(rules=["Never execute tool calls with unsafe parameters.", "Always respond in English."])
|
|
118
|
+
```
|
|
119
|
+
## Architecture & Methodology
|
|
120
|
+
|
|
121
|
+
Evaluating agentic systems presents a curse-of-dimensionality problem due to the infinite input space. Hyperplane mitigates this via a dimensional reduction and bounded sampling approach:
|
|
122
|
+
|
|
123
|
+
1. **Orthogonal Dimension Extraction:** The target heuristic or constraint (e.g., a business logic rule) is passed to an LLM, which extracts a set of orthogonal, continuous axes representing the variance in potential inputs (e.g., `user_frustration` [0,1], `budget_constraint` [0,1]).
|
|
124
|
+
2. **Quasi-Random Initialization:** The framework maps a bounded multi-dimensional continuous input space $S = [0, 1]^D$. It utilizes Sobol sequences to generate a low-discrepancy initialization grid to ensure uniform volumetric coverage without clustering.
|
|
125
|
+
3. **Synthetic Input Generation:** Bounded coordinate vectors are mapped back into natural language. A Generator LLM synthesizes adversarial or conversational inputs as structured payloads that strictly adhere to the defined vector coordinates.
|
|
126
|
+
4. **Response Classification:** The target agent executes the synthesized inputs. An Evaluator LLM utilizes Chain-of-Thought (CoT) reasoning to classify the agent's response as a pass (1) or fail (0) against the constraint.
|
|
127
|
+
5. **Surrogate Modeling & Active Search:** The framework fits a Random Forest surrogate classifier over the evaluated points to approximate the failure boundary. It utilizes an active search algorithm to sample points near the decision boundary until volumetric saturation is reached, stopping early via dimension-scaled mismatch rate thresholds.
|
|
128
|
+
|
|
129
|
+
The resulting artifact is a detailed report allowing engineers to identify the specific input themes and characteristics that reliably induce constraint violations.
|
|
130
|
+
|
|
131
|
+
## Privacy & Security (BYOK)
|
|
132
|
+
|
|
133
|
+
Hyperplane Eval is designed for enterprise privacy using a **Bring Your Own Key (BYOK)** architecture:
|
|
134
|
+
- **100% Local Execution:** The orchestrator and test synthesis engine run entirely on your local machine or CI/CD runner.
|
|
135
|
+
- **No Telemetry or Data Logging:** We do not collect product telemetry, execution logs, or source code.
|
|
136
|
+
- **Direct Vendor Routing:** Your API keys are routed strictly to the LLM provider you configure (via `litellm`) and are never intercepted or proxied.
|
|
137
|
+
- **Budget Safe:** Built-in safeguards and configurable depth/breadth parameters ensure the evaluation pipeline generates high coverage without blowing up your token bill.
|
|
138
|
+
- **Open Source Auditing:** The entire orchestration pipeline is open-source, allowing your security team to fully audit the codebase.
|
|
139
|
+
|
|
140
|
+
## 🛠 Technology Stack
|
|
141
|
+
- **Language:** Python 3.10+
|
|
142
|
+
- **Data Modeling:** `pydantic`
|
|
143
|
+
- **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
|
|
144
|
+
- **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
|
|
145
|
+
|
|
146
|
+
## 📄 License
|
|
147
|
+
|
|
148
|
+
This project is licensed under the Apache License, Version 2.0.
|
|
149
|
+
See the [LICENSE](LICENSE) file for more information.
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Hyperplane Eval
|
|
2
|
+
|
|
3
|
+
Hyperplane Eval helps teams discover behavioral failures before deployment.
|
|
4
|
+
|
|
5
|
+
It works as a CLI and programmatic tool for evaluating AI agents against business, security, and ethical requirements using intelligent test generation rather than manually written test cases.
|
|
6
|
+
|
|
7
|
+
## Why Hyperplane?
|
|
8
|
+
|
|
9
|
+
Most AI teams evaluate agents with a few dozen manually written prompts.
|
|
10
|
+
|
|
11
|
+
The problem is that manually written tests only cover a tiny fraction of the agent's behavioral space.
|
|
12
|
+
|
|
13
|
+
Hyperplane automatically explores that space, generating thousands of semantically diverse inputs to uncover failures before users do.
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- Generate thousands of semantically diverse test cases
|
|
18
|
+
- Evaluate business, security, and ethical requirements of your agent
|
|
19
|
+
- Automatically find edge cases and breaking points
|
|
20
|
+
- Local-first CLI workflow
|
|
21
|
+
- Framework-agnostic agent integration
|
|
22
|
+
- Detailed evaluation reports. [See example here](https://n0tsu5.github.io/results-board/)
|
|
23
|
+
|
|
24
|
+
## CLI Integration
|
|
25
|
+
|
|
26
|
+
Hyperplane operates entirely locally through a terminal-based orchestration wizard.
|
|
27
|
+
|
|
28
|
+
### Setup & Installation
|
|
29
|
+
|
|
30
|
+
Install the framework via pip:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install hyperplane-eval
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Running the CLI
|
|
37
|
+
|
|
38
|
+
Run the interactive CLI directly in your terminal from inside your project directory:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
hyperplane
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
The CLI wizard will guide you through:
|
|
45
|
+
1. **Target Binding:** Automatically finding your agent's code.
|
|
46
|
+
2. **Constraint Definition:** Setting natural language rules for your agent.
|
|
47
|
+
3. **Configuration:** Setting the scale and depth of the evaluation.
|
|
48
|
+
4. **Execution:** Running the evaluation with a live dashboard.
|
|
49
|
+
|
|
50
|
+
Hyperplane outputs a structured dataset and an HTML report identifying the specific types and characteristics of inputs that cause the agent to fail, helping you quickly isolate prompt engineering or logic regressions.
|
|
51
|
+
|
|
52
|
+

|
|
53
|
+
|
|
54
|
+
## Programmatic Integration
|
|
55
|
+
|
|
56
|
+
You can also integrate Hyperplane Eval directly into your Python codebase or CI/CD testing suites using the `Evaluator` API:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from hyperplane import Evaluator
|
|
60
|
+
from litellm import Router
|
|
61
|
+
|
|
62
|
+
# 1. Initialize your litellm Router
|
|
63
|
+
router = Router(
|
|
64
|
+
model_list=[{
|
|
65
|
+
"model_name": "gpt-4o",
|
|
66
|
+
"litellm_params": {"model": "gpt-4o"}
|
|
67
|
+
}]
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# 2. Define your target agent
|
|
71
|
+
def my_agent(prompt: str) -> str:
|
|
72
|
+
return "I am a safe agent."
|
|
73
|
+
|
|
74
|
+
# 3. Initialize the Evaluator
|
|
75
|
+
evaluator = Evaluator(
|
|
76
|
+
agent_desc="A helpful AI assistant",
|
|
77
|
+
param_desc={"prompt": "The user input prompt"},
|
|
78
|
+
target_callable=my_agent,
|
|
79
|
+
llm_client=router
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# 4. Add constraints and run
|
|
83
|
+
evaluator.run(rules=["Never execute tool calls with unsafe parameters.", "Always respond in English."])
|
|
84
|
+
```
|
|
85
|
+
## Architecture & Methodology
|
|
86
|
+
|
|
87
|
+
Evaluating agentic systems presents a curse-of-dimensionality problem due to the infinite input space. Hyperplane mitigates this via a dimensional reduction and bounded sampling approach:
|
|
88
|
+
|
|
89
|
+
1. **Orthogonal Dimension Extraction:** The target heuristic or constraint (e.g., a business logic rule) is passed to an LLM, which extracts a set of orthogonal, continuous axes representing the variance in potential inputs (e.g., `user_frustration` [0,1], `budget_constraint` [0,1]).
|
|
90
|
+
2. **Quasi-Random Initialization:** The framework maps a bounded multi-dimensional continuous input space $S = [0, 1]^D$. It utilizes Sobol sequences to generate a low-discrepancy initialization grid to ensure uniform volumetric coverage without clustering.
|
|
91
|
+
3. **Synthetic Input Generation:** Bounded coordinate vectors are mapped back into natural language. A Generator LLM synthesizes adversarial or conversational inputs as structured payloads that strictly adhere to the defined vector coordinates.
|
|
92
|
+
4. **Response Classification:** The target agent executes the synthesized inputs. An Evaluator LLM utilizes Chain-of-Thought (CoT) reasoning to classify the agent's response as a pass (1) or fail (0) against the constraint.
|
|
93
|
+
5. **Surrogate Modeling & Active Search:** The framework fits a Random Forest surrogate classifier over the evaluated points to approximate the failure boundary. It utilizes an active search algorithm to sample points near the decision boundary until volumetric saturation is reached, stopping early via dimension-scaled mismatch rate thresholds.
|
|
94
|
+
|
|
95
|
+
The resulting artifact is a detailed report allowing engineers to identify the specific input themes and characteristics that reliably induce constraint violations.
|
|
96
|
+
|
|
97
|
+
## Privacy & Security (BYOK)
|
|
98
|
+
|
|
99
|
+
Hyperplane Eval is designed for enterprise privacy using a **Bring Your Own Key (BYOK)** architecture:
|
|
100
|
+
- **100% Local Execution:** The orchestrator and test synthesis engine run entirely on your local machine or CI/CD runner.
|
|
101
|
+
- **No Telemetry or Data Logging:** We do not collect product telemetry, execution logs, or source code.
|
|
102
|
+
- **Direct Vendor Routing:** Your API keys are routed strictly to the LLM provider you configure (via `litellm`) and are never intercepted or proxied.
|
|
103
|
+
- **Budget Safe:** Built-in safeguards and configurable depth/breadth parameters ensure the evaluation pipeline generates high coverage without blowing up your token bill.
|
|
104
|
+
- **Open Source Auditing:** The entire orchestration pipeline is open-source, allowing your security team to fully audit the codebase.
|
|
105
|
+
|
|
106
|
+
## 🛠 Technology Stack
|
|
107
|
+
- **Language:** Python 3.10+
|
|
108
|
+
- **Data Modeling:** `pydantic`
|
|
109
|
+
- **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
|
|
110
|
+
- **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
|
|
111
|
+
|
|
112
|
+
## 📄 License
|
|
113
|
+
|
|
114
|
+
This project is licensed under the Apache License, Version 2.0.
|
|
115
|
+
See the [LICENSE](LICENSE) file for more information.
|
|
@@ -7,11 +7,11 @@ from rich.text import Text
|
|
|
7
7
|
from rich.panel import Panel
|
|
8
8
|
from typing import Any
|
|
9
9
|
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
from
|
|
10
|
+
from hyperplane.cli.llms.llm_client import LLMClient
|
|
11
|
+
from hyperplane.cli.runners.agent_runner import AgentRunner
|
|
12
|
+
from hyperplane.cli.local_bindings.executor import execute_temp_runner
|
|
13
|
+
from hyperplane.framework.config import EvaluationConfig
|
|
14
|
+
from hyperplane.framework.orchestrator import PipelineOrchestrator
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
LOGO = """
|
|
@@ -148,7 +148,7 @@ class VerifyApp:
|
|
|
148
148
|
)
|
|
149
149
|
use_existing = await questionary.confirm("Use this target?").ask_async()
|
|
150
150
|
if use_existing:
|
|
151
|
-
from
|
|
151
|
+
from hyperplane.cli.local_bindings.scanner import extract_functions
|
|
152
152
|
|
|
153
153
|
funcs = extract_functions(self.config["file"])
|
|
154
154
|
selected_func = next(
|
|
@@ -187,7 +187,7 @@ class VerifyApp:
|
|
|
187
187
|
return None, None, None, []
|
|
188
188
|
|
|
189
189
|
self.console.print("[cyan]Scanning for functions...[/cyan]")
|
|
190
|
-
from
|
|
190
|
+
from hyperplane.cli.local_bindings.scanner import extract_functions
|
|
191
191
|
|
|
192
192
|
funcs = extract_functions(target_path)
|
|
193
193
|
if not funcs:
|
|
@@ -4,7 +4,7 @@ import re
|
|
|
4
4
|
import asyncio
|
|
5
5
|
from typing import Any, Dict
|
|
6
6
|
from litellm import acompletion
|
|
7
|
-
from
|
|
7
|
+
from hyperplane.prompts.prompt_loader import load_prompt
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class LLMClient:
|
|
@@ -41,7 +41,7 @@ class LLMClient:
|
|
|
41
41
|
temperature: float,
|
|
42
42
|
) -> str:
|
|
43
43
|
schema_str = json.dumps(response_schema, indent=2)
|
|
44
|
-
prompt += "\n\n" + load_prompt("
|
|
44
|
+
prompt += "\n\n" + load_prompt("llms/schema_prompt", schema=schema_str)
|
|
45
45
|
|
|
46
46
|
kwargs = {
|
|
47
47
|
"model": self.model, # Force using the user-selected model
|
|
@@ -17,7 +17,7 @@ async def execute_temp_runner(target_path: str, selected_func: dict, params: dic
|
|
|
17
17
|
import sys, json, asyncio, inspect, importlib
|
|
18
18
|
sys.path.insert(0, r"{target_dir}")
|
|
19
19
|
try:
|
|
20
|
-
target_func = getattr(importlib.import_module("{module_name}"), "{selected_func[
|
|
20
|
+
target_func = getattr(importlib.import_module("{module_name}"), "{selected_func["name"]}")
|
|
21
21
|
except Exception as e:
|
|
22
22
|
print("VERIFY_RUN_ERROR:Load fail: " + str(e))
|
|
23
23
|
sys.exit(1)
|
|
@@ -50,8 +50,8 @@ async function main() {{
|
|
|
50
50
|
}} catch(e) {{
|
|
51
51
|
mod = require(moduleName);
|
|
52
52
|
}}
|
|
53
|
-
const func = mod.{selected_func[
|
|
54
|
-
if (!func) throw new Error("Function {selected_func[
|
|
53
|
+
const func = mod.{selected_func["name"]};
|
|
54
|
+
if (!func) throw new Error("Function {selected_func["name"]} not found in module.");
|
|
55
55
|
|
|
56
56
|
const params = JSON.parse(process.argv[1]);
|
|
57
57
|
const funcParams = {params_array_str};
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import inspect
|
|
3
|
+
import json
|
|
4
|
+
from typing import Callable
|
|
5
|
+
import litellm
|
|
6
|
+
|
|
7
|
+
from hyperplane.framework.config import EvaluationConfig
|
|
8
|
+
from hyperplane.framework.orchestrator import PipelineOrchestrator
|
|
9
|
+
from hyperplane.cli.runners.agent_runner import AgentRunner
|
|
10
|
+
from hyperplane.cli.llms.llm_client import LLMClient
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Evaluator:
|
|
14
|
+
"""
|
|
15
|
+
Programmatic entry point for the Hyperplane Eval framework.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
def __init__(
|
|
19
|
+
self,
|
|
20
|
+
agent_desc: str,
|
|
21
|
+
param_desc: dict,
|
|
22
|
+
target_callable: Callable,
|
|
23
|
+
llm_client: litellm.Router,
|
|
24
|
+
):
|
|
25
|
+
"""
|
|
26
|
+
Initialize the Evaluator.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
agent_desc: A description of what the agent does.
|
|
30
|
+
param_desc: A map containing descriptions of the input parameters the agent takes (e.g. {"input": "User query"}).
|
|
31
|
+
target_callable: A synchronous or asynchronous Python callable (the agent).
|
|
32
|
+
llm_client: A pre-configured litellm.Router instance.
|
|
33
|
+
"""
|
|
34
|
+
self.agent_desc = agent_desc
|
|
35
|
+
self.param_desc = param_desc
|
|
36
|
+
self.target_callable = target_callable
|
|
37
|
+
self.llm_client = llm_client
|
|
38
|
+
|
|
39
|
+
def run(
|
|
40
|
+
self,
|
|
41
|
+
rules: list[str],
|
|
42
|
+
depth: int = 50,
|
|
43
|
+
breadth: int = 3,
|
|
44
|
+
adversarial_testing: bool = False,
|
|
45
|
+
conversational_testing: bool = False,
|
|
46
|
+
):
|
|
47
|
+
"""
|
|
48
|
+
Runs the evaluation pipeline.
|
|
49
|
+
"""
|
|
50
|
+
# 1. Build schema by inspecting the callable
|
|
51
|
+
try:
|
|
52
|
+
sig = inspect.signature(self.target_callable)
|
|
53
|
+
param_names = list(sig.parameters.keys())
|
|
54
|
+
except Exception:
|
|
55
|
+
param_names = []
|
|
56
|
+
|
|
57
|
+
if not param_names and isinstance(self.param_desc, dict) and self.param_desc:
|
|
58
|
+
param_names = list(self.param_desc.keys())
|
|
59
|
+
elif not param_names:
|
|
60
|
+
param_names = ["input"]
|
|
61
|
+
|
|
62
|
+
schema = []
|
|
63
|
+
for p_name in param_names:
|
|
64
|
+
if isinstance(self.param_desc, dict):
|
|
65
|
+
desc = self.param_desc.get(p_name, "")
|
|
66
|
+
if isinstance(desc, dict):
|
|
67
|
+
p_type = desc.get("type", "str")
|
|
68
|
+
p_desc = desc.get("description", "")
|
|
69
|
+
else:
|
|
70
|
+
p_type = "str"
|
|
71
|
+
p_desc = str(desc)
|
|
72
|
+
else:
|
|
73
|
+
p_type = "str"
|
|
74
|
+
p_desc = str(self.param_desc)
|
|
75
|
+
|
|
76
|
+
schema.append({"name": p_name, "type": p_type, "description": p_desc})
|
|
77
|
+
|
|
78
|
+
# 2. Try to extract source code for context
|
|
79
|
+
try:
|
|
80
|
+
code = inspect.getsource(self.target_callable)
|
|
81
|
+
except Exception:
|
|
82
|
+
code = "Code unavailable"
|
|
83
|
+
|
|
84
|
+
# 3. Resolve LLM configuration
|
|
85
|
+
class RouterWrapper(LLMClient):
|
|
86
|
+
def __init__(self, router):
|
|
87
|
+
self.router = router
|
|
88
|
+
self.model = "router"
|
|
89
|
+
self._semaphore = asyncio.Semaphore(10)
|
|
90
|
+
|
|
91
|
+
async def generate(
|
|
92
|
+
self, prompt: str, response_schema: dict, temperature: float
|
|
93
|
+
) -> str:
|
|
94
|
+
from hyperplane.prompts.prompt_loader import load_prompt
|
|
95
|
+
|
|
96
|
+
schema_str = json.dumps(response_schema, indent=2)
|
|
97
|
+
prompt += "\n\n" + load_prompt("llms/schema_prompt", schema=schema_str)
|
|
98
|
+
|
|
99
|
+
kwargs = {
|
|
100
|
+
"model": "",
|
|
101
|
+
"messages": [{"role": "user", "content": prompt}],
|
|
102
|
+
"temperature": temperature,
|
|
103
|
+
"response_format": {"type": "json_object"},
|
|
104
|
+
}
|
|
105
|
+
try:
|
|
106
|
+
response = await self.router.acompletion(**kwargs)
|
|
107
|
+
return response.choices[0].message.content
|
|
108
|
+
except Exception as e:
|
|
109
|
+
raise RuntimeError(f"LLM Server Error: {e}")
|
|
110
|
+
|
|
111
|
+
llm_client_resolved = RouterWrapper(self.llm_client)
|
|
112
|
+
|
|
113
|
+
# 4. Setup Custom Execution Environment
|
|
114
|
+
selected_func = {
|
|
115
|
+
"name": getattr(self.target_callable, "__name__", "target_callable"),
|
|
116
|
+
"code": code,
|
|
117
|
+
"params": schema,
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
async def executor_func(target_path, func_meta, params):
    """Run the wrapped target callable and package its outcome as a dict.

    Args:
        target_path: Not read by this function; accepted only to satisfy the
            executor call signature (presumably the one AgentRunner uses -
            TODO confirm).
        func_meta: Not read by this function; see ``target_path``.
        params: Mapping of keyword arguments forwarded to the target.

    Returns:
        ``{"successVal": <str>}`` when the target completes, otherwise
        ``{"errorVal": <str(exception)>}``. This function never raises an
        ``Exception`` subclass to its caller.
    """
    try:
        res = self.target_callable(**params)
        # Await based on what came back rather than on how the callable was
        # declared: wrappers and objects with an async __call__ return
        # awaitables without being coroutine functions themselves.
        if inspect.isawaitable(res):
            res = await res

        if isinstance(res, str):
            res_str = res
        else:
            try:
                res_str = json.dumps(res)
            except (TypeError, ValueError):
                # The target succeeded; a result that JSON cannot encode
                # must not be reported as a target failure.
                res_str = str(res)
        return {"successVal": res_str}
    except Exception as e:
        return {"errorVal": str(e)}
|
|
134
|
+
|
|
135
|
+
runner = AgentRunner(
|
|
136
|
+
executor_func=executor_func,
|
|
137
|
+
target_path="programmatic_execution",
|
|
138
|
+
selected_func=selected_func,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# 5. Build Config
|
|
142
|
+
eval_config = EvaluationConfig(
|
|
143
|
+
rules=rules,
|
|
144
|
+
runner=runner,
|
|
145
|
+
generator_target_schema=schema,
|
|
146
|
+
generator_target_code=code,
|
|
147
|
+
llm_client=llm_client_resolved,
|
|
148
|
+
depth=depth,
|
|
149
|
+
breadth=breadth,
|
|
150
|
+
adversarial_testing=adversarial_testing,
|
|
151
|
+
conversational_testing=conversational_testing,
|
|
152
|
+
agent_description=self.agent_desc,
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
# 6. Execute Pipeline
|
|
156
|
+
orchestrator = PipelineOrchestrator(eval_config)
|
|
157
|
+
|
|
158
|
+
try:
|
|
159
|
+
loop = asyncio.get_running_loop()
|
|
160
|
+
except RuntimeError:
|
|
161
|
+
loop = None
|
|
162
|
+
|
|
163
|
+
if loop and loop.is_running():
|
|
164
|
+
import warnings
|
|
165
|
+
|
|
166
|
+
warnings.warn(
|
|
167
|
+
"Evaluator.run() called from a running event loop. Returning a coroutine instead. Please await it."
|
|
168
|
+
)
|
|
169
|
+
return orchestrator.run()
|
|
170
|
+
|
|
171
|
+
return asyncio.run(orchestrator.run())
|
|
@@ -2,8 +2,8 @@ import json
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from typing import List
|
|
4
4
|
from scipy.stats import qmc
|
|
5
|
-
from
|
|
6
|
-
from
|
|
5
|
+
from hyperplane.framework.domain.vectors import ScenarioVector, EvaluatedVector
|
|
6
|
+
from hyperplane.framework.domain.dimensions.prompt_feature import PromptFeature
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class InputSpace:
|
|
@@ -3,10 +3,10 @@ import json
|
|
|
3
3
|
import random
|
|
4
4
|
import os
|
|
5
5
|
from typing import List
|
|
6
|
-
from
|
|
7
|
-
from
|
|
8
|
-
from
|
|
9
|
-
from
|
|
6
|
+
from hyperplane.framework.domain.dimensions.prompt_feature import PromptFeature
|
|
7
|
+
from hyperplane.cli.llms.llm_client import LLMClient
|
|
8
|
+
from hyperplane.prompts.prompt_loader import load_prompt
|
|
9
|
+
from hyperplane.framework.input_space.input_space import InputSpace
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class InputSpaceFactory:
|
|
@@ -112,7 +112,10 @@ class InputSpaceFactory:
|
|
|
112
112
|
if plane:
|
|
113
113
|
hyperplanes.append(plane)
|
|
114
114
|
|
|
115
|
-
|
|
115
|
+
if isinstance(breadth, int):
|
|
116
|
+
target_planes = breadth
|
|
117
|
+
else:
|
|
118
|
+
target_planes = self.BREADTH_MAP.get(breadth, self.DEFAULT_TARGET_PLANES)
|
|
116
119
|
|
|
117
120
|
return hyperplanes[:target_planes]
|
|
118
121
|
|
|
@@ -4,13 +4,13 @@ import signal
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Dict, Any
|
|
6
6
|
|
|
7
|
-
from
|
|
8
|
-
from
|
|
9
|
-
from
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
7
|
+
from hyperplane.framework.input_space.input_space import InputSpace
|
|
8
|
+
from hyperplane.cli.llms.llm_client import LLMClient
|
|
9
|
+
from hyperplane.framework.stages.generator import SyntheticInputGenerator
|
|
10
|
+
from hyperplane.framework.stages.evaluator import AgentOutputEvaluator
|
|
11
|
+
from hyperplane.framework.input_space.input_space_factory import InputSpaceFactory
|
|
12
|
+
from hyperplane.framework.config import EvaluationConfig
|
|
13
|
+
from hyperplane.framework.reporting.analyser import ResultsAnalyser
|
|
14
14
|
from .plane_evaluator import PlaneEvaluator
|
|
15
15
|
|
|
16
16
|
|
|
@@ -2,17 +2,17 @@ from pathlib import Path
|
|
|
2
2
|
import asyncio
|
|
3
3
|
import sys
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from hyperplane.framework.domain.vectors import (
|
|
6
6
|
ScenarioVector,
|
|
7
7
|
SynthesizedVector,
|
|
8
8
|
ExecutedVector,
|
|
9
9
|
)
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
from
|
|
10
|
+
from hyperplane.framework.domain.dimensions.prompt_feature import PromptFeature
|
|
11
|
+
from hyperplane.framework.input_space.input_space import InputSpace
|
|
12
|
+
from hyperplane.framework.stages.generator import SyntheticInputGenerator
|
|
13
|
+
from hyperplane.framework.stages.evaluator import AgentOutputEvaluator
|
|
14
|
+
from hyperplane.framework.stages.navigator import AdaptiveNavigator
|
|
15
|
+
from hyperplane.cli.runners.agent_runner import AgentRunner
|
|
16
16
|
from typing import Any
|
|
17
17
|
|
|
18
18
|
|
|
@@ -177,14 +177,17 @@ class PlaneEvaluator:
|
|
|
177
177
|
stop_event: asyncio.Event,
|
|
178
178
|
) -> InputSpace:
|
|
179
179
|
"""Evaluates a single hyperplane of prompt features."""
|
|
180
|
-
from
|
|
180
|
+
from hyperplane.cli.app import VerifyApp
|
|
181
181
|
|
|
182
182
|
state_file = str(
|
|
183
183
|
res_path / f"input_space_state_rule_{rule_idx}_plane_{plane_idx}.json"
|
|
184
184
|
)
|
|
185
185
|
plane_input_space = InputSpace(features=plane_features, state_path=state_file)
|
|
186
186
|
unique_dims = len(set(f.name for f in plane_features))
|
|
187
|
-
|
|
187
|
+
if isinstance(depth, int):
|
|
188
|
+
multiplier = depth
|
|
189
|
+
else:
|
|
190
|
+
multiplier = cls.DEPTH_MAP.get(depth, cls.DEFAULT_MULTIPLIER)
|
|
188
191
|
scenarios_per_plane = unique_dims * multiplier
|
|
189
192
|
|
|
190
193
|
navigator = AdaptiveNavigator(plane_input_space)
|