hyperplane-eval 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. hyperplane_eval-0.1.7/MANIFEST.in +6 -0
  2. hyperplane_eval-0.1.7/PKG-INFO +149 -0
  3. hyperplane_eval-0.1.7/README.md +115 -0
  4. hyperplane_eval-0.1.7/hyperplane/__init__.py +3 -0
  5. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/cli/app.py +7 -7
  6. {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/llms/llm_client.py +2 -2
  7. {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/local_bindings/executor.py +3 -3
  8. hyperplane_eval-0.1.7/hyperplane/evaluator.py +171 -0
  9. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/config.py +1 -1
  10. hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions/__init__.py +3 -0
  11. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/input_space/input_space.py +2 -2
  12. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/input_space/input_space_factory.py +8 -5
  13. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/orchestrator.py +7 -7
  14. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/plane_evaluator.py +12 -9
  15. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane/framework}/reporting/analyser.py +3 -3
  16. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/stages/evaluator.py +3 -3
  17. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/stages/generator.py +3 -3
  18. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/stages/navigator.py +2 -2
  19. hyperplane_eval-0.1.7/hyperplane_eval.egg-info/PKG-INFO +149 -0
  20. hyperplane_eval-0.1.7/hyperplane_eval.egg-info/SOURCES.txt +71 -0
  21. hyperplane_eval-0.1.7/hyperplane_eval.egg-info/entry_points.txt +2 -0
  22. hyperplane_eval-0.1.7/hyperplane_eval.egg-info/top_level.txt +1 -0
  23. {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/setup.py +3 -3
  24. hyperplane_eval-0.1.5/MANIFEST.in +0 -6
  25. hyperplane_eval-0.1.5/PKG-INFO +0 -88
  26. hyperplane_eval-0.1.5/README.md +0 -54
  27. hyperplane_eval-0.1.5/hyperplane_eval/adapters/__init__.py +0 -1
  28. hyperplane_eval-0.1.5/hyperplane_eval/reporting/templates/__init__.py +0 -0
  29. hyperplane_eval-0.1.5/hyperplane_eval/reporting/templates/report_template.html +0 -988
  30. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/PKG-INFO +0 -88
  31. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/SOURCES.txt +0 -71
  32. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/entry_points.txt +0 -2
  33. hyperplane_eval-0.1.5/hyperplane_eval.egg-info/top_level.txt +0 -1
  34. {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/LICENSE +0 -0
  35. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/cli/__init__.py +0 -0
  36. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane/cli/llms}/__init__.py +0 -0
  37. {hyperplane_eval-0.1.5/hyperplane_eval/adapters/llms → hyperplane_eval-0.1.7/hyperplane/cli/local_bindings}/__init__.py +0 -0
  38. {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/local_bindings/scanner.py +0 -0
  39. {hyperplane_eval-0.1.5/hyperplane_eval/adapters/local_bindings → hyperplane_eval-0.1.7/hyperplane/cli/runners}/__init__.py +0 -0
  40. {hyperplane_eval-0.1.5/hyperplane_eval/adapters → hyperplane_eval-0.1.7/hyperplane/cli}/runners/agent_runner.py +0 -0
  41. {hyperplane_eval-0.1.5/hyperplane_eval/adapters/runners → hyperplane_eval-0.1.7/hyperplane/framework}/__init__.py +0 -0
  42. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/__init__.py +0 -0
  43. {hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features → hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions}/adversarial_features.json +0 -0
  44. {hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/predefined_features → hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions}/conversational_features.json +0 -0
  45. /hyperplane_eval-0.1.5/hyperplane_eval/engine/domain/dimensions.py → /hyperplane_eval-0.1.7/hyperplane/framework/domain/dimensions/prompt_feature.py +0 -0
  46. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/__init__.py +0 -0
  47. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/base.py +0 -0
  48. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/evaluated.py +0 -0
  49. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/executed.py +0 -0
  50. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework}/domain/vectors/synthesized.py +0 -0
  51. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/framework/input_space}/__init__.py +0 -0
  52. {hyperplane_eval-0.1.5/hyperplane_eval/engine/input_space → hyperplane_eval-0.1.7/hyperplane/framework/reporting}/__init__.py +0 -0
  53. {hyperplane_eval-0.1.5/hyperplane_eval/engine/stages → hyperplane_eval-0.1.7/hyperplane/framework/reporting/templates}/__init__.py +0 -0
  54. {hyperplane_eval-0.1.5/hyperplane_eval/prompts → hyperplane_eval-0.1.7/hyperplane/framework/stages}/__init__.py +0 -0
  55. {hyperplane_eval-0.1.5/hyperplane_eval/prompts/reporting → hyperplane_eval-0.1.7/hyperplane/prompts}/__init__.py +0 -0
  56. {hyperplane_eval-0.1.5/hyperplane_eval/prompts/adapters/llm → hyperplane_eval-0.1.7/hyperplane/prompts/llms}/schema_prompt.txt +0 -0
  57. {hyperplane_eval-0.1.5/hyperplane_eval/engine → hyperplane_eval-0.1.7/hyperplane/prompts}/prompt_loader.py +0 -0
  58. {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages → hyperplane_eval-0.1.7/hyperplane/prompts/reporting}/__init__.py +0 -0
  59. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/reporting/dimension_mitigation.txt +0 -0
  60. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/reporting/vulnerability_patch.txt +0 -0
  61. {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/creator → hyperplane_eval-0.1.7/hyperplane/prompts/stages}/__init__.py +0 -0
  62. {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/evaluator → hyperplane_eval-0.1.7/hyperplane/prompts/stages/creator}/__init__.py +0 -0
  63. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/anchors_sys.txt +0 -0
  64. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/anchors_user.txt +0 -0
  65. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/brainstorm_sys.txt +0 -0
  66. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/brainstorm_user.txt +0 -0
  67. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/refine_sys.txt +0 -0
  68. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/creator/refine_user.txt +0 -0
  69. {hyperplane_eval-0.1.5/hyperplane_eval/prompts/stages/generator → hyperplane_eval-0.1.7/hyperplane/prompts/stages/evaluator}/__init__.py +0 -0
  70. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/evaluator/judge.txt +0 -0
  71. {hyperplane_eval-0.1.5/hyperplane_eval/reporting → hyperplane_eval-0.1.7/hyperplane/prompts/stages/generator}/__init__.py +0 -0
  72. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/continue_sys.txt +0 -0
  73. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/continue_user.txt +0 -0
  74. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/copyeditor_sys.txt +0 -0
  75. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/copyeditor_user.txt +0 -0
  76. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/eval_checks_sys.txt +0 -0
  77. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/eval_checks_user.txt +0 -0
  78. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/seed_sys.txt +0 -0
  79. {hyperplane_eval-0.1.5/hyperplane_eval → hyperplane_eval-0.1.7/hyperplane}/prompts/stages/generator/seed_user.txt +0 -0
  80. {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/hyperplane_eval.egg-info/dependency_links.txt +0 -0
  81. {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/hyperplane_eval.egg-info/requires.txt +0 -0
  82. {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/requirements.txt +0 -0
  83. {hyperplane_eval-0.1.5 → hyperplane_eval-0.1.7}/setup.cfg +0 -0
@@ -0,0 +1,6 @@
1
+ include requirements.txt
2
+ include README.md
3
+ include LICENSE
4
+ recursive-include hyperplane/prompts *.txt
5
+ recursive-include hyperplane/framework/domain *.json
6
+ recursive-include hyperplane/framework/reporting/templates *.html
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.4
2
+ Name: hyperplane-eval
3
+ Version: 0.1.7
4
+ Summary: Local tool to evaluate AI agents and find their weak points.
5
+ Author: Marten Panchev
6
+ Author-email: marten@aquithm.com
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: pydantic>=2.0.0
14
+ Requires-Dist: numpy>=1.24.0
15
+ Requires-Dist: scipy>=1.10.0
16
+ Requires-Dist: litellm>=1.0.0
17
+ Requires-Dist: aiohttp>=3.9.0
18
+ Requires-Dist: pandas>=2.0.0
19
+ Requires-Dist: scikit-learn>=1.2.0
20
+ Requires-Dist: openai>=1.0.0
21
+ Requires-Dist: pyngrok>=7.1.0
22
+ Requires-Dist: rich>=13.0.0
23
+ Requires-Dist: questionary>=2.0.0
24
+ Requires-Dist: PyYAML>=6.0.0
25
+ Dynamic: author
26
+ Dynamic: author-email
27
+ Dynamic: classifier
28
+ Dynamic: description
29
+ Dynamic: description-content-type
30
+ Dynamic: license-file
31
+ Dynamic: requires-dist
32
+ Dynamic: requires-python
33
+ Dynamic: summary
34
+
35
+ # Hyperplane Eval
36
+
37
+ Hyperplane Eval helps teams discover behavioral failures before deployment.
38
+
39
+ It works as a CLI and programmatic tool for evaluating AI agents against business, security, and ethical requirements using intelligent test generation rather than manually written test cases.
40
+
41
+ ## Why Hyperplane?
42
+
43
+ Most AI teams evaluate agents with a few dozen manually written prompts.
44
+
45
+ The problem is that manually written tests only cover a tiny fraction of the agent's behavioral space.
46
+
47
+ Hyperplane automatically explores that space, generating thousands of semantically diverse inputs to uncover failures before users do.
48
+
49
+ ## Features
50
+
51
+ - Generate thousands of semantically diverse test cases
52
+ - Evaluate business, security, and ethical requirements of your agent
53
+ - Automatically find edge cases and breaking points
54
+ - Local-first CLI workflow
55
+ - Framework-agnostic agent integration
56
+ - Detailed evaluation reports. [See example here](https://n0tsu5.github.io/results-board/)
57
+
58
+ ## CLI Integration
59
+
60
+ Hyperplane operates entirely locally through a terminal-based orchestration wizard.
61
+
62
+ ### Setup & Installation
63
+
64
+ Install the framework via pip:
65
+
66
+ ```bash
67
+ pip install hyperplane-eval
68
+ ```
69
+
70
+ ### Running the CLI
71
+
72
+ Run the interactive CLI directly in your terminal from inside your project directory:
73
+
74
+ ```bash
75
+ hyperplane
76
+ ```
77
+
78
+ The CLI wizard will guide you through:
79
+ 1. **Target Binding:** Automatically finding your agent's code.
80
+ 2. **Constraint Definition:** Setting natural language rules for your agent.
81
+ 3. **Configuration:** Setting the scale and depth of the evaluation.
82
+ 4. **Execution:** Running the evaluation with a live dashboard.
83
+
84
+ Hyperplane outputs a structured dataset and an HTML report identifying the specific types and characteristics of inputs that cause the agent to fail, helping you quickly isolate prompt engineering or logic regressions.
85
+
86
+ ![Hyperplane Evaluation Dashboard](5cr33n5h0t.png)
87
+
88
+ ## Programmatic Integration
89
+
90
+ You can also integrate Hyperplane Eval directly into your Python codebase or CI/CD testing suites using the `Evaluator` API:
91
+
92
+ ```python
93
+ from hyperplane import Evaluator
94
+ from litellm import Router
95
+
96
+ # 1. Initialize your litellm Router
97
+ router = Router(
98
+ model_list=[{
99
+ "model_name": "gpt-4o",
100
+ "litellm_params": {"model": "gpt-4o"}
101
+ }]
102
+ )
103
+
104
+ # 2. Define your target agent
105
+ def my_agent(prompt: str) -> str:
106
+ return "I am a safe agent."
107
+
108
+ # 3. Initialize the Evaluator
109
+ evaluator = Evaluator(
110
+ agent_desc="A helpful AI assistant",
111
+ param_desc={"prompt": "The user input prompt"},
112
+ target_callable=my_agent,
113
+ llm_client=router
114
+ )
115
+
116
+ # 4. Add constraints and run
117
+ evaluator.run(rules=["Never execute tool calls with unsafe parameters.", "Always respond in English."])
118
+ ```
119
+ ## Architecture & Methodology
120
+
121
+ Evaluating agentic systems presents a curse-of-dimensionality problem due to the infinite input space. Hyperplane mitigates this via a dimensional reduction and bounded sampling approach:
122
+
123
+ 1. **Orthogonal Dimension Extraction:** The target heuristic or constraint (e.g., a business logic rule) is passed to an LLM, which extracts a set of orthogonal, continuous axes representing the variance in potential inputs (e.g., `user_frustration` [0,1], `budget_constraint` [0,1]).
124
+ 2. **Quasi-Random Initialization:** The framework defines a bounded, multi-dimensional continuous input space $S = [0, 1]^D$. It utilizes Sobol sequences to generate a low-discrepancy initialization grid to ensure uniform volumetric coverage without clustering.
125
+ 3. **Synthetic Input Generation:** Bounded coordinate vectors are mapped back into natural language. A Generator LLM synthesizes adversarial or conversational inputs as structured payloads that strictly adhere to the defined vector coordinates.
126
+ 4. **Response Classification:** The target agent executes the synthesized inputs. An Evaluator LLM utilizes Chain-of-Thought (CoT) reasoning to classify the agent's response as a pass (1) or fail (0) against the constraint.
127
+ 5. **Surrogate Modeling & Active Search:** The framework fits a Random Forest surrogate classifier over the evaluated points to approximate the failure boundary. It utilizes an active search algorithm to sample points near the decision boundary until volumetric saturation is reached, stopping early via dimension-scaled mismatch rate thresholds.
128
+
129
+ The resulting artifact is a detailed report allowing engineers to identify the specific input themes and characteristics that reliably induce constraint violations.
130
+
131
+ ## Privacy & Security (BYOK)
132
+
133
+ Hyperplane Eval is designed for enterprise privacy using a **Bring Your Own Key (BYOK)** architecture:
134
+ - **100% Local Execution:** The orchestrator and test synthesis engine run entirely on your local machine or CI/CD runner.
135
+ - **No Telemetry or Data Logging:** We do not collect product telemetry, execution logs, or source code.
136
+ - **Direct Vendor Routing:** Your API keys are routed strictly to the LLM provider you configure (via `litellm`) and are never intercepted or proxied.
137
+ - **Budget Safe:** Built-in safeguards and configurable depth/breadth parameters ensure the evaluation pipeline generates high coverage without blowing up your token bill.
138
+ - **Open Source Auditing:** The entire orchestration pipeline is open-source, allowing your security team to fully audit the codebase.
139
+
140
+ ## 🛠 Technology Stack
141
+ - **Language:** Python 3.10+
142
+ - **Data Modeling:** `pydantic`
143
+ - **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
144
+ - **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
145
+
146
+ ## 📄 License
147
+
148
+ This project is licensed under the Apache License, Version 2.0.
149
+ See the [LICENSE](LICENSE) file for more information.
@@ -0,0 +1,115 @@
1
+ # Hyperplane Eval
2
+
3
+ Hyperplane Eval helps teams discover behavioral failures before deployment.
4
+
5
+ It works as a CLI and programmatic tool for evaluating AI agents against business, security, and ethical requirements using intelligent test generation rather than manually written test cases.
6
+
7
+ ## Why Hyperplane?
8
+
9
+ Most AI teams evaluate agents with a few dozen manually written prompts.
10
+
11
+ The problem is that manually written tests only cover a tiny fraction of the agent's behavioral space.
12
+
13
+ Hyperplane automatically explores that space, generating thousands of semantically diverse inputs to uncover failures before users do.
14
+
15
+ ## Features
16
+
17
+ - Generate thousands of semantically diverse test cases
18
+ - Evaluate business, security, and ethical requirements of your agent
19
+ - Automatically find edge cases and breaking points
20
+ - Local-first CLI workflow
21
+ - Framework-agnostic agent integration
22
+ - Detailed evaluation reports. [See example here](https://n0tsu5.github.io/results-board/)
23
+
24
+ ## CLI Integration
25
+
26
+ Hyperplane operates entirely locally through a terminal-based orchestration wizard.
27
+
28
+ ### Setup & Installation
29
+
30
+ Install the framework via pip:
31
+
32
+ ```bash
33
+ pip install hyperplane-eval
34
+ ```
35
+
36
+ ### Running the CLI
37
+
38
+ Run the interactive CLI directly in your terminal from inside your project directory:
39
+
40
+ ```bash
41
+ hyperplane
42
+ ```
43
+
44
+ The CLI wizard will guide you through:
45
+ 1. **Target Binding:** Automatically finding your agent's code.
46
+ 2. **Constraint Definition:** Setting natural language rules for your agent.
47
+ 3. **Configuration:** Setting the scale and depth of the evaluation.
48
+ 4. **Execution:** Running the evaluation with a live dashboard.
49
+
50
+ Hyperplane outputs a structured dataset and an HTML report identifying the specific types and characteristics of inputs that cause the agent to fail, helping you quickly isolate prompt engineering or logic regressions.
51
+
52
+ ![Hyperplane Evaluation Dashboard](5cr33n5h0t.png)
53
+
54
+ ## Programmatic Integration
55
+
56
+ You can also integrate Hyperplane Eval directly into your Python codebase or CI/CD testing suites using the `Evaluator` API:
57
+
58
+ ```python
59
+ from hyperplane import Evaluator
60
+ from litellm import Router
61
+
62
+ # 1. Initialize your litellm Router
63
+ router = Router(
64
+ model_list=[{
65
+ "model_name": "gpt-4o",
66
+ "litellm_params": {"model": "gpt-4o"}
67
+ }]
68
+ )
69
+
70
+ # 2. Define your target agent
71
+ def my_agent(prompt: str) -> str:
72
+ return "I am a safe agent."
73
+
74
+ # 3. Initialize the Evaluator
75
+ evaluator = Evaluator(
76
+ agent_desc="A helpful AI assistant",
77
+ param_desc={"prompt": "The user input prompt"},
78
+ target_callable=my_agent,
79
+ llm_client=router
80
+ )
81
+
82
+ # 4. Add constraints and run
83
+ evaluator.run(rules=["Never execute tool calls with unsafe parameters.", "Always respond in English."])
84
+ ```
85
+ ## Architecture & Methodology
86
+
87
+ Evaluating agentic systems presents a curse-of-dimensionality problem due to the infinite input space. Hyperplane mitigates this via a dimensional reduction and bounded sampling approach:
88
+
89
+ 1. **Orthogonal Dimension Extraction:** The target heuristic or constraint (e.g., a business logic rule) is passed to an LLM, which extracts a set of orthogonal, continuous axes representing the variance in potential inputs (e.g., `user_frustration` [0,1], `budget_constraint` [0,1]).
90
+ 2. **Quasi-Random Initialization:** The framework defines a bounded, multi-dimensional continuous input space $S = [0, 1]^D$. It utilizes Sobol sequences to generate a low-discrepancy initialization grid to ensure uniform volumetric coverage without clustering.
91
+ 3. **Synthetic Input Generation:** Bounded coordinate vectors are mapped back into natural language. A Generator LLM synthesizes adversarial or conversational inputs as structured payloads that strictly adhere to the defined vector coordinates.
92
+ 4. **Response Classification:** The target agent executes the synthesized inputs. An Evaluator LLM utilizes Chain-of-Thought (CoT) reasoning to classify the agent's response as a pass (1) or fail (0) against the constraint.
93
+ 5. **Surrogate Modeling & Active Search:** The framework fits a Random Forest surrogate classifier over the evaluated points to approximate the failure boundary. It utilizes an active search algorithm to sample points near the decision boundary until volumetric saturation is reached, stopping early via dimension-scaled mismatch rate thresholds.
94
+
95
+ The resulting artifact is a detailed report allowing engineers to identify the specific input themes and characteristics that reliably induce constraint violations.
96
+
97
+ ## Privacy & Security (BYOK)
98
+
99
+ Hyperplane Eval is designed for enterprise privacy using a **Bring Your Own Key (BYOK)** architecture:
100
+ - **100% Local Execution:** The orchestrator and test synthesis engine run entirely on your local machine or CI/CD runner.
101
+ - **No Telemetry or Data Logging:** We do not collect product telemetry, execution logs, or source code.
102
+ - **Direct Vendor Routing:** Your API keys are routed strictly to the LLM provider you configure (via `litellm`) and are never intercepted or proxied.
103
+ - **Budget Safe:** Built-in safeguards and configurable depth/breadth parameters ensure the evaluation pipeline generates high coverage without blowing up your token bill.
104
+ - **Open Source Auditing:** The entire orchestration pipeline is open-source, allowing your security team to fully audit the codebase.
105
+
106
+ ## 🛠 Technology Stack
107
+ - **Language:** Python 3.10+
108
+ - **Data Modeling:** `pydantic`
109
+ - **Math/Geometry:** `numpy`, `scipy` (Sobol sequences, ConvexHull analysis)
110
+ - **LLM Integration:** `litellm` for universal API connectivity (OpenAI, Gemini, Anthropic, or any local vLLM).
111
+
112
+ ## 📄 License
113
+
114
+ This project is licensed under the Apache License, Version 2.0.
115
+ See the [LICENSE](LICENSE) file for more information.
@@ -0,0 +1,3 @@
1
+ from .evaluator import Evaluator
2
+
3
+ __all__ = ["Evaluator"]
@@ -7,11 +7,11 @@ from rich.text import Text
7
7
  from rich.panel import Panel
8
8
  from typing import Any
9
9
 
10
- from hyperplane_eval.adapters.llms.llm_client import LLMClient
11
- from hyperplane_eval.adapters.runners.agent_runner import AgentRunner
12
- from hyperplane_eval.adapters.local_bindings.executor import execute_temp_runner
13
- from hyperplane_eval.engine.config import EvaluationConfig
14
- from hyperplane_eval.engine.orchestrator import PipelineOrchestrator
10
+ from hyperplane.cli.llms.llm_client import LLMClient
11
+ from hyperplane.cli.runners.agent_runner import AgentRunner
12
+ from hyperplane.cli.local_bindings.executor import execute_temp_runner
13
+ from hyperplane.framework.config import EvaluationConfig
14
+ from hyperplane.framework.orchestrator import PipelineOrchestrator
15
15
 
16
16
 
17
17
  LOGO = """
@@ -148,7 +148,7 @@ class VerifyApp:
148
148
  )
149
149
  use_existing = await questionary.confirm("Use this target?").ask_async()
150
150
  if use_existing:
151
- from hyperplane_eval.adapters.local_bindings.scanner import extract_functions
151
+ from hyperplane.cli.local_bindings.scanner import extract_functions
152
152
 
153
153
  funcs = extract_functions(self.config["file"])
154
154
  selected_func = next(
@@ -187,7 +187,7 @@ class VerifyApp:
187
187
  return None, None, None, []
188
188
 
189
189
  self.console.print("[cyan]Scanning for functions...[/cyan]")
190
- from hyperplane_eval.adapters.local_bindings.scanner import extract_functions
190
+ from hyperplane.cli.local_bindings.scanner import extract_functions
191
191
 
192
192
  funcs = extract_functions(target_path)
193
193
  if not funcs:
@@ -4,7 +4,7 @@ import re
4
4
  import asyncio
5
5
  from typing import Any, Dict
6
6
  from litellm import acompletion
7
- from hyperplane_eval.engine.prompt_loader import load_prompt
7
+ from hyperplane.prompts.prompt_loader import load_prompt
8
8
 
9
9
 
10
10
  class LLMClient:
@@ -41,7 +41,7 @@ class LLMClient:
41
41
  temperature: float,
42
42
  ) -> str:
43
43
  schema_str = json.dumps(response_schema, indent=2)
44
- prompt += "\n\n" + load_prompt("adapters/llm/schema_prompt", schema=schema_str)
44
+ prompt += "\n\n" + load_prompt("llms/schema_prompt", schema=schema_str)
45
45
 
46
46
  kwargs = {
47
47
  "model": self.model, # Force using the user-selected model
@@ -17,7 +17,7 @@ async def execute_temp_runner(target_path: str, selected_func: dict, params: dic
17
17
  import sys, json, asyncio, inspect, importlib
18
18
  sys.path.insert(0, r"{target_dir}")
19
19
  try:
20
- target_func = getattr(importlib.import_module("{module_name}"), "{selected_func['name']}")
20
+ target_func = getattr(importlib.import_module("{module_name}"), "{selected_func["name"]}")
21
21
  except Exception as e:
22
22
  print("VERIFY_RUN_ERROR:Load fail: " + str(e))
23
23
  sys.exit(1)
@@ -50,8 +50,8 @@ async function main() {{
50
50
  }} catch(e) {{
51
51
  mod = require(moduleName);
52
52
  }}
53
- const func = mod.{selected_func['name']};
54
- if (!func) throw new Error("Function {selected_func['name']} not found in module.");
53
+ const func = mod.{selected_func["name"]};
54
+ if (!func) throw new Error("Function {selected_func["name"]} not found in module.");
55
55
 
56
56
  const params = JSON.parse(process.argv[1]);
57
57
  const funcParams = {params_array_str};
@@ -0,0 +1,171 @@
1
+ import asyncio
2
+ import inspect
3
+ import json
4
+ from typing import Callable
5
+ import litellm
6
+
7
+ from hyperplane.framework.config import EvaluationConfig
8
+ from hyperplane.framework.orchestrator import PipelineOrchestrator
9
+ from hyperplane.cli.runners.agent_runner import AgentRunner
10
+ from hyperplane.cli.llms.llm_client import LLMClient
11
+
12
+
13
+ class Evaluator:
14
+ """
15
+ Programmatic entry point for the Hyperplane Eval framework.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ agent_desc: str,
21
+ param_desc: dict,
22
+ target_callable: Callable,
23
+ llm_client: litellm.Router,
24
+ ):
25
+ """
26
+ Initialize the Evaluator.
27
+
28
+ Args:
29
+ agent_desc: A description of what the agent does.
30
+ param_desc: A map containing descriptions of the input parameters the agent takes (e.g. {"input": "User query"}).
31
+ target_callable: A synchronous or asynchronous Python callable (the agent).
32
+ llm_client: A pre-configured litellm.Router instance.
33
+ """
34
+ self.agent_desc = agent_desc
35
+ self.param_desc = param_desc
36
+ self.target_callable = target_callable
37
+ self.llm_client = llm_client
38
+
39
+ def run(
40
+ self,
41
+ rules: list[str],
42
+ depth: int = 50,
43
+ breadth: int = 3,
44
+ adversarial_testing: bool = False,
45
+ conversational_testing: bool = False,
46
+ ):
47
+ """
48
+ Runs the evaluation pipeline.
49
+ """
50
+ # 1. Build schema by inspecting the callable
51
+ try:
52
+ sig = inspect.signature(self.target_callable)
53
+ param_names = list(sig.parameters.keys())
54
+ except Exception:
55
+ param_names = []
56
+
57
+ if not param_names and isinstance(self.param_desc, dict) and self.param_desc:
58
+ param_names = list(self.param_desc.keys())
59
+ elif not param_names:
60
+ param_names = ["input"]
61
+
62
+ schema = []
63
+ for p_name in param_names:
64
+ if isinstance(self.param_desc, dict):
65
+ desc = self.param_desc.get(p_name, "")
66
+ if isinstance(desc, dict):
67
+ p_type = desc.get("type", "str")
68
+ p_desc = desc.get("description", "")
69
+ else:
70
+ p_type = "str"
71
+ p_desc = str(desc)
72
+ else:
73
+ p_type = "str"
74
+ p_desc = str(self.param_desc)
75
+
76
+ schema.append({"name": p_name, "type": p_type, "description": p_desc})
77
+
78
+ # 2. Try to extract source code for context
79
+ try:
80
+ code = inspect.getsource(self.target_callable)
81
+ except Exception:
82
+ code = "Code unavailable"
83
+
84
+ # 3. Resolve LLM configuration
85
+ class RouterWrapper(LLMClient):
86
+ def __init__(self, router):
87
+ self.router = router
88
+ self.model = "router"
89
+ self._semaphore = asyncio.Semaphore(10)
90
+
91
+ async def generate(
92
+ self, prompt: str, response_schema: dict, temperature: float
93
+ ) -> str:
94
+ from hyperplane.prompts.prompt_loader import load_prompt
95
+
96
+ schema_str = json.dumps(response_schema, indent=2)
97
+ prompt += "\n\n" + load_prompt("llms/schema_prompt", schema=schema_str)
98
+
99
+ kwargs = {
100
+ "model": "",
101
+ "messages": [{"role": "user", "content": prompt}],
102
+ "temperature": temperature,
103
+ "response_format": {"type": "json_object"},
104
+ }
105
+ try:
106
+ response = await self.router.acompletion(**kwargs)
107
+ return response.choices[0].message.content
108
+ except Exception as e:
109
+ raise RuntimeError(f"LLM Server Error: {e}")
110
+
111
+ llm_client_resolved = RouterWrapper(self.llm_client)
112
+
113
+ # 4. Setup Custom Execution Environment
114
+ selected_func = {
115
+ "name": getattr(self.target_callable, "__name__", "target_callable"),
116
+ "code": code,
117
+ "params": schema,
118
+ }
119
+
120
+ async def executor_func(target_path, func_meta, params):
121
+ try:
122
+ if asyncio.iscoroutinefunction(self.target_callable):
123
+ res = await self.target_callable(**params)
124
+ else:
125
+ res = self.target_callable(**params)
126
+
127
+ if not isinstance(res, str):
128
+ res_str = json.dumps(res)
129
+ else:
130
+ res_str = res
131
+ return {"successVal": res_str}
132
+ except Exception as e:
133
+ return {"errorVal": str(e)}
134
+
135
+ runner = AgentRunner(
136
+ executor_func=executor_func,
137
+ target_path="programmatic_execution",
138
+ selected_func=selected_func,
139
+ )
140
+
141
+ # 5. Build Config
142
+ eval_config = EvaluationConfig(
143
+ rules=rules,
144
+ runner=runner,
145
+ generator_target_schema=schema,
146
+ generator_target_code=code,
147
+ llm_client=llm_client_resolved,
148
+ depth=depth,
149
+ breadth=breadth,
150
+ adversarial_testing=adversarial_testing,
151
+ conversational_testing=conversational_testing,
152
+ agent_description=self.agent_desc,
153
+ )
154
+
155
+ # 6. Execute Pipeline
156
+ orchestrator = PipelineOrchestrator(eval_config)
157
+
158
+ try:
159
+ loop = asyncio.get_running_loop()
160
+ except RuntimeError:
161
+ loop = None
162
+
163
+ if loop and loop.is_running():
164
+ import warnings
165
+
166
+ warnings.warn(
167
+ "Evaluator.run() called from a running event loop. Returning a coroutine instead. Please await it."
168
+ )
169
+ return orchestrator.run()
170
+
171
+ return asyncio.run(orchestrator.run())
@@ -1,7 +1,7 @@
1
1
  from dataclasses import dataclass
2
2
  from typing import Any, List, Dict
3
3
 
4
- from hyperplane_eval.adapters.runners.agent_runner import AgentRunner
4
+ from hyperplane.cli.runners.agent_runner import AgentRunner
5
5
 
6
6
 
7
7
  @dataclass
@@ -0,0 +1,3 @@
1
+ from .prompt_feature import PromptFeature
2
+
3
+ __all__ = ["PromptFeature"]
@@ -2,8 +2,8 @@ import json
2
2
  from pathlib import Path
3
3
  from typing import List
4
4
  from scipy.stats import qmc
5
- from hyperplane_eval.engine.domain.vectors import ScenarioVector, EvaluatedVector
6
- from hyperplane_eval.engine.domain.dimensions import PromptFeature
5
+ from hyperplane.framework.domain.vectors import ScenarioVector, EvaluatedVector
6
+ from hyperplane.framework.domain.dimensions.prompt_feature import PromptFeature
7
7
 
8
8
 
9
9
  class InputSpace:
@@ -3,10 +3,10 @@ import json
3
3
  import random
4
4
  import os
5
5
  from typing import List
6
- from hyperplane_eval.engine.domain.dimensions import PromptFeature
7
- from hyperplane_eval.adapters.llms.llm_client import LLMClient
8
- from hyperplane_eval.engine.prompt_loader import load_prompt
9
- from hyperplane_eval.engine.input_space.input_space import InputSpace
6
+ from hyperplane.framework.domain.dimensions.prompt_feature import PromptFeature
7
+ from hyperplane.cli.llms.llm_client import LLMClient
8
+ from hyperplane.prompts.prompt_loader import load_prompt
9
+ from hyperplane.framework.input_space.input_space import InputSpace
10
10
 
11
11
 
12
12
  class InputSpaceFactory:
@@ -112,7 +112,10 @@ class InputSpaceFactory:
112
112
  if plane:
113
113
  hyperplanes.append(plane)
114
114
 
115
- target_planes = self.BREADTH_MAP.get(breadth, self.DEFAULT_TARGET_PLANES)
115
+ if isinstance(breadth, int):
116
+ target_planes = breadth
117
+ else:
118
+ target_planes = self.BREADTH_MAP.get(breadth, self.DEFAULT_TARGET_PLANES)
116
119
 
117
120
  return hyperplanes[:target_planes]
118
121
 
@@ -4,13 +4,13 @@ import signal
4
4
  from pathlib import Path
5
5
  from typing import Dict, Any
6
6
 
7
- from hyperplane_eval.engine.input_space.input_space import InputSpace
8
- from hyperplane_eval.adapters.llms.llm_client import LLMClient
9
- from hyperplane_eval.engine.stages.generator import SyntheticInputGenerator
10
- from hyperplane_eval.engine.stages.evaluator import AgentOutputEvaluator
11
- from hyperplane_eval.engine.input_space.input_space_factory import InputSpaceFactory
12
- from hyperplane_eval.engine.config import EvaluationConfig
13
- from hyperplane_eval.reporting.analyser import ResultsAnalyser
7
+ from hyperplane.framework.input_space.input_space import InputSpace
8
+ from hyperplane.cli.llms.llm_client import LLMClient
9
+ from hyperplane.framework.stages.generator import SyntheticInputGenerator
10
+ from hyperplane.framework.stages.evaluator import AgentOutputEvaluator
11
+ from hyperplane.framework.input_space.input_space_factory import InputSpaceFactory
12
+ from hyperplane.framework.config import EvaluationConfig
13
+ from hyperplane.framework.reporting.analyser import ResultsAnalyser
14
14
  from .plane_evaluator import PlaneEvaluator
15
15
 
16
16
 
@@ -2,17 +2,17 @@ from pathlib import Path
2
2
  import asyncio
3
3
  import sys
4
4
 
5
- from hyperplane_eval.engine.domain.vectors import (
5
+ from hyperplane.framework.domain.vectors import (
6
6
  ScenarioVector,
7
7
  SynthesizedVector,
8
8
  ExecutedVector,
9
9
  )
10
- from hyperplane_eval.engine.domain.dimensions import PromptFeature
11
- from hyperplane_eval.engine.input_space.input_space import InputSpace
12
- from hyperplane_eval.engine.stages.generator import SyntheticInputGenerator
13
- from hyperplane_eval.engine.stages.evaluator import AgentOutputEvaluator
14
- from hyperplane_eval.engine.stages.navigator import AdaptiveNavigator
15
- from hyperplane_eval.adapters.runners.agent_runner import AgentRunner
10
+ from hyperplane.framework.domain.dimensions.prompt_feature import PromptFeature
11
+ from hyperplane.framework.input_space.input_space import InputSpace
12
+ from hyperplane.framework.stages.generator import SyntheticInputGenerator
13
+ from hyperplane.framework.stages.evaluator import AgentOutputEvaluator
14
+ from hyperplane.framework.stages.navigator import AdaptiveNavigator
15
+ from hyperplane.cli.runners.agent_runner import AgentRunner
16
16
  from typing import Any
17
17
 
18
18
 
@@ -177,14 +177,17 @@ class PlaneEvaluator:
177
177
  stop_event: asyncio.Event,
178
178
  ) -> InputSpace:
179
179
  """Evaluates a single hyperplane of prompt features."""
180
- from hyperplane_eval.cli.app import VerifyApp
180
+ from hyperplane.cli.app import VerifyApp
181
181
 
182
182
  state_file = str(
183
183
  res_path / f"input_space_state_rule_{rule_idx}_plane_{plane_idx}.json"
184
184
  )
185
185
  plane_input_space = InputSpace(features=plane_features, state_path=state_file)
186
186
  unique_dims = len(set(f.name for f in plane_features))
187
- multiplier = cls.DEPTH_MAP.get(depth, cls.DEFAULT_MULTIPLIER)
187
+ if isinstance(depth, int):
188
+ multiplier = depth
189
+ else:
190
+ multiplier = cls.DEPTH_MAP.get(depth, cls.DEFAULT_MULTIPLIER)
188
191
  scenarios_per_plane = unique_dims * multiplier
189
192
 
190
193
  navigator = AdaptiveNavigator(plane_input_space)