PyPI - hyperplane-eval - Versions diffs - 0.1.2__py3-none-any.whl - Mend

hyperplane-eval 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

adapters/__init__.py +1 -0
adapters/llms/__init__.py +0 -0
adapters/llms/llm_client.py +64 -0
adapters/local_bindings/__init__.py +0 -0
adapters/local_bindings/executor.py +97 -0
adapters/local_bindings/scanner.py +124 -0
adapters/runners/__init__.py +0 -0
adapters/runners/agent_runner.py +81 -0
cli/__init__.py +1 -0
cli/app.py +429 -0
engine/__init__.py +0 -0
engine/config.py +20 -0
engine/domain/__init__.py +3 -0
engine/domain/dimensions.py +23 -0
engine/domain/predefined_features.json +327 -0
engine/domain/vectors/__init__.py +11 -0
engine/domain/vectors/base.py +16 -0
engine/domain/vectors/evaluated.py +16 -0
engine/domain/vectors/executed.py +9 -0
engine/domain/vectors/synthesized.py +21 -0
engine/orchestrator.py +193 -0
engine/plane_evaluator.py +250 -0
engine/prompt_loader.py +10 -0
engine/stages/__init__.py +0 -0
engine/stages/creator.py +406 -0
engine/stages/evaluator.py +72 -0
engine/stages/generator.py +327 -0
engine/stages/input_space.py +133 -0
engine/stages/navigator.py +187 -0
hyperplane_eval-0.1.2.dist-info/METADATA +143 -0
hyperplane_eval-0.1.2.dist-info/RECORD +38 -0
hyperplane_eval-0.1.2.dist-info/WHEEL +5 -0
hyperplane_eval-0.1.2.dist-info/entry_points.txt +2 -0
hyperplane_eval-0.1.2.dist-info/licenses/LICENSE +176 -0
hyperplane_eval-0.1.2.dist-info/top_level.txt +4 -0
reporting/__init__.py +0 -0
reporting/analyser.py +786 -0
reporting/templates/report_template.html +988 -0

cli/app.py ADDED Viewed

@@ -0,0 +1,429 @@
+import os
+import asyncio
+import yaml
+import questionary
+from rich.console import Console, Group
+from rich.text import Text
+from rich.panel import Panel
+from typing import Any
+from adapters.local_bindings.scanner import extract_functions
+from adapters.local_bindings.executor import execute_temp_runner
+from adapters.runners.agent_runner import AgentRunner
+from engine.orchestrator import PipelineOrchestrator
+from engine.config import EvaluationConfig
+# ASCII-art banner; VerifyApp.run prints it inside a rich Panel at startup.
+LOGO = """
+ █████╗ ██╗   ██╗████████╗ ██████╗ ██╗   ██╗███████╗██████╗ ██╗███████╗██╗   ██╗
+██╔══██╗██║   ██║╚══██╔══╝██╔═══██╗██║   ██║██╔════╝██╔══██╗██║██╔════╝╚██╗ ██╔╝
+███████║██║   ██║   ██║   ██║   ██║██║   ██║█████╗  ██████╔╝██║█████╗   ╚████╔╝
+██╔══██║██║   ██║   ██║   ██║   ██║╚██╗ ██╔╝██╔══╝  ██╔══██╗██║██╔══╝    ╚██╔╝
+██║  ██║╚██████╔╝   ██║   ╚██████╔╝ ╚████╔╝ ███████╗██║  ██║██║██║        ██║
+╚═╝  ╚═╝ ╚═════╝    ╚═╝    ╚═════╝   ╚═══╝  ╚══════╝╚═╝  ╚═╝╚═╝╚═╝        ╚═╝
+"""
+class VerifyApp:
+    def __init__(self):
+        """Create the console and load any settings saved in ``verify.yaml``."""
+        self.console = Console()
+        # Relative path, so it is resolved against the current working directory.
+        self.config_file = "verify.yaml"
+        # Must run after config_file is set: load_config reads that attribute.
+        self.config = self.load_config()
+    def load_config(self):
+        if os.path.exists(self.config_file):
+            with open(self.config_file, "r") as f:
+                return yaml.safe_load(f) or {}
+        return {}
+    def save_config(self):
+        """Write ``self.config`` to ``self.config_file``, replacing its contents."""
+        # NOTE(review): _prompt_for_target copies the entered ``llm_*`` values
+        # (including password-type fields) into self.config, so they are written
+        # to this file in plain text.
+        with open(self.config_file, "w") as f:
+            yaml.dump(self.config, f)
+    async def _prompt_for_target(self):
+        """Prompts the user to select or confirm the target file and function."""
+        if self.config and "file" in self.config and "function" in self.config:
+            self.console.print(
+                f"[green]Found existing config for '{self.config['function']}' in '{self.config['file']}'.[/green]"
+            )
+            use_existing = await questionary.confirm("Use this target?").ask_async()
+            if use_existing:
+                funcs = extract_functions(self.config["file"])
+                selected_func = next(
+                    (f for f in funcs if f["name"] == self.config["function"]), None
+                )
+                if not selected_func:
+                    selected_func = {
+                        "name": self.config["function"],
+                        "code": "",
+                        "params": self.config.get("parameters", []),
+                    }
+                return (
+                    self.config["file"],
+                    selected_func,
+                    self.config.get("description", ""),
+                    self.config.get("rules", []),
+                )
+        # Prompt for file
+        files = []
+        for root, _, fnames in os.walk("."):
+            if any(
+                ignored in root for ignored in ["node_modules", ".venv", "__pycache__"]
+            ):
+                continue
+            for fname in fnames:
+                if fname.endswith((".py", ".ts", ".js")):
+                    files.append(os.path.relpath(os.path.join(root, fname)))
+        target_path = await questionary.autocomplete(
+            "Locate entry point file:", choices=files
+        ).ask_async()
+        if not target_path:
+            self.console.print("[red]No file selected. Exiting.[/red]")
+            return None, None, None, []
+        self.console.print("[cyan]Scanning for functions...[/cyan]")
+        funcs = extract_functions(target_path)
+        if not funcs:
+            self.console.print(
+                f"[red]No functions/methods detected in '{target_path}'.[/red]"
+            )
+            return None, None, None, []
+        choices = [
+            questionary.Choice(
+                title=f"{f['name']}({', '.join([p['name'] + ': ' + p['type'] for p in f['params']])})",
+                value=f,
+            )
+            for f in funcs
+        ]
+        selected_func = await questionary.select(
+            "Select the function where to call the agent:", choices=choices
+        ).ask_async()
+        if not selected_func:
+            return None, None, None, []
+        # 2.2 Prompt for parameter descriptions
+        for param in selected_func.get("params", []):
+            desc = await questionary.text(
+                f"Enter a description for parameter '{param['name']}' ({param['type']}):",
+                default=param.get("description", ""),
+            ).ask_async()
+            param["description"] = desc
+        description = await questionary.text(
+            "Enter a description about the test agent:",
+            default=self.config.get("description", ""),
+        ).ask_async()
+        rules = []
+        while True:
+            r = await questionary.text(
+                "Enter a rule to test for (leave blank to finish):"
+                if rules
+                else "Enter a rule to test for:"
+            ).ask_async()
+            if not r:
+                if not rules:
+                    self.console.print("[red]You must provide at least one rule.[/red]")
+                    continue
+                break
+            rules.append(r)
+        # 5 Setup LLM API using LiteLLM
+        import urllib.request
+        import json
+        try:
+            with urllib.request.urlopen(
+                "https://raw.githubusercontent.com/BerriAI/litellm/main/litellm/proxy/public_endpoints/provider_create_fields.json"
+            ) as response:
+                provider_data = json.loads(response.read().decode())
+        except Exception as e:
+            self.console.print(f"[yellow]Could not fetch provider list: {e}[/yellow]")
+            provider_data = []
+        if provider_data:
+            provider_map = {
+                p.get("provider_display_name", p.get("provider")): p
+                for p in provider_data
+            }
+            choices = list(provider_map.keys())
+            selected_provider_name = await questionary.autocomplete(
+                "Select LLM Provider (type to search):",
+                choices=choices,
+                ignore_case=True,
+            ).ask_async()
+            if not selected_provider_name:
+                return None, None, None, []
+            selected_provider = None
+            for k, p in provider_map.items():
+                if k.strip().lower() == selected_provider_name.strip().lower():
+                    selected_provider = p
+                    break
+            if not selected_provider:
+                self.console.print(
+                    f"[red]Invalid provider selected: '{selected_provider_name}'[/red]"
+                )
+                return None, None, None, []
+            credential_fields = selected_provider.get("credential_fields", [])
+            default_model = selected_provider.get("default_model_placeholder", "")
+        else:
+            credential_fields = [
+                {
+                    "key": "api_key",
+                    "label": "API Key",
+                    "required": False,
+                    "field_type": "password",
+                }
+            ]
+            default_model = "gpt-4o"
+        llm_model = await questionary.text(
+            f"Enter the LiteLLM model string{f' (e.g., {default_model})' if default_model else ''}:",
+            default=self.config.get("llm_model", default_model),
+        ).ask_async()
+        if llm_model is None:
+            return None, None, None, []
+        llm_params = {}
+        for field in credential_fields:
+            key = field.get("key")
+            if not key:
+                continue
+            label = field.get("label", key)
+            is_password = field.get("field_type") == "password"
+            prompt_text = f"Enter {label} (leave blank to use env vars):"
+            if is_password:
+                val = await questionary.password(prompt_text).ask_async()
+            else:
+                val = await questionary.text(
+                    prompt_text, default=self.config.get(f"llm_{key}", "")
+                ).ask_async()
+            if val is None:
+                return None, None, None, []
+            if val:
+                llm_params[key] = val
+            elif self.config.get(f"llm_{key}"):
+                llm_params[key] = self.config[f"llm_{key}"]
+        # Update and save config
+        self.config.update(
+            {
+                "file": target_path,
+                "function": selected_func["name"],
+                "description": description,
+                "parameters": selected_func["params"],
+                "rules": rules,
+                "llm_model": llm_model,
+            }
+        )
+        for k, v in llm_params.items():
+            self.config[f"llm_{k}"] = v
+        self.save_config()
+        return target_path, selected_func, description, rules
+    async def _prompt_for_rule(self, rules):
+        """Prompts the user to select the rules to test against."""
+        if not rules:
+            return []
+        choices = [questionary.Choice(title=r, value=r, checked=True) for r in rules]
+        choices.append(
+            questionary.Choice(title="Add a new rule...", value="Add a new rule...")
+        )
+        selected_rules = await questionary.checkbox(
+            "Select rules to test (Space to toggle, Enter to confirm):", choices=choices
+        ).ask_async()
+        if not selected_rules:
+            return []
+        if "Add a new rule..." in selected_rules:
+            selected_rules.remove("Add a new rule...")
+            while True:
+                new_rule = await questionary.text(
+                    "Enter a new rule to test for (leave blank to finish):"
+                ).ask_async()
+                if not new_rule:
+                    break
+                rules.append(new_rule)
+                selected_rules.append(new_rule)
+            self.config["rules"] = rules
+            self.save_config()
+        return selected_rules
+    async def _prompt_for_dynamic_config(self):
+        """Prompts the user for depth and breadth config values."""
+        self.console.print("\n[bold cyan]Evaluation Configuration[/bold cyan]")
+        self.console.print(
+            "• [bold]Breadth[/bold]: Determines how many distinct types of edge cases to test. Higher breadth explores a wider variety of scenarios."
+        )
+        self.console.print(
+            "• [bold]Depth[/bold]: Determines how many variations to generate for each edge case type. Higher depth provides more rigorous testing for a specific scenario."
+        )
+        self.console.print(
+            "[yellow]Note: Setting these to 'high' will significantly increase evaluation time and token usage.[/yellow]\n"
+        )
+        breadth = await questionary.select(
+            "Select Breadth (Number of Edge Case Types):",
+            choices=["low", "mid", "high"],
+            default=self.config.get("breadth", "mid"),
+        ).ask_async()
+        depth = await questionary.select(
+            "Select Depth (Variations per Edge Case):",
+            choices=["low", "mid", "high"],
+            default=self.config.get("depth", "mid"),
+        ).ask_async()
+        self.config["breadth"] = breadth
+        self.config["depth"] = depth
+        adversarial = await questionary.confirm(
+            "Enable Adversarial Testing? (Injects red-teaming vectors into scenarios)",
+            default=self.config.get("adversarial_testing", False),
+        ).ask_async()
+        self.config["adversarial_testing"] = adversarial
+        self.save_config()
+        return depth, breadth, adversarial
+    @staticmethod
+    def update_dashboard_display(
+        active_scenarios: dict,
+        plane_input_space: Any,
+        scenarios_per_plane: int,
+        plane_features: list,
+        rule_idx: int,
+        rules_len: int,
+        plane_idx: int,
+        num_planes: int,
+        rule: str,
+    ) -> Group:
+        """Generates the CLI dashboard showing evaluation progress and scenario status."""
+        pct = min(1.0, len(plane_input_space.get_all_vectors()) / scenarios_per_plane)
+        bar = "█" * int(30 * pct) + "░" * (30 - int(30 * pct))
+        dims_str = ", ".join(f.name for f in plane_features)
+        renderables = []
+        renderables.append(
+            Text.from_markup(
+                f"[bold cyan]Rule [{rule_idx + 1}/{rules_len}] - Plane [{plane_idx + 1}/{num_planes}]:[/bold cyan] {rule[:80]}..."
+            )
+        )
+        renderables.append(Text.from_markup(f"[cyan]Dimensions:[/cyan] {dims_str}"))
+        renderables.append(
+            Text.from_markup(
+                f"[cyan]Progress:[/cyan] [{bar}] {pct:.0%} ({len(plane_input_space.get_all_vectors())}/{scenarios_per_plane})\n"
+            )
+        )
+        for item in list(active_scenarios.values())[-3:]:
+            if item["status"] == "Pending":
+                renderables.append(Text.from_markup(f" • {item['text']}\n"))
+            else:
+                score = item["score"]
+                if score >= 0.75:
+                    marker = "[bold green][✓][/bold green]"
+                elif score >= 0.25:
+                    marker = "[bold yellow][~][/bold yellow]"
+                else:
+                    marker = "[bold red][✗][/bold red]"
+                renderables.append(
+                    Text.from_markup(f" • {marker} ({score:.0%}) {item['text']}\n")
+                )
+        return Group(*renderables)
+    async def run(self):
+        self.console.print(Panel.fit(Text(LOGO, style="bold cyan")))
+        target_path, selected_func, description, rules = await self._prompt_for_target()
+        if not target_path or not selected_func:
+            return
+        rules_to_run = await self._prompt_for_rule(rules)
+        if not rules_to_run:
+            self.console.print("[red]No rules selected. Exiting.[/red]")
+            return
+        depth, breadth, adversarial = await self._prompt_for_dynamic_config()
+        rules_str = ", ".join(f"'{r}'" for r in rules_to_run)
+        self.console.print(
+            f"\n[bold green]Starting evaluation locally for rules: {rules_str}[/bold green]"
+        )
+        from adapters.llms.llm_client import LLMClient
+        llm_params = {
+            k.replace("llm_", ""): v
+            for k, v in self.config.items()
+            if k.startswith("llm_") and k != "llm_model"
+        }
+        llm_client = LLMClient(model=self.config.get("llm_model"), **llm_params)
+        runner = AgentRunner(
+            executor_func=execute_temp_runner,
+            target_path=target_path,
+            selected_func=selected_func,
+        )
+        import os
+        agent_dir = os.path.dirname(os.path.abspath(target_path))
+        results_path = os.path.join(agent_dir, "results")
+        eval_config = EvaluationConfig(
+            results_dir=results_path,
+            rules=rules_to_run,
+            runner=runner,
+            generator_target_schema=selected_func.get("params", []),
+            generator_target_code=selected_func.get("code", ""),
+            depth=depth,
+            breadth=breadth,
+            adversarial_testing=adversarial,
+            llm_client=llm_client,
+            agent_description=description,
+        )
+        orchestrator = PipelineOrchestrator(eval_config)
+        await orchestrator.run()
+async def main():
+    app = VerifyApp()
+    await app.run()
+def console_main():
+    """Synchronous entry point: run ``main`` in a new asyncio event loop."""
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        # Ctrl-C prints a short notice instead of a traceback.
+        Console().print("\n[red]Exited.[/red]")
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    console_main()

engine/__init__.py ADDED Viewed

File without changes

engine/config.py ADDED Viewed

@@ -0,0 +1,20 @@
+from dataclasses import dataclass, field
+from typing import Any, List, Dict, Optional
+from adapters.runners.agent_runner import AgentRunner
+@dataclass
+class EvaluationConfig:
+    """Configuration for an evaluation run."""
+    # Output directory; the CLI passes "<directory of the target file>/results".
+    results_dir: str = "results"
+    # Rules to evaluate; a fresh single-item list is created per instance.
+    rules: List[str] = field(default_factory=lambda: ["General Safety Policy"])
+    # Defaults to None, hence Optional; the CLI always supplies an AgentRunner.
+    runner: Optional[AgentRunner] = None
+    # The CLI passes the selected function's "params" list here.
+    generator_target_schema: Optional[List[Dict[str, Any]]] = None
+    # The CLI passes the selected function's "code" string here ("" if unknown).
+    generator_target_code: str = ""
+    # The CLI passes an LLMClient instance; typed Any here.
+    llm_client: Any = None
+    # The CLI offers "low", "mid" and "high" for both depth and breadth.
+    depth: str = "mid"
+    breadth: str = "mid"
+    # Set from the CLI's "Enable Adversarial Testing?" confirmation.
+    adversarial_testing: bool = False
+    # Free-text description of the agent under test, as entered in the CLI.
+    agent_description: str = ""

engine/domain/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .dimensions import PromptFeature
+__all__ = ["PromptFeature"]

engine/domain/dimensions.py ADDED Viewed

@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+@dataclass
+class PromptFeature:
+    """
+    Represents a prompt feature dimension used for adversarial mutations.
+    Attributes:
+        name: The name of the feature.
+        description: A short description of the feature.
+        anchors: A dictionary mapping coordinate levels (floats) to prompt examples.
+        weight: Float determining mutation application order (ascending = earlier).
+    """
+    name: str
+    description: str
+    anchors: dict[float, str]
+    weight: float
+    def __post_init__(self):
+        # Normalise anchor keys to float so lookups by level are consistent.
+        # NOTE(review): presumably needed because anchors loaded from JSON have
+        # string keys - confirm against the loader.
+        # An empty or None ``anchors`` value is left untouched.
+        if self.anchors:
+            self.anchors = {float(k): v for k, v in self.anchors.items()}