hyperplane-eval 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adapters/__init__.py +1 -0
- adapters/llms/__init__.py +0 -0
- adapters/llms/llm_client.py +64 -0
- adapters/local_bindings/__init__.py +0 -0
- adapters/local_bindings/executor.py +97 -0
- adapters/local_bindings/scanner.py +124 -0
- adapters/runners/__init__.py +0 -0
- adapters/runners/agent_runner.py +81 -0
- cli/__init__.py +1 -0
- cli/app.py +429 -0
- engine/__init__.py +0 -0
- engine/config.py +20 -0
- engine/domain/__init__.py +3 -0
- engine/domain/dimensions.py +23 -0
- engine/domain/predefined_features.json +327 -0
- engine/domain/vectors/__init__.py +11 -0
- engine/domain/vectors/base.py +16 -0
- engine/domain/vectors/evaluated.py +16 -0
- engine/domain/vectors/executed.py +9 -0
- engine/domain/vectors/synthesized.py +21 -0
- engine/orchestrator.py +193 -0
- engine/plane_evaluator.py +250 -0
- engine/prompt_loader.py +10 -0
- engine/stages/__init__.py +0 -0
- engine/stages/creator.py +406 -0
- engine/stages/evaluator.py +72 -0
- engine/stages/generator.py +327 -0
- engine/stages/input_space.py +133 -0
- engine/stages/navigator.py +187 -0
- hyperplane_eval-0.1.2.dist-info/METADATA +143 -0
- hyperplane_eval-0.1.2.dist-info/RECORD +38 -0
- hyperplane_eval-0.1.2.dist-info/WHEEL +5 -0
- hyperplane_eval-0.1.2.dist-info/entry_points.txt +2 -0
- hyperplane_eval-0.1.2.dist-info/licenses/LICENSE +176 -0
- hyperplane_eval-0.1.2.dist-info/top_level.txt +4 -0
- reporting/__init__.py +0 -0
- reporting/analyser.py +786 -0
- reporting/templates/report_template.html +988 -0
adapters/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Infrastructure adapters for the evaluation framework."""
|
|
File without changes
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import asyncio
|
|
5
|
+
from typing import Any, Dict
|
|
6
|
+
from litellm import acompletion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LLMClient:
|
|
10
|
+
"""
|
|
11
|
+
A unified LLM client using LiteLLM to support multiple providers (OpenAI, Gemini, Anthropic, etc).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
def __init__(self, model: str | None = None, **kwargs):
|
|
15
|
+
# LiteLLM accepts strings like "gpt-4o", "gemini/gemini-1.5-flash", "anthropic/claude-3-5-sonnet"
|
|
16
|
+
self.model = model or os.environ.get("LLM_MODEL", "gpt-4o")
|
|
17
|
+
self.llm_kwargs = kwargs
|
|
18
|
+
self._semaphore = asyncio.Semaphore(10)
|
|
19
|
+
|
|
20
|
+
def parse_json(self, response: str) -> Dict[str, Any]:
|
|
21
|
+
if not (text := (response or "").strip()):
|
|
22
|
+
return {}
|
|
23
|
+
candidates = [text]
|
|
24
|
+
if match := re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL):
|
|
25
|
+
candidates.insert(0, match.group(1))
|
|
26
|
+
start, end = text.find("{"), text.rfind("}")
|
|
27
|
+
if start != -1 and end != -1 and end > start:
|
|
28
|
+
candidates.extend([text[start : end + 1], text[start:]])
|
|
29
|
+
for cand in filter(None, candidates):
|
|
30
|
+
try:
|
|
31
|
+
return json.loads(cand, strict=False)
|
|
32
|
+
except Exception:
|
|
33
|
+
pass
|
|
34
|
+
return {}
|
|
35
|
+
|
|
36
|
+
async def generate(
|
|
37
|
+
self,
|
|
38
|
+
prompt: str,
|
|
39
|
+
response_schema: Dict[str, Any],
|
|
40
|
+
temperature: float,
|
|
41
|
+
) -> str:
|
|
42
|
+
if response_schema:
|
|
43
|
+
prompt += f"\n\nYOU MUST RETURN A JSON OBJECT WITH THE EXACT FOLLOWING SCHEMA:\n{json.dumps(response_schema, indent=2)}"
|
|
44
|
+
|
|
45
|
+
kwargs = {
|
|
46
|
+
"model": self.model, # Force using the user-selected model
|
|
47
|
+
"messages": [{"role": "user", "content": prompt}],
|
|
48
|
+
"temperature": temperature,
|
|
49
|
+
**self.llm_kwargs,
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
if response_schema:
|
|
53
|
+
kwargs["response_format"] = {"type": "json_object"}
|
|
54
|
+
|
|
55
|
+
async with self._semaphore:
|
|
56
|
+
try:
|
|
57
|
+
response = await acompletion(**kwargs)
|
|
58
|
+
return response.choices[0].message.content
|
|
59
|
+
except Exception as e:
|
|
60
|
+
print(f"[LiteLLM] Error HTTP: {e}")
|
|
61
|
+
raise RuntimeError(f"LLM Server Error: {e}")
|
|
62
|
+
|
|
63
|
+
async def close(self) -> None:
|
|
64
|
+
pass
|
|
File without changes
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
async def execute_temp_runner(target_path: str, selected_func: dict, params: dict):
    """Run one function of a target file in a child process and collect its result.

    A small runner script is generated and executed with the target's
    directory as working directory:

    * ``.py`` targets run through ``python3 -c``,
    * ``.ts`` targets run through ``npx -y tsx --eval``,
    * anything else runs through ``node -e``.

    The runner prints ``VERIFY_RUN_SUCCESS:<json>`` on success or
    ``VERIFY_RUN_ERROR:<message and traceback>`` on failure; those markers are
    parsed out of the child's stdout.

    Args:
        target_path: Path of the file that defines the function.
        selected_func: Function description. ``"name"`` is always read;
            ``"params"`` (list of ``{"name": ...}`` entries giving positional
            order) is read for non-Python targets.
        params: Argument values keyed by parameter name; passed to the child
            as a JSON string.

    Returns:
        A dict with ``successVal`` (JSON text of the return value, or ``""``),
        ``errorVal`` (error text including the traceback/stack lines that
        follow the error marker, or ``""``), ``output`` (raw stdout) and
        ``stderr`` (raw stderr).
    """
    success_marker = "VERIFY_RUN_SUCCESS:"
    error_marker = "VERIFY_RUN_ERROR:"

    target_dir = os.path.dirname(os.path.abspath(target_path))
    target_basename = os.path.basename(target_path)
    module_name, ext = os.path.splitext(target_basename)
    func_name = selected_func["name"]

    params_json_str = json.dumps(params)

    if ext == ".py":
        # Embed values as Python literals so quotes or backslashes in a path
        # or name cannot break the generated source.
        dir_literal = repr(target_dir)
        module_literal = repr(module_name)
        func_literal = repr(func_name)
        python_script = f"""
import sys, json, asyncio, inspect, importlib
sys.path.insert(0, {dir_literal})
try:
    target_func = getattr(importlib.import_module({module_literal}), {func_literal})
except Exception as e:
    print("VERIFY_RUN_ERROR:Load fail: " + str(e))
    sys.exit(1)

params = json.loads(sys.argv[1])
casted = {{}}
for name in inspect.signature(target_func).parameters:
    if name in params: casted[name] = params[name]

try:
    res = target_func(**casted)
    if inspect.iscoroutine(res): res = asyncio.run(res)
    print("VERIFY_RUN_SUCCESS:" + json.dumps(res))
except Exception as e:
    import traceback
    print("VERIFY_RUN_ERROR:" + str(e) + "\\n" + traceback.format_exc())
"""
        cmd = ["python3", "-c", python_script, params_json_str]
    else:
        params_array_str = json.dumps(selected_func["params"])
        # JSON string literals are valid JavaScript string literals.
        module_literal = json.dumps("./" + module_name + ext)
        func_literal = json.dumps(func_name)
        # Handle both ES modules and CommonJS
        # For inline evaluation, we'll try to import dynamically
        ts_script = f"""
async function main() {{
  try {{
    const moduleName = {module_literal};
    const funcName = {func_literal};
    let mod;
    try {{
      mod = await import(moduleName);
    }} catch(e) {{
      mod = require(moduleName);
    }}
    const func = mod[funcName];
    if (!func) throw new Error("Function " + funcName + " not found in module.");

    const params = JSON.parse(process.argv[1]);
    const funcParams = {params_array_str};
    const args = funcParams.map(p => params[p.name]);

    let res = func(...args);
    if (res instanceof Promise) res = await res;
    console.log("VERIFY_RUN_SUCCESS:" + JSON.stringify(res));
  }} catch (err) {{
    console.log("VERIFY_RUN_ERROR:" + (err.stack || err.message));
  }}
}}
main();
"""
        if ext == ".ts":
            cmd = ["npx", "-y", "tsx", "--eval", ts_script, params_json_str]
        else:
            cmd = ["node", "-e", ts_script, params_json_str]

    # NOTE: subprocess.run is synchronous, so this coroutine does not yield to
    # the event loop while the child process is running.
    res = subprocess.run(cmd, cwd=target_dir, capture_output=True, text=True)
    output = res.stdout
    stderr = res.stderr

    success_val = ""
    error_lines = []
    in_error = False
    for line in output.splitlines():
        if line.startswith(success_marker):
            success_val = line[len(success_marker):]
            in_error = False
        elif line.startswith(error_marker):
            # A later error marker replaces an earlier one.
            error_lines = [line[len(error_marker):]]
            in_error = True
        elif in_error:
            # The runners print the traceback/stack on the lines after the
            # marker; keep them so the error is not cut to its first line.
            error_lines.append(line)
    error_val = "\n".join(error_lines).rstrip()

    if not success_val and not error_val:
        if stderr:
            error_val = f"System Error (stderr): {stderr.strip()}"
        else:
            error_val = f"Empty execution result. Output: {output.strip()}"

    return {
        "successVal": success_val,
        "errorVal": error_val,
        "output": output,
        "stderr": stderr,
    }
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import subprocess
|
|
3
|
+
import ast
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def extract_python_functions(filepath: str) -> list[dict]:
    """List the functions and methods defined in a Python source file.

    The file is parsed with ``ast`` (it is not imported or executed). Every
    ``def`` / ``async def`` found anywhere in the tree is reported except
    ``__init__``.

    Args:
        filepath: Path of the Python file to scan.

    Returns:
        A list of dicts ordered by source line, each with:

        * ``name``: the function name,
        * ``params``: ``{"name", "type"}`` entries for the regular positional
          parameters, excluding ``self`` / ``cls``; ``type`` is the annotation
          rendered as text, or ``"any"`` when missing or not recognised,
        * ``code``: the source text from up to five lines above the first
          decorator (or the ``def`` line) through the end of the function,
        * ``line``: 1-based line of the first decorator or the ``def``.

        If the file cannot be read or parsed, the entries collected so far
        (normally none) are returned.
    """
    funcs = []

    def get_ann(n):
        """Render an annotation AST node as a type string."""
        if n is None:
            return "any"
        if isinstance(n, ast.Name):
            return n.id
        if isinstance(n, ast.Constant):
            # None (as in ``int | None``), forward references written as
            # strings, and the ``...`` of ``Tuple[int, ...]``.
            if n.value is None:
                return "None"
            if isinstance(n.value, str):
                return n.value
            if n.value is Ellipsis:
                return "..."
            return repr(n.value)
        if isinstance(n, ast.Subscript):
            # Render the slice node itself. Unwrapping ``.value`` here would
            # also strip nested Subscript/Attribute nodes and truncate types
            # such as ``List[Dict[str, int]]``.
            return f"{get_ann(n.value)}[{get_ann(n.slice)}]"
        if isinstance(n, ast.Attribute):
            return f"{get_ann(n.value)}.{n.attr}"
        if isinstance(n, ast.Tuple):
            return ", ".join(get_ann(e) for e in n.elts)
        if isinstance(n, ast.BinOp) and isinstance(n.op, ast.BitOr):
            return f"{get_ann(n.left)} | {get_ann(n.right)}"
        return "any"

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            src = f.read()
        tree = ast.parse(src)
        lines = src.splitlines()

        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            if node.name == "__init__":
                continue
            # Start at the first decorator so decorators are part of the code.
            start = min((d.lineno for d in node.decorator_list), default=node.lineno)
            end = getattr(node, "end_lineno", len(lines))
            # ``start`` is 1-based, so ``start - 6`` adds five context lines.
            code = "\n".join(lines[max(0, start - 6) : end])
            params = [
                {"name": a.arg, "type": get_ann(a.annotation)}
                for a in node.args.args
                if a.arg not in ("self", "cls")
            ]
            funcs.append(
                {"name": node.name, "params": params, "code": code, "line": start}
            )

        funcs.sort(key=lambda x: x["line"])
    except (OSError, SyntaxError, ValueError, RecursionError):
        # Best effort: an unreadable or unparsable file yields no functions.
        pass
    return funcs
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def extract_ts_functions(filepath: str) -> list[dict]:
    """List the functions declared in a TypeScript/JavaScript source file.

    An inline Node script parses the file with the TypeScript compiler API and
    prints a JSON array. It reports named function declarations and variable
    declarations whose initializer is an arrow function or function
    expression. Each entry has ``name``, ``params`` (``{"name", "type"}``
    items, with ``"any"`` when a parameter has no type) and ``code`` (the
    source from up to five lines above the node through its last line).

    Args:
        filepath: Path of the file to scan.

    Returns:
        The decoded JSON array, or ``[]`` when the child process exits with a
        non-zero status, cannot be started, or prints output that is not
        valid JSON.
    """
    ts_script = """
const fs = require('fs');
const ts = require('typescript');
const content = fs.readFileSync(process.argv[1], 'utf-8');
const sourceFile = ts.createSourceFile(process.argv[1], content, ts.ScriptTarget.Latest, true);
const functions = [];
const lines = content.split(/\\r?\\n/);

function getCode(node) {
  const startPos = sourceFile.getLineAndCharacterOfPosition(node.getStart());
  const endPos = sourceFile.getLineAndCharacterOfPosition(node.getEnd());
  const startLine = Math.max(0, startPos.line - 5);
  return lines.slice(startLine, endPos.line + 1).join('\\n');
}

function getParams(paramsNode) {
  return paramsNode.map(p => {
    let type = 'any';
    if (p.type) type = p.type.getText(sourceFile);
    return { name: p.name.getText(sourceFile), type: type };
  });
}

function visit(node) {
  if (ts.isFunctionDeclaration(node) && node.name) {
    functions.push({
      name: node.name.text,
      params: getParams(node.parameters),
      code: getCode(node)
    });
  } else if (ts.isVariableStatement(node)) {
    for (const decl of node.declarationList.declarations) {
      if (decl.initializer && (ts.isArrowFunction(decl.initializer) || ts.isFunctionExpression(decl.initializer))) {
        if (ts.isIdentifier(decl.name)) {
          functions.push({
            name: decl.name.text,
            params: getParams(decl.initializer.parameters),
            code: getCode(node)
          });
        }
      }
    }
  }
  ts.forEachChild(node, visit);
}

visit(sourceFile);
console.log(JSON.stringify(functions));
"""
    # Use npx -y to ensure typescript is available without prompts
    command = ["npx", "-y", "-p", "typescript", "node", "-e", ts_script, filepath]
    try:
        completed = subprocess.run(command, capture_output=True, text=True)
        if completed.returncode != 0:
            return []
        return json.loads(completed.stdout)
    except Exception:
        # Best effort: any failure to run or decode means "no functions".
        return []
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def extract_functions(filepath: str) -> list[dict]:
    """Extract function descriptions from a source file, choosing the scanner by extension.

    Paths ending in ``.py`` go to ``extract_python_functions``; every other
    path goes to ``extract_ts_functions``.
    """
    scanner = (
        extract_python_functions
        if filepath.endswith(".py")
        else extract_ts_functions
    )
    return scanner(filepath)
|
|
File without changes
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Dict, List, Callable
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class AgentRunner:
    """
    Interfaces with the target AI agent to collect performance data locally via direct execution.
    """

    def __init__(
        self,
        executor_func: Callable = None,
        target_path: str = "",
        selected_func: dict = None,
    ):
        """Store how and what to execute.

        Args:
            executor_func: Awaitable callable invoked as
                ``executor_func(target_path, selected_func, params)``; its
                result is read through ``successVal`` / ``errorVal`` keys.
                When falsy, ``_call_target_agent`` returns ``""``.
            target_path: Passed through to ``executor_func`` unchanged.
            selected_func: Function description passed through to
                ``executor_func``; its ``"params"`` list is also consulted
                when a prompt cannot be parsed as a JSON object.
        """
        self.executor_func = executor_func
        self.target_path = target_path
        self.selected_func = selected_func

    def _params_from_prompt(self, prompt):
        """Turn the last message content into keyword arguments.

        Returns a dict of parameters, or ``None`` when a string prompt is not
        a JSON object and cannot be mapped onto a single ``str`` parameter.
        """
        if isinstance(prompt, dict):
            return prompt
        if not isinstance(prompt, str):
            # Other content types are executed without arguments.
            return {}

        import re

        clean_str = prompt.strip()
        first_brace = clean_str.find("{")
        last_brace = clean_str.rfind("}")
        if first_brace != -1 and last_brace > first_brace:
            # Drop any prose surrounding the outermost braces.
            clean_str = clean_str[first_brace : last_brace + 1]
        # Strip control characters (including raw newlines) before decoding.
        clean_str = re.sub(r"[\x00-\x1F]", "", clean_str)
        try:
            parsed = json.loads(clean_str)
        except (ValueError, RecursionError):
            parsed = None
        # Only a JSON object can serve as parameters; a bare scalar such as
        # "42" or "true" is ordinary text and takes the fallback below.
        if isinstance(parsed, dict):
            return parsed

        fn_params = self.selected_func.get("params", []) if self.selected_func else []
        if len(fn_params) == 1 and fn_params[0].get("type") == "str":
            # A function with a single string parameter receives the raw prompt.
            return {fn_params[0]["name"]: prompt}
        return None

    async def _call_target_agent(self, messages: List[Dict[str, str]]) -> str:
        """Dispatches a multi-turn request to the agent under evaluation.

        Only the ``"content"`` of the last message is used. It is converted to
        keyword arguments and handed to ``executor_func``.

        Returns:
            * ``""`` when ``messages`` is empty or no executor is configured,
            * the executor's ``successVal`` decoded from JSON (or the raw
              string if it is not valid JSON),
            * ``"Error: ..."`` when parameters cannot be derived, the executor
              reports ``errorVal``, or the executor raises,
            * ``"Unknown execution state"`` when the executor reports neither.
        """
        if not messages:
            return ""

        prompt = messages[-1]["content"]

        if not self.executor_func:
            return ""

        # Local Execution
        params = self._params_from_prompt(prompt)
        if params is None:
            return "Error: Failed to parse parameters from test agent"

        try:
            result = await self.executor_func(
                self.target_path, self.selected_func, params
            )
            if result.get("successVal"):
                try:
                    return json.loads(result["successVal"])
                except json.JSONDecodeError:
                    return result["successVal"]
            elif result.get("errorVal"):
                return f"Error: {result['errorVal']}"
            return "Unknown execution state"
        except Exception as e:
            return f"Error: {str(e)}"

    async def close(self):
        """No-op close method to satisfy framework expectation."""
        pass
|
cli/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# cli package
|