@evo-hq/pi-evo 0.4.2-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/extensions/evo/index.js +244 -0
- package/package.json +41 -0
- package/skills/discover/SKILL.md +403 -0
- package/skills/discover/references/constructing-benchmark.md +167 -0
- package/skills/discover/references/inline_instrumentation.js +97 -0
- package/skills/discover/references/inline_instrumentation.py +109 -0
- package/skills/discover/references/proposing-dimensions.md +52 -0
- package/skills/discover/references/sdk_node.js +28 -0
- package/skills/discover/references/sdk_python.py +43 -0
- package/skills/discover/scripts/validate_result.py +55 -0
- package/skills/infra-setup/SKILL.md +87 -0
- package/skills/infra-setup/references/provider-matrix.md +25 -0
- package/skills/optimize/SKILL.md +309 -0
- package/skills/subagent/SKILL.md +281 -0
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inline instrumentation for Node benchmarks. Paste into the benchmark and
|
|
3
|
+
* call logTask() per task + writeResult() once at the end.
|
|
4
|
+
*
|
|
5
|
+
* Contract:
|
|
6
|
+
* - Reads EVO_TRACES_DIR, EVO_EXPERIMENT_ID, EVO_RESULT_PATH from process.env.
|
|
7
|
+
* - Writes traces/task_<id>.json per task.
|
|
8
|
+
* - Writes the final result JSON to EVO_RESULT_PATH, or stdout if unset.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import {
|
|
12
|
+
writeFileSync,
|
|
13
|
+
mkdirSync,
|
|
14
|
+
openSync,
|
|
15
|
+
closeSync,
|
|
16
|
+
renameSync,
|
|
17
|
+
} from "node:fs";
|
|
18
|
+
import { dirname, join } from "node:path";
|
|
19
|
+
|
|
20
|
+
// Harness-provided configuration, read once at import time. An unset or empty
// variable falls back to the default on the right-hand side.
const {
  EVO_TRACES_DIR: tracesDirEnv,
  EVO_EXPERIMENT_ID: experimentIdEnv,
  EVO_RESULT_PATH: resultPathEnv,
} = process.env;
const TRACES_DIR = tracesDirEnv ? tracesDirEnv : null;
const EXPERIMENT_ID = experimentIdEnv ? experimentIdEnv : "unknown";
const RESULT_PATH = resultPathEnv ? resultPathEnv : null;

// Per-run accumulators: task id -> score, and task id -> { direction }.
const SCORES = {};
const TASK_META = {};

// Run start as ISO-8601 UTC with second precision and a "+00:00" suffix
// (the millisecond part and trailing "Z" of toISOString() are replaced).
const STARTED_AT = new Date().toISOString().replace(/\.\d{3}Z$/, "+00:00");

// Create the traces directory up front so per-task writes cannot fail on a
// missing parent.
if (TRACES_DIR !== null) {
  mkdirSync(TRACES_DIR, { recursive: true });
}
|
|
28
|
+
|
|
29
|
+
/**
 * Record the result for one task and, when EVO_TRACES_DIR is set, write its
 * trace file (`task_<id>.json`) immediately.
 *
 * @param {string|number} taskId - Task identifier; coerced with String().
 * @param {number} score - Task score. The trace status is "passed" when
 *   `score >= 0.5` and "failed" otherwise.
 * @param {object} [options]
 * @param {*} [options.summary] - Stored as `summary` in the trace.
 * @param {*} [options.failureReason] - Stored as `failure_reason` in the trace.
 * @param {*} [options.log] - Stored as `log` in the trace.
 * @param {"max"|"min"} [options.direction] - "max" (higher is better,
 *   default) or "min" (lower is better, e.g. latency). Set it only when this
 *   task's direction differs from the benchmark's top-level --metric. It is
 *   recorded in TASK_META, which feeds `tasks_meta` in the final result JSON.
 *   Any other option keys are copied onto the trace verbatim, after the
 *   fields above, so they can override them.
 * @throws {Error} If `direction` is given and is neither "max" nor "min".
 */
export function logTask(taskId, score, { summary, failureReason, log, direction, ...extra } = {}) {
  const id = String(taskId);
  if (direction !== undefined && direction !== "max" && direction !== "min") {
    throw new Error(`direction must be 'max' or 'min', got ${JSON.stringify(direction)}`);
  }
  SCORES[id] = score;
  if (direction !== undefined) TASK_META[id] = { direction };
  if (!TRACES_DIR) return;

  const trace = {
    experiment_id: EXPERIMENT_ID,
    task_id: id,
    status: score >= 0.5 ? "passed" : "failed",
    score,
    ended_at: new Date().toISOString().replace(/\.\d{3}Z$/, "+00:00"),
  };
  if (direction !== undefined) trace.direction = direction;
  if (summary !== undefined) trace.summary = summary;
  if (failureReason !== undefined) trace.failure_reason = failureReason;
  if (log !== undefined) trace.log = log;
  Object.assign(trace, extra);

  // An id such as "suite/case-1" would otherwise be interpreted as a sub-path
  // of TRACES_DIR (ENOENT, or a write outside the directory via ".."), so path
  // separators are flattened in the file name only. The trace body keeps the
  // real id in `task_id`.
  const fileName = `task_${id.replace(/[\\/]/g, "_")}.json`;
  writeFileSync(join(TRACES_DIR, fileName), JSON.stringify(trace, null, 2), "utf-8");
}
|
|
57
|
+
|
|
58
|
+
/**
 * Publish the final result JSON and return the rounded aggregate score.
 *
 * The JSON holds `score`, a copy of all recorded task scores (`tasks`),
 * `started_at`, `ended_at`, and `tasks_meta` when any task recorded a
 * direction. It is written to EVO_RESULT_PATH when set, otherwise to stdout.
 *
 * @param {number|null} [score] - Aggregate score. When omitted (undefined or
 *   null) the mean of all recorded task scores is used, or 0 if none exist.
 * @returns {number} The score rounded to 4 decimal places.
 * @throws {Error} If EVO_RESULT_PATH already exists (only one writeResult()
 *   per attempt); other filesystem errors are rethrown unchanged.
 */
export function writeResult(score) {
  let finalScore = score;
  // `== null` covers both undefined and null, so an explicit null asks for
  // the computed mean instead of being coerced to 0 by the rounding below.
  if (finalScore == null) {
    const values = Object.values(SCORES);
    finalScore =
      values.length === 0 ? 0.0 : values.reduce((sum, value) => sum + value, 0) / values.length;
  }
  finalScore = Math.round(finalScore * 10000) / 10000;

  const result = {
    score: finalScore,
    tasks: { ...SCORES },
    started_at: STARTED_AT,
    ended_at: new Date().toISOString().replace(/\.\d{3}Z$/, "+00:00"),
  };
  if (Object.keys(TASK_META).length > 0) {
    result.tasks_meta = Object.fromEntries(
      Object.entries(TASK_META).map(([k, v]) => [k, { ...v }])
    );
  }
  const payload = JSON.stringify(result, null, 2);

  if (RESULT_PATH) {
    mkdirSync(dirname(RESULT_PATH), { recursive: true });
    // Claim + tmp+rename: duplicate writers fail-fast; crash mid-publish
    // leaves an empty file (caught by load_result) not a partial write.
    try {
      // "wx" creates the file exclusively and fails with EEXIST if present.
      closeSync(openSync(RESULT_PATH, "wx"));
    } catch (e) {
      if (e.code === "EEXIST") {
        throw new Error(
          `${RESULT_PATH} already exists; only one writeResult() per attempt`
        );
      }
      throw e;
    }
    const tmp = RESULT_PATH + ".tmp";
    writeFileSync(tmp, payload, "utf-8");
    renameSync(tmp, RESULT_PATH);
  } else {
    process.stdout.write(payload + "\n");
  }
  return finalScore;
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Inline instrumentation for Python benchmarks. Paste into the benchmark
|
|
2
|
+
and call `log_task()` per task + `write_result()` once at the end.
|
|
3
|
+
|
|
4
|
+
Contract:
|
|
5
|
+
- Reads EVO_TRACES_DIR, EVO_EXPERIMENT_ID, EVO_RESULT_PATH from env.
|
|
6
|
+
- Writes traces/task_<id>.json per task.
|
|
7
|
+
- Writes the final result JSON to EVO_RESULT_PATH, or stdout if unset.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
# Harness-provided configuration, read once at import time.
_traces_dir_env = os.environ.get("EVO_TRACES_DIR")
# Unset or empty EVO_TRACES_DIR disables per-task trace files.
_TRACES_DIR = Path(_traces_dir_env) if _traces_dir_env else None
_EXPERIMENT_ID = os.environ.get("EVO_EXPERIMENT_ID", "unknown")
_RESULT_PATH = os.environ.get("EVO_RESULT_PATH")

# Per-run accumulators: task id -> score, and task id -> {"direction": ...}.
_SCORES: dict[str, float] = {}
_TASK_META: dict[str, dict[str, Any]] = {}

# Run start as an ISO-8601 UTC timestamp with second precision.
_STARTED_AT = datetime.now(timezone.utc).isoformat(timespec="seconds")

# Create the traces directory up front so per-task writes find their parent.
if _TRACES_DIR is not None:
    _TRACES_DIR.mkdir(parents=True, exist_ok=True)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def log_task(
    task_id: str,
    score: float,
    *,
    summary: str | None = None,
    failure_reason: str | None = None,
    log: list[Any] | None = None,
    direction: str | None = None,
    **extra: Any,
) -> None:
    """Record the result for one task. Writes task_<id>.json immediately
    when EVO_TRACES_DIR is set.

    Args:
        task_id: Task identifier; coerced with ``str()``.
        score: Task score. The trace status is "passed" when ``score >= 0.5``
            and "failed" otherwise.
        summary: Stored as ``summary`` in the trace when given.
        failure_reason: Stored as ``failure_reason`` in the trace when given.
        log: Stored as ``log`` in the trace when given.
        direction: "max" (higher is better, default) or "min" (lower is
            better, e.g. latency). Only set it when this task's direction
            differs from the benchmark's top-level ``--metric``. It is
            recorded in ``_TASK_META``, which feeds ``tasks_meta`` in the
            final result JSON for downstream selection strategies.
        **extra: Copied onto the trace verbatim after the fields above, so
            these keys can override them.

    Raises:
        ValueError: If ``direction`` is given and is neither "max" nor "min".
    """
    task_id = str(task_id)
    if direction is not None and direction not in ("max", "min"):
        raise ValueError(f"direction must be 'max' or 'min', got {direction!r}")
    _SCORES[task_id] = score
    if direction is not None:
        _TASK_META[task_id] = {"direction": direction}
    if _TRACES_DIR is None:
        return
    trace: dict[str, Any] = {
        "experiment_id": _EXPERIMENT_ID,
        "task_id": task_id,
        "status": "passed" if score >= 0.5 else "failed",
        "score": score,
        "ended_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    }
    if direction is not None:
        trace["direction"] = direction
    if summary is not None:
        trace["summary"] = summary
    if failure_reason is not None:
        trace["failure_reason"] = failure_reason
    if log is not None:
        trace["log"] = log
    trace.update(extra)
    # An id such as "suite/case-1" would otherwise be interpreted as a
    # sub-path of the traces directory (missing parent, or a write outside the
    # directory via ".."), so path separators are flattened in the file name
    # only. The trace body keeps the real id in "task_id".
    file_name = "task_" + task_id.replace("/", "_").replace("\\", "_") + ".json"
    (_TRACES_DIR / file_name).write_text(
        json.dumps(trace, indent=2), encoding="utf-8"
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def write_result(score: float | None = None) -> float:
    """Publish the final result JSON and hand back the aggregate score.

    The JSON goes to $EVO_RESULT_PATH when that is set, otherwise to stdout.
    When *score* is omitted, the mean of all recorded task scores is used
    (0.0 if none were recorded). The score is rounded to 4 decimal places and
    returned so callers can gate on --min-score without recomputing it.

    Raises RuntimeError if $EVO_RESULT_PATH already exists.
    """
    if score is None:
        if _SCORES:
            score = sum(_SCORES.values()) / len(_SCORES)
        else:
            score = 0.0
    score = round(score, 4)

    result = {
        "score": score,
        "tasks": dict(_SCORES),
        "started_at": _STARTED_AT,
        "ended_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    }
    if _TASK_META:
        # Copy each per-task dict so the payload does not alias module state.
        result["tasks_meta"] = {
            task_id: dict(meta) for task_id, meta in _TASK_META.items()
        }
    payload = json.dumps(result, indent=2)

    if not _RESULT_PATH:
        print(payload)
        return score

    target = Path(_RESULT_PATH)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Claim + tmp+rename: duplicate writers fail-fast; crash mid-publish
    # leaves an empty file (caught by load_result) not a partial write.
    try:
        claim_fd = os.open(target, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.close(claim_fd)
    except FileExistsError:
        raise RuntimeError(
            f"{target} already exists; only one write_result() per attempt"
        ) from None
    staging = target.with_name(target.name + ".tmp")
    staging.write_text(payload, encoding="utf-8")
    os.replace(staging, target)
    return score
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Proposing unexplored optimization dimensions
|
|
2
|
+
|
|
3
|
+
Used only when the benchmark isn't obvious — no existing eval, ambiguous user intent, or the existing eval covers a narrow slice while the interesting optimization sits elsewhere. If the right benchmark *is* obvious, use it and skip this exercise.
|
|
4
|
+
|
|
5
|
+
When this step does run, the goal is to propose a handful of dimensions for this repo that aren't already measured. Existing benchmarks cover what the authors already worried about; that's where slack is lowest.
|
|
6
|
+
|
|
7
|
+
## Where to look
|
|
8
|
+
|
|
9
|
+
1. **Already-instrumented code.** Grep for `time.`, `perf_counter`, `@profile`, `Counter(`, `metrics.`. What's tracked hints at what authors cared about; what isn't is where slack lives.
|
|
10
|
+
2. **Stated goals.** READMEs, module docstrings, and comments often name what the project values ("fast JSON parsing", "robust against malformed input"). If a stated goal isn't measured, that's a proposal.
|
|
11
|
+
3. **Author pain points.** Grep for `TODO`, `FIXME`, `XXX`, `HACK`. Check the issue tracker if accessible.
|
|
12
|
+
4. **Project-type defaults.** The table below, as a starting point.
|
|
13
|
+
|
|
14
|
+
## Ranking
|
|
15
|
+
|
|
16
|
+
For each candidate, answer three questions honestly in prose. No scores — a 1-5 slack rating from an LLM is a vibe, not a measurement.
|
|
17
|
+
|
|
18
|
+
- **Signal.** Does moving this metric actually correlate with "the project is better"? Or is it a proxy that could drift from what the user cares about?
|
|
19
|
+
- **Slack.** Has anyone hill-climbed this before in this repo? Is there plausibly room to improve, or is the current value already near a floor/ceiling?
|
|
20
|
+
- **Cost per run.** How long and how expensive is one benchmark run? The optimization loop runs many — expensive dimensions compound into real time and money.
|
|
21
|
+
|
|
22
|
+
Rank on a combined judgment of those three. Construction effort (the one-time cost of building the harness) is not a ranking input — flag it qualitatively when presenting and let the user weigh it.
|
|
23
|
+
|
|
24
|
+
## Project-type defaults
|
|
25
|
+
|
|
26
|
+
Start with the obvious column, then look hard at the non-obvious column.
|
|
27
|
+
|
|
28
|
+
| Project type | Obvious (often already done) | Non-obvious (usually unexplored) |
|
|
29
|
+
|---|---|---|
|
|
30
|
+
| LLM / agent | Task pass rate on a benchmark | Token efficiency per correct answer, calibration error, refusal rate on ambiguous tasks, behavior under prompt injection, latency per tool call, recovery from tool errors |
|
|
31
|
+
| Web API / backend | Test pass rate, integration tests | p99 latency on hot endpoints, memory per request, error rate under synthetic load, cold-start time, allocation count per request |
|
|
32
|
+
| ML training | Validation accuracy, loss | Sample efficiency (accuracy per 1k tokens seen), robustness to input perturbations, generalization gap, inference memory, convergence speed |
|
|
33
|
+
| Library / SDK | API tests passing | Import time, allocation count per call, TypeScript strict-mode coverage, docs coverage, cold-import latency, binary size |
|
|
34
|
+
| Compiler / DSL | Correctness on standard suite | Output code size, compile time, optimization quality on standard benchmarks, error message quality (LLM-as-judge), stack trace usefulness |
|
|
35
|
+
| Data pipeline | End-to-end correctness | Throughput (rows/sec), memory peak per batch, late-data handling, schema-drift resilience, idempotency under replay |
|
|
36
|
+
| CLI tool | Unit tests | Cold-start time, memory footprint, output stability across runs, exit-code correctness on edge inputs, help-text discoverability |
|
|
37
|
+
| RAG / retrieval | Recall@K | Embedding cost per indexed doc, query latency p99, answer grounding rate (% of claims traceable to source), robustness to paraphrased queries |
|
|
38
|
+
|
|
39
|
+
## Presenting to the user
|
|
40
|
+
|
|
41
|
+
For each ranked dimension include:
|
|
42
|
+
|
|
43
|
+
- **What it measures** (one sentence)
|
|
44
|
+
- **Why it matters for this project** (tied to what the repo actually does, not generic)
|
|
45
|
+
- **Construction complexity**: *None* (existing eval already produces this score) / *Minor* (wrap or instrument what exists) / *Substantial* (new test cases, scoring logic, or data)
|
|
46
|
+
- **Existing coverage** if any
|
|
47
|
+
|
|
48
|
+
Recommend the highest-ranked dimension whose construction is *None* or *Minor*. If every top pick is *Substantial*, say so and let the user decide whether the signal is worth the work.
|
|
49
|
+
|
|
50
|
+
## Non-picked dimensions
|
|
51
|
+
|
|
52
|
+
Save unused dimensions to `.evo/project.md` under a "Future experiment candidates" section — useful when the first dimension plateaus.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
// Node SDK usage example. Install: `npm install @evo-hq/evo-agent`.
//
// The SDK auto-reads $EVO_TRACES_DIR, $EVO_EXPERIMENT_ID, and
// $EVO_RESULT_PATH. Traces flush on each report() so the dashboard can
// stream progress live.
//
// NOTE: `tasks`, `criticalTasks`, and `evaluate` are not defined in this
// snippet; they stand for the benchmark's own task list and scoring
// function. The snippet uses top-level `await`, so it must run as an ES
// module.

import { Run, Gate } from '@evo-hq/evo-agent';

// ---- Benchmark run ----

const run = new Run();
for (const task of tasks) {
  const result = await evaluate(task);
  // Attach the raw output to the task's trace, then record its score.
  run.log(task.id, { output: result.output });
  run.report(task.id, { score: result.score });
}
await run.finish();
// finish(): writes score JSON to $EVO_RESULT_PATH (or stdout if unset)
// and one task_<id>.json per task under $EVO_TRACES_DIR.

// ---- Gate (exits 0 all-pass / 1 any-fail) ----

const gate = new Gate();
for (const task of criticalTasks) {
  const result = await evaluate(task);
  // One check per critical task; the pass/fail outcome is settled in finish().
  gate.check(task.id, { score: result.score });
}
await gate.finish();
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Python SDK usage examples.

Install `evo-hq-agent` with this project's package manager/runtime, for example
`uv add --dev evo-hq-agent` or `python -m pip install evo-hq-agent`.

The SDK auto-reads $EVO_TRACES_DIR, $EVO_EXPERIMENT_ID, and $EVO_RESULT_PATH.
Traces flush on each report() so the dashboard can stream progress live.

NOTE: `tasks`, `critical_tasks`, `agent`, and `evaluate` are not defined in
this snippet; they stand for the benchmark's own task lists, system under
test, and scoring function.
"""

from evo_agent import Run, Gate


# ---- Benchmark run ----

run = Run()
try:
    for task in tasks:
        run.log(task["id"], "starting task")
        try:
            result = evaluate(task, agent)
            run.log(task["id"], {"output": result.output})
            run.report(
                task["id"],
                score=result.score,
                summary=f"reward={result.score:.2f}",
                failure_reason=None if result.passed else "task_failed",
            )
        except Exception as exc:
            # A failing task is recorded with score 0.0 instead of aborting
            # the remaining tasks.
            run.log(task["id"], {"error": repr(exc)})
            run.report(task["id"], score=0.0, failure_reason="exception")
finally:
    # Runs even if the loop raises, so finish() is always called.
    run.finish()
# finish() writes score JSON to $EVO_RESULT_PATH (or stdout if unset) and one
# task_<id>.json per task under $EVO_TRACES_DIR. Catch expected per-task errors;
# an uncaught exception before finish() means evo correctly sees a crashed run.


# ---- Gate (exits 0 all-pass / 1 any-fail) ----

# Gate is used as a context manager here; presumably leaving the block
# finalizes the gate and sets the exit status — confirm against the SDK docs.
with Gate() as gate:
    for task in critical_tasks:
        result = evaluate(task, agent)
        gate.check(task["id"], score=result.score)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Validate a benchmark result file.
|
|
3
|
+
|
|
4
|
+
Usage: python3 validate_result.py <path-to-result.json>
|
|
5
|
+
|
|
6
|
+
Exits 0 if the file exists, is non-empty, and is a JSON object with a
|
|
7
|
+
numeric 'score'. Exits 1 with a diagnostic on stderr otherwise.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main() -> int:
    """Validate the result file named by the single command-line argument.

    Checks, in order: exactly one argument was given; the path exists; the
    file is non-empty; it can be read as UTF-8 and parsed as JSON; the parsed
    value is an object; it has a 'score' key; the score converts with
    ``float()`` and is finite.

    Returns:
        0 when every check passes, 1 otherwise. All messages (including the
        final "OK" line) go to stderr.
    """
    import math  # local import keeps this function self-contained

    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} <result.json>", file=sys.stderr)
        return 1

    path = Path(sys.argv[1])

    if not path.exists():
        print(f"FAIL: {path} does not exist", file=sys.stderr)
        return 1

    if path.stat().st_size == 0:
        print(f"FAIL: {path} is empty", file=sys.stderr)
        return 1

    try:
        obj = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        print(f"FAIL: {path} is not valid JSON: {exc}", file=sys.stderr)
        return 1
    except (OSError, UnicodeDecodeError) as exc:
        # Directories, permission errors and non-UTF-8 bytes would otherwise
        # surface as a traceback instead of the promised diagnostic.
        print(f"FAIL: {path} could not be read: {exc}", file=sys.stderr)
        return 1

    if not isinstance(obj, dict):
        print(f"FAIL: expected JSON object, got {type(obj).__name__}", file=sys.stderr)
        return 1

    if "score" not in obj:
        print(f"FAIL: missing 'score' field. Keys: {list(obj.keys())}", file=sys.stderr)
        return 1

    try:
        score = float(obj["score"])
    except (TypeError, ValueError):
        print(f"FAIL: 'score' is not numeric: {obj['score']!r}", file=sys.stderr)
        return 1

    # json.loads accepts the non-standard NaN/Infinity literals; such a score
    # is not a usable number and strict JSON parsers reject the file outright.
    if not math.isfinite(score):
        print(f"FAIL: 'score' is not a finite number: {obj['score']!r}", file=sys.stderr)
        return 1

    print(f"OK: {path}, score = {score}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: infra-setup
|
|
3
|
+
description: Non-user-invocable provider/setup reference for evo backend switching, prerequisite checks, and auth/install guidance.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Infra Setup
|
|
8
|
+
|
|
9
|
+
Use this when the user wants to change where experiments run: local worktrees, pool slots, or a remote provider such as Modal, E2B, Daytona, AWS, Azure, SSH, manual, or a custom dotted-path provider.
|
|
10
|
+
|
|
11
|
+
## Goals
|
|
12
|
+
|
|
13
|
+
- Be explicit about the target backend/provider.
|
|
14
|
+
- Check prerequisites before mutating evo config.
|
|
15
|
+
- Never install provider SDKs silently.
|
|
16
|
+
- Give one actionable auth command per provider.
|
|
17
|
+
- Keep provider credentials separate from benchmark runtime env.
|
|
18
|
+
|
|
19
|
+
## Flow
|
|
20
|
+
|
|
21
|
+
1. Identify the target:
|
|
22
|
+
- `worktree` or `pool` means local backends.
|
|
23
|
+
- `modal`, `e2b`, `ssh:...`, or another remote spec means `backend=remote`.
|
|
24
|
+
2. If the target is remote, parse the provider choice the same way evo CLI does:
|
|
25
|
+
- `modal`
|
|
26
|
+
- `e2b`
|
|
27
|
+
- `daytona`
|
|
28
|
+
- `aws`
|
|
29
|
+
- `azure`
|
|
30
|
+
- `manual`
|
|
31
|
+
- `ssh:user@host[:port]`
|
|
32
|
+
- another built-in provider name
|
|
33
|
+
- dotted import path for a custom provider
|
|
34
|
+
3. Check whether `evo` is on PATH and whether it is the expected `evo-hq-cli` package (`evo --version`). If the provider SDK is missing, evo's provider loader prints the provider-specific extra or SDK package to install; use that message rather than guessing.
|
|
35
|
+
4. For SDK-backed providers, verify the SDK import only when you can run the check in the same environment that owns the `evo` executable. If missing, ask the user before installing it.
|
|
36
|
+
- If `evo` was installed with `uv tool` or `pip`/`venv`, prefer the matching extra on `evo-hq-cli`:
|
|
37
|
+
- `uv-tool`: `uv tool install --reinstall 'evo-hq-cli[<provider-extra>]'`
|
|
38
|
+
- `venv` / `pip`: `python -m pip install 'evo-hq-cli[<provider-extra>]'`
|
|
39
|
+
- If `evo` was installed with `pipx`, inject the provider SDK into the same `evo-hq-cli` environment:
|
|
40
|
+
- `pipx`: `pipx inject evo-hq-cli <provider-sdk>`
|
|
41
|
+
5. Check auth and show exactly one provider-specific auth command or setup step. Use `references/provider-matrix.md`.
|
|
42
|
+
6. Once prerequisites are satisfied, run the explicit config command:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
evo config backend remote --provider <provider> --provider-config ...
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or for local backends:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
evo config backend worktree
|
|
52
|
+
evo config backend pool --workspaces /abs/slot-a,/abs/slot-b
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
7. Be explicit that incomplete provider setup usually surfaces on
|
|
56
|
+
`evo new --remote <provider> ...`, because that is where remote
|
|
57
|
+
allocation and bootstrap actually happen.
|
|
58
|
+
8. If the benchmark itself needs application keys, configure runtime env
|
|
59
|
+
separately with `evo env load <path> --all` or
|
|
60
|
+
`evo env load <path> --allow KEY1,KEY2`. Provider auth provisions the
|
|
61
|
+
sandbox; runtime env is what benchmark/gate processes see.
|
|
62
|
+
|
|
63
|
+
## Prerequisites
|
|
64
|
+
|
|
65
|
+
Before trying to switch a workspace to a remote provider, confirm the basics:
|
|
66
|
+
|
|
67
|
+
- the target backend is clear from the user's request; only ask if the
|
|
68
|
+
intent is genuinely ambiguous between `worktree`, `pool`, and `remote`
|
|
69
|
+
- the machine running evo has the right provider SDK or transport installed
|
|
70
|
+
- the user has auth for that provider available now, not "somewhere else"
|
|
71
|
+
- the provider-specific minimum config exists
|
|
72
|
+
- `modal`: auth + optional config
|
|
73
|
+
- `e2b`: API key + optional config
|
|
74
|
+
- `daytona`: API key and API URL/target if needed
|
|
75
|
+
- `aws`: creds, region, image, SSH key pair/private key, and usually network config
|
|
76
|
+
- `azure`: subscription, resource group, region, SSH key/private key, and VM/image choices
|
|
77
|
+
- `ssh`: reachable host, working SSH user, and key/port if needed
|
|
78
|
+
- `manual`: reachable remote endpoint URL and bearer token
|
|
79
|
+
- for SSH-backed VM providers, the guest assumptions are plausible before allocation:
|
|
80
|
+
- the image enables SSH
|
|
81
|
+
- the SSH user matches the image
|
|
82
|
+
- the image architecture matches the selected instance type
|
|
83
|
+
- the host can run evo's remote workspace runtime
|
|
84
|
+
|
|
85
|
+
## Provider notes
|
|
86
|
+
|
|
87
|
+
See `references/provider-matrix.md` for the compact provider summary, common config, and provider-specific setup/auth command.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
## Provider Matrix
|
|
2
|
+
|
|
3
|
+
Use this as the compact summary. This is setup guidance, not a runtime dependency list for evo itself.
|
|
4
|
+
|
|
5
|
+
| Provider | What evo uses at runtime | Setup / auth | Common config |
|
|
6
|
+
|---|---|---|---|
|
|
7
|
+
| `modal` | Modal Python SDK | If missing, install `evo-hq-cli[modal]` (or inject `modal` with `pipx`); then run `modal token new` | `app_name`, `gpu`, `region`, `timeout_seconds`, `health_timeout_seconds`, `apt_install`, `pip_install` |
|
|
8
|
+
| `e2b` | E2B Python SDK | If missing, install `evo-hq-cli[e2b]` (or inject `e2b` with `pipx`); then `export E2B_API_KEY=...` | `template`, `api_key`, `domain`, `root`, `timeout_seconds`, `health_timeout_seconds`, `allow_internet_access`, `secure` |
|
|
9
|
+
| `daytona` | Daytona Python SDK | If missing, install `evo-hq-cli[daytona]` (or inject `daytona` with `pipx`); then `export DAYTONA_API_KEY=...` | `api_key`, `api_url`, `target`, `timeout_seconds`, `health_timeout_seconds`, `ssh_host`, `ssh_port`, `ssh_token_ttl_minutes`, `sandbox_timeout_seconds` |
|
|
10
|
+
| `aws` | `boto3` | If missing, install `evo-hq-cli[aws]` (or inject `boto3` with `pipx`); then export AWS creds and region | `region`, `image_id`, `key_name`, `key`, `instance_type`, `subnet_id`, `security_group_ids`, `ssh_user`, `ssh_port`, `timeout_seconds`, `health_timeout_seconds`, `keep_warm` |
|
|
11
|
+
| `azure` | Azure Python SDK (`azure-identity`, `azure-mgmt-resource`, `azure-mgmt-network`, `azure-mgmt-compute`) | If missing, install `evo-hq-cli[azure]`; then use `az login` or Azure env creds, and provide subscription/resource-group config | `subscription_id`, `resource_group`, `location`, `vm_size`, `image`, `key`, `ssh_public_key`, `ssh_user`, `ssh_cidr`, `vnet_cidr`, `subnet_cidr`, `ssh_port`, `timeout_seconds`, `health_timeout_seconds`, `keep_warm` |
|
|
12
|
+
| `ssh` | local `ssh` transport | `ssh user@host` must work first; then add `-i` / `-p` if needed | `host`, `key`, `port`, `tunnel_port`, `keep_warm`, `health_timeout_seconds` |
|
|
13
|
+
| `manual` | existing remote workspace endpoint | no provisioning; only ask for URL/token if the user explicitly wants manual mode | `base_url`, `bearer_token`, `workspace_root`, `bundle_dir` |
|
|
14
|
+
|
|
15
|
+
Notes:
|
|
16
|
+
- `evo` runtime uses the provider SDK or transport listed in the second column.
|
|
17
|
+
- The `evo-hq-cli[<provider>]` extras are the preferred install path when the provider SDK is missing.
|
|
18
|
+
- Provider auth/setup is operator guidance. It is not the same thing as evo's runtime dependency surface.
|
|
19
|
+
- Common failures are usually one of: missing SDK import, missing auth state/env var, unreachable host/port, or provider-specific bootstrap mismatch.
|
|
20
|
+
- Incomplete provider setup usually surfaces on `evo new --remote <provider> ...`, because that is where remote allocation and bootstrap actually happen.
|
|
21
|
+
- For SSH-backed VM providers, also validate the guest assumptions:
|
|
22
|
+
- the instance image has SSH enabled
|
|
23
|
+
- the SSH user matches the image
|
|
24
|
+
- the image architecture matches the selected instance type
|
|
25
|
+
- the remote host can run evo's remote workspace runtime
|