@evo-hq/pi-evo 0.4.2-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,97 @@
1
+ /**
2
+ * Inline instrumentation for Node benchmarks. Paste into the benchmark and
3
+ * call logTask() per task + writeResult() once at the end.
4
+ *
5
+ * Contract:
6
+ * - Reads EVO_TRACES_DIR, EVO_EXPERIMENT_ID, EVO_RESULT_PATH from process.env.
7
+ * - Writes $EVO_TRACES_DIR/task_<id>.json per task (skipped when EVO_TRACES_DIR is unset).
8
+ * - Writes the final result JSON to EVO_RESULT_PATH, or stdout if unset.
9
+ */
10
+
11
+ import {
12
+ writeFileSync,
13
+ mkdirSync,
14
+ openSync,
15
+ closeSync,
16
+ renameSync,
17
+ } from "node:fs";
18
+ import { dirname, join } from "node:path";
19
+
20
// Configuration is taken from the environment once, at module load. An empty
// value is treated the same as an unset one.
const {
  EVO_TRACES_DIR: tracesDirEnv,
  EVO_EXPERIMENT_ID: experimentIdEnv,
  EVO_RESULT_PATH: resultPathEnv,
} = process.env;
const TRACES_DIR = tracesDirEnv ? tracesDirEnv : null;
const EXPERIMENT_ID = experimentIdEnv ? experimentIdEnv : "unknown";
const RESULT_PATH = resultPathEnv ? resultPathEnv : null;

// Accumulators filled by logTask() and read back by writeResult().
const SCORES = {};
const TASK_META = {};

// UTC start time: ISO-8601 with the milliseconds dropped and the trailing "Z"
// spelled out as "+00:00".
const STARTED_AT = new Date().toISOString().replace(/\.\d{3}Z$/, "+00:00");

if (TRACES_DIR) {
  mkdirSync(TRACES_DIR, { recursive: true });
}
28
+
29
/**
 * Record the result for one task and, when EVO_TRACES_DIR is set, write its
 * trace file immediately.
 *
 * `direction` is "max" (higher is better, default) or "min" (lower is better,
 * e.g. latency). Set it only when this task's direction differs from the
 * benchmark's top-level --metric. Propagates to tasks_meta in the final
 * result JSON.
 *
 * @param taskId Task identifier; coerced to a string.
 * @param score Task score, stored as-is. The trace status is "passed" when
 *   `score >= 0.5` and "failed" otherwise.
 * @param options Optional trace fields: `summary`, `failureReason` (written
 *   as `failure_reason`), `log`, `direction`. Any other keys are copied onto
 *   the trace verbatim and may override the standard fields.
 * @throws {Error} When `direction` is given and is neither "max" nor "min".
 */
export function logTask(taskId, score, { summary, failureReason, log, direction, ...extra } = {}) {
  taskId = String(taskId);
  if (direction !== undefined && direction !== "max" && direction !== "min") {
    throw new Error(`direction must be 'max' or 'min', got ${JSON.stringify(direction)}`);
  }
  SCORES[taskId] = score;
  if (direction !== undefined) TASK_META[taskId] = { direction };
  if (!TRACES_DIR) return;
  const trace = {
    experiment_id: EXPERIMENT_ID,
    task_id: taskId,
    status: score >= 0.5 ? "passed" : "failed",
    score,
    ended_at: new Date().toISOString().replace(/\.\d{3}Z$/, "+00:00"),
  };
  if (direction !== undefined) trace.direction = direction;
  if (summary !== undefined) trace.summary = summary;
  if (failureReason !== undefined) trace.failure_reason = failureReason;
  if (log !== undefined) trace.log = log;
  Object.assign(trace, extra);
  // A task id such as "suite/case-1" would otherwise be interpreted as a
  // nested path below TRACES_DIR and fail with ENOENT. Only the file name is
  // sanitized; `task_id` inside the trace keeps the original identifier.
  // NOTE(review): confirm trace consumers key on the JSON `task_id` field
  // rather than on the file name.
  const separators = process.platform === "win32" ? /[\\/]/g : /\//g;
  const fileName = `task_${taskId.replace(separators, "_")}.json`;
  writeFileSync(join(TRACES_DIR, fileName), JSON.stringify(trace, null, 2), "utf-8");
}
57
+
58
/**
 * Publish the final result JSON and return the aggregate score.
 *
 * When `score` is omitted (undefined or null) it defaults to the mean of all
 * scores recorded through logTask(), or 0 when nothing was logged. The score
 * is rounded to four decimal places before it is written and returned.
 *
 * The JSON is written to EVO_RESULT_PATH when that variable is set, otherwise
 * to stdout followed by a newline.
 *
 * @param score Optional explicit aggregate score.
 * @returns The rounded score that was published.
 * @throws {Error} When the result file already exists (only one
 *   writeResult() per attempt); other filesystem errors are rethrown as-is.
 */
export function writeResult(score) {
  const ids = Object.keys(SCORES);
  // `== null` matches both undefined and null, so an explicit null falls back
  // to the mean instead of being coerced to 0 by the rounding below.
  if (score == null) {
    score = ids.length === 0 ? 0.0 : ids.reduce((a, id) => a + SCORES[id], 0) / ids.length;
  }
  score = Math.round(score * 10000) / 10000;
  const result = {
    score,
    tasks: { ...SCORES },
    started_at: STARTED_AT,
    ended_at: new Date().toISOString().replace(/\.\d{3}Z$/, "+00:00"),
  };
  if (Object.keys(TASK_META).length > 0) {
    result.tasks_meta = Object.fromEntries(
      Object.entries(TASK_META).map(([k, v]) => [k, { ...v }])
    );
  }
  const payload = JSON.stringify(result, null, 2);
  if (RESULT_PATH) {
    mkdirSync(dirname(RESULT_PATH), { recursive: true });
    // Claim + tmp+rename: duplicate writers fail-fast; crash mid-publish
    // leaves an empty file (caught by load_result) not a partial write.
    try {
      // "wx" creates the file exclusively and fails with EEXIST if present.
      closeSync(openSync(RESULT_PATH, "wx"));
    } catch (e) {
      if (e.code === "EEXIST") {
        throw new Error(
          `${RESULT_PATH} already exists; only one writeResult() per attempt`
        );
      }
      throw e;
    }
    const tmp = RESULT_PATH + ".tmp";
    writeFileSync(tmp, payload, "utf-8");
    renameSync(tmp, RESULT_PATH);
  } else {
    process.stdout.write(payload + "\n");
  }
  return score;
}
@@ -0,0 +1,109 @@
1
+ """Inline instrumentation for Python benchmarks. Paste into the benchmark
2
+ and call `log_task()` per task + `write_result()` once at the end.
3
+
4
+ Contract:
5
+ - Reads EVO_TRACES_DIR, EVO_EXPERIMENT_ID, EVO_RESULT_PATH from env.
6
+ - Writes $EVO_TRACES_DIR/task_<id>.json per task (skipped when EVO_TRACES_DIR is unset).
7
+ - Writes the final result JSON to EVO_RESULT_PATH, or stdout if unset.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import os
14
+ import sys
15
+ from datetime import datetime, timezone
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
# Environment-driven configuration, read once at import time. An empty
# EVO_TRACES_DIR is treated as unset.
_traces_env = os.environ.get("EVO_TRACES_DIR")
_TRACES_DIR = Path(_traces_env) if _traces_env else None
_EXPERIMENT_ID = os.environ.get("EVO_EXPERIMENT_ID", "unknown")
_RESULT_PATH = os.environ.get("EVO_RESULT_PATH")

# Accumulators filled by log_task() and read back by write_result().
_SCORES: dict[str, float] = {}
_TASK_META: dict[str, dict[str, Any]] = {}

# UTC start time at one-second resolution.
_STARTED_AT = datetime.now(timezone.utc).isoformat(timespec="seconds")

if _TRACES_DIR is not None:
    _TRACES_DIR.mkdir(parents=True, exist_ok=True)
28
+
29
+
30
def log_task(
    task_id: str,
    score: float,
    *,
    summary: str | None = None,
    failure_reason: str | None = None,
    log: list[Any] | None = None,
    direction: str | None = None,
    **extra: Any,
) -> None:
    """Record the result for one task.

    The score is always stored for write_result(). When EVO_TRACES_DIR is
    set, a task_<id>.json trace is also written immediately.

    *direction* is "max" (higher is better, default) or "min" (lower is
    better, e.g. latency). Only set it when this task's direction differs
    from the benchmark's top-level `--metric`. Propagates to `tasks_meta`
    in the final result JSON for downstream selection strategies.

    Args:
        task_id: Task identifier; coerced to ``str``.
        score: Task score, stored as-is. The trace status is "passed" when
            ``score >= 0.5`` and "failed" otherwise.
        summary: Optional text stored under ``summary`` in the trace.
        failure_reason: Optional text stored under ``failure_reason``.
        log: Optional list stored under ``log``.
        direction: Optional "max" or "min".
        **extra: Additional keys merged into the trace last, so they may
            override the standard fields.

    Raises:
        ValueError: If *direction* is given and is neither "max" nor "min".
    """
    task_id = str(task_id)
    if direction is not None and direction not in ("max", "min"):
        raise ValueError(f"direction must be 'max' or 'min', got {direction!r}")
    _SCORES[task_id] = score
    if direction is not None:
        _TASK_META[task_id] = {"direction": direction}
    if _TRACES_DIR is None:
        return
    trace: dict[str, Any] = {
        "experiment_id": _EXPERIMENT_ID,
        "task_id": task_id,
        "status": "passed" if score >= 0.5 else "failed",
        "score": score,
        "ended_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    }
    if direction is not None:
        trace["direction"] = direction
    if summary is not None:
        trace["summary"] = summary
    if failure_reason is not None:
        trace["failure_reason"] = failure_reason
    if log is not None:
        trace["log"] = log
    trace.update(extra)
    # A task id such as "suite/case-1" would otherwise be interpreted as a
    # nested path below the traces directory and fail with FileNotFoundError.
    # Only the file name is sanitized; "task_id" inside the trace keeps the
    # original identifier.
    # NOTE(review): confirm trace consumers key on the JSON "task_id" field
    # rather than on the file name.
    file_stem = task_id
    for separator in (os.sep, os.altsep):
        if separator:
            file_stem = file_stem.replace(separator, "_")
    (_TRACES_DIR / f"task_{file_stem}.json").write_text(
        json.dumps(trace, indent=2), encoding="utf-8"
    )
74
+
75
+
76
def write_result(score: float | None = None) -> float:
    """Publish the final score JSON and hand the score back to the caller.

    The JSON goes to $EVO_RESULT_PATH, or to stdout when that variable is
    unset. When *score* is omitted it is the mean of the logged task scores
    (0.0 if none were logged). The value is rounded to four decimals, and
    returning it lets callers gate on --min-score without recomputing the
    aggregate.
    """
    if score is None:
        if _SCORES:
            score = sum(_SCORES.values()) / len(_SCORES)
        else:
            score = 0.0
    final_score = round(score, 4)

    report = {
        "score": final_score,
        "tasks": dict(_SCORES),
        "started_at": _STARTED_AT,
        "ended_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    }
    if _TASK_META:
        report["tasks_meta"] = {
            task_id: dict(meta) for task_id, meta in _TASK_META.items()
        }
    serialized = json.dumps(report, indent=2)

    if not _RESULT_PATH:
        print(serialized)
        return final_score

    destination = Path(_RESULT_PATH)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Claim the path first, then stage to a sibling .tmp file and rename:
    # a duplicate writer fails fast, and a crash mid-publish leaves an empty
    # file (caught by load_result) rather than a partial write.
    try:
        claim_fd = os.open(destination, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        raise RuntimeError(
            f"{destination} already exists; only one write_result() per attempt"
        ) from None
    os.close(claim_fd)
    staging = destination.with_name(destination.name + ".tmp")
    staging.write_text(serialized, encoding="utf-8")
    os.replace(staging, destination)
    return final_score
@@ -0,0 +1,52 @@
1
+ # Proposing unexplored optimization dimensions
2
+
3
+ Used only when the benchmark isn't obvious — no existing eval, ambiguous user intent, or the existing eval covers a narrow slice while the interesting optimization sits elsewhere. If the right benchmark *is* obvious, use it and skip this exercise.
4
+
5
+ When this step does run, the goal is to propose a handful of dimensions for this repo that aren't already measured. Existing benchmarks cover what the authors already worried about; that's where slack is lowest.
6
+
7
+ ## Where to look
8
+
9
+ 1. **Already-instrumented code.** Grep for `time.`, `perf_counter`, `@profile`, `Counter(`, `metrics.`. What's tracked hints at what authors cared about; what isn't is where slack lives.
10
+ 2. **Stated goals.** READMEs, module docstrings, and comments often name what the project values ("fast JSON parsing", "robust against malformed input"). If a stated goal isn't measured, that's a proposal.
11
+ 3. **Author pain points.** Grep for `TODO`, `FIXME`, `XXX`, `HACK`. Check the issue tracker if accessible.
12
+ 4. **Project-type defaults.** The table below, as a starting point.
13
+
14
+ ## Ranking
15
+
16
+ For each candidate, answer three questions honestly in prose. No scores — a 1-5 slack rating from an LLM is a vibe, not a measurement.
17
+
18
+ - **Signal.** Does moving this metric actually correlate with "the project is better"? Or is it a proxy that could drift from what the user cares about?
19
+ - **Slack.** Has anyone hill-climbed this before in this repo? Is there plausibly room to improve, or is the current value already near a floor/ceiling?
20
+ - **Cost per run.** How long and how expensive is one benchmark run? The optimization loop runs many — expensive dimensions compound into real time and money.
21
+
22
+ Rank on a combined judgment of those three. Construction effort (the one-time cost of building the harness) is not a ranking input — flag it qualitatively when presenting, let the user weigh it.
23
+
24
+ ## Project-type defaults
25
+
26
+ Start with the obvious column, then look hard at the non-obvious column.
27
+
28
+ | Project type | Obvious (often already done) | Non-obvious (usually unexplored) |
29
+ |---|---|---|
30
+ | LLM / agent | Task pass rate on a benchmark | Token efficiency per correct answer, calibration error, refusal rate on ambiguous tasks, behavior under prompt injection, latency per tool call, recovery from tool errors |
31
+ | Web API / backend | Test pass rate, integration tests | p99 latency on hot endpoints, memory per request, error rate under synthetic load, cold-start time, allocation count per request |
32
+ | ML training | Validation accuracy, loss | Sample efficiency (accuracy per 1k tokens seen), robustness to input perturbations, generalization gap, inference memory, convergence speed |
33
+ | Library / SDK | API tests passing | Import time, allocation count per call, TypeScript strict-mode coverage, docs coverage, cold-import latency, binary size |
34
+ | Compiler / DSL | Correctness on standard suite | Output code size, compile time, optimization quality on standard benchmarks, error message quality (LLM-as-judge), stack trace usefulness |
35
+ | Data pipeline | End-to-end correctness | Throughput (rows/sec), memory peak per batch, late-data handling, schema-drift resilience, idempotency under replay |
36
+ | CLI tool | Unit tests | Cold-start time, memory footprint, output stability across runs, exit-code correctness on edge inputs, help-text discoverability |
37
+ | RAG / retrieval | Recall@K | Embedding cost per indexed doc, query latency p99, answer grounding rate (% of claims traceable to source), robustness to paraphrased queries |
38
+
39
+ ## Presenting to the user
40
+
41
+ For each ranked dimension include:
42
+
43
+ - **What it measures** (one sentence)
44
+ - **Why it matters for this project** (tied to what the repo actually does, not generic)
45
+ - **Construction complexity**: *None* (existing eval already produces this score) / *Minor* (wrap or instrument what exists) / *Substantial* (new test cases, scoring logic, or data)
46
+ - **Existing coverage** if any
47
+
48
+ Recommend the highest-ranked dimension whose construction is *None* or *Minor*. If every top pick is *Substantial*, say so and let the user decide whether the signal is worth the work.
49
+
50
+ ## Non-picked dimensions
51
+
52
+ Save unused dimensions to `.evo/project.md` under a "Future experiment candidates" section — useful when the first dimension plateaus.
@@ -0,0 +1,28 @@
1
+ // Node SDK usage example. Install: `npm install @evo-hq/evo-agent`.
2
+ //
3
+ // The SDK auto-reads $EVO_TRACES_DIR, $EVO_EXPERIMENT_ID, and
4
+ // $EVO_RESULT_PATH. Traces flush on each report() so the dashboard can
5
+ // stream progress live.
6
+
7
+ import { Run, Gate } from '@evo-hq/evo-agent';
8
+
9
// ---- Benchmark run ----

const run = new Run();
for (const task of tasks) {
  const { output, score } = await evaluate(task);
  run.log(task.id, { output });
  run.report(task.id, { score });
}
// finish(): writes score JSON to $EVO_RESULT_PATH (or stdout if unset)
// and one task_<id>.json per task under $EVO_TRACES_DIR.
await run.finish();

// ---- Gate (exits 0 all-pass / 1 any-fail) ----

const gate = new Gate();
for (const task of criticalTasks) {
  const { score } = await evaluate(task);
  gate.check(task.id, { score });
}
await gate.finish();
@@ -0,0 +1,43 @@
1
+ """Python SDK usage examples.
2
+
3
+ Install `evo-hq-agent` with this project's package manager/runtime, for example
4
+ `uv add --dev evo-hq-agent` or `python -m pip install evo-hq-agent`.
5
+
6
+ The SDK auto-reads $EVO_TRACES_DIR, $EVO_EXPERIMENT_ID, and $EVO_RESULT_PATH.
7
+ Traces flush on each report() so the dashboard can stream progress live.
8
+ """
9
+
10
+ from evo_agent import Run, Gate
11
+
12
+
13
# ---- Benchmark run ----

run = Run()
try:
    for task in tasks:
        task_id = task["id"]
        run.log(task_id, "starting task")
        try:
            outcome = evaluate(task, agent)
            run.log(task_id, {"output": outcome.output})
            run.report(
                task_id,
                score=outcome.score,
                summary=f"reward={outcome.score:.2f}",
                failure_reason=None if outcome.passed else "task_failed",
            )
        except Exception as err:
            # Expected per-task failure: record it and score the task as 0.
            run.log(task_id, {"error": repr(err)})
            run.report(task_id, score=0.0, failure_reason="exception")
finally:
    run.finish()
# finish() writes score JSON to $EVO_RESULT_PATH (or stdout if unset) and one
# task_<id>.json per task under $EVO_TRACES_DIR. Catch expected per-task errors;
# an uncaught exception before finish() means evo correctly sees a crashed run.


# ---- Gate (exits 0 all-pass / 1 any-fail) ----

with Gate() as gate:
    for task in critical_tasks:
        outcome = evaluate(task, agent)
        gate.check(task["id"], score=outcome.score)
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env python3
2
+ """Validate a benchmark result file.
3
+
4
+ Usage: python3 validate_result.py <path-to-result.json>
5
+
6
+ Exits 0 if the file exists, is non-empty, and is a JSON object with a
7
+ numeric 'score'. Exits 1 with a diagnostic on stderr otherwise.
8
+ """
9
+
10
+ import json
11
+ import sys
12
+ from pathlib import Path
13
+
14
+
15
+ def main() -> int:
16
+ if len(sys.argv) != 2:
17
+ print(f"usage: {sys.argv[0]} <result.json>", file=sys.stderr)
18
+ return 1
19
+
20
+ path = Path(sys.argv[1])
21
+
22
+ if not path.exists():
23
+ print(f"FAIL: {path} does not exist", file=sys.stderr)
24
+ return 1
25
+
26
+ if path.stat().st_size == 0:
27
+ print(f"FAIL: {path} is empty", file=sys.stderr)
28
+ return 1
29
+
30
+ try:
31
+ obj = json.loads(path.read_text(encoding="utf-8"))
32
+ except json.JSONDecodeError as exc:
33
+ print(f"FAIL: {path} is not valid JSON: {exc}", file=sys.stderr)
34
+ return 1
35
+
36
+ if not isinstance(obj, dict):
37
+ print(f"FAIL: expected JSON object, got {type(obj).__name__}", file=sys.stderr)
38
+ return 1
39
+
40
+ if "score" not in obj:
41
+ print(f"FAIL: missing 'score' field. Keys: {list(obj.keys())}", file=sys.stderr)
42
+ return 1
43
+
44
+ try:
45
+ score = float(obj["score"])
46
+ except (TypeError, ValueError):
47
+ print(f"FAIL: 'score' is not numeric: {obj['score']!r}", file=sys.stderr)
48
+ return 1
49
+
50
+ print(f"OK: {path}, score = {score}", file=sys.stderr)
51
+ return 0
52
+
53
+
54
+ if __name__ == "__main__":
55
+ sys.exit(main())
@@ -0,0 +1,87 @@
1
+ ---
2
+ name: infra-setup
3
+ description: Non-user-invocable provider/setup reference for evo backend switching, prerequisite checks, and auth/install guidance.
4
+ disable-model-invocation: true
5
+ ---
6
+
7
+ # Infra Setup
8
+
9
+ Use this when the user wants to change where experiments run: local worktrees, pool slots, or a remote provider such as Modal, E2B, Daytona, AWS, Azure, SSH, manual, or a custom dotted-path provider.
10
+
11
+ ## Goals
12
+
13
+ - Be explicit about the target backend/provider.
14
+ - Check prerequisites before mutating evo config.
15
+ - Never install provider SDKs silently.
16
+ - Give one actionable auth command per provider.
17
+ - Keep provider credentials separate from benchmark runtime env.
18
+
19
+ ## Flow
20
+
21
+ 1. Identify the target:
22
+ - `worktree` or `pool` means local backends.
23
+ - `modal`, `e2b`, `ssh:...`, or another remote spec means `backend=remote`.
24
+ 2. If the target is remote, parse the provider choice the same way evo CLI does:
25
+ - `modal`
26
+ - `e2b`
27
+ - `daytona`
28
+ - `aws`
29
+ - `azure`
30
+ - `manual`
31
+ - `ssh:user@host[:port]`
32
+ - another built-in provider name
33
+ - dotted import path for a custom provider
34
+ 3. Check whether `evo` is on PATH and whether it is the expected `evo-hq-cli` package (`evo --version`). If the provider SDK is missing, evo's provider loader prints the provider-specific extra or SDK package to install; use that message rather than guessing.
35
+ 4. For SDK-backed providers, verify the SDK import only when you can run the check in the same environment that owns the `evo` executable. If missing, ask the user before installing it.
36
+ - If `evo` was installed with `uv tool` or `pip`/`venv`, prefer the matching extra on `evo-hq-cli`:
37
+ - `uv-tool`: `uv tool install --reinstall 'evo-hq-cli[<provider-extra>]'`
38
+ - `venv` / `pip`: `python -m pip install 'evo-hq-cli[<provider-extra>]'`
39
+ - If `evo` was installed with `pipx`, inject the provider SDK into the same `evo-hq-cli` environment:
40
+ - `pipx`: `pipx inject evo-hq-cli <provider-sdk>`
41
+ 5. Check auth and show exactly one provider-specific auth command or setup step. Use `references/provider-matrix.md`.
42
+ 6. Once prerequisites are satisfied, run the explicit config command:
43
+
44
+ ```bash
45
+ evo config backend remote --provider <provider> --provider-config ...
46
+ ```
47
+
48
+ Or for local backends:
49
+
50
+ ```bash
51
+ evo config backend worktree
52
+ evo config backend pool --workspaces /abs/slot-a,/abs/slot-b
53
+ ```
54
+
55
+ 7. Be explicit that incomplete provider setup usually surfaces on
56
+ `evo new --remote <provider> ...`, because that is where remote
57
+ allocation and bootstrap actually happen.
58
+ 8. If the benchmark itself needs application keys, configure runtime env
59
+ separately with `evo env load <path> --all` or
60
+ `evo env load <path> --allow KEY1,KEY2`. Provider auth provisions the
61
+ sandbox; runtime env is what benchmark/gate processes see.
62
+
63
+ ## Pre-assumptions
64
+
65
+ Before trying to switch a workspace to a remote provider, confirm the basics:
66
+
67
+ - the target backend is clear from the user's request; only ask if the
68
+ intent is genuinely ambiguous between `worktree`, `pool`, and `remote`
69
+ - the machine running evo has the right provider SDK or transport installed
70
+ - the user has auth for that provider available now, not "somewhere else"
71
+ - the provider-specific minimum config exists
72
+ - `modal`: auth + optional config
73
+ - `e2b`: API key + optional config
74
+ - `daytona`: API key and API URL/target if needed
75
+ - `aws`: creds, region, image, SSH key pair/private key, and usually network config
76
+ - `azure`: subscription, resource group, region, SSH key/private key, and VM/image choices
77
+ - `ssh`: reachable host, working SSH user, and key/port if needed
78
+ - `manual`: reachable remote endpoint URL and bearer token
79
+ - for SSH-backed VM providers, the guest assumptions are plausible before allocation:
80
+ - the image enables SSH
81
+ - the SSH user matches the image
82
+ - the image architecture matches the selected instance type
83
+ - the host can run evo's remote workspace runtime
84
+
85
+ ## Provider notes
86
+
87
+ See `references/provider-matrix.md` for the compact provider summary, common config, and provider-specific setup/auth command.
@@ -0,0 +1,25 @@
1
+ ## Provider Matrix
2
+
3
+ Use this as the compact summary. This is setup guidance, not a runtime dependency list for evo itself.
4
+
5
+ | Provider | What evo uses at runtime | Setup / auth | Common config |
6
+ |---|---|---|---|
7
+ | `modal` | Modal Python SDK | If missing, install `evo-hq-cli[modal]` (or inject `modal` with `pipx`); then run `modal token new` | `app_name`, `gpu`, `region`, `timeout_seconds`, `health_timeout_seconds`, `apt_install`, `pip_install` |
8
+ | `e2b` | E2B Python SDK | If missing, install `evo-hq-cli[e2b]` (or inject `e2b` with `pipx`); then `export E2B_API_KEY=...` | `template`, `api_key`, `domain`, `root`, `timeout_seconds`, `health_timeout_seconds`, `allow_internet_access`, `secure` |
9
+ | `daytona` | Daytona Python SDK | If missing, install `evo-hq-cli[daytona]` (or inject `daytona` with `pipx`); then `export DAYTONA_API_KEY=...` | `api_key`, `api_url`, `target`, `timeout_seconds`, `health_timeout_seconds`, `ssh_host`, `ssh_port`, `ssh_token_ttl_minutes`, `sandbox_timeout_seconds` |
10
+ | `aws` | `boto3` | If missing, install `evo-hq-cli[aws]` (or inject `boto3` with `pipx`); then export AWS creds and region | `region`, `image_id`, `key_name`, `key`, `instance_type`, `subnet_id`, `security_group_ids`, `ssh_user`, `ssh_port`, `timeout_seconds`, `health_timeout_seconds`, `keep_warm` |
11
+ | `azure` | Azure Python SDK (`azure-identity`, `azure-mgmt-resource`, `azure-mgmt-network`, `azure-mgmt-compute`) | If missing, install `evo-hq-cli[azure]`; then use `az login` or Azure env creds, and provide subscription/resource-group config | `subscription_id`, `resource_group`, `location`, `vm_size`, `image`, `key`, `ssh_public_key`, `ssh_user`, `ssh_cidr`, `vnet_cidr`, `subnet_cidr`, `ssh_port`, `timeout_seconds`, `health_timeout_seconds`, `keep_warm` |
12
+ | `ssh` | local `ssh` transport | `ssh user@host` must work first; then add `-i` / `-p` if needed | `host`, `key`, `port`, `tunnel_port`, `keep_warm`, `health_timeout_seconds` |
13
+ | `manual` | existing remote workspace endpoint | no provisioning; only ask for URL/token if the user explicitly wants manual mode | `base_url`, `bearer_token`, `workspace_root`, `bundle_dir` |
14
+
15
+ Notes:
16
+ - The `evo` runtime uses the provider SDK or transport listed in the second column.
17
+ - The `evo-hq-cli[<provider>]` extras are the preferred install path when the provider SDK is missing.
18
+ - Provider auth/setup is operator guidance. It is not the same thing as evo's runtime dependency surface.
19
+ - Common failures are usually one of: missing SDK import, missing auth state/env var, unreachable host/port, or provider-specific bootstrap mismatch.
20
+ - Incomplete provider setup usually surfaces on `evo new --remote <provider> ...`, because that is where remote allocation and bootstrap actually happen.
21
+ - For SSH-backed VM providers, also validate the guest assumptions:
22
+ - the instance image has SSH enabled
23
+ - the SSH user matches the image
24
+ - the image architecture matches the selected instance type
25
+ - the remote host can run evo's remote workspace runtime