npm - jeo-code - Versions diffs - 0.5.6 → 0.5.8 - Mend

jeo-code 0.5.6 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.ja.md +2 -2
package/README.ko.md +2 -2
package/README.md +2 -2
package/README.zh.md +2 -2
package/package.json +1 -1
package/src/agent/opik-tracer.ts +364 -0
package/src/autopilot.ts +35 -17
package/src/commands/launch.ts +72 -86

package/README.ja.md CHANGED Viewed

@@ -150,11 +150,11 @@ CI は `.github/workflows/npm-publish.yml` で公開します — GitHub リリ
 ## 変更履歴 (Changelog)
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
+- **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
 - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
 - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
 - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
-- **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
-- **[0.5.2]** (2026-06-14) — `$skill` prompt invocation with prefix/fuzzy suggestions, and a per-session input-box hue (amber in cmd-mode).
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/README.ko.md CHANGED Viewed

@@ -150,11 +150,11 @@ CI는 `.github/workflows/npm-publish.yml`로 배포합니다 — GitHub 릴리
 ## 변경 이력 (Changelog)
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
+- **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
 - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
 - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
 - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
-- **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
-- **[0.5.2]** (2026-06-14) — `$skill` prompt invocation with prefix/fuzzy suggestions, and a per-session input-box hue (amber in cmd-mode).
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/README.md CHANGED Viewed

@@ -150,11 +150,11 @@ Required npm token permissions (repository secret `NPM_TOKEN`):
 ## Changelog
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
+- **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
 - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
 - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
 - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
-- **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
-- **[0.5.2]** (2026-06-14) — `$skill` prompt invocation with prefix/fuzzy suggestions, and a per-session input-box hue (amber in cmd-mode).
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/README.zh.md CHANGED Viewed

@@ -150,11 +150,11 @@ CI 通过 `.github/workflows/npm-publish.yml` 发布 — GitHub 发布 release
 ## 更新日志 (Changelog)
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
+- **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
 - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
 - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
 - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
-- **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
-- **[0.5.2]** (2026-06-14) — `$skill` prompt invocation with prefix/fuzzy suggestions, and a per-session input-box hue (amber in cmd-mode).
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "jeo-code",
-  "version": "0.5.6",
+  "version": "0.5.8",
   "description": "Clean, highly optimized AI coding agent using spec-first loop",
   "type": "module",
   "main": "src/cli.ts",

package/src/agent/opik-tracer.ts ADDED Viewed

@@ -0,0 +1,364 @@
+/**
+ * Opik observability for the jeo agent turn loop (spec-stack · Run phase).
+ *
+ * Each agent turn becomes ONE Opik trace; each step/tool becomes a span; token
+ * usage and the eval feedback scores (`completed` / `verified` / `efficiency`)
+ * are attached to the trace. Pure TypeScript over `fetch` — no Python, no
+ * `opik` npm package — consistent with jeo's zero-native-dependency constraint.
+ *
+ * Hard invariants (see .specify/specs/opik-observability/seed.md):
+ *  - I1: `JEO_OPIK` unset => the tracer is a no-op; zero Opik HTTP calls.
+ *  - I2: no tracer error ever propagates out of an events callback.
+ *  - I3: no secret is logged; the key only travels in the `Authorization` header.
+ *  - I4: engine output is identical regardless of tracing outcome.
+ *
+ * Opik REST surface (private v1), confirmed against the installed SDK:
+ *  - POST  {base}/v1/private/traces/batch         { traces: [...] }
+ *  - POST  {base}/v1/private/spans/batch          { spans:  [...] }
+ *  - PUT   {base}/v1/private/traces/feedback-scores { scores: [...] }
+ * Headers: `Authorization: <api_key>`, `Comet-Workspace: <workspace>`.
+ */
+import { jeoEnv } from "../util/env";
+import type { AgentLoopEvents, ToolInvocation } from "./engine";
+type Env = Record<string, string | undefined>;
+type FetchImpl = typeof fetch;
+const DEFAULT_BASE = "https://www.comet.com/opik/api";
+const DEFAULT_PROJECT = "jeo";
+const DEFAULT_WORKSPACE = "jeo";
+/** Verification signal (mirrors engine.ts VERIFY_SIGNAL_RE) — used for the eval score. */
+const VERIFY_SIGNAL_RE = /\b(test|tests|tsc|typecheck|lint|build|check|spec|pytest|vitest|jest)\b/i;
/**
 * Master switch for Opik tracing (documented as `JEO_OPIK`).
 *
 * The `OPIK` setting is looked up through `jeoEnv`, trimmed and lower-cased;
 * only `1`, `true`, `yes` and `on` turn tracing on. An unset or any other
 * value leaves it off.
 *
 * @param env Environment map to read from; defaults to `process.env`.
 * @returns `true` when tracing is explicitly enabled.
 */
export function opikEnabled(env: Env = process.env): boolean {
  const flag = jeoEnv("OPIK", env);
  const normalized = (flag ?? "").trim().toLowerCase();
  return ["1", "true", "yes", "on"].includes(normalized);
}
/** Connection settings for the Opik REST API. */
export interface OpikConfig {
  /** API key sent verbatim in the `Authorization` header; `undefined` when not configured. */
  apiKey?: string;
  /** Value of the `Comet-Workspace` header. */
  workspace: string;
  /** API root, without a trailing slash, that endpoint paths are appended to. */
  baseUrl: string;
  /** Project the traces, spans and scores are filed under. */
  projectName: string;
}
/**
 * Resolve Opik connection config from the environment (no I/O).
 *
 * Reads `OPIK_API_KEY`, `COMET_WORKSPACE`, `OPIK_URL_OVERRIDE` and
 * `OPIK_PROJECT_NAME`. Every value is trimmed, and a missing, empty or
 * whitespace-only value falls back to its default — including the base URL,
 * which would otherwise resolve to `""` and turn every request into an
 * invalid relative URL.
 *
 * @param env Environment map to read from; defaults to `process.env`.
 * @returns The resolved configuration; `apiKey` is `undefined` when no key is set.
 */
export function resolveOpikConfig(env: Env = process.env): OpikConfig {
  const pick = (value: string | undefined, fallback: string): string => value?.trim() || fallback;
  // Normalize trailing slashes so path joins are predictable; an override made
  // only of slashes collapses to "" and is treated as unset.
  const baseUrl = pick(env.OPIK_URL_OVERRIDE, DEFAULT_BASE).replace(/\/+$/, "") || DEFAULT_BASE;
  return {
    apiKey: env.OPIK_API_KEY?.trim() || undefined,
    workspace: pick(env.COMET_WORKSPACE, DEFAULT_WORKSPACE),
    baseUrl,
    projectName: pick(env.OPIK_PROJECT_NAME, DEFAULT_PROJECT),
  };
}
/**
 * RFC-9562 UUIDv7 (time-ordered) — Opik orders traces/spans by id.
 *
 * Layout: bytes 0-5 hold the low 48 bits of the millisecond timestamp in
 * big-endian order, the high nibble of byte 6 is the version (7), the top two
 * bits of byte 8 are the variant (`10`), and the remaining bits come from `rnd`.
 *
 * @param now Timestamp in epoch milliseconds. Negative values clamp to 0; a
 *   non-finite value (NaN/±Infinity) falls back to the current time instead of
 *   leaking "NaN"/"Infinity" into the id.
 * @param rnd Source of random numbers in `[0, 1)`; called 16 times.
 * @returns The id in canonical lower-case 8-4-4-4-12 form.
 */
export function uuidv7(now: number = Date.now(), rnd: () => number = Math.random): string {
  const ts = Number.isFinite(now) ? Math.max(0, Math.trunc(now)) : Date.now();
  const bytes: number[] = [];
  for (let i = 0; i < 16; i++) bytes.push(Math.floor(rnd() * 256) & 0xff);
  // 48-bit big-endian timestamp, filled from the least significant byte up.
  let rest = ts;
  for (let i = 5; i >= 0; i--) {
    bytes[i] = rest % 256;
    rest = Math.floor(rest / 256);
  }
  bytes[6] = 0x70 | (bytes[6]! & 0x0f); // version 7
  bytes[8] = 0x80 | (bytes[8]! & 0x3f); // variant
  const hex = bytes.map(x => x.toString(16).padStart(2, "0")).join("");
  return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`;
}
/**
 * Formats epoch milliseconds as an ISO-8601 UTC timestamp with millisecond
 * precision (Opik expects RFC-3339 timestamps).
 */
function iso(ms: number): string {
  const instant = new Date(ms);
  return instant.toISOString();
}
/** Descriptive data recorded on the trace for one turn. */
export interface TurnMeta {
  /** Human-readable turn name (the user intent / first message). */
  name: string;
  /** The user input recorded on the trace; omitted from the payload when absent. */
  input?: string;
  /** Extra metadata (model, cwd, …) copied into the trace metadata. */
  metadata?: Record<string, unknown>;
  /** Trace tags; the payload builder substitutes `["jeo"]` when absent. */
  tags?: string[];
}
/** One observed tool result, later emitted as a span. */
export interface StepRecord {
  /** Step number the result belongs to. */
  step: number;
  /** Name of the tool that ran. */
  tool: string;
  /** Whether the tool reported success. */
  success: boolean;
  /** Tool output text (truncated when the span payload is built). */
  output: string;
  /** Span start, in epoch milliseconds. */
  startTime: number;
  /** Span end, in epoch milliseconds. */
  endTime: number;
}
/** The three feedback scores attached to a trace. */
export interface TurnScores {
  completed: number;
  verified: number;
  efficiency: number;
}
/**
 * Eval scoring (the *evaluation* surface). Every score lies in [0,1].
 *  - completed: 1 when the turn ended in `done`, otherwise 0.
 *  - verified: 1 when a verification signal (test/tsc/build/…) ran in-turn.
 *  - efficiency: `1/sqrt(steps)` rounded to four decimals, capped at 1. The
 *    step count is truncated to an integer and treated as 1 when it is below
 *    1 or not a number, so a 1-step turn scores 1 and longer turns score less.
 */
export function computeScores(args: {
  done: boolean;
  steps: number;
  verificationRan: boolean;
}): TurnScores {
  const wholeSteps = Math.trunc(args.steps);
  const stepCount = Math.max(1, wholeSteps || 1);
  const rawEfficiency = Math.min(1, 1 / Math.sqrt(stepCount));
  const completed = args.done ? 1 : 0;
  const verified = args.verificationRan ? 1 : 0;
  return { completed, verified, efficiency: Number(rawEfficiency.toFixed(4)) };
}
/**
 * Heuristic for "a verification command ran": only `bash` results count, and
 * only when their output matches `VERIFY_SIGNAL_RE`.
 */
export function isVerificationStep(tool: string, output: string): boolean {
  return tool === "bash" && VERIFY_SIGNAL_RE.test(output);
}
// ---- Pure payload builders (unit-tested without network) --------------------
/**
 * Builds the JSON body of one trace entry.
 *
 * The caller's metadata is copied, never mutated. When `usage` is given it is
 * added under `metadata.usage` as prompt/completion/total token counts.
 * `input` and `output` keys appear only when the corresponding value is not
 * null/undefined, and `tags` defaults to `["jeo"]`.
 */
export function buildTracePayload(args: {
  id: string;
  project: string;
  meta: TurnMeta;
  startTime: number;
  endTime: number;
  output?: string;
  usage?: { inputTokens: number; outputTokens: number };
}): Record<string, unknown> {
  const { meta, usage } = args;
  const metadata: Record<string, unknown> = { ...(meta.metadata ?? {}) };
  if (usage) {
    const { inputTokens, outputTokens } = usage;
    metadata.usage = {
      prompt_tokens: inputTokens,
      completion_tokens: outputTokens,
      total_tokens: inputTokens + outputTokens,
    };
  }
  // Keys are inserted in the order they should serialize.
  const payload: Record<string, unknown> = {
    id: args.id,
    project_name: args.project,
    name: meta.name,
    start_time: iso(args.startTime),
    end_time: iso(args.endTime),
  };
  if (meta.input != null) payload.input = { message: meta.input };
  if (args.output != null) payload.output = { result: args.output };
  payload.metadata = metadata;
  payload.tags = meta.tags ?? ["jeo"];
  return payload;
}
/**
 * Builds the JSON body of one span entry from a recorded tool result.
 *
 * The span is named `step <n>: <tool>`, typed `general`, and carries at most
 * the first 4000 characters of the tool output.
 */
export function buildSpanPayload(args: {
  id: string;
  traceId: string;
  project: string;
  rec: StepRecord;
}): Record<string, unknown> {
  const { step, tool, success, output, startTime, endTime } = args.rec;
  const clippedOutput = output.slice(0, 4000);
  return {
    id: args.id,
    trace_id: args.traceId,
    project_name: args.project,
    name: `step ${step}: ${tool}`,
    type: "general",
    start_time: iso(startTime),
    end_time: iso(endTime),
    input: { tool },
    output: { success, output: clippedOutput },
    metadata: { step, success },
  };
}
/**
 * Builds the feedback-scores body: one entry per score (`completed`,
 * `verified`, `efficiency`), each addressed to the trace id and project and
 * marked with source `sdk` plus a short human-readable reason.
 */
export function buildScorePayload(args: {
  traceId: string;
  project: string;
  scores: TurnScores;
}): Record<string, unknown> {
  const { traceId, project, scores } = args;
  const rows: Array<[name: string, value: number, reason: string]> = [
    ["completed", scores.completed, "1 when the turn ended in `done`"],
    ["verified", scores.verified, "1 when a verification signal ran in-turn"],
    ["efficiency", scores.efficiency, "1/sqrt(steps); fewer steps score higher"],
  ];
  return {
    scores: rows.map(([name, value, reason]) => ({
      id: traceId,
      project_name: project,
      name,
      value,
      source: "sdk" as const,
      reason,
    })),
  };
}
// ---- Tracer -----------------------------------------------------------------
/** Per-turn tracing hooks used by the launch loop. */
export interface OpikTracer {
  /** `false` for the no-op tracer, `true` for a live one. */
  readonly enabled: boolean;
  /** Marks the start of the turn. */
  startTurn(): void;
  /** Records one tool result. */
  step(rec: StepRecord): void;
  /** Adds token usage to the turn. */
  usage(u: { inputTokens: number; outputTokens: number }): void;
  /** Finishes the turn and resolves once reporting is done. */
  endTurn(result: { done: boolean; steps: number; output?: string }): Promise<void>;
}
/** Shared tracer that ignores every call and performs no network I/O. */
const NOOP_TRACER: OpikTracer = {
  enabled: false,
  startTurn: () => {},
  step: () => {},
  usage: () => {},
  endTurn: () => Promise.resolve(),
};
/**
 * Network-backed tracer for a single turn.
 *
 * Tool results and token usage are buffered in memory; nothing is sent until
 * `endTurn`, which posts the trace, then its spans (if any), then the feedback
 * scores. Reporting is best-effort: request failures, timeouts and
 * payload-building errors are all swallowed so tracing can never change the
 * outcome of a turn (invariants I2/I4).
 */
class LiveOpikTracer implements OpikTracer {
  /** Upper bound for one Opik request, so a stalled endpoint cannot hold up the awaited `endTurn`. */
  private static readonly SEND_TIMEOUT_MS = 5000;
  readonly enabled = true;
  private readonly traceId = uuidv7();
  /** Recorded results, each paired with its own span id. */
  private readonly spans: Array<{ id: string; rec: StepRecord }> = [];
  private startedAt = Date.now();
  private usageAcc = { inputTokens: 0, outputTokens: 0 };
  private sawUsage = false;
  private verificationRan = false;
  private ended = false;
  constructor(
    private readonly meta: TurnMeta,
    private readonly cfg: OpikConfig,
    private readonly fetchImpl: FetchImpl,
  ) {}
  /** Request headers; the API key travels only in `Authorization` (I3). */
  private headers(): Record<string, string> {
    const h: Record<string, string> = {
      "Content-Type": "application/json",
      "Comet-Workspace": this.cfg.workspace,
    };
    if (this.cfg.apiKey) h["Authorization"] = this.cfg.apiKey;
    return h;
  }
  /** Best-effort POST/PUT with a timeout; any failure is swallowed (I2/I4). */
  private async send(path: string, body: unknown, method: "POST" | "PUT" = "POST"): Promise<void> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), LiveOpikTracer.SEND_TIMEOUT_MS);
    try {
      await this.fetchImpl(`${this.cfg.baseUrl}/${path}`, {
        method,
        headers: this.headers(),
        body: JSON.stringify(body),
        signal: controller.signal,
      });
    } catch {
      /* never break the turn */
    } finally {
      clearTimeout(timer);
    }
  }
  /** Resets the trace start time to now. */
  startTurn(): void {
    this.startedAt = Date.now();
  }
  /** Buffers one tool result and notes whether it looks like a verification run. */
  step(rec: StepRecord): void {
    // The span id belongs to the record, not the step number: several results
    // can share a step number (multiple tools in one step, or a second agent
    // loop reusing this tracer), and each still needs a distinct id.
    this.spans.push({ id: uuidv7(rec.startTime), rec });
    if (isVerificationStep(rec.tool, rec.output)) this.verificationRan = true;
  }
  /** Accumulates token counts; missing or zero counts add nothing. */
  usage(u: { inputTokens: number; outputTokens: number }): void {
    this.usageAcc.inputTokens += u.inputTokens || 0;
    this.usageAcc.outputTokens += u.outputTokens || 0;
    this.sawUsage = true;
  }
  /**
   * Sends the buffered trace, spans and scores. Only the first call does any
   * work; later calls return immediately. The returned promise never rejects.
   */
  async endTurn(result: { done: boolean; steps: number; output?: string }): Promise<void> {
    if (this.ended) return;
    this.ended = true;
    try {
      const endedAt = Date.now();
      const project = this.cfg.projectName;
      const trace = buildTracePayload({
        id: this.traceId,
        project,
        meta: this.meta,
        startTime: this.startedAt,
        endTime: endedAt,
        output: result.output,
        usage: this.sawUsage ? this.usageAcc : undefined,
      });
      const spans = this.spans.map(({ id, rec }) =>
        buildSpanPayload({ id, traceId: this.traceId, project, rec }),
      );
      const scores = computeScores({
        done: result.done,
        steps: result.steps,
        verificationRan: this.verificationRan,
      });
      const scorePayload = buildScorePayload({ traceId: this.traceId, project, scores });
      await this.send("v1/private/traces/batch", { traces: [trace] });
      if (spans.length > 0) await this.send("v1/private/spans/batch", { spans });
      await this.send("v1/private/traces/feedback-scores", scorePayload, "PUT");
    } catch {
      /* payload building failed (e.g. an invalid timestamp) — never break the turn */
    }
  }
}
/**
 * Creates the tracer for one turn.
 *
 * A live tracer is returned only when tracing is enabled AND an API key is
 * configured; in every other case the shared no-op tracer (zero network) is
 * returned.
 *
 * @param meta Turn description recorded on the trace.
 * @param env Environment map to read from; defaults to `process.env`.
 * @param fetchImpl HTTP implementation; defaults to the global `fetch`.
 */
export function createOpikTracer(
  meta: TurnMeta,
  env: Env = process.env,
  fetchImpl: FetchImpl = fetch,
): OpikTracer {
  if (opikEnabled(env)) {
    const cfg = resolveOpikConfig(env);
    // no creds => stay silent, never guess
    if (cfg.apiKey) return new LiveOpikTracer(meta, cfg, fetchImpl);
  }
  return NOOP_TRACER;
}
/**
 * Layers tracer hooks over an existing `AgentLoopEvents`.
 *
 * With a disabled tracer the given events (or `{}`) are returned untouched.
 * Otherwise a copy is returned in which `onStep`, `onAssistant`,
 * `onToolResult` and `onUsage` forward to the original callbacks while the
 * tracer observes tool results and usage; all other callbacks pass through as
 * they are. A tool result seen before any `onStep` is attributed to step 1.
 * Errors thrown by the tracer are swallowed (I2); errors thrown by the
 * original callbacks still propagate.
 */
export function wrapEvents(events: AgentLoopEvents | undefined, tracer: OpikTracer): AgentLoopEvents {
  const base: AgentLoopEvents = events ?? {};
  if (!tracer.enabled) return base;
  let activeStep = 0;
  let activeStepStart = Date.now();
  return {
    ...base,
    onStep(step: number) {
      // The tracer writes nothing per step; only the step clock is reset.
      activeStep = step;
      activeStepStart = Date.now();
      base.onStep?.(step);
    },
    onAssistant(raw: string, invocation: ToolInvocation | null) {
      base.onAssistant?.(raw, invocation);
    },
    onToolResult(tool: string, success: boolean, output: string) {
      try {
        const endTime = Date.now();
        tracer.step({ step: activeStep || 1, tool, success, output, startTime: activeStepStart, endTime });
      } catch {
        /* I2 */
      }
      base.onToolResult?.(tool, success, output);
    },
    onUsage(usage: { inputTokens: number; outputTokens: number }) {
      try {
        tracer.usage(usage);
      } catch {
        /* I2 */
      }
      base.onUsage?.(usage);
    },
  };
}

package/src/autopilot.ts CHANGED Viewed

@@ -142,6 +142,33 @@ function isImprovement(goal: Goal, score: number, best: number | undefined): boo
   return true; // gate handled via passed, not score
 }
/**
 * The one place where the ratchet decides to keep or revert a step, so that
 * step, loop and status cannot drift apart.
 *  - gate goal: the step is kept exactly when the eval passed; the score is ignored.
 *  - any other goal: a NaN score cannot prove improvement and is reverted;
 *    otherwise the step is kept exactly when `isImprovement` says it beats
 *    the best score so far.
 */
export function decideStep(
  goal: Goal,
  score: number,
  passed: boolean,
  best: number | undefined,
): "keep" | "revert" {
  let keep: boolean;
  if (goal === "gate") {
    keep = passed;
  } else if (Number.isNaN(score)) {
    keep = false;
  } else {
    keep = isImprovement(goal, score, best);
  }
  return keep ? "keep" : "revert";
}
/**
 * Convergence test shared by every goal, gate included: the run has converged
 * once the streak of consecutive no-progress steps (reverts) is at least
 * `patience`. A gate loop that keeps failing makes no forward progress and
 * should stop early rather than spend the remaining budget.
 */
export function isConverged(sinceImprove: number, patience: number): boolean {
  const patienceExhausted = patience <= sinceImprove;
  return patienceExhausted;
}
 function hasBaseline(): boolean {
   return readLog().some((e) => e.type === "baseline");
 }
@@ -189,14 +216,7 @@ function cmdStep(flags: Record<string, string>): void {
   const best = currentBest(s);
   const { score, passed, output } = runEval(s);
-  let decision: "keep" | "revert";
-  if (s.goal === "gate") {
-    decision = passed ? "keep" : "revert";
-  } else if (Number.isNaN(score)) {
-    decision = "revert"; // no measurable score => cannot prove improvement
-  } else {
-    decision = isImprovement(s.goal, score, best) ? "keep" : "revert";
-  }
+  const decision = decideStep(s.goal, score, passed, best);
   if (decision === "revert" && flags["on-revert"]) {
     try {
@@ -242,10 +262,7 @@ function cmdLoop(flags: Record<string, string>): void {
     const best = currentBest(s);
     const { score, passed, output } = runEval(s);
-    let decision: "keep" | "revert";
-    if (s.goal === "gate") decision = passed ? "keep" : "revert";
-    else if (Number.isNaN(score)) decision = "revert";
-    else decision = isImprovement(s.goal, score, best) ? "keep" : "revert";
+    const decision = decideStep(s.goal, score, passed, best);
     if (decision === "revert" && flags["on-revert"]) {
       try {
@@ -255,11 +272,12 @@ function cmdLoop(flags: Record<string, string>): void {
       }
     }
     appendLog({ type: "step", iteration: i, change: `loop#${i}`, score, passed, decision, prevBest: best ?? null, output });
-    const improved = decision === "keep" && (s.goal === "gate" || !Number.isNaN(score));
-    sinceImprove = improved && (best === undefined || s.goal === "gate" || isImprovement(s.goal, score, best)) ? 0 : sinceImprove + 1;
+    // A keep is forward progress (min/max: provably an improvement; gate: a pass).
+    // Anything else extends the no-progress streak toward convergence.
+    sinceImprove = decision === "keep" ? 0 : sinceImprove + 1;
     console.log(`jeo autopilot: loop ${i}/${max} ${decision.toUpperCase()} score=${fmt(score)} (sinceImprove=${sinceImprove})`);
-    if (s.goal !== "gate" && sinceImprove >= s.patience) {
+    if (isConverged(sinceImprove, s.patience)) {
       appendLog({ type: "stop", reason: "converged", iteration: i, patience: s.patience });
       console.log(`jeo autopilot: stop — converged (no improvement in ${s.patience} steps)`);
       return;
@@ -279,13 +297,13 @@ function cmdStatus(flags: Record<string, string>): void {
   const best = currentBest(s);
   const stop = [...log].reverse().find((e) => e.type === "stop");
-  // convergence: steps since last keep-with-improvement
+  // convergence: steps since last keep (forward progress)
   let sinceImprove = 0;
   for (const e of steps) {
     if (e.decision === "keep") sinceImprove = 0;
     else sinceImprove++;
   }
-  const converged = s.goal !== "gate" && sinceImprove >= s.patience;
+  const converged = isConverged(sinceImprove, s.patience);
   let recommendation: string;
   if (stop) recommendation = `stopped: ${stop.reason as string}`;

package/src/commands/launch.ts CHANGED Viewed

@@ -2,6 +2,7 @@ import { createInterface } from "node:readline/promises";
 import { emitKeypressEvents } from "node:readline";
 import { PassThrough } from "node:stream";
 import { runAgentLoop, executorSystemPrompt, DEFAULT_TOOLS, TOOL_PROTOCOL, WORKING_DISCIPLINE, type AgentLoopEvents } from "../agent/engine";
+import { createOpikTracer, wrapEvents } from "../agent/opik-tracer";
 import { initialDynamicStepLimit } from "../agent/step-budget";
 import { memoryPromptSection, spawnDetachedDistill } from "../agent/memory";
 import { createTaskTool, taskToolProtocolLine, type TaskSubEvent } from "../agent/task-tool";
@@ -1472,6 +1473,16 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
           subagent: createSubagentTool(subagentRegistry),
         };
         const tools = filterToolMap(fullTools, Array.from(allowedTools));
+        // Opik observability (opt-in via JEO_OPIK): one trace per turn, spans per
+        // step/tool, token usage, and completed/verified/efficiency eval scores.
+        // No-op (zero network) when disabled or unconfigured; never breaks a turn.
+        const opik = createOpikTracer({
+          name: userInput.trim().slice(0, 80) || "jeo turn",
+          input: userInput,
+          metadata: { model: sessionModel, cwd },
+          tags: ["jeo", "launch"],
+        });
+        opik.startTurn();
         result = await runAgentLoop(history, {
           cwd,
           tools,
@@ -1480,7 +1491,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
           maxTokens: sessionThinking ? thinkingMaxTokens(sessionThinking) : undefined,
           signal: ac.signal,
           steer: drainSteer,
-          events: { ...withToolDetailCapture(tui ? tui.events() : streamEvents), onBeforeDone },
+          events: wrapEvents({ ...withToolDetailCapture(tui ? tui.events() : streamEvents), onBeforeDone }, opik),
         });
         if (result.done && looksLikeSkillEcho(result.doneReason ?? "", resolvedSkills)) {
           history.push({
@@ -1498,7 +1509,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
             maxTokens: sessionThinking ? thinkingMaxTokens(sessionThinking) : undefined,
             signal: ac.signal,
             steer: drainSteer,
-            events: withToolDetailCapture(tui ? tui.events() : streamEvents),
+            events: wrapEvents(withToolDetailCapture(tui ? tui.events() : streamEvents), opik),
           });
           const usage =
             result.usage && retry.usage
@@ -1509,6 +1520,8 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
               : retry.usage ?? result.usage;
           result = { ...retry, steps: result.steps + retry.steps, usage };
         }
+        // Close the Opik trace once per turn (done or budget-stop). Errors swallowed.
+        await opik.endTurn({ done: result.done, steps: result.steps, output: result.doneReason });
       } finally {
         harness.dispose();
         subagentRegistry.cancelAll(); // #9: no detached run leaks past the turn
@@ -1931,6 +1944,15 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
   const multilineInput = !!process.stdin.isTTY && jeoEnv("NO_MULTILINE") !== "1";
   const loneLfShiftEnter = jeoEnv("MULTILINE") === "1";
   const expandSentinel = (s: string): string => (multilineInput ? s.split(SENTINEL).join("\n") : s);
+  // Prompt-scoped process listeners (stdin data/keypress, stdout resize). Registered
+  // once per launch but previously anonymous and never removed — benign for a single
+  // CLI run, but repeated launch() (test harness) accumulated them past Node's
+  // 10-listener default → MaxListenersExceededWarning + a real leak. Track each remover
+  // and drain it on every exit path so the process listener set returns to baseline.
+  const promptListenerCleanups: Array<() => void> = [];
+  const drainPromptListeners = () => {
+    for (const off of promptListenerCleanups.splice(0)) { try { off(); } catch { /* best effort */ } }
+  };
   let keyFilter: PassThrough | undefined;
   if (multilineInput) {
     const kf = new PassThrough();
@@ -1951,7 +1973,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
     //     off) and the xterm "\x1b[27;2;13~" / kitty "\x1b[13;2u" sequences. Enter ("\r")
     //     passes through and submits.
     let kfInPaste = false;
-    process.stdin.on("data", (chunk: Buffer) => {
+    const kfDataHandler = (chunk: Buffer) => {
       const data = chunk.toString("utf8");
       let out = "";
       let i = 0;
@@ -1972,7 +1994,9 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
         out += data[i]; i += 1;
       }
       kf.write(out);
-    });
+    };
+    process.stdin.on("data", kfDataHandler);
+    promptListenerCleanups.push(() => process.stdin.off("data", kfDataHandler));
     keyFilter = kf;
     // readline now decodes keypresses on `keyFilter`; keep process.stdin emitting
     // 'keypress' too so the footer-redraw / paste-marker / picker listeners (registered
@@ -2032,13 +2056,15 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
   const pasteMerge: { buf: string[]; endWaiters: Array<() => void> } = { buf: [], endWaiters: [] };
   let pasteLineFired = false; // the line that resolved rl.question came from inside a paste
   if (process.stdin.isTTY) {
-    process.stdin.on("keypress", (_ch: string, key: { name?: string } | undefined) => {
+    const pasteKeypressHandler = (_ch: string, key: { name?: string } | undefined) => {
       if (key?.name === "paste-start") { promptPasteActive = true; pasteMerge.buf = []; }
       else if (key?.name === "paste-end") {
         promptPasteActive = false;
         for (const w of pasteMerge.endWaiters.splice(0)) w();
       }
-    });
+    };
+    process.stdin.on("keypress", pasteKeypressHandler);
+    promptListenerCleanups.push(() => process.stdin.off("keypress", pasteKeypressHandler));
     // Enable bracketed paste for the REPL lifetime (restored on exit below):
     // terminals only wrap pastes in the 200~/201~ markers once the app opts in.
     process.stdout.write("\x1b[?2004h");
@@ -2540,28 +2566,13 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
   const notReadyWarning = (st: { name: string; label: string }): string =>
     `  ! ${st.name} is not call-ready yet (${st.label}) — run /provider login antigravity before the first turn.`;
-  const CORE_MODEL_ACTION_ROLE_ORDER = ["executor", "architect", "planner", "critic"] as const;
   const MODEL_BADGE_ROLE_ORDER = ["planner", "architect", "executor", "critic"] as const;
   const roleBadgeColor = (roleId: string): ModelAssignmentBadge["color"] =>
     roleId === "executor" || roleId === "architect" || roleId === "planner" || roleId === "critic" ? roleId : "critic";
-  const orderedModelRoles = (config: Awaited<ReturnType<typeof readGlobalConfig>>) => {
-    const roles = allSubagentRoles(config);
-    const emitted = new Set<string>();
-    const out: ReturnType<typeof allSubagentRoles> = [];
-    for (const id of CORE_MODEL_ACTION_ROLE_ORDER) {
-      const role = roles.find(r => r.id === id);
-      if (role) {
-        emitted.add(role.id);
-        out.push(role);
-      }
-    }
-    for (const role of roles) {
-      if (!emitted.has(role.id)) out.push(role);
-    }
-    return out;
-  };
   const modelPickerAssignments = async (): Promise<ModelAssignmentBadge[]> => {
     const cfg = await readGlobalConfig();
@@ -2732,7 +2743,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
     choices.push({
       value: "heading:default",
       label: "Set as DEFAULT (Default)",
-      hint: `${config.defaultModel} (${currentDefaultThinking})`,
+      hint: `${config.defaultModel} (${currentDefaultThinking}) · roles → /agents`,
       disabled: true,
     });
     appendChildren([
@@ -2744,73 +2755,21 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
       })),
     ]);
-    for (const role of orderedModelRoles(config)) {
-      const roleThinking = resolveSubagentThinking(role.id, config) ?? "inherit";
-      choices.push({
-        value: `heading:${role.id}`,
-        label: `Set as ${role.title.toUpperCase()} (${role.title})`,
-        hint: `${resolveSubagentModel(role.id, config)} (${roleThinking})`,
-        disabled: true,
-      });
-      appendChildren([
-        { value: `${role.id}:keep`, label: "Set model only", hint: `keep thinking ${roleThinking} · set via /agents edit` },
-      ]);
-    }
-    choices.push({
-      value: "preset:openai-codex",
-      label: "Apply OpenAI Codex role preset",
-      hint: "Default medium · Executor low · Architect xhigh · Planner medium · Critic high",
-    });
     return choices;
   };
-  const applyOpenAiCodexRolePreset = async (target: string, cfgForPick: Awaited<ReturnType<typeof readGlobalConfig>>): Promise<void> => {
-    const roleThinking: Record<(typeof CORE_MODEL_ACTION_ROLE_ORDER)[number], ThinkLevel> = {
-      executor: "low",
-      architect: "xhigh",
-      planner: "medium",
-      critic: "high",
-    };
-    await saveConfigPatch(raw => {
-      let subagents = raw.subagents ?? {};
-      for (const roleId of CORE_MODEL_ACTION_ROLE_ORDER) {
-        subagents = withSubagentSetting({ subagents }, roleId, { model: target, thinking: roleThinking[roleId] });
-      }
-      return {
-        ...rememberModelPatch(raw, target),
-        thinkingLevel: "medium",
-        subagents,
-      };
-    });
-    sessionModel = target;
-    sessionThinking = "medium";
-    const { resolved, provider } = await describeModel(target);
-    const st = (await describeAllProviders(cfgForPick)).find(s => s.name === provider);
-    console.log(`OpenAI Codex role preset applied to ${formatModelLine({ label: target, resolved, provider, ready: st?.ready })} — Default medium, Executor low, Architect xhigh, Planner medium, Critic high`);
-  };
   const applyPickedModelWithTarget = async (target: string): Promise<boolean> => {
     if (!process.stdin.isTTY || !process.stdout.isTTY) return false;
     const cfgForPick = await readGlobalConfig();
+    // `/model` only assigns the DEFAULT model + (optionally) the default thinking.
+    // Per-role model and thinking are configured in /agents (and /agents edit).
     const choice = await pickFromOptions(`Model Name: ${displayModelName(target)}\n\nAction for: ${target}`, modelActionChoices(cfgForPick)) ?? "default:keep";
-    if (choice === "preset:openai-codex") {
-      await applyOpenAiCodexRolePreset(target, cfgForPick);
-      return true;
-    }
-    const [applyTo, action = "keep"] = choice.split(":", 2);
-    if (applyTo === "heading") return false;
-    const roleTarget = applyTo !== "default" ? getSubagentRole(applyTo, cfgForPick) : undefined;
+    const [, action = "keep"] = choice.split(":", 2);
     const { resolved, provider } = await describeModel(target);
     const st = (await describeAllProviders(cfgForPick)).find(s => s.name === provider);
-    if (roleTarget) {
-      const thinkPatch = action === "inherit" ? { thinking: undefined } : isThinkingLevel(action) ? { thinking: action } : {};
-      await saveConfigPatch(raw => ({ subagents: withSubagentSetting(raw, roleTarget.id, { model: target, ...thinkPatch }) }));
-      const thinkNote = action !== "keep" ? ` · thinking ${action}` : "";
-      console.log(`Subagent '${roleTarget.id}' model set to ${formatModelLine({ label: target, resolved, provider, ready: st?.ready })}${thinkNote} — saved (change anytime via /model or /agents)`);
-      return true;
-    }
     sessionModel = target;
     const defaultThinking = isThinkingLevel(action) ? action : undefined;
     if (defaultThinking) {
@@ -2820,7 +2779,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
       ...rememberModelPatch(raw, target),
       ...(defaultThinking ? { thinkingLevel: defaultThinking } : {}),
     }));
-    console.log(`Model set to ${formatModelLine({ label: target, resolved, provider, ready: st?.ready })}${defaultThinking ? ` · thinking ${defaultThinking}` : ""} — saved as default`);
+    console.log(`Model set to ${formatModelLine({ label: target, resolved, provider, ready: st?.ready })}${defaultThinking ? ` · thinking ${defaultThinking}` : ""} — saved as default. Role models/thinking: /agents`);
     return true;
   };
@@ -2925,7 +2884,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
   if (previewEnabled) {
     process.once("exit", () => out.write("\x1b[?25h")); // safety net: never leave the cursor hidden
-    process.stdin.on("keypress", (_ch: string, key: { name?: string; ctrl?: boolean; meta?: boolean } | undefined) => {
+    const footerKeypressHandler = (_ch: string, key: { name?: string; ctrl?: boolean; meta?: boolean } | undefined) => {
       if (key?.ctrl && key.name === "c") {
         forceExitFromCtrlC();
         return;
@@ -3040,18 +2999,22 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
           drawFooter(previewLines(typedLine));
         } catch { /* ignore render races */ }
       });
-    });
+    };
+    process.stdin.on("keypress", footerKeypressHandler);
+    promptListenerCleanups.push(() => process.stdin.off("keypress", footerKeypressHandler));
     // Idle-prompt resize: re-reserve the footer at the new terminal height so the
     // fixed reservation stays accurate (otherwise the next paint would target the
     // old row count and either over-shoot or under-paint the reserved region).
-    process.stdout.on("resize", () => {
+    const idleResizeHandler = () => {
       if (!previewArmed) return;
       try {
         disarmPreview();
         armPreview();
         drawFooter(promptHistoryLines ? historyPreviewLines(promptHistoryLines) : previewLines(typedLine, navIdx));
       } catch { /* ignore resize render races */ }
-    });
+    };
+    process.stdout.on("resize", idleResizeHandler);
+    promptListenerCleanups.push(() => process.stdout.off("resize", idleResizeHandler));
   }
   while (true) {
@@ -3162,7 +3125,14 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
       }
       if (input === "/clear") {
         history.length = 1;
-        console.log("(history cleared)");
+        // Back to the initial screen: wipe the conversation, clear the terminal +
+        // scrollback, and re-render the welcome banner so /clear looks like a fresh launch.
+        if (process.stdout.isTTY) {
+          disarmPreview();
+          process.stdout.write("\x1b[2J\x1b[3J\x1b[H"); // clear screen + scrollback + cursor home
+          console.log(renderWelcome(welcomeData).join("\n"));
+        }
+        console.log("(history cleared — back to the start screen)");
         continue;
       }
       if (input === "/compact") {
@@ -3281,6 +3251,20 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
               if (history[k]!.role === "assistant" && !lastReply) lastReply = String(history[k]!.content ?? "");
               if (lastUserInput && lastReply) break;
             }
+            // Seed readline's input history so ↑ in the prompt recalls THIS session's
+            // prior prompts (not just lines typed in the current run). readline history
+            // is newest-first; unshift in chronological order so the session's newest
+            // prompt lands at the front (first ↑). Skip injected/framed messages.
+            const rli = rl as unknown as { history?: string[] };
+            if (Array.isArray(rli.history)) {
+              const priorPrompts = history
+                .filter(m => m.role === "user")
+                .map(m => String(m.content ?? "").trim())
+                .filter(c => c && !c.startsWith("Tool [") && !c.startsWith("[mid-turn steering") && !c.startsWith("[Earlier conversation summary]"));
+              for (const p of priorPrompts) {
+                if (rli.history[0] !== p) rli.history.unshift(p);
+              }
+            }
             const sep = "─".repeat(Math.min(48, Math.max(20, (process.stdout.columns ?? 80) - 1)));
             logLines([
               sep,
@@ -4359,6 +4343,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
     } catch { /* best effort */ }
     process.removeListener("SIGINT", forceExitFromCtrlC);
     process.stdin.off("data", forceExitOnCtrlCByte);
+    drainPromptListeners();
     restorePromptRawMode();
     process.exit(130);
   }
@@ -4374,6 +4359,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
   if (sessionId && !flags.noSession) console.log(formatResumeHint(sessionId));
   process.removeListener("SIGINT", forceExitFromCtrlC);
   process.stdin.off("data", forceExitOnCtrlCByte);
+  drainPromptListeners();
   restorePromptRawMode();
   gracefulReadlineClose = true;
   rl.close();