jeo-code 0.5.7 → 0.5.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.ja.md CHANGED
@@ -150,11 +150,11 @@ CI は `.github/workflows/npm-publish.yml` で公開します — GitHub リリ
150
150
  ## 変更履歴 (Changelog)
151
151
 
152
152
  <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
153
+ - **[0.5.9]** (2026-06-15) — Bounded per-frame wrap for the live thinking/tool-output blocks — re-render cost no longer grows with stream length.
154
+ - **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
153
155
  - **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
154
156
  - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
155
157
  - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
156
- - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
157
- - **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
158
158
 
159
159
  See [CHANGELOG.md](CHANGELOG.md) for the full history.
160
160
  <!-- CHANGELOG:END -->
package/README.ko.md CHANGED
@@ -150,11 +150,11 @@ CI는 `.github/workflows/npm-publish.yml`로 배포합니다 — GitHub 릴리
150
150
  ## 변경 이력 (Changelog)
151
151
 
152
152
  <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
153
+ - **[0.5.9]** (2026-06-15) — Bounded per-frame wrap for the live thinking/tool-output blocks — re-render cost no longer grows with stream length.
154
+ - **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
153
155
  - **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
154
156
  - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
155
157
  - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
156
- - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
157
- - **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
158
158
 
159
159
  See [CHANGELOG.md](CHANGELOG.md) for the full history.
160
160
  <!-- CHANGELOG:END -->
package/README.md CHANGED
@@ -150,11 +150,11 @@ Required npm token permissions (repository secret `NPM_TOKEN`):
150
150
  ## Changelog
151
151
 
152
152
  <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
153
+ - **[0.5.9]** (2026-06-15) — Bounded per-frame wrap for the live thinking/tool-output blocks — re-render cost no longer grows with stream length.
154
+ - **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
153
155
  - **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
154
156
  - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
155
157
  - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
156
- - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
157
- - **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
158
158
 
159
159
  See [CHANGELOG.md](CHANGELOG.md) for the full history.
160
160
  <!-- CHANGELOG:END -->
package/README.zh.md CHANGED
@@ -150,11 +150,11 @@ CI 通过 `.github/workflows/npm-publish.yml` 发布 — GitHub 发布 release
150
150
  ## 更新日志 (Changelog)
151
151
 
152
152
  <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
153
+ - **[0.5.9]** (2026-06-15) — Bounded per-frame wrap for the live thinking/tool-output blocks — re-render cost no longer grows with stream length.
154
+ - **[0.5.8]** (2026-06-15) — Native Opik observability for the turn loop (opt-in `JEO_OPIK`, pure-TS no-op when unset) + autopilot convergence tracking.
153
155
  - **[0.5.7]** (2026-06-15) — `/model` picker is default-only, `/clear` resets to the initial screen, ESC clears the input box, and a launch process-listener leak is fixed.
154
156
  - **[0.5.6]** (2026-06-15) — `/model` sets only the default thinking; per-role reasoning moved to `/agents`.
155
157
  - **[0.5.5]** (2026-06-15) — Full multi-line visibility — the input box scrolls to the caret and the submitted card shows every line.
156
- - **[0.5.4]** (2026-06-15) — Reliable multi-line input is ON by default — a paste fills the box and submits as one message.
157
- - **[0.5.3]** (2026-06-15) — `$` chains multiple skills in one line (all run, in order), plus multi-line prompt input — paste-merge and gated Shift+Enter.
158
158
 
159
159
  See [CHANGELOG.md](CHANGELOG.md) for the full history.
160
160
  <!-- CHANGELOG:END -->
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "jeo-code",
3
- "version": "0.5.7",
3
+ "version": "0.5.9",
4
4
  "description": "Clean, highly optimized AI coding agent using spec-first loop",
5
5
  "type": "module",
6
6
  "main": "src/cli.ts",
@@ -0,0 +1,364 @@
1
+ /**
2
+ * Opik observability for the jeo agent turn loop (spec-stack · Run phase).
3
+ *
4
+ * Each agent turn becomes ONE Opik trace; each step/tool becomes a span; token
5
+ * usage and the eval feedback scores (`completed` / `verified` / `efficiency`)
6
+ * are attached to the trace. Pure TypeScript over `fetch` — no Python, no
7
+ * `opik` npm package — consistent with jeo's zero-native-dependency constraint.
8
+ *
9
+ * Hard invariants (see .specify/specs/opik-observability/seed.md):
10
+ * - I1: `JEO_OPIK` unset => the tracer is a no-op; zero Opik HTTP calls.
11
+ * - I2: no tracer error ever propagates out of an events callback.
12
+ * - I3: no secret is logged; the key only travels in the `Authorization` header.
13
+ * - I4: engine output is identical regardless of tracing outcome.
14
+ *
15
+ * Opik REST surface (private v1), confirmed against the installed SDK:
16
+ * - POST {base}/v1/private/traces/batch { traces: [...] }
17
+ * - POST {base}/v1/private/spans/batch { spans: [...] }
18
+ * - PUT {base}/v1/private/traces/feedback-scores { scores: [...] }
19
+ * Headers: `Authorization: <api_key>`, `Comet-Workspace: <workspace>`.
20
+ */
21
+ import { jeoEnv } from "../util/env";
22
+ import type { AgentLoopEvents, ToolInvocation } from "./engine";
23
+
24
/** Environment-variable bag; `process.env` has this shape. */
type Env = Record<string, string | undefined>;
/** Signature of the `fetch` implementation used for Opik calls (injectable through `createOpikTracer`). */
type FetchImpl = typeof fetch;

/** Opik API base URL used when `OPIK_URL_OVERRIDE` is not provided. */
const DEFAULT_BASE = "https://www.comet.com/opik/api";
/** Project name used when `OPIK_PROJECT_NAME` is not provided. */
const DEFAULT_PROJECT = "jeo";
/** Workspace name used when `COMET_WORKSPACE` is not provided. */
const DEFAULT_WORKSPACE = "jeo";
/**
 * Word-boundary, case-insensitive match for verification vocabulary
 * (test/tsc/lint/build/…). Feeds the `verified` eval score and is meant to
 * mirror `VERIFY_SIGNAL_RE` in engine.ts.
 */
const VERIFY_SIGNAL_RE = /\b(test|tests|tsc|typecheck|lint|build|check|spec|pytest|vitest|jest)\b/i;
32
+
33
/**
 * Master switch for tracing.
 *
 * Reads the `OPIK` flag through `jeoEnv` and reports `true` only when its
 * trimmed, lower-cased value is one of `1`, `true`, `yes` or `on`. A missing
 * flag (or any other value) leaves tracing off.
 *
 * @param env Environment to read; defaults to `process.env`.
 */
export function opikEnabled(env: Env = process.env): boolean {
  const flag = (jeoEnv("OPIK", env) ?? "").trim().toLowerCase();
  return ["1", "true", "yes", "on"].includes(flag);
}
38
+
39
/** Connection settings for the Opik REST API, as produced by `resolveOpikConfig`. */
export interface OpikConfig {
  /** API key sent as the `Authorization` header; absent when none is configured. */
  apiKey?: string;
  /** Workspace sent as the `Comet-Workspace` header. */
  workspace: string;
  /** API base URL without a trailing slash; request paths are appended after a `/`. */
  baseUrl: string;
  /** Project that traces, spans and feedback scores are filed under. */
  projectName: string;
}
45
+
46
/**
 * Resolve Opik connection config from the environment (no I/O).
 *
 * Variables read: `OPIK_URL_OVERRIDE`, `OPIK_API_KEY`, `COMET_WORKSPACE`,
 * `OPIK_PROJECT_NAME`. Every value is trimmed. A missing or blank workspace,
 * project or base URL falls back to its default, so a blank (or slash-only)
 * `OPIK_URL_OVERRIDE` can no longer produce an empty base URL and therefore
 * relative request paths. A blank API key resolves to `undefined`.
 *
 * @param env Environment to read; defaults to `process.env`.
 * @returns The resolved config; `baseUrl` never ends with a slash.
 */
export function resolveOpikConfig(env: Env = process.env): OpikConfig {
  // Strip trailing slashes so `${baseUrl}/${path}` joins are predictable, then
  // fall back to the default when nothing usable is left.
  const overrideUrl = (env.OPIK_URL_OVERRIDE ?? "").trim().replace(/\/+$/, "");
  const baseUrl = overrideUrl || DEFAULT_BASE;
  return {
    apiKey: env.OPIK_API_KEY?.trim() || undefined,
    workspace: (env.COMET_WORKSPACE ?? DEFAULT_WORKSPACE).trim() || DEFAULT_WORKSPACE,
    baseUrl,
    projectName: (env.OPIK_PROJECT_NAME ?? DEFAULT_PROJECT).trim() || DEFAULT_PROJECT,
  };
}
58
+
59
/**
 * Generate an RFC-9562 UUIDv7 (time-ordered) — Opik orders traces/spans by id.
 *
 * Layout: bytes 0-5 hold the 48-bit big-endian millisecond timestamp, the high
 * nibble of byte 6 is the version (7), the top two bits of byte 8 are the
 * variant (`10`), and everything else comes from `rnd`.
 *
 * @param now Timestamp in milliseconds. Negative values clamp to 0 and
 *   non-finite values (NaN / ±Infinity) are treated as 0, so the result is
 *   always a well-formed UUID. Values wider than 48 bits keep their low 48 bits.
 * @param rnd Random source returning numbers in [0, 1). It is called exactly
 *   16 times per id, so a seeded source yields reproducible ids.
 * @returns The canonical lower-case `8-4-4-4-12` string form.
 */
export function uuidv7(now: number = Date.now(), rnd: () => number = Math.random): string {
  // A non-finite timestamp would stringify as "NaN"/"Infinity" and leak
  // non-hex characters into the id, so treat it as 0 instead.
  const ts = Number.isFinite(now) ? Math.max(0, Math.trunc(now)) : 0;
  const tsHex = ts.toString(16).padStart(12, "0").slice(-12);

  // Draw all 16 bytes first (keeps the number of rnd() calls fixed) …
  const bytes: number[] = Array.from({ length: 16 }, () => Math.floor(rnd() * 256) & 0xff);
  // … then overwrite the leading 6 with the big-endian timestamp.
  for (let i = 0; i < 6; i++) bytes[i] = parseInt(tsHex.slice(i * 2, i * 2 + 2), 16);
  bytes[6] = 0x70 | (bytes[6]! & 0x0f); // version 7
  bytes[8] = 0x80 | (bytes[8]! & 0x3f); // variant 10xx

  const hex = bytes.map(x => x.toString(16).padStart(2, "0")).join("");
  return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`;
}
72
+
73
/**
 * Format a millisecond timestamp as an ISO-8601 string with milliseconds,
 * which is the RFC-3339 shape Opik expects for `start_time` / `end_time`.
 */
function iso(ms: number): string {
  const instant = new Date(ms);
  return instant.toISOString();
}
77
+
78
/** Descriptive data recorded on the trace for one agent turn. */
export interface TurnMeta {
  /** Human-readable trace name (the user intent / first message). */
  name: string;
  /** User input stored as the trace input; omitted from the payload when absent. */
  input?: string;
  /** Free-form extra metadata (model, cwd, …) copied onto the trace. */
  metadata?: Record<string, unknown>;
  /** Trace tags; the trace payload falls back to `["jeo"]` when not set. */
  tags?: string[];
}
87
+
88
/** One observed tool result; each record is exported as one Opik span. */
export interface StepRecord {
  /** Step number the tool result belongs to. */
  step: number;
  /** Name of the tool that ran. */
  tool: string;
  /** Whether the tool reported success. */
  success: boolean;
  /** Tool output text (the span payload keeps at most the first 4000 characters). */
  output: string;
  /** Step start, in milliseconds (converted to an ISO timestamp on export). */
  startTime: number;
  /** Step end, in milliseconds (converted to an ISO timestamp on export). */
  endTime: number;
}
96
+
97
/** Eval feedback scores attached to a turn's trace; see `computeScores` for how each is derived. */
export interface TurnScores {
  /** 1 when the turn ended in `done`, otherwise 0. */
  completed: number;
  /** 1 when a verification signal ran during the turn, otherwise 0. */
  verified: number;
  /** 1/sqrt(steps), rounded to four decimals. */
  efficiency: number;
}
102
+
103
/**
 * Derive the eval feedback scores for a finished turn. Each score is in [0, 1].
 *
 * - `completed`: 1 if the turn ended in `done`, else 0.
 * - `verified`: 1 if a verification signal (test/tsc/build/…) ran in-turn, else 0.
 * - `efficiency`: `1 / sqrt(steps)` rounded to four decimals — 1 for a
 *   single-step turn and decaying as the step count grows.
 *
 * The step count is truncated to an integer and treated as 1 when it is
 * zero, negative or not a number.
 */
export function computeScores(args: {
  done: boolean;
  steps: number;
  verificationRan: boolean;
}): TurnScores {
  const { done, verificationRan } = args;
  const wholeSteps = Math.trunc(args.steps);
  const stepCount = Math.max(1, wholeSteps || 1);
  const rawEfficiency = Math.min(1, 1 / Math.sqrt(stepCount));

  const completed = done ? 1 : 0;
  const verified = verificationRan ? 1 : 0;
  const efficiency = Number(rawEfficiency.toFixed(4));
  return { completed, verified, efficiency };
}
123
+
124
/**
 * Decide whether a tool result counts as an in-turn verification signal.
 *
 * Only `bash` results qualify, and only when their output contains one of the
 * verification words matched by `VERIFY_SIGNAL_RE`.
 */
export function isVerificationStep(tool: string, output: string): boolean {
  return tool === "bash" && VERIFY_SIGNAL_RE.test(output);
}
129
+
130
+ // ---- Pure payload builders (unit-tested without network) --------------------
131
+
132
/**
 * Build the JSON body for one entry of `traces/batch`.
 *
 * The turn metadata is shallow-copied, so the caller's object is never
 * mutated. When token usage is supplied it is recorded under `metadata.usage`
 * as `prompt_tokens` / `completion_tokens` / `total_tokens`. `input` and
 * `output` are included only when a value is present, and tags default to
 * `["jeo"]`.
 */
export function buildTracePayload(args: {
  id: string;
  project: string;
  meta: TurnMeta;
  startTime: number;
  endTime: number;
  output?: string;
  usage?: { inputTokens: number; outputTokens: number };
}): Record<string, unknown> {
  const { meta, usage } = args;

  const metadata: Record<string, unknown> = { ...(meta.metadata ?? {}) };
  if (usage) {
    const { inputTokens, outputTokens } = usage;
    metadata.usage = {
      prompt_tokens: inputTokens,
      completion_tokens: outputTokens,
      total_tokens: inputTokens + outputTokens,
    };
  }

  // Keys are added in the wire order: identity, timing, optional I/O, extras.
  const payload: Record<string, unknown> = {
    id: args.id,
    project_name: args.project,
    name: meta.name,
    start_time: iso(args.startTime),
    end_time: iso(args.endTime),
  };
  if (meta.input != null) payload.input = { message: meta.input };
  if (args.output != null) payload.output = { result: args.output };
  payload.metadata = metadata;
  payload.tags = meta.tags ?? ["jeo"];
  return payload;
}
161
+
162
/**
 * Build the JSON body for one entry of `spans/batch` from a step record.
 *
 * The span is named `step <n>: <tool>`, typed `general`, linked to its trace
 * through `trace_id`, and carries at most the first 4000 characters of the
 * tool output.
 */
export function buildSpanPayload(args: {
  id: string;
  traceId: string;
  project: string;
  rec: StepRecord;
}): Record<string, unknown> {
  const { id, traceId, project, rec } = args;
  const { step, tool, success } = rec;
  return {
    id,
    trace_id: traceId,
    project_name: project,
    name: `step ${step}: ${tool}`,
    type: "general",
    start_time: iso(rec.startTime),
    end_time: iso(rec.endTime),
    input: { tool },
    output: { success, output: rec.output.slice(0, 4000) },
    metadata: { step, success },
  };
}
182
+
183
/**
 * Build the JSON body for `traces/feedback-scores`: one entry per eval score
 * (`completed`, `verified`, `efficiency`, in that order), each addressed to
 * the trace id, marked with source `sdk`, and carrying a fixed explanatory
 * reason.
 */
export function buildScorePayload(args: {
  traceId: string;
  project: string;
  scores: TurnScores;
}): Record<string, unknown> {
  const { traceId, project, scores } = args;
  const rows: Array<["completed" | "verified" | "efficiency", string]> = [
    ["completed", "1 when the turn ended in `done`"],
    ["verified", "1 when a verification signal ran in-turn"],
    ["efficiency", "1/sqrt(steps); fewer steps score higher"],
  ];
  return {
    scores: rows.map(([name, reason]) => ({
      id: traceId,
      project_name: project,
      name,
      value: scores[name],
      source: "sdk" as const,
      reason,
    })),
  };
}
204
+
205
+ // ---- Tracer -----------------------------------------------------------------
206
+
207
/** Per-turn tracer contract shared by the live tracer and the no-op tracer. */
export interface OpikTracer {
  /** `false` for the no-op tracer; `wrapEvents` then leaves the events untouched. */
  readonly enabled: boolean;
  /** Mark the start of the turn. */
  startTurn(): void;
  /** Record one finished tool result. */
  step(rec: StepRecord): void;
  /** Add a token-usage report to the turn total. */
  usage(u: { inputTokens: number; outputTokens: number }): void;
  /** Close the turn and flush what was recorded. */
  endTurn(result: { done: boolean; steps: number; output?: string }): Promise<void>;
}
214
+
215
/** Shared do-nothing tracer returned whenever tracing is off or unconfigured. */
const NOOP_TRACER: OpikTracer = {
  enabled: false,
  startTurn: () => {},
  step: () => {},
  usage: () => {},
  endTurn: async () => {},
};
222
+
223
/**
 * Tracer that actually talks to Opik. It buffers everything for one turn in
 * memory and flushes it in `endTurn` as one trace, its spans, and the eval
 * feedback scores.
 *
 * Every recorded step gets its own span id. Step numbers are not guaranteed to
 * be unique (a step may report more than one tool result, and one tracer can
 * observe more than one loop run), so ids are stored next to each record
 * instead of being looked up by step number — otherwise such records would
 * share a single span id.
 */
class LiveOpikTracer implements OpikTracer {
  /** Upper bound for one Opik HTTP call, so an unresponsive endpoint cannot stall `endTurn`. */
  private static readonly SEND_TIMEOUT_MS = 5000;

  readonly enabled = true;
  private readonly traceId = uuidv7();
  /** Recorded steps in arrival order, each paired with its own span id. */
  private readonly spans: Array<{ id: string; rec: StepRecord }> = [];
  private startedAt = Date.now();
  private usageAcc = { inputTokens: 0, outputTokens: 0 };
  /** True once any usage report arrived; keeps an all-zero usage block off traces that never saw one. */
  private sawUsage = false;
  private verificationRan = false;
  /** Guards `endTurn` so a turn is flushed at most once. */
  private ended = false;

  constructor(
    private readonly meta: TurnMeta,
    private readonly cfg: OpikConfig,
    private readonly fetchImpl: FetchImpl,
  ) {}

  /** Request headers; the API key travels only in `Authorization` (I3). */
  private headers(): Record<string, string> {
    const h: Record<string, string> = {
      "Content-Type": "application/json",
      "Comet-Workspace": this.cfg.workspace,
    };
    if (this.cfg.apiKey) h["Authorization"] = this.cfg.apiKey;
    return h;
  }

  /**
   * Best-effort POST/PUT. Any failure — network error, timeout, or a body that
   * cannot be serialized — is swallowed so tracing can never break the turn
   * (I2/I4). The request is aborted after `SEND_TIMEOUT_MS`.
   */
  private async send(path: string, body: unknown, method: "POST" | "PUT" = "POST"): Promise<void> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), LiveOpikTracer.SEND_TIMEOUT_MS);
    try {
      await this.fetchImpl(`${this.cfg.baseUrl}/${path}`, {
        method,
        headers: this.headers(),
        body: JSON.stringify(body),
        signal: controller.signal,
      });
    } catch {
      /* never break the turn */
    } finally {
      clearTimeout(timer);
    }
  }

  /** Reset the turn's start time to now. */
  startTurn(): void {
    this.startedAt = Date.now();
  }

  /** Buffer one tool result and note whether it was a verification run. */
  step(rec: StepRecord): void {
    this.spans.push({ id: uuidv7(rec.startTime), rec });
    if (isVerificationStep(rec.tool, rec.output)) this.verificationRan = true;
  }

  /** Accumulate token usage; missing or NaN counts are treated as 0. */
  usage(u: { inputTokens: number; outputTokens: number }): void {
    this.usageAcc.inputTokens += u.inputTokens || 0;
    this.usageAcc.outputTokens += u.outputTokens || 0;
    this.sawUsage = true;
  }

  /**
   * Flush the turn: trace first, then spans (skipped when there are none),
   * then feedback scores. Calls after the first are ignored.
   */
  async endTurn(result: { done: boolean; steps: number; output?: string }): Promise<void> {
    if (this.ended) return;
    this.ended = true;
    const endedAt = Date.now();
    const project = this.cfg.projectName;

    const trace = buildTracePayload({
      id: this.traceId,
      project,
      meta: this.meta,
      startTime: this.startedAt,
      endTime: endedAt,
      output: result.output,
      usage: this.sawUsage ? this.usageAcc : undefined,
    });
    const spans = this.spans.map(({ id, rec }) =>
      buildSpanPayload({ id, traceId: this.traceId, project, rec }),
    );
    const scores = computeScores({
      done: result.done,
      steps: result.steps,
      verificationRan: this.verificationRan,
    });
    const scorePayload = buildScorePayload({ traceId: this.traceId, project, scores });

    await this.send("v1/private/traces/batch", { traces: [trace] });
    if (spans.length > 0) await this.send("v1/private/spans/batch", { spans });
    await this.send("v1/private/traces/feedback-scores", scorePayload, "PUT");
  }
}
308
+
309
/**
 * Create the tracer for a single turn.
 *
 * A live tracer is returned only when tracing is switched on AND an API key
 * is configured; in every other case the shared no-op tracer is returned, so
 * no network call is ever made without credentials.
 *
 * @param meta Trace name, input, metadata and tags for the turn.
 * @param env Environment to read; defaults to `process.env`.
 * @param fetchImpl `fetch` implementation to use; defaults to the global one.
 */
export function createOpikTracer(
  meta: TurnMeta,
  env: Env = process.env,
  fetchImpl: FetchImpl = fetch,
): OpikTracer {
  if (opikEnabled(env)) {
    const cfg = resolveOpikConfig(env);
    // No credentials => stay silent rather than guess.
    if (cfg.apiKey) return new LiveOpikTracer(meta, cfg, fetchImpl);
  }
  return NOOP_TRACER;
}
323
+
324
/**
 * Layer tracer hooks over an existing `AgentLoopEvents`.
 *
 * With a disabled tracer the given events (or an empty object) are returned
 * as-is. Otherwise a new events object is returned in which:
 * - `onStep` remembers the step number and its start time, then delegates;
 * - `onToolResult` hands the tracer a step record (step number falls back to
 *   1 when no `onStep` has been seen yet), then delegates;
 * - `onUsage` forwards the usage to the tracer, then delegates;
 * - `onAssistant` and every other callback pass straight through.
 *
 * Anything the tracer throws is swallowed (I2); errors thrown by the original
 * callbacks still propagate to the caller exactly as before.
 */
export function wrapEvents(events: AgentLoopEvents | undefined, tracer: OpikTracer): AgentLoopEvents {
  const inner: AgentLoopEvents = events ?? {};
  if (!tracer.enabled) return inner;

  let activeStepStart = Date.now();
  let activeStep = 0;

  return {
    ...inner,
    onStep(step: number) {
      activeStep = step;
      activeStepStart = Date.now();
      // The tracer writes nothing at step start; it only needs the timing above.
      inner.onStep?.(step);
    },
    onAssistant(raw: string, invocation: ToolInvocation | null) {
      inner.onAssistant?.(raw, invocation);
    },
    onToolResult(tool: string, success: boolean, output: string) {
      try {
        tracer.step({
          step: activeStep || 1,
          tool,
          success,
          output,
          startTime: activeStepStart,
          endTime: Date.now(),
        });
      } catch {
        /* I2: tracer failures never escape a callback */
      }
      inner.onToolResult?.(tool, success, output);
    },
    onUsage(usage: { inputTokens: number; outputTokens: number }) {
      try {
        tracer.usage(usage);
      } catch {
        /* I2: tracer failures never escape a callback */
      }
      inner.onUsage?.(usage);
    },
  };
}
package/src/autopilot.ts CHANGED
@@ -142,6 +142,33 @@ function isImprovement(goal: Goal, score: number, best: number | undefined): boo
142
142
  return true; // gate handled via passed, not score
143
143
  }
144
144
 
145
/**
 * The one place the ratchet's keep/revert decision is made, so that step,
 * loop and status always agree.
 *
 * - `gate` goal: keep exactly when the eval passed; the score is ignored.
 * - any other goal (min/max): a NaN score cannot prove an improvement and is
 *   always reverted; otherwise keep exactly when `isImprovement` says the
 *   score beats the best so far.
 */
export function decideStep(
  goal: Goal,
  score: number,
  passed: boolean,
  best: number | undefined,
): "keep" | "revert" {
  const keep =
    goal === "gate"
      ? passed
      : !Number.isNaN(score) && isImprovement(goal, score, best);
  return keep ? "keep" : "revert";
}
162
+
163
/**
 * Report whether the loop has converged: the current streak of consecutive
 * no-progress steps (reverts) has reached `patience`. This applies to every
 * goal, gate included — a gate loop that keeps failing is making no forward
 * progress and should stop early rather than spend the remaining budget.
 */
export function isConverged(sinceImprove: number, patience: number): boolean {
  const streakReachedPatience = sinceImprove >= patience;
  return streakReachedPatience;
}
171
+
145
172
  function hasBaseline(): boolean {
146
173
  return readLog().some((e) => e.type === "baseline");
147
174
  }
@@ -189,14 +216,7 @@ function cmdStep(flags: Record<string, string>): void {
189
216
  const best = currentBest(s);
190
217
  const { score, passed, output } = runEval(s);
191
218
 
192
- let decision: "keep" | "revert";
193
- if (s.goal === "gate") {
194
- decision = passed ? "keep" : "revert";
195
- } else if (Number.isNaN(score)) {
196
- decision = "revert"; // no measurable score => cannot prove improvement
197
- } else {
198
- decision = isImprovement(s.goal, score, best) ? "keep" : "revert";
199
- }
219
+ const decision = decideStep(s.goal, score, passed, best);
200
220
 
201
221
  if (decision === "revert" && flags["on-revert"]) {
202
222
  try {
@@ -242,10 +262,7 @@ function cmdLoop(flags: Record<string, string>): void {
242
262
 
243
263
  const best = currentBest(s);
244
264
  const { score, passed, output } = runEval(s);
245
- let decision: "keep" | "revert";
246
- if (s.goal === "gate") decision = passed ? "keep" : "revert";
247
- else if (Number.isNaN(score)) decision = "revert";
248
- else decision = isImprovement(s.goal, score, best) ? "keep" : "revert";
265
+ const decision = decideStep(s.goal, score, passed, best);
249
266
 
250
267
  if (decision === "revert" && flags["on-revert"]) {
251
268
  try {
@@ -255,11 +272,12 @@ function cmdLoop(flags: Record<string, string>): void {
255
272
  }
256
273
  }
257
274
  appendLog({ type: "step", iteration: i, change: `loop#${i}`, score, passed, decision, prevBest: best ?? null, output });
258
- const improved = decision === "keep" && (s.goal === "gate" || !Number.isNaN(score));
259
- sinceImprove = improved && (best === undefined || s.goal === "gate" || isImprovement(s.goal, score, best)) ? 0 : sinceImprove + 1;
275
+ // A keep is forward progress (min/max: provably an improvement; gate: a pass).
276
+ // Anything else extends the no-progress streak toward convergence.
277
+ sinceImprove = decision === "keep" ? 0 : sinceImprove + 1;
260
278
  console.log(`jeo autopilot: loop ${i}/${max} ${decision.toUpperCase()} score=${fmt(score)} (sinceImprove=${sinceImprove})`);
261
279
 
262
- if (s.goal !== "gate" && sinceImprove >= s.patience) {
280
+ if (isConverged(sinceImprove, s.patience)) {
263
281
  appendLog({ type: "stop", reason: "converged", iteration: i, patience: s.patience });
264
282
  console.log(`jeo autopilot: stop — converged (no improvement in ${s.patience} steps)`);
265
283
  return;
@@ -279,13 +297,13 @@ function cmdStatus(flags: Record<string, string>): void {
279
297
  const best = currentBest(s);
280
298
  const stop = [...log].reverse().find((e) => e.type === "stop");
281
299
 
282
- // convergence: steps since last keep-with-improvement
300
+ // convergence: steps since last keep (forward progress)
283
301
  let sinceImprove = 0;
284
302
  for (const e of steps) {
285
303
  if (e.decision === "keep") sinceImprove = 0;
286
304
  else sinceImprove++;
287
305
  }
288
- const converged = s.goal !== "gate" && sinceImprove >= s.patience;
306
+ const converged = isConverged(sinceImprove, s.patience);
289
307
 
290
308
  let recommendation: string;
291
309
  if (stop) recommendation = `stopped: ${stop.reason as string}`;
@@ -2,6 +2,7 @@ import { createInterface } from "node:readline/promises";
2
2
  import { emitKeypressEvents } from "node:readline";
3
3
  import { PassThrough } from "node:stream";
4
4
  import { runAgentLoop, executorSystemPrompt, DEFAULT_TOOLS, TOOL_PROTOCOL, WORKING_DISCIPLINE, type AgentLoopEvents } from "../agent/engine";
5
+ import { createOpikTracer, wrapEvents } from "../agent/opik-tracer";
5
6
  import { initialDynamicStepLimit } from "../agent/step-budget";
6
7
  import { memoryPromptSection, spawnDetachedDistill } from "../agent/memory";
7
8
  import { createTaskTool, taskToolProtocolLine, type TaskSubEvent } from "../agent/task-tool";
@@ -1472,6 +1473,16 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
1472
1473
  subagent: createSubagentTool(subagentRegistry),
1473
1474
  };
1474
1475
  const tools = filterToolMap(fullTools, Array.from(allowedTools));
1476
+ // Opik observability (opt-in via JEO_OPIK): one trace per turn, spans per
1477
+ // step/tool, token usage, and completed/verified/efficiency eval scores.
1478
+ // No-op (zero network) when disabled or unconfigured; never breaks a turn.
1479
+ const opik = createOpikTracer({
1480
+ name: userInput.trim().slice(0, 80) || "jeo turn",
1481
+ input: userInput,
1482
+ metadata: { model: sessionModel, cwd },
1483
+ tags: ["jeo", "launch"],
1484
+ });
1485
+ opik.startTurn();
1475
1486
  result = await runAgentLoop(history, {
1476
1487
  cwd,
1477
1488
  tools,
@@ -1480,7 +1491,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
1480
1491
  maxTokens: sessionThinking ? thinkingMaxTokens(sessionThinking) : undefined,
1481
1492
  signal: ac.signal,
1482
1493
  steer: drainSteer,
1483
- events: { ...withToolDetailCapture(tui ? tui.events() : streamEvents), onBeforeDone },
1494
+ events: wrapEvents({ ...withToolDetailCapture(tui ? tui.events() : streamEvents), onBeforeDone }, opik),
1484
1495
  });
1485
1496
  if (result.done && looksLikeSkillEcho(result.doneReason ?? "", resolvedSkills)) {
1486
1497
  history.push({
@@ -1498,7 +1509,7 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
1498
1509
  maxTokens: sessionThinking ? thinkingMaxTokens(sessionThinking) : undefined,
1499
1510
  signal: ac.signal,
1500
1511
  steer: drainSteer,
1501
- events: withToolDetailCapture(tui ? tui.events() : streamEvents),
1512
+ events: wrapEvents(withToolDetailCapture(tui ? tui.events() : streamEvents), opik),
1502
1513
  });
1503
1514
  const usage =
1504
1515
  result.usage && retry.usage
@@ -1509,6 +1520,8 @@ export async function runLaunchCommand(args: string[]): Promise<void> {
1509
1520
  : retry.usage ?? result.usage;
1510
1521
  result = { ...retry, steps: result.steps + retry.steps, usage };
1511
1522
  }
1523
+ // Close the Opik trace once per turn (done or budget-stop). Errors swallowed.
1524
+ await opik.endTurn({ done: result.done, steps: result.steps, output: result.doneReason });
1512
1525
  } finally {
1513
1526
  harness.dispose();
1514
1527
  subagentRegistry.cancelAll(); // #9: no detached run leaks past the turn
package/src/tui/app.ts CHANGED
@@ -100,6 +100,18 @@ function extractStreamingActivity(buf: string): string {
100
100
  return t.replace(/\s+/g, " ").slice(0, 140);
101
101
  }
102
102
 
103
/**
 * Default size (in UTF-16 code units) of the trailing window that is handed
 * to the per-frame wrap. 16 KiB is far more than the ~1 KB the visible rows
 * need, so no on-screen row is lost to the cut.
 */
export const FRAME_WRAP_TAIL_CHARS = 16 * 1024;

/**
 * Cap the text fed to a per-frame wrap at a fixed trailing window.
 *
 * The live thinking and tool-output blocks only ever DISPLAY their last few
 * wrapped rows, yet they accumulate the whole step's text. Re-wrapping the
 * full string on every render tick made per-frame work (and GC churn) grow
 * with how much had streamed — a long reasoning trace or a chatty tool can
 * reach hundreds of KB. Keeping only the last `maxChars` code units bounds
 * the wrap cost at O(maxChars) while leaving the visible tail unchanged.
 *
 * If the cut would fall between the two halves of a UTF-16 surrogate pair,
 * the orphaned low surrogate is skipped so the result never starts with half
 * a character (the tail is then one code unit shorter than `maxChars`).
 *
 * @param text Accumulated text.
 * @param maxChars Window size in UTF-16 code units; zero or negative yields "".
 * @returns `text` itself when it already fits, otherwise its tail.
 */
export function tailForWrap(text: string, maxChars = FRAME_WRAP_TAIL_CHARS): string {
  if (!(text.length > maxChars)) return text;

  let start = text.length - maxChars;
  const first = text.charCodeAt(start);
  const startsOnLowSurrogate = first >= 0xdc00 && first <= 0xdfff;
  if (start > 0 && startsOnLowSurrogate) {
    const prev = text.charCodeAt(start - 1);
    // Only a low surrogate preceded by its high half is a split pair.
    if (prev >= 0xd800 && prev <= 0xdbff) start += 1;
  }
  return text.slice(start);
}
103
115
  const DEFAULT_MAX_STEPS = 100;
104
116
  // Tools light enough that they never get a forge card (gjc parity): completion is a
105
117
  // single ✓/✗ ledger line; only failures surface a result card with the error body.
@@ -1112,7 +1124,7 @@ export class LaunchTui {
1112
1124
  const liveThink = this.streamingThought.trim() || this.streamingReasoning.trim();
1113
1125
  if (isThinking && liveThink) {
1114
1126
  const wrapW = Math.max(8, Math.min(120, cols) - 2);
1115
- const wrapped = liveThink
1127
+ const wrapped = tailForWrap(liveThink)
1116
1128
  .split("\n")
1117
1129
  .flatMap(l => wrapTextWithAnsi(l, wrapW))
1118
1130
  .filter(l => l.length > 0);
@@ -1133,7 +1145,7 @@ export class LaunchTui {
1133
1145
  // It is transient — cleared on result, when the formatted forge card takes over.
1134
1146
  if (this.runningTool && this.liveToolOutput.trim()) {
1135
1147
  const wrapW = Math.max(8, Math.min(120, cols) - 2);
1136
- const wrapped = this.liveToolOutput
1148
+ const wrapped = tailForWrap(this.liveToolOutput)
1137
1149
  .split("\n")
1138
1150
  .flatMap(l => wrapTextWithAnsi(l, wrapW))
1139
1151
  .filter(l => l.length > 0);