@lattices/cli 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/README.md +85 -9
  2. package/app/Package.swift +8 -1
  3. package/app/Sources/AdvisorLearningStore.swift +90 -0
  4. package/app/Sources/AgentSession.swift +377 -0
  5. package/app/Sources/AppDelegate.swift +44 -12
  6. package/app/Sources/AppShellView.swift +81 -8
  7. package/app/Sources/AudioProvider.swift +386 -0
  8. package/app/Sources/CheatSheetHUD.swift +261 -19
  9. package/app/Sources/DaemonProtocol.swift +13 -0
  10. package/app/Sources/DaemonServer.swift +8 -0
  11. package/app/Sources/DesktopModel.swift +164 -5
  12. package/app/Sources/DesktopModelTypes.swift +2 -0
  13. package/app/Sources/DiagnosticLog.swift +104 -2
  14. package/app/Sources/EventBus.swift +1 -0
  15. package/app/Sources/HUDBottomBar.swift +279 -0
  16. package/app/Sources/HUDController.swift +1158 -0
  17. package/app/Sources/HUDLeftBar.swift +849 -0
  18. package/app/Sources/HUDMinimap.swift +179 -0
  19. package/app/Sources/HUDRightBar.swift +774 -0
  20. package/app/Sources/HUDState.swift +367 -0
  21. package/app/Sources/HUDTopBar.swift +243 -0
  22. package/app/Sources/HandsOffSession.swift +733 -0
  23. package/app/Sources/HomeDashboardView.swift +125 -0
  24. package/app/Sources/HotkeyManager.swift +2 -0
  25. package/app/Sources/HotkeyStore.swift +45 -9
  26. package/app/Sources/IntentEngine.swift +925 -0
  27. package/app/Sources/Intents/CreateLayerIntent.swift +54 -0
  28. package/app/Sources/Intents/DistributeIntent.swift +56 -0
  29. package/app/Sources/Intents/FocusIntent.swift +69 -0
  30. package/app/Sources/Intents/HelpIntent.swift +41 -0
  31. package/app/Sources/Intents/KillIntent.swift +47 -0
  32. package/app/Sources/Intents/LatticeIntent.swift +78 -0
  33. package/app/Sources/Intents/LaunchIntent.swift +67 -0
  34. package/app/Sources/Intents/ListSessionsIntent.swift +32 -0
  35. package/app/Sources/Intents/ListWindowsIntent.swift +30 -0
  36. package/app/Sources/Intents/ScanIntent.swift +52 -0
  37. package/app/Sources/Intents/SearchIntent.swift +190 -0
  38. package/app/Sources/Intents/SwitchLayerIntent.swift +50 -0
  39. package/app/Sources/Intents/TileIntent.swift +61 -0
  40. package/app/Sources/LatticesApi.swift +1235 -30
  41. package/app/Sources/LauncherHUD.swift +348 -0
  42. package/app/Sources/MainView.swift +147 -44
  43. package/app/Sources/OcrModel.swift +34 -1
  44. package/app/Sources/OmniSearchState.swift +99 -102
  45. package/app/Sources/OnboardingView.swift +457 -0
  46. package/app/Sources/PermissionChecker.swift +2 -12
  47. package/app/Sources/PiChatDock.swift +454 -0
  48. package/app/Sources/PiChatSession.swift +815 -0
  49. package/app/Sources/PiWorkspaceView.swift +364 -0
  50. package/app/Sources/PlacementSpec.swift +195 -0
  51. package/app/Sources/Preferences.swift +59 -0
  52. package/app/Sources/ProjectScanner.swift +1 -1
  53. package/app/Sources/ScreenMapState.swift +701 -55
  54. package/app/Sources/ScreenMapView.swift +843 -103
  55. package/app/Sources/ScreenMapWindowController.swift +22 -0
  56. package/app/Sources/SessionLayerStore.swift +285 -0
  57. package/app/Sources/SessionManager.swift +4 -1
  58. package/app/Sources/SettingsView.swift +186 -3
  59. package/app/Sources/Theme.swift +9 -8
  60. package/app/Sources/TmuxModel.swift +7 -0
  61. package/app/Sources/TmuxQuery.swift +27 -3
  62. package/app/Sources/VoiceChatView.swift +192 -0
  63. package/app/Sources/VoiceCommandWindow.swift +1594 -0
  64. package/app/Sources/VoiceIntentResolver.swift +671 -0
  65. package/app/Sources/VoxClient.swift +454 -0
  66. package/app/Sources/WindowTiler.swift +348 -87
  67. package/app/Sources/WorkspaceManager.swift +127 -18
  68. package/bin/client.ts +16 -0
  69. package/bin/{daemon-client.js → daemon-client.ts} +49 -30
  70. package/bin/handsoff-infer.ts +280 -0
  71. package/bin/handsoff-worker.ts +731 -0
  72. package/bin/{lattices-app.js → lattices-app.ts} +67 -32
  73. package/bin/lattices-dev +160 -0
  74. package/bin/{lattices.js → lattices.ts} +600 -137
  75. package/bin/project-twin.ts +645 -0
  76. package/docs/agent-execution-plan.md +562 -0
  77. package/docs/agents.md +142 -0
  78. package/docs/api.md +153 -34
  79. package/docs/app.md +29 -1
  80. package/docs/config.md +5 -1
  81. package/docs/handsoff-test-scenarios.md +84 -0
  82. package/docs/layers.md +20 -20
  83. package/docs/ocr.md +14 -5
  84. package/docs/overview.md +5 -1
  85. package/docs/presentation-execution-review.md +491 -0
  86. package/docs/prompts/hands-off-system.md +374 -0
  87. package/docs/prompts/hands-off-turn.md +30 -0
  88. package/docs/prompts/voice-advisor.md +31 -0
  89. package/docs/prompts/voice-fallback.md +23 -0
  90. package/docs/tiling-reference.md +167 -0
  91. package/docs/twins.md +138 -0
  92. package/docs/voice-command-protocol.md +278 -0
  93. package/docs/voice.md +219 -0
  94. package/package.json +21 -10
  95. package/bin/client.js +0 -4
@@ -0,0 +1,731 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Hands-off worker — long-running process that handles both inference and TTS.
4
+ *
5
+ * Reads newline-delimited JSON commands from stdin, writes JSON responses to stdout.
6
+ * Keeps SpeakEasy and inference warm — no cold starts.
7
+ *
8
+ * Commands:
9
+ * {"cmd":"infer","transcript":"...","snapshot":{...},"history":[...]}
10
+ * {"cmd":"speak","text":"..."}
11
+ * {"cmd":"ack","text":"..."} (speak + don't wait for completion)
12
+ * {"cmd":"ping"}
13
+ *
14
+ * Responses:
15
+ * {"ok":true,"data":{...}}
16
+ * {"ok":false,"error":"..."}
17
+ */
18
+
19
+ import { infer, inferJSON } from "../lib/infer.ts";
20
+
21
+ /** Call infer and parse JSON if possible, otherwise treat as spoken-only response */
22
+ async function inferSmart(prompt: string, options: any): Promise<{ data: any; raw: any }> {
23
+ const raw = await infer(prompt, options);
24
+
25
+ // Try to parse as JSON
26
+ let cleaned = raw.text
27
+ .replace(/```json\s*/g, "")
28
+ .replace(/```\s*/g, "")
29
+ .trim();
30
+
31
+ const start = cleaned.indexOf("{");
32
+ const end = cleaned.lastIndexOf("}");
33
+
34
+ if (start !== -1 && end !== -1) {
35
+ try {
36
+ const data = JSON.parse(cleaned.slice(start, end + 1));
37
+ return { data, raw };
38
+ } catch {}
39
+ }
40
+
41
+ // Not JSON — treat as conversational response (spoken-only, no actions)
42
+ log(`response was plain text, wrapping as spoken: "${raw.text.slice(0, 80)}"`);
43
+ return {
44
+ data: { actions: [], spoken: raw.text },
45
+ raw,
46
+ };
47
+ }
48
import { spawn } from "child_process";
import { existsSync, mkdirSync, readFileSync, watch, writeFileSync } from "fs";
import { dirname, join } from "path";
51
+
52
+ // ── Streaming TTS via OpenAI API → ffplay ──────────────────────────
53
+
54
+ const OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
55
+ const ttsConfig = loadTTSConfig();
56
+
57
+ function loadTTSConfig() {
58
+ // Load API key from speakeasy config or env
59
+ let apiKey = process.env.OPENAI_API_KEY || "";
60
+ let voice = "nova";
61
+
62
+ try {
63
+ const cfg = JSON.parse(
64
+ readFileSync(join(process.env.HOME || "", ".config/speakeasy/settings.json"), "utf-8")
65
+ );
66
+ if (!apiKey && cfg.providers?.openai?.apiKey) apiKey = cfg.providers.openai.apiKey;
67
+ if (cfg.providers?.openai?.voice) voice = cfg.providers.openai.voice;
68
+ } catch {}
69
+
70
+ return { apiKey, voice };
71
+ }
72
+
73
+ /** Stream TTS: fetch audio from OpenAI and pipe directly to ffplay. Playback starts immediately. */
74
+ async function streamSpeak(text: string): Promise<number> {
75
+ const start = performance.now();
76
+
77
+ const res = await fetch(OPENAI_TTS_URL, {
78
+ method: "POST",
79
+ headers: {
80
+ "Authorization": `Bearer ${ttsConfig.apiKey}`,
81
+ "Content-Type": "application/json",
82
+ },
83
+ body: JSON.stringify({
84
+ model: "tts-1",
85
+ voice: ttsConfig.voice,
86
+ input: text,
87
+ response_format: "pcm",
88
+ speed: 1.1,
89
+ }),
90
+ });
91
+
92
+ if (!res.ok) {
93
+ throw new Error(`OpenAI TTS error: ${res.status} ${res.statusText}`);
94
+ }
95
+
96
+ const ttfb = Math.round(performance.now() - start);
97
+ log(`TTS first byte in ${ttfb}ms`);
98
+
99
+ // Pipe response body directly to ffplay — playback starts as chunks arrive
100
+ return new Promise((resolve, reject) => {
101
+ const player = spawn("ffplay", [
102
+ "-nodisp", // no video window
103
+ "-autoexit", // quit when done
104
+ "-loglevel", "quiet",
105
+ "-f", "s16le", // PCM signed 16-bit little-endian
106
+ "-ar", "24000", // OpenAI TTS outputs 24kHz
107
+ "-ch_layout", "mono",
108
+ "-", // read from stdin
109
+ ], { stdio: ["pipe", "ignore", "ignore"] });
110
+
111
+ const reader = res.body?.getReader();
112
+ if (!reader) {
113
+ reject(new Error("No response body"));
114
+ return;
115
+ }
116
+
117
+ // Pump chunks from fetch → ffplay stdin
118
+ (async () => {
119
+ while (true) {
120
+ const { done, value } = await reader.read();
121
+ if (done) break;
122
+ player.stdin.write(value);
123
+ }
124
+ player.stdin.end();
125
+ })().catch(reject);
126
+
127
+ player.on("close", () => {
128
+ const ms = Math.round(performance.now() - start);
129
+ resolve(ms);
130
+ });
131
+
132
+ player.on("error", reject);
133
+ });
134
+ }
135
+
136
// ── Pre-cached ack sounds (no API call needed) ────────────────────

// Ack phrases — played immediately when the user stops talking, while the
// (slower) LLM inference runs. One is picked at random by playAck().
const ACK_PHRASES = [
  "Got it.",
  "Heard you.",
  "On it.",
  "Yep.",
  "Cool.",
  "Sure.",
  "Okay.",
  "One sec.",
];

// Confirmation phrases — played after executing known actions. playConfirm()
// maps an intent to one of these; "Done." is the generic fallback.
const CONFIRM_PHRASES = [
  "Tiled.",
  "Focused.",
  "Done.",
  "Maximized.",
  "Split.",
  "Switched.",
  "Distributed.",
  "Restored.",
  "Searching.",
];

// On-disk PCM cache for the phrases above, populated by ensureVoiceCache().
const ackCacheDir = join(process.env.HOME || "", ".lattices", "tts-cache");
const ackCache = new Map<string, string>(); // phrase → file path
165
+
166
+ async function ensureVoiceCache() {
167
+ const { mkdirSync, existsSync, writeFileSync } = await import("fs");
168
+ mkdirSync(ackCacheDir, { recursive: true });
169
+
170
+ const allPhrases = [...ACK_PHRASES, ...CONFIRM_PHRASES];
171
+ let cached = 0;
172
+ let generated = 0;
173
+
174
+ for (const phrase of allPhrases) {
175
+ const safeName = phrase.replace(/[^a-z]/gi, "_").toLowerCase();
176
+ const filePath = join(ackCacheDir, `voice_${safeName}.pcm`);
177
+
178
+ if (existsSync(filePath)) {
179
+ ackCache.set(phrase, filePath);
180
+ cached++;
181
+ continue;
182
+ }
183
+
184
+ // Generate and cache
185
+ try {
186
+ const res = await fetch(OPENAI_TTS_URL, {
187
+ method: "POST",
188
+ headers: {
189
+ "Authorization": `Bearer ${ttsConfig.apiKey}`,
190
+ "Content-Type": "application/json",
191
+ },
192
+ body: JSON.stringify({
193
+ model: "tts-1",
194
+ voice: ttsConfig.voice,
195
+ input: phrase,
196
+ response_format: "pcm",
197
+ speed: 1.1,
198
+ }),
199
+ });
200
+
201
+ if (res.ok) {
202
+ const buf = Buffer.from(await res.arrayBuffer());
203
+ writeFileSync(filePath, buf);
204
+ ackCache.set(phrase, filePath);
205
+ generated++;
206
+ log(`cached: "${phrase}"`);
207
+ }
208
+ } catch (e: any) {
209
+ log(`cache failed for "${phrase}": ${e.message}`);
210
+ }
211
+ }
212
+ log(`voice cache: ${cached} hit, ${generated} generated, ${allPhrases.length} total`);
213
+ }
214
+
215
+ /** Play a pre-cached audio file. Near-instant — no API call. */
216
+ async function playCached(phrase: string): Promise<number> {
217
+ const start = performance.now();
218
+ const filePath = ackCache.get(phrase);
219
+
220
+ if (!filePath) {
221
+ log(`playCached: cache miss for "${phrase}", falling back to TTS`);
222
+ return streamSpeak(phrase);
223
+ }
224
+
225
+ log(`playing cached: "${phrase}"`);
226
+ return new Promise((resolve, reject) => {
227
+ const player = spawn("ffplay", [
228
+ "-nodisp", "-autoexit", "-loglevel", "quiet",
229
+ "-f", "s16le", "-ar", "24000", "-ch_layout", "mono",
230
+ filePath,
231
+ ], { stdio: ["ignore", "ignore", "pipe"] });
232
+
233
+ let stderr = "";
234
+ player.stderr?.on("data", (d: Buffer) => { stderr += d.toString(); });
235
+
236
+ player.on("close", (code: number) => {
237
+ const ms = Math.round(performance.now() - start);
238
+ if (code !== 0) log(`ffplay error (code ${code}): ${stderr.slice(0, 100)}`);
239
+ else log(`played "${phrase}" in ${ms}ms`);
240
+ resolve(ms);
241
+ });
242
+
243
+ player.on("error", (err: Error) => {
244
+ log(`ffplay spawn error: ${err.message}`);
245
+ reject(err);
246
+ });
247
+ });
248
+ }
249
+
250
+ /** Play a random ack phrase from cache. */
251
+ function playAck(): Promise<number> {
252
+ const phrase = ACK_PHRASES[Math.floor(Math.random() * ACK_PHRASES.length)];
253
+ return playCached(phrase);
254
+ }
255
+
256
+ /** Play the right confirmation for an action. */
257
+ function playConfirm(intent: string): Promise<number> {
258
+ const map: Record<string, string> = {
259
+ tile_window: "Tiled.",
260
+ focus: "Focused.",
261
+ distribute: "Distributed.",
262
+ search: "Searching.",
263
+ switch_layer: "Switched.",
264
+ create_layer: "Done.",
265
+ };
266
+ return playCached(map[intent] ?? "Done.");
267
+ }
268
+
269
+ // ── Fast path: local intent matching (no LLM needed) ──────────────
270
+
271
+ interface FastMatch {
272
+ actions: Array<{ intent: string; slots: Record<string, string> }>;
273
+ confirm: string; // which confirmation to play
274
+ }
275
+
276
+ function tryFastMatch(transcript: string, snapshot: any): FastMatch | null {
277
+ const t = transcript.toLowerCase().trim();
278
+ const activeApps = (snapshot.activeStage ?? []).map((w: any) => ({
279
+ app: w.app as string,
280
+ wid: w.wid as number,
281
+ }));
282
+
283
+ // Tile patterns
284
+ const tileMatch = t.match(
285
+ /(?:tile|snap|put|move)\s+(\w+)\s+(?:to\s+)?(?:the\s+)?(left|right|top|bottom|maximize|center|top.?left|top.?right|bottom.?left|bottom.?right|left.?third|center.?third|right.?third)/
286
+ );
287
+ if (tileMatch) {
288
+ const app = tileMatch[1];
289
+ const pos = tileMatch[2].replace(/\s+/g, "-");
290
+ return {
291
+ actions: [{ intent: "tile_window", slots: { app, position: pos } }],
292
+ confirm: "tile_window",
293
+ };
294
+ }
295
+
296
+ // Split screen: "split X and Y" or "X left Y right"
297
+ const splitMatch = t.match(/split\s+(\w+)\s+(?:and|&)\s+(\w+)/);
298
+ if (splitMatch) {
299
+ return {
300
+ actions: [
301
+ { intent: "tile_window", slots: { app: splitMatch[1], position: "left" } },
302
+ { intent: "tile_window", slots: { app: splitMatch[2], position: "right" } },
303
+ ],
304
+ confirm: "tile_window",
305
+ };
306
+ }
307
+
308
+ // Focus: "focus X" / "focus on X" / "switch to X" / "go to X"
309
+ const focusMatch = t.match(/(?:focus(?:\s+on)?|switch\s+to|go\s+to|show)\s+(?:the\s+)?(?:on\s+)?(\w+)/);
310
+ if (focusMatch && !t.includes("tile") && !t.includes("split")) {
311
+ const app = focusMatch[1];
312
+ if (app && app !== "on" && app !== "the") {
313
+ return {
314
+ actions: [{ intent: "focus", slots: { app } }],
315
+ confirm: "focus",
316
+ };
317
+ }
318
+ }
319
+
320
+ // Maximize: "maximize" / "full screen" / "make it big"
321
+ if (/maximize|full\s*screen|make\s+it\s+big/.test(t)) {
322
+ return {
323
+ actions: [{ intent: "tile_window", slots: { position: "maximize" } }],
324
+ confirm: "tile_window",
325
+ };
326
+ }
327
+
328
+ // Distribute: "grid" / "mosaic" / "distribute" / "even"
329
+ if (/grid|mosaic|distribute|even\s+(?:out|grid)|arrange/.test(t)) {
330
+ return {
331
+ actions: [{ intent: "distribute", slots: {} }],
332
+ confirm: "distribute",
333
+ };
334
+ }
335
+
336
+ // Corners: "quadrants" / "four corners"
337
+ if (/quadrants?|four\s+corners?|corners/.test(t) && activeApps.length >= 4) {
338
+ const positions = ["top-left", "top-right", "bottom-left", "bottom-right"];
339
+ return {
340
+ actions: activeApps.slice(0, 4).map((a: any, i: number) => ({
341
+ intent: "tile_window",
342
+ slots: { app: a.app, position: positions[i] },
343
+ })),
344
+ confirm: "tile_window",
345
+ };
346
+ }
347
+
348
+ // Thirds: "thirds"
349
+ if (/thirds/.test(t) && activeApps.length >= 3) {
350
+ const positions = ["left-third", "center-third", "right-third"];
351
+ return {
352
+ actions: activeApps.slice(0, 3).map((a: any, i: number) => ({
353
+ intent: "tile_window",
354
+ slots: { app: a.app, position: positions[i] },
355
+ })),
356
+ confirm: "tile_window",
357
+ };
358
+ }
359
+
360
+ return null; // No fast match — fall through to LLM
361
+ }
362
+
363
// Warm the on-disk phrase cache in the background; the command loop below
// starts immediately and playCached() falls back to streaming TTS for any
// phrase not yet cached.
ensureVoiceCache().then(() => log("voice cache ready"));

log("worker started, streaming TTS ready");
367
+
368
// ── Load system prompt once ────────────────────────────────────────

// Prompts live in <package root>/docs/prompts. NOTE(review): import.meta.dir
// is Bun-specific — this assumes the script runs under bun (see shebang).
const promptDir = join(dirname(import.meta.dir), "docs", "prompts");
let systemPrompt: string;
try {
  // Drop top-level markdown headings ("# ...") — they are for human readers,
  // not the model.
  systemPrompt = readFileSync(join(promptDir, "hands-off-system.md"), "utf-8")
    .split("\n")
    .filter((l) => !l.startsWith("# "))
    .join("\n")
    .trim();
} catch {
  // Prompt file missing (e.g. a stripped install) — use a minimal fallback.
  systemPrompt = "You are a workspace assistant. Respond with JSON: {actions, spoken}.";
}
381
+
382
// Catalog of intents the model may emit, spliced into the system prompt via
// the {{intent_catalog}} placeholder below. This text is sent to the model
// verbatim — keep wording in sync with the intent handlers in the app.
const intentCatalog = `
tile_window: Tile a window to a screen position
Slots:
position (required): Named position or grid:CxR:C,R syntax.
Halves: left, right, top, bottom
Quarters (2x2): top-left, top-right, bottom-left, bottom-right
Thirds (3x1): left-third, center-third, right-third
Sixths (3x2): top-left-third, top-center-third, top-right-third, bottom-left-third, bottom-center-third, bottom-right-third
Fourths (4x1): first-fourth, second-fourth, third-fourth, last-fourth
Eighths (4x2): top-first-fourth, top-second-fourth, top-third-fourth, top-last-fourth, bottom-first-fourth, bottom-second-fourth, bottom-third-fourth, bottom-last-fourth
Special: maximize (full screen), center (centered floating)
Grid syntax: grid:CxR:C,R (e.g. grid:5x3:2,1 = center cell of 5x3 grid)
app (optional): Target app name — match loosely (e.g. "chrome" matches "Google Chrome")
wid (optional): Target window ID (from snapshot)
session (optional): Tmux session name
If no app/wid/session given, tiles the frontmost window.
"quarter" = 2x2 cell (top-left etc.), NOT a 4x1 fourth.
"top quarter" = top-left or top-right (2x2). "top third" = top-left-third (3x2).

focus: Focus a window, app, or session
Slots: app, session, or wid (at least one)

distribute: Arrange all visible windows in an even grid. No slots.

search: Search windows by text
Slots: query (required)

list_windows: List all visible windows. No slots.

switch_layer: Switch to a workspace layer
Slots: layer (required) — name or index

create_layer: Save current arrangement as a named layer
Slots: name (required)

TILING PRESETS (use multiple tile_window actions):
"split screen" → left + right
"thirds" → left-third, center-third, right-third
"mosaic"/"grid" → use distribute
"corners"/"quadrants" → top-left, top-right, bottom-left, bottom-right
"stack" → top + bottom
"six-up"/"3 by 2" → 3x2 grid using the sixth positions
"eight-up"/"4 by 2" → 4x2 grid using the eighth positions
`;

// Splice the catalog into the prompt template loaded above.
systemPrompt = systemPrompt.replace("{{intent_catalog}}", intentCatalog);
log("system prompt loaded");
429
+
430
+ // ── Auto-restart on file changes ───────────────────────────────────
431
+
432
+ const watchFiles = [
433
+ join(promptDir, "hands-off-system.md"),
434
+ import.meta.path, // this script itself
435
+ ];
436
+
437
+ for (const f of watchFiles) {
438
+ try {
439
+ const { watch } = await import("fs");
440
+ let debounce: ReturnType<typeof setTimeout> | null = null;
441
+ watch(f, () => {
442
+ if (debounce) return;
443
+ debounce = setTimeout(() => {
444
+ log(`file changed: ${f.split("/").pop()} — exiting for restart`);
445
+ process.exit(0); // Swift auto-restarts in 2s
446
+ }, 500);
447
+ });
448
+ log(`watching: ${f.split("/").pop()}`);
449
+ } catch {}
450
+ }
451
+
452
// ── Build context message from snapshot ─────────────────────────────

/**
 * Render the user transcript plus a compact textual dump of the desktop
 * snapshot into a single LLM user message. Sections are only emitted when the
 * snapshot contains the corresponding data, keeping the message short on
 * simple setups. The exact format here is part of the model's context
 * contract — keep in sync with docs/prompts/hands-off-system.md.
 */
function buildContextMessage(transcript: string, snap: any): string {
  let msg = `USER: "${transcript}"\n\n`;
  msg += "--- DESKTOP SNAPSHOT ---\n";

  // Screens
  const screens = snap.screens ?? [];
  if (screens.length > 1) {
    msg += `Displays: ${screens.map((s: any) => `${s.width}x${s.height}${s.isMain ? " (main)" : ""}`).join(", ")}\n`;
  } else if (screens.length === 1) {
    msg += `Screen: ${screens[0].width}x${screens[0].height}\n`;
  }

  // Stage Manager
  if (snap.stageManager) {
    msg += `Stage Manager: ON (grouping: ${snap.smGrouping ?? "all-at-once"})\n`;
  }

  // All windows — full inventory, ordered front-to-back (zIndex 0 = frontmost).
  // `onScreen !== false` treats windows without the flag as visible.
  const windows = snap.windows ?? snap.activeStage ?? [];
  const onScreen = windows.filter((w: any) => w.onScreen !== false);
  const offScreen = windows.filter((w: any) => w.onScreen === false);

  msg += `\nVisible windows (${onScreen.length}, front-to-back order):\n`;
  for (const w of onScreen) {
    const flags: string[] = [];
    if (w.zIndex === 0) flags.push("FRONTMOST");
    if (w.session) flags.push(`session:${w.session}`);
    const flagStr = flags.length ? ` [${flags.join(", ")}]` : "";
    msg += ` wid:${w.wid} ${w.app}: "${w.title}" — ${w.frame}${flagStr}\n`;
  }

  if (offScreen.length > 0) {
    // Summarize hidden windows by app instead of listing all
    const hiddenByApp: Record<string, number> = {};
    for (const w of offScreen) {
      const app = w.app;
      hiddenByApp[app] = (hiddenByApp[app] || 0) + 1;
    }
    // System/helper processes are noise for the model — filter them out.
    const summary = Object.entries(hiddenByApp)
      .filter(([app]) => !["WindowManager", "Spotlight", "CursorUIViewService", "AutoFill", "coreautha", "loginwindow", "Open and Save Panel Service"].includes(app))
      .map(([app, count]) => `${app}(${count})`)
      .join(", ");
    if (summary) {
      msg += `\nHidden windows: ${summary}\n`;
    }
  }

  // Terminals — cwd, running commands, claude, tmux
  const terminals = snap.terminals ?? [];
  if (terminals.length > 0) {
    msg += `\nTerminal tabs (${terminals.length}):\n`;
    for (const t of terminals) {
      const flags: string[] = [];
      if (t.hasClaude) flags.push("Claude Code");
      if (t.tmuxSession) flags.push(`tmux:${t.tmuxSession}`);
      if (!t.isActiveTab) flags.push("background tab");
      const flagStr = flags.length ? ` [${flags.join(", ")}]` : "";
      // Abbreviate the home directory to "~/" to save tokens.
      const cwd = t.cwd ? ` cwd:${t.cwd.replace(/^\/Users\/\w+\//, "~/")}` : "";
      const cmds = (t.runningCommands ?? []).map((c: any) => c.command).join(", ");
      const cmdStr = cmds ? ` running:${cmds}` : "";
      msg += ` ${t.displayName}${cwd}${cmdStr}${flagStr}`;
      if (t.windowId) msg += ` (wid:${t.windowId})`;
      msg += "\n";
    }
  }

  // Tmux sessions
  const tmux = snap.tmuxSessions ?? [];
  if (tmux.length > 0) {
    msg += `\nTmux sessions: ${tmux.map((s: any) => `${s.name} (${s.windows} windows${s.attached ? ", attached" : ""})`).join(", ")}\n`;
  }

  // Layer
  if (snap.currentLayer) {
    msg += `\nCurrent layer: ${snap.currentLayer.name} (index: ${snap.currentLayer.index})\n`;
  }

  msg += "--- END SNAPSHOT ---\n";
  return msg;
}
534
+
535
// ── Command loop ───────────────────────────────────────────────────

const decoder = new TextDecoder();
// NOTE: Bun.stdin is Bun-specific (matches the shebang).
const reader = Bun.stdin.stream().getReader();
let buffer = ""; // partial line carried over between stdin chunks
540
+
541
/**
 * Handle one newline-delimited JSON command read from stdin.
 *
 * Known commands: ping, speak, ack, play_cached, infer, turn.
 * Each command produces exactly one respond() line on stdout; ack-style
 * commands respond immediately and do the audio work in the background.
 * Commands are processed sequentially by the stdin loop, so audio for one
 * command cannot overlap the next command's handling.
 */
async function processLine(line: string) {
  const trimmed = line.trim();
  if (!trimmed) return;

  let cmd: any;
  try {
    cmd = JSON.parse(trimmed);
  } catch {
    respond({ ok: false, error: "invalid JSON" });
    return;
  }

  switch (cmd.cmd) {
    case "ping":
      // Liveness check from the supervisor.
      respond({ ok: true, data: { pong: true } });
      break;

    case "speak":
      // Blocking speech: respond only after playback finished.
      try {
        const ms = await streamSpeak(cmd.text);
        log(`spoke "${cmd.text.slice(0, 40)}" in ${ms}ms`);
        respond({ ok: true, data: { durationMs: ms } });
      } catch (err: any) {
        log(`TTS error: ${err.message}`);
        respond({ ok: false, error: err.message });
      }
      break;

    case "ack":
      // Fire and forget — respond immediately, speak in background
      respond({ ok: true, data: { queued: true } });
      streamSpeak(cmd.text).catch((e) => log(`ack TTS error: ${e.message}`));
      break;

    case "play_cached":
      // Fire and forget, but from the pre-generated PCM cache (no API call).
      respond({ ok: true, data: { queued: true, cached: true } });
      playCached(cmd.text).catch((e) => log(`play_cached error: ${e.message}`));
      break;

    case "infer":
      // Inference only — no audio. Caller handles speaking/executing.
      try {
        const userMessage = buildContextMessage(cmd.transcript, cmd.snapshot ?? {});

        const messages = (cmd.history ?? []).map((h: any) => ({
          role: h.role as "user" | "assistant",
          content: h.content,
        }));

        const { data, raw } = await inferSmart(userMessage, {
          provider: "xai",
          model: "grok-4.20-beta-0309-non-reasoning",
          system: systemPrompt,
          messages,
          temperature: 0.2,
          maxTokens: 512,
          tag: "hands-off",
        });

        respond({
          ok: true,
          data: {
            ...data,
            // Timing/usage metadata for diagnostics on the Swift side.
            _meta: {
              provider: raw.provider,
              model: raw.model,
              durationMs: raw.durationMs,
              tokens: raw.usage?.totalTokens,
            },
          },
        });
      } catch (err: any) {
        // Include a safe fallback payload so the caller can still speak an apology.
        respond({
          ok: false,
          error: err.message,
          data: {
            actions: [],
            spoken: "Sorry, I had trouble processing that.",
          },
        });
      }
      break;

    case "turn": {
      // Full orchestrated turn — parallel where possible.
      //
      // Timeline:
      //   t=0      ──┬── ack TTS (fire & forget)
      //              └── inference
      //   t=~600ms ─── narrate TTS (what we're about to do)
      //   t=done   ─── respond with results (caller executes actions)
      //
      const turnStart = performance.now();
      const transcript = cmd.transcript;
      const snap = cmd.snapshot ?? {};
      const history = cmd.history ?? [];

      log(`⏱ turn start: "${transcript.slice(0, 50)}"`);

      // Fire cached ack sound + inference in PARALLEL
      const ackPromise = playAck().catch((e) => log(`ack error: ${e.message}`));

      // Build full context message from snapshot
      const userMessage = buildContextMessage(transcript, snap);

      // Normalize history: stringify structured content, drop empty entries.
      const messages = history.map((h: any) => ({
        role: h.role as "user" | "assistant",
        content: typeof h.content === "string" ? h.content : JSON.stringify(h.content),
      })).filter((m: any) => m.content && m.content.length > 0);

      let inferResult: any = null;
      try {
        const { data, raw } = await inferSmart(userMessage, {
          provider: "xai",
          model: "grok-4.20-beta-0309-non-reasoning",
          system: systemPrompt,
          messages,
          temperature: 0.2,
          maxTokens: 512,
          tag: "hands-off",
        });
        inferResult = { ...data, _meta: { provider: raw.provider, model: raw.model, durationMs: raw.durationMs, tokens: raw.usage?.totalTokens } };
        log(`⏱ inference done in ${raw.durationMs}ms`);
      } catch (err: any) {
        log(`⏱ inference error: ${err.message}`);
        inferResult = { actions: [], spoken: "Sorry, I had trouble with that.", _meta: { error: err.message } };
      }

      // Wait for ack to finish before narrating (don't overlap speech)
      await ackPromise;

      const hasActions = Array.isArray(inferResult.actions) && inferResult.actions.length > 0;
      const spokenText = inferResult.spoken;

      if (hasActions && spokenText) {
        // SPEAK FIRST — user must hear what's about to happen before windows move
        log(`⏱ narrating: "${spokenText.slice(0, 50)}"`);
        await streamSpeak(spokenText).catch((e) => log(`narrate error: ${e.message}`));

        // NOW respond with actions — Swift executes after user heard the plan
        const turnMs = Math.round(performance.now() - turnStart);
        log(`⏱ turn response at ${turnMs}ms — actions sent after narration`);
        respond({ ok: true, data: inferResult, turnMs });

        // Confirm
        await playCached("Done.").catch(() => {});
      } else if (spokenText) {
        // Conversation only — speak and respond
        await streamSpeak(spokenText).catch((e) => log(`speak error: ${e.message}`));
        const turnMs = Math.round(performance.now() - turnStart);
        respond({ ok: true, data: inferResult, turnMs });
      } else {
        // Nothing to say and nothing to do — still respond so the caller unblocks.
        const turnMs = Math.round(performance.now() - turnStart);
        respond({ ok: true, data: inferResult, turnMs });
      }

      const totalMs = Math.round(performance.now() - turnStart);
      log(`⏱ turn complete: ${totalMs}ms total`);
      break;
    }

    default:
      respond({ ok: false, error: `unknown command: ${cmd.cmd}` });
  }
}
707
+
708
+ // Read stdin line by line
709
+ (async () => {
710
+ while (true) {
711
+ const { done, value } = await reader.read();
712
+ if (done) break;
713
+
714
+ buffer += decoder.decode(value, { stream: true });
715
+ const lines = buffer.split("\n");
716
+ buffer = lines.pop() ?? "";
717
+
718
+ for (const line of lines) {
719
+ await processLine(line);
720
+ }
721
+ }
722
+ })();
723
+
724
+ function respond(obj: any) {
725
+ console.log(JSON.stringify(obj));
726
+ }
727
+
728
+ function log(msg: string) {
729
+ const ts = new Date().toISOString().slice(11, 23);
730
+ console.error(`[${ts}] handsoff-worker: ${msg}`);
731
+ }