@lattices/cli 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/README.md +85 -9
  2. package/app/Info.plist +30 -0
  3. package/app/Lattices.app/Contents/Info.plist +8 -2
  4. package/app/Lattices.app/Contents/MacOS/Lattices +0 -0
  5. package/app/Lattices.app/Contents/Resources/AppIcon.icns +0 -0
  6. package/app/Lattices.app/Contents/Resources/tap.wav +0 -0
  7. package/app/Lattices.app/Contents/_CodeSignature/CodeResources +139 -0
  8. package/app/Lattices.entitlements +15 -0
  9. package/app/Package.swift +8 -1
  10. package/app/Resources/tap.wav +0 -0
  11. package/app/Sources/AdvisorLearningStore.swift +90 -0
  12. package/app/Sources/AgentSession.swift +377 -0
  13. package/app/Sources/AppDelegate.swift +45 -12
  14. package/app/Sources/AppShellView.swift +81 -8
  15. package/app/Sources/AudioProvider.swift +386 -0
  16. package/app/Sources/CheatSheetHUD.swift +261 -19
  17. package/app/Sources/DaemonProtocol.swift +13 -0
  18. package/app/Sources/DaemonServer.swift +8 -0
  19. package/app/Sources/DesktopModel.swift +189 -6
  20. package/app/Sources/DesktopModelTypes.swift +2 -0
  21. package/app/Sources/DiagnosticLog.swift +104 -2
  22. package/app/Sources/EventBus.swift +1 -0
  23. package/app/Sources/HUDBottomBar.swift +279 -0
  24. package/app/Sources/HUDController.swift +1158 -0
  25. package/app/Sources/HUDLeftBar.swift +849 -0
  26. package/app/Sources/HUDMinimap.swift +179 -0
  27. package/app/Sources/HUDRightBar.swift +774 -0
  28. package/app/Sources/HUDState.swift +367 -0
  29. package/app/Sources/HUDTopBar.swift +243 -0
  30. package/app/Sources/HandsOffSession.swift +802 -0
  31. package/app/Sources/HomeDashboardView.swift +125 -0
  32. package/app/Sources/HotkeyManager.swift +2 -0
  33. package/app/Sources/HotkeyStore.swift +49 -9
  34. package/app/Sources/IntentEngine.swift +962 -0
  35. package/app/Sources/Intents/CreateLayerIntent.swift +54 -0
  36. package/app/Sources/Intents/DistributeIntent.swift +56 -0
  37. package/app/Sources/Intents/FocusIntent.swift +69 -0
  38. package/app/Sources/Intents/HelpIntent.swift +41 -0
  39. package/app/Sources/Intents/KillIntent.swift +47 -0
  40. package/app/Sources/Intents/LatticeIntent.swift +78 -0
  41. package/app/Sources/Intents/LaunchIntent.swift +67 -0
  42. package/app/Sources/Intents/ListSessionsIntent.swift +32 -0
  43. package/app/Sources/Intents/ListWindowsIntent.swift +30 -0
  44. package/app/Sources/Intents/ScanIntent.swift +52 -0
  45. package/app/Sources/Intents/SearchIntent.swift +190 -0
  46. package/app/Sources/Intents/SwitchLayerIntent.swift +50 -0
  47. package/app/Sources/Intents/TileIntent.swift +61 -0
  48. package/app/Sources/LatticesApi.swift +1275 -30
  49. package/app/Sources/LauncherHUD.swift +348 -0
  50. package/app/Sources/MainView.swift +147 -44
  51. package/app/Sources/MouseFinder.swift +222 -0
  52. package/app/Sources/OcrModel.swift +34 -1
  53. package/app/Sources/OmniSearchState.swift +99 -102
  54. package/app/Sources/OnboardingView.swift +457 -0
  55. package/app/Sources/PermissionChecker.swift +2 -12
  56. package/app/Sources/PiChatDock.swift +454 -0
  57. package/app/Sources/PiChatSession.swift +815 -0
  58. package/app/Sources/PiWorkspaceView.swift +364 -0
  59. package/app/Sources/PlacementSpec.swift +195 -0
  60. package/app/Sources/Preferences.swift +59 -0
  61. package/app/Sources/ProjectScanner.swift +58 -45
  62. package/app/Sources/ScreenMapState.swift +701 -55
  63. package/app/Sources/ScreenMapView.swift +843 -103
  64. package/app/Sources/ScreenMapWindowController.swift +22 -0
  65. package/app/Sources/SessionLayerStore.swift +285 -0
  66. package/app/Sources/SessionManager.swift +4 -1
  67. package/app/Sources/SettingsView.swift +186 -3
  68. package/app/Sources/Theme.swift +9 -8
  69. package/app/Sources/TmuxModel.swift +7 -0
  70. package/app/Sources/TmuxQuery.swift +27 -3
  71. package/app/Sources/VoiceChatView.swift +192 -0
  72. package/app/Sources/VoiceCommandWindow.swift +1594 -0
  73. package/app/Sources/VoiceIntentResolver.swift +671 -0
  74. package/app/Sources/VoxClient.swift +454 -0
  75. package/app/Sources/WindowTiler.swift +348 -87
  76. package/app/Sources/WorkspaceManager.swift +127 -18
  77. package/app/Tests/StageDragTests.swift +333 -0
  78. package/app/Tests/StageJoinTests.swift +313 -0
  79. package/app/Tests/StageManagerTests.swift +280 -0
  80. package/app/Tests/StageTileTests.swift +353 -0
  81. package/assets/AppIcon.icns +0 -0
  82. package/bin/client.ts +16 -0
  83. package/bin/{daemon-client.js → daemon-client.ts} +49 -30
  84. package/bin/handsoff-infer.ts +280 -0
  85. package/bin/handsoff-worker.ts +740 -0
  86. package/bin/lattices-app.ts +338 -0
  87. package/bin/lattices-dev +208 -0
  88. package/bin/{lattices.js → lattices.ts} +777 -140
  89. package/bin/project-twin.ts +645 -0
  90. package/docs/agent-execution-plan.md +562 -0
  91. package/docs/agent-layer-guide.md +207 -0
  92. package/docs/agents.md +142 -0
  93. package/docs/api.md +153 -34
  94. package/docs/app.md +29 -1
  95. package/docs/config.md +5 -1
  96. package/docs/handsoff-test-scenarios.md +84 -0
  97. package/docs/layers.md +20 -20
  98. package/docs/ocr.md +14 -5
  99. package/docs/overview.md +5 -1
  100. package/docs/presentation-execution-review.md +491 -0
  101. package/docs/prompts/hands-off-system.md +374 -0
  102. package/docs/prompts/hands-off-turn.md +30 -0
  103. package/docs/prompts/voice-advisor.md +31 -0
  104. package/docs/prompts/voice-fallback.md +23 -0
  105. package/docs/tiling-reference.md +167 -0
  106. package/docs/twins.md +138 -0
  107. package/docs/voice-command-protocol.md +278 -0
  108. package/docs/voice.md +219 -0
  109. package/package.json +29 -11
  110. package/bin/client.js +0 -4
  111. package/bin/lattices-app.js +0 -221
@@ -0,0 +1,740 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Hands-off worker — long-running process that handles both inference and TTS.
4
+ *
5
+ * Reads newline-delimited JSON commands from stdin, writes JSON responses to stdout.
6
+ * Keeps SpeakEasy and inference warm — no cold starts.
7
+ *
8
+ * Commands:
9
+ * {"cmd":"infer","transcript":"...","snapshot":{...},"history":[...]}
10
+ * {"cmd":"speak","text":"..."}
11
+ * {"cmd":"ack","text":"..."} (speak + don't wait for completion)
12
+ * {"cmd":"ping"}
13
+ *
14
+ * Responses:
15
+ * {"ok":true,"data":{...}}
16
+ * {"ok":false,"error":"..."}
17
+ */
18
+
19
+ import { infer, inferJSON } from "../lib/infer.ts";
20
+
21
+ const INFER_TIMEOUT_MS = 15_000;
22
+
23
+ /** Call infer and parse JSON if possible, otherwise treat as spoken-only response */
24
+ async function inferSmart(prompt: string, options: any): Promise<{ data: any; raw: any }> {
25
+ const controller = new AbortController();
26
+ const timer = setTimeout(() => controller.abort(), INFER_TIMEOUT_MS);
27
+ let raw: any;
28
+ try {
29
+ raw = await infer(prompt, { ...options, abortSignal: controller.signal });
30
+ } finally {
31
+ clearTimeout(timer);
32
+ }
33
+
34
+ // Try to parse as JSON
35
+ let cleaned = raw.text
36
+ .replace(/```json\s*/g, "")
37
+ .replace(/```\s*/g, "")
38
+ .trim();
39
+
40
+ const start = cleaned.indexOf("{");
41
+ const end = cleaned.lastIndexOf("}");
42
+
43
+ if (start !== -1 && end !== -1) {
44
+ try {
45
+ const data = JSON.parse(cleaned.slice(start, end + 1));
46
+ return { data, raw };
47
+ } catch {}
48
+ }
49
+
50
+ // Not JSON — treat as conversational response (spoken-only, no actions)
51
+ log(`response was plain text, wrapping as spoken: "${raw.text.slice(0, 80)}"`);
52
+ return {
53
+ data: { actions: [], spoken: raw.text },
54
+ raw,
55
+ };
56
+ }
57
+ import { readFileSync } from "fs";
58
+ import { join, dirname } from "path";
59
+ import { spawn } from "child_process";
60
+
61
+ // ── Streaming TTS via OpenAI API → ffplay ──────────────────────────
62
+
63
+ const OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
64
+ const ttsConfig = loadTTSConfig();
65
+
66
+ function loadTTSConfig() {
67
+ // Load API key from speakeasy config or env
68
+ let apiKey = process.env.OPENAI_API_KEY || "";
69
+ let voice = "nova";
70
+
71
+ try {
72
+ const cfg = JSON.parse(
73
+ readFileSync(join(process.env.HOME || "", ".config/speakeasy/settings.json"), "utf-8")
74
+ );
75
+ if (!apiKey && cfg.providers?.openai?.apiKey) apiKey = cfg.providers.openai.apiKey;
76
+ if (cfg.providers?.openai?.voice) voice = cfg.providers.openai.voice;
77
+ } catch {}
78
+
79
+ return { apiKey, voice };
80
+ }
81
+
82
+ /** Stream TTS: fetch audio from OpenAI and pipe directly to ffplay. Playback starts immediately. */
83
+ async function streamSpeak(text: string): Promise<number> {
84
+ const start = performance.now();
85
+
86
+ const res = await fetch(OPENAI_TTS_URL, {
87
+ method: "POST",
88
+ headers: {
89
+ "Authorization": `Bearer ${ttsConfig.apiKey}`,
90
+ "Content-Type": "application/json",
91
+ },
92
+ body: JSON.stringify({
93
+ model: "tts-1",
94
+ voice: ttsConfig.voice,
95
+ input: text,
96
+ response_format: "pcm",
97
+ speed: 1.1,
98
+ }),
99
+ });
100
+
101
+ if (!res.ok) {
102
+ throw new Error(`OpenAI TTS error: ${res.status} ${res.statusText}`);
103
+ }
104
+
105
+ const ttfb = Math.round(performance.now() - start);
106
+ log(`TTS first byte in ${ttfb}ms`);
107
+
108
+ // Pipe response body directly to ffplay — playback starts as chunks arrive
109
+ return new Promise((resolve, reject) => {
110
+ const player = spawn("ffplay", [
111
+ "-nodisp", // no video window
112
+ "-autoexit", // quit when done
113
+ "-loglevel", "quiet",
114
+ "-f", "s16le", // PCM signed 16-bit little-endian
115
+ "-ar", "24000", // OpenAI TTS outputs 24kHz
116
+ "-ch_layout", "mono",
117
+ "-", // read from stdin
118
+ ], { stdio: ["pipe", "ignore", "ignore"] });
119
+
120
+ const reader = res.body?.getReader();
121
+ if (!reader) {
122
+ reject(new Error("No response body"));
123
+ return;
124
+ }
125
+
126
+ // Pump chunks from fetch → ffplay stdin
127
+ (async () => {
128
+ while (true) {
129
+ const { done, value } = await reader.read();
130
+ if (done) break;
131
+ player.stdin.write(value);
132
+ }
133
+ player.stdin.end();
134
+ })().catch(reject);
135
+
136
+ player.on("close", () => {
137
+ const ms = Math.round(performance.now() - start);
138
+ resolve(ms);
139
+ });
140
+
141
+ player.on("error", reject);
142
+ });
143
+ }
144
+
145
+ // ── Pre-cached ack sounds (no API call needed) ────────────────────
146
+
147
+ // Ack phrases — played immediately when user stops talking
148
+ const ACK_PHRASES = [
149
+ "Got it.",
150
+ "Heard you.",
151
+ "On it.",
152
+ "Yep.",
153
+ "Cool.",
154
+ "Sure.",
155
+ "Okay.",
156
+ "One sec.",
157
+ ];
158
+
159
+ // Confirmation phrases — played after executing known actions
160
+ const CONFIRM_PHRASES = [
161
+ "Tiled.",
162
+ "Focused.",
163
+ "Done.",
164
+ "Maximized.",
165
+ "Split.",
166
+ "Switched.",
167
+ "Distributed.",
168
+ "Restored.",
169
+ "Searching.",
170
+ ];
171
+
172
+ const ackCacheDir = join(process.env.HOME || "", ".lattices", "tts-cache");
173
+ const ackCache = new Map<string, string>(); // phrase → file path
174
+
175
+ async function ensureVoiceCache() {
176
+ const { mkdirSync, existsSync, writeFileSync } = await import("fs");
177
+ mkdirSync(ackCacheDir, { recursive: true });
178
+
179
+ const allPhrases = [...ACK_PHRASES, ...CONFIRM_PHRASES];
180
+ let cached = 0;
181
+ let generated = 0;
182
+
183
+ for (const phrase of allPhrases) {
184
+ const safeName = phrase.replace(/[^a-z]/gi, "_").toLowerCase();
185
+ const filePath = join(ackCacheDir, `voice_${safeName}.pcm`);
186
+
187
+ if (existsSync(filePath)) {
188
+ ackCache.set(phrase, filePath);
189
+ cached++;
190
+ continue;
191
+ }
192
+
193
+ // Generate and cache
194
+ try {
195
+ const res = await fetch(OPENAI_TTS_URL, {
196
+ method: "POST",
197
+ headers: {
198
+ "Authorization": `Bearer ${ttsConfig.apiKey}`,
199
+ "Content-Type": "application/json",
200
+ },
201
+ body: JSON.stringify({
202
+ model: "tts-1",
203
+ voice: ttsConfig.voice,
204
+ input: phrase,
205
+ response_format: "pcm",
206
+ speed: 1.1,
207
+ }),
208
+ });
209
+
210
+ if (res.ok) {
211
+ const buf = Buffer.from(await res.arrayBuffer());
212
+ writeFileSync(filePath, buf);
213
+ ackCache.set(phrase, filePath);
214
+ generated++;
215
+ log(`cached: "${phrase}"`);
216
+ }
217
+ } catch (e: any) {
218
+ log(`cache failed for "${phrase}": ${e.message}`);
219
+ }
220
+ }
221
+ log(`voice cache: ${cached} hit, ${generated} generated, ${allPhrases.length} total`);
222
+ }
223
+
224
+ /** Play a pre-cached audio file. Near-instant — no API call. */
225
+ async function playCached(phrase: string): Promise<number> {
226
+ const start = performance.now();
227
+ const filePath = ackCache.get(phrase);
228
+
229
+ if (!filePath) {
230
+ log(`playCached: cache miss for "${phrase}", falling back to TTS`);
231
+ return streamSpeak(phrase);
232
+ }
233
+
234
+ log(`playing cached: "${phrase}"`);
235
+ return new Promise((resolve, reject) => {
236
+ const player = spawn("ffplay", [
237
+ "-nodisp", "-autoexit", "-loglevel", "quiet",
238
+ "-f", "s16le", "-ar", "24000", "-ch_layout", "mono",
239
+ filePath,
240
+ ], { stdio: ["ignore", "ignore", "pipe"] });
241
+
242
+ let stderr = "";
243
+ player.stderr?.on("data", (d: Buffer) => { stderr += d.toString(); });
244
+
245
+ player.on("close", (code: number) => {
246
+ const ms = Math.round(performance.now() - start);
247
+ if (code !== 0) log(`ffplay error (code ${code}): ${stderr.slice(0, 100)}`);
248
+ else log(`played "${phrase}" in ${ms}ms`);
249
+ resolve(ms);
250
+ });
251
+
252
+ player.on("error", (err: Error) => {
253
+ log(`ffplay spawn error: ${err.message}`);
254
+ reject(err);
255
+ });
256
+ });
257
+ }
258
+
259
+ /** Play a random ack phrase from cache. */
260
+ function playAck(): Promise<number> {
261
+ const phrase = ACK_PHRASES[Math.floor(Math.random() * ACK_PHRASES.length)];
262
+ return playCached(phrase);
263
+ }
264
+
265
+ /** Play the right confirmation for an action. */
266
+ function playConfirm(intent: string): Promise<number> {
267
+ const map: Record<string, string> = {
268
+ tile_window: "Tiled.",
269
+ focus: "Focused.",
270
+ distribute: "Distributed.",
271
+ search: "Searching.",
272
+ switch_layer: "Switched.",
273
+ create_layer: "Done.",
274
+ };
275
+ return playCached(map[intent] ?? "Done.");
276
+ }
277
+
278
+ // ── Fast path: local intent matching (no LLM needed) ──────────────
279
+
280
+ interface FastMatch {
281
+ actions: Array<{ intent: string; slots: Record<string, string> }>;
282
+ confirm: string; // which confirmation to play
283
+ }
284
+
285
+ function tryFastMatch(transcript: string, snapshot: any): FastMatch | null {
286
+ const t = transcript.toLowerCase().trim();
287
+ const activeApps = (snapshot.activeStage ?? []).map((w: any) => ({
288
+ app: w.app as string,
289
+ wid: w.wid as number,
290
+ }));
291
+
292
+ // Tile patterns
293
+ const tileMatch = t.match(
294
+ /(?:tile|snap|put|move)\s+(\w+)\s+(?:to\s+)?(?:the\s+)?(left|right|top|bottom|maximize|center|top.?left|top.?right|bottom.?left|bottom.?right|left.?third|center.?third|right.?third)/
295
+ );
296
+ if (tileMatch) {
297
+ const app = tileMatch[1];
298
+ const pos = tileMatch[2].replace(/\s+/g, "-");
299
+ return {
300
+ actions: [{ intent: "tile_window", slots: { app, position: pos } }],
301
+ confirm: "tile_window",
302
+ };
303
+ }
304
+
305
+ // Split screen: "split X and Y" or "X left Y right"
306
+ const splitMatch = t.match(/split\s+(\w+)\s+(?:and|&)\s+(\w+)/);
307
+ if (splitMatch) {
308
+ return {
309
+ actions: [
310
+ { intent: "tile_window", slots: { app: splitMatch[1], position: "left" } },
311
+ { intent: "tile_window", slots: { app: splitMatch[2], position: "right" } },
312
+ ],
313
+ confirm: "tile_window",
314
+ };
315
+ }
316
+
317
+ // Focus: "focus X" / "focus on X" / "switch to X" / "go to X"
318
+ const focusMatch = t.match(/(?:focus(?:\s+on)?|switch\s+to|go\s+to|show)\s+(?:the\s+)?(?:on\s+)?(\w+)/);
319
+ if (focusMatch && !t.includes("tile") && !t.includes("split")) {
320
+ const app = focusMatch[1];
321
+ if (app && app !== "on" && app !== "the") {
322
+ return {
323
+ actions: [{ intent: "focus", slots: { app } }],
324
+ confirm: "focus",
325
+ };
326
+ }
327
+ }
328
+
329
+ // Maximize: "maximize" / "full screen" / "make it big"
330
+ if (/maximize|full\s*screen|make\s+it\s+big/.test(t)) {
331
+ return {
332
+ actions: [{ intent: "tile_window", slots: { position: "maximize" } }],
333
+ confirm: "tile_window",
334
+ };
335
+ }
336
+
337
+ // Distribute: "grid" / "mosaic" / "distribute" / "even"
338
+ if (/grid|mosaic|distribute|even\s+(?:out|grid)|arrange/.test(t)) {
339
+ return {
340
+ actions: [{ intent: "distribute", slots: {} }],
341
+ confirm: "distribute",
342
+ };
343
+ }
344
+
345
+ // Corners: "quadrants" / "four corners"
346
+ if (/quadrants?|four\s+corners?|corners/.test(t) && activeApps.length >= 4) {
347
+ const positions = ["top-left", "top-right", "bottom-left", "bottom-right"];
348
+ return {
349
+ actions: activeApps.slice(0, 4).map((a: any, i: number) => ({
350
+ intent: "tile_window",
351
+ slots: { app: a.app, position: positions[i] },
352
+ })),
353
+ confirm: "tile_window",
354
+ };
355
+ }
356
+
357
+ // Thirds: "thirds"
358
+ if (/thirds/.test(t) && activeApps.length >= 3) {
359
+ const positions = ["left-third", "center-third", "right-third"];
360
+ return {
361
+ actions: activeApps.slice(0, 3).map((a: any, i: number) => ({
362
+ intent: "tile_window",
363
+ slots: { app: a.app, position: positions[i] },
364
+ })),
365
+ confirm: "tile_window",
366
+ };
367
+ }
368
+
369
+ return null; // No fast match — fall through to LLM
370
+ }
371
+
372
+ // Warm up cache on startup
373
+ ensureVoiceCache().then(() => log("voice cache ready"));
374
+
375
+ log("worker started, streaming TTS ready");
376
+
377
+ // ── Load system prompt once ────────────────────────────────────────
378
+
379
+ const promptDir = join(dirname(import.meta.dir), "docs", "prompts");
380
+ let systemPrompt: string;
381
+ try {
382
+ systemPrompt = readFileSync(join(promptDir, "hands-off-system.md"), "utf-8")
383
+ .split("\n")
384
+ .filter((l) => !l.startsWith("# "))
385
+ .join("\n")
386
+ .trim();
387
+ } catch {
388
+ systemPrompt = "You are a workspace assistant. Respond with JSON: {actions, spoken}.";
389
+ }
390
+
391
+ const intentCatalog = `
392
+ tile_window: Tile a window to a screen position
393
+ Slots:
394
+ position (required): Named position or grid:CxR:C,R syntax.
395
+ Halves: left, right, top, bottom
396
+ Quarters (2x2): top-left, top-right, bottom-left, bottom-right
397
+ Thirds (3x1): left-third, center-third, right-third
398
+ Sixths (3x2): top-left-third, top-center-third, top-right-third, bottom-left-third, bottom-center-third, bottom-right-third
399
+ Fourths (4x1): first-fourth, second-fourth, third-fourth, last-fourth
400
+ Eighths (4x2): top-first-fourth, top-second-fourth, top-third-fourth, top-last-fourth, bottom-first-fourth, bottom-second-fourth, bottom-third-fourth, bottom-last-fourth
401
+ Special: maximize (full screen), center (centered floating)
402
+ Grid syntax: grid:CxR:C,R (e.g. grid:5x3:2,1 = center cell of 5x3 grid)
403
+ app (optional): Target app name — match loosely (e.g. "chrome" matches "Google Chrome")
404
+ wid (optional): Target window ID (from snapshot)
405
+ session (optional): Tmux session name
406
+ If no app/wid/session given, tiles the frontmost window.
407
+ "quarter" = 2x2 cell (top-left etc.), NOT a 4x1 fourth.
408
+ "top quarter" = top-left or top-right (2x2). "top third" = top-left-third (3x2).
409
+
410
+ focus: Focus a window, app, or session
411
+ Slots: app, session, or wid (at least one)
412
+
413
+ distribute: Arrange all visible windows in an even grid. No slots.
414
+
415
+ search: Search windows by text
416
+ Slots: query (required)
417
+
418
+ list_windows: List all visible windows. No slots.
419
+
420
+ switch_layer: Switch to a workspace layer
421
+ Slots: layer (required) — name or index
422
+
423
+ create_layer: Save current arrangement as a named layer
424
+ Slots: name (required)
425
+
426
+ TILING PRESETS (use multiple tile_window actions):
427
+ "split screen" → left + right
428
+ "thirds" → left-third, center-third, right-third
429
+ "mosaic"/"grid" → use distribute
430
+ "corners"/"quadrants" → top-left, top-right, bottom-left, bottom-right
431
+ "stack" → top + bottom
432
+ "six-up"/"3 by 2" → 3x2 grid using the sixth positions
433
+ "eight-up"/"4 by 2" → 4x2 grid using the eighth positions
434
+ `;
435
+
436
+ systemPrompt = systemPrompt.replace("{{intent_catalog}}", intentCatalog);
437
+ log("system prompt loaded");
438
+
439
+ // ── Auto-restart on file changes ───────────────────────────────────
440
+
441
+ const watchFiles = [
442
+ join(promptDir, "hands-off-system.md"),
443
+ import.meta.path, // this script itself
444
+ ];
445
+
446
+ for (const f of watchFiles) {
447
+ try {
448
+ const { watch } = await import("fs");
449
+ let debounce: ReturnType<typeof setTimeout> | null = null;
450
+ watch(f, () => {
451
+ if (debounce) return;
452
+ debounce = setTimeout(() => {
453
+ log(`file changed: ${f.split("/").pop()} — exiting for restart`);
454
+ process.exit(0); // Swift auto-restarts in 2s
455
+ }, 500);
456
+ });
457
+ log(`watching: ${f.split("/").pop()}`);
458
+ } catch {}
459
+ }
460
+
461
+ // ── Build context message from snapshot ─────────────────────────────
462
+
463
+ function buildContextMessage(transcript: string, snap: any): string {
464
+ let msg = `USER: "${transcript}"\n\n`;
465
+ msg += "--- DESKTOP SNAPSHOT ---\n";
466
+
467
+ // Screens
468
+ const screens = snap.screens ?? [];
469
+ if (screens.length > 1) {
470
+ msg += `Displays: ${screens.map((s: any) => `${s.width}x${s.height}${s.isMain ? " (main)" : ""}`).join(", ")}\n`;
471
+ } else if (screens.length === 1) {
472
+ msg += `Screen: ${screens[0].width}x${screens[0].height}\n`;
473
+ }
474
+
475
+ // Stage Manager
476
+ if (snap.stageManager) {
477
+ msg += `Stage Manager: ON (grouping: ${snap.smGrouping ?? "all-at-once"})\n`;
478
+ }
479
+
480
+ // All windows — full inventory, ordered front-to-back (zIndex 0 = frontmost)
481
+ const windows = snap.windows ?? snap.activeStage ?? [];
482
+ const onScreen = windows.filter((w: any) => w.onScreen !== false);
483
+ const offScreen = windows.filter((w: any) => w.onScreen === false);
484
+
485
+ msg += `\nVisible windows (${onScreen.length}, front-to-back order):\n`;
486
+ for (const w of onScreen) {
487
+ const flags: string[] = [];
488
+ if (w.zIndex === 0) flags.push("FRONTMOST");
489
+ if (w.session) flags.push(`session:${w.session}`);
490
+ const flagStr = flags.length ? ` [${flags.join(", ")}]` : "";
491
+ msg += ` wid:${w.wid} ${w.app}: "${w.title}" — ${w.frame}${flagStr}\n`;
492
+ }
493
+
494
+ if (offScreen.length > 0) {
495
+ // Summarize hidden windows by app instead of listing all
496
+ const hiddenByApp: Record<string, number> = {};
497
+ for (const w of offScreen) {
498
+ const app = w.app;
499
+ hiddenByApp[app] = (hiddenByApp[app] || 0) + 1;
500
+ }
501
+ const summary = Object.entries(hiddenByApp)
502
+ .filter(([app]) => !["WindowManager", "Spotlight", "CursorUIViewService", "AutoFill", "coreautha", "loginwindow", "Open and Save Panel Service"].includes(app))
503
+ .map(([app, count]) => `${app}(${count})`)
504
+ .join(", ");
505
+ if (summary) {
506
+ msg += `\nHidden windows: ${summary}\n`;
507
+ }
508
+ }
509
+
510
+ // Terminals — cwd, running commands, claude, tmux
511
+ const terminals = snap.terminals ?? [];
512
+ if (terminals.length > 0) {
513
+ msg += `\nTerminal tabs (${terminals.length}):\n`;
514
+ for (const t of terminals) {
515
+ const flags: string[] = [];
516
+ if (t.hasClaude) flags.push("Claude Code");
517
+ if (t.tmuxSession) flags.push(`tmux:${t.tmuxSession}`);
518
+ if (!t.isActiveTab) flags.push("background tab");
519
+ const flagStr = flags.length ? ` [${flags.join(", ")}]` : "";
520
+ const cwd = t.cwd ? ` cwd:${t.cwd.replace(/^\/Users\/\w+\//, "~/")}` : "";
521
+ const cmds = (t.runningCommands ?? []).map((c: any) => c.command).join(", ");
522
+ const cmdStr = cmds ? ` running:${cmds}` : "";
523
+ msg += ` ${t.displayName}${cwd}${cmdStr}${flagStr}`;
524
+ if (t.windowId) msg += ` (wid:${t.windowId})`;
525
+ msg += "\n";
526
+ }
527
+ }
528
+
529
+ // Tmux sessions
530
+ const tmux = snap.tmuxSessions ?? [];
531
+ if (tmux.length > 0) {
532
+ msg += `\nTmux sessions: ${tmux.map((s: any) => `${s.name} (${s.windows} windows${s.attached ? ", attached" : ""})`).join(", ")}\n`;
533
+ }
534
+
535
+ // Layer
536
+ if (snap.currentLayer) {
537
+ msg += `\nCurrent layer: ${snap.currentLayer.name} (index: ${snap.currentLayer.index})\n`;
538
+ }
539
+
540
+ msg += "--- END SNAPSHOT ---\n";
541
+ return msg;
542
+ }
543
+
544
+ // ── Command loop ───────────────────────────────────────────────────
545
+
546
+ const decoder = new TextDecoder();
547
+ const reader = Bun.stdin.stream().getReader();
548
+ let buffer = "";
549
+
550
+ async function processLine(line: string) {
551
+ const trimmed = line.trim();
552
+ if (!trimmed) return;
553
+
554
+ let cmd: any;
555
+ try {
556
+ cmd = JSON.parse(trimmed);
557
+ } catch {
558
+ respond({ ok: false, error: "invalid JSON" });
559
+ return;
560
+ }
561
+
562
+ switch (cmd.cmd) {
563
+ case "ping":
564
+ respond({ ok: true, data: { pong: true } });
565
+ break;
566
+
567
+ case "speak":
568
+ try {
569
+ const ms = await streamSpeak(cmd.text);
570
+ log(`spoke "${cmd.text.slice(0, 40)}" in ${ms}ms`);
571
+ respond({ ok: true, data: { durationMs: ms } });
572
+ } catch (err: any) {
573
+ log(`TTS error: ${err.message}`);
574
+ respond({ ok: false, error: err.message });
575
+ }
576
+ break;
577
+
578
+ case "ack":
579
+ // Fire and forget — respond immediately, speak in background
580
+ respond({ ok: true, data: { queued: true } });
581
+ streamSpeak(cmd.text).catch((e) => log(`ack TTS error: ${e.message}`));
582
+ break;
583
+
584
+ case "play_cached":
585
+ respond({ ok: true, data: { queued: true, cached: true } });
586
+ playCached(cmd.text).catch((e) => log(`play_cached error: ${e.message}`));
587
+ break;
588
+
589
+ case "infer":
590
+ try {
591
+ const userMessage = buildContextMessage(cmd.transcript, cmd.snapshot ?? {});
592
+
593
+ const messages = (cmd.history ?? []).map((h: any) => ({
594
+ role: h.role as "user" | "assistant",
595
+ content: h.content,
596
+ }));
597
+
598
+ const { data, raw } = await inferSmart(userMessage, {
599
+ provider: "xai",
600
+ model: "grok-4.20-beta-0309-non-reasoning",
601
+ system: systemPrompt,
602
+ messages,
603
+ temperature: 0.2,
604
+ maxTokens: 512,
605
+ tag: "hands-off",
606
+ });
607
+
608
+ respond({
609
+ ok: true,
610
+ data: {
611
+ ...data,
612
+ _meta: {
613
+ provider: raw.provider,
614
+ model: raw.model,
615
+ durationMs: raw.durationMs,
616
+ tokens: raw.usage?.totalTokens,
617
+ },
618
+ },
619
+ });
620
+ } catch (err: any) {
621
+ respond({
622
+ ok: false,
623
+ error: err.message,
624
+ data: {
625
+ actions: [],
626
+ spoken: "Sorry, I had trouble processing that.",
627
+ },
628
+ });
629
+ }
630
+ break;
631
+
632
+ case "turn": {
633
+ // Full orchestrated turn — parallel where possible.
634
+ //
635
+ // Timeline:
636
+ // t=0 ──┬── ack TTS (fire & forget)
637
+ // └── Groq inference
638
+ // t=~600ms ─┬── narrate TTS (what we're doing)
639
+ // └── execute actions (in parallel with narrate)
640
+ // t=done ── respond with results
641
+ //
642
+ const turnStart = performance.now();
643
+ const transcript = cmd.transcript;
644
+ const snap = cmd.snapshot ?? {};
645
+ const history = cmd.history ?? [];
646
+
647
+ log(`⏱ turn start: "${transcript.slice(0, 50)}"`);
648
+
649
+ // Fire cached ack sound + inference in PARALLEL
650
+ const ackPromise = playAck().catch((e) => log(`ack error: ${e.message}`));
651
+
652
+ // Build full context message from snapshot
653
+ const userMessage = buildContextMessage(transcript, snap);
654
+
655
+ const messages = history.map((h: any) => ({
656
+ role: h.role as "user" | "assistant",
657
+ content: typeof h.content === "string" ? h.content : JSON.stringify(h.content),
658
+ })).filter((m: any) => m.content && m.content.length > 0);
659
+
660
+ let inferResult: any = null;
661
+ try {
662
+ const { data, raw } = await inferSmart(userMessage, {
663
+ provider: "xai",
664
+ model: "grok-4.20-beta-0309-non-reasoning",
665
+ system: systemPrompt,
666
+ messages,
667
+ temperature: 0.2,
668
+ maxTokens: 512,
669
+ tag: "hands-off",
670
+ });
671
+ inferResult = { ...data, _meta: { provider: raw.provider, model: raw.model, durationMs: raw.durationMs, tokens: raw.usage?.totalTokens } };
672
+ log(`⏱ inference done in ${raw.durationMs}ms`);
673
+ } catch (err: any) {
674
+ log(`⏱ inference error: ${err.message}`);
675
+ inferResult = { actions: [], spoken: "Sorry, I had trouble with that.", _meta: { error: err.message } };
676
+ }
677
+
678
+ // Wait for ack to finish before narrating (don't overlap speech)
679
+ await ackPromise;
680
+
681
+ // Step 2: Narrate + execute in PARALLEL
682
+ const hasActions = Array.isArray(inferResult.actions) && inferResult.actions.length > 0;
683
+ const spokenText = inferResult.spoken;
684
+
685
+ if (hasActions && spokenText) {
686
+ // SPEAK FIRST — user must hear what's about to happen before windows move
687
+ log(`⏱ narrating: "${spokenText.slice(0, 50)}"`);
688
+ await streamSpeak(spokenText).catch((e) => log(`narrate error: ${e.message}`));
689
+
690
+ // NOW respond with actions — Swift executes after user heard the plan
691
+ const turnMs = Math.round(performance.now() - turnStart);
692
+ log(`⏱ turn response at ${turnMs}ms — actions sent after narration`);
693
+ respond({ ok: true, data: inferResult, turnMs });
694
+
695
+ // Confirm
696
+ await playCached("Done.").catch(() => {});
697
+ } else if (spokenText) {
698
+ // Conversation only — speak and respond
699
+ await streamSpeak(spokenText).catch((e) => log(`speak error: ${e.message}`));
700
+ const turnMs = Math.round(performance.now() - turnStart);
701
+ respond({ ok: true, data: inferResult, turnMs });
702
+ } else {
703
+ const turnMs = Math.round(performance.now() - turnStart);
704
+ respond({ ok: true, data: inferResult, turnMs });
705
+ }
706
+
707
+ const totalMs = Math.round(performance.now() - turnStart);
708
+ log(`⏱ turn complete: ${totalMs}ms total`);
709
+ break;
710
+ }
711
+
712
+ default:
713
+ respond({ ok: false, error: `unknown command: ${cmd.cmd}` });
714
+ }
715
+ }
716
+
717
+ // Read stdin line by line
718
+ (async () => {
719
+ while (true) {
720
+ const { done, value } = await reader.read();
721
+ if (done) break;
722
+
723
+ buffer += decoder.decode(value, { stream: true });
724
+ const lines = buffer.split("\n");
725
+ buffer = lines.pop() ?? "";
726
+
727
+ for (const line of lines) {
728
+ await processLine(line);
729
+ }
730
+ }
731
+ })();
732
+
733
+ function respond(obj: any) {
734
+ console.log(JSON.stringify(obj));
735
+ }
736
+
737
+ function log(msg: string) {
738
+ const ts = new Date().toISOString().slice(11, 23);
739
+ console.error(`[${ts}] handsoff-worker: ${msg}`);
740
+ }