@lattices/cli 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +85 -9
- package/app/Package.swift +8 -1
- package/app/Sources/AdvisorLearningStore.swift +90 -0
- package/app/Sources/AgentSession.swift +377 -0
- package/app/Sources/AppDelegate.swift +44 -12
- package/app/Sources/AppShellView.swift +81 -8
- package/app/Sources/AudioProvider.swift +386 -0
- package/app/Sources/CheatSheetHUD.swift +261 -19
- package/app/Sources/DaemonProtocol.swift +13 -0
- package/app/Sources/DaemonServer.swift +8 -0
- package/app/Sources/DesktopModel.swift +164 -5
- package/app/Sources/DesktopModelTypes.swift +2 -0
- package/app/Sources/DiagnosticLog.swift +104 -2
- package/app/Sources/EventBus.swift +1 -0
- package/app/Sources/HUDBottomBar.swift +279 -0
- package/app/Sources/HUDController.swift +1158 -0
- package/app/Sources/HUDLeftBar.swift +849 -0
- package/app/Sources/HUDMinimap.swift +179 -0
- package/app/Sources/HUDRightBar.swift +774 -0
- package/app/Sources/HUDState.swift +367 -0
- package/app/Sources/HUDTopBar.swift +243 -0
- package/app/Sources/HandsOffSession.swift +733 -0
- package/app/Sources/HomeDashboardView.swift +125 -0
- package/app/Sources/HotkeyManager.swift +2 -0
- package/app/Sources/HotkeyStore.swift +45 -9
- package/app/Sources/IntentEngine.swift +925 -0
- package/app/Sources/Intents/CreateLayerIntent.swift +54 -0
- package/app/Sources/Intents/DistributeIntent.swift +56 -0
- package/app/Sources/Intents/FocusIntent.swift +69 -0
- package/app/Sources/Intents/HelpIntent.swift +41 -0
- package/app/Sources/Intents/KillIntent.swift +47 -0
- package/app/Sources/Intents/LatticeIntent.swift +78 -0
- package/app/Sources/Intents/LaunchIntent.swift +67 -0
- package/app/Sources/Intents/ListSessionsIntent.swift +32 -0
- package/app/Sources/Intents/ListWindowsIntent.swift +30 -0
- package/app/Sources/Intents/ScanIntent.swift +52 -0
- package/app/Sources/Intents/SearchIntent.swift +190 -0
- package/app/Sources/Intents/SwitchLayerIntent.swift +50 -0
- package/app/Sources/Intents/TileIntent.swift +61 -0
- package/app/Sources/LatticesApi.swift +1235 -30
- package/app/Sources/LauncherHUD.swift +348 -0
- package/app/Sources/MainView.swift +147 -44
- package/app/Sources/OcrModel.swift +34 -1
- package/app/Sources/OmniSearchState.swift +99 -102
- package/app/Sources/OnboardingView.swift +457 -0
- package/app/Sources/PermissionChecker.swift +2 -12
- package/app/Sources/PiChatDock.swift +454 -0
- package/app/Sources/PiChatSession.swift +815 -0
- package/app/Sources/PiWorkspaceView.swift +364 -0
- package/app/Sources/PlacementSpec.swift +195 -0
- package/app/Sources/Preferences.swift +59 -0
- package/app/Sources/ProjectScanner.swift +1 -1
- package/app/Sources/ScreenMapState.swift +701 -55
- package/app/Sources/ScreenMapView.swift +843 -103
- package/app/Sources/ScreenMapWindowController.swift +22 -0
- package/app/Sources/SessionLayerStore.swift +285 -0
- package/app/Sources/SessionManager.swift +4 -1
- package/app/Sources/SettingsView.swift +186 -3
- package/app/Sources/Theme.swift +9 -8
- package/app/Sources/TmuxModel.swift +7 -0
- package/app/Sources/TmuxQuery.swift +27 -3
- package/app/Sources/VoiceChatView.swift +192 -0
- package/app/Sources/VoiceCommandWindow.swift +1594 -0
- package/app/Sources/VoiceIntentResolver.swift +671 -0
- package/app/Sources/VoxClient.swift +454 -0
- package/app/Sources/WindowTiler.swift +348 -87
- package/app/Sources/WorkspaceManager.swift +127 -18
- package/bin/client.ts +16 -0
- package/bin/{daemon-client.js → daemon-client.ts} +49 -30
- package/bin/handsoff-infer.ts +280 -0
- package/bin/handsoff-worker.ts +731 -0
- package/bin/{lattices-app.js → lattices-app.ts} +67 -32
- package/bin/lattices-dev +160 -0
- package/bin/{lattices.js → lattices.ts} +600 -137
- package/bin/project-twin.ts +645 -0
- package/docs/agent-execution-plan.md +562 -0
- package/docs/agents.md +142 -0
- package/docs/api.md +153 -34
- package/docs/app.md +29 -1
- package/docs/config.md +5 -1
- package/docs/handsoff-test-scenarios.md +84 -0
- package/docs/layers.md +20 -20
- package/docs/ocr.md +14 -5
- package/docs/overview.md +5 -1
- package/docs/presentation-execution-review.md +491 -0
- package/docs/prompts/hands-off-system.md +374 -0
- package/docs/prompts/hands-off-turn.md +30 -0
- package/docs/prompts/voice-advisor.md +31 -0
- package/docs/prompts/voice-fallback.md +23 -0
- package/docs/tiling-reference.md +167 -0
- package/docs/twins.md +138 -0
- package/docs/voice-command-protocol.md +278 -0
- package/docs/voice.md +219 -0
- package/package.json +21 -10
- package/bin/client.js +0 -4
|
@@ -0,0 +1,731 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Hands-off worker — long-running process that handles both inference and TTS.
|
|
4
|
+
*
|
|
5
|
+
* Reads newline-delimited JSON commands from stdin, writes JSON responses to stdout.
|
|
6
|
+
* Keeps SpeakEasy and inference warm — no cold starts.
|
|
7
|
+
*
|
|
8
|
+
* Commands:
|
|
9
|
+
* {"cmd":"infer","transcript":"...","snapshot":{...},"history":[...]}
|
|
10
|
+
* {"cmd":"speak","text":"..."}
|
|
11
|
+
* {"cmd":"ack","text":"..."} (speak + don't wait for completion)
|
|
12
|
+
* {"cmd":"ping"}
|
|
13
|
+
*
|
|
14
|
+
* Responses:
|
|
15
|
+
* {"ok":true,"data":{...}}
|
|
16
|
+
* {"ok":false,"error":"..."}
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { infer, inferJSON } from "../lib/infer.ts";
|
|
20
|
+
|
|
21
|
+
/** Call infer and parse JSON if possible, otherwise treat as spoken-only response */
|
|
22
|
+
async function inferSmart(prompt: string, options: any): Promise<{ data: any; raw: any }> {
|
|
23
|
+
const raw = await infer(prompt, options);
|
|
24
|
+
|
|
25
|
+
// Try to parse as JSON
|
|
26
|
+
let cleaned = raw.text
|
|
27
|
+
.replace(/```json\s*/g, "")
|
|
28
|
+
.replace(/```\s*/g, "")
|
|
29
|
+
.trim();
|
|
30
|
+
|
|
31
|
+
const start = cleaned.indexOf("{");
|
|
32
|
+
const end = cleaned.lastIndexOf("}");
|
|
33
|
+
|
|
34
|
+
if (start !== -1 && end !== -1) {
|
|
35
|
+
try {
|
|
36
|
+
const data = JSON.parse(cleaned.slice(start, end + 1));
|
|
37
|
+
return { data, raw };
|
|
38
|
+
} catch {}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Not JSON — treat as conversational response (spoken-only, no actions)
|
|
42
|
+
log(`response was plain text, wrapping as spoken: "${raw.text.slice(0, 80)}"`);
|
|
43
|
+
return {
|
|
44
|
+
data: { actions: [], spoken: raw.text },
|
|
45
|
+
raw,
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
import { readFileSync } from "fs";
|
|
49
|
+
import { join, dirname } from "path";
|
|
50
|
+
import { spawn } from "child_process";
|
|
51
|
+
|
|
52
|
+
// ── Streaming TTS via OpenAI API → ffplay ──────────────────────────

const OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";
// Resolved once at startup. NOTE: loadTTSConfig is a hoisted function
// declaration (defined just below), so calling it here is safe.
const ttsConfig = loadTTSConfig();
|
|
56
|
+
|
|
57
|
+
function loadTTSConfig() {
|
|
58
|
+
// Load API key from speakeasy config or env
|
|
59
|
+
let apiKey = process.env.OPENAI_API_KEY || "";
|
|
60
|
+
let voice = "nova";
|
|
61
|
+
|
|
62
|
+
try {
|
|
63
|
+
const cfg = JSON.parse(
|
|
64
|
+
readFileSync(join(process.env.HOME || "", ".config/speakeasy/settings.json"), "utf-8")
|
|
65
|
+
);
|
|
66
|
+
if (!apiKey && cfg.providers?.openai?.apiKey) apiKey = cfg.providers.openai.apiKey;
|
|
67
|
+
if (cfg.providers?.openai?.voice) voice = cfg.providers.openai.voice;
|
|
68
|
+
} catch {}
|
|
69
|
+
|
|
70
|
+
return { apiKey, voice };
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/** Stream TTS: fetch audio from OpenAI and pipe directly to ffplay. Playback starts immediately. */
|
|
74
|
+
async function streamSpeak(text: string): Promise<number> {
|
|
75
|
+
const start = performance.now();
|
|
76
|
+
|
|
77
|
+
const res = await fetch(OPENAI_TTS_URL, {
|
|
78
|
+
method: "POST",
|
|
79
|
+
headers: {
|
|
80
|
+
"Authorization": `Bearer ${ttsConfig.apiKey}`,
|
|
81
|
+
"Content-Type": "application/json",
|
|
82
|
+
},
|
|
83
|
+
body: JSON.stringify({
|
|
84
|
+
model: "tts-1",
|
|
85
|
+
voice: ttsConfig.voice,
|
|
86
|
+
input: text,
|
|
87
|
+
response_format: "pcm",
|
|
88
|
+
speed: 1.1,
|
|
89
|
+
}),
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
if (!res.ok) {
|
|
93
|
+
throw new Error(`OpenAI TTS error: ${res.status} ${res.statusText}`);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
const ttfb = Math.round(performance.now() - start);
|
|
97
|
+
log(`TTS first byte in ${ttfb}ms`);
|
|
98
|
+
|
|
99
|
+
// Pipe response body directly to ffplay — playback starts as chunks arrive
|
|
100
|
+
return new Promise((resolve, reject) => {
|
|
101
|
+
const player = spawn("ffplay", [
|
|
102
|
+
"-nodisp", // no video window
|
|
103
|
+
"-autoexit", // quit when done
|
|
104
|
+
"-loglevel", "quiet",
|
|
105
|
+
"-f", "s16le", // PCM signed 16-bit little-endian
|
|
106
|
+
"-ar", "24000", // OpenAI TTS outputs 24kHz
|
|
107
|
+
"-ch_layout", "mono",
|
|
108
|
+
"-", // read from stdin
|
|
109
|
+
], { stdio: ["pipe", "ignore", "ignore"] });
|
|
110
|
+
|
|
111
|
+
const reader = res.body?.getReader();
|
|
112
|
+
if (!reader) {
|
|
113
|
+
reject(new Error("No response body"));
|
|
114
|
+
return;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// Pump chunks from fetch → ffplay stdin
|
|
118
|
+
(async () => {
|
|
119
|
+
while (true) {
|
|
120
|
+
const { done, value } = await reader.read();
|
|
121
|
+
if (done) break;
|
|
122
|
+
player.stdin.write(value);
|
|
123
|
+
}
|
|
124
|
+
player.stdin.end();
|
|
125
|
+
})().catch(reject);
|
|
126
|
+
|
|
127
|
+
player.on("close", () => {
|
|
128
|
+
const ms = Math.round(performance.now() - start);
|
|
129
|
+
resolve(ms);
|
|
130
|
+
});
|
|
131
|
+
|
|
132
|
+
player.on("error", reject);
|
|
133
|
+
});
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// ── Pre-cached ack sounds (no API call needed) ────────────────────

// Ack phrases — played immediately when user stops talking
const ACK_PHRASES = [
  "Got it.",
  "Heard you.",
  "On it.",
  "Yep.",
  "Cool.",
  "Sure.",
  "Okay.",
  "One sec.",
];

// Confirmation phrases — played after executing known actions
const CONFIRM_PHRASES = [
  "Tiled.",
  "Focused.",
  "Done.",
  "Maximized.",
  "Split.",
  "Switched.",
  "Distributed.",
  "Restored.",
  "Searching.",
];

// On-disk PCM cache for the phrases above, populated by ensureVoiceCache().
const ackCacheDir = join(process.env.HOME || "", ".lattices", "tts-cache");
const ackCache = new Map<string, string>(); // phrase → file path
|
|
165
|
+
|
|
166
|
+
async function ensureVoiceCache() {
|
|
167
|
+
const { mkdirSync, existsSync, writeFileSync } = await import("fs");
|
|
168
|
+
mkdirSync(ackCacheDir, { recursive: true });
|
|
169
|
+
|
|
170
|
+
const allPhrases = [...ACK_PHRASES, ...CONFIRM_PHRASES];
|
|
171
|
+
let cached = 0;
|
|
172
|
+
let generated = 0;
|
|
173
|
+
|
|
174
|
+
for (const phrase of allPhrases) {
|
|
175
|
+
const safeName = phrase.replace(/[^a-z]/gi, "_").toLowerCase();
|
|
176
|
+
const filePath = join(ackCacheDir, `voice_${safeName}.pcm`);
|
|
177
|
+
|
|
178
|
+
if (existsSync(filePath)) {
|
|
179
|
+
ackCache.set(phrase, filePath);
|
|
180
|
+
cached++;
|
|
181
|
+
continue;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// Generate and cache
|
|
185
|
+
try {
|
|
186
|
+
const res = await fetch(OPENAI_TTS_URL, {
|
|
187
|
+
method: "POST",
|
|
188
|
+
headers: {
|
|
189
|
+
"Authorization": `Bearer ${ttsConfig.apiKey}`,
|
|
190
|
+
"Content-Type": "application/json",
|
|
191
|
+
},
|
|
192
|
+
body: JSON.stringify({
|
|
193
|
+
model: "tts-1",
|
|
194
|
+
voice: ttsConfig.voice,
|
|
195
|
+
input: phrase,
|
|
196
|
+
response_format: "pcm",
|
|
197
|
+
speed: 1.1,
|
|
198
|
+
}),
|
|
199
|
+
});
|
|
200
|
+
|
|
201
|
+
if (res.ok) {
|
|
202
|
+
const buf = Buffer.from(await res.arrayBuffer());
|
|
203
|
+
writeFileSync(filePath, buf);
|
|
204
|
+
ackCache.set(phrase, filePath);
|
|
205
|
+
generated++;
|
|
206
|
+
log(`cached: "${phrase}"`);
|
|
207
|
+
}
|
|
208
|
+
} catch (e: any) {
|
|
209
|
+
log(`cache failed for "${phrase}": ${e.message}`);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
log(`voice cache: ${cached} hit, ${generated} generated, ${allPhrases.length} total`);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
/** Play a pre-cached audio file. Near-instant — no API call. */
|
|
216
|
+
async function playCached(phrase: string): Promise<number> {
|
|
217
|
+
const start = performance.now();
|
|
218
|
+
const filePath = ackCache.get(phrase);
|
|
219
|
+
|
|
220
|
+
if (!filePath) {
|
|
221
|
+
log(`playCached: cache miss for "${phrase}", falling back to TTS`);
|
|
222
|
+
return streamSpeak(phrase);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
log(`playing cached: "${phrase}"`);
|
|
226
|
+
return new Promise((resolve, reject) => {
|
|
227
|
+
const player = spawn("ffplay", [
|
|
228
|
+
"-nodisp", "-autoexit", "-loglevel", "quiet",
|
|
229
|
+
"-f", "s16le", "-ar", "24000", "-ch_layout", "mono",
|
|
230
|
+
filePath,
|
|
231
|
+
], { stdio: ["ignore", "ignore", "pipe"] });
|
|
232
|
+
|
|
233
|
+
let stderr = "";
|
|
234
|
+
player.stderr?.on("data", (d: Buffer) => { stderr += d.toString(); });
|
|
235
|
+
|
|
236
|
+
player.on("close", (code: number) => {
|
|
237
|
+
const ms = Math.round(performance.now() - start);
|
|
238
|
+
if (code !== 0) log(`ffplay error (code ${code}): ${stderr.slice(0, 100)}`);
|
|
239
|
+
else log(`played "${phrase}" in ${ms}ms`);
|
|
240
|
+
resolve(ms);
|
|
241
|
+
});
|
|
242
|
+
|
|
243
|
+
player.on("error", (err: Error) => {
|
|
244
|
+
log(`ffplay spawn error: ${err.message}`);
|
|
245
|
+
reject(err);
|
|
246
|
+
});
|
|
247
|
+
});
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/** Play a random ack phrase from cache. */
|
|
251
|
+
function playAck(): Promise<number> {
|
|
252
|
+
const phrase = ACK_PHRASES[Math.floor(Math.random() * ACK_PHRASES.length)];
|
|
253
|
+
return playCached(phrase);
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/** Play the right confirmation for an action. */
|
|
257
|
+
function playConfirm(intent: string): Promise<number> {
|
|
258
|
+
const map: Record<string, string> = {
|
|
259
|
+
tile_window: "Tiled.",
|
|
260
|
+
focus: "Focused.",
|
|
261
|
+
distribute: "Distributed.",
|
|
262
|
+
search: "Searching.",
|
|
263
|
+
switch_layer: "Switched.",
|
|
264
|
+
create_layer: "Done.",
|
|
265
|
+
};
|
|
266
|
+
return playCached(map[intent] ?? "Done.");
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
// ── Fast path: local intent matching (no LLM needed) ──────────────
|
|
270
|
+
|
|
271
|
+
interface FastMatch {
|
|
272
|
+
actions: Array<{ intent: string; slots: Record<string, string> }>;
|
|
273
|
+
confirm: string; // which confirmation to play
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
function tryFastMatch(transcript: string, snapshot: any): FastMatch | null {
|
|
277
|
+
const t = transcript.toLowerCase().trim();
|
|
278
|
+
const activeApps = (snapshot.activeStage ?? []).map((w: any) => ({
|
|
279
|
+
app: w.app as string,
|
|
280
|
+
wid: w.wid as number,
|
|
281
|
+
}));
|
|
282
|
+
|
|
283
|
+
// Tile patterns
|
|
284
|
+
const tileMatch = t.match(
|
|
285
|
+
/(?:tile|snap|put|move)\s+(\w+)\s+(?:to\s+)?(?:the\s+)?(left|right|top|bottom|maximize|center|top.?left|top.?right|bottom.?left|bottom.?right|left.?third|center.?third|right.?third)/
|
|
286
|
+
);
|
|
287
|
+
if (tileMatch) {
|
|
288
|
+
const app = tileMatch[1];
|
|
289
|
+
const pos = tileMatch[2].replace(/\s+/g, "-");
|
|
290
|
+
return {
|
|
291
|
+
actions: [{ intent: "tile_window", slots: { app, position: pos } }],
|
|
292
|
+
confirm: "tile_window",
|
|
293
|
+
};
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
// Split screen: "split X and Y" or "X left Y right"
|
|
297
|
+
const splitMatch = t.match(/split\s+(\w+)\s+(?:and|&)\s+(\w+)/);
|
|
298
|
+
if (splitMatch) {
|
|
299
|
+
return {
|
|
300
|
+
actions: [
|
|
301
|
+
{ intent: "tile_window", slots: { app: splitMatch[1], position: "left" } },
|
|
302
|
+
{ intent: "tile_window", slots: { app: splitMatch[2], position: "right" } },
|
|
303
|
+
],
|
|
304
|
+
confirm: "tile_window",
|
|
305
|
+
};
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// Focus: "focus X" / "focus on X" / "switch to X" / "go to X"
|
|
309
|
+
const focusMatch = t.match(/(?:focus(?:\s+on)?|switch\s+to|go\s+to|show)\s+(?:the\s+)?(?:on\s+)?(\w+)/);
|
|
310
|
+
if (focusMatch && !t.includes("tile") && !t.includes("split")) {
|
|
311
|
+
const app = focusMatch[1];
|
|
312
|
+
if (app && app !== "on" && app !== "the") {
|
|
313
|
+
return {
|
|
314
|
+
actions: [{ intent: "focus", slots: { app } }],
|
|
315
|
+
confirm: "focus",
|
|
316
|
+
};
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
// Maximize: "maximize" / "full screen" / "make it big"
|
|
321
|
+
if (/maximize|full\s*screen|make\s+it\s+big/.test(t)) {
|
|
322
|
+
return {
|
|
323
|
+
actions: [{ intent: "tile_window", slots: { position: "maximize" } }],
|
|
324
|
+
confirm: "tile_window",
|
|
325
|
+
};
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// Distribute: "grid" / "mosaic" / "distribute" / "even"
|
|
329
|
+
if (/grid|mosaic|distribute|even\s+(?:out|grid)|arrange/.test(t)) {
|
|
330
|
+
return {
|
|
331
|
+
actions: [{ intent: "distribute", slots: {} }],
|
|
332
|
+
confirm: "distribute",
|
|
333
|
+
};
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
// Corners: "quadrants" / "four corners"
|
|
337
|
+
if (/quadrants?|four\s+corners?|corners/.test(t) && activeApps.length >= 4) {
|
|
338
|
+
const positions = ["top-left", "top-right", "bottom-left", "bottom-right"];
|
|
339
|
+
return {
|
|
340
|
+
actions: activeApps.slice(0, 4).map((a: any, i: number) => ({
|
|
341
|
+
intent: "tile_window",
|
|
342
|
+
slots: { app: a.app, position: positions[i] },
|
|
343
|
+
})),
|
|
344
|
+
confirm: "tile_window",
|
|
345
|
+
};
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
// Thirds: "thirds"
|
|
349
|
+
if (/thirds/.test(t) && activeApps.length >= 3) {
|
|
350
|
+
const positions = ["left-third", "center-third", "right-third"];
|
|
351
|
+
return {
|
|
352
|
+
actions: activeApps.slice(0, 3).map((a: any, i: number) => ({
|
|
353
|
+
intent: "tile_window",
|
|
354
|
+
slots: { app: a.app, position: positions[i] },
|
|
355
|
+
})),
|
|
356
|
+
confirm: "tile_window",
|
|
357
|
+
};
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
return null; // No fast match — fall through to LLM
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
// Warm up cache on startup — fire-and-forget; the worker serves commands
// while phrases are still being synthesized (playCached falls back to live TTS).
ensureVoiceCache().then(() => log("voice cache ready"));

log("worker started, streaming TTS ready");
|
|
367
|
+
|
|
368
|
+
// ── Load system prompt once ────────────────────────────────────────

// Prompts live in <package root>/docs/prompts, one level up from bin/.
const promptDir = join(dirname(import.meta.dir), "docs", "prompts");
let systemPrompt: string;
try {
  // Strip top-level markdown headings ("# …") — doc structure, not prompt content.
  systemPrompt = readFileSync(join(promptDir, "hands-off-system.md"), "utf-8")
    .split("\n")
    .filter((l) => !l.startsWith("# "))
    .join("\n")
    .trim();
} catch {
  // Minimal fallback if the docs file is missing from the install.
  systemPrompt = "You are a workspace assistant. Respond with JSON: {actions, spoken}.";
}
|
|
381
|
+
|
|
382
|
+
const intentCatalog = `
|
|
383
|
+
tile_window: Tile a window to a screen position
|
|
384
|
+
Slots:
|
|
385
|
+
position (required): Named position or grid:CxR:C,R syntax.
|
|
386
|
+
Halves: left, right, top, bottom
|
|
387
|
+
Quarters (2x2): top-left, top-right, bottom-left, bottom-right
|
|
388
|
+
Thirds (3x1): left-third, center-third, right-third
|
|
389
|
+
Sixths (3x2): top-left-third, top-center-third, top-right-third, bottom-left-third, bottom-center-third, bottom-right-third
|
|
390
|
+
Fourths (4x1): first-fourth, second-fourth, third-fourth, last-fourth
|
|
391
|
+
Eighths (4x2): top-first-fourth, top-second-fourth, top-third-fourth, top-last-fourth, bottom-first-fourth, bottom-second-fourth, bottom-third-fourth, bottom-last-fourth
|
|
392
|
+
Special: maximize (full screen), center (centered floating)
|
|
393
|
+
Grid syntax: grid:CxR:C,R (e.g. grid:5x3:2,1 = center cell of 5x3 grid)
|
|
394
|
+
app (optional): Target app name — match loosely (e.g. "chrome" matches "Google Chrome")
|
|
395
|
+
wid (optional): Target window ID (from snapshot)
|
|
396
|
+
session (optional): Tmux session name
|
|
397
|
+
If no app/wid/session given, tiles the frontmost window.
|
|
398
|
+
"quarter" = 2x2 cell (top-left etc.), NOT a 4x1 fourth.
|
|
399
|
+
"top quarter" = top-left or top-right (2x2). "top third" = top-left-third (3x2).
|
|
400
|
+
|
|
401
|
+
focus: Focus a window, app, or session
|
|
402
|
+
Slots: app, session, or wid (at least one)
|
|
403
|
+
|
|
404
|
+
distribute: Arrange all visible windows in an even grid. No slots.
|
|
405
|
+
|
|
406
|
+
search: Search windows by text
|
|
407
|
+
Slots: query (required)
|
|
408
|
+
|
|
409
|
+
list_windows: List all visible windows. No slots.
|
|
410
|
+
|
|
411
|
+
switch_layer: Switch to a workspace layer
|
|
412
|
+
Slots: layer (required) — name or index
|
|
413
|
+
|
|
414
|
+
create_layer: Save current arrangement as a named layer
|
|
415
|
+
Slots: name (required)
|
|
416
|
+
|
|
417
|
+
TILING PRESETS (use multiple tile_window actions):
|
|
418
|
+
"split screen" → left + right
|
|
419
|
+
"thirds" → left-third, center-third, right-third
|
|
420
|
+
"mosaic"/"grid" → use distribute
|
|
421
|
+
"corners"/"quadrants" → top-left, top-right, bottom-left, bottom-right
|
|
422
|
+
"stack" → top + bottom
|
|
423
|
+
"six-up"/"3 by 2" → 3x2 grid using the sixth positions
|
|
424
|
+
"eight-up"/"4 by 2" → 4x2 grid using the eighth positions
|
|
425
|
+
`;
|
|
426
|
+
|
|
427
|
+
systemPrompt = systemPrompt.replace("{{intent_catalog}}", intentCatalog);
|
|
428
|
+
log("system prompt loaded");
|
|
429
|
+
|
|
430
|
+
// ── Auto-restart on file changes ───────────────────────────────────
// The worker exits on change; the supervising Swift app respawns it, which
// reloads the prompt and the script with zero in-process reload logic.

const watchFiles = [
  join(promptDir, "hands-off-system.md"),
  import.meta.path, // this script itself
];

for (const f of watchFiles) {
  try {
    const { watch } = await import("fs");
    // fs.watch can fire several events per save; the first one schedules the
    // exit and later events are ignored (debounce is never cleared — the
    // process exits inside the timeout anyway).
    let debounce: ReturnType<typeof setTimeout> | null = null;
    watch(f, () => {
      if (debounce) return;
      debounce = setTimeout(() => {
        log(`file changed: ${f.split("/").pop()} — exiting for restart`);
        process.exit(0); // Swift auto-restarts in 2s
      }, 500);
    });
    log(`watching: ${f.split("/").pop()}`);
  } catch {
    // watch unsupported or file missing — auto-restart is best-effort only
  }
}
|
|
451
|
+
|
|
452
|
+
// ── Build context message from snapshot ─────────────────────────────
|
|
453
|
+
|
|
454
|
+
function buildContextMessage(transcript: string, snap: any): string {
|
|
455
|
+
let msg = `USER: "${transcript}"\n\n`;
|
|
456
|
+
msg += "--- DESKTOP SNAPSHOT ---\n";
|
|
457
|
+
|
|
458
|
+
// Screens
|
|
459
|
+
const screens = snap.screens ?? [];
|
|
460
|
+
if (screens.length > 1) {
|
|
461
|
+
msg += `Displays: ${screens.map((s: any) => `${s.width}x${s.height}${s.isMain ? " (main)" : ""}`).join(", ")}\n`;
|
|
462
|
+
} else if (screens.length === 1) {
|
|
463
|
+
msg += `Screen: ${screens[0].width}x${screens[0].height}\n`;
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
// Stage Manager
|
|
467
|
+
if (snap.stageManager) {
|
|
468
|
+
msg += `Stage Manager: ON (grouping: ${snap.smGrouping ?? "all-at-once"})\n`;
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
// All windows — full inventory, ordered front-to-back (zIndex 0 = frontmost)
|
|
472
|
+
const windows = snap.windows ?? snap.activeStage ?? [];
|
|
473
|
+
const onScreen = windows.filter((w: any) => w.onScreen !== false);
|
|
474
|
+
const offScreen = windows.filter((w: any) => w.onScreen === false);
|
|
475
|
+
|
|
476
|
+
msg += `\nVisible windows (${onScreen.length}, front-to-back order):\n`;
|
|
477
|
+
for (const w of onScreen) {
|
|
478
|
+
const flags: string[] = [];
|
|
479
|
+
if (w.zIndex === 0) flags.push("FRONTMOST");
|
|
480
|
+
if (w.session) flags.push(`session:${w.session}`);
|
|
481
|
+
const flagStr = flags.length ? ` [${flags.join(", ")}]` : "";
|
|
482
|
+
msg += ` wid:${w.wid} ${w.app}: "${w.title}" — ${w.frame}${flagStr}\n`;
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
if (offScreen.length > 0) {
|
|
486
|
+
// Summarize hidden windows by app instead of listing all
|
|
487
|
+
const hiddenByApp: Record<string, number> = {};
|
|
488
|
+
for (const w of offScreen) {
|
|
489
|
+
const app = w.app;
|
|
490
|
+
hiddenByApp[app] = (hiddenByApp[app] || 0) + 1;
|
|
491
|
+
}
|
|
492
|
+
const summary = Object.entries(hiddenByApp)
|
|
493
|
+
.filter(([app]) => !["WindowManager", "Spotlight", "CursorUIViewService", "AutoFill", "coreautha", "loginwindow", "Open and Save Panel Service"].includes(app))
|
|
494
|
+
.map(([app, count]) => `${app}(${count})`)
|
|
495
|
+
.join(", ");
|
|
496
|
+
if (summary) {
|
|
497
|
+
msg += `\nHidden windows: ${summary}\n`;
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
// Terminals — cwd, running commands, claude, tmux
|
|
502
|
+
const terminals = snap.terminals ?? [];
|
|
503
|
+
if (terminals.length > 0) {
|
|
504
|
+
msg += `\nTerminal tabs (${terminals.length}):\n`;
|
|
505
|
+
for (const t of terminals) {
|
|
506
|
+
const flags: string[] = [];
|
|
507
|
+
if (t.hasClaude) flags.push("Claude Code");
|
|
508
|
+
if (t.tmuxSession) flags.push(`tmux:${t.tmuxSession}`);
|
|
509
|
+
if (!t.isActiveTab) flags.push("background tab");
|
|
510
|
+
const flagStr = flags.length ? ` [${flags.join(", ")}]` : "";
|
|
511
|
+
const cwd = t.cwd ? ` cwd:${t.cwd.replace(/^\/Users\/\w+\//, "~/")}` : "";
|
|
512
|
+
const cmds = (t.runningCommands ?? []).map((c: any) => c.command).join(", ");
|
|
513
|
+
const cmdStr = cmds ? ` running:${cmds}` : "";
|
|
514
|
+
msg += ` ${t.displayName}${cwd}${cmdStr}${flagStr}`;
|
|
515
|
+
if (t.windowId) msg += ` (wid:${t.windowId})`;
|
|
516
|
+
msg += "\n";
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
// Tmux sessions
|
|
521
|
+
const tmux = snap.tmuxSessions ?? [];
|
|
522
|
+
if (tmux.length > 0) {
|
|
523
|
+
msg += `\nTmux sessions: ${tmux.map((s: any) => `${s.name} (${s.windows} windows${s.attached ? ", attached" : ""})`).join(", ")}\n`;
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
// Layer
|
|
527
|
+
if (snap.currentLayer) {
|
|
528
|
+
msg += `\nCurrent layer: ${snap.currentLayer.name} (index: ${snap.currentLayer.index})\n`;
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
msg += "--- END SNAPSHOT ---\n";
|
|
532
|
+
return msg;
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
// ── Command loop ───────────────────────────────────────────────────

// Newline-delimited JSON protocol over stdin. `buffer` holds the trailing
// partial line between stream reads.
const decoder = new TextDecoder();
const reader = Bun.stdin.stream().getReader();
let buffer = "";
|
|
540
|
+
|
|
541
|
+
/**
 * Handle one newline-delimited JSON command from stdin.
 *
 * Dispatches on `cmd.cmd`:
 *   ping        → liveness check
 *   speak       → blocking TTS, responds with playback duration
 *   ack         → fire-and-forget TTS (responds before speaking)
 *   play_cached → fire-and-forget cached-clip playback
 *   infer       → LLM inference only, no speech
 *   turn        → full orchestrated turn: ack + inference in parallel,
 *                 then narrate, then respond (Swift executes the actions)
 *
 * Every path writes exactly one {ok, …} JSON line via respond().
 */
async function processLine(line: string) {
  const trimmed = line.trim();
  if (!trimmed) return;

  let cmd: any;
  try {
    cmd = JSON.parse(trimmed);
  } catch {
    respond({ ok: false, error: "invalid JSON" });
    return;
  }

  switch (cmd.cmd) {
    case "ping":
      respond({ ok: true, data: { pong: true } });
      break;

    case "speak":
      // Blocking: response is sent only after playback finishes.
      try {
        const ms = await streamSpeak(cmd.text);
        log(`spoke "${cmd.text.slice(0, 40)}" in ${ms}ms`);
        respond({ ok: true, data: { durationMs: ms } });
      } catch (err: any) {
        log(`TTS error: ${err.message}`);
        respond({ ok: false, error: err.message });
      }
      break;

    case "ack":
      // Fire and forget — respond immediately, speak in background
      respond({ ok: true, data: { queued: true } });
      streamSpeak(cmd.text).catch((e) => log(`ack TTS error: ${e.message}`));
      break;

    case "play_cached":
      // Same fire-and-forget contract as "ack", but from the on-disk PCM cache.
      respond({ ok: true, data: { queued: true, cached: true } });
      playCached(cmd.text).catch((e) => log(`play_cached error: ${e.message}`));
      break;

    case "infer":
      // Inference only — no speech. Used when the caller handles audio itself.
      try {
        const userMessage = buildContextMessage(cmd.transcript, cmd.snapshot ?? {});

        const messages = (cmd.history ?? []).map((h: any) => ({
          role: h.role as "user" | "assistant",
          content: h.content,
        }));

        const { data, raw } = await inferSmart(userMessage, {
          provider: "xai",
          model: "grok-4.20-beta-0309-non-reasoning",
          system: systemPrompt,
          messages,
          temperature: 0.2,
          maxTokens: 512,
          tag: "hands-off",
        });

        respond({
          ok: true,
          data: {
            ...data,
            // Telemetry for the caller's diagnostics panel.
            _meta: {
              provider: raw.provider,
              model: raw.model,
              durationMs: raw.durationMs,
              tokens: raw.usage?.totalTokens,
            },
          },
        });
      } catch (err: any) {
        // Still supply a speakable fallback payload so the UX degrades gracefully.
        respond({
          ok: false,
          error: err.message,
          data: {
            actions: [],
            spoken: "Sorry, I had trouble processing that.",
          },
        });
      }
      break;

    case "turn": {
      // Full orchestrated turn — parallel where possible.
      //
      // Timeline:
      //   t=0 ──┬── ack TTS (fire & forget)
      //         └── Groq inference
      //   t=~600ms ─┬── narrate TTS (what we're doing)
      //             └── execute actions (in parallel with narrate)
      //   t=done ── respond with results
      //
      const turnStart = performance.now();
      const transcript = cmd.transcript;
      const snap = cmd.snapshot ?? {};
      const history = cmd.history ?? [];

      log(`⏱ turn start: "${transcript.slice(0, 50)}"`);

      // Fire cached ack sound + inference in PARALLEL
      const ackPromise = playAck().catch((e) => log(`ack error: ${e.message}`));

      // Build full context message from snapshot
      const userMessage = buildContextMessage(transcript, snap);

      // History entries may carry structured content; serialize and drop empties.
      const messages = history.map((h: any) => ({
        role: h.role as "user" | "assistant",
        content: typeof h.content === "string" ? h.content : JSON.stringify(h.content),
      })).filter((m: any) => m.content && m.content.length > 0);

      let inferResult: any = null;
      try {
        const { data, raw } = await inferSmart(userMessage, {
          provider: "xai",
          model: "grok-4.20-beta-0309-non-reasoning",
          system: systemPrompt,
          messages,
          temperature: 0.2,
          maxTokens: 512,
          tag: "hands-off",
        });
        inferResult = { ...data, _meta: { provider: raw.provider, model: raw.model, durationMs: raw.durationMs, tokens: raw.usage?.totalTokens } };
        log(`⏱ inference done in ${raw.durationMs}ms`);
      } catch (err: any) {
        // Inference failure still produces a speakable result, not a dead turn.
        log(`⏱ inference error: ${err.message}`);
        inferResult = { actions: [], spoken: "Sorry, I had trouble with that.", _meta: { error: err.message } };
      }

      // Wait for ack to finish before narrating (don't overlap speech)
      await ackPromise;

      // Step 2: Narrate + execute in PARALLEL
      const hasActions = Array.isArray(inferResult.actions) && inferResult.actions.length > 0;
      const spokenText = inferResult.spoken;

      if (hasActions && spokenText) {
        // SPEAK FIRST — user must hear what's about to happen before windows move
        log(`⏱ narrating: "${spokenText.slice(0, 50)}"`);
        await streamSpeak(spokenText).catch((e) => log(`narrate error: ${e.message}`));

        // NOW respond with actions — Swift executes after user heard the plan
        const turnMs = Math.round(performance.now() - turnStart);
        log(`⏱ turn response at ${turnMs}ms — actions sent after narration`);
        respond({ ok: true, data: inferResult, turnMs });

        // Confirm
        await playCached("Done.").catch(() => {});
      } else if (spokenText) {
        // Conversation only — speak and respond
        await streamSpeak(spokenText).catch((e) => log(`speak error: ${e.message}`));
        const turnMs = Math.round(performance.now() - turnStart);
        respond({ ok: true, data: inferResult, turnMs });
      } else {
        // Nothing to say and nothing to do — respond with whatever we have.
        const turnMs = Math.round(performance.now() - turnStart);
        respond({ ok: true, data: inferResult, turnMs });
      }

      const totalMs = Math.round(performance.now() - turnStart);
      log(`⏱ turn complete: ${totalMs}ms total`);
      break;
    }

    default:
      respond({ ok: false, error: `unknown command: ${cmd.cmd}` });
  }
}
|
|
707
|
+
|
|
708
|
+
// Read stdin line by line
(async () => {
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split("\n");
    // Last element is the incomplete trailing line (or "") — keep it for the next chunk.
    buffer = lines.pop() ?? "";

    // Commands run sequentially, in arrival order — one at a time.
    for (const line of lines) {
      await processLine(line);
    }
  }
})();
|
|
723
|
+
|
|
724
|
+
function respond(obj: any) {
|
|
725
|
+
console.log(JSON.stringify(obj));
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
function log(msg: string) {
|
|
729
|
+
const ts = new Date().toISOString().slice(11, 23);
|
|
730
|
+
console.error(`[${ts}] handsoff-worker: ${msg}`);
|
|
731
|
+
}
|