cursor-buddy 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,350 +1,624 @@
1
- import html2canvas from "html2canvas-pro";
2
1
  import { atom } from "nanostores";
3
- //#region src/core/state-machine.ts
2
+ import html2canvas from "html2canvas-pro";
3
+ //#region src/core/atoms.ts
4
4
  /**
5
- * State transition table for the voice interaction flow.
6
- * Maps current state + event type to next state.
5
+ * Nanostores atoms for reactive values that don't need state machine semantics.
6
+ * These update frequently (e.g., 60fps audio levels) and are framework-agnostic.
7
7
  */
8
- const transitions = {
9
- idle: { HOTKEY_PRESSED: "listening" },
10
- listening: {
11
- HOTKEY_RELEASED: "processing",
12
- ERROR: "idle"
13
- },
14
- processing: {
15
- AI_RESPONSE_COMPLETE: "responding",
16
- HOTKEY_PRESSED: "listening",
17
- ERROR: "idle"
18
- },
19
- responding: {
20
- TTS_COMPLETE: "idle",
21
- HOTKEY_PRESSED: "listening",
22
- ERROR: "idle"
23
- }
24
- };
8
+ const $audioLevel = atom(0);
9
+ const $cursorPosition = atom({
10
+ x: 0,
11
+ y: 0
12
+ });
13
+ const $buddyPosition = atom({
14
+ x: 0,
15
+ y: 0
16
+ });
17
+ const $buddyRotation = atom(0);
18
+ const $buddyScale = atom(1);
19
+ const $pointingTarget = atom(null);
20
+ const $isEnabled = atom(true);
21
+ atom(false);
22
+ const $conversationHistory = atom([]);
23
+ //#endregion
24
+ //#region src/core/pointing.ts
25
25
  /**
26
- * Create a simple typed state machine for the voice interaction flow.
27
- *
28
- * States: idle -> listening -> processing -> responding -> idle
26
+ * Parses POINT tags from AI responses.
29
27
  *
30
- * Supports interruption: pressing hotkey during processing or responding
31
- * immediately transitions back to listening.
28
+ * Supports two formats:
29
+ * - Marker-based: [POINT:5:label] - 3 parts, references a numbered marker
30
+ * - Coordinate-based: [POINT:640,360:label] - 4 parts, raw pixel coordinates
32
31
  */
33
- function createStateMachine(initial = "idle") {
34
- let state = initial;
35
- const listeners = /* @__PURE__ */ new Set();
36
- function notify() {
37
- listeners.forEach((listener) => listener());
38
- }
32
+ const POINTING_TAG_REGEX = /\[POINT:(\d+)(?:,(\d+))?:([^\]]+)\]\s*$/;
33
+ /**
34
+ * Parse pointing tag into structured result.
35
+ * Returns null if no valid POINT tag is found at the end.
36
+ */
37
+ function parsePointingTagRaw(response) {
38
+ const match = response.match(POINTING_TAG_REGEX);
39
+ if (!match) return null;
40
+ const first = Number.parseInt(match[1], 10);
41
+ const second = match[2] ? Number.parseInt(match[2], 10) : null;
42
+ const label = match[3].trim();
43
+ if (second !== null) return {
44
+ type: "coordinates",
45
+ x: first,
46
+ y: second,
47
+ label
48
+ };
39
49
  return {
40
- getState: () => state,
41
- transition: (event) => {
42
- const nextState = transitions[state][event.type];
43
- if (!nextState) return false;
44
- state = nextState;
45
- notify();
46
- return true;
47
- },
48
- subscribe: (listener) => {
49
- listeners.add(listener);
50
- return () => listeners.delete(listener);
51
- },
52
- reset: () => {
53
- state = "idle";
54
- notify();
55
- }
50
+ type: "marker",
51
+ markerId: first,
52
+ label
56
53
  };
57
54
  }
58
- //#endregion
59
- //#region src/core/utils/audio-worklet.ts
60
55
  /**
61
- * AudioWorklet processor code for voice capture.
62
- * Inlined as a blob URL to avoid separate file serving requirements.
56
+ * Remove POINT tag from response text for display/TTS.
63
57
  */
64
- const workletCode = `
65
- class AudioCaptureProcessor extends AudioWorkletProcessor {
66
- constructor() {
67
- super()
68
- this.isRecording = true
69
- }
70
-
71
- process(inputs) {
72
- if (!this.isRecording) return false
73
-
74
- const input = inputs[0]
75
- if (input && input.length > 0) {
76
- const channelData = input[0]
77
-
78
- // Send audio data to main thread
79
- this.port.postMessage({
80
- type: "audio",
81
- data: new Float32Array(channelData)
82
- })
83
-
84
- // Calculate RMS for audio level visualization
85
- let sum = 0
86
- for (let i = 0; i < channelData.length; i++) {
87
- sum += channelData[i] * channelData[i]
88
- }
89
- const rms = Math.sqrt(sum / channelData.length)
90
- this.port.postMessage({ type: "level", rms })
91
- }
92
-
93
- return true
94
- }
58
+ function stripPointingTag(response) {
59
+ return response.replace(POINTING_TAG_REGEX, "").trim();
95
60
  }
96
-
97
- registerProcessor("audio-capture-processor", AudioCaptureProcessor)
98
- `;
99
- let cachedBlobURL = null;
61
+ //#endregion
62
+ //#region src/core/services/audio-playback.ts
100
63
  /**
101
- * Create a blob URL for the audio worklet processor.
102
- * Caches the URL to avoid creating multiple blobs.
64
+ * Framework-agnostic service for audio playback with abort support.
103
65
  */
104
- function createWorkletBlobURL() {
105
- if (!cachedBlobURL) {
106
- const blob = new Blob([workletCode], { type: "application/javascript" });
107
- cachedBlobURL = URL.createObjectURL(blob);
66
+ var AudioPlaybackService = class {
67
+ audio = null;
68
+ currentUrl = null;
69
+ settlePlayback = null;
70
+ removeAbortListener = null;
71
+ /**
72
+ * Play audio from a blob. Stops any currently playing audio first.
73
+ * @param blob - Audio blob to play
74
+ * @param signal - Optional AbortSignal to cancel playback
75
+ * @returns Promise that resolves when playback completes
76
+ */
77
+ async play(blob, signal) {
78
+ this.stop();
79
+ if (signal?.aborted) return;
80
+ const url = URL.createObjectURL(blob);
81
+ this.currentUrl = url;
82
+ this.audio = new Audio(url);
83
+ return new Promise((resolve, reject) => {
84
+ if (!this.audio) {
85
+ this.cleanup();
86
+ resolve();
87
+ return;
88
+ }
89
+ let settled = false;
90
+ const audio = this.audio;
91
+ const settle = (outcome, error) => {
92
+ if (settled) return;
93
+ settled = true;
94
+ if (this.settlePlayback === settle) this.settlePlayback = null;
95
+ this.removeAbortListener?.();
96
+ this.removeAbortListener = null;
97
+ if (this.audio === audio) {
98
+ this.audio.onended = null;
99
+ this.audio.onerror = null;
100
+ this.audio = null;
101
+ }
102
+ this.cleanup();
103
+ if (outcome === "resolve") {
104
+ resolve();
105
+ return;
106
+ }
107
+ reject(error ?? /* @__PURE__ */ new Error("Audio playback failed"));
108
+ };
109
+ this.settlePlayback = settle;
110
+ const abortHandler = () => {
111
+ audio.pause();
112
+ settle("resolve");
113
+ };
114
+ if (signal) {
115
+ signal.addEventListener("abort", abortHandler, { once: true });
116
+ this.removeAbortListener = () => {
117
+ signal.removeEventListener("abort", abortHandler);
118
+ };
119
+ }
120
+ this.audio.onended = () => {
121
+ settle("resolve");
122
+ };
123
+ this.audio.onerror = () => {
124
+ settle("reject", /* @__PURE__ */ new Error("Audio playback failed"));
125
+ };
126
+ this.audio.play().catch((err) => {
127
+ settle("reject", err instanceof Error ? err : new Error(String(err)));
128
+ });
129
+ });
108
130
  }
109
- return cachedBlobURL;
110
- }
131
+ /**
132
+ * Stop any currently playing audio.
133
+ */
134
+ stop() {
135
+ if (this.audio) this.audio.pause();
136
+ if (this.settlePlayback) {
137
+ const settlePlayback = this.settlePlayback;
138
+ this.settlePlayback = null;
139
+ settlePlayback("resolve");
140
+ return;
141
+ }
142
+ this.removeAbortListener?.();
143
+ this.removeAbortListener = null;
144
+ if (this.audio) {
145
+ this.audio.onended = null;
146
+ this.audio.onerror = null;
147
+ this.audio = null;
148
+ }
149
+ this.cleanup();
150
+ }
151
+ cleanup() {
152
+ if (this.currentUrl) {
153
+ URL.revokeObjectURL(this.currentUrl);
154
+ this.currentUrl = null;
155
+ }
156
+ }
157
+ };
111
158
  //#endregion
112
- //#region src/core/utils/audio.ts
159
+ //#region src/core/bezier.ts
113
160
  /**
114
- * Audio conversion utilities for voice capture.
115
- * Converts Float32 audio data to WAV format for server transcription.
161
+ * Bezier flight animation for cursor pointing.
116
162
  */
117
163
  /**
118
- * Merge multiple Float32Array chunks into a single array
164
+ * Quadratic bezier curve: B(t) = (1-t)²P₀ + 2(1-t)t·P₁ + t²P₂
119
165
  */
120
- function mergeAudioChunks(chunks) {
121
- const totalLength = chunks.reduce((acc, chunk) => acc + chunk.length, 0);
122
- const result = new Float32Array(totalLength);
123
- let offset = 0;
124
- for (const chunk of chunks) {
125
- result.set(chunk, offset);
126
- offset += chunk.length;
127
- }
128
- return result;
166
+ function quadraticBezier(p0, p1, p2, t) {
167
+ const oneMinusT = 1 - t;
168
+ return {
169
+ x: oneMinusT * oneMinusT * p0.x + 2 * oneMinusT * t * p1.x + t * t * p2.x,
170
+ y: oneMinusT * oneMinusT * p0.y + 2 * oneMinusT * t * p1.y + t * t * p2.y
171
+ };
129
172
  }
130
173
  /**
131
- * Convert Float32 audio data to 16-bit PCM
174
+ * Bezier tangent (derivative): B'(t) = 2(1-t)(P₁-P₀) + 2t(P₂-P₁)
132
175
  */
133
- function floatTo16BitPCM(output, offset, input) {
134
- for (let i = 0; i < input.length; i++, offset += 2) {
135
- const sample = Math.max(-1, Math.min(1, input[i]));
136
- output.setInt16(offset, sample < 0 ? sample * 32768 : sample * 32767, true);
137
- }
176
+ function bezierTangent(p0, p1, p2, t) {
177
+ const oneMinusT = 1 - t;
178
+ return {
179
+ x: 2 * oneMinusT * (p1.x - p0.x) + 2 * t * (p2.x - p1.x),
180
+ y: 2 * oneMinusT * (p1.y - p0.y) + 2 * t * (p2.y - p1.y)
181
+ };
138
182
  }
139
183
  /**
140
- * Write a string to a DataView
184
+ * Ease-in-out cubic for smooth acceleration/deceleration
141
185
  */
142
- function writeString(view, offset, string) {
143
- for (let i = 0; i < string.length; i++) view.setUint8(offset + i, string.charCodeAt(i));
186
+ function easeInOutCubic(t) {
187
+ return t < .5 ? 4 * t * t * t : 1 - (-2 * t + 2) ** 3 / 2;
144
188
  }
145
189
  /**
146
- * Encode Float32 audio data as a WAV file
190
+ * Animate cursor along a parabolic bezier arc from start to end.
191
+ * Used when the AI points at a UI element.
192
+ *
193
+ * @param from - Starting position
194
+ * @param to - Target position
195
+ * @param durationMs - Flight duration in milliseconds
196
+ * @param callbacks - Frame and completion callbacks
197
+ * @returns Cancel function to stop the animation
147
198
  */
148
- function encodeWAV(samples, sampleRate) {
149
- const numChannels = 1;
150
- const bitsPerSample = 16;
151
- const bytesPerSample = bitsPerSample / 8;
152
- const blockAlign = numChannels * bytesPerSample;
153
- const dataLength = samples.length * bytesPerSample;
154
- const buffer = new ArrayBuffer(44 + dataLength);
155
- const view = new DataView(buffer);
156
- writeString(view, 0, "RIFF");
157
- view.setUint32(4, 36 + dataLength, true);
158
- writeString(view, 8, "WAVE");
159
- writeString(view, 12, "fmt ");
160
- view.setUint32(16, 16, true);
161
- view.setUint16(20, 1, true);
162
- view.setUint16(22, numChannels, true);
163
- view.setUint32(24, sampleRate, true);
164
- view.setUint32(28, sampleRate * blockAlign, true);
165
- view.setUint16(32, blockAlign, true);
166
- view.setUint16(34, bitsPerSample, true);
167
- writeString(view, 36, "data");
168
- view.setUint32(40, dataLength, true);
169
- floatTo16BitPCM(view, 44, samples);
170
- return new Blob([buffer], { type: "audio/wav" });
199
+ function animateBezierFlight(from, to, durationMs, callbacks) {
200
+ const startTime = performance.now();
201
+ const distance = Math.hypot(to.x - from.x, to.y - from.y);
202
+ const controlPoint = {
203
+ x: (from.x + to.x) / 2,
204
+ y: Math.min(from.y, to.y) - distance * .2
205
+ };
206
+ let animationFrameId;
207
+ function animate(now) {
208
+ const elapsed = now - startTime;
209
+ const linearProgress = Math.min(elapsed / durationMs, 1);
210
+ const easedProgress = easeInOutCubic(linearProgress);
211
+ const position = quadraticBezier(from, controlPoint, to, easedProgress);
212
+ const tangent = bezierTangent(from, controlPoint, to, easedProgress);
213
+ const rotation = Math.atan2(tangent.y, tangent.x);
214
+ const scale = 1 + Math.sin(linearProgress * Math.PI) * .3;
215
+ callbacks.onFrame(position, rotation, scale);
216
+ if (linearProgress < 1) animationFrameId = requestAnimationFrame(animate);
217
+ else callbacks.onComplete();
218
+ }
219
+ animationFrameId = requestAnimationFrame(animate);
220
+ return () => cancelAnimationFrame(animationFrameId);
171
221
  }
172
222
  //#endregion
173
- //#region src/core/services/voice-capture.ts
174
- const SAMPLE_RATE = 16e3;
175
- const AUDIO_LEVEL_BOOST = 10.2;
223
+ //#region src/core/services/pointer-controller.ts
224
+ const POINTING_LOCK_TIMEOUT_MS = 1e4;
176
225
  /**
177
- * Framework-agnostic service for voice capture using AudioWorkletNode.
226
+ * Controller for cursor pointing behavior.
227
+ * Manages the pointer state machine (follow -> flying -> anchored -> follow)
228
+ * and cursor animation.
178
229
  */
179
- var VoiceCaptureService = class {
180
- audioContext = null;
181
- workletNode = null;
182
- stream = null;
183
- chunks = [];
184
- levelCallback = null;
185
- /**
186
- * Register a callback to receive audio level updates (0-1).
187
- * Called at ~60fps during recording for waveform visualization.
188
- */
189
- onLevel(callback) {
190
- this.levelCallback = callback;
191
- }
230
+ var PointerController = class {
231
+ mode = "follow";
232
+ cancelAnimation = null;
233
+ releaseTimeout = null;
234
+ listeners = /* @__PURE__ */ new Set();
192
235
  /**
193
- * Start recording audio from the microphone.
194
- * @throws Error if microphone access is denied
236
+ * Animate cursor to point at a target.
195
237
  */
196
- async start() {
197
- this.chunks = [];
198
- const stream = await navigator.mediaDevices.getUserMedia({ audio: {
199
- sampleRate: SAMPLE_RATE,
200
- channelCount: 1,
201
- echoCancellation: true,
202
- noiseSuppression: true
203
- } });
204
- this.stream = stream;
205
- const audioContext = new AudioContext({ sampleRate: SAMPLE_RATE });
206
- this.audioContext = audioContext;
207
- const workletURL = createWorkletBlobURL();
208
- await audioContext.audioWorklet.addModule(workletURL);
209
- const source = audioContext.createMediaStreamSource(stream);
210
- const workletNode = new AudioWorkletNode(audioContext, "audio-capture-processor");
211
- this.workletNode = workletNode;
212
- workletNode.port.onmessage = (event) => {
213
- const { type, data, rms } = event.data;
214
- if (type === "audio") this.chunks.push(data);
215
- else if (type === "level" && this.levelCallback) {
216
- const boostedLevel = Math.min(rms * AUDIO_LEVEL_BOOST, 1);
217
- this.levelCallback(boostedLevel);
218
- }
238
+ pointAt(target) {
239
+ this.release();
240
+ this.mode = "flying";
241
+ $pointingTarget.set(target);
242
+ const startPos = $buddyPosition.get();
243
+ const endPos = {
244
+ x: target.x,
245
+ y: target.y
219
246
  };
220
- source.connect(workletNode);
247
+ this.cancelAnimation = animateBezierFlight(startPos, endPos, 800, {
248
+ onFrame: (position, rotation, scale) => {
249
+ $buddyPosition.set(position);
250
+ $buddyRotation.set(rotation);
251
+ $buddyScale.set(scale);
252
+ },
253
+ onComplete: () => {
254
+ this.cancelAnimation = null;
255
+ this.mode = "anchored";
256
+ $buddyPosition.set(endPos);
257
+ $buddyRotation.set(0);
258
+ $buddyScale.set(1);
259
+ this.scheduleRelease();
260
+ this.notify();
261
+ }
262
+ });
263
+ this.notify();
221
264
  }
222
265
  /**
223
- * Stop recording and return the captured audio as a WAV blob.
266
+ * Release the cursor from pointing mode back to follow mode.
224
267
  */
225
- async stop() {
226
- if (this.stream) {
227
- this.stream.getTracks().forEach((track) => track.stop());
228
- this.stream = null;
229
- }
230
- if (this.workletNode) {
231
- this.workletNode.disconnect();
232
- this.workletNode = null;
268
+ release() {
269
+ if (this.cancelAnimation) {
270
+ this.cancelAnimation();
271
+ this.cancelAnimation = null;
233
272
  }
234
- if (this.audioContext) {
235
- await this.audioContext.close();
236
- this.audioContext = null;
273
+ if (this.releaseTimeout) {
274
+ clearTimeout(this.releaseTimeout);
275
+ this.releaseTimeout = null;
237
276
  }
238
- this.levelCallback?.(0);
239
- const wavBlob = encodeWAV(mergeAudioChunks(this.chunks), SAMPLE_RATE);
240
- this.chunks = [];
241
- return wavBlob;
277
+ this.mode = "follow";
278
+ $pointingTarget.set(null);
279
+ $buddyPosition.set($cursorPosition.get());
280
+ $buddyRotation.set(0);
281
+ $buddyScale.set(1);
282
+ this.notify();
242
283
  }
243
284
  /**
244
- * Clean up all resources.
285
+ * Check if cursor is currently pointing (flying or anchored).
245
286
  */
246
- dispose() {
247
- if (this.stream) {
248
- this.stream.getTracks().forEach((track) => track.stop());
249
- this.stream = null;
250
- }
251
- if (this.workletNode) {
252
- this.workletNode.disconnect();
253
- this.workletNode = null;
254
- }
255
- if (this.audioContext) {
256
- this.audioContext.close();
257
- this.audioContext = null;
258
- }
259
- this.chunks = [];
260
- this.levelCallback = null;
287
+ isPointing() {
288
+ return this.mode !== "follow";
261
289
  }
262
- };
263
- //#endregion
264
- //#region src/core/services/audio-playback.ts
265
- /**
266
- * Framework-agnostic service for audio playback with abort support.
267
- */
268
- var AudioPlaybackService = class {
269
- audio = null;
270
- currentUrl = null;
271
290
  /**
272
- * Play audio from a blob. Stops any currently playing audio first.
273
- * @param blob - Audio blob to play
274
- * @param signal - Optional AbortSignal to cancel playback
275
- * @returns Promise that resolves when playback completes
291
+ * Get current pointer mode.
276
292
  */
277
- async play(blob, signal) {
278
- this.stop();
279
- if (signal?.aborted) return;
280
- const url = URL.createObjectURL(blob);
281
- this.currentUrl = url;
282
- this.audio = new Audio(url);
283
- const abortHandler = () => this.stop();
284
- signal?.addEventListener("abort", abortHandler);
285
- return new Promise((resolve, reject) => {
286
- if (!this.audio) {
287
- this.cleanup();
288
- resolve();
289
- return;
290
- }
291
- this.audio.onended = () => {
292
- signal?.removeEventListener("abort", abortHandler);
293
- this.cleanup();
294
- resolve();
295
- };
296
- this.audio.onerror = () => {
297
- signal?.removeEventListener("abort", abortHandler);
298
- this.cleanup();
299
- reject(/* @__PURE__ */ new Error("Audio playback failed"));
300
- };
301
- this.audio.play().catch((err) => {
302
- signal?.removeEventListener("abort", abortHandler);
303
- this.cleanup();
304
- reject(err);
305
- });
306
- });
293
+ getMode() {
294
+ return this.mode;
295
+ }
296
+ /**
297
+ * Subscribe to pointer state changes.
298
+ */
299
+ subscribe(listener) {
300
+ this.listeners.add(listener);
301
+ return () => this.listeners.delete(listener);
307
302
  }
308
303
  /**
309
- * Stop any currently playing audio.
304
+ * Update buddy position to follow cursor when in follow mode.
305
+ * Call this on cursor position changes.
310
306
  */
311
- stop() {
312
- if (this.audio) {
313
- this.audio.pause();
314
- this.audio.onended = null;
315
- this.audio.onerror = null;
316
- this.audio = null;
307
+ updateFollowPosition() {
308
+ if (this.mode === "follow") {
309
+ $buddyPosition.set($cursorPosition.get());
310
+ $buddyRotation.set(0);
311
+ $buddyScale.set(1);
317
312
  }
318
- this.cleanup();
319
313
  }
320
- cleanup() {
321
- if (this.currentUrl) {
322
- URL.revokeObjectURL(this.currentUrl);
323
- this.currentUrl = null;
324
- }
314
+ scheduleRelease() {
315
+ this.releaseTimeout = setTimeout(() => {
316
+ this.releaseTimeout = null;
317
+ this.release();
318
+ }, POINTING_LOCK_TIMEOUT_MS);
319
+ }
320
+ notify() {
321
+ this.listeners.forEach((listener) => listener());
322
+ }
323
+ };
324
+ //#endregion
325
+ //#region src/core/utils/elements.ts
326
+ /**
327
+ * Element discovery for annotated screenshots.
328
+ * Finds visible interactive elements and assigns marker IDs.
329
+ */
330
+ /** Max characters for element descriptions passed to the model. */
331
+ const MAX_DESCRIPTION_LENGTH = 50;
332
+ /** Pixels tolerance for grouping elements into the same visual row. */
333
+ const ROW_TOLERANCE_PX = 20;
334
+ /**
335
+ * Interactive element selectors - elements users would want to click/interact with.
336
+ * Mirrors accessibility roles from agent-browser but using CSS selectors.
337
+ */
338
+ const INTERACTIVE_SELECTORS = [
339
+ "button",
340
+ "[role=\"button\"]",
341
+ "input[type=\"button\"]",
342
+ "input[type=\"submit\"]",
343
+ "input[type=\"reset\"]",
344
+ "a[href]",
345
+ "[role=\"link\"]",
346
+ "input:not([type=\"hidden\"])",
347
+ "textarea",
348
+ "select",
349
+ "[role=\"textbox\"]",
350
+ "[role=\"searchbox\"]",
351
+ "[role=\"combobox\"]",
352
+ "[role=\"listbox\"]",
353
+ "[role=\"slider\"]",
354
+ "[role=\"spinbutton\"]",
355
+ "[role=\"checkbox\"]",
356
+ "[role=\"radio\"]",
357
+ "[role=\"switch\"]",
358
+ "[role=\"menuitem\"]",
359
+ "[role=\"menuitemcheckbox\"]",
360
+ "[role=\"menuitemradio\"]",
361
+ "[role=\"option\"]",
362
+ "[role=\"tab\"]",
363
+ "[role=\"treeitem\"]",
364
+ "video",
365
+ "audio",
366
+ "[data-cursor-buddy-interactive]"
367
+ ];
368
+ /**
369
+ * Check if an element is visible in the viewport.
370
+ */
371
+ function isElementVisible(element, rect = element.getBoundingClientRect()) {
372
+ if (rect.width <= 0 || rect.height <= 0) return false;
373
+ if (rect.bottom < 0 || rect.top > window.innerHeight || rect.right < 0 || rect.left > window.innerWidth) return false;
374
+ const style = window.getComputedStyle(element);
375
+ if (style.visibility === "hidden" || style.display === "none") return false;
376
+ if (Number.parseFloat(style.opacity) === 0) return false;
377
+ return true;
378
+ }
379
+ function truncateDescription(value) {
380
+ return value.slice(0, MAX_DESCRIPTION_LENGTH);
381
+ }
382
+ /**
383
+ * Generate a brief description for an element.
384
+ */
385
+ function describeElement(element) {
386
+ const tag = element.tagName.toLowerCase();
387
+ const ariaLabel = element.getAttribute("aria-label");
388
+ if (ariaLabel) return truncateDescription(ariaLabel);
389
+ if (tag === "button" || tag === "a") {
390
+ const text = element.textContent?.trim();
391
+ if (text) return truncateDescription(text);
392
+ }
393
+ if (tag === "input" || tag === "textarea") {
394
+ const placeholder = element.getAttribute("placeholder");
395
+ if (placeholder) return truncateDescription(placeholder);
396
+ return `${element.getAttribute("type") || "text"} input`;
397
+ }
398
+ if (tag === "img") {
399
+ const alt = element.getAttribute("alt");
400
+ if (alt) return truncateDescription(alt);
401
+ return "image";
402
+ }
403
+ const role = element.getAttribute("role");
404
+ if (role) return role;
405
+ return tag;
406
+ }
407
+ function collectVisibleInteractiveElements() {
408
+ const selector = INTERACTIVE_SELECTORS.join(",");
409
+ const allElements = document.querySelectorAll(selector);
410
+ const visible = [];
411
+ for (const element of allElements) {
412
+ const rect = element.getBoundingClientRect();
413
+ if (!isElementVisible(element, rect)) continue;
414
+ visible.push({
415
+ element,
416
+ rect
417
+ });
325
418
  }
419
+ visible.sort((a, b) => {
420
+ const rowDiff = Math.floor(a.rect.top / ROW_TOLERANCE_PX) - Math.floor(b.rect.top / ROW_TOLERANCE_PX);
421
+ if (rowDiff !== 0) return rowDiff;
422
+ return a.rect.left - b.rect.left;
423
+ });
424
+ return visible;
425
+ }
426
+ /**
427
+ * Create marker map from visible interactive elements.
428
+ * Assigns sequential IDs starting from 1.
429
+ */
430
+ function createMarkerMap() {
431
+ const elements = collectVisibleInteractiveElements();
432
+ const map = /* @__PURE__ */ new Map();
433
+ elements.forEach(({ element, rect }, index) => {
434
+ const id = index + 1;
435
+ map.set(id, {
436
+ id,
437
+ element,
438
+ rect,
439
+ description: describeElement(element)
440
+ });
441
+ });
442
+ return map;
443
+ }
444
+ /**
445
+ * Get the center point of an element in viewport coordinates.
446
+ */
447
+ function getElementCenter(element) {
448
+ const rect = element.getBoundingClientRect();
449
+ return {
450
+ x: Math.round(rect.left + rect.width / 2),
451
+ y: Math.round(rect.top + rect.height / 2)
452
+ };
453
+ }
454
+ /**
455
+ * Resolve a marker ID to viewport coordinates.
456
+ * Returns null if marker not found or element no longer visible.
457
+ */
458
+ function resolveMarkerToCoordinates(markerMap, markerId) {
459
+ const marker = markerMap.get(markerId);
460
+ if (!marker) return null;
461
+ if (!document.contains(marker.element)) return null;
462
+ if (!isElementVisible(marker.element)) return null;
463
+ return getElementCenter(marker.element);
464
+ }
465
+ //#endregion
466
+ //#region src/core/utils/annotations.ts
467
+ const DEFAULT_STYLE = {
468
+ borderColor: "rgba(255, 0, 0, 0.8)",
469
+ labelBackground: "rgba(255, 0, 0, 0.9)",
470
+ labelColor: "#ffffff",
471
+ borderWidth: 2,
472
+ fontSize: 11,
473
+ labelPadding: 4
326
474
  };
475
+ /**
476
+ * Draw annotation markers onto a canvas.
477
+ * Modifies the canvas in place.
478
+ *
479
+ * @param ctx Canvas 2D context to draw on
480
+ * @param markers Marker map from element discovery
481
+ * @param style Optional style overrides
482
+ */
483
+ function drawAnnotations(ctx, markers, style = {}) {
484
+ const s = {
485
+ ...DEFAULT_STYLE,
486
+ ...style
487
+ };
488
+ ctx.save();
489
+ for (const marker of markers.values()) {
490
+ const { rect, id } = marker;
491
+ ctx.strokeStyle = s.borderColor;
492
+ ctx.lineWidth = s.borderWidth;
493
+ ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
494
+ const label = String(id);
495
+ ctx.font = `bold ${s.fontSize}px monospace`;
496
+ const textWidth = ctx.measureText(label).width;
497
+ const textHeight = s.fontSize;
498
+ const labelWidth = textWidth + s.labelPadding * 2;
499
+ const labelHeight = textHeight + s.labelPadding;
500
+ const labelX = rect.left - s.borderWidth;
501
+ const labelY = rect.top < labelHeight + 4 ? rect.top + 2 : rect.top - labelHeight;
502
+ ctx.fillStyle = s.labelBackground;
503
+ ctx.beginPath();
504
+ ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
505
+ ctx.fill();
506
+ ctx.fillStyle = s.labelColor;
507
+ ctx.textBaseline = "top";
508
+ ctx.fillText(label, labelX + s.labelPadding, labelY + s.labelPadding / 2);
509
+ }
510
+ ctx.restore();
511
+ }
512
+ /**
513
+ * Create an annotated copy of a canvas.
514
+ * Does not modify the original canvas.
515
+ *
516
+ * @param sourceCanvas Original screenshot canvas
517
+ * @param markers Marker map from element discovery
518
+ * @returns New canvas with annotations drawn
519
+ */
520
+ function createAnnotatedCanvas(sourceCanvas, markers) {
521
+ const canvas = document.createElement("canvas");
522
+ canvas.width = sourceCanvas.width;
523
+ canvas.height = sourceCanvas.height;
524
+ const ctx = canvas.getContext("2d");
525
+ if (!ctx) throw new Error("Failed to get canvas 2D context");
526
+ ctx.drawImage(sourceCanvas, 0, 0);
527
+ drawAnnotations(ctx, markers);
528
+ return canvas;
529
+ }
530
+ /**
531
+ * Generate marker context string for AI prompt.
532
+ * Lists available markers with their descriptions.
533
+ *
534
+ * @param markers Marker map from element discovery
535
+ * @returns Formatted string listing markers
536
+ */
537
+ function generateMarkerContext(markers) {
538
+ if (markers.size === 0) return "No interactive elements detected.";
539
+ const lines = ["Interactive elements (use marker number to point):"];
540
+ for (const marker of markers.values()) lines.push(` ${marker.id}: ${marker.description}`);
541
+ return lines.join("\n");
542
+ }
327
543
  //#endregion
328
544
  //#region src/core/utils/screenshot.ts
329
- const MAX_WIDTH = 1280;
545
+ const CLONE_RESOURCE_TIMEOUT_MS = 3e3;
330
546
  function getCaptureMetrics() {
331
547
  return {
332
548
  viewportWidth: window.innerWidth,
333
549
  viewportHeight: window.innerHeight
334
550
  };
335
551
  }
336
- /**
337
- * Resize canvas to max width while maintaining aspect ratio
338
- */
339
- function resizeCanvas(canvas, maxWidth) {
340
- if (canvas.width <= maxWidth) return canvas;
341
- const scale = maxWidth / canvas.width;
342
- const resized = document.createElement("canvas");
343
- resized.width = maxWidth;
344
- resized.height = Math.round(canvas.height * scale);
345
- const ctx = resized.getContext("2d");
346
- if (ctx) ctx.drawImage(canvas, 0, 0, resized.width, resized.height);
347
- return resized;
552
+ function waitForNextPaint(doc) {
553
+ const view = doc.defaultView;
554
+ if (!view?.requestAnimationFrame) return Promise.resolve();
555
+ return new Promise((resolve) => {
556
+ view.requestAnimationFrame(() => {
557
+ view.requestAnimationFrame(() => resolve());
558
+ });
559
+ });
560
+ }
561
+ function isStylesheetReady(link) {
562
+ const sheet = link.sheet;
563
+ if (!sheet) return false;
564
+ try {
565
+ sheet.cssRules;
566
+ return true;
567
+ } catch (error) {
568
+ return error instanceof DOMException && error.name === "SecurityError";
569
+ }
570
+ }
571
+ function waitForStylesheetLink(link) {
572
+ if (isStylesheetReady(link)) return Promise.resolve();
573
+ return new Promise((resolve) => {
574
+ let settled = false;
575
+ let timeoutId = 0;
576
+ const finish = () => {
577
+ if (settled) return;
578
+ settled = true;
579
+ window.clearTimeout(timeoutId);
580
+ link.removeEventListener("load", handleReady);
581
+ link.removeEventListener("error", handleReady);
582
+ resolve();
583
+ };
584
+ const handleReady = () => {
585
+ if (isStylesheetReady(link)) {
586
+ finish();
587
+ return;
588
+ }
589
+ window.requestAnimationFrame(() => {
590
+ if (isStylesheetReady(link)) finish();
591
+ });
592
+ };
593
+ timeoutId = window.setTimeout(finish, CLONE_RESOURCE_TIMEOUT_MS);
594
+ link.addEventListener("load", handleReady, { once: true });
595
+ link.addEventListener("error", finish, { once: true });
596
+ handleReady();
597
+ });
598
+ }
599
+ async function waitForClonedDocumentStyles(doc) {
600
+ const stylesheetLinks = Array.from(doc.querySelectorAll("link[rel=\"stylesheet\"][href]"));
601
+ await Promise.all(stylesheetLinks.map(waitForStylesheetLink));
602
+ if (doc.fonts?.ready) await doc.fonts.ready;
603
+ await waitForNextPaint(doc);
604
+ }
605
+ function getHtml2CanvasOptions(captureMetrics) {
606
+ return {
607
+ scale: 1,
608
+ useCORS: true,
609
+ logging: false,
610
+ width: captureMetrics.viewportWidth,
611
+ height: captureMetrics.viewportHeight,
612
+ windowWidth: captureMetrics.viewportWidth,
613
+ windowHeight: captureMetrics.viewportHeight,
614
+ x: window.scrollX,
615
+ y: window.scrollY,
616
+ scrollX: window.scrollX,
617
+ scrollY: window.scrollY,
618
+ onclone: async (doc) => {
619
+ await waitForClonedDocumentStyles(doc);
620
+ }
621
+ };
348
622
  }
349
623
  /**
350
624
  * Create a fallback canvas when screenshot capture fails.
@@ -352,8 +626,8 @@ function resizeCanvas(canvas, maxWidth) {
352
626
  */
353
627
  function createFallbackCanvas() {
354
628
  const canvas = document.createElement("canvas");
355
- canvas.width = Math.min(window.innerWidth, MAX_WIDTH);
356
- canvas.height = Math.round(window.innerHeight / window.innerWidth * canvas.width);
629
+ canvas.width = window.innerWidth;
630
+ canvas.height = window.innerHeight;
357
631
  const ctx = canvas.getContext("2d");
358
632
  if (ctx) {
359
633
  ctx.fillStyle = "#f0f0f0";
@@ -374,27 +648,44 @@ async function captureViewport() {
374
648
  const captureMetrics = getCaptureMetrics();
375
649
  let canvas;
376
650
  try {
377
- canvas = await html2canvas(document.body, {
378
- scale: 1,
379
- useCORS: true,
380
- logging: false,
381
- width: captureMetrics.viewportWidth,
382
- height: captureMetrics.viewportHeight,
383
- x: window.scrollX,
384
- y: window.scrollY
385
- });
651
+ canvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
386
652
  } catch {
387
653
  canvas = createFallbackCanvas();
388
654
  }
389
- const resized = resizeCanvas(canvas, MAX_WIDTH);
390
655
  return {
391
- imageData: resized.toDataURL("image/jpeg", .8),
392
- width: resized.width,
393
- height: resized.height,
656
+ imageData: canvas.toDataURL("image/png"),
657
+ width: canvas.width,
658
+ height: canvas.height,
394
659
  viewportWidth: captureMetrics.viewportWidth,
395
660
  viewportHeight: captureMetrics.viewportHeight
396
661
  };
397
662
  }
663
+ /**
664
+ * Capture an annotated screenshot of the current viewport.
665
+ * Interactive elements are marked with numbered labels.
666
+ * Returns both the annotated image and a marker map for resolving IDs.
667
+ */
668
async function captureAnnotatedViewport() {
  const captureMetrics = getCaptureMetrics();
  const markerMap = createMarkerMap();

  // Best effort: any capture failure is replaced by the fallback canvas
  // so callers always receive an image.
  let screenshotCanvas;
  try {
    screenshotCanvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
  } catch {
    screenshotCanvas = createFallbackCanvas();
  }

  // Only draw the overlay when at least one marker was collected.
  let outputCanvas = screenshotCanvas;
  if (markerMap.size > 0) {
    outputCanvas = createAnnotatedCanvas(screenshotCanvas, markerMap);
  }

  const markerContext = generateMarkerContext(markerMap);
  const { viewportWidth, viewportHeight } = captureMetrics;
  return {
    imageData: outputCanvas.toDataURL("image/png"),
    width: outputCanvas.width,
    height: outputCanvas.height,
    viewportWidth,
    viewportHeight,
    markerMap,
    markerContext
  };
}
398
689
  //#endregion
399
690
  //#region src/core/services/screen-capture.ts
400
691
  /**
@@ -408,233 +699,418 @@ var ScreenCaptureService = class {
408
699
  async capture() {
409
700
  return captureViewport();
410
701
  }
702
+ /**
703
+ * Capture an annotated screenshot with marker overlays.
704
+ * Interactive elements are marked with numbered labels.
705
+ * @returns Annotated screenshot result with marker map
706
+ */
707
+ async captureAnnotated() {
708
+ return captureAnnotatedViewport();
709
+ }
411
710
  };
412
711
  //#endregion
413
- //#region src/core/atoms.ts
712
+ //#region src/core/utils/audio.ts
414
713
  /**
415
- * Nanostores atoms for reactive values that don't need state machine semantics.
416
- * These update frequently (e.g., 60fps audio levels) and are framework-agnostic.
714
+ * Audio conversion utilities for voice capture.
715
+ * Converts Float32 audio data to WAV format for server transcription.
417
716
  */
418
- const $audioLevel = atom(0);
419
- const $cursorPosition = atom({
420
- x: 0,
421
- y: 0
422
- });
423
- const $buddyPosition = atom({
424
- x: 0,
425
- y: 0
426
- });
427
- const $buddyRotation = atom(0);
428
- const $buddyScale = atom(1);
429
- const $pointingTarget = atom(null);
430
- const $isEnabled = atom(true);
431
- atom(false);
432
- const $conversationHistory = atom([]);
433
- //#endregion
434
- //#region src/core/bezier.ts
435
717
  /**
436
- * Bezier flight animation for cursor pointing.
718
+ * Merge multiple Float32Array chunks into a single array
437
719
  */
720
function mergeAudioChunks(chunks) {
  // First pass: size the destination so it is allocated exactly once.
  const chunkLengths = chunks.map((chunk) => chunk.length);
  const mergedLength = chunkLengths.reduce((sum, length) => sum + length, 0);
  const merged = new Float32Array(mergedLength);

  // Second pass: copy each chunk in order at a running write position.
  let writeAt = 0;
  for (let index = 0; index < chunks.length; index += 1) {
    const chunk = chunks[index];
    merged.set(chunk, writeAt);
    writeAt += chunk.length;
  }
  return merged;
}
438
730
  /**
439
- * Quadratic bezier curve: B(t) = (1-t)²P₀ + 2(1-t)t·P₁ + t²P₂
731
+ * Convert Float32 audio data to 16-bit PCM
440
732
  */
441
- function quadraticBezier(p0, p1, p2, t) {
442
- const oneMinusT = 1 - t;
443
- return {
444
- x: oneMinusT * oneMinusT * p0.x + 2 * oneMinusT * t * p1.x + t * t * p2.x,
445
- y: oneMinusT * oneMinusT * p0.y + 2 * oneMinusT * t * p1.y + t * t * p2.y
446
- };
733
function floatTo16BitPCM(output, offset, input) {
  const BYTES_PER_SAMPLE = 2;
  for (let index = 0; index < input.length; index += 1) {
    // Clamp to [-1, 1] before scaling so out-of-range samples saturate.
    const clamped = Math.min(1, Math.max(-1, input[index]));
    // Negative samples scale by 32768 and non-negative ones by 32767,
    // matching the asymmetric signed 16-bit range.
    const fullScale = clamped < 0 ? 32768 : 32767;
    const byteOffset = offset + index * BYTES_PER_SAMPLE;
    output.setInt16(byteOffset, clamped * fullScale, true);
  }
}
448
739
  /**
449
- * Bezier tangent (derivative): B'(t) = 2(1-t)(P₁-P₀) + 2t(P₂-P₁)
740
+ * Write a string to a DataView
450
741
  */
451
- function bezierTangent(p0, p1, p2, t) {
452
- const oneMinusT = 1 - t;
453
- return {
454
- x: 2 * oneMinusT * (p1.x - p0.x) + 2 * t * (p2.x - p1.x),
455
- y: 2 * oneMinusT * (p1.y - p0.y) + 2 * t * (p2.y - p1.y)
456
- };
742
function writeString(view, offset, string) {
  // One byte per UTF-16 code unit, written consecutively from `offset`.
  string.split("").forEach((codeUnit, index) => {
    view.setUint8(offset + index, codeUnit.charCodeAt(0));
  });
}
458
745
  /**
459
- * Ease-in-out cubic for smooth acceleration/deceleration
746
+ * Encode Float32 audio data as a WAV file
460
747
  */
461
- function easeInOutCubic(t) {
462
- return t < .5 ? 4 * t * t * t : 1 - Math.pow(-2 * t + 2, 3) / 2;
748
function encodeWAV(samples, sampleRate) {
  const HEADER_BYTES = 44;
  const channelCount = 1;
  const bitDepth = 16;
  const sampleBytes = bitDepth / 8;
  const frameBytes = channelCount * sampleBytes;
  const payloadBytes = samples.length * sampleBytes;

  const wavBuffer = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const wavView = new DataView(wavBuffer);

  // Header described as data: [kind, byte offset, value]. Numeric fields are
  // written little-endian, and the entries are applied in the order listed.
  const headerLayout = [
    ["tag", 0, "RIFF"],
    ["u32", 4, 36 + payloadBytes],
    ["tag", 8, "WAVE"],
    ["tag", 12, "fmt "],
    ["u32", 16, 16],
    ["u16", 20, 1],
    ["u16", 22, channelCount],
    ["u32", 24, sampleRate],
    ["u32", 28, sampleRate * frameBytes],
    ["u16", 32, frameBytes],
    ["u16", 34, bitDepth],
    ["tag", 36, "data"],
    ["u32", 40, payloadBytes]
  ];
  for (const [kind, at, value] of headerLayout) {
    if (kind === "tag") writeString(wavView, at, value);
    else if (kind === "u32") wavView.setUint32(at, value, true);
    else wavView.setUint16(at, value, true);
  }

  // Sample data follows immediately after the 44-byte header.
  floatTo16BitPCM(wavView, HEADER_BYTES, samples);
  return new Blob([wavBuffer], { type: "audio/wav" });
}
772
+ //#endregion
773
+ //#region src/core/utils/audio-worklet.ts
464
774
  /**
465
- * Animate cursor along a parabolic bezier arc from start to end.
466
- * Used when the AI points at a UI element.
467
- *
468
- * @param from - Starting position
469
- * @param to - Target position
470
- * @param durationMs - Flight duration in milliseconds
471
- * @param callbacks - Frame and completion callbacks
472
- * @returns Cancel function to stop the animation
775
+ * AudioWorklet processor code for voice capture.
776
+ * Inlined as a blob URL to avoid separate file serving requirements.
473
777
  */
474
- function animateBezierFlight(from, to, durationMs, callbacks) {
475
- const startTime = performance.now();
476
- const distance = Math.hypot(to.x - from.x, to.y - from.y);
477
- const controlPoint = {
478
- x: (from.x + to.x) / 2,
479
- y: Math.min(from.y, to.y) - distance * .2
480
- };
481
- let animationFrameId;
482
- function animate(now) {
483
- const elapsed = now - startTime;
484
- const linearProgress = Math.min(elapsed / durationMs, 1);
485
- const easedProgress = easeInOutCubic(linearProgress);
486
- const position = quadraticBezier(from, controlPoint, to, easedProgress);
487
- const tangent = bezierTangent(from, controlPoint, to, easedProgress);
488
- const rotation = Math.atan2(tangent.y, tangent.x);
489
- const scale = 1 + Math.sin(linearProgress * Math.PI) * .3;
490
- callbacks.onFrame(position, rotation, scale);
491
- if (linearProgress < 1) animationFrameId = requestAnimationFrame(animate);
492
- else callbacks.onComplete();
778
+ const workletCode = `
779
+ class AudioCaptureProcessor extends AudioWorkletProcessor {
780
+ constructor() {
781
+ super()
782
+ this.isRecording = true
783
+ this.audioChunkSize = 2048
784
+ this.audioBuffer = new Float32Array(this.audioChunkSize)
785
+ this.audioBufferIndex = 0
786
+ this.levelFramesPerUpdate = 4
787
+ this.levelFrameCount = 0
788
+ this.levelRmsSum = 0
789
+ this.levelPeak = 0
790
+
791
+ this.port.onmessage = (event) => {
792
+ if (event.data?.type === "flush") {
793
+ this.flushAudio()
794
+ this.flushLevel()
795
+ this.port.postMessage({ type: "flush-complete" })
796
+ }
797
+ }
798
+ }
799
+
800
+ flushAudio() {
801
+ if (this.audioBufferIndex === 0) return
802
+
803
+ const chunk = this.audioBuffer.slice(0, this.audioBufferIndex)
804
+ this.port.postMessage({
805
+ type: "audio",
806
+ data: chunk
807
+ })
808
+ this.audioBufferIndex = 0
809
+ }
810
+
811
+ flushLevel() {
812
+ if (this.levelFrameCount === 0) return
813
+
814
+ this.port.postMessage({
815
+ type: "level",
816
+ rms: this.levelRmsSum / this.levelFrameCount,
817
+ peak: this.levelPeak
818
+ })
819
+
820
+ this.levelFrameCount = 0
821
+ this.levelRmsSum = 0
822
+ this.levelPeak = 0
823
+ }
824
+
825
+ process(inputs) {
826
+ if (!this.isRecording) return false
827
+
828
+ const input = inputs[0]
829
+ if (input && input.length > 0) {
830
+ const channelData = input[0]
831
+ let sum = 0
832
+ let peak = 0
833
+ for (let i = 0; i < channelData.length; i++) {
834
+ const sample = channelData[i]
835
+ sum += sample * sample
836
+ const absolute = Math.abs(sample)
837
+ if (absolute > peak) peak = absolute
838
+ }
839
+
840
+ this.levelRmsSum += Math.sqrt(sum / channelData.length)
841
+ this.levelPeak = Math.max(this.levelPeak, peak)
842
+ this.levelFrameCount += 1
843
+
844
+ if (this.levelFrameCount >= this.levelFramesPerUpdate) {
845
+ this.flushLevel()
846
+ }
847
+
848
+ let readOffset = 0
849
+ while (readOffset < channelData.length) {
850
+ const remaining = this.audioBuffer.length - this.audioBufferIndex
851
+ const copyLength = Math.min(remaining, channelData.length - readOffset)
852
+
853
+ this.audioBuffer.set(
854
+ channelData.subarray(readOffset, readOffset + copyLength),
855
+ this.audioBufferIndex
856
+ )
857
+
858
+ this.audioBufferIndex += copyLength
859
+ readOffset += copyLength
860
+
861
+ if (this.audioBufferIndex >= this.audioBuffer.length) {
862
+ this.flushAudio()
863
+ }
864
+ }
865
+ }
866
+
867
+ return true
868
+ }
869
+ }
870
+
871
+ registerProcessor("audio-capture-processor", AudioCaptureProcessor)
872
+ `;
873
+ let cachedBlobURL = null;
874
+ /**
875
+ * Create a blob URL for the audio worklet processor.
876
+ * Caches the URL to avoid creating multiple blobs.
877
+ */
878
function createWorkletBlobURL() {
  // Reuse the URL from an earlier call so only one blob is ever created.
  if (cachedBlobURL) return cachedBlobURL;

  const processorSource = new Blob([workletCode], { type: "application/javascript" });
  cachedBlobURL = URL.createObjectURL(processorSource);
  return cachedBlobURL;
}
497
885
  //#endregion
498
- //#region src/core/services/pointer-controller.ts
499
- const POINTING_LOCK_TIMEOUT_MS = 1e4;
886
+ //#region src/core/services/voice-capture.ts
887
/** Sample rate (Hz) shared by the capture code in this module. */
const SAMPLE_RATE = 16e3;
/** Amount subtracted from the raw RMS before scaling; anything at or below it reads as 0. */
const AUDIO_LEVEL_NOISE_GATE = 5e-4;
/** Multiplier applied to the gated RMS inside the logarithmic mapping. */
const AUDIO_LEVEL_INPUT_GAIN = 600;
/** Smoothing factor used when the level is rising. */
const AUDIO_LEVEL_ATTACK = .7;
/** Smoothing factor used when the level is falling or unchanged. */
const AUDIO_LEVEL_RELEASE = .25;

/**
 * Constrain `value` to the inclusive range [min, max].
 */
function clamp$1(value, min, max) {
  return Math.min(Math.max(value, min), max);
}

/**
 * Map a raw RMS amplitude onto a 0-1 display level using a noise gate and a
 * logarithmic curve.
 *
 * @param {number} rms - Raw RMS amplitude.
 * @returns {number} Level in [0, 1]. A NaN input yields 0 rather than NaN.
 */
function normalizeAudioLevel(rms) {
  const gatedRms = Math.max(0, rms - AUDIO_LEVEL_NOISE_GATE);
  const level = clamp$1(Math.log1p(gatedRms * AUDIO_LEVEL_INPUT_GAIN) / Math.log1p(AUDIO_LEVEL_INPUT_GAIN), 0, 1);
  // Math.max/Math.min propagate NaN, and the result feeds a running smoothed
  // value: a single NaN would stick there, so it is treated as silence.
  return Number.isNaN(level) ? 0 : level;
}

/**
 * Move `current` a fraction of the way towards `target`: the attack factor
 * when rising, the release factor otherwise.
 *
 * @param {number} current - Previously displayed level.
 * @param {number} target - Newly measured level.
 * @returns {number} The next displayed level.
 */
function smoothAudioLevel(current, target) {
  const smoothing = target > current ? AUDIO_LEVEL_ATTACK : AUDIO_LEVEL_RELEASE;
  return current + (target - current) * smoothing;
}
500
903
  /**
501
- * Controller for cursor pointing behavior.
502
- * Manages the pointer state machine (follow -> flying -> anchored -> follow)
503
- * and cursor animation.
904
+ * Framework-agnostic service for voice capture using AudioWorkletNode.
504
905
  */
505
- var PointerController = class {
506
- mode = "follow";
507
- cancelAnimation = null;
508
- releaseTimeout = null;
509
- listeners = /* @__PURE__ */ new Set();
510
- /**
511
- * Animate cursor to point at a target.
512
- */
513
- pointAt(target) {
514
- this.release();
515
- this.mode = "flying";
516
- $pointingTarget.set(target);
517
- const startPos = $buddyPosition.get();
518
- const endPos = {
519
- x: target.x,
520
- y: target.y
521
- };
522
- this.cancelAnimation = animateBezierFlight(startPos, endPos, 800, {
523
- onFrame: (position, rotation, scale) => {
524
- $buddyPosition.set(position);
525
- $buddyRotation.set(rotation);
526
- $buddyScale.set(scale);
527
- },
528
- onComplete: () => {
529
- this.cancelAnimation = null;
530
- this.mode = "anchored";
531
- $buddyPosition.set(endPos);
532
- $buddyRotation.set(0);
533
- $buddyScale.set(1);
534
- this.scheduleRelease();
535
- this.notify();
536
- }
537
- });
538
- this.notify();
539
- }
540
- /**
541
- * Release the cursor from pointing mode back to follow mode.
542
- */
543
- release() {
544
- if (this.cancelAnimation) {
545
- this.cancelAnimation();
546
- this.cancelAnimation = null;
547
- }
548
- if (this.releaseTimeout) {
549
- clearTimeout(this.releaseTimeout);
550
- this.releaseTimeout = null;
551
- }
552
- this.mode = "follow";
553
- $pointingTarget.set(null);
554
- $buddyPosition.set($cursorPosition.get());
555
- $buddyRotation.set(0);
556
- $buddyScale.set(1);
557
- this.notify();
558
- }
906
var VoiceCaptureService = class {
  audioContext = null;
  workletNode = null;
  sourceNode = null;
  silentGainNode = null;
  stream = null;
  chunks = [];
  levelCallback = null;
  visualLevel = 0;
  flushResolve = null;

  /**
   * Register a callback to receive smoothed audio level updates (0-1).
   * It is invoked for every "level" message posted by the worklet while
   * recording, and once with 0 when recording stops.
   *
   * @param {(level: number) => void} callback
   */
  onLevel(callback) {
    this.levelCallback = callback;
  }

  /**
   * Start recording audio from the microphone.
   *
   * If any setup step fails, everything acquired so far (media tracks, audio
   * nodes, AudioContext) is released before the error is rethrown, so a failed
   * start never leaves the microphone live.
   *
   * @returns {Promise<void>}
   * @throws Rethrows whatever getUserMedia or the audio graph setup rejects with
   *   (for example when microphone access is denied).
   */
  async start() {
    // A start() without a matching stop() would otherwise orphan the previous
    // stream and context with no remaining reference to stop them.
    if (this.stream || this.audioContext) await this.releaseAudioGraph();
    this.chunks = [];
    this.visualLevel = 0;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: {
        sampleRate: SAMPLE_RATE,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true
      } });
      this.stream = stream;
      const audioContext = new AudioContext({ sampleRate: SAMPLE_RATE });
      this.audioContext = audioContext;
      await audioContext.resume();
      const workletURL = createWorkletBlobURL();
      await audioContext.audioWorklet.addModule(workletURL);
      const source = audioContext.createMediaStreamSource(stream);
      this.sourceNode = source;
      const workletNode = new AudioWorkletNode(audioContext, "audio-capture-processor");
      this.workletNode = workletNode;
      // Gain of 0 between the worklet and the destination: the graph is
      // connected end to end without the captured audio being audible.
      const silentGainNode = audioContext.createGain();
      silentGainNode.gain.value = 0;
      this.silentGainNode = silentGainNode;
      workletNode.port.onmessage = (event) => {
        const { type, data, rms, peak } = event.data;
        if (type === "audio") this.chunks.push(data);
        else if (type === "level" && this.levelCallback) {
          // Blend RMS with a scaled peak so short transients still register.
          const targetLevel = normalizeAudioLevel(Math.max(rms ?? 0, (peak ?? 0) * .6));
          this.visualLevel = smoothAudioLevel(this.visualLevel, targetLevel);
          this.levelCallback(this.visualLevel);
        } else if (type === "flush-complete") {
          this.flushResolve?.();
          this.flushResolve = null;
        }
      };
      source.connect(workletNode);
      workletNode.connect(silentGainNode);
      silentGainNode.connect(audioContext.destination);
    } catch (error) {
      // The setup error is what the caller needs to see; a secondary failure
      // while closing the context must not replace it.
      await this.releaseAudioGraph().catch(() => {});
      throw error;
    }
  }

  /**
   * Stop recording and return the captured audio as a WAV blob.
   *
   * Buffered worklet audio is flushed first, then all audio resources are
   * released and the level callback is reset to 0.
   *
   * @returns {Promise<Blob>} WAV encoding of every chunk received since start().
   */
  async stop() {
    await this.flushPendingAudio();
    await this.releaseAudioGraph();
    this.visualLevel = 0;
    this.levelCallback?.(0);
    const wavBlob = encodeWAV(mergeAudioChunks(this.chunks), SAMPLE_RATE);
    this.chunks = [];
    return wavBlob;
  }

  /**
   * Clean up all resources and forget the level callback.
   * Synchronous: closing the AudioContext completes in the background.
   */
  dispose() {
    // dispose() cannot await, so the close() promise is given a handler here
    // to avoid an unhandled rejection; the outcome is irrelevant after disposal.
    this.releaseAudioGraph().catch(() => {});
    this.chunks = [];
    this.visualLevel = 0;
    this.flushResolve = null;
    this.levelCallback = null;
  }

  /**
   * Stop the media tracks, disconnect every node and close the AudioContext.
   * Shared by start() (on failure/restart), stop() and dispose(). All fields
   * are cleared synchronously; only the context close is asynchronous.
   *
   * @returns {Promise<void>} Settles when the AudioContext has closed
   *   (immediately when there is none or it is already closed).
   */
  releaseAudioGraph() {
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }
    for (const field of ["sourceNode", "workletNode", "silentGainNode"]) {
      if (this[field]) {
        this[field].disconnect();
        this[field] = null;
      }
    }
    const audioContext = this.audioContext;
    this.audioContext = null;
    // close() rejects on an already-closed context, so skip it in that case.
    if (!audioContext || audioContext.state === "closed") return Promise.resolve();
    return audioContext.close();
  }

  /**
   * Ask the worklet to post its partially filled buffer and wait for the
   * "flush-complete" acknowledgement, giving up after 50 ms so stop() can
   * never hang on an unresponsive worklet.
   *
   * @returns {Promise<void>}
   */
  async flushPendingAudio() {
    if (!this.workletNode) return;
    await new Promise((resolve) => {
      const timeoutId = setTimeout(() => {
        this.flushResolve = null;
        resolve();
      }, 50);
      this.flushResolve = () => {
        clearTimeout(timeoutId);
        resolve();
      };
      this.workletNode?.port.postMessage({ type: "flush" });
    });
  }
};
599
1040
  //#endregion
600
- //#region src/core/pointing.ts
1041
+ //#region src/core/state-machine.ts
601
1042
  /**
602
- * Parses [POINT:x,y:label] tags from AI responses.
603
- * Format matches the Swift Clicky app for consistency.
1043
+ * State transition table for the voice interaction flow.
1044
+ * Maps current state + event type to next state.
604
1045
  */
605
- const POINTING_TAG_REGEX = /\[POINT:(\d+),(\d+):([^\]]+)\]\s*$/;
1046
const transitions = {};
// Each row reads: [current state, event type, next state].
for (const [fromState, eventType, toState] of [
  ["idle", "HOTKEY_PRESSED", "listening"],
  ["listening", "HOTKEY_RELEASED", "processing"],
  ["listening", "ERROR", "idle"],
  ["processing", "AI_RESPONSE_COMPLETE", "responding"],
  ["processing", "HOTKEY_PRESSED", "listening"],
  ["processing", "ERROR", "idle"],
  ["responding", "TTS_COMPLETE", "idle"],
  ["responding", "HOTKEY_PRESSED", "listening"],
  ["responding", "ERROR", "idle"]
]) {
  transitions[fromState] ??= {};
  transitions[fromState][eventType] = toState;
}
606
1063
  /**
607
- * Extract pointing target from response text.
608
- * Returns null if no valid POINT tag is found at the end.
1064
+ * Create a simple typed state machine for the voice interaction flow.
1065
+ *
1066
+ * States: idle -> listening -> processing -> responding -> idle
1067
+ *
1068
+ * Supports interruption: pressing hotkey during processing or responding
1069
+ * immediately transitions back to listening.
609
1070
  */
610
- function parsePointingTag(response) {
611
- const match = response.match(POINTING_TAG_REGEX);
612
- if (!match) return null;
1071
function createStateMachine(initial = "idle") {
  let state = initial;
  const listeners = /* @__PURE__ */ new Set();
  function notify() {
    listeners.forEach((listener) => listener());
  }
  return {
    /** Current state name. */
    getState: () => state,
    /**
     * Apply `event` to the current state.
     * @returns {boolean} true when the table defines a transition (listeners
     *   are notified), false when the event is not valid in the current state.
     */
    transition: (event) => {
      // Own-property lookups only: a plain `transitions[state][event.type]`
      // would also resolve inherited members such as "constructor" or
      // "toString" and store a function as the new state.
      if (!Object.hasOwn(transitions, state)) return false;
      const row = transitions[state];
      if (!Object.hasOwn(row, event.type)) return false;
      const nextState = row[event.type];
      if (!nextState) return false;
      state = nextState;
      notify();
      return true;
    },
    /**
     * Register a change listener.
     * @returns {() => boolean} Unsubscribe function (result of `Set#delete`).
     */
    subscribe: (listener) => {
      listeners.add(listener);
      return () => listeners.delete(listener);
    },
    /** Force the machine back to "idle" (regardless of `initial`) and notify listeners. */
    reset: () => {
      state = "idle";
      notify();
    }
  };
}
619
- /**
620
- * Remove POINT tag from response text for display/TTS.
621
- */
622
- function stripPointingTag(response) {
623
- return response.replace(POINTING_TAG_REGEX, "").trim();
624
- }
625
1096
  //#endregion
626
1097
  //#region src/core/client.ts
627
1098
  function clamp(value, min, max) {
628
1099
  return Math.min(Math.max(value, min), max);
629
1100
  }
630
- function mapPointToViewport(target, screenshot) {
631
- if (screenshot.width <= 0 || screenshot.height <= 0) return target;
1101
+ /**
1102
+ * Map coordinate-based pointing from screenshot space to viewport space.
1103
+ */
1104
function mapCoordinatesToViewport(x, y, screenshot) {
  // A screenshot without area gives no usable scale; pass the point through.
  if (screenshot.width <= 0 || screenshot.height <= 0) {
    return { x, y };
  }

  // Scale one coordinate from screenshot pixels to viewport pixels, then keep
  // it on a valid pixel index (0 .. extent - 1, never below 0).
  const project = (value, viewportExtent, screenshotExtent) => {
    const scale = viewportExtent / screenshotExtent;
    const lastPixel = Math.max(viewportExtent - 1, 0);
    return clamp(Math.round(value * scale), 0, lastPixel);
  };

  return {
    x: project(x, screenshot.viewportWidth, screenshot.width),
    y: project(y, screenshot.viewportHeight, screenshot.height)
  };
}
640
1116
  /**
@@ -658,6 +1134,7 @@ var CursorBuddyClient = class {
658
1134
  response = "";
659
1135
  error = null;
660
1136
  abortController = null;
1137
+ historyCommittedForTurn = false;
661
1138
  cachedSnapshot;
662
1139
  listeners = /* @__PURE__ */ new Set();
663
1140
  constructor(endpoint, options = {}, services = {}) {
@@ -685,6 +1162,7 @@ var CursorBuddyClient = class {
685
1162
  this.transcript = "";
686
1163
  this.response = "";
687
1164
  this.error = null;
1165
+ this.historyCommittedForTurn = false;
688
1166
  this.pointerController.release();
689
1167
  this.stateMachine.transition({ type: "HOTKEY_PRESSED" });
690
1168
  this.notify();
@@ -699,7 +1177,7 @@ var CursorBuddyClient = class {
699
1177
  this.stateMachine.transition({ type: "HOTKEY_RELEASED" });
700
1178
  const signal = this.abortController?.signal;
701
1179
  try {
702
- const [audioBlob, screenshot] = await Promise.all([this.voiceCapture.stop(), this.screenCapture.capture()]);
1180
+ const [audioBlob, screenshot] = await Promise.all([this.voiceCapture.stop(), this.screenCapture.captureAnnotated()]);
703
1181
  if (signal?.aborted) return;
704
1182
  const transcript = await this.transcribe(audioBlob, signal);
705
1183
  if (signal?.aborted) return;
@@ -708,7 +1186,7 @@ var CursorBuddyClient = class {
708
1186
  this.notify();
709
1187
  const response = await this.chat(transcript, screenshot, signal);
710
1188
  if (signal?.aborted) return;
711
- const pointTarget = parsePointingTag(response);
1189
+ const parsed = parsePointingTagRaw(response);
712
1190
  const cleanResponse = stripPointingTag(response);
713
1191
  this.response = cleanResponse;
714
1192
  this.stateMachine.transition({
@@ -728,10 +1206,21 @@ var CursorBuddyClient = class {
728
1206
  }
729
1207
  ];
730
1208
  $conversationHistory.set(newHistory);
1209
+ this.historyCommittedForTurn = true;
1210
+ let pointTarget = null;
1211
+ if (parsed) if (parsed.type === "marker") {
1212
+ const coords = resolveMarkerToCoordinates(screenshot.markerMap, parsed.markerId);
1213
+ if (coords) pointTarget = {
1214
+ ...coords,
1215
+ label: parsed.label
1216
+ };
1217
+ } else pointTarget = {
1218
+ ...mapCoordinatesToViewport(parsed.x, parsed.y, screenshot),
1219
+ label: parsed.label
1220
+ };
731
1221
  if (pointTarget) {
732
- const mappedTarget = mapPointToViewport(pointTarget, screenshot);
733
- this.options.onPoint?.(mappedTarget);
734
- this.pointerController.pointAt(mappedTarget);
1222
+ this.options.onPoint?.(pointTarget);
1223
+ this.pointerController.pointAt(pointTarget);
735
1224
  }
736
1225
  if (cleanResponse) await this.speak(cleanResponse, signal);
737
1226
  if (signal?.aborted) return;
@@ -772,6 +1261,7 @@ var CursorBuddyClient = class {
772
1261
  this.transcript = "";
773
1262
  this.response = "";
774
1263
  this.error = null;
1264
+ this.historyCommittedForTurn = false;
775
1265
  this.pointerController.release();
776
1266
  this.stateMachine.reset();
777
1267
  this.notify();
@@ -811,11 +1301,34 @@ var CursorBuddyClient = class {
811
1301
  };
812
1302
  }
813
1303
  abort() {
1304
+ this.commitPartialHistory();
814
1305
  this.abortController?.abort();
815
1306
  this.abortController = null;
816
1307
  this.audioPlayback.stop();
817
1308
  $audioLevel.set(0);
818
1309
  }
1310
+ /**
1311
+ * Commit partial turn to history when interrupted.
1312
+ * Only commits if we have both transcript and response,
1313
+ * and haven't already committed for this turn.
1314
+ */
1315
+ commitPartialHistory() {
1316
+ if (this.historyCommittedForTurn) return;
1317
+ if (!this.transcript || !this.response) return;
1318
+ const newHistory = [
1319
+ ...$conversationHistory.get(),
1320
+ {
1321
+ role: "user",
1322
+ content: this.transcript
1323
+ },
1324
+ {
1325
+ role: "assistant",
1326
+ content: this.response
1327
+ }
1328
+ ];
1329
+ $conversationHistory.set(newHistory);
1330
+ this.historyCommittedForTurn = true;
1331
+ }
819
1332
  async transcribe(blob, signal) {
820
1333
  const formData = new FormData();
821
1334
  formData.append("audio", blob, "recording.wav");
@@ -840,7 +1353,8 @@ var CursorBuddyClient = class {
840
1353
  height: screenshot.height
841
1354
  },
842
1355
  transcript,
843
- history
1356
+ history,
1357
+ markerContext: screenshot.markerContext
844
1358
  }),
845
1359
  signal
846
1360
  });
@@ -887,4 +1401,4 @@ var CursorBuddyClient = class {
887
1401
  //#endregion
888
1402
  export { $buddyScale as a, $buddyRotation as i, $audioLevel as n, $cursorPosition as o, $buddyPosition as r, $pointingTarget as s, CursorBuddyClient as t };
889
1403
 
890
- //# sourceMappingURL=client-Bd33JD8T.mjs.map
1404
+ //# sourceMappingURL=client-DAa4L2fE.mjs.map