cursor-buddy 0.0.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 leojuriolli7
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,422 @@
1
+ # cursor-buddy
2
+
3
+
4
+ https://github.com/user-attachments/assets/3cdfe011-aee2-4c8e-b695-34f83a972593
5
+
6
+
7
+ AI Agent that lives in your cursor, built for web apps. Push-to-talk voice assistant that can see your screen and point at things.
8
+
9
+ Customize its prompt, pass custom tools, choose between browser or server-side speech APIs, use any AI SDK models and customize the UI to fit your needs.
10
+
11
+ ## Features
12
+
13
+ - **Push-to-talk voice input** — Hold a hotkey to speak, release to send
14
+ - **Browser-first live transcription** — Realtime transcript while speaking, with server fallback
15
+ - **Annotated screenshot context** — AI sees your current viewport with numbered interactive elements
16
+ - **Voice responses** — Browser or server TTS, with optional streaming playback
17
+ - **Cursor pointing** — AI can point at UI elements it references
18
+ - **Voice interruption** — Start talking again to cut off current response
19
+ - **Framework agnostic** — Core client written in TypeScript, adapter-based architecture
20
+ - **Customizable** — CSS variables, custom components, headless mode
21
+ - **Configurable** — Choose any AI SDK models, equip the agent with tools, or modify the system prompt
22
+
23
+ ## Installation
24
+
25
+ ```bash
26
+ npm install cursor-buddy
27
+ # or
28
+ pnpm add cursor-buddy
29
+ ```
30
+
31
+ ## Quick Start
32
+
33
+ ### 1. Server Setup
34
+
35
+ Create an API route that handles chat, transcription, and TTS.
36
+
37
+ Keep `transcriptionModel` configured if you want browser transcription to fall
38
+ back to the server in `auto` mode. Keep `speechModel` configured if you want
39
+ server speech or browser speech fallback in `auto` mode.
40
+
41
+ ```ts
42
+ // lib/cursor-buddy.ts
43
+ import { createCursorBuddyHandler } from "cursor-buddy/server"
44
+ import { openai } from "@ai-sdk/openai"
45
+
46
+ export const cursorBuddy = createCursorBuddyHandler({
47
+ model: openai("gpt-4o"),
48
+ speechModel: openai.speech("tts-1"),
49
+ transcriptionModel: openai.transcription("whisper-1"),
50
+ })
51
+ ```
52
+
53
+ #### Next.js App Router
54
+
55
+ ```ts
56
+ // app/api/cursor-buddy/[...path]/route.ts
57
+ import { toNextJsHandler } from "cursor-buddy/server/next"
58
+ import { cursorBuddy } from "@/lib/cursor-buddy"
59
+
60
+ export const { GET, POST } = toNextJsHandler(cursorBuddy)
61
+ ```
62
+
63
+ ### 2. Client Setup
64
+
65
+ Add the `<CursorBuddy />` component to your app.
66
+
67
+ ```tsx
68
+ // app/layout.tsx
69
+ import { CursorBuddy } from "cursor-buddy/react"
70
+
71
+ export default function RootLayout({ children }) {
72
+ return (
73
+ <html>
74
+ <body>
75
+ {children}
76
+ <CursorBuddy endpoint="/api/cursor-buddy" />
77
+ </body>
78
+ </html>
79
+ )
80
+ }
81
+ ```
82
+
83
+ That's it! Hold **Ctrl+Alt** to speak, release to send.
84
+
85
+ ## Server Configuration
86
+
87
+ ```ts
88
+ createCursorBuddyHandler({
89
+ // Models (only `model` is required)
90
+ model: LanguageModel, // AI SDK chat model
91
+ speechModel: SpeechModel, // Optional server TTS model
92
+ transcriptionModel: TranscriptionModel, // Optional server fallback for STT
93
+
94
+ // Optional
95
+ system: string | ((ctx) => string), // Custom system prompt
96
+ tools: Record<string, Tool>, // AI SDK tools
97
+ maxHistory: number, // Max conversation history (default: 10)
98
+ })
99
+ ```
100
+
101
+ ### Custom System Prompt
102
+
103
+ ```ts
104
+ createCursorBuddyHandler({
105
+ model: openai("gpt-4o"),
106
+ speechModel: openai.speech("tts-1"),
107
+ transcriptionModel: openai.transcription("whisper-1"),
108
+
109
+ // Extend the default prompt
110
+ system: ({ defaultPrompt }) => `
111
+ ${defaultPrompt}
112
+
113
+ You are helping users navigate a project management dashboard.
114
+ The sidebar contains: Projects, Tasks, Calendar, Settings.
115
+ `,
116
+ })
117
+ ```
118
+
119
+ ## Client Configuration
120
+
121
+ ```tsx
122
+ <CursorBuddy
123
+ // Required
124
+ endpoint="/api/cursor-buddy"
125
+
126
+ // Optional
127
+ hotkey="ctrl+alt" // Push-to-talk hotkey (default: "ctrl+alt")
128
+ container={element} // Portal container (default: document.body)
129
+ transcription={{ mode: "auto" }} // "auto" | "browser" | "server"
130
+ speech={{ mode: "server", allowStreaming: false }}
131
+ // mode: "auto" | "browser" | "server"
132
+ // allowStreaming: speak sentence-by-sentence while chat streams
133
+
134
+ // Custom components
135
+ cursor={(props) => <CustomCursor {...props} />}
136
+ speechBubble={(props) => <CustomBubble {...props} />}
137
+ waveform={(props) => <CustomWaveform {...props} />}
138
+
139
+ // Callbacks
140
+ onTranscript={(text) => {}} // Called when speech is transcribed
141
+ onResponse={(text) => {}} // Called when AI responds
142
+ onPoint={(target) => {}} // Called when AI points at element
143
+ onStateChange={(state) => {}} // Called on state change
144
+ onError={(error) => {}} // Called on error
145
+ />
146
+ ```
147
+
148
+ ### Transcription Modes
149
+
150
+ - `"auto"` — Try browser speech recognition first, then fall back to the
151
+ server transcription route if needed.
152
+ - `"browser"` — Require browser speech recognition. If it fails, the turn
153
+ errors and no server fallback is attempted.
154
+ - `"server"` — Skip browser speech recognition and always use the server
155
+ transcription route.
156
+
157
+ ### Speech Modes
158
+
159
+ - `"auto"` — Try browser speech synthesis first, then fall back to the server
160
+ TTS route if browser speech is unavailable or fails.
161
+ - `"browser"` — Require browser speech synthesis. If it fails, the turn
162
+ errors and no server fallback is attempted.
163
+ - `"server"` — Skip browser speech synthesis and always use the server TTS
164
+ route.
165
+
166
+ ### Speech Streaming
167
+
168
+ - `speech.allowStreaming: false` — Wait for the full `/chat` response, then
169
+ speak it once.
170
+ - `speech.allowStreaming: true` — Speak completed sentence segments as the chat
171
+ stream arrives.
172
+
173
+ ## Customization
174
+
175
+ ### CSS Variables
176
+
177
+ Cursor buddy styles are customizable via CSS variables. Override them in your stylesheet:
178
+
179
+ ```css
180
+ :root {
181
+ /* Cursor colors by state */
182
+ --cursor-buddy-color-idle: #3b82f6;
183
+ --cursor-buddy-color-listening: #ef4444;
184
+ --cursor-buddy-color-processing: #eab308;
185
+ --cursor-buddy-color-responding: #22c55e;
186
+
187
+ /* Speech bubble */
188
+ --cursor-buddy-bubble-bg: #ffffff;
189
+ --cursor-buddy-bubble-text: #1f2937;
190
+ --cursor-buddy-bubble-radius: 8px;
191
+ --cursor-buddy-bubble-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
192
+
193
+ /* Waveform */
194
+ --cursor-buddy-waveform-color: #ef4444;
195
+ }
196
+ ```
197
+
198
+ ### Custom Components
199
+
200
+ Replace default components with your own:
201
+
202
+ ```tsx
203
+ import { CursorBuddy, type CursorRenderProps } from "cursor-buddy/react"
204
+
205
+ function MyCursor({ state, rotation, scale }: CursorRenderProps) {
206
+ return (
207
+ <div style={{ transform: `rotate(${rotation}rad) scale(${scale})` }}>
208
+ {state === "listening" ? "Listening..." : "Point"}
209
+ </div>
210
+ )
211
+ }
212
+
213
+ <CursorBuddy
214
+ endpoint="/api/cursor-buddy"
215
+ cursor={(props) => <MyCursor {...props} />}
216
+ />
217
+ ```
218
+
219
+ ## Headless Mode
220
+
221
+ For full control, use the provider and hook directly:
222
+
223
+ ```tsx
224
+ import {
225
+ CursorBuddyProvider,
226
+ useCursorBuddy
227
+ } from "cursor-buddy/react"
228
+
229
+ function App() {
230
+ return (
231
+ <CursorBuddyProvider endpoint="/api/cursor-buddy">
232
+ <MyCustomUI />
233
+ </CursorBuddyProvider>
234
+ )
235
+ }
236
+
237
+ function MyCustomUI() {
238
+ const {
239
+ state, // "idle" | "listening" | "processing" | "responding"
240
+ liveTranscript, // In-progress transcript while speaking
241
+ transcript, // Latest user speech
242
+ response, // Latest AI response
243
+ audioLevel, // 0-1, for waveform visualization
244
+ isEnabled,
245
+ isPointing,
246
+ error,
247
+
248
+ // Actions
249
+ startListening,
250
+ stopListening,
251
+ setEnabled,
252
+ pointAt, // Manually point at coordinates
253
+ dismissPointing,
254
+ reset,
255
+ } = useCursorBuddy()
256
+
257
+ return (
258
+ <div>
259
+ <p>State: {state}</p>
260
+ <p>Live transcript: {liveTranscript}</p>
261
+ <button
262
+ onMouseDown={startListening}
263
+ onMouseUp={stopListening}
264
+ >
265
+ Hold to speak
266
+ </button>
267
+ </div>
268
+ )
269
+ }
270
+ ```
271
+
272
+ Complete Render Props types:
273
+
274
+ ```ts
275
+ interface CursorRenderProps {
276
+ state: "idle" | "listening" | "processing" | "responding"
277
+ isPointing: boolean
278
+ rotation: number // Radians, direction of travel
279
+ scale: number // 1.0 normal, up to 1.3 during flight
280
+ }
281
+
282
+ interface SpeechBubbleRenderProps {
283
+ text: string
284
+ isVisible: boolean
285
+ }
286
+
287
+ interface WaveformRenderProps {
288
+ audioLevel: number // 0-1
289
+ isListening: boolean
290
+ }
291
+ ```
292
+
293
+ ## Framework-Agnostic Usage
294
+
295
+ For non-React environments, use the core client directly:
296
+
297
+ ```ts
298
+ import { CursorBuddyClient } from "cursor-buddy"
299
+
300
+ const client = new CursorBuddyClient("/api/cursor-buddy", {
301
+ transcription: { mode: "auto" },
302
+ speech: { mode: "server", allowStreaming: false },
303
+ onStateChange: (state) => console.log("State:", state),
304
+ onTranscript: (text) => console.log("Transcript:", text),
305
+ onResponse: (text) => console.log("Response:", text),
306
+ onError: (err) => console.error("Error:", err),
307
+ })
308
+
309
+ // Subscribe to state changes
310
+ client.subscribe(() => {
311
+ const snapshot = client.getSnapshot()
312
+ console.log(snapshot)
313
+ })
314
+
315
+ // Trigger voice interaction
316
+ client.startListening()
317
+ // ... user speaks ...
318
+ client.stopListening()
319
+ ```
320
+
321
+ ## API Reference
322
+
323
+ ### Core Exports (`cursor-buddy`)
324
+
325
+ | Export | Description |
326
+ |--------|-------------|
327
+ | `CursorBuddyClient` | Framework-agnostic client class |
328
+ | `VoiceState` | Type: `"idle" \| "listening" \| "processing" \| "responding"` |
329
+ | `PointingTarget` | Type: `{ x: number, y: number, label: string }` |
330
+ | `Point` | Type: `{ x: number, y: number }` |
331
+
332
+ ### Server Exports (`cursor-buddy/server`)
333
+
334
+ | Export | Description |
335
+ |--------|-------------|
336
+ | `createCursorBuddyHandler` | Create the main request handler |
337
+ | `DEFAULT_SYSTEM_PROMPT` | Default system prompt for reference |
338
+ | `CursorBuddyHandlerConfig` | Type for handler configuration |
339
+ | `CursorBuddyHandler` | Return type of `createCursorBuddyHandler` |
340
+
341
+ ### Server Adapters (`cursor-buddy/server/next`)
342
+
343
+ | Export | Description |
344
+ |--------|-------------|
345
+ | `toNextJsHandler` | Convert handler to Next.js App Router format |
346
+
347
+ ### React Exports (`cursor-buddy/react`)
348
+
349
+ | Export | Description |
350
+ |--------|-------------|
351
+ | `CursorBuddy` | Drop-in component with built-in UI |
352
+ | `CursorBuddyProvider` | Headless provider for custom UI |
353
+ | `useCursorBuddy` | Hook to access state and actions |
354
+
355
+ ### Types (`cursor-buddy/react`)
356
+
357
+ | Export | Description |
358
+ |--------|-------------|
359
+ | `CursorBuddyProps` | Props for `<CursorBuddy />` |
360
+ | `CursorBuddyProviderProps` | Props for `<CursorBuddyProvider />` |
361
+ | `UseCursorBuddyReturn` | Return type of `useCursorBuddy()` |
362
+ | `CursorRenderProps` | Props passed to custom cursor |
363
+ | `SpeechBubbleRenderProps` | Props passed to custom speech bubble |
364
+ | `WaveformRenderProps` | Props passed to custom waveform |
365
+
366
+ ## How It Works
367
+
368
+ 1. User holds the hotkey
369
+ 2. Microphone captures audio, waveform shows audio level, and browser speech recognition starts when available
370
+ 3. User releases hotkey
371
+ 4. An annotated screenshot of the viewport is captured, with numbered markers on visible interactive elements, based on the [agent-browser](https://github.com/vercel-labs/agent-browser) implementation.
372
+ 5. The client prefers the browser transcript; if it is unavailable or empty in `auto` mode, the recorded audio is transcribed on the server
373
+ 6. Screenshot + marker context are sent to the AI model
374
+ 7. AI responds with text and can optionally call the `point` tool to indicate a location on screen:
375
+ - `type: "marker"` with `markerId` for numbered interactive elements (most accurate)
376
+ - `type: "coordinates"` with `x, y` pixel coordinates for anything without a marker
377
+ 8. Response is spoken in the browser or on the server based on `speech.mode`,
378
+ and can either wait for the full response or stream sentence-by-sentence
379
+ based on `speech.allowStreaming`
380
+ 9. If the AI calls the point tool, the cursor animates to the target location — markers resolve to live DOM elements, coordinates map to viewport positions
381
+ 10. **If user presses hotkey again at any point, current response is interrupted**
382
+
383
+ ## Security Best Practices
384
+
385
+ Since the cursor-buddy endpoints allow direct LLM communication, it is strongly recommended to configure CORS and rate limiting to prevent abuse, unauthorized access, and unexpected API costs.
386
+
387
+ Wrap the handler with CORS and rate limiting:
388
+
389
+ ```ts
390
+ // app/api/cursor-buddy/[...path]/route.ts
391
+ import { toNextJsHandler } from "cursor-buddy/server/next"
392
+ import { cursorBuddy } from "@/lib/cursor-buddy"
393
+
394
+ const handler = toNextJsHandler(cursorBuddy)
395
+
396
+ export async function POST(request: Request) {
397
+ // Verify origin
398
+ const origin = request.headers.get("origin")
399
+ if (origin !== process.env.ALLOWED_ORIGIN) {
400
+ return new Response("Forbidden", { status: 403 })
401
+ }
402
+
403
+ // Check rate limit (e.g., 10 requests per minute)
404
+ const ip = request.headers.get("x-forwarded-for") || "unknown"
405
+ const { success } = await rateLimiter.limit(ip)
406
+ if (!success) {
407
+ return new Response("Rate limit exceeded", { status: 429 })
408
+ }
409
+
410
+ return handler(request)
411
+ }
412
+
413
+ export const GET = POST
414
+ ```
415
+
416
+ ## TODOs
417
+
418
+ - [ ] Medium: Proper test structure without relying on `as any` for audio and voice capture
419
+
420
+ ## License
421
+
422
+ MIT